diff options
Diffstat (limited to 'fs')
1135 files changed, 139045 insertions, 26725 deletions
diff --git a/fs/9p/cache.c b/fs/9p/cache.c index cebba4eaa0b5..12c0ae29f185 100644 --- a/fs/9p/cache.c +++ b/fs/9p/cache.c @@ -68,6 +68,8 @@ void v9fs_cache_inode_get_cookie(struct inode *inode) &path, sizeof(path), &version, sizeof(version), i_size_read(&v9inode->netfs.inode)); + if (v9inode->netfs.cache) + mapping_set_release_always(inode->i_mapping); p9_debug(P9_DEBUG_FSC, "inode %p get cookie %p\n", inode, v9fs_inode_cookie(v9inode)); diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c index d525957594b6..61dbe52bb3a3 100644 --- a/fs/9p/v9fs.c +++ b/fs/9p/v9fs.c @@ -732,4 +732,5 @@ module_exit(exit_v9fs) MODULE_AUTHOR("Latchesar Ionkov <lucho@ionkov.net>"); MODULE_AUTHOR("Eric Van Hensbergen <ericvh@gmail.com>"); MODULE_AUTHOR("Ron Minnich <rminnich@lanl.gov>"); +MODULE_DESCRIPTION("9P Client File System"); MODULE_LICENSE("GPL"); diff --git a/fs/9p/v9fs_vfs.h b/fs/9p/v9fs_vfs.h index cdf441f22e07..731e3d14b67d 100644 --- a/fs/9p/v9fs_vfs.h +++ b/fs/9p/v9fs_vfs.h @@ -52,7 +52,6 @@ void v9fs_stat2inode_dotl(struct p9_stat_dotl *stat, struct inode *inode, unsigned int flags); int v9fs_dir_release(struct inode *inode, struct file *filp); int v9fs_file_open(struct inode *inode, struct file *file); -void v9fs_inode2stat(struct inode *inode, struct p9_wstat *stat); int v9fs_uflags2omode(int uflags, int extended); void v9fs_blank_wstat(struct p9_wstat *wstat); diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c index 950cf61f118b..b845ee18a80b 100644 --- a/fs/9p/vfs_inode.c +++ b/fs/9p/vfs_inode.c @@ -260,7 +260,7 @@ int v9fs_init_inode(struct v9fs_session_info *v9ses, inode_init_owner(&nop_mnt_idmap, inode, NULL, mode); inode->i_blocks = 0; inode->i_rdev = rdev; - inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode); + simple_inode_init_ts(inode); inode->i_mapping->a_ops = &v9fs_addr_operations; inode->i_private = NULL; @@ -1011,7 +1011,7 @@ v9fs_vfs_getattr(struct mnt_idmap *idmap, const struct path *path, p9_debug(P9_DEBUG_VFS, "dentry: %p\n", dentry); v9ses = v9fs_dentry2v9ses(dentry); if (v9ses->cache & (CACHE_META|CACHE_LOOSE)) { - generic_fillattr(&nop_mnt_idmap, inode, stat); + generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat); return 0; } else if (v9ses->cache & CACHE_WRITEBACK) { if (S_ISREG(inode->i_mode)) { @@ -1032,7 +1032,7 @@ v9fs_vfs_getattr(struct mnt_idmap *idmap, const struct path *path, return PTR_ERR(st); v9fs_stat2inode(st, d_inode(dentry), dentry->d_sb, 0); - generic_fillattr(&nop_mnt_idmap, d_inode(dentry), stat); + generic_fillattr(&nop_mnt_idmap, request_mask, d_inode(dentry), stat); p9stat_free(st); kfree(st); @@ -1150,9 +1150,9 @@ v9fs_stat2inode(struct p9_wstat *stat, struct inode *inode, set_nlink(inode, 1); - inode->i_atime.tv_sec = stat->atime; - inode->i_mtime.tv_sec = stat->mtime; - inode->i_ctime.tv_sec = stat->mtime; + inode_set_atime(inode, stat->atime, 0); + inode_set_mtime(inode, stat->mtime, 0); + inode_set_ctime(inode, stat->mtime, 0); inode->i_uid = v9ses->dfltuid; inode->i_gid = v9ses->dfltgid; diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c index 14510872ecc3..c7319af2f471 100644 --- a/fs/9p/vfs_inode_dotl.c +++ b/fs/9p/vfs_inode_dotl.c @@ -450,7 +450,7 @@ v9fs_vfs_getattr_dotl(struct mnt_idmap *idmap, p9_debug(P9_DEBUG_VFS, "dentry: %p\n", dentry); v9ses = v9fs_dentry2v9ses(dentry); if (v9ses->cache & (CACHE_META|CACHE_LOOSE)) { - generic_fillattr(&nop_mnt_idmap, inode, stat); + generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat); return 0; } else if (v9ses->cache) { if (S_ISREG(inode->i_mode)) { @@ -475,7 +475,7 @@ v9fs_vfs_getattr_dotl(struct mnt_idmap *idmap, return PTR_ERR(st); v9fs_stat2inode_dotl(st, d_inode(dentry), 0); - generic_fillattr(&nop_mnt_idmap, d_inode(dentry), stat); + generic_fillattr(&nop_mnt_idmap, request_mask, d_inode(dentry), stat); /* Change block size to what the server returned */ stat->blksize = st->st_blksize; @@ -641,12 +641,12 @@ v9fs_stat2inode_dotl(struct p9_stat_dotl *stat, struct inode *inode, struct v9fs_inode *v9inode = V9FS_I(inode); if ((stat->st_result_mask & P9_STATS_BASIC) == P9_STATS_BASIC) { - inode->i_atime.tv_sec = stat->st_atime_sec; - inode->i_atime.tv_nsec = stat->st_atime_nsec; - inode->i_mtime.tv_sec = stat->st_mtime_sec; - inode->i_mtime.tv_nsec = stat->st_mtime_nsec; - inode->i_ctime.tv_sec = stat->st_ctime_sec; - inode->i_ctime.tv_nsec = stat->st_ctime_nsec; + inode_set_atime(inode, stat->st_atime_sec, + stat->st_atime_nsec); + inode_set_mtime(inode, stat->st_mtime_sec, + stat->st_mtime_nsec); + inode_set_ctime(inode, stat->st_ctime_sec, + stat->st_ctime_nsec); inode->i_uid = stat->st_uid; inode->i_gid = stat->st_gid; set_nlink(inode, stat->st_nlink); @@ -660,16 +660,16 @@ v9fs_stat2inode_dotl(struct p9_stat_dotl *stat, struct inode *inode, inode->i_blocks = stat->st_blocks; } else { if (stat->st_result_mask & P9_STATS_ATIME) { - inode->i_atime.tv_sec = stat->st_atime_sec; - inode->i_atime.tv_nsec = stat->st_atime_nsec; + inode_set_atime(inode, stat->st_atime_sec, + stat->st_atime_nsec); } if (stat->st_result_mask & P9_STATS_MTIME) { - inode->i_mtime.tv_sec = stat->st_mtime_sec; - inode->i_mtime.tv_nsec = stat->st_mtime_nsec; + inode_set_mtime(inode, stat->st_mtime_sec, + stat->st_mtime_nsec); } if (stat->st_result_mask & P9_STATS_CTIME) { - inode->i_ctime.tv_sec = stat->st_ctime_sec; - inode->i_ctime.tv_nsec = stat->st_ctime_nsec; + inode_set_ctime(inode, stat->st_ctime_sec, + stat->st_ctime_nsec); } if (stat->st_result_mask & P9_STATS_UID) inode->i_uid = stat->st_uid; diff --git a/fs/9p/xattr.c b/fs/9p/xattr.c index e00cf8109b3f..8604e3377ee7 100644 --- a/fs/9p/xattr.c +++ b/fs/9p/xattr.c @@ -68,7 +68,7 @@ ssize_t v9fs_xattr_get(struct dentry *dentry, const char *name, struct p9_fid *fid; int ret; - p9_debug(P9_DEBUG_VFS, "name = %s value_len = %zu\n", + p9_debug(P9_DEBUG_VFS, "name = '%s' value_len = %zu\n", name, buffer_size); fid = v9fs_fid_lookup(dentry); if (IS_ERR(fid)) @@ -139,7 +139,8 @@ int v9fs_fid_xattr_set(struct p9_fid *fid, const char *name, ssize_t v9fs_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size) { - return v9fs_xattr_get(dentry, NULL, buffer, buffer_size); + /* Txattrwalk with an empty string lists xattrs instead */ + return v9fs_xattr_get(dentry, "", buffer, buffer_size); } static int v9fs_xattr_handler_get(const struct xattr_handler *handler, @@ -162,27 +163,27 @@ static int v9fs_xattr_handler_set(const struct xattr_handler *handler, return v9fs_xattr_set(dentry, full_name, value, size, flags); } -static struct xattr_handler v9fs_xattr_user_handler = { +static const struct xattr_handler v9fs_xattr_user_handler = { .prefix = XATTR_USER_PREFIX, .get = v9fs_xattr_handler_get, .set = v9fs_xattr_handler_set, }; -static struct xattr_handler v9fs_xattr_trusted_handler = { +static const struct xattr_handler v9fs_xattr_trusted_handler = { .prefix = XATTR_TRUSTED_PREFIX, .get = v9fs_xattr_handler_get, .set = v9fs_xattr_handler_set, }; #ifdef CONFIG_9P_FS_SECURITY -static struct xattr_handler v9fs_xattr_security_handler = { +static const struct xattr_handler v9fs_xattr_security_handler = { .prefix = XATTR_SECURITY_PREFIX, .get = v9fs_xattr_handler_get, .set = v9fs_xattr_handler_set, }; #endif -const struct xattr_handler *v9fs_xattr_handlers[] = { +const struct xattr_handler * const v9fs_xattr_handlers[] = { &v9fs_xattr_user_handler, &v9fs_xattr_trusted_handler, #ifdef CONFIG_9P_FS_SECURITY diff --git a/fs/9p/xattr.h b/fs/9p/xattr.h index b5636e544c8a..3ad5a802352a 100644 --- a/fs/9p/xattr.h +++ b/fs/9p/xattr.h @@ -10,7 +10,7 @@ #include <net/9p/9p.h> #include <net/9p/client.h> -extern const struct xattr_handler *v9fs_xattr_handlers[]; +extern const struct xattr_handler * const v9fs_xattr_handlers[]; ssize_t v9fs_fid_xattr_get(struct p9_fid *fid, const char *name, void *buffer, size_t buffer_size); diff --git a/fs/Kconfig b/fs/Kconfig index 18d034ec7953..42837617a55b 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -18,8 +18,12 @@ config VALIDATE_FS_PARSER config FS_IOMAP bool +config BUFFER_HEAD + bool + # old blockdev_direct_IO implementation. Use iomap for new code instead config LEGACY_DIRECT_IO + depends on BUFFER_HEAD bool if BLOCK @@ -44,6 +48,7 @@ source "fs/ocfs2/Kconfig" source "fs/btrfs/Kconfig" source "fs/nilfs2/Kconfig" source "fs/f2fs/Kconfig" +source "fs/bcachefs/Kconfig" source "fs/zonefs/Kconfig" endif # BLOCK @@ -169,6 +174,7 @@ source "fs/sysfs/Kconfig" config TMPFS bool "Tmpfs virtual memory file system support (former shm fs)" depends on SHMEM + select MEMFD_CREATE help Tmpfs is a file system which keeps all files in virtual memory. @@ -205,8 +211,8 @@ config TMPFS_XATTR Extended attributes are name:value pairs associated with inodes by the kernel or by users (see the attr(5) manual page for details). - Currently this enables support for the trusted.* and - security.* namespaces. + This enables support for the trusted.*, security.* and user.* + namespaces. You need this for POSIX ACL support on tmpfs. @@ -233,13 +239,26 @@ config TMPFS_INODE64 If unsure, say N. +config TMPFS_QUOTA + bool "Tmpfs quota support" + depends on TMPFS + select QUOTA + help + Quota support allows to set per user and group limits for tmpfs + usage. Say Y to enable quota support. Once enabled you can control + user and group quota enforcement with quota, usrquota and grpquota + mount options. + + If unsure, say N. + config ARCH_SUPPORTS_HUGETLBFS def_bool n config HUGETLBFS bool "HugeTLB file system support" - depends on X86 || IA64 || SPARC64 || ARCH_SUPPORTS_HUGETLBFS || BROKEN + depends on X86 || SPARC64 || ARCH_SUPPORTS_HUGETLBFS || BROKEN depends on (SYSFS || SYSCTL) + select MEMFD_CREATE help hugetlbfs is a filesystem backing for HugeTLB pages, based on ramfs. For architectures that support it, say Y here and read @@ -249,10 +268,11 @@ config HUGETLBFS config HUGETLB_PAGE def_bool HUGETLBFS + select XARRAY_MULTI config HUGETLB_PAGE_OPTIMIZE_VMEMMAP def_bool HUGETLB_PAGE - depends on ARCH_WANT_OPTIMIZE_VMEMMAP + depends on ARCH_WANT_OPTIMIZE_HUGETLB_VMEMMAP depends on SPARSEMEM_VMEMMAP config HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON @@ -264,9 +284,6 @@ config HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON enable HVO by default. It can be disabled via hugetlb_free_vmemmap=off (boot command line) or hugetlb_optimize_vmemmap (sysctl). -config MEMFD_CREATE - def_bool TMPFS || HUGETLBFS - config ARCH_HAS_GIGANTIC_PAGE bool diff --git a/fs/Kconfig.binfmt b/fs/Kconfig.binfmt index 93539aac0e5b..f5693164ca9a 100644 --- a/fs/Kconfig.binfmt +++ b/fs/Kconfig.binfmt @@ -58,7 +58,7 @@ config ARCH_USE_GNU_PROPERTY config BINFMT_ELF_FDPIC bool "Kernel support for FDPIC ELF binaries" default y if !BINFMT_ELF - depends on ARM || ((M68K || SUPERH || XTENSA) && !MMU) + depends on ARM || ((M68K || RISCV || SUPERH || XTENSA) && !MMU) select ELFCORE help ELF FDPIC binaries are based on ELF, but allow the individual load diff --git a/fs/Makefile b/fs/Makefile index e513aaee0603..75522f88e763 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -17,7 +17,7 @@ obj-y := open.o read_write.o file_table.o super.o \ fs_types.o fs_context.o fs_parser.o fsopen.o init.o \ kernel_read_file.o mnt_idmapping.o remap_range.o -obj-$(CONFIG_BLOCK) += buffer.o mpage.o +obj-$(CONFIG_BUFFER_HEAD) += buffer.o mpage.o obj-$(CONFIG_PROC_FS) += proc_namespace.o obj-$(CONFIG_LEGACY_DIRECT_IO) += direct-io.o obj-y += notify/ @@ -123,6 +123,7 @@ obj-$(CONFIG_OCFS2_FS) += ocfs2/ obj-$(CONFIG_BTRFS_FS) += btrfs/ obj-$(CONFIG_GFS2_FS) += gfs2/ obj-$(CONFIG_F2FS_FS) += f2fs/ +obj-$(CONFIG_BCACHEFS_FS) += bcachefs/ obj-$(CONFIG_CEPH_FS) += ceph/ obj-$(CONFIG_PSTORE) += pstore/ obj-$(CONFIG_EFIVAR_FS) += efivarfs/ diff --git a/fs/adfs/Kconfig b/fs/adfs/Kconfig index 44738fed6625..1b97058f0c4a 100644 --- a/fs/adfs/Kconfig +++ b/fs/adfs/Kconfig @@ -2,6 +2,7 @@ config ADFS_FS tristate "ADFS file system support" depends on BLOCK + select BUFFER_HEAD help The Acorn Disc Filing System is the standard file system of the RiscOS operating system which runs on Acorn's ARM-based Risc PC diff --git a/fs/adfs/dir_f.h b/fs/adfs/dir_f.h index a5393e6cf9f4..4e6c53d59ebd 100644 --- a/fs/adfs/dir_f.h +++ b/fs/adfs/dir_f.h @@ -58,9 +58,4 @@ struct adfs_newdirtail { __u8 dircheckbyte; } __attribute__((packed)); -union adfs_dirtail { - struct adfs_olddirtail old; - struct adfs_newdirtail new; -}; - #endif diff --git a/fs/adfs/inode.c b/fs/adfs/inode.c index c3ac613d0975..3081edb09e46 100644 --- a/fs/adfs/inode.c +++ b/fs/adfs/inode.c @@ -242,6 +242,7 @@ struct inode * adfs_iget(struct super_block *sb, struct object_info *obj) { struct inode *inode; + struct timespec64 ts; inode = new_inode(sb); if (!inode) @@ -268,9 +269,10 @@ adfs_iget(struct super_block *sb, struct object_info *obj) ADFS_I(inode)->attr = obj->attr; inode->i_mode = adfs_atts2mode(sb, inode); - adfs_adfs2unix_time(&inode->i_mtime, inode); - inode->i_atime = inode->i_mtime; - inode->i_ctime = inode->i_mtime; + adfs_adfs2unix_time(&ts, inode); + inode_set_atime_to_ts(inode, ts); + inode_set_mtime_to_ts(inode, ts); + inode_set_ctime_to_ts(inode, ts); if (S_ISDIR(inode->i_mode)) { inode->i_op = &adfs_dir_inode_operations; @@ -321,7 +323,8 @@ adfs_notify_change(struct mnt_idmap *idmap, struct dentry *dentry, if (ia_valid & ATTR_MTIME && adfs_inode_is_stamped(inode)) { adfs_unix2adfs_time(inode, &attr->ia_mtime); - adfs_adfs2unix_time(&inode->i_mtime, inode); + adfs_adfs2unix_time(&attr->ia_mtime, inode); + inode_set_mtime_to_ts(inode, attr->ia_mtime); } /* @@ -329,9 +332,9 @@ adfs_notify_change(struct mnt_idmap *idmap, struct dentry *dentry, * have the ability to represent them in our filesystem? */ if (ia_valid & ATTR_ATIME) - inode->i_atime = attr->ia_atime; + inode_set_atime_to_ts(inode, attr->ia_atime); if (ia_valid & ATTR_CTIME) - inode->i_ctime = attr->ia_ctime; + inode_set_ctime_to_ts(inode, attr->ia_ctime); if (ia_valid & ATTR_MODE) { ADFS_I(inode)->attr = adfs_mode2atts(sb, inode, attr->ia_mode); inode->i_mode = adfs_atts2mode(sb, inode); diff --git a/fs/affs/Kconfig b/fs/affs/Kconfig index 962b86374e1c..1ae432d266c3 100644 --- a/fs/affs/Kconfig +++ b/fs/affs/Kconfig @@ -2,6 +2,7 @@ config AFFS_FS tristate "Amiga FFS file system support" depends on BLOCK + select BUFFER_HEAD select LEGACY_DIRECT_IO help The Fast File System (FFS) is the common file system used on hard diff --git a/fs/affs/amigaffs.c b/fs/affs/amigaffs.c index 29f11e10a7c7..fd669daa4e7b 100644 --- a/fs/affs/amigaffs.c +++ b/fs/affs/amigaffs.c @@ -60,7 +60,7 @@ affs_insert_hash(struct inode *dir, struct buffer_head *bh) mark_buffer_dirty_inode(dir_bh, dir); affs_brelse(dir_bh); - dir->i_mtime = dir->i_ctime = current_time(dir); + inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir)); inode_inc_iversion(dir); mark_inode_dirty(dir); @@ -114,7 +114,7 @@ affs_remove_hash(struct inode *dir, struct buffer_head *rem_bh) affs_brelse(bh); - dir->i_mtime = dir->i_ctime = current_time(dir); + inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir)); inode_inc_iversion(dir); mark_inode_dirty(dir); @@ -315,7 +315,7 @@ affs_remove_header(struct dentry *dentry) else clear_nlink(inode); affs_unlock_link(inode); - inode->i_ctime = current_time(inode); + inode_set_ctime_current(inode); mark_inode_dirty(inode); done: diff --git a/fs/affs/file.c b/fs/affs/file.c index e43f2f007ac1..04c018e19602 100644 --- a/fs/affs/file.c +++ b/fs/affs/file.c @@ -15,6 +15,7 @@ #include <linux/uio.h> #include <linux/blkdev.h> +#include <linux/mpage.h> #include "affs.h" static struct buffer_head *affs_get_extblock_slow(struct inode *inode, u32 ext); @@ -370,9 +371,10 @@ err_alloc: return -ENOSPC; } -static int affs_writepage(struct page *page, struct writeback_control *wbc) +static int affs_writepages(struct address_space *mapping, + struct writeback_control *wbc) { - return block_write_full_page(page, affs_get_block, wbc); + return mpage_writepages(mapping, wbc, affs_get_block); } static int affs_read_folio(struct file *file, struct folio *folio) @@ -456,10 +458,11 @@ const struct address_space_operations affs_aops = { .dirty_folio = block_dirty_folio, .invalidate_folio = block_invalidate_folio, .read_folio = affs_read_folio, - .writepage = affs_writepage, + .writepages = affs_writepages, .write_begin = affs_write_begin, .write_end = affs_write_end, .direct_IO = affs_direct_IO, + .migrate_folio = buffer_migrate_folio, .bmap = _affs_bmap }; @@ -520,21 +523,20 @@ affs_getemptyblk_ino(struct inode *inode, int block) return ERR_PTR(err); } -static int -affs_do_readpage_ofs(struct page *page, unsigned to, int create) +static int affs_do_read_folio_ofs(struct folio *folio, size_t to, int create) { - struct inode *inode = page->mapping->host; + struct inode *inode = folio->mapping->host; struct super_block *sb = inode->i_sb; struct buffer_head *bh; - unsigned pos = 0; - u32 bidx, boff, bsize; + size_t pos = 0; + size_t bidx, boff, bsize; u32 tmp; - pr_debug("%s(%lu, %ld, 0, %d)\n", __func__, inode->i_ino, - page->index, to); - BUG_ON(to > PAGE_SIZE); + pr_debug("%s(%lu, %ld, 0, %zu)\n", __func__, inode->i_ino, + folio->index, to); + BUG_ON(to > folio_size(folio)); bsize = AFFS_SB(sb)->s_data_blksize; - tmp = page->index << PAGE_SHIFT; + tmp = folio_pos(folio); bidx = tmp / bsize; boff = tmp % bsize; @@ -544,7 +546,7 @@ affs_do_readpage_ofs(struct page *page, unsigned to, int create) return PTR_ERR(bh); tmp = min(bsize - boff, to - pos); BUG_ON(pos + tmp > to || tmp > bsize); - memcpy_to_page(page, pos, AFFS_DATA(bh) + boff, tmp); + memcpy_to_folio(folio, pos, AFFS_DATA(bh) + boff, tmp); affs_brelse(bh); bidx++; pos += tmp; @@ -624,25 +626,23 @@ out: return PTR_ERR(bh); } -static int -affs_read_folio_ofs(struct file *file, struct folio *folio) +static int affs_read_folio_ofs(struct file *file, struct folio *folio) { - struct page *page = &folio->page; - struct inode *inode = page->mapping->host; - u32 to; + struct inode *inode = folio->mapping->host; + size_t to; int err; - pr_debug("%s(%lu, %ld)\n", __func__, inode->i_ino, page->index); - to = PAGE_SIZE; - if (((page->index + 1) << PAGE_SHIFT) > inode->i_size) { - to = inode->i_size & ~PAGE_MASK; - memset(page_address(page) + to, 0, PAGE_SIZE - to); + pr_debug("%s(%lu, %ld)\n", __func__, inode->i_ino, folio->index); + to = folio_size(folio); + if (folio_pos(folio) + to > inode->i_size) { + to = inode->i_size - folio_pos(folio); + folio_zero_segment(folio, to, folio_size(folio)); } - err = affs_do_readpage_ofs(page, to, 0); + err = affs_do_read_folio_ofs(folio, to, 0); if (!err) - SetPageUptodate(page); - unlock_page(page); + folio_mark_uptodate(folio); + folio_unlock(folio); return err; } @@ -651,7 +651,7 @@ static int affs_write_begin_ofs(struct file *file, struct address_space *mapping struct page **pagep, void **fsdata) { struct inode *inode = mapping->host; - struct page *page; + struct folio *folio; pgoff_t index; int err = 0; @@ -667,19 +667,20 @@ static int affs_write_begin_ofs(struct file *file, struct address_space *mapping } index = pos >> PAGE_SHIFT; - page = grab_cache_page_write_begin(mapping, index); - if (!page) - return -ENOMEM; - *pagep = page; + folio = __filemap_get_folio(mapping, index, FGP_WRITEBEGIN, + mapping_gfp_mask(mapping)); + if (IS_ERR(folio)) + return PTR_ERR(folio); + *pagep = &folio->page; - if (PageUptodate(page)) + if (folio_test_uptodate(folio)) return 0; /* XXX: inefficient but safe in the face of short writes */ - err = affs_do_readpage_ofs(page, PAGE_SIZE, 1); + err = affs_do_read_folio_ofs(folio, folio_size(folio), 1); if (err) { - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); } return err; } @@ -688,6 +689,7 @@ static int affs_write_end_ofs(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, struct page *page, void *fsdata) { + struct folio *folio = page_folio(page); struct inode *inode = mapping->host; struct super_block *sb = inode->i_sb; struct buffer_head *bh, *prev_bh; @@ -701,18 +703,18 @@ static int affs_write_end_ofs(struct file *file, struct address_space *mapping, to = from + len; /* * XXX: not sure if this can handle short copies (len < copied), but - * we don't have to, because the page should always be uptodate here, + * we don't have to, because the folio should always be uptodate here, * due to write_begin. */ pr_debug("%s(%lu, %llu, %llu)\n", __func__, inode->i_ino, pos, pos + len); bsize = AFFS_SB(sb)->s_data_blksize; - data = page_address(page); + data = folio_address(folio); bh = NULL; written = 0; - tmp = (page->index << PAGE_SHIFT) + from; + tmp = (folio->index << PAGE_SHIFT) + from; bidx = tmp / bsize; boff = tmp % bsize; if (boff) { @@ -804,11 +806,11 @@ static int affs_write_end_ofs(struct file *file, struct address_space *mapping, from += tmp; bidx++; } - SetPageUptodate(page); + folio_mark_uptodate(folio); done: affs_brelse(bh); - tmp = (page->index << PAGE_SHIFT) + from; + tmp = (folio->index << PAGE_SHIFT) + from; if (tmp > inode->i_size) inode->i_size = AFFS_I(inode)->mmu_private = tmp; @@ -819,8 +821,8 @@ done: } err_first_bh: - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); return written; @@ -835,9 +837,10 @@ const struct address_space_operations affs_aops_ofs = { .dirty_folio = block_dirty_folio, .invalidate_folio = block_invalidate_folio, .read_folio = affs_read_folio_ofs, - //.writepage = affs_writepage_ofs, + //.writepages = affs_writepages_ofs, .write_begin = affs_write_begin_ofs, - .write_end = affs_write_end_ofs + .write_end = affs_write_end_ofs, + .migrate_folio = filemap_migrate_folio, }; /* Free any preallocated blocks. */ diff --git a/fs/affs/inode.c b/fs/affs/inode.c index 27f77a52c5c8..0210df8d3500 100644 --- a/fs/affs/inode.c +++ b/fs/affs/inode.c @@ -149,13 +149,9 @@ struct inode *affs_iget(struct super_block *sb, unsigned long ino) break; } - inode->i_mtime.tv_sec = inode->i_atime.tv_sec = inode->i_ctime.tv_sec - = (be32_to_cpu(tail->change.days) * 86400LL + - be32_to_cpu(tail->change.mins) * 60 + - be32_to_cpu(tail->change.ticks) / 50 + - AFFS_EPOCH_DELTA) + - sys_tz.tz_minuteswest * 60; - inode->i_mtime.tv_nsec = inode->i_ctime.tv_nsec = inode->i_atime.tv_nsec = 0; + inode_set_mtime(inode, + inode_set_atime(inode, inode_set_ctime(inode, (be32_to_cpu(tail->change.days) * 86400LL + be32_to_cpu(tail->change.mins) * 60 + be32_to_cpu(tail->change.ticks) / 50 + AFFS_EPOCH_DELTA) + sys_tz.tz_minuteswest * 60, 0).tv_sec, 0).tv_sec, + 0); affs_brelse(bh); unlock_new_inode(inode); return inode; @@ -187,12 +183,13 @@ affs_write_inode(struct inode *inode, struct writeback_control *wbc) } tail = AFFS_TAIL(sb, bh); if (tail->stype == cpu_to_be32(ST_ROOT)) { - affs_secs_to_datestamp(inode->i_mtime.tv_sec, + affs_secs_to_datestamp(inode_get_mtime_sec(inode), &AFFS_ROOT_TAIL(sb, bh)->root_change); } else { tail->protect = cpu_to_be32(AFFS_I(inode)->i_protect); tail->size = cpu_to_be32(inode->i_size); - affs_secs_to_datestamp(inode->i_mtime.tv_sec, &tail->change); + affs_secs_to_datestamp(inode_get_mtime_sec(inode), + &tail->change); if (!(inode->i_ino == AFFS_SB(sb)->s_root_block)) { uid = i_uid_read(inode); gid = i_gid_read(inode); @@ -314,7 +311,7 @@ affs_new_inode(struct inode *dir) inode->i_gid = current_fsgid(); inode->i_ino = block; set_nlink(inode, 1); - inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode); + simple_inode_init_ts(inode); atomic_set(&AFFS_I(inode)->i_opencnt, 0); AFFS_I(inode)->i_blkcnt = 0; AFFS_I(inode)->i_lc = NULL; diff --git a/fs/affs/namei.c b/fs/affs/namei.c index d12ccfd2a83d..d6b9758ee23d 100644 --- a/fs/affs/namei.c +++ b/fs/affs/namei.c @@ -43,7 +43,7 @@ affs_get_toupper(struct super_block *sb) * Note: the dentry argument is the parent dentry. */ static inline int -__affs_hash_dentry(const struct dentry *dentry, struct qstr *qstr, toupper_t toupper, bool notruncate) +__affs_hash_dentry(const struct dentry *dentry, struct qstr *qstr, toupper_t fn, bool notruncate) { const u8 *name = qstr->name; unsigned long hash; @@ -57,7 +57,7 @@ __affs_hash_dentry(const struct dentry *dentry, struct qstr *qstr, toupper_t tou hash = init_name_hash(dentry); len = min(qstr->len, AFFSNAMEMAX); for (; len > 0; name++, len--) - hash = partial_name_hash(toupper(*name), hash); + hash = partial_name_hash(fn(*name), hash); qstr->hash = end_name_hash(hash); return 0; @@ -80,7 +80,7 @@ affs_intl_hash_dentry(const struct dentry *dentry, struct qstr *qstr) } static inline int __affs_compare_dentry(unsigned int len, - const char *str, const struct qstr *name, toupper_t toupper, + const char *str, const struct qstr *name, toupper_t fn, bool notruncate) { const u8 *aname = str; @@ -106,7 +106,7 @@ static inline int __affs_compare_dentry(unsigned int len, return 1; for (; len > 0; len--) - if (toupper(*aname++) != toupper(*bname++)) + if (fn(*aname++) != fn(*bname++)) return 1; return 0; @@ -135,7 +135,7 @@ affs_intl_compare_dentry(const struct dentry *dentry, */ static inline int -affs_match(struct dentry *dentry, const u8 *name2, toupper_t toupper) +affs_match(struct dentry *dentry, const u8 *name2, toupper_t fn) { const u8 *name = dentry->d_name.name; int len = dentry->d_name.len; @@ -148,7 +148,7 @@ affs_match(struct dentry *dentry, const u8 *name2, toupper_t toupper) return 0; for (name2++; len > 0; len--) - if (toupper(*name++) != toupper(*name2++)) + if (fn(*name++) != fn(*name2++)) return 0; return 1; } @@ -156,12 +156,12 @@ affs_match(struct dentry *dentry, const u8 *name2, toupper_t toupper) int affs_hash_name(struct super_block *sb, const u8 *name, unsigned int len) { - toupper_t toupper = affs_get_toupper(sb); + toupper_t fn = affs_get_toupper(sb); u32 hash; hash = len = min(len, AFFSNAMEMAX); for (; len > 0; len--) - hash = (hash * 13 + toupper(*name++)) & 0x7ff; + hash = (hash * 13 + fn(*name++)) & 0x7ff; return hash % AFFS_SB(sb)->s_hashsize; } @@ -171,7 +171,7 @@ affs_find_entry(struct inode *dir, struct dentry *dentry) { struct super_block *sb = dir->i_sb; struct buffer_head *bh; - toupper_t toupper = affs_get_toupper(sb); + toupper_t fn = affs_get_toupper(sb); u32 key; pr_debug("%s(\"%pd\")\n", __func__, dentry); @@ -189,7 +189,7 @@ affs_find_entry(struct inode *dir, struct dentry *dentry) bh = affs_bread(sb, key); if (!bh) return ERR_PTR(-EIO); - if (affs_match(dentry, AFFS_TAIL(sb, bh)->name, toupper)) + if (affs_match(dentry, AFFS_TAIL(sb, bh)->name, fn)) return bh; key = be32_to_cpu(AFFS_TAIL(sb, bh)->hash_chain); } @@ -568,6 +568,7 @@ static struct dentry *affs_fh_to_parent(struct super_block *sb, struct fid *fid, } const struct export_operations affs_export_ops = { + .encode_fh = generic_encode_ino32_fh, .fh_to_dentry = affs_fh_to_dentry, .fh_to_parent = affs_fh_to_parent, .get_parent = affs_get_parent, diff --git a/fs/affs/symlink.c b/fs/affs/symlink.c index 31d6446dc166..094aec8d17b8 100644 --- a/fs/affs/symlink.c +++ b/fs/affs/symlink.c @@ -13,10 +13,9 @@ static int affs_symlink_read_folio(struct file *file, struct folio *folio) { - struct page *page = &folio->page; struct buffer_head *bh; - struct inode *inode = page->mapping->host; - char *link = page_address(page); + struct inode *inode = folio->mapping->host; + char *link = folio_address(folio); struct slink_front *lf; int i, j; char c; @@ -58,12 +57,11 @@ static int affs_symlink_read_folio(struct file *file, struct folio *folio) } link[i] = '\0'; affs_brelse(bh); - SetPageUptodate(page); - unlock_page(page); + folio_mark_uptodate(folio); + folio_unlock(folio); return 0; fail: - SetPageError(page); - unlock_page(page); + folio_unlock(folio); return -EIO; } diff --git a/fs/afs/cell.c b/fs/afs/cell.c index 988c2ac7cece..926cb1188eba 100644 --- a/fs/afs/cell.c +++ b/fs/afs/cell.c @@ -409,10 +409,12 @@ static int afs_update_cell(struct afs_cell *cell) if (ret == -ENOMEM) goto out_wake; - ret = -ENOMEM; vllist = afs_alloc_vlserver_list(0); - if (!vllist) + if (!vllist) { + if (ret >= 0) + ret = -ENOMEM; goto out_wake; + } switch (ret) { case -ENODATA: diff --git a/fs/afs/dynroot.c b/fs/afs/dynroot.c index d7d9402ff718..1f656005018e 100644 --- a/fs/afs/dynroot.c +++ b/fs/afs/dynroot.c @@ -88,7 +88,7 @@ struct inode *afs_iget_pseudo_dir(struct super_block *sb, bool root) set_nlink(inode, 2); inode->i_uid = GLOBAL_ROOT_UID; inode->i_gid = GLOBAL_ROOT_GID; - inode->i_ctime = inode->i_atime = inode->i_mtime = current_time(inode); + simple_inode_init_ts(inode); inode->i_blocks = 0; inode->i_generation = 0; @@ -114,6 +114,7 @@ static int afs_probe_cell_name(struct dentry *dentry) struct afs_net *net = afs_d2net(dentry); const char *name = dentry->d_name.name; size_t len = dentry->d_name.len; + char *result = NULL; int ret; /* Names prefixed with a dot are R/W mounts. */ @@ -131,9 +132,22 @@ static int afs_probe_cell_name(struct dentry *dentry) } ret = dns_query(net->net, "afsdb", name, len, "srv=1", - NULL, NULL, false); - if (ret == -ENODATA) - ret = -EDESTADDRREQ; + &result, NULL, false); + if (ret == -ENODATA || ret == -ENOKEY || ret == 0) + ret = -ENOENT; + if (ret > 0 && ret >= sizeof(struct dns_server_list_v1_header)) { + struct dns_server_list_v1_header *v1 = (void *)result; + + if (v1->hdr.zero == 0 && + v1->hdr.content == DNS_PAYLOAD_IS_SERVER_LIST && + v1->hdr.version == 1 && + (v1->status != DNS_LOOKUP_GOOD && + v1->status != DNS_LOOKUP_GOOD_WITH_BAD)) + return -ENOENT; + + } + + kfree(result); return ret; } @@ -252,20 +266,9 @@ static int afs_dynroot_d_revalidate(struct dentry *dentry, unsigned int flags) return 1; } -/* - * Allow the VFS to enquire as to whether a dentry should be unhashed (mustn't - * sleep) - * - called from dput() when d_count is going to 0. - * - return 1 to request dentry be unhashed, 0 otherwise - */ -static int afs_dynroot_d_delete(const struct dentry *dentry) -{ - return d_really_is_positive(dentry); -} - const struct dentry_operations afs_dynroot_dentry_operations = { .d_revalidate = afs_dynroot_d_revalidate, - .d_delete = afs_dynroot_d_delete, + .d_delete = always_delete_dentry, .d_release = afs_d_release, .d_automount = afs_d_automount, }; diff --git a/fs/afs/inode.c b/fs/afs/inode.c index 866bab860a88..78efc9719349 100644 --- a/fs/afs/inode.c +++ b/fs/afs/inode.c @@ -90,9 +90,9 @@ static int afs_inode_init_from_status(struct afs_operation *op, vnode->status = *status; t = status->mtime_client; - inode->i_ctime = t; - inode->i_mtime = t; - inode->i_atime = t; + inode_set_ctime_to_ts(inode, t); + inode_set_mtime_to_ts(inode, t); + inode_set_atime_to_ts(inode, t); inode->i_flags |= S_NOATIME; inode->i_uid = make_kuid(&init_user_ns, status->owner); inode->i_gid = make_kgid(&init_user_ns, status->group); @@ -204,9 +204,9 @@ static void afs_apply_status(struct afs_operation *op, } t = status->mtime_client; - inode->i_mtime = t; + inode_set_mtime_to_ts(inode, t); if (vp->update_ctime) - inode->i_ctime = op->ctime; + inode_set_ctime_to_ts(inode, op->ctime); if (vnode->status.data_version != status->data_version) data_changed = true; @@ -252,8 +252,8 @@ static void afs_apply_status(struct afs_operation *op, vnode->netfs.remote_i_size = status->size; if (change_size) { afs_set_i_size(vnode, status->size); - inode->i_ctime = t; - inode->i_atime = t; + inode_set_ctime_to_ts(inode, t); + inode_set_atime_to_ts(inode, t); } } } @@ -773,7 +773,7 @@ int afs_getattr(struct mnt_idmap *idmap, const struct path *path, do { read_seqbegin_or_lock(&vnode->cb_lock, &seq); - generic_fillattr(&nop_mnt_idmap, inode, stat); + generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat); if (test_bit(AFS_VNODE_SILLY_DELETED, &vnode->flags) && stat->nlink > 0) stat->nlink -= 1; diff --git a/fs/afs/internal.h b/fs/afs/internal.h index 9d3d64921106..7385d62c8cf5 100644 --- a/fs/afs/internal.h +++ b/fs/afs/internal.h @@ -87,7 +87,7 @@ struct afs_addr_list { enum dns_lookup_status status:8; unsigned long failed; /* Mask of addrs that failed locally/ICMP */ unsigned long responded; /* Mask of addrs that responded */ - struct sockaddr_rxrpc addrs[]; + struct sockaddr_rxrpc addrs[] __counted_by(max_addrs); #define AFS_MAX_ADDRESSES ((unsigned int)(sizeof(unsigned long) * 8)) }; @@ -553,6 +553,7 @@ struct afs_server_entry { }; struct afs_server_list { + struct rcu_head rcu; afs_volid_t vids[AFS_MAXTYPES]; /* Volume IDs */ refcount_t usage; unsigned char nr_servers; @@ -585,6 +586,7 @@ struct afs_volume { #define AFS_VOLUME_OFFLINE 4 /* - T if volume offline notice given */ #define AFS_VOLUME_BUSY 5 /* - T if volume busy notice given */ #define AFS_VOLUME_MAYBE_NO_IBULK 6 /* - T if some servers don't have InlineBulkStatus */ +#define AFS_VOLUME_RM_TREE 7 /* - Set if volume removed from cell->volumes */ #ifdef CONFIG_AFS_FSCACHE struct fscache_volume *cache; /* Caching cookie */ #endif @@ -681,6 +683,8 @@ static inline void afs_vnode_set_cache(struct afs_vnode *vnode, { #ifdef CONFIG_AFS_FSCACHE vnode->netfs.cache = cookie; + if (cookie) + mapping_set_release_always(vnode->netfs.inode.i_mapping); #endif } @@ -703,7 +707,7 @@ struct afs_permits { refcount_t usage; unsigned short nr_permits; /* Number of records */ bool invalidated; /* Invalidated due to key change */ - struct afs_permit permits[]; /* List of permits sorted by key pointer */ + struct afs_permit permits[] __counted_by(nr_permits); /* List of permits sorted by key pointer */ }; /* @@ -1510,6 +1514,7 @@ extern struct afs_vlserver_list *afs_extract_vlserver_list(struct afs_cell *, extern struct afs_volume *afs_create_volume(struct afs_fs_context *); extern int afs_activate_volume(struct afs_volume *); extern void afs_deactivate_volume(struct afs_volume *); +bool afs_try_get_volume(struct afs_volume *volume, enum afs_volume_trace reason); extern struct afs_volume *afs_get_volume(struct afs_volume *, enum afs_volume_trace); extern void afs_put_volume(struct afs_net *, struct afs_volume *, enum afs_volume_trace); extern int afs_check_volume_status(struct afs_volume *, struct afs_operation *); @@ -1539,7 +1544,7 @@ int afs_launder_folio(struct folio *); /* * xattr.c */ -extern const struct xattr_handler *afs_xattr_handlers[]; +extern const struct xattr_handler * const afs_xattr_handlers[]; /* * yfsclient.c diff --git a/fs/afs/main.c b/fs/afs/main.c index eae288c8d40a..6425c81d07de 100644 --- a/fs/afs/main.c +++ b/fs/afs/main.c @@ -41,8 +41,6 @@ const char afs_init_sysname[] = "arm_linux26"; const char afs_init_sysname[] = "aarch64_linux26"; #elif defined(CONFIG_X86_32) const char afs_init_sysname[] = "i386_linux26"; -#elif defined(CONFIG_IA64) -const char afs_init_sysname[] = "ia64_linux26"; #elif defined(CONFIG_PPC64) const char afs_init_sysname[] = "ppc64_linux26"; #elif defined(CONFIG_PPC32) diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c index ed1644e7683f..d642d06a453b 100644 --- a/fs/afs/rxrpc.c +++ b/fs/afs/rxrpc.c @@ -424,7 +424,7 @@ error_kill_call: if (call->async) { if (cancel_work_sync(&call->async_work)) afs_put_call(call); - afs_put_call(call); + afs_set_call_complete(call, ret, 0); } ac->error = ret; diff --git a/fs/afs/server_list.c b/fs/afs/server_list.c index ed9056703505..b59896b1de0a 100644 --- a/fs/afs/server_list.c +++ b/fs/afs/server_list.c @@ -17,7 +17,7 @@ void afs_put_serverlist(struct afs_net *net, struct afs_server_list *slist) for (i = 0; i < slist->nr_servers; i++) afs_unuse_server(net, slist->servers[i].server, afs_server_trace_put_slist); - kfree(slist); + kfree_rcu(slist, rcu); } } diff --git a/fs/afs/super.c b/fs/afs/super.c index 95d713074dc8..a01a0fb2cdbb 100644 --- a/fs/afs/super.c +++ b/fs/afs/super.c @@ -407,6 +407,10 @@ static int afs_validate_fc(struct fs_context *fc) return PTR_ERR(volume); ctx->volume = volume; + if (volume->type != AFSVL_RWVOL) { + ctx->flock_mode = afs_flock_mode_local; + fc->sb_flags |= SB_RDONLY; + } } return 0; diff --git a/fs/afs/vl_rotate.c b/fs/afs/vl_rotate.c index 488e58490b16..eb415ce56360 100644 --- a/fs/afs/vl_rotate.c +++ b/fs/afs/vl_rotate.c @@ -58,6 +58,12 @@ static bool afs_start_vl_iteration(struct afs_vl_cursor *vc) } /* Status load is ordered after lookup counter load */ + if (cell->dns_status == DNS_LOOKUP_GOT_NOT_FOUND) { + pr_warn("No record of cell %s\n", cell->name); + vc->error = -ENOENT; + return false; + } + if (cell->dns_source == DNS_RECORD_UNAVAILABLE) { vc->error = -EDESTADDRREQ; return false; @@ -285,6 +291,7 @@ failed: */ static void afs_vl_dump_edestaddrreq(const struct afs_vl_cursor *vc) { + struct afs_cell *cell = vc->cell; static int count; int i; @@ -294,6 +301,9 @@ static void afs_vl_dump_edestaddrreq(const struct afs_vl_cursor *vc) rcu_read_lock(); pr_notice("EDESTADDR occurred\n"); + pr_notice("CELL: %s err=%d\n", cell->name, cell->error); + pr_notice("DNS: src=%u st=%u lc=%x\n", + cell->dns_source, cell->dns_status, cell->dns_lookup_count); pr_notice("VC: ut=%lx ix=%u ni=%hu fl=%hx err=%hd\n", vc->untried, vc->index, vc->nr_iterations, vc->flags, vc->error); diff --git a/fs/afs/volume.c b/fs/afs/volume.c index 29d483c80281..115c081a8e2c 100644 --- a/fs/afs/volume.c +++ b/fs/afs/volume.c @@ -32,8 +32,13 @@ static struct afs_volume *afs_insert_volume_into_cell(struct afs_cell *cell, } else if (p->vid > volume->vid) { pp = &(*pp)->rb_right; } else { - volume = afs_get_volume(p, afs_volume_trace_get_cell_insert); - goto found; + if (afs_try_get_volume(p, afs_volume_trace_get_cell_insert)) { + volume = p; + goto found; + } + + set_bit(AFS_VOLUME_RM_TREE, &volume->flags); + rb_replace_node_rcu(&p->cell_node, &volume->cell_node, &cell->volumes); } } @@ -56,7 +61,8 @@ static void afs_remove_volume_from_cell(struct afs_volume *volume) afs_volume_trace_remove); write_seqlock(&cell->volume_lock); hlist_del_rcu(&volume->proc_link); - rb_erase(&volume->cell_node, &cell->volumes); + if (!test_and_set_bit(AFS_VOLUME_RM_TREE, &volume->flags)) + rb_erase(&volume->cell_node, &cell->volumes); write_sequnlock(&cell->volume_lock); } } @@ -232,6 +238,20 @@ static void afs_destroy_volume(struct afs_net *net, struct afs_volume *volume) } /* + * Try to get a reference on a volume record. + */ +bool afs_try_get_volume(struct afs_volume *volume, enum afs_volume_trace reason) +{ + int r; + + if (__refcount_inc_not_zero(&volume->ref, &r)) { + trace_afs_volume(volume->vid, r + 1, reason); + return true; + } + return false; +} + +/* * Get a reference on a volume record. */ struct afs_volume *afs_get_volume(struct afs_volume *volume, diff --git a/fs/afs/write.c b/fs/afs/write.c index e1c45341719b..4a168781936b 100644 --- a/fs/afs/write.c +++ b/fs/afs/write.c @@ -424,7 +424,7 @@ try_next_key: op->store.write_iter = iter; op->store.i_size = max(pos + size, vnode->netfs.remote_i_size); - op->mtime = vnode->netfs.inode.i_mtime; + op->mtime = inode_get_mtime(&vnode->netfs.inode); afs_wait_for_operation(op); diff --git a/fs/afs/xattr.c b/fs/afs/xattr.c index 9048d8ccc715..64b2c0224f62 100644 --- a/fs/afs/xattr.c +++ b/fs/afs/xattr.c @@ -353,7 +353,7 @@ static const struct xattr_handler afs_xattr_afs_volume_handler = { .get = afs_xattr_get_volume, }; -const struct xattr_handler *afs_xattr_handlers[] = { +const struct xattr_handler * const afs_xattr_handlers[] = { &afs_xattr_afs_acl_handler, &afs_xattr_afs_cell_handler, &afs_xattr_afs_fid_handler, @@ -80,7 +80,7 @@ struct aio_ring { struct kioctx_table { struct rcu_head rcu; unsigned nr; - struct kioctx __rcu *table[]; + struct kioctx __rcu *table[] __counted_by(nr); }; struct kioctx_cpu { @@ -558,7 +558,7 @@ static int aio_setup_ring(struct kioctx *ctx, unsigned int nr_events) ctx->mmap_base = do_mmap(ctx->aio_ring_file, 0, ctx->mmap_size, PROT_READ | PROT_WRITE, - MAP_SHARED, 0, &unused, NULL); + MAP_SHARED, 0, 0, &unused, NULL); mmap_write_unlock(mm); if (IS_ERR((void *)ctx->mmap_base)) { ctx->mmap_size = 0; @@ -1447,13 +1447,8 @@ static void aio_complete_rw(struct kiocb *kiocb, long res) if (kiocb->ki_flags & IOCB_WRITE) { struct inode *inode = file_inode(kiocb->ki_filp); - /* - * Tell lockdep we inherited freeze protection from submission - * thread. - */ if (S_ISREG(inode->i_mode)) - __sb_writers_acquired(inode->i_sb, SB_FREEZE_WRITE); - file_end_write(kiocb->ki_filp); + kiocb_end_write(kiocb); } iocb->ki_res.res = res; @@ -1581,17 +1576,8 @@ static int aio_write(struct kiocb *req, const struct iocb *iocb, return ret; ret = rw_verify_area(WRITE, file, &req->ki_pos, iov_iter_count(&iter)); if (!ret) { - /* - * Open-code file_start_write here to grab freeze protection, - * which will be released by another thread in - * aio_complete_rw(). Fool lockdep by telling it the lock got - * released so that it doesn't complain about the held lock when - * we return to userspace. - */ - if (S_ISREG(file_inode(file)->i_mode)) { - sb_start_write(file_inode(file)->i_sb); - __sb_writers_release(file_inode(file)->i_sb, SB_FREEZE_WRITE); - } + if (S_ISREG(file_inode(file)->i_mode)) + kiocb_start_write(req); req->ki_flags |= IOCB_WRITE; aio_rw_done(req, call_write_iter(file, req, &iter)); } diff --git a/fs/anon_inodes.c b/fs/anon_inodes.c index 24192a7667ed..d26222b7eefe 100644 --- a/fs/anon_inodes.c +++ b/fs/anon_inodes.c @@ -24,8 +24,8 @@ #include <linux/uaccess.h> -static struct vfsmount *anon_inode_mnt __read_mostly; -static struct inode *anon_inode_inode; +static struct vfsmount *anon_inode_mnt __ro_after_init; +static struct inode *anon_inode_inode __ro_after_init; /* * anon_inodefs_dname() is called from d_path(). diff --git a/fs/attr.c b/fs/attr.c index d60dc1edb526..bdf5deb06ea9 100644 --- a/fs/attr.c +++ b/fs/attr.c @@ -308,11 +308,11 @@ void setattr_copy(struct mnt_idmap *idmap, struct inode *inode, i_uid_update(idmap, attr, inode); i_gid_update(idmap, attr, inode); if (ia_valid & ATTR_ATIME) - inode->i_atime = attr->ia_atime; + inode_set_atime_to_ts(inode, attr->ia_atime); if (ia_valid & ATTR_MTIME) - inode->i_mtime = attr->ia_mtime; + inode_set_mtime_to_ts(inode, attr->ia_mtime); if (ia_valid & ATTR_CTIME) - inode->i_ctime = attr->ia_ctime; + inode_set_ctime_to_ts(inode, attr->ia_ctime); if (ia_valid & ATTR_MODE) { umode_t mode = attr->ia_mode; if (!in_group_or_capable(idmap, inode, @@ -394,9 +394,25 @@ int notify_change(struct mnt_idmap *idmap, struct dentry *dentry, return error; if ((ia_valid & ATTR_MODE)) { - umode_t amode = attr->ia_mode; + /* + * Don't allow changing the mode of symlinks: + * + * (1) The vfs doesn't take the mode of symlinks into account + * during permission checking. + * (2) This has never worked correctly. Most major filesystems + * did return EOPNOTSUPP due to interactions with POSIX ACLs + * but did still updated the mode of the symlink. + * This inconsistency led system call wrapper providers such + * as libc to block changing the mode of symlinks with + * EOPNOTSUPP already. + * (3) To even do this in the first place one would have to use + * specific file descriptors and quite some effort. + */ + if (S_ISLNK(inode->i_mode)) + return -EOPNOTSUPP; + /* Flag setting protected by i_mutex */ - if (is_sxid(amode)) + if (is_sxid(attr->ia_mode)) inode->i_flags &= ~S_NOSEC; } diff --git a/fs/autofs/autofs_i.h b/fs/autofs/autofs_i.h index d5a44fa88acf..8c1d587b3eef 100644 --- a/fs/autofs/autofs_i.h +++ b/fs/autofs/autofs_i.h @@ -25,6 +25,8 @@ #include <linux/completion.h> #include <linux/file.h> #include <linux/magic.h> +#include <linux/fs_context.h> +#include <linux/fs_parser.h> /* This is the range of ioctl() numbers we claim as ours */ #define AUTOFS_IOC_FIRST AUTOFS_IOC_READY @@ -205,20 +207,34 @@ static inline void managed_dentry_clear_managed(struct dentry *dentry) /* Initializing function */ -int autofs_fill_super(struct super_block *, void *, int); +extern const struct fs_parameter_spec autofs_param_specs[]; +int autofs_init_fs_context(struct fs_context *fc); struct autofs_info *autofs_new_ino(struct autofs_sb_info *); void autofs_clean_ino(struct autofs_info *); -static inline int autofs_prepare_pipe(struct file *pipe) +static inline int autofs_check_pipe(struct file *pipe) { if (!(pipe->f_mode & FMODE_CAN_WRITE)) return -EINVAL; if (!S_ISFIFO(file_inode(pipe)->i_mode)) return -EINVAL; + return 0; +} + +static inline void autofs_set_packet_pipe_flags(struct file *pipe) +{ /* We want a packet pipe */ pipe->f_flags |= O_DIRECT; /* We don't expect -EAGAIN */ pipe->f_flags &= ~O_NONBLOCK; +} + +static inline int autofs_prepare_pipe(struct file *pipe) +{ + int ret = autofs_check_pipe(pipe); + if (ret < 0) + return ret; + autofs_set_packet_pipe_flags(pipe); return 0; } diff --git a/fs/autofs/init.c b/fs/autofs/init.c index d3f55e874338..b5e4dfa04ed0 100644 --- a/fs/autofs/init.c +++ b/fs/autofs/init.c @@ -7,16 +7,11 @@ #include <linux/init.h> #include "autofs_i.h" -static struct dentry *autofs_mount(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data) -{ - return mount_nodev(fs_type, flags, data, autofs_fill_super); -} - struct file_system_type autofs_fs_type = { .owner = THIS_MODULE, .name = "autofs", - .mount = autofs_mount, + .init_fs_context = autofs_init_fs_context, + .parameters = autofs_param_specs, .kill_sb = autofs_kill_sb, }; MODULE_ALIAS_FS("autofs"); diff --git a/fs/autofs/inode.c b/fs/autofs/inode.c index affa70360b1f..1f5db6863663 100644 --- a/fs/autofs/inode.c +++ b/fs/autofs/inode.c @@ -6,7 +6,6 @@ #include <linux/seq_file.h> #include <linux/pagemap.h> -#include <linux/parser.h> #include "autofs_i.h" @@ -110,189 +109,179 @@ static const struct super_operations autofs_sops = { .evict_inode = autofs_evict_inode, }; -enum {Opt_err, Opt_fd, Opt_uid, Opt_gid, Opt_pgrp, Opt_minproto, Opt_maxproto, - Opt_indirect, Opt_direct, Opt_offset, Opt_strictexpire, - Opt_ignore}; - -static const match_table_t tokens = { - {Opt_fd, "fd=%u"}, - {Opt_uid, "uid=%u"}, - {Opt_gid, "gid=%u"}, - {Opt_pgrp, "pgrp=%u"}, - {Opt_minproto, "minproto=%u"}, - {Opt_maxproto, "maxproto=%u"}, - {Opt_indirect, "indirect"}, - {Opt_direct, "direct"}, - {Opt_offset, "offset"}, - {Opt_strictexpire, "strictexpire"}, - {Opt_ignore, "ignore"}, - {Opt_err, NULL} +enum { + Opt_direct, + Opt_fd, + Opt_gid, + Opt_ignore, + Opt_indirect, + Opt_maxproto, + Opt_minproto, + Opt_offset, + Opt_pgrp, + Opt_strictexpire, + Opt_uid, }; -static int parse_options(char *options, - struct inode *root, int *pgrp, bool *pgrp_set, - struct autofs_sb_info *sbi) +const struct fs_parameter_spec autofs_param_specs[] = { + fsparam_flag ("direct", Opt_direct), + fsparam_fd ("fd", Opt_fd), + fsparam_u32 ("gid", Opt_gid), + fsparam_flag ("ignore", Opt_ignore), + fsparam_flag ("indirect", Opt_indirect), + fsparam_u32 ("maxproto", Opt_maxproto), + fsparam_u32 ("minproto", Opt_minproto), + fsparam_flag ("offset", Opt_offset), + fsparam_u32 ("pgrp", Opt_pgrp), + fsparam_flag ("strictexpire", Opt_strictexpire), + fsparam_u32 ("uid", Opt_uid), + {} +}; + +struct autofs_fs_context { + kuid_t uid; + kgid_t gid; + int pgrp; + bool pgrp_set; +}; + +/* + * Open the fd. We do it here rather than in get_tree so that it's done in the + * context of the system call that passed the data and not the one that + * triggered the superblock creation, lest the fd gets reassigned. + */ +static int autofs_parse_fd(struct fs_context *fc, struct autofs_sb_info *sbi, + struct fs_parameter *param, + struct fs_parse_result *result) { - char *p; - substring_t args[MAX_OPT_ARGS]; - int option; - int pipefd = -1; - kuid_t uid; - kgid_t gid; + struct file *pipe; + int ret; - root->i_uid = current_uid(); - root->i_gid = current_gid(); + if (param->type == fs_value_is_file) { + /* came through the new api */ + pipe = param->file; + param->file = NULL; + } else { + pipe = fget(result->uint_32); + } + if (!pipe) { + errorf(fc, "could not open pipe file descriptor"); + return -EBADF; + } - sbi->min_proto = AUTOFS_MIN_PROTO_VERSION; - sbi->max_proto = AUTOFS_MAX_PROTO_VERSION; + ret = autofs_check_pipe(pipe); + if (ret < 0) { + errorf(fc, "Invalid/unusable pipe"); + if (param->type != fs_value_is_file) + fput(pipe); + return -EBADF; + } - sbi->pipefd = -1; + autofs_set_packet_pipe_flags(pipe); - if (!options) - return 1; - - while ((p = strsep(&options, ",")) != NULL) { - int token; - - if (!*p) - continue; - - token = match_token(p, tokens, args); - switch (token) { - case Opt_fd: - if (match_int(args, &pipefd)) - return 1; - sbi->pipefd = pipefd; - break; - case Opt_uid: - if (match_int(args, &option)) - return 1; - uid = make_kuid(current_user_ns(), option); - if (!uid_valid(uid)) - return 1; - root->i_uid = uid; - break; - case Opt_gid: - if (match_int(args, &option)) - return 1; - gid = make_kgid(current_user_ns(), option); - if (!gid_valid(gid)) - return 1; - root->i_gid = gid; - break; - case Opt_pgrp: - if (match_int(args, &option)) - return 1; - *pgrp = option; - *pgrp_set = true; - break; - case Opt_minproto: - if (match_int(args, &option)) - return 1; - sbi->min_proto = option; - break; - case Opt_maxproto: - if (match_int(args, &option)) - return 1; - sbi->max_proto = option; - break; - case Opt_indirect: - set_autofs_type_indirect(&sbi->type); - break; - case Opt_direct: - set_autofs_type_direct(&sbi->type); - break; - case Opt_offset: - set_autofs_type_offset(&sbi->type); - break; - case Opt_strictexpire: - sbi->flags |= AUTOFS_SBI_STRICTEXPIRE; - break; - case Opt_ignore: - sbi->flags |= AUTOFS_SBI_IGNORE; - break; - default: - return 1; - } + if (sbi->pipe) + fput(sbi->pipe); + + sbi->pipefd = result->uint_32; + sbi->pipe = pipe; + + return 0; +} + +static int autofs_parse_param(struct fs_context *fc, struct fs_parameter *param) +{ + struct autofs_fs_context *ctx = fc->fs_private; + struct autofs_sb_info *sbi = fc->s_fs_info; + struct fs_parse_result result; + kuid_t uid; + kgid_t gid; + int opt; + + opt = fs_parse(fc, autofs_param_specs, param, &result); + if (opt < 0) + return opt; + + switch (opt) { + case Opt_fd: + return autofs_parse_fd(fc, sbi, param, &result); + case Opt_uid: + uid = make_kuid(current_user_ns(), result.uint_32); + if (!uid_valid(uid)) + return invalfc(fc, "Invalid uid"); + ctx->uid = uid; + break; + case Opt_gid: + gid = make_kgid(current_user_ns(), result.uint_32); + if (!gid_valid(gid)) + return invalfc(fc, "Invalid gid"); + ctx->gid = gid; + break; + case Opt_pgrp: + ctx->pgrp = result.uint_32; + ctx->pgrp_set = true; + break; + case Opt_minproto: + sbi->min_proto = result.uint_32; + break; + case Opt_maxproto: + sbi->max_proto = result.uint_32; + break; + case Opt_indirect: + set_autofs_type_indirect(&sbi->type); + break; + case Opt_direct: + set_autofs_type_direct(&sbi->type); + break; + case Opt_offset: + set_autofs_type_offset(&sbi->type); + break; + case Opt_strictexpire: + sbi->flags |= AUTOFS_SBI_STRICTEXPIRE; + break; + case Opt_ignore: + sbi->flags |= AUTOFS_SBI_IGNORE; } - return (sbi->pipefd < 0); + + return 0; } -int autofs_fill_super(struct super_block *s, void *data, int silent) +static struct autofs_sb_info *autofs_alloc_sbi(void) { - struct inode *root_inode; - struct dentry *root; - struct file *pipe; struct autofs_sb_info *sbi; - struct autofs_info *ino; - int pgrp = 0; - bool pgrp_set = false; - int ret = -EINVAL; sbi = kzalloc(sizeof(*sbi), GFP_KERNEL); if (!sbi) - return -ENOMEM; - pr_debug("starting up, sbi = %p\n", sbi); + return NULL; - s->s_fs_info = sbi; sbi->magic = AUTOFS_SBI_MAGIC; - sbi->pipefd = -1; - sbi->pipe = NULL; - sbi->exp_timeout = 0; - sbi->oz_pgrp = NULL; - sbi->sb = s; - sbi->version = 0; - sbi->sub_version = 0; sbi->flags = AUTOFS_SBI_CATATONIC; + sbi->min_proto = AUTOFS_MIN_PROTO_VERSION; + sbi->max_proto = AUTOFS_MAX_PROTO_VERSION; + sbi->pipefd = -1; + set_autofs_type_indirect(&sbi->type); - sbi->min_proto = 0; - sbi->max_proto = 0; mutex_init(&sbi->wq_mutex); mutex_init(&sbi->pipe_mutex); spin_lock_init(&sbi->fs_lock); - sbi->queues = NULL; spin_lock_init(&sbi->lookup_lock); INIT_LIST_HEAD(&sbi->active_list); INIT_LIST_HEAD(&sbi->expiring_list); - s->s_blocksize = 1024; - s->s_blocksize_bits = 10; - s->s_magic = AUTOFS_SUPER_MAGIC; - s->s_op = &autofs_sops; - s->s_d_op = &autofs_dentry_operations; - s->s_time_gran = 1; - - /* - * Get the root inode and dentry, but defer checking for errors. - */ - ino = autofs_new_ino(sbi); - if (!ino) { - ret = -ENOMEM; - goto fail_free; - } - root_inode = autofs_get_inode(s, S_IFDIR | 0755); - root = d_make_root(root_inode); - if (!root) { - ret = -ENOMEM; - goto fail_ino; - } - pipe = NULL; - root->d_fsdata = ino; + return sbi; +} - /* Can this call block? */ - if (parse_options(data, root_inode, &pgrp, &pgrp_set, sbi)) { - pr_err("called with bogus options\n"); - goto fail_dput; - } +static int autofs_validate_protocol(struct fs_context *fc) +{ + struct autofs_sb_info *sbi = fc->s_fs_info; /* Test versions first */ if (sbi->max_proto < AUTOFS_MIN_PROTO_VERSION || sbi->min_proto > AUTOFS_MAX_PROTO_VERSION) { - pr_err("kernel does not match daemon version " + errorf(fc, "kernel does not match daemon version " "daemon (%d, %d) kernel (%d, %d)\n", sbi->min_proto, sbi->max_proto, AUTOFS_MIN_PROTO_VERSION, AUTOFS_MAX_PROTO_VERSION); - goto fail_dput; + return -EINVAL; } /* Establish highest kernel protocol version */ @@ -300,62 +289,148 @@ int autofs_fill_super(struct super_block *s, void *data, int silent) sbi->version = AUTOFS_MAX_PROTO_VERSION; else sbi->version = sbi->max_proto; - sbi->sub_version = AUTOFS_PROTO_SUBVERSION; - - if (pgrp_set) { - sbi->oz_pgrp = find_get_pid(pgrp); - if (!sbi->oz_pgrp) { - pr_err("could not find process group %d\n", - pgrp); - goto fail_dput; - } - } else { - sbi->oz_pgrp = get_task_pid(current, PIDTYPE_PGID); + + switch (sbi->version) { + case 4: + sbi->sub_version = 7; + break; + case 5: + sbi->sub_version = AUTOFS_PROTO_SUBVERSION; + break; + default: + sbi->sub_version = 0; } - if (autofs_type_trigger(sbi->type)) - __managed_dentry_set_managed(root); + return 0; +} +static int autofs_fill_super(struct super_block *s, struct fs_context *fc) +{ + struct autofs_fs_context *ctx = fc->fs_private; + struct autofs_sb_info *sbi = s->s_fs_info; + struct inode *root_inode; + struct autofs_info *ino; + + pr_debug("starting up, sbi = %p\n", sbi); + + sbi->sb = s; + s->s_blocksize = 1024; + s->s_blocksize_bits = 10; + s->s_magic = AUTOFS_SUPER_MAGIC; + s->s_op = &autofs_sops; + s->s_d_op = &autofs_dentry_operations; + s->s_time_gran = 1; + + /* + * Get the root inode and dentry, but defer checking for errors. + */ + ino = autofs_new_ino(sbi); + if (!ino) + return -ENOMEM; + + root_inode = autofs_get_inode(s, S_IFDIR | 0755); + if (!root_inode) + return -ENOMEM; + + root_inode->i_uid = ctx->uid; + root_inode->i_gid = ctx->gid; root_inode->i_fop = &autofs_root_operations; root_inode->i_op = &autofs_dir_inode_operations; + s->s_root = d_make_root(root_inode); + if (unlikely(!s->s_root)) { + autofs_free_ino(ino); + return -ENOMEM; + } + s->s_root->d_fsdata = ino; + + if (ctx->pgrp_set) { + sbi->oz_pgrp = find_get_pid(ctx->pgrp); + if (!sbi->oz_pgrp) + return invalf(fc, "Could not find process group %d", + ctx->pgrp); + } else + sbi->oz_pgrp = get_task_pid(current, PIDTYPE_PGID); + + if (autofs_type_trigger(sbi->type)) + /* s->s_root won't be contended so there's little to + * be gained by not taking the d_lock when setting + * d_flags, even when a lot mounts are being done. + */ + managed_dentry_set_managed(s->s_root); + pr_debug("pipe fd = %d, pgrp = %u\n", sbi->pipefd, pid_nr(sbi->oz_pgrp)); - pipe = fget(sbi->pipefd); - if (!pipe) { - pr_err("could not open pipe file descriptor\n"); - goto fail_put_pid; - } - ret = autofs_prepare_pipe(pipe); - if (ret < 0) - goto fail_fput; - sbi->pipe = pipe; sbi->flags &= ~AUTOFS_SBI_CATATONIC; + return 0; +} - /* - * Success! Install the root dentry now to indicate completion. - */ - s->s_root = root; +/* + * Validate the parameters and then request a superblock. + */ +static int autofs_get_tree(struct fs_context *fc) +{ + struct autofs_sb_info *sbi = fc->s_fs_info; + int ret; + + ret = autofs_validate_protocol(fc); + if (ret) + return ret; + + if (sbi->pipefd < 0) + return invalf(fc, "No control pipe specified"); + + return get_tree_nodev(fc, autofs_fill_super); +} + +static void autofs_free_fc(struct fs_context *fc) +{ + struct autofs_fs_context *ctx = fc->fs_private; + struct autofs_sb_info *sbi = fc->s_fs_info; + + if (sbi) { + if (sbi->pipe) + fput(sbi->pipe); + kfree(sbi); + } + kfree(ctx); +} + +static const struct fs_context_operations autofs_context_ops = { + .free = autofs_free_fc, + .parse_param = autofs_parse_param, + .get_tree = autofs_get_tree, +}; + +/* + * Set up the filesystem mount context. + */ +int autofs_init_fs_context(struct fs_context *fc) +{ + struct autofs_fs_context *ctx; + struct autofs_sb_info *sbi; + + ctx = kzalloc(sizeof(struct autofs_fs_context), GFP_KERNEL); + if (!ctx) + goto nomem; + + ctx->uid = current_uid(); + ctx->gid = current_gid(); + + sbi = autofs_alloc_sbi(); + if (!sbi) + goto nomem_ctx; + + fc->fs_private = ctx; + fc->s_fs_info = sbi; + fc->ops = &autofs_context_ops; return 0; - /* - * Failure ... clean up. - */ -fail_fput: - pr_err("pipe file descriptor does not contain proper ops\n"); - fput(pipe); -fail_put_pid: - put_pid(sbi->oz_pgrp); -fail_dput: - dput(root); - goto fail_free; -fail_ino: - autofs_free_ino(ino); -fail_free: - kfree(sbi); - s->s_fs_info = NULL; - return ret; +nomem_ctx: + kfree(ctx); +nomem: + return -ENOMEM; } struct inode *autofs_get_inode(struct super_block *sb, umode_t mode) @@ -370,7 +445,7 @@ struct inode *autofs_get_inode(struct super_block *sb, umode_t mode) inode->i_uid = d_inode(sb->s_root)->i_uid; inode->i_gid = d_inode(sb->s_root)->i_gid; } - inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode); + simple_inode_init_ts(inode); inode->i_ino = get_next_ino(); if (S_ISDIR(mode)) { diff --git a/fs/autofs/root.c b/fs/autofs/root.c index 93046c9dc461..530d18827e35 100644 --- a/fs/autofs/root.c +++ b/fs/autofs/root.c @@ -600,7 +600,7 @@ static int autofs_dir_symlink(struct mnt_idmap *idmap, p_ino = autofs_dentry_ino(dentry->d_parent); p_ino->count++; - dir->i_mtime = dir->i_ctime = current_time(dir); + inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir)); return 0; } @@ -633,7 +633,7 @@ static int autofs_dir_unlink(struct inode *dir, struct dentry *dentry) d_inode(dentry)->i_size = 0; clear_nlink(d_inode(dentry)); - dir->i_mtime = dir->i_ctime = current_time(dir); + inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir)); spin_lock(&sbi->lookup_lock); __autofs_add_expiring(dentry); @@ -749,7 +749,7 @@ static int autofs_dir_mkdir(struct mnt_idmap *idmap, p_ino = autofs_dentry_ino(dentry->d_parent); p_ino->count++; inc_nlink(dir); - dir->i_mtime = dir->i_ctime = current_time(dir); + inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir)); return 0; } diff --git a/fs/autofs/waitq.c b/fs/autofs/waitq.c index 54c1f8b8b075..33dd4660d82f 100644 --- a/fs/autofs/waitq.c +++ b/fs/autofs/waitq.c @@ -32,8 +32,9 @@ void autofs_catatonic_mode(struct autofs_sb_info *sbi) wq->status = -ENOENT; /* Magic is gone - report failure */ kfree(wq->name.name - wq->offset); wq->name.name = NULL; - wq->wait_ctr--; - wake_up_interruptible(&wq->queue); + wake_up(&wq->queue); + if (!--wq->wait_ctr) + kfree(wq); wq = nwq; } fput(sbi->pipe); /* Close the pipe */ diff --git a/fs/bad_inode.c b/fs/bad_inode.c index db649487d58c..316d88da2ce1 100644 --- a/fs/bad_inode.c +++ b/fs/bad_inode.c @@ -133,8 +133,7 @@ static int bad_inode_fiemap(struct inode *inode, return -EIO; } -static int bad_inode_update_time(struct inode *inode, struct timespec64 *time, - int flags) +static int bad_inode_update_time(struct inode *inode, int flags) { return -EIO; } @@ -209,8 +208,7 @@ void make_bad_inode(struct inode *inode) remove_inode_hash(inode); inode->i_mode = S_IFREG; - inode->i_atime = inode->i_mtime = inode->i_ctime = - current_time(inode); + simple_inode_init_ts(inode); inode->i_op = &bad_inode_ops; inode->i_opflags &= ~IOP_XATTR; inode->i_fop = &bad_file_ops; diff --git a/fs/bcachefs/Kconfig b/fs/bcachefs/Kconfig new file mode 100644 index 000000000000..fddc7be58022 --- /dev/null +++ b/fs/bcachefs/Kconfig @@ -0,0 +1,95 @@ + +config BCACHEFS_FS + tristate "bcachefs filesystem support (EXPERIMENTAL)" + depends on BLOCK + select EXPORTFS + select CLOSURES + select LIBCRC32C + select CRC64 + select FS_POSIX_ACL + select LZ4_COMPRESS + select LZ4_DECOMPRESS + select LZ4HC_COMPRESS + select LZ4HC_DECOMPRESS + select ZLIB_DEFLATE + select ZLIB_INFLATE + select ZSTD_COMPRESS + select ZSTD_DECOMPRESS + select CRYPTO_SHA256 + select CRYPTO_CHACHA20 + select CRYPTO_POLY1305 + select KEYS + select RAID6_PQ + select XOR_BLOCKS + select XXHASH + select SRCU + select SYMBOLIC_ERRNAME + help + The bcachefs filesystem - a modern, copy on write filesystem, with + support for multiple devices, compression, checksumming, etc. + +config BCACHEFS_QUOTA + bool "bcachefs quota support" + depends on BCACHEFS_FS + select QUOTACTL + +config BCACHEFS_ERASURE_CODING + bool "bcachefs erasure coding (RAID5/6) support (EXPERIMENTAL)" + depends on BCACHEFS_FS + select QUOTACTL + help + This enables the "erasure_code" filesysystem and inode option, which + organizes data into reed-solomon stripes instead of ordinary + replication. + + WARNING: this feature is still undergoing on disk format changes, and + should only be enabled for testing purposes. + +config BCACHEFS_POSIX_ACL + bool "bcachefs POSIX ACL support" + depends on BCACHEFS_FS + select FS_POSIX_ACL + +config BCACHEFS_DEBUG_TRANSACTIONS + bool "bcachefs runtime info" + depends on BCACHEFS_FS + help + This makes the list of running btree transactions available in debugfs. + + This is a highly useful debugging feature but does add a small amount of overhead. + +config BCACHEFS_DEBUG + bool "bcachefs debugging" + depends on BCACHEFS_FS + help + Enables many extra debugging checks and assertions. + + The resulting code will be significantly slower than normal; you + probably shouldn't select this option unless you're a developer. + +config BCACHEFS_TESTS + bool "bcachefs unit and performance tests" + depends on BCACHEFS_FS + help + Include some unit and performance tests for the core btree code + +config BCACHEFS_LOCK_TIME_STATS + bool "bcachefs lock time statistics" + depends on BCACHEFS_FS + help + Expose statistics for how long we held a lock in debugfs + +config BCACHEFS_NO_LATENCY_ACCT + bool "disable latency accounting and time stats" + depends on BCACHEFS_FS + help + This disables device latency tracking and time stats, only for performance testing + +config MEAN_AND_VARIANCE_UNIT_TEST + tristate "mean_and_variance unit tests" if !KUNIT_ALL_TESTS + depends on KUNIT + depends on BCACHEFS_FS + default KUNIT_ALL_TESTS + help + This option enables the kunit tests for mean_and_variance module. + If unsure, say N. diff --git a/fs/bcachefs/Makefile b/fs/bcachefs/Makefile new file mode 100644 index 000000000000..45b64f89258c --- /dev/null +++ b/fs/bcachefs/Makefile @@ -0,0 +1,89 @@ + +obj-$(CONFIG_BCACHEFS_FS) += bcachefs.o + +bcachefs-y := \ + acl.o \ + alloc_background.o \ + alloc_foreground.o \ + backpointers.o \ + bkey.o \ + bkey_methods.o \ + bkey_sort.o \ + bset.o \ + btree_cache.o \ + btree_gc.o \ + btree_io.o \ + btree_iter.o \ + btree_journal_iter.o \ + btree_key_cache.o \ + btree_locking.o \ + btree_trans_commit.o \ + btree_update.o \ + btree_update_interior.o \ + btree_write_buffer.o \ + buckets.o \ + buckets_waiting_for_journal.o \ + chardev.o \ + checksum.o \ + clock.o \ + compress.o \ + counters.o \ + debug.o \ + dirent.o \ + disk_groups.o \ + data_update.o \ + ec.o \ + errcode.o \ + error.o \ + extents.o \ + extent_update.o \ + fs.o \ + fs-common.o \ + fs-ioctl.o \ + fs-io.o \ + fs-io-buffered.o \ + fs-io-direct.o \ + fs-io-pagecache.o \ + fsck.o \ + inode.o \ + io_read.o \ + io_misc.o \ + io_write.o \ + journal.o \ + journal_io.o \ + journal_reclaim.o \ + journal_sb.o \ + journal_seq_blacklist.o \ + keylist.o \ + logged_ops.o \ + lru.o \ + mean_and_variance.o \ + migrate.o \ + move.o \ + movinggc.o \ + nocow_locking.o \ + opts.o \ + printbuf.o \ + quota.o \ + rebalance.o \ + recovery.o \ + reflink.o \ + replicas.o \ + sb-clean.o \ + sb-errors.o \ + sb-members.o \ + siphash.o \ + six.o \ + snapshot.o \ + subvolume.o \ + super.o \ + super-io.o \ + sysfs.o \ + tests.o \ + trace.o \ + two_state_shared_lock.o \ + util.o \ + varint.o \ + xattr.o + +obj-$(CONFIG_MEAN_AND_VARIANCE_UNIT_TEST) += mean_and_variance_test.o diff --git a/fs/bcachefs/acl.c b/fs/bcachefs/acl.c new file mode 100644 index 000000000000..f3809897f00a --- /dev/null +++ b/fs/bcachefs/acl.c @@ -0,0 +1,463 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" + +#include "acl.h" +#include "xattr.h" + +#include <linux/posix_acl.h> + +static const char * const acl_types[] = { + [ACL_USER_OBJ] = "user_obj", + [ACL_USER] = "user", + [ACL_GROUP_OBJ] = "group_obj", + [ACL_GROUP] = "group", + [ACL_MASK] = "mask", + [ACL_OTHER] = "other", + NULL, +}; + +void bch2_acl_to_text(struct printbuf *out, const void *value, size_t size) +{ + const void *p, *end = value + size; + + if (!value || + size < sizeof(bch_acl_header) || + ((bch_acl_header *)value)->a_version != cpu_to_le32(BCH_ACL_VERSION)) + return; + + p = value + sizeof(bch_acl_header); + while (p < end) { + const bch_acl_entry *in = p; + unsigned tag = le16_to_cpu(in->e_tag); + + prt_str(out, acl_types[tag]); + + switch (tag) { + case ACL_USER_OBJ: + case ACL_GROUP_OBJ: + case ACL_MASK: + case ACL_OTHER: + p += sizeof(bch_acl_entry_short); + break; + case ACL_USER: + prt_printf(out, " uid %u", le32_to_cpu(in->e_id)); + p += sizeof(bch_acl_entry); + break; + case ACL_GROUP: + prt_printf(out, " gid %u", le32_to_cpu(in->e_id)); + p += sizeof(bch_acl_entry); + break; + } + + prt_printf(out, " %o", le16_to_cpu(in->e_perm)); + + if (p != end) + prt_char(out, ' '); + } +} + +#ifdef CONFIG_BCACHEFS_POSIX_ACL + +#include "fs.h" + +#include <linux/fs.h> +#include <linux/posix_acl_xattr.h> +#include <linux/sched.h> +#include <linux/slab.h> + +static inline size_t bch2_acl_size(unsigned nr_short, unsigned nr_long) +{ + return sizeof(bch_acl_header) + + sizeof(bch_acl_entry_short) * nr_short + + sizeof(bch_acl_entry) * nr_long; +} + +static inline int acl_to_xattr_type(int type) +{ + switch (type) { + case ACL_TYPE_ACCESS: + return KEY_TYPE_XATTR_INDEX_POSIX_ACL_ACCESS; + case ACL_TYPE_DEFAULT: + return KEY_TYPE_XATTR_INDEX_POSIX_ACL_DEFAULT; + default: + BUG(); + } +} + +/* + * Convert from filesystem to in-memory representation. + */ +static struct posix_acl *bch2_acl_from_disk(struct btree_trans *trans, + const void *value, size_t size) +{ + const void *p, *end = value + size; + struct posix_acl *acl; + struct posix_acl_entry *out; + unsigned count = 0; + int ret; + + if (!value) + return NULL; + if (size < sizeof(bch_acl_header)) + goto invalid; + if (((bch_acl_header *)value)->a_version != + cpu_to_le32(BCH_ACL_VERSION)) + goto invalid; + + p = value + sizeof(bch_acl_header); + while (p < end) { + const bch_acl_entry *entry = p; + + if (p + sizeof(bch_acl_entry_short) > end) + goto invalid; + + switch (le16_to_cpu(entry->e_tag)) { + case ACL_USER_OBJ: + case ACL_GROUP_OBJ: + case ACL_MASK: + case ACL_OTHER: + p += sizeof(bch_acl_entry_short); + break; + case ACL_USER: + case ACL_GROUP: + p += sizeof(bch_acl_entry); + break; + default: + goto invalid; + } + + count++; + } + + if (p > end) + goto invalid; + + if (!count) + return NULL; + + acl = allocate_dropping_locks(trans, ret, + posix_acl_alloc(count, _gfp)); + if (!acl) + return ERR_PTR(-ENOMEM); + if (ret) { + kfree(acl); + return ERR_PTR(ret); + } + + out = acl->a_entries; + + p = value + sizeof(bch_acl_header); + while (p < end) { + const bch_acl_entry *in = p; + + out->e_tag = le16_to_cpu(in->e_tag); + out->e_perm = le16_to_cpu(in->e_perm); + + switch (out->e_tag) { + case ACL_USER_OBJ: + case ACL_GROUP_OBJ: + case ACL_MASK: + case ACL_OTHER: + p += sizeof(bch_acl_entry_short); + break; + case ACL_USER: + out->e_uid = make_kuid(&init_user_ns, + le32_to_cpu(in->e_id)); + p += sizeof(bch_acl_entry); + break; + case ACL_GROUP: + out->e_gid = make_kgid(&init_user_ns, + le32_to_cpu(in->e_id)); + p += sizeof(bch_acl_entry); + break; + } + + out++; + } + + BUG_ON(out != acl->a_entries + acl->a_count); + + return acl; +invalid: + pr_err("invalid acl entry"); + return ERR_PTR(-EINVAL); +} + +#define acl_for_each_entry(acl, acl_e) \ + for (acl_e = acl->a_entries; \ + acl_e < acl->a_entries + acl->a_count; \ + acl_e++) + +/* + * Convert from in-memory to filesystem representation. + */ +static struct bkey_i_xattr * +bch2_acl_to_xattr(struct btree_trans *trans, + const struct posix_acl *acl, + int type) +{ + struct bkey_i_xattr *xattr; + bch_acl_header *acl_header; + const struct posix_acl_entry *acl_e; + void *outptr; + unsigned nr_short = 0, nr_long = 0, acl_len, u64s; + + acl_for_each_entry(acl, acl_e) { + switch (acl_e->e_tag) { + case ACL_USER: + case ACL_GROUP: + nr_long++; + break; + case ACL_USER_OBJ: + case ACL_GROUP_OBJ: + case ACL_MASK: + case ACL_OTHER: + nr_short++; + break; + default: + return ERR_PTR(-EINVAL); + } + } + + acl_len = bch2_acl_size(nr_short, nr_long); + u64s = BKEY_U64s + xattr_val_u64s(0, acl_len); + + if (u64s > U8_MAX) + return ERR_PTR(-E2BIG); + + xattr = bch2_trans_kmalloc(trans, u64s * sizeof(u64)); + if (IS_ERR(xattr)) + return xattr; + + bkey_xattr_init(&xattr->k_i); + xattr->k.u64s = u64s; + xattr->v.x_type = acl_to_xattr_type(type); + xattr->v.x_name_len = 0; + xattr->v.x_val_len = cpu_to_le16(acl_len); + + acl_header = xattr_val(&xattr->v); + acl_header->a_version = cpu_to_le32(BCH_ACL_VERSION); + + outptr = (void *) acl_header + sizeof(*acl_header); + + acl_for_each_entry(acl, acl_e) { + bch_acl_entry *entry = outptr; + + entry->e_tag = cpu_to_le16(acl_e->e_tag); + entry->e_perm = cpu_to_le16(acl_e->e_perm); + switch (acl_e->e_tag) { + case ACL_USER: + entry->e_id = cpu_to_le32( + from_kuid(&init_user_ns, acl_e->e_uid)); + outptr += sizeof(bch_acl_entry); + break; + case ACL_GROUP: + entry->e_id = cpu_to_le32( + from_kgid(&init_user_ns, acl_e->e_gid)); + outptr += sizeof(bch_acl_entry); + break; + + case ACL_USER_OBJ: + case ACL_GROUP_OBJ: + case ACL_MASK: + case ACL_OTHER: + outptr += sizeof(bch_acl_entry_short); + break; + } + } + + BUG_ON(outptr != xattr_val(&xattr->v) + acl_len); + + return xattr; +} + +struct posix_acl *bch2_get_acl(struct mnt_idmap *idmap, + struct dentry *dentry, int type) +{ + struct bch_inode_info *inode = to_bch_ei(dentry->d_inode); + struct bch_fs *c = inode->v.i_sb->s_fs_info; + struct bch_hash_info hash = bch2_hash_info_init(c, &inode->ei_inode); + struct xattr_search_key search = X_SEARCH(acl_to_xattr_type(type), "", 0); + struct btree_trans *trans = bch2_trans_get(c); + struct btree_iter iter = { NULL }; + struct bkey_s_c_xattr xattr; + struct posix_acl *acl = NULL; + struct bkey_s_c k; + int ret; +retry: + bch2_trans_begin(trans); + + ret = bch2_hash_lookup(trans, &iter, bch2_xattr_hash_desc, + &hash, inode_inum(inode), &search, 0); + if (ret) { + if (!bch2_err_matches(ret, ENOENT)) + acl = ERR_PTR(ret); + goto out; + } + + k = bch2_btree_iter_peek_slot(&iter); + ret = bkey_err(k); + if (ret) { + acl = ERR_PTR(ret); + goto out; + } + + xattr = bkey_s_c_to_xattr(k); + acl = bch2_acl_from_disk(trans, xattr_val(xattr.v), + le16_to_cpu(xattr.v->x_val_len)); + + if (!IS_ERR(acl)) + set_cached_acl(&inode->v, type, acl); +out: + if (bch2_err_matches(PTR_ERR_OR_ZERO(acl), BCH_ERR_transaction_restart)) + goto retry; + + bch2_trans_iter_exit(trans, &iter); + bch2_trans_put(trans); + return acl; +} + +int bch2_set_acl_trans(struct btree_trans *trans, subvol_inum inum, + struct bch_inode_unpacked *inode_u, + struct posix_acl *acl, int type) +{ + struct bch_hash_info hash_info = bch2_hash_info_init(trans->c, inode_u); + int ret; + + if (type == ACL_TYPE_DEFAULT && + !S_ISDIR(inode_u->bi_mode)) + return acl ? -EACCES : 0; + + if (acl) { + struct bkey_i_xattr *xattr = + bch2_acl_to_xattr(trans, acl, type); + if (IS_ERR(xattr)) + return PTR_ERR(xattr); + + ret = bch2_hash_set(trans, bch2_xattr_hash_desc, &hash_info, + inum, &xattr->k_i, 0); + } else { + struct xattr_search_key search = + X_SEARCH(acl_to_xattr_type(type), "", 0); + + ret = bch2_hash_delete(trans, bch2_xattr_hash_desc, &hash_info, + inum, &search); + } + + return bch2_err_matches(ret, ENOENT) ? 0 : ret; +} + +int bch2_set_acl(struct mnt_idmap *idmap, + struct dentry *dentry, + struct posix_acl *_acl, int type) +{ + struct bch_inode_info *inode = to_bch_ei(dentry->d_inode); + struct bch_fs *c = inode->v.i_sb->s_fs_info; + struct btree_trans *trans = bch2_trans_get(c); + struct btree_iter inode_iter = { NULL }; + struct bch_inode_unpacked inode_u; + struct posix_acl *acl; + umode_t mode; + int ret; + + mutex_lock(&inode->ei_update_lock); +retry: + bch2_trans_begin(trans); + acl = _acl; + + ret = bch2_inode_peek(trans, &inode_iter, &inode_u, inode_inum(inode), + BTREE_ITER_INTENT); + if (ret) + goto btree_err; + + mode = inode_u.bi_mode; + + if (type == ACL_TYPE_ACCESS) { + ret = posix_acl_update_mode(idmap, &inode->v, &mode, &acl); + if (ret) + goto btree_err; + } + + ret = bch2_set_acl_trans(trans, inode_inum(inode), &inode_u, acl, type); + if (ret) + goto btree_err; + + inode_u.bi_ctime = bch2_current_time(c); + inode_u.bi_mode = mode; + + ret = bch2_inode_write(trans, &inode_iter, &inode_u) ?: + bch2_trans_commit(trans, NULL, NULL, 0); +btree_err: + bch2_trans_iter_exit(trans, &inode_iter); + + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + goto retry; + if (unlikely(ret)) + goto err; + + bch2_inode_update_after_write(trans, inode, &inode_u, + ATTR_CTIME|ATTR_MODE); + + set_cached_acl(&inode->v, type, acl); +err: + mutex_unlock(&inode->ei_update_lock); + bch2_trans_put(trans); + + return ret; +} + +int bch2_acl_chmod(struct btree_trans *trans, subvol_inum inum, + struct bch_inode_unpacked *inode, + umode_t mode, + struct posix_acl **new_acl) +{ + struct bch_hash_info hash_info = bch2_hash_info_init(trans->c, inode); + struct xattr_search_key search = X_SEARCH(KEY_TYPE_XATTR_INDEX_POSIX_ACL_ACCESS, "", 0); + struct btree_iter iter; + struct bkey_s_c_xattr xattr; + struct bkey_i_xattr *new; + struct posix_acl *acl = NULL; + struct bkey_s_c k; + int ret; + + ret = bch2_hash_lookup(trans, &iter, bch2_xattr_hash_desc, + &hash_info, inum, &search, BTREE_ITER_INTENT); + if (ret) + return bch2_err_matches(ret, ENOENT) ? 0 : ret; + + k = bch2_btree_iter_peek_slot(&iter); + ret = bkey_err(k); + if (ret) + goto err; + xattr = bkey_s_c_to_xattr(k); + + acl = bch2_acl_from_disk(trans, xattr_val(xattr.v), + le16_to_cpu(xattr.v->x_val_len)); + ret = PTR_ERR_OR_ZERO(acl); + if (IS_ERR_OR_NULL(acl)) + goto err; + + ret = allocate_dropping_locks_errcode(trans, + __posix_acl_chmod(&acl, _gfp, mode)); + if (ret) + goto err; + + new = bch2_acl_to_xattr(trans, acl, ACL_TYPE_ACCESS); + if (IS_ERR(new)) { + ret = PTR_ERR(new); + goto err; + } + + new->k.p = iter.pos; + ret = bch2_trans_update(trans, &iter, &new->k_i, 0); + *new_acl = acl; + acl = NULL; +err: + bch2_trans_iter_exit(trans, &iter); + if (!IS_ERR_OR_NULL(acl)) + kfree(acl); + return ret; +} + +#endif /* CONFIG_BCACHEFS_POSIX_ACL */ diff --git a/fs/bcachefs/acl.h b/fs/bcachefs/acl.h new file mode 100644 index 000000000000..27e7eec0f278 --- /dev/null +++ b/fs/bcachefs/acl.h @@ -0,0 +1,60 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_ACL_H +#define _BCACHEFS_ACL_H + +struct bch_inode_unpacked; +struct bch_hash_info; +struct bch_inode_info; +struct posix_acl; + +#define BCH_ACL_VERSION 0x0001 + +typedef struct { + __le16 e_tag; + __le16 e_perm; + __le32 e_id; +} bch_acl_entry; + +typedef struct { + __le16 e_tag; + __le16 e_perm; +} bch_acl_entry_short; + +typedef struct { + __le32 a_version; +} bch_acl_header; + +void bch2_acl_to_text(struct printbuf *, const void *, size_t); + +#ifdef CONFIG_BCACHEFS_POSIX_ACL + +struct posix_acl *bch2_get_acl(struct mnt_idmap *, struct dentry *, int); + +int bch2_set_acl_trans(struct btree_trans *, subvol_inum, + struct bch_inode_unpacked *, + struct posix_acl *, int); +int bch2_set_acl(struct mnt_idmap *, struct dentry *, struct posix_acl *, int); +int bch2_acl_chmod(struct btree_trans *, subvol_inum, + struct bch_inode_unpacked *, + umode_t, struct posix_acl **); + +#else + +static inline int bch2_set_acl_trans(struct btree_trans *trans, subvol_inum inum, + struct bch_inode_unpacked *inode_u, + struct posix_acl *acl, int type) +{ + return 0; +} + +static inline int bch2_acl_chmod(struct btree_trans *trans, subvol_inum inum, + struct bch_inode_unpacked *inode, + umode_t mode, + struct posix_acl **new_acl) +{ + return 0; +} + +#endif /* CONFIG_BCACHEFS_POSIX_ACL */ + +#endif /* _BCACHEFS_ACL_H */ diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c new file mode 100644 index 000000000000..1fec0e67891f --- /dev/null +++ b/fs/bcachefs/alloc_background.c @@ -0,0 +1,2159 @@ +// SPDX-License-Identifier: GPL-2.0 +#include "bcachefs.h" +#include "alloc_background.h" +#include "alloc_foreground.h" +#include "backpointers.h" +#include "btree_cache.h" +#include "btree_io.h" +#include "btree_key_cache.h" +#include "btree_update.h" +#include "btree_update_interior.h" +#include "btree_gc.h" +#include "btree_write_buffer.h" +#include "buckets.h" +#include "buckets_waiting_for_journal.h" +#include "clock.h" +#include "debug.h" +#include "ec.h" +#include "error.h" +#include "lru.h" +#include "recovery.h" +#include "trace.h" +#include "varint.h" + +#include <linux/kthread.h> +#include <linux/math64.h> +#include <linux/random.h> +#include <linux/rculist.h> +#include <linux/rcupdate.h> +#include <linux/sched/task.h> +#include <linux/sort.h> + +/* Persistent alloc info: */ + +static const unsigned BCH_ALLOC_V1_FIELD_BYTES[] = { +#define x(name, bits) [BCH_ALLOC_FIELD_V1_##name] = bits / 8, + BCH_ALLOC_FIELDS_V1() +#undef x +}; + +struct bkey_alloc_unpacked { + u64 journal_seq; + u8 gen; + u8 oldest_gen; + u8 data_type; + bool need_discard:1; + bool need_inc_gen:1; +#define x(_name, _bits) u##_bits _name; + BCH_ALLOC_FIELDS_V2() +#undef x +}; + +static inline u64 alloc_field_v1_get(const struct bch_alloc *a, + const void **p, unsigned field) +{ + unsigned bytes = BCH_ALLOC_V1_FIELD_BYTES[field]; + u64 v; + + if (!(a->fields & (1 << field))) + return 0; + + switch (bytes) { + case 1: + v = *((const u8 *) *p); + break; + case 2: + v = le16_to_cpup(*p); + break; + case 4: + v = le32_to_cpup(*p); + break; + case 8: + v = le64_to_cpup(*p); + break; + default: + BUG(); + } + + *p += bytes; + return v; +} + +static void bch2_alloc_unpack_v1(struct bkey_alloc_unpacked *out, + struct bkey_s_c k) +{ + const struct bch_alloc *in = bkey_s_c_to_alloc(k).v; + const void *d = in->data; + unsigned idx = 0; + + out->gen = in->gen; + +#define x(_name, _bits) out->_name = alloc_field_v1_get(in, &d, idx++); + BCH_ALLOC_FIELDS_V1() +#undef x +} + +static int bch2_alloc_unpack_v2(struct bkey_alloc_unpacked *out, + struct bkey_s_c k) +{ + struct bkey_s_c_alloc_v2 a = bkey_s_c_to_alloc_v2(k); + const u8 *in = a.v->data; + const u8 *end = bkey_val_end(a); + unsigned fieldnr = 0; + int ret; + u64 v; + + out->gen = a.v->gen; + out->oldest_gen = a.v->oldest_gen; + out->data_type = a.v->data_type; + +#define x(_name, _bits) \ + if (fieldnr < a.v->nr_fields) { \ + ret = bch2_varint_decode_fast(in, end, &v); \ + if (ret < 0) \ + return ret; \ + in += ret; \ + } else { \ + v = 0; \ + } \ + out->_name = v; \ + if (v != out->_name) \ + return -1; \ + fieldnr++; + + BCH_ALLOC_FIELDS_V2() +#undef x + return 0; +} + +static int bch2_alloc_unpack_v3(struct bkey_alloc_unpacked *out, + struct bkey_s_c k) +{ + struct bkey_s_c_alloc_v3 a = bkey_s_c_to_alloc_v3(k); + const u8 *in = a.v->data; + const u8 *end = bkey_val_end(a); + unsigned fieldnr = 0; + int ret; + u64 v; + + out->gen = a.v->gen; + out->oldest_gen = a.v->oldest_gen; + out->data_type = a.v->data_type; + out->need_discard = BCH_ALLOC_V3_NEED_DISCARD(a.v); + out->need_inc_gen = BCH_ALLOC_V3_NEED_INC_GEN(a.v); + out->journal_seq = le64_to_cpu(a.v->journal_seq); + +#define x(_name, _bits) \ + if (fieldnr < a.v->nr_fields) { \ + ret = bch2_varint_decode_fast(in, end, &v); \ + if (ret < 0) \ + return ret; \ + in += ret; \ + } else { \ + v = 0; \ + } \ + out->_name = v; \ + if (v != out->_name) \ + return -1; \ + fieldnr++; + + BCH_ALLOC_FIELDS_V2() +#undef x + return 0; +} + +static struct bkey_alloc_unpacked bch2_alloc_unpack(struct bkey_s_c k) +{ + struct bkey_alloc_unpacked ret = { .gen = 0 }; + + switch (k.k->type) { + case KEY_TYPE_alloc: + bch2_alloc_unpack_v1(&ret, k); + break; + case KEY_TYPE_alloc_v2: + bch2_alloc_unpack_v2(&ret, k); + break; + case KEY_TYPE_alloc_v3: + bch2_alloc_unpack_v3(&ret, k); + break; + } + + return ret; +} + +static unsigned bch_alloc_v1_val_u64s(const struct bch_alloc *a) +{ + unsigned i, bytes = offsetof(struct bch_alloc, data); + + for (i = 0; i < ARRAY_SIZE(BCH_ALLOC_V1_FIELD_BYTES); i++) + if (a->fields & (1 << i)) + bytes += BCH_ALLOC_V1_FIELD_BYTES[i]; + + return DIV_ROUND_UP(bytes, sizeof(u64)); +} + +int bch2_alloc_v1_invalid(struct bch_fs *c, struct bkey_s_c k, + enum bkey_invalid_flags flags, + struct printbuf *err) +{ + struct bkey_s_c_alloc a = bkey_s_c_to_alloc(k); + int ret = 0; + + /* allow for unknown fields */ + bkey_fsck_err_on(bkey_val_u64s(a.k) < bch_alloc_v1_val_u64s(a.v), c, err, + alloc_v1_val_size_bad, + "incorrect value size (%zu < %u)", + bkey_val_u64s(a.k), bch_alloc_v1_val_u64s(a.v)); +fsck_err: + return ret; +} + +int bch2_alloc_v2_invalid(struct bch_fs *c, struct bkey_s_c k, + enum bkey_invalid_flags flags, + struct printbuf *err) +{ + struct bkey_alloc_unpacked u; + int ret = 0; + + bkey_fsck_err_on(bch2_alloc_unpack_v2(&u, k), c, err, + alloc_v2_unpack_error, + "unpack error"); +fsck_err: + return ret; +} + +int bch2_alloc_v3_invalid(struct bch_fs *c, struct bkey_s_c k, + enum bkey_invalid_flags flags, + struct printbuf *err) +{ + struct bkey_alloc_unpacked u; + int ret = 0; + + bkey_fsck_err_on(bch2_alloc_unpack_v3(&u, k), c, err, + alloc_v2_unpack_error, + "unpack error"); +fsck_err: + return ret; +} + +int bch2_alloc_v4_invalid(struct bch_fs *c, struct bkey_s_c k, + enum bkey_invalid_flags flags, struct printbuf *err) +{ + struct bkey_s_c_alloc_v4 a = bkey_s_c_to_alloc_v4(k); + int ret = 0; + + bkey_fsck_err_on(alloc_v4_u64s(a.v) > bkey_val_u64s(k.k), c, err, + alloc_v4_val_size_bad, + "bad val size (%u > %zu)", + alloc_v4_u64s(a.v), bkey_val_u64s(k.k)); + + bkey_fsck_err_on(!BCH_ALLOC_V4_BACKPOINTERS_START(a.v) && + BCH_ALLOC_V4_NR_BACKPOINTERS(a.v), c, err, + alloc_v4_backpointers_start_bad, + "invalid backpointers_start"); + + bkey_fsck_err_on(alloc_data_type(*a.v, a.v->data_type) != a.v->data_type, c, err, + alloc_key_data_type_bad, + "invalid data type (got %u should be %u)", + a.v->data_type, alloc_data_type(*a.v, a.v->data_type)); + + switch (a.v->data_type) { + case BCH_DATA_free: + case BCH_DATA_need_gc_gens: + case BCH_DATA_need_discard: + bkey_fsck_err_on(a.v->dirty_sectors || + a.v->cached_sectors || + a.v->stripe, c, err, + alloc_key_empty_but_have_data, + "empty data type free but have data"); + break; + case BCH_DATA_sb: + case BCH_DATA_journal: + case BCH_DATA_btree: + case BCH_DATA_user: + case BCH_DATA_parity: + bkey_fsck_err_on(!a.v->dirty_sectors, c, err, + alloc_key_dirty_sectors_0, + "data_type %s but dirty_sectors==0", + bch2_data_types[a.v->data_type]); + break; + case BCH_DATA_cached: + bkey_fsck_err_on(!a.v->cached_sectors || + a.v->dirty_sectors || + a.v->stripe, c, err, + alloc_key_cached_inconsistency, + "data type inconsistency"); + + bkey_fsck_err_on(!a.v->io_time[READ] && + c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_to_lru_refs, + c, err, + alloc_key_cached_but_read_time_zero, + "cached bucket with read_time == 0"); + break; + case BCH_DATA_stripe: + break; + } +fsck_err: + return ret; +} + +void bch2_alloc_v4_swab(struct bkey_s k) +{ + struct bch_alloc_v4 *a = bkey_s_to_alloc_v4(k).v; + struct bch_backpointer *bp, *bps; + + a->journal_seq = swab64(a->journal_seq); + a->flags = swab32(a->flags); + a->dirty_sectors = swab32(a->dirty_sectors); + a->cached_sectors = swab32(a->cached_sectors); + a->io_time[0] = swab64(a->io_time[0]); + a->io_time[1] = swab64(a->io_time[1]); + a->stripe = swab32(a->stripe); + a->nr_external_backpointers = swab32(a->nr_external_backpointers); + a->fragmentation_lru = swab64(a->fragmentation_lru); + + bps = alloc_v4_backpointers(a); + for (bp = bps; bp < bps + BCH_ALLOC_V4_NR_BACKPOINTERS(a); bp++) { + bp->bucket_offset = swab40(bp->bucket_offset); + bp->bucket_len = swab32(bp->bucket_len); + bch2_bpos_swab(&bp->pos); + } +} + +void bch2_alloc_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k) +{ + struct bch_alloc_v4 _a; + const struct bch_alloc_v4 *a = bch2_alloc_to_v4(k, &_a); + unsigned i; + + prt_newline(out); + printbuf_indent_add(out, 2); + + prt_printf(out, "gen %u oldest_gen %u data_type %s", + a->gen, a->oldest_gen, + a->data_type < BCH_DATA_NR + ? bch2_data_types[a->data_type] + : "(invalid data type)"); + prt_newline(out); + prt_printf(out, "journal_seq %llu", a->journal_seq); + prt_newline(out); + prt_printf(out, "need_discard %llu", BCH_ALLOC_V4_NEED_DISCARD(a)); + prt_newline(out); + prt_printf(out, "need_inc_gen %llu", BCH_ALLOC_V4_NEED_INC_GEN(a)); + prt_newline(out); + prt_printf(out, "dirty_sectors %u", a->dirty_sectors); + prt_newline(out); + prt_printf(out, "cached_sectors %u", a->cached_sectors); + prt_newline(out); + prt_printf(out, "stripe %u", a->stripe); + prt_newline(out); + prt_printf(out, "stripe_redundancy %u", a->stripe_redundancy); + prt_newline(out); + prt_printf(out, "io_time[READ] %llu", a->io_time[READ]); + prt_newline(out); + prt_printf(out, "io_time[WRITE] %llu", a->io_time[WRITE]); + prt_newline(out); + prt_printf(out, "fragmentation %llu", a->fragmentation_lru); + prt_newline(out); + prt_printf(out, "bp_start %llu", BCH_ALLOC_V4_BACKPOINTERS_START(a)); + prt_newline(out); + + if (BCH_ALLOC_V4_NR_BACKPOINTERS(a)) { + struct bkey_s_c_alloc_v4 a_raw = bkey_s_c_to_alloc_v4(k); + const struct bch_backpointer *bps = alloc_v4_backpointers_c(a_raw.v); + + prt_printf(out, "backpointers: %llu", BCH_ALLOC_V4_NR_BACKPOINTERS(a_raw.v)); + printbuf_indent_add(out, 2); + + for (i = 0; i < BCH_ALLOC_V4_NR_BACKPOINTERS(a_raw.v); i++) { + prt_newline(out); + bch2_backpointer_to_text(out, &bps[i]); + } + + printbuf_indent_sub(out, 2); + } + + printbuf_indent_sub(out, 2); +} + +void __bch2_alloc_to_v4(struct bkey_s_c k, struct bch_alloc_v4 *out) +{ + if (k.k->type == KEY_TYPE_alloc_v4) { + void *src, *dst; + + *out = *bkey_s_c_to_alloc_v4(k).v; + + src = alloc_v4_backpointers(out); + SET_BCH_ALLOC_V4_BACKPOINTERS_START(out, BCH_ALLOC_V4_U64s); + dst = alloc_v4_backpointers(out); + + if (src < dst) + memset(src, 0, dst - src); + + SET_BCH_ALLOC_V4_NR_BACKPOINTERS(out, 0); + } else { + struct bkey_alloc_unpacked u = bch2_alloc_unpack(k); + + *out = (struct bch_alloc_v4) { + .journal_seq = u.journal_seq, + .flags = u.need_discard, + .gen = u.gen, + .oldest_gen = u.oldest_gen, + .data_type = u.data_type, + .stripe_redundancy = u.stripe_redundancy, + .dirty_sectors = u.dirty_sectors, + .cached_sectors = u.cached_sectors, + .io_time[READ] = u.read_time, + .io_time[WRITE] = u.write_time, + .stripe = u.stripe, + }; + + SET_BCH_ALLOC_V4_BACKPOINTERS_START(out, BCH_ALLOC_V4_U64s); + } +} + +static noinline struct bkey_i_alloc_v4 * +__bch2_alloc_to_v4_mut(struct btree_trans *trans, struct bkey_s_c k) +{ + struct bkey_i_alloc_v4 *ret; + + ret = bch2_trans_kmalloc(trans, max(bkey_bytes(k.k), sizeof(struct bkey_i_alloc_v4))); + if (IS_ERR(ret)) + return ret; + + if (k.k->type == KEY_TYPE_alloc_v4) { + void *src, *dst; + + bkey_reassemble(&ret->k_i, k); + + src = alloc_v4_backpointers(&ret->v); + SET_BCH_ALLOC_V4_BACKPOINTERS_START(&ret->v, BCH_ALLOC_V4_U64s); + dst = alloc_v4_backpointers(&ret->v); + + if (src < dst) + memset(src, 0, dst - src); + + SET_BCH_ALLOC_V4_NR_BACKPOINTERS(&ret->v, 0); + set_alloc_v4_u64s(ret); + } else { + bkey_alloc_v4_init(&ret->k_i); + ret->k.p = k.k->p; + bch2_alloc_to_v4(k, &ret->v); + } + return ret; +} + +static inline struct bkey_i_alloc_v4 *bch2_alloc_to_v4_mut_inlined(struct btree_trans *trans, struct bkey_s_c k) +{ + struct bkey_s_c_alloc_v4 a; + + if (likely(k.k->type == KEY_TYPE_alloc_v4) && + ((a = bkey_s_c_to_alloc_v4(k), true) && + BCH_ALLOC_V4_NR_BACKPOINTERS(a.v) == 0)) + return bch2_bkey_make_mut_noupdate_typed(trans, k, alloc_v4); + + return __bch2_alloc_to_v4_mut(trans, k); +} + +struct bkey_i_alloc_v4 *bch2_alloc_to_v4_mut(struct btree_trans *trans, struct bkey_s_c k) +{ + return bch2_alloc_to_v4_mut_inlined(trans, k); +} + +struct bkey_i_alloc_v4 * +bch2_trans_start_alloc_update(struct btree_trans *trans, struct btree_iter *iter, + struct bpos pos) +{ + struct bkey_s_c k; + struct bkey_i_alloc_v4 *a; + int ret; + + k = bch2_bkey_get_iter(trans, iter, BTREE_ID_alloc, pos, + BTREE_ITER_WITH_UPDATES| + BTREE_ITER_CACHED| + BTREE_ITER_INTENT); + ret = bkey_err(k); + if (unlikely(ret)) + return ERR_PTR(ret); + + a = bch2_alloc_to_v4_mut_inlined(trans, k); + ret = PTR_ERR_OR_ZERO(a); + if (unlikely(ret)) + goto err; + return a; +err: + bch2_trans_iter_exit(trans, iter); + return ERR_PTR(ret); +} + +static struct bpos alloc_gens_pos(struct bpos pos, unsigned *offset) +{ + *offset = pos.offset & KEY_TYPE_BUCKET_GENS_MASK; + + pos.offset >>= KEY_TYPE_BUCKET_GENS_BITS; + return pos; +} + +static struct bpos bucket_gens_pos_to_alloc(struct bpos pos, unsigned offset) +{ + pos.offset <<= KEY_TYPE_BUCKET_GENS_BITS; + pos.offset += offset; + return pos; +} + +static unsigned alloc_gen(struct bkey_s_c k, unsigned offset) +{ + return k.k->type == KEY_TYPE_bucket_gens + ? bkey_s_c_to_bucket_gens(k).v->gens[offset] + : 0; +} + +int bch2_bucket_gens_invalid(struct bch_fs *c, struct bkey_s_c k, + enum bkey_invalid_flags flags, + struct printbuf *err) +{ + int ret = 0; + + bkey_fsck_err_on(bkey_val_bytes(k.k) != sizeof(struct bch_bucket_gens), c, err, + bucket_gens_val_size_bad, + "bad val size (%zu != %zu)", + bkey_val_bytes(k.k), sizeof(struct bch_bucket_gens)); +fsck_err: + return ret; +} + +void bch2_bucket_gens_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k) +{ + struct bkey_s_c_bucket_gens g = bkey_s_c_to_bucket_gens(k); + unsigned i; + + for (i = 0; i < ARRAY_SIZE(g.v->gens); i++) { + if (i) + prt_char(out, ' '); + prt_printf(out, "%u", g.v->gens[i]); + } +} + +int bch2_bucket_gens_init(struct bch_fs *c) +{ + struct btree_trans *trans = bch2_trans_get(c); + struct btree_iter iter; + struct bkey_s_c k; + struct bch_alloc_v4 a; + struct bkey_i_bucket_gens g; + bool have_bucket_gens_key = false; + unsigned offset; + struct bpos pos; + u8 gen; + int ret; + + for_each_btree_key(trans, iter, BTREE_ID_alloc, POS_MIN, + BTREE_ITER_PREFETCH, k, ret) { + /* + * Not a fsck error because this is checked/repaired by + * bch2_check_alloc_key() which runs later: + */ + if (!bch2_dev_bucket_exists(c, k.k->p)) + continue; + + gen = bch2_alloc_to_v4(k, &a)->gen; + pos = alloc_gens_pos(iter.pos, &offset); + + if (have_bucket_gens_key && bkey_cmp(iter.pos, pos)) { + ret = commit_do(trans, NULL, NULL, + BTREE_INSERT_NOFAIL| + BTREE_INSERT_LAZY_RW, + bch2_btree_insert_trans(trans, BTREE_ID_bucket_gens, &g.k_i, 0)); + if (ret) + break; + have_bucket_gens_key = false; + } + + if (!have_bucket_gens_key) { + bkey_bucket_gens_init(&g.k_i); + g.k.p = pos; + have_bucket_gens_key = true; + } + + g.v.gens[offset] = gen; + } + bch2_trans_iter_exit(trans, &iter); + + if (have_bucket_gens_key && !ret) + ret = commit_do(trans, NULL, NULL, + BTREE_INSERT_NOFAIL| + BTREE_INSERT_LAZY_RW, + bch2_btree_insert_trans(trans, BTREE_ID_bucket_gens, &g.k_i, 0)); + + bch2_trans_put(trans); + + if (ret) + bch_err_fn(c, ret); + return ret; +} + +int bch2_alloc_read(struct bch_fs *c) +{ + struct btree_trans *trans = bch2_trans_get(c); + struct btree_iter iter; + struct bkey_s_c k; + struct bch_dev *ca; + int ret; + + down_read(&c->gc_lock); + + if (c->sb.version_upgrade_complete >= bcachefs_metadata_version_bucket_gens) { + const struct bch_bucket_gens *g; + u64 b; + + for_each_btree_key(trans, iter, BTREE_ID_bucket_gens, POS_MIN, + BTREE_ITER_PREFETCH, k, ret) { + u64 start = bucket_gens_pos_to_alloc(k.k->p, 0).offset; + u64 end = bucket_gens_pos_to_alloc(bpos_nosnap_successor(k.k->p), 0).offset; + + if (k.k->type != KEY_TYPE_bucket_gens) + continue; + + g = bkey_s_c_to_bucket_gens(k).v; + + /* + * Not a fsck error because this is checked/repaired by + * bch2_check_alloc_key() which runs later: + */ + if (!bch2_dev_exists2(c, k.k->p.inode)) + continue; + + ca = bch_dev_bkey_exists(c, k.k->p.inode); + + for (b = max_t(u64, ca->mi.first_bucket, start); + b < min_t(u64, ca->mi.nbuckets, end); + b++) + *bucket_gen(ca, b) = g->gens[b & KEY_TYPE_BUCKET_GENS_MASK]; + } + bch2_trans_iter_exit(trans, &iter); + } else { + struct bch_alloc_v4 a; + + for_each_btree_key(trans, iter, BTREE_ID_alloc, POS_MIN, + BTREE_ITER_PREFETCH, k, ret) { + /* + * Not a fsck error because this is checked/repaired by + * bch2_check_alloc_key() which runs later: + */ + if (!bch2_dev_bucket_exists(c, k.k->p)) + continue; + + ca = bch_dev_bkey_exists(c, k.k->p.inode); + + *bucket_gen(ca, k.k->p.offset) = bch2_alloc_to_v4(k, &a)->gen; + } + bch2_trans_iter_exit(trans, &iter); + } + + bch2_trans_put(trans); + up_read(&c->gc_lock); + + if (ret) + bch_err_fn(c, ret); + + return ret; +} + +/* Free space/discard btree: */ + +static int bch2_bucket_do_index(struct btree_trans *trans, + struct bkey_s_c alloc_k, + const struct bch_alloc_v4 *a, + bool set) +{ + struct bch_fs *c = trans->c; + struct bch_dev *ca = bch_dev_bkey_exists(c, alloc_k.k->p.inode); + struct btree_iter iter; + struct bkey_s_c old; + struct bkey_i *k; + enum btree_id btree; + enum bch_bkey_type old_type = !set ? KEY_TYPE_set : KEY_TYPE_deleted; + enum bch_bkey_type new_type = set ? KEY_TYPE_set : KEY_TYPE_deleted; + struct printbuf buf = PRINTBUF; + int ret; + + if (a->data_type != BCH_DATA_free && + a->data_type != BCH_DATA_need_discard) + return 0; + + k = bch2_trans_kmalloc_nomemzero(trans, sizeof(*k)); + if (IS_ERR(k)) + return PTR_ERR(k); + + bkey_init(&k->k); + k->k.type = new_type; + + switch (a->data_type) { + case BCH_DATA_free: + btree = BTREE_ID_freespace; + k->k.p = alloc_freespace_pos(alloc_k.k->p, *a); + bch2_key_resize(&k->k, 1); + break; + case BCH_DATA_need_discard: + btree = BTREE_ID_need_discard; + k->k.p = alloc_k.k->p; + break; + default: + return 0; + } + + old = bch2_bkey_get_iter(trans, &iter, btree, + bkey_start_pos(&k->k), + BTREE_ITER_INTENT); + ret = bkey_err(old); + if (ret) + return ret; + + if (ca->mi.freespace_initialized && + c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_info && + bch2_trans_inconsistent_on(old.k->type != old_type, trans, + "incorrect key when %s %s:%llu:%llu:0 (got %s should be %s)\n" + " for %s", + set ? "setting" : "clearing", + bch2_btree_id_str(btree), + iter.pos.inode, + iter.pos.offset, + bch2_bkey_types[old.k->type], + bch2_bkey_types[old_type], + (bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf))) { + ret = -EIO; + goto err; + } + + ret = bch2_trans_update(trans, &iter, k, 0); +err: + bch2_trans_iter_exit(trans, &iter); + printbuf_exit(&buf); + return ret; +} + +static noinline int bch2_bucket_gen_update(struct btree_trans *trans, + struct bpos bucket, u8 gen) +{ + struct btree_iter iter; + unsigned offset; + struct bpos pos = alloc_gens_pos(bucket, &offset); + struct bkey_i_bucket_gens *g; + struct bkey_s_c k; + int ret; + + g = bch2_trans_kmalloc(trans, sizeof(*g)); + ret = PTR_ERR_OR_ZERO(g); + if (ret) + return ret; + + k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_bucket_gens, pos, + BTREE_ITER_INTENT| + BTREE_ITER_WITH_UPDATES); + ret = bkey_err(k); + if (ret) + return ret; + + if (k.k->type != KEY_TYPE_bucket_gens) { + bkey_bucket_gens_init(&g->k_i); + g->k.p = iter.pos; + } else { + bkey_reassemble(&g->k_i, k); + } + + g->v.gens[offset] = gen; + + ret = bch2_trans_update(trans, &iter, &g->k_i, 0); + bch2_trans_iter_exit(trans, &iter); + return ret; +} + +int bch2_trans_mark_alloc(struct btree_trans *trans, + enum btree_id btree_id, unsigned level, + struct bkey_s_c old, struct bkey_i *new, + unsigned flags) +{ + struct bch_fs *c = trans->c; + struct bch_alloc_v4 old_a_convert, *new_a; + const struct bch_alloc_v4 *old_a; + u64 old_lru, new_lru; + int ret = 0; + + /* + * Deletion only happens in the device removal path, with + * BTREE_TRIGGER_NORUN: + */ + BUG_ON(new->k.type != KEY_TYPE_alloc_v4); + + old_a = bch2_alloc_to_v4(old, &old_a_convert); + new_a = &bkey_i_to_alloc_v4(new)->v; + + new_a->data_type = alloc_data_type(*new_a, new_a->data_type); + + if (new_a->dirty_sectors > old_a->dirty_sectors || + new_a->cached_sectors > old_a->cached_sectors) { + new_a->io_time[READ] = max_t(u64, 1, atomic64_read(&c->io_clock[READ].now)); + new_a->io_time[WRITE]= max_t(u64, 1, atomic64_read(&c->io_clock[WRITE].now)); + SET_BCH_ALLOC_V4_NEED_INC_GEN(new_a, true); + SET_BCH_ALLOC_V4_NEED_DISCARD(new_a, true); + } + + if (data_type_is_empty(new_a->data_type) && + BCH_ALLOC_V4_NEED_INC_GEN(new_a) && + !bch2_bucket_is_open_safe(c, new->k.p.inode, new->k.p.offset)) { + new_a->gen++; + SET_BCH_ALLOC_V4_NEED_INC_GEN(new_a, false); + } + + if (old_a->data_type != new_a->data_type || + (new_a->data_type == BCH_DATA_free && + alloc_freespace_genbits(*old_a) != alloc_freespace_genbits(*new_a))) { + ret = bch2_bucket_do_index(trans, old, old_a, false) ?: + bch2_bucket_do_index(trans, bkey_i_to_s_c(new), new_a, true); + if (ret) + return ret; + } + + if (new_a->data_type == BCH_DATA_cached && + !new_a->io_time[READ]) + new_a->io_time[READ] = max_t(u64, 1, atomic64_read(&c->io_clock[READ].now)); + + old_lru = alloc_lru_idx_read(*old_a); + new_lru = alloc_lru_idx_read(*new_a); + + if (old_lru != new_lru) { + ret = bch2_lru_change(trans, new->k.p.inode, + bucket_to_u64(new->k.p), + old_lru, new_lru); + if (ret) + return ret; + } + + new_a->fragmentation_lru = alloc_lru_idx_fragmentation(*new_a, + bch_dev_bkey_exists(c, new->k.p.inode)); + + if (old_a->fragmentation_lru != new_a->fragmentation_lru) { + ret = bch2_lru_change(trans, + BCH_LRU_FRAGMENTATION_START, + bucket_to_u64(new->k.p), + old_a->fragmentation_lru, new_a->fragmentation_lru); + if (ret) + return ret; + } + + if (old_a->gen != new_a->gen) { + ret = bch2_bucket_gen_update(trans, new->k.p, new_a->gen); + if (ret) + return ret; + } + + return 0; +} + +/* + * This synthesizes deleted extents for holes, similar to BTREE_ITER_SLOTS for + * extents style btrees, but works on non-extents btrees: + */ +static struct bkey_s_c bch2_get_key_or_hole(struct btree_iter *iter, struct bpos end, struct bkey *hole) +{ + struct bkey_s_c k = bch2_btree_iter_peek_slot(iter); + + if (bkey_err(k)) + return k; + + if (k.k->type) { + return k; + } else { + struct btree_iter iter2; + struct bpos next; + + bch2_trans_copy_iter(&iter2, iter); + + if (!bpos_eq(iter->path->l[0].b->key.k.p, SPOS_MAX)) + end = bkey_min(end, bpos_nosnap_successor(iter->path->l[0].b->key.k.p)); + + end = bkey_min(end, POS(iter->pos.inode, iter->pos.offset + U32_MAX - 1)); + + /* + * btree node min/max is a closed interval, upto takes a half + * open interval: + */ + k = bch2_btree_iter_peek_upto(&iter2, end); + next = iter2.pos; + bch2_trans_iter_exit(iter->trans, &iter2); + + BUG_ON(next.offset >= iter->pos.offset + U32_MAX); + + if (bkey_err(k)) + return k; + + bkey_init(hole); + hole->p = iter->pos; + + bch2_key_resize(hole, next.offset - iter->pos.offset); + return (struct bkey_s_c) { hole, NULL }; + } +} + +static bool next_bucket(struct bch_fs *c, struct bpos *bucket) +{ + struct bch_dev *ca; + unsigned iter; + + if (bch2_dev_bucket_exists(c, *bucket)) + return true; + + if (bch2_dev_exists2(c, bucket->inode)) { + ca = bch_dev_bkey_exists(c, bucket->inode); + + if (bucket->offset < ca->mi.first_bucket) { + bucket->offset = ca->mi.first_bucket; + return true; + } + + bucket->inode++; + bucket->offset = 0; + } + + rcu_read_lock(); + iter = bucket->inode; + ca = __bch2_next_dev(c, &iter, NULL); + if (ca) + *bucket = POS(ca->dev_idx, ca->mi.first_bucket); + rcu_read_unlock(); + + return ca != NULL; +} + +static struct bkey_s_c bch2_get_key_or_real_bucket_hole(struct btree_iter *iter, struct bkey *hole) +{ + struct bch_fs *c = iter->trans->c; + struct bkey_s_c k; +again: + k = bch2_get_key_or_hole(iter, POS_MAX, hole); + if (bkey_err(k)) + return k; + + if (!k.k->type) { + struct bpos bucket = bkey_start_pos(k.k); + + if (!bch2_dev_bucket_exists(c, bucket)) { + if (!next_bucket(c, &bucket)) + return bkey_s_c_null; + + bch2_btree_iter_set_pos(iter, bucket); + goto again; + } + + if (!bch2_dev_bucket_exists(c, k.k->p)) { + struct bch_dev *ca = bch_dev_bkey_exists(c, bucket.inode); + + bch2_key_resize(hole, ca->mi.nbuckets - bucket.offset); + } + } + + return k; +} + +static noinline_for_stack +int bch2_check_alloc_key(struct btree_trans *trans, + struct bkey_s_c alloc_k, + struct btree_iter *alloc_iter, + struct btree_iter *discard_iter, + struct btree_iter *freespace_iter, + struct btree_iter *bucket_gens_iter) +{ + struct bch_fs *c = trans->c; + struct bch_dev *ca; + struct bch_alloc_v4 a_convert; + const struct bch_alloc_v4 *a; + unsigned discard_key_type, freespace_key_type; + unsigned gens_offset; + struct bkey_s_c k; + struct printbuf buf = PRINTBUF; + int ret; + + if (fsck_err_on(!bch2_dev_bucket_exists(c, alloc_k.k->p), c, + alloc_key_to_missing_dev_bucket, + "alloc key for invalid device:bucket %llu:%llu", + alloc_k.k->p.inode, alloc_k.k->p.offset)) + return bch2_btree_delete_at(trans, alloc_iter, 0); + + ca = bch_dev_bkey_exists(c, alloc_k.k->p.inode); + if (!ca->mi.freespace_initialized) + return 0; + + a = bch2_alloc_to_v4(alloc_k, &a_convert); + + discard_key_type = a->data_type == BCH_DATA_need_discard ? KEY_TYPE_set : 0; + bch2_btree_iter_set_pos(discard_iter, alloc_k.k->p); + k = bch2_btree_iter_peek_slot(discard_iter); + ret = bkey_err(k); + if (ret) + goto err; + + if (k.k->type != discard_key_type && + (c->opts.reconstruct_alloc || + fsck_err(c, need_discard_key_wrong, + "incorrect key in need_discard btree (got %s should be %s)\n" + " %s", + bch2_bkey_types[k.k->type], + bch2_bkey_types[discard_key_type], + (bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf)))) { + struct bkey_i *update = + bch2_trans_kmalloc(trans, sizeof(*update)); + + ret = PTR_ERR_OR_ZERO(update); + if (ret) + goto err; + + bkey_init(&update->k); + update->k.type = discard_key_type; + update->k.p = discard_iter->pos; + + ret = bch2_trans_update(trans, discard_iter, update, 0); + if (ret) + goto err; + } + + freespace_key_type = a->data_type == BCH_DATA_free ? KEY_TYPE_set : 0; + bch2_btree_iter_set_pos(freespace_iter, alloc_freespace_pos(alloc_k.k->p, *a)); + k = bch2_btree_iter_peek_slot(freespace_iter); + ret = bkey_err(k); + if (ret) + goto err; + + if (k.k->type != freespace_key_type && + (c->opts.reconstruct_alloc || + fsck_err(c, freespace_key_wrong, + "incorrect key in freespace btree (got %s should be %s)\n" + " %s", + bch2_bkey_types[k.k->type], + bch2_bkey_types[freespace_key_type], + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf)))) { + struct bkey_i *update = + bch2_trans_kmalloc(trans, sizeof(*update)); + + ret = PTR_ERR_OR_ZERO(update); + if (ret) + goto err; + + bkey_init(&update->k); + update->k.type = freespace_key_type; + update->k.p = freespace_iter->pos; + bch2_key_resize(&update->k, 1); + + ret = bch2_trans_update(trans, freespace_iter, update, 0); + if (ret) + goto err; + } + + bch2_btree_iter_set_pos(bucket_gens_iter, alloc_gens_pos(alloc_k.k->p, &gens_offset)); + k = bch2_btree_iter_peek_slot(bucket_gens_iter); + ret = bkey_err(k); + if (ret) + goto err; + + if (a->gen != alloc_gen(k, gens_offset) && + (c->opts.reconstruct_alloc || + fsck_err(c, bucket_gens_key_wrong, + "incorrect gen in bucket_gens btree (got %u should be %u)\n" + " %s", + alloc_gen(k, gens_offset), a->gen, + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf)))) { + struct bkey_i_bucket_gens *g = + bch2_trans_kmalloc(trans, sizeof(*g)); + + ret = PTR_ERR_OR_ZERO(g); + if (ret) + goto err; + + if (k.k->type == KEY_TYPE_bucket_gens) { + bkey_reassemble(&g->k_i, k); + } else { + bkey_bucket_gens_init(&g->k_i); + g->k.p = alloc_gens_pos(alloc_k.k->p, &gens_offset); + } + + g->v.gens[gens_offset] = a->gen; + + ret = bch2_trans_update(trans, bucket_gens_iter, &g->k_i, 0); + if (ret) + goto err; + } +err: +fsck_err: + printbuf_exit(&buf); + return ret; +} + +static noinline_for_stack +int bch2_check_alloc_hole_freespace(struct btree_trans *trans, + struct bpos start, + struct bpos *end, + struct btree_iter *freespace_iter) +{ + struct bch_fs *c = trans->c; + struct bch_dev *ca; + struct bkey_s_c k; + struct printbuf buf = PRINTBUF; + int ret; + + ca = bch_dev_bkey_exists(c, start.inode); + if (!ca->mi.freespace_initialized) + return 0; + + bch2_btree_iter_set_pos(freespace_iter, start); + + k = bch2_btree_iter_peek_slot(freespace_iter); + ret = bkey_err(k); + if (ret) + goto err; + + *end = bkey_min(k.k->p, *end); + + if (k.k->type != KEY_TYPE_set && + (c->opts.reconstruct_alloc || + fsck_err(c, freespace_hole_missing, + "hole in alloc btree missing in freespace btree\n" + " device %llu buckets %llu-%llu", + freespace_iter->pos.inode, + freespace_iter->pos.offset, + end->offset))) { + struct bkey_i *update = + bch2_trans_kmalloc(trans, sizeof(*update)); + + ret = PTR_ERR_OR_ZERO(update); + if (ret) + goto err; + + bkey_init(&update->k); + update->k.type = KEY_TYPE_set; + update->k.p = freespace_iter->pos; + bch2_key_resize(&update->k, + min_t(u64, U32_MAX, end->offset - + freespace_iter->pos.offset)); + + ret = bch2_trans_update(trans, freespace_iter, update, 0); + if (ret) + goto err; + } +err: +fsck_err: + printbuf_exit(&buf); + return ret; +} + +static noinline_for_stack +int bch2_check_alloc_hole_bucket_gens(struct btree_trans *trans, + struct bpos start, + struct bpos *end, + struct btree_iter *bucket_gens_iter) +{ + struct bch_fs *c = trans->c; + struct bkey_s_c k; + struct printbuf buf = PRINTBUF; + unsigned i, gens_offset, gens_end_offset; + int ret; + + if (c->sb.version < bcachefs_metadata_version_bucket_gens) + return 0; + + bch2_btree_iter_set_pos(bucket_gens_iter, alloc_gens_pos(start, &gens_offset)); + + k = bch2_btree_iter_peek_slot(bucket_gens_iter); + ret = bkey_err(k); + if (ret) + goto err; + + if (bkey_cmp(alloc_gens_pos(start, &gens_offset), + alloc_gens_pos(*end, &gens_end_offset))) + gens_end_offset = KEY_TYPE_BUCKET_GENS_NR; + + if (k.k->type == KEY_TYPE_bucket_gens) { + struct bkey_i_bucket_gens g; + bool need_update = false; + + bkey_reassemble(&g.k_i, k); + + for (i = gens_offset; i < gens_end_offset; i++) { + if (fsck_err_on(g.v.gens[i], c, + bucket_gens_hole_wrong, + "hole in alloc btree at %llu:%llu with nonzero gen in bucket_gens btree (%u)", + bucket_gens_pos_to_alloc(k.k->p, i).inode, + bucket_gens_pos_to_alloc(k.k->p, i).offset, + g.v.gens[i])) { + g.v.gens[i] = 0; + need_update = true; + } + } + + if (need_update) { + struct bkey_i *u = bch2_trans_kmalloc(trans, sizeof(g)); + + ret = PTR_ERR_OR_ZERO(u); + if (ret) + goto err; + + memcpy(u, &g, sizeof(g)); + + ret = bch2_trans_update(trans, bucket_gens_iter, u, 0); + if (ret) + goto err; + } + } + + *end = bkey_min(*end, bucket_gens_pos_to_alloc(bpos_nosnap_successor(k.k->p), 0)); +err: +fsck_err: + printbuf_exit(&buf); + return ret; +} + +static noinline_for_stack int __bch2_check_discard_freespace_key(struct btree_trans *trans, + struct btree_iter *iter) +{ + struct bch_fs *c = trans->c; + struct btree_iter alloc_iter; + struct bkey_s_c alloc_k; + struct bch_alloc_v4 a_convert; + const struct bch_alloc_v4 *a; + u64 genbits; + struct bpos pos; + enum bch_data_type state = iter->btree_id == BTREE_ID_need_discard + ? BCH_DATA_need_discard + : BCH_DATA_free; + struct printbuf buf = PRINTBUF; + int ret; + + pos = iter->pos; + pos.offset &= ~(~0ULL << 56); + genbits = iter->pos.offset & (~0ULL << 56); + + alloc_k = bch2_bkey_get_iter(trans, &alloc_iter, BTREE_ID_alloc, pos, 0); + ret = bkey_err(alloc_k); + if (ret) + return ret; + + if (fsck_err_on(!bch2_dev_bucket_exists(c, pos), c, + need_discard_freespace_key_to_invalid_dev_bucket, + "entry in %s btree for nonexistant dev:bucket %llu:%llu", + bch2_btree_id_str(iter->btree_id), pos.inode, pos.offset)) + goto delete; + + a = bch2_alloc_to_v4(alloc_k, &a_convert); + + if (fsck_err_on(a->data_type != state || + (state == BCH_DATA_free && + genbits != alloc_freespace_genbits(*a)), c, + need_discard_freespace_key_bad, + "%s\n incorrectly set at %s:%llu:%llu:0 (free %u, genbits %llu should be %llu)", + (bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf), + bch2_btree_id_str(iter->btree_id), + iter->pos.inode, + iter->pos.offset, + a->data_type == state, + genbits >> 56, alloc_freespace_genbits(*a) >> 56)) + goto delete; +out: +fsck_err: + set_btree_iter_dontneed(&alloc_iter); + bch2_trans_iter_exit(trans, &alloc_iter); + printbuf_exit(&buf); + return ret; +delete: + ret = bch2_btree_delete_extent_at(trans, iter, + iter->btree_id == BTREE_ID_freespace ? 1 : 0, 0) ?: + bch2_trans_commit(trans, NULL, NULL, + BTREE_INSERT_NOFAIL|BTREE_INSERT_LAZY_RW); + goto out; +} + +static int bch2_check_discard_freespace_key(struct btree_trans *trans, + struct btree_iter *iter, + struct bpos end) +{ + if (!btree_id_is_extents(iter->btree_id)) { + return __bch2_check_discard_freespace_key(trans, iter); + } else { + int ret = 0; + + while (!bkey_eq(iter->pos, end) && + !(ret = btree_trans_too_many_iters(trans) ?: + __bch2_check_discard_freespace_key(trans, iter))) + bch2_btree_iter_set_pos(iter, bpos_nosnap_successor(iter->pos)); + + return ret; + } +} + +/* + * We've already checked that generation numbers in the bucket_gens btree are + * valid for buckets that exist; this just checks for keys for nonexistent + * buckets. + */ +static noinline_for_stack +int bch2_check_bucket_gens_key(struct btree_trans *trans, + struct btree_iter *iter, + struct bkey_s_c k) +{ + struct bch_fs *c = trans->c; + struct bkey_i_bucket_gens g; + struct bch_dev *ca; + u64 start = bucket_gens_pos_to_alloc(k.k->p, 0).offset; + u64 end = bucket_gens_pos_to_alloc(bpos_nosnap_successor(k.k->p), 0).offset; + u64 b; + bool need_update = false, dev_exists; + struct printbuf buf = PRINTBUF; + int ret = 0; + + BUG_ON(k.k->type != KEY_TYPE_bucket_gens); + bkey_reassemble(&g.k_i, k); + + /* if no bch_dev, skip out whether we repair or not */ + dev_exists = bch2_dev_exists2(c, k.k->p.inode); + if (!dev_exists) { + if (fsck_err_on(!dev_exists, c, + bucket_gens_to_invalid_dev, + "bucket_gens key for invalid device:\n %s", + (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { + ret = bch2_btree_delete_at(trans, iter, 0); + } + goto out; + } + + ca = bch_dev_bkey_exists(c, k.k->p.inode); + if (fsck_err_on(end <= ca->mi.first_bucket || + start >= ca->mi.nbuckets, c, + bucket_gens_to_invalid_buckets, + "bucket_gens key for invalid buckets:\n %s", + (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { + ret = bch2_btree_delete_at(trans, iter, 0); + goto out; + } + + for (b = start; b < ca->mi.first_bucket; b++) + if (fsck_err_on(g.v.gens[b & KEY_TYPE_BUCKET_GENS_MASK], c, + bucket_gens_nonzero_for_invalid_buckets, + "bucket_gens key has nonzero gen for invalid bucket")) { + g.v.gens[b & KEY_TYPE_BUCKET_GENS_MASK] = 0; + need_update = true; + } + + for (b = ca->mi.nbuckets; b < end; b++) + if (fsck_err_on(g.v.gens[b & KEY_TYPE_BUCKET_GENS_MASK], c, + bucket_gens_nonzero_for_invalid_buckets, + "bucket_gens key has nonzero gen for invalid bucket")) { + g.v.gens[b & KEY_TYPE_BUCKET_GENS_MASK] = 0; + need_update = true; + } + + if (need_update) { + struct bkey_i *u = bch2_trans_kmalloc(trans, sizeof(g)); + + ret = PTR_ERR_OR_ZERO(u); + if (ret) + goto out; + + memcpy(u, &g, sizeof(g)); + ret = bch2_trans_update(trans, iter, u, 0); + } +out: +fsck_err: + printbuf_exit(&buf); + return ret; +} + +int bch2_check_alloc_info(struct bch_fs *c) +{ + struct btree_trans *trans = bch2_trans_get(c); + struct btree_iter iter, discard_iter, freespace_iter, bucket_gens_iter; + struct bkey hole; + struct bkey_s_c k; + int ret = 0; + + bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, POS_MIN, + BTREE_ITER_PREFETCH); + bch2_trans_iter_init(trans, &discard_iter, BTREE_ID_need_discard, POS_MIN, + BTREE_ITER_PREFETCH); + bch2_trans_iter_init(trans, &freespace_iter, BTREE_ID_freespace, POS_MIN, + BTREE_ITER_PREFETCH); + bch2_trans_iter_init(trans, &bucket_gens_iter, BTREE_ID_bucket_gens, POS_MIN, + BTREE_ITER_PREFETCH); + + while (1) { + struct bpos next; + + bch2_trans_begin(trans); + + k = bch2_get_key_or_real_bucket_hole(&iter, &hole); + ret = bkey_err(k); + if (ret) + goto bkey_err; + + if (!k.k) + break; + + if (k.k->type) { + next = bpos_nosnap_successor(k.k->p); + + ret = bch2_check_alloc_key(trans, + k, &iter, + &discard_iter, + &freespace_iter, + &bucket_gens_iter); + if (ret) + goto bkey_err; + } else { + next = k.k->p; + + ret = bch2_check_alloc_hole_freespace(trans, + bkey_start_pos(k.k), + &next, + &freespace_iter) ?: + bch2_check_alloc_hole_bucket_gens(trans, + bkey_start_pos(k.k), + &next, + &bucket_gens_iter); + if (ret) + goto bkey_err; + } + + ret = bch2_trans_commit(trans, NULL, NULL, + BTREE_INSERT_NOFAIL| + BTREE_INSERT_LAZY_RW); + if (ret) + goto bkey_err; + + bch2_btree_iter_set_pos(&iter, next); +bkey_err: + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + continue; + if (ret) + break; + } + bch2_trans_iter_exit(trans, &bucket_gens_iter); + bch2_trans_iter_exit(trans, &freespace_iter); + bch2_trans_iter_exit(trans, &discard_iter); + bch2_trans_iter_exit(trans, &iter); + + if (ret < 0) + goto err; + + ret = for_each_btree_key2(trans, iter, + BTREE_ID_need_discard, POS_MIN, + BTREE_ITER_PREFETCH, k, + bch2_check_discard_freespace_key(trans, &iter, k.k->p)) ?: + for_each_btree_key2(trans, iter, + BTREE_ID_freespace, POS_MIN, + BTREE_ITER_PREFETCH, k, + bch2_check_discard_freespace_key(trans, &iter, k.k->p)) ?: + for_each_btree_key_commit(trans, iter, + BTREE_ID_bucket_gens, POS_MIN, + BTREE_ITER_PREFETCH, k, + NULL, NULL, BTREE_INSERT_NOFAIL|BTREE_INSERT_LAZY_RW, + bch2_check_bucket_gens_key(trans, &iter, k)); +err: + bch2_trans_put(trans); + if (ret) + bch_err_fn(c, ret); + return ret; +} + +static int bch2_check_alloc_to_lru_ref(struct btree_trans *trans, + struct btree_iter *alloc_iter) +{ + struct bch_fs *c = trans->c; + struct btree_iter lru_iter; + struct bch_alloc_v4 a_convert; + const struct bch_alloc_v4 *a; + struct bkey_s_c alloc_k, lru_k; + struct printbuf buf = PRINTBUF; + int ret; + + alloc_k = bch2_btree_iter_peek(alloc_iter); + if (!alloc_k.k) + return 0; + + ret = bkey_err(alloc_k); + if (ret) + return ret; + + a = bch2_alloc_to_v4(alloc_k, &a_convert); + + if (a->data_type != BCH_DATA_cached) + return 0; + + lru_k = bch2_bkey_get_iter(trans, &lru_iter, BTREE_ID_lru, + lru_pos(alloc_k.k->p.inode, + bucket_to_u64(alloc_k.k->p), + a->io_time[READ]), 0); + ret = bkey_err(lru_k); + if (ret) + return ret; + + if (fsck_err_on(!a->io_time[READ], c, + alloc_key_cached_but_read_time_zero, + "cached bucket with read_time 0\n" + " %s", + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf)) || + fsck_err_on(lru_k.k->type != KEY_TYPE_set, c, + alloc_key_to_missing_lru_entry, + "missing lru entry\n" + " %s", + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf))) { + u64 read_time = a->io_time[READ] ?: + atomic64_read(&c->io_clock[READ].now); + + ret = bch2_lru_set(trans, + alloc_k.k->p.inode, + bucket_to_u64(alloc_k.k->p), + read_time); + if (ret) + goto err; + + if (a->io_time[READ] != read_time) { + struct bkey_i_alloc_v4 *a_mut = + bch2_alloc_to_v4_mut(trans, alloc_k); + ret = PTR_ERR_OR_ZERO(a_mut); + if (ret) + goto err; + + a_mut->v.io_time[READ] = read_time; + ret = bch2_trans_update(trans, alloc_iter, + &a_mut->k_i, BTREE_TRIGGER_NORUN); + if (ret) + goto err; + } + } +err: +fsck_err: + bch2_trans_iter_exit(trans, &lru_iter); + printbuf_exit(&buf); + return ret; +} + +int bch2_check_alloc_to_lru_refs(struct bch_fs *c) +{ + struct btree_iter iter; + struct bkey_s_c k; + int ret = 0; + + ret = bch2_trans_run(c, + for_each_btree_key_commit(trans, iter, BTREE_ID_alloc, + POS_MIN, BTREE_ITER_PREFETCH, k, + NULL, NULL, BTREE_INSERT_NOFAIL|BTREE_INSERT_LAZY_RW, + bch2_check_alloc_to_lru_ref(trans, &iter))); + if (ret) + bch_err_fn(c, ret); + return ret; +} + +static int bch2_discard_one_bucket(struct btree_trans *trans, + struct btree_iter *need_discard_iter, + struct bpos *discard_pos_done, + u64 *seen, + u64 *open, + u64 *need_journal_commit, + u64 *discarded) +{ + struct bch_fs *c = trans->c; + struct bpos pos = need_discard_iter->pos; + struct btree_iter iter = { NULL }; + struct bkey_s_c k; + struct bch_dev *ca; + struct bkey_i_alloc_v4 *a; + struct printbuf buf = PRINTBUF; + int ret = 0; + + ca = bch_dev_bkey_exists(c, pos.inode); + if (!percpu_ref_tryget(&ca->io_ref)) { + bch2_btree_iter_set_pos(need_discard_iter, POS(pos.inode + 1, 0)); + return 0; + } + + if (bch2_bucket_is_open_safe(c, pos.inode, pos.offset)) { + (*open)++; + goto out; + } + + if (bch2_bucket_needs_journal_commit(&c->buckets_waiting_for_journal, + c->journal.flushed_seq_ondisk, + pos.inode, pos.offset)) { + (*need_journal_commit)++; + goto out; + } + + k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_alloc, + need_discard_iter->pos, + BTREE_ITER_CACHED); + ret = bkey_err(k); + if (ret) + goto out; + + a = bch2_alloc_to_v4_mut(trans, k); + ret = PTR_ERR_OR_ZERO(a); + if (ret) + goto out; + + if (BCH_ALLOC_V4_NEED_INC_GEN(&a->v)) { + a->v.gen++; + SET_BCH_ALLOC_V4_NEED_INC_GEN(&a->v, false); + goto write; + } + + if (a->v.journal_seq > c->journal.flushed_seq_ondisk) { + if (c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_info) { + bch2_trans_inconsistent(trans, + "clearing need_discard but journal_seq %llu > flushed_seq %llu\n" + "%s", + a->v.journal_seq, + c->journal.flushed_seq_ondisk, + (bch2_bkey_val_to_text(&buf, c, k), buf.buf)); + ret = -EIO; + } + goto out; + } + + if (a->v.data_type != BCH_DATA_need_discard) { + if (c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_info) { + bch2_trans_inconsistent(trans, + "bucket incorrectly set in need_discard btree\n" + "%s", + (bch2_bkey_val_to_text(&buf, c, k), buf.buf)); + ret = -EIO; + } + + goto out; + } + + if (!bkey_eq(*discard_pos_done, iter.pos) && + ca->mi.discard && !c->opts.nochanges) { + /* + * This works without any other locks because this is the only + * thread that removes items from the need_discard tree + */ + bch2_trans_unlock(trans); + blkdev_issue_discard(ca->disk_sb.bdev, + k.k->p.offset * ca->mi.bucket_size, + ca->mi.bucket_size, + GFP_KERNEL); + *discard_pos_done = iter.pos; + + ret = bch2_trans_relock_notrace(trans); + if (ret) + goto out; + } + + SET_BCH_ALLOC_V4_NEED_DISCARD(&a->v, false); + a->v.data_type = alloc_data_type(a->v, a->v.data_type); +write: + ret = bch2_trans_update(trans, &iter, &a->k_i, 0) ?: + bch2_trans_commit(trans, NULL, NULL, + BCH_WATERMARK_btree| + BTREE_INSERT_NOFAIL); + if (ret) + goto out; + + this_cpu_inc(c->counters[BCH_COUNTER_bucket_discard]); + (*discarded)++; +out: + (*seen)++; + bch2_trans_iter_exit(trans, &iter); + percpu_ref_put(&ca->io_ref); + printbuf_exit(&buf); + return ret; +} + +static void bch2_do_discards_work(struct work_struct *work) +{ + struct bch_fs *c = container_of(work, struct bch_fs, discard_work); + struct btree_iter iter; + struct bkey_s_c k; + u64 seen = 0, open = 0, need_journal_commit = 0, discarded = 0; + struct bpos discard_pos_done = POS_MAX; + int ret; + + /* + * We're doing the commit in bch2_discard_one_bucket instead of using + * for_each_btree_key_commit() so that we can increment counters after + * successful commit: + */ + ret = bch2_trans_run(c, + for_each_btree_key2(trans, iter, + BTREE_ID_need_discard, POS_MIN, 0, k, + bch2_discard_one_bucket(trans, &iter, &discard_pos_done, + &seen, + &open, + &need_journal_commit, + &discarded))); + + if (need_journal_commit * 2 > seen) + bch2_journal_flush_async(&c->journal, NULL); + + bch2_write_ref_put(c, BCH_WRITE_REF_discard); + + trace_discard_buckets(c, seen, open, need_journal_commit, discarded, + bch2_err_str(ret)); +} + +void bch2_do_discards(struct bch_fs *c) +{ + if (bch2_write_ref_tryget(c, BCH_WRITE_REF_discard) && + !queue_work(c->write_ref_wq, &c->discard_work)) + bch2_write_ref_put(c, BCH_WRITE_REF_discard); +} + +static int invalidate_one_bucket(struct btree_trans *trans, + struct btree_iter *lru_iter, + struct bkey_s_c lru_k, + s64 *nr_to_invalidate) +{ + struct bch_fs *c = trans->c; + struct btree_iter alloc_iter = { NULL }; + struct bkey_i_alloc_v4 *a = NULL; + struct printbuf buf = PRINTBUF; + struct bpos bucket = u64_to_bucket(lru_k.k->p.offset); + unsigned cached_sectors; + int ret = 0; + + if (*nr_to_invalidate <= 0) + return 1; + + if (!bch2_dev_bucket_exists(c, bucket)) { + prt_str(&buf, "lru entry points to invalid bucket"); + goto err; + } + + if (bch2_bucket_is_open_safe(c, bucket.inode, bucket.offset)) + return 0; + + a = bch2_trans_start_alloc_update(trans, &alloc_iter, bucket); + ret = PTR_ERR_OR_ZERO(a); + if (ret) + goto out; + + /* We expect harmless races here due to the btree write buffer: */ + if (lru_pos_time(lru_iter->pos) != alloc_lru_idx_read(a->v)) + goto out; + + BUG_ON(a->v.data_type != BCH_DATA_cached); + + if (!a->v.cached_sectors) + bch_err(c, "invalidating empty bucket, confused"); + + cached_sectors = a->v.cached_sectors; + + SET_BCH_ALLOC_V4_NEED_INC_GEN(&a->v, false); + a->v.gen++; + a->v.data_type = 0; + a->v.dirty_sectors = 0; + a->v.cached_sectors = 0; + a->v.io_time[READ] = atomic64_read(&c->io_clock[READ].now); + a->v.io_time[WRITE] = atomic64_read(&c->io_clock[WRITE].now); + + ret = bch2_trans_update(trans, &alloc_iter, &a->k_i, + BTREE_TRIGGER_BUCKET_INVALIDATE) ?: + bch2_trans_commit(trans, NULL, NULL, + BCH_WATERMARK_btree| + BTREE_INSERT_NOFAIL); + if (ret) + goto out; + + trace_and_count(c, bucket_invalidate, c, bucket.inode, bucket.offset, cached_sectors); + --*nr_to_invalidate; +out: + bch2_trans_iter_exit(trans, &alloc_iter); + printbuf_exit(&buf); + return ret; +err: + prt_str(&buf, "\n lru key: "); + bch2_bkey_val_to_text(&buf, c, lru_k); + + prt_str(&buf, "\n lru entry: "); + bch2_lru_pos_to_text(&buf, lru_iter->pos); + + prt_str(&buf, "\n alloc key: "); + if (!a) + bch2_bpos_to_text(&buf, bucket); + else + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&a->k_i)); + + bch_err(c, "%s", buf.buf); + if (c->curr_recovery_pass > BCH_RECOVERY_PASS_check_lrus) { + bch2_inconsistent_error(c); + ret = -EINVAL; + } + + goto out; +} + +static void bch2_do_invalidates_work(struct work_struct *work) +{ + struct bch_fs *c = container_of(work, struct bch_fs, invalidate_work); + struct bch_dev *ca; + struct btree_trans *trans = bch2_trans_get(c); + struct btree_iter iter; + struct bkey_s_c k; + unsigned i; + int ret = 0; + + ret = bch2_btree_write_buffer_flush(trans); + if (ret) + goto err; + + for_each_member_device(ca, c, i) { + s64 nr_to_invalidate = + should_invalidate_buckets(ca, bch2_dev_usage_read(ca)); + + ret = for_each_btree_key2_upto(trans, iter, BTREE_ID_lru, + lru_pos(ca->dev_idx, 0, 0), + lru_pos(ca->dev_idx, U64_MAX, LRU_TIME_MAX), + BTREE_ITER_INTENT, k, + invalidate_one_bucket(trans, &iter, k, &nr_to_invalidate)); + + if (ret < 0) { + percpu_ref_put(&ca->ref); + break; + } + } +err: + bch2_trans_put(trans); + bch2_write_ref_put(c, BCH_WRITE_REF_invalidate); +} + +void bch2_do_invalidates(struct bch_fs *c) +{ + if (bch2_write_ref_tryget(c, BCH_WRITE_REF_invalidate) && + !queue_work(c->write_ref_wq, &c->invalidate_work)) + bch2_write_ref_put(c, BCH_WRITE_REF_invalidate); +} + +int bch2_dev_freespace_init(struct bch_fs *c, struct bch_dev *ca, + u64 bucket_start, u64 bucket_end) +{ + struct btree_trans *trans = bch2_trans_get(c); + struct btree_iter iter; + struct bkey_s_c k; + struct bkey hole; + struct bpos end = POS(ca->dev_idx, bucket_end); + struct bch_member *m; + unsigned long last_updated = jiffies; + int ret; + + BUG_ON(bucket_start > bucket_end); + BUG_ON(bucket_end > ca->mi.nbuckets); + + bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, + POS(ca->dev_idx, max_t(u64, ca->mi.first_bucket, bucket_start)), + BTREE_ITER_PREFETCH); + /* + * Scan the alloc btree for every bucket on @ca, and add buckets to the + * freespace/need_discard/need_gc_gens btrees as needed: + */ + while (1) { + if (last_updated + HZ * 10 < jiffies) { + bch_info(ca, "%s: currently at %llu/%llu", + __func__, iter.pos.offset, ca->mi.nbuckets); + last_updated = jiffies; + } + + bch2_trans_begin(trans); + + if (bkey_ge(iter.pos, end)) { + ret = 0; + break; + } + + k = bch2_get_key_or_hole(&iter, end, &hole); + ret = bkey_err(k); + if (ret) + goto bkey_err; + + if (k.k->type) { + /* + * We process live keys in the alloc btree one at a + * time: + */ + struct bch_alloc_v4 a_convert; + const struct bch_alloc_v4 *a = bch2_alloc_to_v4(k, &a_convert); + + ret = bch2_bucket_do_index(trans, k, a, true) ?: + bch2_trans_commit(trans, NULL, NULL, + BTREE_INSERT_LAZY_RW| + BTREE_INSERT_NOFAIL); + if (ret) + goto bkey_err; + + bch2_btree_iter_advance(&iter); + } else { + struct bkey_i *freespace; + + freespace = bch2_trans_kmalloc(trans, sizeof(*freespace)); + ret = PTR_ERR_OR_ZERO(freespace); + if (ret) + goto bkey_err; + + bkey_init(&freespace->k); + freespace->k.type = KEY_TYPE_set; + freespace->k.p = k.k->p; + freespace->k.size = k.k->size; + + ret = bch2_btree_insert_trans(trans, BTREE_ID_freespace, freespace, 0) ?: + bch2_trans_commit(trans, NULL, NULL, + BTREE_INSERT_LAZY_RW| + BTREE_INSERT_NOFAIL); + if (ret) + goto bkey_err; + + bch2_btree_iter_set_pos(&iter, k.k->p); + } +bkey_err: + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + continue; + if (ret) + break; + } + + bch2_trans_iter_exit(trans, &iter); + bch2_trans_put(trans); + + if (ret < 0) { + bch_err_msg(ca, ret, "initializing free space"); + return ret; + } + + mutex_lock(&c->sb_lock); + m = bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx); + SET_BCH_MEMBER_FREESPACE_INITIALIZED(m, true); + mutex_unlock(&c->sb_lock); + + return 0; +} + +int bch2_fs_freespace_init(struct bch_fs *c) +{ + struct bch_dev *ca; + unsigned i; + int ret = 0; + bool doing_init = false; + + /* + * We can crash during the device add path, so we need to check this on + * every mount: + */ + + for_each_member_device(ca, c, i) { + if (ca->mi.freespace_initialized) + continue; + + if (!doing_init) { + bch_info(c, "initializing freespace"); + doing_init = true; + } + + ret = bch2_dev_freespace_init(c, ca, 0, ca->mi.nbuckets); + if (ret) { + percpu_ref_put(&ca->ref); + bch_err_fn(c, ret); + return ret; + } + } + + if (doing_init) { + mutex_lock(&c->sb_lock); + bch2_write_super(c); + mutex_unlock(&c->sb_lock); + bch_verbose(c, "done initializing freespace"); + } + + return 0; +} + +/* Bucket IO clocks: */ + +int bch2_bucket_io_time_reset(struct btree_trans *trans, unsigned dev, + size_t bucket_nr, int rw) +{ + struct bch_fs *c = trans->c; + struct btree_iter iter; + struct bkey_i_alloc_v4 *a; + u64 now; + int ret = 0; + + a = bch2_trans_start_alloc_update(trans, &iter, POS(dev, bucket_nr)); + ret = PTR_ERR_OR_ZERO(a); + if (ret) + return ret; + + now = atomic64_read(&c->io_clock[rw].now); + if (a->v.io_time[rw] == now) + goto out; + + a->v.io_time[rw] = now; + + ret = bch2_trans_update(trans, &iter, &a->k_i, 0) ?: + bch2_trans_commit(trans, NULL, NULL, 0); +out: + bch2_trans_iter_exit(trans, &iter); + return ret; +} + +/* Startup/shutdown (ro/rw): */ + +void bch2_recalc_capacity(struct bch_fs *c) +{ + struct bch_dev *ca; + u64 capacity = 0, reserved_sectors = 0, gc_reserve; + unsigned bucket_size_max = 0; + unsigned long ra_pages = 0; + unsigned i; + + lockdep_assert_held(&c->state_lock); + + for_each_online_member(ca, c, i) { + struct backing_dev_info *bdi = ca->disk_sb.bdev->bd_disk->bdi; + + ra_pages += bdi->ra_pages; + } + + bch2_set_ra_pages(c, ra_pages); + + for_each_rw_member(ca, c, i) { + u64 dev_reserve = 0; + + /* + * We need to reserve buckets (from the number + * of currently available buckets) against + * foreground writes so that mainly copygc can + * make forward progress. + * + * We need enough to refill the various reserves + * from scratch - copygc will use its entire + * reserve all at once, then run against when + * its reserve is refilled (from the formerly + * available buckets). + * + * This reserve is just used when considering if + * allocations for foreground writes must wait - + * not -ENOSPC calculations. + */ + + dev_reserve += ca->nr_btree_reserve * 2; + dev_reserve += ca->mi.nbuckets >> 6; /* copygc reserve */ + + dev_reserve += 1; /* btree write point */ + dev_reserve += 1; /* copygc write point */ + dev_reserve += 1; /* rebalance write point */ + + dev_reserve *= ca->mi.bucket_size; + + capacity += bucket_to_sector(ca, ca->mi.nbuckets - + ca->mi.first_bucket); + + reserved_sectors += dev_reserve * 2; + + bucket_size_max = max_t(unsigned, bucket_size_max, + ca->mi.bucket_size); + } + + gc_reserve = c->opts.gc_reserve_bytes + ? c->opts.gc_reserve_bytes >> 9 + : div64_u64(capacity * c->opts.gc_reserve_percent, 100); + + reserved_sectors = max(gc_reserve, reserved_sectors); + + reserved_sectors = min(reserved_sectors, capacity); + + c->capacity = capacity - reserved_sectors; + + c->bucket_size_max = bucket_size_max; + + /* Wake up case someone was waiting for buckets */ + closure_wake_up(&c->freelist_wait); +} + +u64 bch2_min_rw_member_capacity(struct bch_fs *c) +{ + struct bch_dev *ca; + unsigned i; + u64 ret = U64_MAX; + + for_each_rw_member(ca, c, i) + ret = min(ret, ca->mi.nbuckets * ca->mi.bucket_size); + return ret; +} + +static bool bch2_dev_has_open_write_point(struct bch_fs *c, struct bch_dev *ca) +{ + struct open_bucket *ob; + bool ret = false; + + for (ob = c->open_buckets; + ob < c->open_buckets + ARRAY_SIZE(c->open_buckets); + ob++) { + spin_lock(&ob->lock); + if (ob->valid && !ob->on_partial_list && + ob->dev == ca->dev_idx) + ret = true; + spin_unlock(&ob->lock); + } + + return ret; +} + +/* device goes ro: */ +void bch2_dev_allocator_remove(struct bch_fs *c, struct bch_dev *ca) +{ + unsigned i; + + /* First, remove device from allocation groups: */ + + for (i = 0; i < ARRAY_SIZE(c->rw_devs); i++) + clear_bit(ca->dev_idx, c->rw_devs[i].d); + + /* + * Capacity is calculated based off of devices in allocation groups: + */ + bch2_recalc_capacity(c); + + bch2_open_buckets_stop(c, ca, false); + + /* + * Wake up threads that were blocked on allocation, so they can notice + * the device can no longer be removed and the capacity has changed: + */ + closure_wake_up(&c->freelist_wait); + + /* + * journal_res_get() can block waiting for free space in the journal - + * it needs to notice there may not be devices to allocate from anymore: + */ + wake_up(&c->journal.wait); + + /* Now wait for any in flight writes: */ + + closure_wait_event(&c->open_buckets_wait, + !bch2_dev_has_open_write_point(c, ca)); +} + +/* device goes rw: */ +void bch2_dev_allocator_add(struct bch_fs *c, struct bch_dev *ca) +{ + unsigned i; + + for (i = 0; i < ARRAY_SIZE(c->rw_devs); i++) + if (ca->mi.data_allowed & (1 << i)) + set_bit(ca->dev_idx, c->rw_devs[i].d); +} + +void bch2_fs_allocator_background_init(struct bch_fs *c) +{ + spin_lock_init(&c->freelist_lock); + INIT_WORK(&c->discard_work, bch2_do_discards_work); + INIT_WORK(&c->invalidate_work, bch2_do_invalidates_work); +} diff --git a/fs/bcachefs/alloc_background.h b/fs/bcachefs/alloc_background.h new file mode 100644 index 000000000000..73faf99a222a --- /dev/null +++ b/fs/bcachefs/alloc_background.h @@ -0,0 +1,259 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_ALLOC_BACKGROUND_H +#define _BCACHEFS_ALLOC_BACKGROUND_H + +#include "bcachefs.h" +#include "alloc_types.h" +#include "buckets.h" +#include "debug.h" +#include "super.h" + +enum bkey_invalid_flags; + +/* How out of date a pointer gen is allowed to be: */ +#define BUCKET_GC_GEN_MAX 96U + +static inline bool bch2_dev_bucket_exists(struct bch_fs *c, struct bpos pos) +{ + struct bch_dev *ca; + + if (!bch2_dev_exists2(c, pos.inode)) + return false; + + ca = bch_dev_bkey_exists(c, pos.inode); + return pos.offset >= ca->mi.first_bucket && + pos.offset < ca->mi.nbuckets; +} + +static inline u64 bucket_to_u64(struct bpos bucket) +{ + return (bucket.inode << 48) | bucket.offset; +} + +static inline struct bpos u64_to_bucket(u64 bucket) +{ + return POS(bucket >> 48, bucket & ~(~0ULL << 48)); +} + +static inline u8 alloc_gc_gen(struct bch_alloc_v4 a) +{ + return a.gen - a.oldest_gen; +} + +static inline enum bch_data_type __alloc_data_type(u32 dirty_sectors, + u32 cached_sectors, + u32 stripe, + struct bch_alloc_v4 a, + enum bch_data_type data_type) +{ + if (stripe) + return data_type == BCH_DATA_parity ? data_type : BCH_DATA_stripe; + if (dirty_sectors) + return data_type; + if (cached_sectors) + return BCH_DATA_cached; + if (BCH_ALLOC_V4_NEED_DISCARD(&a)) + return BCH_DATA_need_discard; + if (alloc_gc_gen(a) >= BUCKET_GC_GEN_MAX) + return BCH_DATA_need_gc_gens; + return BCH_DATA_free; +} + +static inline enum bch_data_type alloc_data_type(struct bch_alloc_v4 a, + enum bch_data_type data_type) +{ + return __alloc_data_type(a.dirty_sectors, a.cached_sectors, + a.stripe, a, data_type); +} + +static inline enum bch_data_type bucket_data_type(enum bch_data_type data_type) +{ + return data_type == BCH_DATA_stripe ? BCH_DATA_user : data_type; +} + +static inline u64 alloc_lru_idx_read(struct bch_alloc_v4 a) +{ + return a.data_type == BCH_DATA_cached ? a.io_time[READ] : 0; +} + +#define DATA_TYPES_MOVABLE \ + ((1U << BCH_DATA_btree)| \ + (1U << BCH_DATA_user)| \ + (1U << BCH_DATA_stripe)) + +static inline bool data_type_movable(enum bch_data_type type) +{ + return (1U << type) & DATA_TYPES_MOVABLE; +} + +static inline u64 alloc_lru_idx_fragmentation(struct bch_alloc_v4 a, + struct bch_dev *ca) +{ + if (!data_type_movable(a.data_type) || + a.dirty_sectors >= ca->mi.bucket_size) + return 0; + + return div_u64((u64) a.dirty_sectors * (1ULL << 31), ca->mi.bucket_size); +} + +static inline u64 alloc_freespace_genbits(struct bch_alloc_v4 a) +{ + return ((u64) alloc_gc_gen(a) >> 4) << 56; +} + +static inline struct bpos alloc_freespace_pos(struct bpos pos, struct bch_alloc_v4 a) +{ + pos.offset |= alloc_freespace_genbits(a); + return pos; +} + +static inline unsigned alloc_v4_u64s(const struct bch_alloc_v4 *a) +{ + unsigned ret = (BCH_ALLOC_V4_BACKPOINTERS_START(a) ?: + BCH_ALLOC_V4_U64s_V0) + + BCH_ALLOC_V4_NR_BACKPOINTERS(a) * + (sizeof(struct bch_backpointer) / sizeof(u64)); + + BUG_ON(ret > U8_MAX - BKEY_U64s); + return ret; +} + +static inline void set_alloc_v4_u64s(struct bkey_i_alloc_v4 *a) +{ + set_bkey_val_u64s(&a->k, alloc_v4_u64s(&a->v)); +} + +struct bkey_i_alloc_v4 * +bch2_trans_start_alloc_update(struct btree_trans *, struct btree_iter *, struct bpos); + +void __bch2_alloc_to_v4(struct bkey_s_c, struct bch_alloc_v4 *); + +static inline const struct bch_alloc_v4 *bch2_alloc_to_v4(struct bkey_s_c k, struct bch_alloc_v4 *convert) +{ + const struct bch_alloc_v4 *ret; + + if (unlikely(k.k->type != KEY_TYPE_alloc_v4)) + goto slowpath; + + ret = bkey_s_c_to_alloc_v4(k).v; + if (BCH_ALLOC_V4_BACKPOINTERS_START(ret) != BCH_ALLOC_V4_U64s) + goto slowpath; + + return ret; +slowpath: + __bch2_alloc_to_v4(k, convert); + return convert; +} + +struct bkey_i_alloc_v4 *bch2_alloc_to_v4_mut(struct btree_trans *, struct bkey_s_c); + +int bch2_bucket_io_time_reset(struct btree_trans *, unsigned, size_t, int); + +int bch2_alloc_v1_invalid(struct bch_fs *, struct bkey_s_c, + enum bkey_invalid_flags, struct printbuf *); +int bch2_alloc_v2_invalid(struct bch_fs *, struct bkey_s_c, + enum bkey_invalid_flags, struct printbuf *); +int bch2_alloc_v3_invalid(struct bch_fs *, struct bkey_s_c, + enum bkey_invalid_flags, struct printbuf *); +int bch2_alloc_v4_invalid(struct bch_fs *, struct bkey_s_c, + enum bkey_invalid_flags, struct printbuf *); +void bch2_alloc_v4_swab(struct bkey_s); +void bch2_alloc_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); + +#define bch2_bkey_ops_alloc ((struct bkey_ops) { \ + .key_invalid = bch2_alloc_v1_invalid, \ + .val_to_text = bch2_alloc_to_text, \ + .trans_trigger = bch2_trans_mark_alloc, \ + .atomic_trigger = bch2_mark_alloc, \ + .min_val_size = 8, \ +}) + +#define bch2_bkey_ops_alloc_v2 ((struct bkey_ops) { \ + .key_invalid = bch2_alloc_v2_invalid, \ + .val_to_text = bch2_alloc_to_text, \ + .trans_trigger = bch2_trans_mark_alloc, \ + .atomic_trigger = bch2_mark_alloc, \ + .min_val_size = 8, \ +}) + +#define bch2_bkey_ops_alloc_v3 ((struct bkey_ops) { \ + .key_invalid = bch2_alloc_v3_invalid, \ + .val_to_text = bch2_alloc_to_text, \ + .trans_trigger = bch2_trans_mark_alloc, \ + .atomic_trigger = bch2_mark_alloc, \ + .min_val_size = 16, \ +}) + +#define bch2_bkey_ops_alloc_v4 ((struct bkey_ops) { \ + .key_invalid = bch2_alloc_v4_invalid, \ + .val_to_text = bch2_alloc_to_text, \ + .swab = bch2_alloc_v4_swab, \ + .trans_trigger = bch2_trans_mark_alloc, \ + .atomic_trigger = bch2_mark_alloc, \ + .min_val_size = 48, \ +}) + +int bch2_bucket_gens_invalid(struct bch_fs *, struct bkey_s_c, + enum bkey_invalid_flags, struct printbuf *); +void bch2_bucket_gens_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); + +#define bch2_bkey_ops_bucket_gens ((struct bkey_ops) { \ + .key_invalid = bch2_bucket_gens_invalid, \ + .val_to_text = bch2_bucket_gens_to_text, \ +}) + +int bch2_bucket_gens_init(struct bch_fs *); + +static inline bool bkey_is_alloc(const struct bkey *k) +{ + return k->type == KEY_TYPE_alloc || + k->type == KEY_TYPE_alloc_v2 || + k->type == KEY_TYPE_alloc_v3; +} + +int bch2_alloc_read(struct bch_fs *); + +int bch2_trans_mark_alloc(struct btree_trans *, enum btree_id, unsigned, + struct bkey_s_c, struct bkey_i *, unsigned); +int bch2_check_alloc_info(struct bch_fs *); +int bch2_check_alloc_to_lru_refs(struct bch_fs *); +void bch2_do_discards(struct bch_fs *); + +static inline u64 should_invalidate_buckets(struct bch_dev *ca, + struct bch_dev_usage u) +{ + u64 want_free = ca->mi.nbuckets >> 7; + u64 free = max_t(s64, 0, + u.d[BCH_DATA_free].buckets + + u.d[BCH_DATA_need_discard].buckets + - bch2_dev_buckets_reserved(ca, BCH_WATERMARK_stripe)); + + return clamp_t(s64, want_free - free, 0, u.d[BCH_DATA_cached].buckets); +} + +void bch2_do_invalidates(struct bch_fs *); + +static inline struct bch_backpointer *alloc_v4_backpointers(struct bch_alloc_v4 *a) +{ + return (void *) ((u64 *) &a->v + + (BCH_ALLOC_V4_BACKPOINTERS_START(a) ?: + BCH_ALLOC_V4_U64s_V0)); +} + +static inline const struct bch_backpointer *alloc_v4_backpointers_c(const struct bch_alloc_v4 *a) +{ + return (void *) ((u64 *) &a->v + BCH_ALLOC_V4_BACKPOINTERS_START(a)); +} + +int bch2_dev_freespace_init(struct bch_fs *, struct bch_dev *, u64, u64); +int bch2_fs_freespace_init(struct bch_fs *); + +void bch2_recalc_capacity(struct bch_fs *); +u64 bch2_min_rw_member_capacity(struct bch_fs *); + +void bch2_dev_allocator_remove(struct bch_fs *, struct bch_dev *); +void bch2_dev_allocator_add(struct bch_fs *, struct bch_dev *); + +void bch2_fs_allocator_background_init(struct bch_fs *); + +#endif /* _BCACHEFS_ALLOC_BACKGROUND_H */ diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c new file mode 100644 index 000000000000..0e6157982607 --- /dev/null +++ b/fs/bcachefs/alloc_foreground.c @@ -0,0 +1,1638 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright 2012 Google, Inc. + * + * Foreground allocator code: allocate buckets from freelist, and allocate in + * sector granularity from writepoints. + * + * bch2_bucket_alloc() allocates a single bucket from a specific device. + * + * bch2_bucket_alloc_set() allocates one or more buckets from different devices + * in a given filesystem. + */ + +#include "bcachefs.h" +#include "alloc_background.h" +#include "alloc_foreground.h" +#include "backpointers.h" +#include "btree_iter.h" +#include "btree_update.h" +#include "btree_gc.h" +#include "buckets.h" +#include "buckets_waiting_for_journal.h" +#include "clock.h" +#include "debug.h" +#include "disk_groups.h" +#include "ec.h" +#include "error.h" +#include "io_write.h" +#include "journal.h" +#include "movinggc.h" +#include "nocow_locking.h" +#include "trace.h" + +#include <linux/math64.h> +#include <linux/rculist.h> +#include <linux/rcupdate.h> + +static void bch2_trans_mutex_lock_norelock(struct btree_trans *trans, + struct mutex *lock) +{ + if (!mutex_trylock(lock)) { + bch2_trans_unlock(trans); + mutex_lock(lock); + } +} + +const char * const bch2_watermarks[] = { +#define x(t) #t, + BCH_WATERMARKS() +#undef x + NULL +}; + +/* + * Open buckets represent a bucket that's currently being allocated from. They + * serve two purposes: + * + * - They track buckets that have been partially allocated, allowing for + * sub-bucket sized allocations - they're used by the sector allocator below + * + * - They provide a reference to the buckets they own that mark and sweep GC + * can find, until the new allocation has a pointer to it inserted into the + * btree + * + * When allocating some space with the sector allocator, the allocation comes + * with a reference to an open bucket - the caller is required to put that + * reference _after_ doing the index update that makes its allocation reachable. + */ + +void bch2_reset_alloc_cursors(struct bch_fs *c) +{ + struct bch_dev *ca; + unsigned i; + + rcu_read_lock(); + for_each_member_device_rcu(ca, c, i, NULL) + ca->alloc_cursor = 0; + rcu_read_unlock(); +} + +static void bch2_open_bucket_hash_add(struct bch_fs *c, struct open_bucket *ob) +{ + open_bucket_idx_t idx = ob - c->open_buckets; + open_bucket_idx_t *slot = open_bucket_hashslot(c, ob->dev, ob->bucket); + + ob->hash = *slot; + *slot = idx; +} + +static void bch2_open_bucket_hash_remove(struct bch_fs *c, struct open_bucket *ob) +{ + open_bucket_idx_t idx = ob - c->open_buckets; + open_bucket_idx_t *slot = open_bucket_hashslot(c, ob->dev, ob->bucket); + + while (*slot != idx) { + BUG_ON(!*slot); + slot = &c->open_buckets[*slot].hash; + } + + *slot = ob->hash; + ob->hash = 0; +} + +void __bch2_open_bucket_put(struct bch_fs *c, struct open_bucket *ob) +{ + struct bch_dev *ca = bch_dev_bkey_exists(c, ob->dev); + + if (ob->ec) { + ec_stripe_new_put(c, ob->ec, STRIPE_REF_io); + return; + } + + percpu_down_read(&c->mark_lock); + spin_lock(&ob->lock); + + ob->valid = false; + ob->data_type = 0; + + spin_unlock(&ob->lock); + percpu_up_read(&c->mark_lock); + + spin_lock(&c->freelist_lock); + bch2_open_bucket_hash_remove(c, ob); + + ob->freelist = c->open_buckets_freelist; + c->open_buckets_freelist = ob - c->open_buckets; + + c->open_buckets_nr_free++; + ca->nr_open_buckets--; + spin_unlock(&c->freelist_lock); + + closure_wake_up(&c->open_buckets_wait); +} + +void bch2_open_bucket_write_error(struct bch_fs *c, + struct open_buckets *obs, + unsigned dev) +{ + struct open_bucket *ob; + unsigned i; + + open_bucket_for_each(c, obs, ob, i) + if (ob->dev == dev && ob->ec) + bch2_ec_bucket_cancel(c, ob); +} + +static struct open_bucket *bch2_open_bucket_alloc(struct bch_fs *c) +{ + struct open_bucket *ob; + + BUG_ON(!c->open_buckets_freelist || !c->open_buckets_nr_free); + + ob = c->open_buckets + c->open_buckets_freelist; + c->open_buckets_freelist = ob->freelist; + atomic_set(&ob->pin, 1); + ob->data_type = 0; + + c->open_buckets_nr_free--; + return ob; +} + +static void open_bucket_free_unused(struct bch_fs *c, struct open_bucket *ob) +{ + BUG_ON(c->open_buckets_partial_nr >= + ARRAY_SIZE(c->open_buckets_partial)); + + spin_lock(&c->freelist_lock); + ob->on_partial_list = true; + c->open_buckets_partial[c->open_buckets_partial_nr++] = + ob - c->open_buckets; + spin_unlock(&c->freelist_lock); + + closure_wake_up(&c->open_buckets_wait); + closure_wake_up(&c->freelist_wait); +} + +/* _only_ for allocating the journal on a new device: */ +long bch2_bucket_alloc_new_fs(struct bch_dev *ca) +{ + while (ca->new_fs_bucket_idx < ca->mi.nbuckets) { + u64 b = ca->new_fs_bucket_idx++; + + if (!is_superblock_bucket(ca, b) && + (!ca->buckets_nouse || !test_bit(b, ca->buckets_nouse))) + return b; + } + + return -1; +} + +static inline unsigned open_buckets_reserved(enum bch_watermark watermark) +{ + switch (watermark) { + case BCH_WATERMARK_reclaim: + return 0; + case BCH_WATERMARK_btree: + case BCH_WATERMARK_btree_copygc: + return OPEN_BUCKETS_COUNT / 4; + case BCH_WATERMARK_copygc: + return OPEN_BUCKETS_COUNT / 3; + default: + return OPEN_BUCKETS_COUNT / 2; + } +} + +static struct open_bucket *__try_alloc_bucket(struct bch_fs *c, struct bch_dev *ca, + u64 bucket, + enum bch_watermark watermark, + const struct bch_alloc_v4 *a, + struct bucket_alloc_state *s, + struct closure *cl) +{ + struct open_bucket *ob; + + if (unlikely(ca->buckets_nouse && test_bit(bucket, ca->buckets_nouse))) { + s->skipped_nouse++; + return NULL; + } + + if (bch2_bucket_is_open(c, ca->dev_idx, bucket)) { + s->skipped_open++; + return NULL; + } + + if (bch2_bucket_needs_journal_commit(&c->buckets_waiting_for_journal, + c->journal.flushed_seq_ondisk, ca->dev_idx, bucket)) { + s->skipped_need_journal_commit++; + return NULL; + } + + if (bch2_bucket_nocow_is_locked(&c->nocow_locks, POS(ca->dev_idx, bucket))) { + s->skipped_nocow++; + return NULL; + } + + spin_lock(&c->freelist_lock); + + if (unlikely(c->open_buckets_nr_free <= open_buckets_reserved(watermark))) { + if (cl) + closure_wait(&c->open_buckets_wait, cl); + + if (!c->blocked_allocate_open_bucket) + c->blocked_allocate_open_bucket = local_clock(); + + spin_unlock(&c->freelist_lock); + return ERR_PTR(-BCH_ERR_open_buckets_empty); + } + + /* Recheck under lock: */ + if (bch2_bucket_is_open(c, ca->dev_idx, bucket)) { + spin_unlock(&c->freelist_lock); + s->skipped_open++; + return NULL; + } + + ob = bch2_open_bucket_alloc(c); + + spin_lock(&ob->lock); + + ob->valid = true; + ob->sectors_free = ca->mi.bucket_size; + ob->dev = ca->dev_idx; + ob->gen = a->gen; + ob->bucket = bucket; + spin_unlock(&ob->lock); + + ca->nr_open_buckets++; + bch2_open_bucket_hash_add(c, ob); + + if (c->blocked_allocate_open_bucket) { + bch2_time_stats_update( + &c->times[BCH_TIME_blocked_allocate_open_bucket], + c->blocked_allocate_open_bucket); + c->blocked_allocate_open_bucket = 0; + } + + if (c->blocked_allocate) { + bch2_time_stats_update( + &c->times[BCH_TIME_blocked_allocate], + c->blocked_allocate); + c->blocked_allocate = 0; + } + + spin_unlock(&c->freelist_lock); + return ob; +} + +static struct open_bucket *try_alloc_bucket(struct btree_trans *trans, struct bch_dev *ca, + enum bch_watermark watermark, u64 free_entry, + struct bucket_alloc_state *s, + struct bkey_s_c freespace_k, + struct closure *cl) +{ + struct bch_fs *c = trans->c; + struct btree_iter iter = { NULL }; + struct bkey_s_c k; + struct open_bucket *ob; + struct bch_alloc_v4 a_convert; + const struct bch_alloc_v4 *a; + u64 b = free_entry & ~(~0ULL << 56); + unsigned genbits = free_entry >> 56; + struct printbuf buf = PRINTBUF; + int ret; + + if (b < ca->mi.first_bucket || b >= ca->mi.nbuckets) { + prt_printf(&buf, "freespace btree has bucket outside allowed range %u-%llu\n" + " freespace key ", + ca->mi.first_bucket, ca->mi.nbuckets); + bch2_bkey_val_to_text(&buf, c, freespace_k); + bch2_trans_inconsistent(trans, "%s", buf.buf); + ob = ERR_PTR(-EIO); + goto err; + } + + k = bch2_bkey_get_iter(trans, &iter, + BTREE_ID_alloc, POS(ca->dev_idx, b), + BTREE_ITER_CACHED); + ret = bkey_err(k); + if (ret) { + ob = ERR_PTR(ret); + goto err; + } + + a = bch2_alloc_to_v4(k, &a_convert); + + if (a->data_type != BCH_DATA_free) { + if (c->curr_recovery_pass <= BCH_RECOVERY_PASS_check_alloc_info) { + ob = NULL; + goto err; + } + + prt_printf(&buf, "non free bucket in freespace btree\n" + " freespace key "); + bch2_bkey_val_to_text(&buf, c, freespace_k); + prt_printf(&buf, "\n "); + bch2_bkey_val_to_text(&buf, c, k); + bch2_trans_inconsistent(trans, "%s", buf.buf); + ob = ERR_PTR(-EIO); + goto err; + } + + if (genbits != (alloc_freespace_genbits(*a) >> 56) && + c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_info) { + prt_printf(&buf, "bucket in freespace btree with wrong genbits (got %u should be %llu)\n" + " freespace key ", + genbits, alloc_freespace_genbits(*a) >> 56); + bch2_bkey_val_to_text(&buf, c, freespace_k); + prt_printf(&buf, "\n "); + bch2_bkey_val_to_text(&buf, c, k); + bch2_trans_inconsistent(trans, "%s", buf.buf); + ob = ERR_PTR(-EIO); + goto err; + } + + if (c->curr_recovery_pass <= BCH_RECOVERY_PASS_check_extents_to_backpointers) { + struct bch_backpointer bp; + struct bpos bp_pos = POS_MIN; + + ret = bch2_get_next_backpointer(trans, POS(ca->dev_idx, b), -1, + &bp_pos, &bp, + BTREE_ITER_NOPRESERVE); + if (ret) { + ob = ERR_PTR(ret); + goto err; + } + + if (!bkey_eq(bp_pos, POS_MAX)) { + /* + * Bucket may have data in it - we don't call + * bc2h_trans_inconnsistent() because fsck hasn't + * finished yet + */ + ob = NULL; + goto err; + } + } + + ob = __try_alloc_bucket(c, ca, b, watermark, a, s, cl); + if (!ob) + iter.path->preserve = false; +err: + if (iter.trans && iter.path) + set_btree_iter_dontneed(&iter); + bch2_trans_iter_exit(trans, &iter); + printbuf_exit(&buf); + return ob; +} + +/* + * This path is for before the freespace btree is initialized: + * + * If ca->new_fs_bucket_idx is nonzero, we haven't yet marked superblock & + * journal buckets - journal buckets will be < ca->new_fs_bucket_idx + */ +static noinline struct open_bucket * +bch2_bucket_alloc_early(struct btree_trans *trans, + struct bch_dev *ca, + enum bch_watermark watermark, + struct bucket_alloc_state *s, + struct closure *cl) +{ + struct btree_iter iter, citer; + struct bkey_s_c k, ck; + struct open_bucket *ob = NULL; + u64 first_bucket = max_t(u64, ca->mi.first_bucket, ca->new_fs_bucket_idx); + u64 alloc_start = max(first_bucket, READ_ONCE(ca->alloc_cursor)); + u64 alloc_cursor = alloc_start; + int ret; + + /* + * Scan with an uncached iterator to avoid polluting the key cache. An + * uncached iter will return a cached key if one exists, but if not + * there is no other underlying protection for the associated key cache + * slot. To avoid racing bucket allocations, look up the cached key slot + * of any likely allocation candidate before attempting to proceed with + * the allocation. This provides proper exclusion on the associated + * bucket. + */ +again: + for_each_btree_key_norestart(trans, iter, BTREE_ID_alloc, POS(ca->dev_idx, alloc_cursor), + BTREE_ITER_SLOTS, k, ret) { + struct bch_alloc_v4 a_convert; + const struct bch_alloc_v4 *a; + + if (bkey_ge(k.k->p, POS(ca->dev_idx, ca->mi.nbuckets))) + break; + + if (ca->new_fs_bucket_idx && + is_superblock_bucket(ca, k.k->p.offset)) + continue; + + a = bch2_alloc_to_v4(k, &a_convert); + if (a->data_type != BCH_DATA_free) + continue; + + /* now check the cached key to serialize concurrent allocs of the bucket */ + ck = bch2_bkey_get_iter(trans, &citer, BTREE_ID_alloc, k.k->p, BTREE_ITER_CACHED); + ret = bkey_err(ck); + if (ret) + break; + + a = bch2_alloc_to_v4(ck, &a_convert); + if (a->data_type != BCH_DATA_free) + goto next; + + s->buckets_seen++; + + ob = __try_alloc_bucket(trans->c, ca, k.k->p.offset, watermark, a, s, cl); +next: + citer.path->preserve = false; + bch2_trans_iter_exit(trans, &citer); + if (ob) + break; + } + bch2_trans_iter_exit(trans, &iter); + + alloc_cursor = iter.pos.offset; + ca->alloc_cursor = alloc_cursor; + + if (!ob && ret) + ob = ERR_PTR(ret); + + if (!ob && alloc_start > first_bucket) { + alloc_cursor = alloc_start = first_bucket; + goto again; + } + + return ob; +} + +static struct open_bucket *bch2_bucket_alloc_freelist(struct btree_trans *trans, + struct bch_dev *ca, + enum bch_watermark watermark, + struct bucket_alloc_state *s, + struct closure *cl) +{ + struct btree_iter iter; + struct bkey_s_c k; + struct open_bucket *ob = NULL; + u64 alloc_start = max_t(u64, ca->mi.first_bucket, READ_ONCE(ca->alloc_cursor)); + u64 alloc_cursor = alloc_start; + int ret; + + BUG_ON(ca->new_fs_bucket_idx); +again: + for_each_btree_key_norestart(trans, iter, BTREE_ID_freespace, + POS(ca->dev_idx, alloc_cursor), 0, k, ret) { + if (k.k->p.inode != ca->dev_idx) + break; + + for (alloc_cursor = max(alloc_cursor, bkey_start_offset(k.k)); + alloc_cursor < k.k->p.offset; + alloc_cursor++) { + ret = btree_trans_too_many_iters(trans); + if (ret) { + ob = ERR_PTR(ret); + break; + } + + s->buckets_seen++; + + ob = try_alloc_bucket(trans, ca, watermark, + alloc_cursor, s, k, cl); + if (ob) { + iter.path->preserve = false; + break; + } + } + + if (ob || ret) + break; + } + bch2_trans_iter_exit(trans, &iter); + + ca->alloc_cursor = alloc_cursor; + + if (!ob && ret) + ob = ERR_PTR(ret); + + if (!ob && alloc_start > ca->mi.first_bucket) { + alloc_cursor = alloc_start = ca->mi.first_bucket; + goto again; + } + + return ob; +} + +/** + * bch2_bucket_alloc_trans - allocate a single bucket from a specific device + * @trans: transaction object + * @ca: device to allocate from + * @watermark: how important is this allocation? + * @cl: if not NULL, closure to be used to wait if buckets not available + * @usage: for secondarily also returning the current device usage + * + * Returns: an open_bucket on success, or an ERR_PTR() on failure. + */ +static struct open_bucket *bch2_bucket_alloc_trans(struct btree_trans *trans, + struct bch_dev *ca, + enum bch_watermark watermark, + struct closure *cl, + struct bch_dev_usage *usage) +{ + struct bch_fs *c = trans->c; + struct open_bucket *ob = NULL; + bool freespace = READ_ONCE(ca->mi.freespace_initialized); + u64 avail; + struct bucket_alloc_state s = { 0 }; + bool waiting = false; +again: + bch2_dev_usage_read_fast(ca, usage); + avail = dev_buckets_free(ca, *usage, watermark); + + if (usage->d[BCH_DATA_need_discard].buckets > avail) + bch2_do_discards(c); + + if (usage->d[BCH_DATA_need_gc_gens].buckets > avail) + bch2_do_gc_gens(c); + + if (should_invalidate_buckets(ca, *usage)) + bch2_do_invalidates(c); + + if (!avail) { + if (cl && !waiting) { + closure_wait(&c->freelist_wait, cl); + waiting = true; + goto again; + } + + if (!c->blocked_allocate) + c->blocked_allocate = local_clock(); + + ob = ERR_PTR(-BCH_ERR_freelist_empty); + goto err; + } + + if (waiting) + closure_wake_up(&c->freelist_wait); +alloc: + ob = likely(freespace) + ? bch2_bucket_alloc_freelist(trans, ca, watermark, &s, cl) + : bch2_bucket_alloc_early(trans, ca, watermark, &s, cl); + + if (s.skipped_need_journal_commit * 2 > avail) + bch2_journal_flush_async(&c->journal, NULL); + + if (!ob && freespace && c->curr_recovery_pass <= BCH_RECOVERY_PASS_check_alloc_info) { + freespace = false; + goto alloc; + } +err: + if (!ob) + ob = ERR_PTR(-BCH_ERR_no_buckets_found); + + if (!IS_ERR(ob)) + trace_and_count(c, bucket_alloc, ca, + bch2_watermarks[watermark], + ob->bucket, + usage->d[BCH_DATA_free].buckets, + avail, + bch2_copygc_wait_amount(c), + c->copygc_wait - atomic64_read(&c->io_clock[WRITE].now), + &s, + cl == NULL, + ""); + else if (!bch2_err_matches(PTR_ERR(ob), BCH_ERR_transaction_restart)) + trace_and_count(c, bucket_alloc_fail, ca, + bch2_watermarks[watermark], + 0, + usage->d[BCH_DATA_free].buckets, + avail, + bch2_copygc_wait_amount(c), + c->copygc_wait - atomic64_read(&c->io_clock[WRITE].now), + &s, + cl == NULL, + bch2_err_str(PTR_ERR(ob))); + + return ob; +} + +struct open_bucket *bch2_bucket_alloc(struct bch_fs *c, struct bch_dev *ca, + enum bch_watermark watermark, + struct closure *cl) +{ + struct bch_dev_usage usage; + struct open_bucket *ob; + + bch2_trans_do(c, NULL, NULL, 0, + PTR_ERR_OR_ZERO(ob = bch2_bucket_alloc_trans(trans, ca, watermark, + cl, &usage))); + return ob; +} + +static int __dev_stripe_cmp(struct dev_stripe_state *stripe, + unsigned l, unsigned r) +{ + return ((stripe->next_alloc[l] > stripe->next_alloc[r]) - + (stripe->next_alloc[l] < stripe->next_alloc[r])); +} + +#define dev_stripe_cmp(l, r) __dev_stripe_cmp(stripe, l, r) + +struct dev_alloc_list bch2_dev_alloc_list(struct bch_fs *c, + struct dev_stripe_state *stripe, + struct bch_devs_mask *devs) +{ + struct dev_alloc_list ret = { .nr = 0 }; + unsigned i; + + for_each_set_bit(i, devs->d, BCH_SB_MEMBERS_MAX) + ret.devs[ret.nr++] = i; + + bubble_sort(ret.devs, ret.nr, dev_stripe_cmp); + return ret; +} + +static inline void bch2_dev_stripe_increment_inlined(struct bch_dev *ca, + struct dev_stripe_state *stripe, + struct bch_dev_usage *usage) +{ + u64 *v = stripe->next_alloc + ca->dev_idx; + u64 free_space = dev_buckets_available(ca, BCH_WATERMARK_normal); + u64 free_space_inv = free_space + ? div64_u64(1ULL << 48, free_space) + : 1ULL << 48; + u64 scale = *v / 4; + + if (*v + free_space_inv >= *v) + *v += free_space_inv; + else + *v = U64_MAX; + + for (v = stripe->next_alloc; + v < stripe->next_alloc + ARRAY_SIZE(stripe->next_alloc); v++) + *v = *v < scale ? 0 : *v - scale; +} + +void bch2_dev_stripe_increment(struct bch_dev *ca, + struct dev_stripe_state *stripe) +{ + struct bch_dev_usage usage; + + bch2_dev_usage_read_fast(ca, &usage); + bch2_dev_stripe_increment_inlined(ca, stripe, &usage); +} + +static int add_new_bucket(struct bch_fs *c, + struct open_buckets *ptrs, + struct bch_devs_mask *devs_may_alloc, + unsigned nr_replicas, + unsigned *nr_effective, + bool *have_cache, + unsigned flags, + struct open_bucket *ob) +{ + unsigned durability = + bch_dev_bkey_exists(c, ob->dev)->mi.durability; + + BUG_ON(*nr_effective >= nr_replicas); + BUG_ON(flags & BCH_WRITE_ONLY_SPECIFIED_DEVS); + + __clear_bit(ob->dev, devs_may_alloc->d); + *nr_effective += (flags & BCH_WRITE_ONLY_SPECIFIED_DEVS) + ? durability : 1; + *have_cache |= !durability; + + ob_push(c, ptrs, ob); + + if (*nr_effective >= nr_replicas) + return 1; + if (ob->ec) + return 1; + return 0; +} + +int bch2_bucket_alloc_set_trans(struct btree_trans *trans, + struct open_buckets *ptrs, + struct dev_stripe_state *stripe, + struct bch_devs_mask *devs_may_alloc, + unsigned nr_replicas, + unsigned *nr_effective, + bool *have_cache, + unsigned flags, + enum bch_data_type data_type, + enum bch_watermark watermark, + struct closure *cl) +{ + struct bch_fs *c = trans->c; + struct dev_alloc_list devs_sorted = + bch2_dev_alloc_list(c, stripe, devs_may_alloc); + unsigned dev; + struct bch_dev *ca; + int ret = -BCH_ERR_insufficient_devices; + unsigned i; + + BUG_ON(*nr_effective >= nr_replicas); + + for (i = 0; i < devs_sorted.nr; i++) { + struct bch_dev_usage usage; + struct open_bucket *ob; + + dev = devs_sorted.devs[i]; + + rcu_read_lock(); + ca = rcu_dereference(c->devs[dev]); + if (ca) + percpu_ref_get(&ca->ref); + rcu_read_unlock(); + + if (!ca) + continue; + + if (!ca->mi.durability && *have_cache) { + percpu_ref_put(&ca->ref); + continue; + } + + ob = bch2_bucket_alloc_trans(trans, ca, watermark, cl, &usage); + if (!IS_ERR(ob)) + bch2_dev_stripe_increment_inlined(ca, stripe, &usage); + percpu_ref_put(&ca->ref); + + if (IS_ERR(ob)) { + ret = PTR_ERR(ob); + if (bch2_err_matches(ret, BCH_ERR_transaction_restart) || cl) + break; + continue; + } + + ob->data_type = data_type; + + if (add_new_bucket(c, ptrs, devs_may_alloc, + nr_replicas, nr_effective, + have_cache, flags, ob)) { + ret = 0; + break; + } + } + + return ret; +} + +/* Allocate from stripes: */ + +/* + * if we can't allocate a new stripe because there are already too many + * partially filled stripes, force allocating from an existing stripe even when + * it's to a device we don't want: + */ + +static int bucket_alloc_from_stripe(struct btree_trans *trans, + struct open_buckets *ptrs, + struct write_point *wp, + struct bch_devs_mask *devs_may_alloc, + u16 target, + unsigned nr_replicas, + unsigned *nr_effective, + bool *have_cache, + enum bch_watermark watermark, + unsigned flags, + struct closure *cl) +{ + struct bch_fs *c = trans->c; + struct dev_alloc_list devs_sorted; + struct ec_stripe_head *h; + struct open_bucket *ob; + unsigned i, ec_idx; + int ret = 0; + + if (nr_replicas < 2) + return 0; + + if (ec_open_bucket(c, ptrs)) + return 0; + + h = bch2_ec_stripe_head_get(trans, target, 0, nr_replicas - 1, watermark, cl); + if (IS_ERR(h)) + return PTR_ERR(h); + if (!h) + return 0; + + devs_sorted = bch2_dev_alloc_list(c, &wp->stripe, devs_may_alloc); + + for (i = 0; i < devs_sorted.nr; i++) + for (ec_idx = 0; ec_idx < h->s->nr_data; ec_idx++) { + if (!h->s->blocks[ec_idx]) + continue; + + ob = c->open_buckets + h->s->blocks[ec_idx]; + if (ob->dev == devs_sorted.devs[i] && + !test_and_set_bit(ec_idx, h->s->blocks_allocated)) + goto got_bucket; + } + goto out_put_head; +got_bucket: + ob->ec_idx = ec_idx; + ob->ec = h->s; + ec_stripe_new_get(h->s, STRIPE_REF_io); + + ret = add_new_bucket(c, ptrs, devs_may_alloc, + nr_replicas, nr_effective, + have_cache, flags, ob); +out_put_head: + bch2_ec_stripe_head_put(c, h); + return ret; +} + +/* Sector allocator */ + +static bool want_bucket(struct bch_fs *c, + struct write_point *wp, + struct bch_devs_mask *devs_may_alloc, + bool *have_cache, bool ec, + struct open_bucket *ob) +{ + struct bch_dev *ca = bch_dev_bkey_exists(c, ob->dev); + + if (!test_bit(ob->dev, devs_may_alloc->d)) + return false; + + if (ob->data_type != wp->data_type) + return false; + + if (!ca->mi.durability && + (wp->data_type == BCH_DATA_btree || ec || *have_cache)) + return false; + + if (ec != (ob->ec != NULL)) + return false; + + return true; +} + +static int bucket_alloc_set_writepoint(struct bch_fs *c, + struct open_buckets *ptrs, + struct write_point *wp, + struct bch_devs_mask *devs_may_alloc, + unsigned nr_replicas, + unsigned *nr_effective, + bool *have_cache, + bool ec, unsigned flags) +{ + struct open_buckets ptrs_skip = { .nr = 0 }; + struct open_bucket *ob; + unsigned i; + int ret = 0; + + open_bucket_for_each(c, &wp->ptrs, ob, i) { + if (!ret && want_bucket(c, wp, devs_may_alloc, + have_cache, ec, ob)) + ret = add_new_bucket(c, ptrs, devs_may_alloc, + nr_replicas, nr_effective, + have_cache, flags, ob); + else + ob_push(c, &ptrs_skip, ob); + } + wp->ptrs = ptrs_skip; + + return ret; +} + +static int bucket_alloc_set_partial(struct bch_fs *c, + struct open_buckets *ptrs, + struct write_point *wp, + struct bch_devs_mask *devs_may_alloc, + unsigned nr_replicas, + unsigned *nr_effective, + bool *have_cache, bool ec, + enum bch_watermark watermark, + unsigned flags) +{ + int i, ret = 0; + + if (!c->open_buckets_partial_nr) + return 0; + + spin_lock(&c->freelist_lock); + + if (!c->open_buckets_partial_nr) + goto unlock; + + for (i = c->open_buckets_partial_nr - 1; i >= 0; --i) { + struct open_bucket *ob = c->open_buckets + c->open_buckets_partial[i]; + + if (want_bucket(c, wp, devs_may_alloc, have_cache, ec, ob)) { + struct bch_dev *ca = bch_dev_bkey_exists(c, ob->dev); + struct bch_dev_usage usage; + u64 avail; + + bch2_dev_usage_read_fast(ca, &usage); + avail = dev_buckets_free(ca, usage, watermark); + if (!avail) + continue; + + array_remove_item(c->open_buckets_partial, + c->open_buckets_partial_nr, + i); + ob->on_partial_list = false; + + ret = add_new_bucket(c, ptrs, devs_may_alloc, + nr_replicas, nr_effective, + have_cache, flags, ob); + if (ret) + break; + } + } +unlock: + spin_unlock(&c->freelist_lock); + return ret; +} + +static int __open_bucket_add_buckets(struct btree_trans *trans, + struct open_buckets *ptrs, + struct write_point *wp, + struct bch_devs_list *devs_have, + u16 target, + bool erasure_code, + unsigned nr_replicas, + unsigned *nr_effective, + bool *have_cache, + enum bch_watermark watermark, + unsigned flags, + struct closure *_cl) +{ + struct bch_fs *c = trans->c; + struct bch_devs_mask devs; + struct open_bucket *ob; + struct closure *cl = NULL; + unsigned i; + int ret; + + devs = target_rw_devs(c, wp->data_type, target); + + /* Don't allocate from devices we already have pointers to: */ + for (i = 0; i < devs_have->nr; i++) + __clear_bit(devs_have->devs[i], devs.d); + + open_bucket_for_each(c, ptrs, ob, i) + __clear_bit(ob->dev, devs.d); + + if (erasure_code && ec_open_bucket(c, ptrs)) + return 0; + + ret = bucket_alloc_set_writepoint(c, ptrs, wp, &devs, + nr_replicas, nr_effective, + have_cache, erasure_code, flags); + if (ret) + return ret; + + ret = bucket_alloc_set_partial(c, ptrs, wp, &devs, + nr_replicas, nr_effective, + have_cache, erasure_code, watermark, flags); + if (ret) + return ret; + + if (erasure_code) { + ret = bucket_alloc_from_stripe(trans, ptrs, wp, &devs, + target, + nr_replicas, nr_effective, + have_cache, + watermark, flags, _cl); + } else { +retry_blocking: + /* + * Try nonblocking first, so that if one device is full we'll try from + * other devices: + */ + ret = bch2_bucket_alloc_set_trans(trans, ptrs, &wp->stripe, &devs, + nr_replicas, nr_effective, have_cache, + flags, wp->data_type, watermark, cl); + if (ret && + !bch2_err_matches(ret, BCH_ERR_transaction_restart) && + !bch2_err_matches(ret, BCH_ERR_insufficient_devices) && + !cl && _cl) { + cl = _cl; + goto retry_blocking; + } + } + + return ret; +} + +static int open_bucket_add_buckets(struct btree_trans *trans, + struct open_buckets *ptrs, + struct write_point *wp, + struct bch_devs_list *devs_have, + u16 target, + unsigned erasure_code, + unsigned nr_replicas, + unsigned *nr_effective, + bool *have_cache, + enum bch_watermark watermark, + unsigned flags, + struct closure *cl) +{ + int ret; + + if (erasure_code) { + ret = __open_bucket_add_buckets(trans, ptrs, wp, + devs_have, target, erasure_code, + nr_replicas, nr_effective, have_cache, + watermark, flags, cl); + if (bch2_err_matches(ret, BCH_ERR_transaction_restart) || + bch2_err_matches(ret, BCH_ERR_operation_blocked) || + bch2_err_matches(ret, BCH_ERR_freelist_empty) || + bch2_err_matches(ret, BCH_ERR_open_buckets_empty)) + return ret; + if (*nr_effective >= nr_replicas) + return 0; + } + + ret = __open_bucket_add_buckets(trans, ptrs, wp, + devs_have, target, false, + nr_replicas, nr_effective, have_cache, + watermark, flags, cl); + return ret < 0 ? ret : 0; +} + +/** + * should_drop_bucket - check if this is open_bucket should go away + * @ob: open_bucket to predicate on + * @c: filesystem handle + * @ca: if set, we're killing buckets for a particular device + * @ec: if true, we're shutting down erasure coding and killing all ec + * open_buckets + * otherwise, return true + * Returns: true if we should kill this open_bucket + * + * We're killing open_buckets because we're shutting down a device, erasure + * coding, or the entire filesystem - check if this open_bucket matches: + */ +static bool should_drop_bucket(struct open_bucket *ob, struct bch_fs *c, + struct bch_dev *ca, bool ec) +{ + if (ec) { + return ob->ec != NULL; + } else if (ca) { + bool drop = ob->dev == ca->dev_idx; + struct open_bucket *ob2; + unsigned i; + + if (!drop && ob->ec) { + unsigned nr_blocks; + + mutex_lock(&ob->ec->lock); + nr_blocks = bkey_i_to_stripe(&ob->ec->new_stripe.key)->v.nr_blocks; + + for (i = 0; i < nr_blocks; i++) { + if (!ob->ec->blocks[i]) + continue; + + ob2 = c->open_buckets + ob->ec->blocks[i]; + drop |= ob2->dev == ca->dev_idx; + } + mutex_unlock(&ob->ec->lock); + } + + return drop; + } else { + return true; + } +} + +static void bch2_writepoint_stop(struct bch_fs *c, struct bch_dev *ca, + bool ec, struct write_point *wp) +{ + struct open_buckets ptrs = { .nr = 0 }; + struct open_bucket *ob; + unsigned i; + + mutex_lock(&wp->lock); + open_bucket_for_each(c, &wp->ptrs, ob, i) + if (should_drop_bucket(ob, c, ca, ec)) + bch2_open_bucket_put(c, ob); + else + ob_push(c, &ptrs, ob); + wp->ptrs = ptrs; + mutex_unlock(&wp->lock); +} + +void bch2_open_buckets_stop(struct bch_fs *c, struct bch_dev *ca, + bool ec) +{ + unsigned i; + + /* Next, close write points that point to this device... */ + for (i = 0; i < ARRAY_SIZE(c->write_points); i++) + bch2_writepoint_stop(c, ca, ec, &c->write_points[i]); + + bch2_writepoint_stop(c, ca, ec, &c->copygc_write_point); + bch2_writepoint_stop(c, ca, ec, &c->rebalance_write_point); + bch2_writepoint_stop(c, ca, ec, &c->btree_write_point); + + mutex_lock(&c->btree_reserve_cache_lock); + while (c->btree_reserve_cache_nr) { + struct btree_alloc *a = + &c->btree_reserve_cache[--c->btree_reserve_cache_nr]; + + bch2_open_buckets_put(c, &a->ob); + } + mutex_unlock(&c->btree_reserve_cache_lock); + + spin_lock(&c->freelist_lock); + i = 0; + while (i < c->open_buckets_partial_nr) { + struct open_bucket *ob = + c->open_buckets + c->open_buckets_partial[i]; + + if (should_drop_bucket(ob, c, ca, ec)) { + --c->open_buckets_partial_nr; + swap(c->open_buckets_partial[i], + c->open_buckets_partial[c->open_buckets_partial_nr]); + ob->on_partial_list = false; + spin_unlock(&c->freelist_lock); + bch2_open_bucket_put(c, ob); + spin_lock(&c->freelist_lock); + } else { + i++; + } + } + spin_unlock(&c->freelist_lock); + + bch2_ec_stop_dev(c, ca); +} + +static inline struct hlist_head *writepoint_hash(struct bch_fs *c, + unsigned long write_point) +{ + unsigned hash = + hash_long(write_point, ilog2(ARRAY_SIZE(c->write_points_hash))); + + return &c->write_points_hash[hash]; +} + +static struct write_point *__writepoint_find(struct hlist_head *head, + unsigned long write_point) +{ + struct write_point *wp; + + rcu_read_lock(); + hlist_for_each_entry_rcu(wp, head, node) + if (wp->write_point == write_point) + goto out; + wp = NULL; +out: + rcu_read_unlock(); + return wp; +} + +static inline bool too_many_writepoints(struct bch_fs *c, unsigned factor) +{ + u64 stranded = c->write_points_nr * c->bucket_size_max; + u64 free = bch2_fs_usage_read_short(c).free; + + return stranded * factor > free; +} + +static bool try_increase_writepoints(struct bch_fs *c) +{ + struct write_point *wp; + + if (c->write_points_nr == ARRAY_SIZE(c->write_points) || + too_many_writepoints(c, 32)) + return false; + + wp = c->write_points + c->write_points_nr++; + hlist_add_head_rcu(&wp->node, writepoint_hash(c, wp->write_point)); + return true; +} + +static bool try_decrease_writepoints(struct btree_trans *trans, unsigned old_nr) +{ + struct bch_fs *c = trans->c; + struct write_point *wp; + struct open_bucket *ob; + unsigned i; + + mutex_lock(&c->write_points_hash_lock); + if (c->write_points_nr < old_nr) { + mutex_unlock(&c->write_points_hash_lock); + return true; + } + + if (c->write_points_nr == 1 || + !too_many_writepoints(c, 8)) { + mutex_unlock(&c->write_points_hash_lock); + return false; + } + + wp = c->write_points + --c->write_points_nr; + + hlist_del_rcu(&wp->node); + mutex_unlock(&c->write_points_hash_lock); + + bch2_trans_mutex_lock_norelock(trans, &wp->lock); + open_bucket_for_each(c, &wp->ptrs, ob, i) + open_bucket_free_unused(c, ob); + wp->ptrs.nr = 0; + mutex_unlock(&wp->lock); + return true; +} + +static struct write_point *writepoint_find(struct btree_trans *trans, + unsigned long write_point) +{ + struct bch_fs *c = trans->c; + struct write_point *wp, *oldest; + struct hlist_head *head; + + if (!(write_point & 1UL)) { + wp = (struct write_point *) write_point; + bch2_trans_mutex_lock_norelock(trans, &wp->lock); + return wp; + } + + head = writepoint_hash(c, write_point); +restart_find: + wp = __writepoint_find(head, write_point); + if (wp) { +lock_wp: + bch2_trans_mutex_lock_norelock(trans, &wp->lock); + if (wp->write_point == write_point) + goto out; + mutex_unlock(&wp->lock); + goto restart_find; + } +restart_find_oldest: + oldest = NULL; + for (wp = c->write_points; + wp < c->write_points + c->write_points_nr; wp++) + if (!oldest || time_before64(wp->last_used, oldest->last_used)) + oldest = wp; + + bch2_trans_mutex_lock_norelock(trans, &oldest->lock); + bch2_trans_mutex_lock_norelock(trans, &c->write_points_hash_lock); + if (oldest >= c->write_points + c->write_points_nr || + try_increase_writepoints(c)) { + mutex_unlock(&c->write_points_hash_lock); + mutex_unlock(&oldest->lock); + goto restart_find_oldest; + } + + wp = __writepoint_find(head, write_point); + if (wp && wp != oldest) { + mutex_unlock(&c->write_points_hash_lock); + mutex_unlock(&oldest->lock); + goto lock_wp; + } + + wp = oldest; + hlist_del_rcu(&wp->node); + wp->write_point = write_point; + hlist_add_head_rcu(&wp->node, head); + mutex_unlock(&c->write_points_hash_lock); +out: + wp->last_used = local_clock(); + return wp; +} + +static noinline void +deallocate_extra_replicas(struct bch_fs *c, + struct open_buckets *ptrs, + struct open_buckets *ptrs_no_use, + unsigned extra_replicas) +{ + struct open_buckets ptrs2 = { 0 }; + struct open_bucket *ob; + unsigned i; + + open_bucket_for_each(c, ptrs, ob, i) { + unsigned d = bch_dev_bkey_exists(c, ob->dev)->mi.durability; + + if (d && d <= extra_replicas) { + extra_replicas -= d; + ob_push(c, ptrs_no_use, ob); + } else { + ob_push(c, &ptrs2, ob); + } + } + + *ptrs = ptrs2; +} + +/* + * Get us an open_bucket we can allocate from, return with it locked: + */ +int bch2_alloc_sectors_start_trans(struct btree_trans *trans, + unsigned target, + unsigned erasure_code, + struct write_point_specifier write_point, + struct bch_devs_list *devs_have, + unsigned nr_replicas, + unsigned nr_replicas_required, + enum bch_watermark watermark, + unsigned flags, + struct closure *cl, + struct write_point **wp_ret) +{ + struct bch_fs *c = trans->c; + struct write_point *wp; + struct open_bucket *ob; + struct open_buckets ptrs; + unsigned nr_effective, write_points_nr; + bool have_cache; + int ret; + int i; + + if (!IS_ENABLED(CONFIG_BCACHEFS_ERASURE_CODING)) + erasure_code = false; + + BUG_ON(flags & BCH_WRITE_ONLY_SPECIFIED_DEVS); + + BUG_ON(!nr_replicas || !nr_replicas_required); +retry: + ptrs.nr = 0; + nr_effective = 0; + write_points_nr = c->write_points_nr; + have_cache = false; + + *wp_ret = wp = writepoint_find(trans, write_point.v); + + /* metadata may not allocate on cache devices: */ + if (wp->data_type != BCH_DATA_user) + have_cache = true; + + if (target && !(flags & BCH_WRITE_ONLY_SPECIFIED_DEVS)) { + ret = open_bucket_add_buckets(trans, &ptrs, wp, devs_have, + target, erasure_code, + nr_replicas, &nr_effective, + &have_cache, watermark, + flags, NULL); + if (!ret || + bch2_err_matches(ret, BCH_ERR_transaction_restart)) + goto alloc_done; + + /* Don't retry from all devices if we're out of open buckets: */ + if (bch2_err_matches(ret, BCH_ERR_open_buckets_empty)) { + int ret = open_bucket_add_buckets(trans, &ptrs, wp, devs_have, + target, erasure_code, + nr_replicas, &nr_effective, + &have_cache, watermark, + flags, cl); + if (!ret || + bch2_err_matches(ret, BCH_ERR_transaction_restart) || + bch2_err_matches(ret, BCH_ERR_open_buckets_empty)) + goto alloc_done; + } + + /* + * Only try to allocate cache (durability = 0 devices) from the + * specified target: + */ + have_cache = true; + + ret = open_bucket_add_buckets(trans, &ptrs, wp, devs_have, + 0, erasure_code, + nr_replicas, &nr_effective, + &have_cache, watermark, + flags, cl); + } else { + ret = open_bucket_add_buckets(trans, &ptrs, wp, devs_have, + target, erasure_code, + nr_replicas, &nr_effective, + &have_cache, watermark, + flags, cl); + } +alloc_done: + BUG_ON(!ret && nr_effective < nr_replicas); + + if (erasure_code && !ec_open_bucket(c, &ptrs)) + pr_debug("failed to get ec bucket: ret %u", ret); + + if (ret == -BCH_ERR_insufficient_devices && + nr_effective >= nr_replicas_required) + ret = 0; + + if (ret) + goto err; + + if (nr_effective > nr_replicas) + deallocate_extra_replicas(c, &ptrs, &wp->ptrs, nr_effective - nr_replicas); + + /* Free buckets we didn't use: */ + open_bucket_for_each(c, &wp->ptrs, ob, i) + open_bucket_free_unused(c, ob); + + wp->ptrs = ptrs; + + wp->sectors_free = UINT_MAX; + + open_bucket_for_each(c, &wp->ptrs, ob, i) + wp->sectors_free = min(wp->sectors_free, ob->sectors_free); + + BUG_ON(!wp->sectors_free || wp->sectors_free == UINT_MAX); + + return 0; +err: + open_bucket_for_each(c, &wp->ptrs, ob, i) + if (ptrs.nr < ARRAY_SIZE(ptrs.v)) + ob_push(c, &ptrs, ob); + else + open_bucket_free_unused(c, ob); + wp->ptrs = ptrs; + + mutex_unlock(&wp->lock); + + if (bch2_err_matches(ret, BCH_ERR_freelist_empty) && + try_decrease_writepoints(trans, write_points_nr)) + goto retry; + + if (bch2_err_matches(ret, BCH_ERR_open_buckets_empty) || + bch2_err_matches(ret, BCH_ERR_freelist_empty)) + return cl + ? -BCH_ERR_bucket_alloc_blocked + : -BCH_ERR_ENOSPC_bucket_alloc; + + return ret; +} + +struct bch_extent_ptr bch2_ob_ptr(struct bch_fs *c, struct open_bucket *ob) +{ + struct bch_dev *ca = bch_dev_bkey_exists(c, ob->dev); + + return (struct bch_extent_ptr) { + .type = 1 << BCH_EXTENT_ENTRY_ptr, + .gen = ob->gen, + .dev = ob->dev, + .offset = bucket_to_sector(ca, ob->bucket) + + ca->mi.bucket_size - + ob->sectors_free, + }; +} + +void bch2_alloc_sectors_append_ptrs(struct bch_fs *c, struct write_point *wp, + struct bkey_i *k, unsigned sectors, + bool cached) +{ + bch2_alloc_sectors_append_ptrs_inlined(c, wp, k, sectors, cached); +} + +/* + * Append pointers to the space we just allocated to @k, and mark @sectors space + * as allocated out of @ob + */ +void bch2_alloc_sectors_done(struct bch_fs *c, struct write_point *wp) +{ + bch2_alloc_sectors_done_inlined(c, wp); +} + +static inline void writepoint_init(struct write_point *wp, + enum bch_data_type type) +{ + mutex_init(&wp->lock); + wp->data_type = type; + + INIT_WORK(&wp->index_update_work, bch2_write_point_do_index_updates); + INIT_LIST_HEAD(&wp->writes); + spin_lock_init(&wp->writes_lock); +} + +void bch2_fs_allocator_foreground_init(struct bch_fs *c) +{ + struct open_bucket *ob; + struct write_point *wp; + + mutex_init(&c->write_points_hash_lock); + c->write_points_nr = ARRAY_SIZE(c->write_points); + + /* open bucket 0 is a sentinal NULL: */ + spin_lock_init(&c->open_buckets[0].lock); + + for (ob = c->open_buckets + 1; + ob < c->open_buckets + ARRAY_SIZE(c->open_buckets); ob++) { + spin_lock_init(&ob->lock); + c->open_buckets_nr_free++; + + ob->freelist = c->open_buckets_freelist; + c->open_buckets_freelist = ob - c->open_buckets; + } + + writepoint_init(&c->btree_write_point, BCH_DATA_btree); + writepoint_init(&c->rebalance_write_point, BCH_DATA_user); + writepoint_init(&c->copygc_write_point, BCH_DATA_user); + + for (wp = c->write_points; + wp < c->write_points + c->write_points_nr; wp++) { + writepoint_init(wp, BCH_DATA_user); + + wp->last_used = local_clock(); + wp->write_point = (unsigned long) wp; + hlist_add_head_rcu(&wp->node, + writepoint_hash(c, wp->write_point)); + } +} + +static void bch2_open_bucket_to_text(struct printbuf *out, struct bch_fs *c, struct open_bucket *ob) +{ + struct bch_dev *ca = bch_dev_bkey_exists(c, ob->dev); + unsigned data_type = ob->data_type; + barrier(); /* READ_ONCE() doesn't work on bitfields */ + + prt_printf(out, "%zu ref %u %s %u:%llu gen %u allocated %u/%u", + ob - c->open_buckets, + atomic_read(&ob->pin), + data_type < BCH_DATA_NR ? bch2_data_types[data_type] : "invalid data type", + ob->dev, ob->bucket, ob->gen, + ca->mi.bucket_size - ob->sectors_free, ca->mi.bucket_size); + if (ob->ec) + prt_printf(out, " ec idx %llu", ob->ec->idx); + if (ob->on_partial_list) + prt_str(out, " partial"); + prt_newline(out); +} + +void bch2_open_buckets_to_text(struct printbuf *out, struct bch_fs *c) +{ + struct open_bucket *ob; + + out->atomic++; + + for (ob = c->open_buckets; + ob < c->open_buckets + ARRAY_SIZE(c->open_buckets); + ob++) { + spin_lock(&ob->lock); + if (ob->valid && !ob->on_partial_list) + bch2_open_bucket_to_text(out, c, ob); + spin_unlock(&ob->lock); + } + + --out->atomic; +} + +void bch2_open_buckets_partial_to_text(struct printbuf *out, struct bch_fs *c) +{ + unsigned i; + + out->atomic++; + spin_lock(&c->freelist_lock); + + for (i = 0; i < c->open_buckets_partial_nr; i++) + bch2_open_bucket_to_text(out, c, + c->open_buckets + c->open_buckets_partial[i]); + + spin_unlock(&c->freelist_lock); + --out->atomic; +} + +static const char * const bch2_write_point_states[] = { +#define x(n) #n, + WRITE_POINT_STATES() +#undef x + NULL +}; + +static void bch2_write_point_to_text(struct printbuf *out, struct bch_fs *c, + struct write_point *wp) +{ + struct open_bucket *ob; + unsigned i; + + prt_printf(out, "%lu: ", wp->write_point); + prt_human_readable_u64(out, wp->sectors_allocated); + + prt_printf(out, " last wrote: "); + bch2_pr_time_units(out, sched_clock() - wp->last_used); + + for (i = 0; i < WRITE_POINT_STATE_NR; i++) { + prt_printf(out, " %s: ", bch2_write_point_states[i]); + bch2_pr_time_units(out, wp->time[i]); + } + + prt_newline(out); + + printbuf_indent_add(out, 2); + open_bucket_for_each(c, &wp->ptrs, ob, i) + bch2_open_bucket_to_text(out, c, ob); + printbuf_indent_sub(out, 2); +} + +void bch2_write_points_to_text(struct printbuf *out, struct bch_fs *c) +{ + struct write_point *wp; + + prt_str(out, "Foreground write points\n"); + for (wp = c->write_points; + wp < c->write_points + ARRAY_SIZE(c->write_points); + wp++) + bch2_write_point_to_text(out, c, wp); + + prt_str(out, "Copygc write point\n"); + bch2_write_point_to_text(out, c, &c->copygc_write_point); + + prt_str(out, "Rebalance write point\n"); + bch2_write_point_to_text(out, c, &c->rebalance_write_point); + + prt_str(out, "Btree write point\n"); + bch2_write_point_to_text(out, c, &c->btree_write_point); +} diff --git a/fs/bcachefs/alloc_foreground.h b/fs/bcachefs/alloc_foreground.h new file mode 100644 index 000000000000..7aaeec44c746 --- /dev/null +++ b/fs/bcachefs/alloc_foreground.h @@ -0,0 +1,224 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_ALLOC_FOREGROUND_H +#define _BCACHEFS_ALLOC_FOREGROUND_H + +#include "bcachefs.h" +#include "alloc_types.h" +#include "extents.h" +#include "sb-members.h" + +#include <linux/hash.h> + +struct bkey; +struct bch_dev; +struct bch_fs; +struct bch_devs_List; + +extern const char * const bch2_watermarks[]; + +void bch2_reset_alloc_cursors(struct bch_fs *); + +struct dev_alloc_list { + unsigned nr; + u8 devs[BCH_SB_MEMBERS_MAX]; +}; + +struct dev_alloc_list bch2_dev_alloc_list(struct bch_fs *, + struct dev_stripe_state *, + struct bch_devs_mask *); +void bch2_dev_stripe_increment(struct bch_dev *, struct dev_stripe_state *); + +long bch2_bucket_alloc_new_fs(struct bch_dev *); + +struct open_bucket *bch2_bucket_alloc(struct bch_fs *, struct bch_dev *, + enum bch_watermark, struct closure *); + +static inline void ob_push(struct bch_fs *c, struct open_buckets *obs, + struct open_bucket *ob) +{ + BUG_ON(obs->nr >= ARRAY_SIZE(obs->v)); + + obs->v[obs->nr++] = ob - c->open_buckets; +} + +#define open_bucket_for_each(_c, _obs, _ob, _i) \ + for ((_i) = 0; \ + (_i) < (_obs)->nr && \ + ((_ob) = (_c)->open_buckets + (_obs)->v[_i], true); \ + (_i)++) + +static inline struct open_bucket *ec_open_bucket(struct bch_fs *c, + struct open_buckets *obs) +{ + struct open_bucket *ob; + unsigned i; + + open_bucket_for_each(c, obs, ob, i) + if (ob->ec) + return ob; + + return NULL; +} + +void bch2_open_bucket_write_error(struct bch_fs *, + struct open_buckets *, unsigned); + +void __bch2_open_bucket_put(struct bch_fs *, struct open_bucket *); + +static inline void bch2_open_bucket_put(struct bch_fs *c, struct open_bucket *ob) +{ + if (atomic_dec_and_test(&ob->pin)) + __bch2_open_bucket_put(c, ob); +} + +static inline void bch2_open_buckets_put(struct bch_fs *c, + struct open_buckets *ptrs) +{ + struct open_bucket *ob; + unsigned i; + + open_bucket_for_each(c, ptrs, ob, i) + bch2_open_bucket_put(c, ob); + ptrs->nr = 0; +} + +static inline void bch2_alloc_sectors_done_inlined(struct bch_fs *c, struct write_point *wp) +{ + struct open_buckets ptrs = { .nr = 0 }, keep = { .nr = 0 }; + struct open_bucket *ob; + unsigned i; + + open_bucket_for_each(c, &wp->ptrs, ob, i) + ob_push(c, !ob->sectors_free ? &ptrs : &keep, ob); + wp->ptrs = keep; + + mutex_unlock(&wp->lock); + + bch2_open_buckets_put(c, &ptrs); +} + +static inline void bch2_open_bucket_get(struct bch_fs *c, + struct write_point *wp, + struct open_buckets *ptrs) +{ + struct open_bucket *ob; + unsigned i; + + open_bucket_for_each(c, &wp->ptrs, ob, i) { + ob->data_type = wp->data_type; + atomic_inc(&ob->pin); + ob_push(c, ptrs, ob); + } +} + +static inline open_bucket_idx_t *open_bucket_hashslot(struct bch_fs *c, + unsigned dev, u64 bucket) +{ + return c->open_buckets_hash + + (jhash_3words(dev, bucket, bucket >> 32, 0) & + (OPEN_BUCKETS_COUNT - 1)); +} + +static inline bool bch2_bucket_is_open(struct bch_fs *c, unsigned dev, u64 bucket) +{ + open_bucket_idx_t slot = *open_bucket_hashslot(c, dev, bucket); + + while (slot) { + struct open_bucket *ob = &c->open_buckets[slot]; + + if (ob->dev == dev && ob->bucket == bucket) + return true; + + slot = ob->hash; + } + + return false; +} + +static inline bool bch2_bucket_is_open_safe(struct bch_fs *c, unsigned dev, u64 bucket) +{ + bool ret; + + if (bch2_bucket_is_open(c, dev, bucket)) + return true; + + spin_lock(&c->freelist_lock); + ret = bch2_bucket_is_open(c, dev, bucket); + spin_unlock(&c->freelist_lock); + + return ret; +} + +int bch2_bucket_alloc_set_trans(struct btree_trans *, struct open_buckets *, + struct dev_stripe_state *, struct bch_devs_mask *, + unsigned, unsigned *, bool *, unsigned, + enum bch_data_type, enum bch_watermark, + struct closure *); + +int bch2_alloc_sectors_start_trans(struct btree_trans *, + unsigned, unsigned, + struct write_point_specifier, + struct bch_devs_list *, + unsigned, unsigned, + enum bch_watermark, + unsigned, + struct closure *, + struct write_point **); + +struct bch_extent_ptr bch2_ob_ptr(struct bch_fs *, struct open_bucket *); + +/* + * Append pointers to the space we just allocated to @k, and mark @sectors space + * as allocated out of @ob + */ +static inline void +bch2_alloc_sectors_append_ptrs_inlined(struct bch_fs *c, struct write_point *wp, + struct bkey_i *k, unsigned sectors, + bool cached) +{ + struct open_bucket *ob; + unsigned i; + + BUG_ON(sectors > wp->sectors_free); + wp->sectors_free -= sectors; + wp->sectors_allocated += sectors; + + open_bucket_for_each(c, &wp->ptrs, ob, i) { + struct bch_dev *ca = bch_dev_bkey_exists(c, ob->dev); + struct bch_extent_ptr ptr = bch2_ob_ptr(c, ob); + + ptr.cached = cached || + (!ca->mi.durability && + wp->data_type == BCH_DATA_user); + + bch2_bkey_append_ptr(k, ptr); + + BUG_ON(sectors > ob->sectors_free); + ob->sectors_free -= sectors; + } +} + +void bch2_alloc_sectors_append_ptrs(struct bch_fs *, struct write_point *, + struct bkey_i *, unsigned, bool); +void bch2_alloc_sectors_done(struct bch_fs *, struct write_point *); + +void bch2_open_buckets_stop(struct bch_fs *c, struct bch_dev *, bool); + +static inline struct write_point_specifier writepoint_hashed(unsigned long v) +{ + return (struct write_point_specifier) { .v = v | 1 }; +} + +static inline struct write_point_specifier writepoint_ptr(struct write_point *wp) +{ + return (struct write_point_specifier) { .v = (unsigned long) wp }; +} + +void bch2_fs_allocator_foreground_init(struct bch_fs *); + +void bch2_open_buckets_to_text(struct printbuf *, struct bch_fs *); +void bch2_open_buckets_partial_to_text(struct printbuf *, struct bch_fs *); + +void bch2_write_points_to_text(struct printbuf *, struct bch_fs *); + +#endif /* _BCACHEFS_ALLOC_FOREGROUND_H */ diff --git a/fs/bcachefs/alloc_types.h b/fs/bcachefs/alloc_types.h new file mode 100644 index 000000000000..b91b7a461056 --- /dev/null +++ b/fs/bcachefs/alloc_types.h @@ -0,0 +1,126 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_ALLOC_TYPES_H +#define _BCACHEFS_ALLOC_TYPES_H + +#include <linux/mutex.h> +#include <linux/spinlock.h> + +#include "clock_types.h" +#include "fifo.h" + +struct bucket_alloc_state { + u64 buckets_seen; + u64 skipped_open; + u64 skipped_need_journal_commit; + u64 skipped_nocow; + u64 skipped_nouse; +}; + +#define BCH_WATERMARKS() \ + x(stripe) \ + x(normal) \ + x(copygc) \ + x(btree) \ + x(btree_copygc) \ + x(reclaim) + +enum bch_watermark { +#define x(name) BCH_WATERMARK_##name, + BCH_WATERMARKS() +#undef x + BCH_WATERMARK_NR, +}; + +#define BCH_WATERMARK_BITS 3 +#define BCH_WATERMARK_MASK ~(~0U << BCH_WATERMARK_BITS) + +#define OPEN_BUCKETS_COUNT 1024 + +#define WRITE_POINT_HASH_NR 32 +#define WRITE_POINT_MAX 32 + +/* + * 0 is never a valid open_bucket_idx_t: + */ +typedef u16 open_bucket_idx_t; + +struct open_bucket { + spinlock_t lock; + atomic_t pin; + open_bucket_idx_t freelist; + open_bucket_idx_t hash; + + /* + * When an open bucket has an ec_stripe attached, this is the index of + * the block in the stripe this open_bucket corresponds to: + */ + u8 ec_idx; + enum bch_data_type data_type:6; + unsigned valid:1; + unsigned on_partial_list:1; + + u8 dev; + u8 gen; + u32 sectors_free; + u64 bucket; + struct ec_stripe_new *ec; +}; + +#define OPEN_BUCKET_LIST_MAX 15 + +struct open_buckets { + open_bucket_idx_t nr; + open_bucket_idx_t v[OPEN_BUCKET_LIST_MAX]; +}; + +struct dev_stripe_state { + u64 next_alloc[BCH_SB_MEMBERS_MAX]; +}; + +#define WRITE_POINT_STATES() \ + x(stopped) \ + x(waiting_io) \ + x(waiting_work) \ + x(running) + +enum write_point_state { +#define x(n) WRITE_POINT_##n, + WRITE_POINT_STATES() +#undef x + WRITE_POINT_STATE_NR +}; + +struct write_point { + struct { + struct hlist_node node; + struct mutex lock; + u64 last_used; + unsigned long write_point; + enum bch_data_type data_type; + + /* calculated based on how many pointers we're actually going to use: */ + unsigned sectors_free; + + struct open_buckets ptrs; + struct dev_stripe_state stripe; + + u64 sectors_allocated; + } __aligned(SMP_CACHE_BYTES); + + struct { + struct work_struct index_update_work; + + struct list_head writes; + spinlock_t writes_lock; + + enum write_point_state state; + u64 last_state_change; + u64 time[WRITE_POINT_STATE_NR]; + } __aligned(SMP_CACHE_BYTES); +}; + +struct write_point_specifier { + unsigned long v; +}; + +#endif /* _BCACHEFS_ALLOC_TYPES_H */ diff --git a/fs/bcachefs/backpointers.c b/fs/bcachefs/backpointers.c new file mode 100644 index 000000000000..23c0834a97a4 --- /dev/null +++ b/fs/bcachefs/backpointers.c @@ -0,0 +1,860 @@ +// SPDX-License-Identifier: GPL-2.0 +#include "bcachefs.h" +#include "bbpos.h" +#include "alloc_background.h" +#include "backpointers.h" +#include "btree_cache.h" +#include "btree_update.h" +#include "btree_update_interior.h" +#include "btree_write_buffer.h" +#include "error.h" + +#include <linux/mm.h> + +static bool extent_matches_bp(struct bch_fs *c, + enum btree_id btree_id, unsigned level, + struct bkey_s_c k, + struct bpos bucket, + struct bch_backpointer bp) +{ + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + const union bch_extent_entry *entry; + struct extent_ptr_decoded p; + + bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { + struct bpos bucket2; + struct bch_backpointer bp2; + + if (p.ptr.cached) + continue; + + bch2_extent_ptr_to_bp(c, btree_id, level, k, p, + &bucket2, &bp2); + if (bpos_eq(bucket, bucket2) && + !memcmp(&bp, &bp2, sizeof(bp))) + return true; + } + + return false; +} + +int bch2_backpointer_invalid(struct bch_fs *c, struct bkey_s_c k, + enum bkey_invalid_flags flags, + struct printbuf *err) +{ + struct bkey_s_c_backpointer bp = bkey_s_c_to_backpointer(k); + struct bpos bucket = bp_pos_to_bucket(c, bp.k->p); + int ret = 0; + + bkey_fsck_err_on(!bpos_eq(bp.k->p, bucket_pos_to_bp(c, bucket, bp.v->bucket_offset)), + c, err, + backpointer_pos_wrong, + "backpointer at wrong pos"); +fsck_err: + return ret; +} + +void bch2_backpointer_to_text(struct printbuf *out, const struct bch_backpointer *bp) +{ + prt_printf(out, "btree=%s l=%u offset=%llu:%u len=%u pos=", + bch2_btree_id_str(bp->btree_id), + bp->level, + (u64) (bp->bucket_offset >> MAX_EXTENT_COMPRESS_RATIO_SHIFT), + (u32) bp->bucket_offset & ~(~0U << MAX_EXTENT_COMPRESS_RATIO_SHIFT), + bp->bucket_len); + bch2_bpos_to_text(out, bp->pos); +} + +void bch2_backpointer_k_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k) +{ + prt_str(out, "bucket="); + bch2_bpos_to_text(out, bp_pos_to_bucket(c, k.k->p)); + prt_str(out, " "); + + bch2_backpointer_to_text(out, bkey_s_c_to_backpointer(k).v); +} + +void bch2_backpointer_swab(struct bkey_s k) +{ + struct bkey_s_backpointer bp = bkey_s_to_backpointer(k); + + bp.v->bucket_offset = swab40(bp.v->bucket_offset); + bp.v->bucket_len = swab32(bp.v->bucket_len); + bch2_bpos_swab(&bp.v->pos); +} + +static noinline int backpointer_mod_err(struct btree_trans *trans, + struct bch_backpointer bp, + struct bkey_s_c bp_k, + struct bkey_s_c orig_k, + bool insert) +{ + struct bch_fs *c = trans->c; + struct printbuf buf = PRINTBUF; + + if (insert) { + prt_printf(&buf, "existing backpointer found when inserting "); + bch2_backpointer_to_text(&buf, &bp); + prt_newline(&buf); + printbuf_indent_add(&buf, 2); + + prt_printf(&buf, "found "); + bch2_bkey_val_to_text(&buf, c, bp_k); + prt_newline(&buf); + + prt_printf(&buf, "for "); + bch2_bkey_val_to_text(&buf, c, orig_k); + + bch_err(c, "%s", buf.buf); + } else if (c->curr_recovery_pass > BCH_RECOVERY_PASS_check_extents_to_backpointers) { + prt_printf(&buf, "backpointer not found when deleting"); + prt_newline(&buf); + printbuf_indent_add(&buf, 2); + + prt_printf(&buf, "searching for "); + bch2_backpointer_to_text(&buf, &bp); + prt_newline(&buf); + + prt_printf(&buf, "got "); + bch2_bkey_val_to_text(&buf, c, bp_k); + prt_newline(&buf); + + prt_printf(&buf, "for "); + bch2_bkey_val_to_text(&buf, c, orig_k); + + bch_err(c, "%s", buf.buf); + } + + printbuf_exit(&buf); + + if (c->curr_recovery_pass > BCH_RECOVERY_PASS_check_extents_to_backpointers) { + bch2_inconsistent_error(c); + return -EIO; + } else { + return 0; + } +} + +int bch2_bucket_backpointer_mod_nowritebuffer(struct btree_trans *trans, + struct bkey_i_backpointer *bp_k, + struct bch_backpointer bp, + struct bkey_s_c orig_k, + bool insert) +{ + struct btree_iter bp_iter; + struct bkey_s_c k; + int ret; + + k = bch2_bkey_get_iter(trans, &bp_iter, BTREE_ID_backpointers, + bp_k->k.p, + BTREE_ITER_INTENT| + BTREE_ITER_SLOTS| + BTREE_ITER_WITH_UPDATES); + ret = bkey_err(k); + if (ret) + goto err; + + if (insert + ? k.k->type + : (k.k->type != KEY_TYPE_backpointer || + memcmp(bkey_s_c_to_backpointer(k).v, &bp, sizeof(bp)))) { + ret = backpointer_mod_err(trans, bp, k, orig_k, insert); + if (ret) + goto err; + } + + ret = bch2_trans_update(trans, &bp_iter, &bp_k->k_i, 0); +err: + bch2_trans_iter_exit(trans, &bp_iter); + return ret; +} + +/* + * Find the next backpointer >= *bp_offset: + */ +int bch2_get_next_backpointer(struct btree_trans *trans, + struct bpos bucket, int gen, + struct bpos *bp_pos, + struct bch_backpointer *bp, + unsigned iter_flags) +{ + struct bch_fs *c = trans->c; + struct bpos bp_end_pos = bucket_pos_to_bp(c, bpos_nosnap_successor(bucket), 0); + struct btree_iter alloc_iter = { NULL }, bp_iter = { NULL }; + struct bkey_s_c k; + int ret = 0; + + if (bpos_ge(*bp_pos, bp_end_pos)) + goto done; + + if (gen >= 0) { + k = bch2_bkey_get_iter(trans, &alloc_iter, BTREE_ID_alloc, + bucket, BTREE_ITER_CACHED|iter_flags); + ret = bkey_err(k); + if (ret) + goto out; + + if (k.k->type != KEY_TYPE_alloc_v4 || + bkey_s_c_to_alloc_v4(k).v->gen != gen) + goto done; + } + + *bp_pos = bpos_max(*bp_pos, bucket_pos_to_bp(c, bucket, 0)); + + for_each_btree_key_norestart(trans, bp_iter, BTREE_ID_backpointers, + *bp_pos, iter_flags, k, ret) { + if (bpos_ge(k.k->p, bp_end_pos)) + break; + + *bp_pos = k.k->p; + *bp = *bkey_s_c_to_backpointer(k).v; + goto out; + } +done: + *bp_pos = SPOS_MAX; +out: + bch2_trans_iter_exit(trans, &bp_iter); + bch2_trans_iter_exit(trans, &alloc_iter); + return ret; +} + +static void backpointer_not_found(struct btree_trans *trans, + struct bpos bp_pos, + struct bch_backpointer bp, + struct bkey_s_c k) +{ + struct bch_fs *c = trans->c; + struct printbuf buf = PRINTBUF; + struct bpos bucket = bp_pos_to_bucket(c, bp_pos); + + /* + * If we're using the btree write buffer, the backpointer we were + * looking at may have already been deleted - failure to find what it + * pointed to is not an error: + */ + if (likely(!bch2_backpointers_no_use_write_buffer)) + return; + + prt_printf(&buf, "backpointer doesn't match %s it points to:\n ", + bp.level ? "btree node" : "extent"); + prt_printf(&buf, "bucket: "); + bch2_bpos_to_text(&buf, bucket); + prt_printf(&buf, "\n "); + + prt_printf(&buf, "backpointer pos: "); + bch2_bpos_to_text(&buf, bp_pos); + prt_printf(&buf, "\n "); + + bch2_backpointer_to_text(&buf, &bp); + prt_printf(&buf, "\n "); + bch2_bkey_val_to_text(&buf, c, k); + if (c->curr_recovery_pass >= BCH_RECOVERY_PASS_check_extents_to_backpointers) + bch_err_ratelimited(c, "%s", buf.buf); + else + bch2_trans_inconsistent(trans, "%s", buf.buf); + + printbuf_exit(&buf); +} + +struct bkey_s_c bch2_backpointer_get_key(struct btree_trans *trans, + struct btree_iter *iter, + struct bpos bp_pos, + struct bch_backpointer bp, + unsigned iter_flags) +{ + if (likely(!bp.level)) { + struct bch_fs *c = trans->c; + struct bpos bucket = bp_pos_to_bucket(c, bp_pos); + struct bkey_s_c k; + + bch2_trans_node_iter_init(trans, iter, + bp.btree_id, + bp.pos, + 0, 0, + iter_flags); + k = bch2_btree_iter_peek_slot(iter); + if (bkey_err(k)) { + bch2_trans_iter_exit(trans, iter); + return k; + } + + if (k.k && extent_matches_bp(c, bp.btree_id, bp.level, k, bucket, bp)) + return k; + + bch2_trans_iter_exit(trans, iter); + backpointer_not_found(trans, bp_pos, bp, k); + return bkey_s_c_null; + } else { + struct btree *b = bch2_backpointer_get_node(trans, iter, bp_pos, bp); + + if (IS_ERR_OR_NULL(b)) { + bch2_trans_iter_exit(trans, iter); + return IS_ERR(b) ? bkey_s_c_err(PTR_ERR(b)) : bkey_s_c_null; + } + return bkey_i_to_s_c(&b->key); + } +} + +struct btree *bch2_backpointer_get_node(struct btree_trans *trans, + struct btree_iter *iter, + struct bpos bp_pos, + struct bch_backpointer bp) +{ + struct bch_fs *c = trans->c; + struct bpos bucket = bp_pos_to_bucket(c, bp_pos); + struct btree *b; + + BUG_ON(!bp.level); + + bch2_trans_node_iter_init(trans, iter, + bp.btree_id, + bp.pos, + 0, + bp.level - 1, + 0); + b = bch2_btree_iter_peek_node(iter); + if (IS_ERR_OR_NULL(b)) + goto err; + + BUG_ON(b->c.level != bp.level - 1); + + if (extent_matches_bp(c, bp.btree_id, bp.level, + bkey_i_to_s_c(&b->key), + bucket, bp)) + return b; + + if (btree_node_will_make_reachable(b)) { + b = ERR_PTR(-BCH_ERR_backpointer_to_overwritten_btree_node); + } else { + backpointer_not_found(trans, bp_pos, bp, bkey_i_to_s_c(&b->key)); + b = NULL; + } +err: + bch2_trans_iter_exit(trans, iter); + return b; +} + +static int bch2_check_btree_backpointer(struct btree_trans *trans, struct btree_iter *bp_iter, + struct bkey_s_c k) +{ + struct bch_fs *c = trans->c; + struct btree_iter alloc_iter = { NULL }; + struct bkey_s_c alloc_k; + struct printbuf buf = PRINTBUF; + int ret = 0; + + if (fsck_err_on(!bch2_dev_exists2(c, k.k->p.inode), c, + backpointer_to_missing_device, + "backpointer for missing device:\n%s", + (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { + ret = bch2_btree_delete_at(trans, bp_iter, 0); + goto out; + } + + alloc_k = bch2_bkey_get_iter(trans, &alloc_iter, BTREE_ID_alloc, + bp_pos_to_bucket(c, k.k->p), 0); + ret = bkey_err(alloc_k); + if (ret) + goto out; + + if (fsck_err_on(alloc_k.k->type != KEY_TYPE_alloc_v4, c, + backpointer_to_missing_alloc, + "backpointer for nonexistent alloc key: %llu:%llu:0\n%s", + alloc_iter.pos.inode, alloc_iter.pos.offset, + (bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf))) { + ret = bch2_btree_delete_at(trans, bp_iter, 0); + goto out; + } +out: +fsck_err: + bch2_trans_iter_exit(trans, &alloc_iter); + printbuf_exit(&buf); + return ret; +} + +/* verify that every backpointer has a corresponding alloc key */ +int bch2_check_btree_backpointers(struct bch_fs *c) +{ + struct btree_iter iter; + struct bkey_s_c k; + int ret; + + ret = bch2_trans_run(c, + for_each_btree_key_commit(trans, iter, + BTREE_ID_backpointers, POS_MIN, 0, k, + NULL, NULL, BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL, + bch2_check_btree_backpointer(trans, &iter, k))); + if (ret) + bch_err_fn(c, ret); + return ret; +} + +struct bpos_level { + unsigned level; + struct bpos pos; +}; + +static int check_bp_exists(struct btree_trans *trans, + struct bpos bucket, + struct bch_backpointer bp, + struct bkey_s_c orig_k, + struct bpos bucket_start, + struct bpos bucket_end, + struct bpos_level *last_flushed) +{ + struct bch_fs *c = trans->c; + struct btree_iter bp_iter = { NULL }; + struct printbuf buf = PRINTBUF; + struct bkey_s_c bp_k; + int ret; + + if (bpos_lt(bucket, bucket_start) || + bpos_gt(bucket, bucket_end)) + return 0; + + if (!bch2_dev_bucket_exists(c, bucket)) + goto missing; + + bp_k = bch2_bkey_get_iter(trans, &bp_iter, BTREE_ID_backpointers, + bucket_pos_to_bp(c, bucket, bp.bucket_offset), + 0); + ret = bkey_err(bp_k); + if (ret) + goto err; + + if (bp_k.k->type != KEY_TYPE_backpointer || + memcmp(bkey_s_c_to_backpointer(bp_k).v, &bp, sizeof(bp))) { + if (last_flushed->level != bp.level || + !bpos_eq(last_flushed->pos, orig_k.k->p)) { + last_flushed->level = bp.level; + last_flushed->pos = orig_k.k->p; + + ret = bch2_btree_write_buffer_flush_sync(trans) ?: + -BCH_ERR_transaction_restart_write_buffer_flush; + goto out; + } + goto missing; + } +out: +err: +fsck_err: + bch2_trans_iter_exit(trans, &bp_iter); + printbuf_exit(&buf); + return ret; +missing: + prt_printf(&buf, "missing backpointer for btree=%s l=%u ", + bch2_btree_id_str(bp.btree_id), bp.level); + bch2_bkey_val_to_text(&buf, c, orig_k); + prt_printf(&buf, "\nbp pos "); + bch2_bpos_to_text(&buf, bp_iter.pos); + + if (c->sb.version_upgrade_complete < bcachefs_metadata_version_backpointers || + c->opts.reconstruct_alloc || + fsck_err(c, ptr_to_missing_backpointer, "%s", buf.buf)) + ret = bch2_bucket_backpointer_mod(trans, bucket, bp, orig_k, true); + + goto out; +} + +static int check_extent_to_backpointers(struct btree_trans *trans, + struct btree_iter *iter, + struct bpos bucket_start, + struct bpos bucket_end, + struct bpos_level *last_flushed) +{ + struct bch_fs *c = trans->c; + struct bkey_ptrs_c ptrs; + const union bch_extent_entry *entry; + struct extent_ptr_decoded p; + struct bkey_s_c k; + int ret; + + k = bch2_btree_iter_peek_all_levels(iter); + ret = bkey_err(k); + if (ret) + return ret; + if (!k.k) + return 0; + + ptrs = bch2_bkey_ptrs_c(k); + bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { + struct bpos bucket_pos; + struct bch_backpointer bp; + + if (p.ptr.cached) + continue; + + bch2_extent_ptr_to_bp(c, iter->btree_id, iter->path->level, + k, p, &bucket_pos, &bp); + + ret = check_bp_exists(trans, bucket_pos, bp, k, + bucket_start, bucket_end, + last_flushed); + if (ret) + return ret; + } + + return 0; +} + +static int check_btree_root_to_backpointers(struct btree_trans *trans, + enum btree_id btree_id, + struct bpos bucket_start, + struct bpos bucket_end, + struct bpos_level *last_flushed) +{ + struct bch_fs *c = trans->c; + struct btree_root *r = bch2_btree_id_root(c, btree_id); + struct btree_iter iter; + struct btree *b; + struct bkey_s_c k; + struct bkey_ptrs_c ptrs; + struct extent_ptr_decoded p; + const union bch_extent_entry *entry; + int ret; + + bch2_trans_node_iter_init(trans, &iter, btree_id, POS_MIN, 0, r->level, 0); + b = bch2_btree_iter_peek_node(&iter); + ret = PTR_ERR_OR_ZERO(b); + if (ret) + goto err; + + BUG_ON(b != btree_node_root(c, b)); + + k = bkey_i_to_s_c(&b->key); + ptrs = bch2_bkey_ptrs_c(k); + bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { + struct bpos bucket_pos; + struct bch_backpointer bp; + + if (p.ptr.cached) + continue; + + bch2_extent_ptr_to_bp(c, iter.btree_id, b->c.level + 1, + k, p, &bucket_pos, &bp); + + ret = check_bp_exists(trans, bucket_pos, bp, k, + bucket_start, bucket_end, + last_flushed); + if (ret) + goto err; + } +err: + bch2_trans_iter_exit(trans, &iter); + return ret; +} + +static inline struct bbpos bp_to_bbpos(struct bch_backpointer bp) +{ + return (struct bbpos) { + .btree = bp.btree_id, + .pos = bp.pos, + }; +} + +static size_t btree_nodes_fit_in_ram(struct bch_fs *c) +{ + struct sysinfo i; + u64 mem_bytes; + + si_meminfo(&i); + mem_bytes = i.totalram * i.mem_unit; + return div_u64(mem_bytes >> 1, btree_bytes(c)); +} + +static int bch2_get_btree_in_memory_pos(struct btree_trans *trans, + unsigned btree_leaf_mask, + unsigned btree_interior_mask, + struct bbpos start, struct bbpos *end) +{ + struct btree_iter iter; + struct bkey_s_c k; + size_t btree_nodes = btree_nodes_fit_in_ram(trans->c); + enum btree_id btree; + int ret = 0; + + for (btree = start.btree; btree < BTREE_ID_NR && !ret; btree++) { + unsigned depth = ((1U << btree) & btree_leaf_mask) ? 1 : 2; + + if (!((1U << btree) & btree_leaf_mask) && + !((1U << btree) & btree_interior_mask)) + continue; + + bch2_trans_node_iter_init(trans, &iter, btree, + btree == start.btree ? start.pos : POS_MIN, + 0, depth, 0); + /* + * for_each_btree_key_contineu() doesn't check the return value + * from bch2_btree_iter_advance(), which is needed when + * iterating over interior nodes where we'll see keys at + * SPOS_MAX: + */ + do { + k = __bch2_btree_iter_peek_and_restart(trans, &iter, 0); + ret = bkey_err(k); + if (!k.k || ret) + break; + + --btree_nodes; + if (!btree_nodes) { + *end = BBPOS(btree, k.k->p); + bch2_trans_iter_exit(trans, &iter); + return 0; + } + } while (bch2_btree_iter_advance(&iter)); + bch2_trans_iter_exit(trans, &iter); + } + + *end = BBPOS_MAX; + return ret; +} + +static int bch2_check_extents_to_backpointers_pass(struct btree_trans *trans, + struct bpos bucket_start, + struct bpos bucket_end) +{ + struct bch_fs *c = trans->c; + struct btree_iter iter; + enum btree_id btree_id; + struct bpos_level last_flushed = { UINT_MAX, POS_MIN }; + int ret = 0; + + for (btree_id = 0; btree_id < btree_id_nr_alive(c); btree_id++) { + unsigned depth = btree_type_has_ptrs(btree_id) ? 0 : 1; + + bch2_trans_node_iter_init(trans, &iter, btree_id, POS_MIN, 0, + depth, + BTREE_ITER_ALL_LEVELS| + BTREE_ITER_PREFETCH); + + do { + ret = commit_do(trans, NULL, NULL, + BTREE_INSERT_LAZY_RW| + BTREE_INSERT_NOFAIL, + check_extent_to_backpointers(trans, &iter, + bucket_start, bucket_end, + &last_flushed)); + if (ret) + break; + } while (!bch2_btree_iter_advance(&iter)); + + bch2_trans_iter_exit(trans, &iter); + + if (ret) + break; + + ret = commit_do(trans, NULL, NULL, + BTREE_INSERT_LAZY_RW| + BTREE_INSERT_NOFAIL, + check_btree_root_to_backpointers(trans, btree_id, + bucket_start, bucket_end, + &last_flushed)); + if (ret) + break; + } + return ret; +} + +static struct bpos bucket_pos_to_bp_safe(const struct bch_fs *c, + struct bpos bucket) +{ + return bch2_dev_exists2(c, bucket.inode) + ? bucket_pos_to_bp(c, bucket, 0) + : bucket; +} + +static int bch2_get_alloc_in_memory_pos(struct btree_trans *trans, + struct bpos start, struct bpos *end) +{ + struct btree_iter alloc_iter; + struct btree_iter bp_iter; + struct bkey_s_c alloc_k, bp_k; + size_t btree_nodes = btree_nodes_fit_in_ram(trans->c); + bool alloc_end = false, bp_end = false; + int ret = 0; + + bch2_trans_node_iter_init(trans, &alloc_iter, BTREE_ID_alloc, + start, 0, 1, 0); + bch2_trans_node_iter_init(trans, &bp_iter, BTREE_ID_backpointers, + bucket_pos_to_bp_safe(trans->c, start), 0, 1, 0); + while (1) { + alloc_k = !alloc_end + ? __bch2_btree_iter_peek_and_restart(trans, &alloc_iter, 0) + : bkey_s_c_null; + bp_k = !bp_end + ? __bch2_btree_iter_peek_and_restart(trans, &bp_iter, 0) + : bkey_s_c_null; + + ret = bkey_err(alloc_k) ?: bkey_err(bp_k); + if ((!alloc_k.k && !bp_k.k) || ret) { + *end = SPOS_MAX; + break; + } + + --btree_nodes; + if (!btree_nodes) { + *end = alloc_k.k ? alloc_k.k->p : SPOS_MAX; + break; + } + + if (bpos_lt(alloc_iter.pos, SPOS_MAX) && + bpos_lt(bucket_pos_to_bp_safe(trans->c, alloc_iter.pos), bp_iter.pos)) { + if (!bch2_btree_iter_advance(&alloc_iter)) + alloc_end = true; + } else { + if (!bch2_btree_iter_advance(&bp_iter)) + bp_end = true; + } + } + bch2_trans_iter_exit(trans, &bp_iter); + bch2_trans_iter_exit(trans, &alloc_iter); + return ret; +} + +int bch2_check_extents_to_backpointers(struct bch_fs *c) +{ + struct btree_trans *trans = bch2_trans_get(c); + struct bpos start = POS_MIN, end; + int ret; + + while (1) { + ret = bch2_get_alloc_in_memory_pos(trans, start, &end); + if (ret) + break; + + if (bpos_eq(start, POS_MIN) && !bpos_eq(end, SPOS_MAX)) + bch_verbose(c, "%s(): alloc info does not fit in ram, running in multiple passes with %zu nodes per pass", + __func__, btree_nodes_fit_in_ram(c)); + + if (!bpos_eq(start, POS_MIN) || !bpos_eq(end, SPOS_MAX)) { + struct printbuf buf = PRINTBUF; + + prt_str(&buf, "check_extents_to_backpointers(): "); + bch2_bpos_to_text(&buf, start); + prt_str(&buf, "-"); + bch2_bpos_to_text(&buf, end); + + bch_verbose(c, "%s", buf.buf); + printbuf_exit(&buf); + } + + ret = bch2_check_extents_to_backpointers_pass(trans, start, end); + if (ret || bpos_eq(end, SPOS_MAX)) + break; + + start = bpos_successor(end); + } + bch2_trans_put(trans); + + if (ret) + bch_err_fn(c, ret); + return ret; +} + +static int check_one_backpointer(struct btree_trans *trans, + struct bbpos start, + struct bbpos end, + struct bkey_s_c_backpointer bp, + struct bpos *last_flushed_pos) +{ + struct bch_fs *c = trans->c; + struct btree_iter iter; + struct bbpos pos = bp_to_bbpos(*bp.v); + struct bkey_s_c k; + struct printbuf buf = PRINTBUF; + int ret; + + if (bbpos_cmp(pos, start) < 0 || + bbpos_cmp(pos, end) > 0) + return 0; + + k = bch2_backpointer_get_key(trans, &iter, bp.k->p, *bp.v, 0); + ret = bkey_err(k); + if (ret == -BCH_ERR_backpointer_to_overwritten_btree_node) + return 0; + if (ret) + return ret; + + if (!k.k && !bpos_eq(*last_flushed_pos, bp.k->p)) { + *last_flushed_pos = bp.k->p; + ret = bch2_btree_write_buffer_flush_sync(trans) ?: + -BCH_ERR_transaction_restart_write_buffer_flush; + goto out; + } + + if (fsck_err_on(!k.k, c, + backpointer_to_missing_ptr, + "backpointer for missing %s\n %s", + bp.v->level ? "btree node" : "extent", + (bch2_bkey_val_to_text(&buf, c, bp.s_c), buf.buf))) { + ret = bch2_btree_delete_at_buffered(trans, BTREE_ID_backpointers, bp.k->p); + goto out; + } +out: +fsck_err: + bch2_trans_iter_exit(trans, &iter); + printbuf_exit(&buf); + return ret; +} + +static int bch2_check_backpointers_to_extents_pass(struct btree_trans *trans, + struct bbpos start, + struct bbpos end) +{ + struct btree_iter iter; + struct bkey_s_c k; + struct bpos last_flushed_pos = SPOS_MAX; + + return for_each_btree_key_commit(trans, iter, BTREE_ID_backpointers, + POS_MIN, BTREE_ITER_PREFETCH, k, + NULL, NULL, BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL, + check_one_backpointer(trans, start, end, + bkey_s_c_to_backpointer(k), + &last_flushed_pos)); +} + +int bch2_check_backpointers_to_extents(struct bch_fs *c) +{ + struct btree_trans *trans = bch2_trans_get(c); + struct bbpos start = (struct bbpos) { .btree = 0, .pos = POS_MIN, }, end; + int ret; + + while (1) { + ret = bch2_get_btree_in_memory_pos(trans, + (1U << BTREE_ID_extents)| + (1U << BTREE_ID_reflink), + ~0, + start, &end); + if (ret) + break; + + if (!bbpos_cmp(start, BBPOS_MIN) && + bbpos_cmp(end, BBPOS_MAX)) + bch_verbose(c, "%s(): extents do not fit in ram, running in multiple passes with %zu nodes per pass", + __func__, btree_nodes_fit_in_ram(c)); + + if (bbpos_cmp(start, BBPOS_MIN) || + bbpos_cmp(end, BBPOS_MAX)) { + struct printbuf buf = PRINTBUF; + + prt_str(&buf, "check_backpointers_to_extents(): "); + bch2_bbpos_to_text(&buf, start); + prt_str(&buf, "-"); + bch2_bbpos_to_text(&buf, end); + + bch_verbose(c, "%s", buf.buf); + printbuf_exit(&buf); + } + + ret = bch2_check_backpointers_to_extents_pass(trans, start, end); + if (ret || !bbpos_cmp(end, BBPOS_MAX)) + break; + + start = bbpos_successor(end); + } + bch2_trans_put(trans); + + if (ret) + bch_err_fn(c, ret); + return ret; +} diff --git a/fs/bcachefs/backpointers.h b/fs/bcachefs/backpointers.h new file mode 100644 index 000000000000..ab866feeaf66 --- /dev/null +++ b/fs/bcachefs/backpointers.h @@ -0,0 +1,140 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_BACKPOINTERS_BACKGROUND_H +#define _BCACHEFS_BACKPOINTERS_BACKGROUND_H + +#include "btree_iter.h" +#include "btree_update.h" +#include "buckets.h" +#include "super.h" + +static inline u64 swab40(u64 x) +{ + return (((x & 0x00000000ffULL) << 32)| + ((x & 0x000000ff00ULL) << 16)| + ((x & 0x0000ff0000ULL) >> 0)| + ((x & 0x00ff000000ULL) >> 16)| + ((x & 0xff00000000ULL) >> 32)); +} + +int bch2_backpointer_invalid(struct bch_fs *, struct bkey_s_c k, + enum bkey_invalid_flags, struct printbuf *); +void bch2_backpointer_to_text(struct printbuf *, const struct bch_backpointer *); +void bch2_backpointer_k_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); +void bch2_backpointer_swab(struct bkey_s); + +#define bch2_bkey_ops_backpointer ((struct bkey_ops) { \ + .key_invalid = bch2_backpointer_invalid, \ + .val_to_text = bch2_backpointer_k_to_text, \ + .swab = bch2_backpointer_swab, \ + .min_val_size = 32, \ +}) + +#define MAX_EXTENT_COMPRESS_RATIO_SHIFT 10 + +/* + * Convert from pos in backpointer btree to pos of corresponding bucket in alloc + * btree: + */ +static inline struct bpos bp_pos_to_bucket(const struct bch_fs *c, + struct bpos bp_pos) +{ + struct bch_dev *ca = bch_dev_bkey_exists(c, bp_pos.inode); + u64 bucket_sector = bp_pos.offset >> MAX_EXTENT_COMPRESS_RATIO_SHIFT; + + return POS(bp_pos.inode, sector_to_bucket(ca, bucket_sector)); +} + +/* + * Convert from pos in alloc btree + bucket offset to pos in backpointer btree: + */ +static inline struct bpos bucket_pos_to_bp(const struct bch_fs *c, + struct bpos bucket, + u64 bucket_offset) +{ + struct bch_dev *ca = bch_dev_bkey_exists(c, bucket.inode); + struct bpos ret; + + ret = POS(bucket.inode, + (bucket_to_sector(ca, bucket.offset) << + MAX_EXTENT_COMPRESS_RATIO_SHIFT) + bucket_offset); + + EBUG_ON(!bkey_eq(bucket, bp_pos_to_bucket(c, ret))); + + return ret; +} + +int bch2_bucket_backpointer_mod_nowritebuffer(struct btree_trans *, struct bkey_i_backpointer *, + struct bch_backpointer, struct bkey_s_c, bool); + +static inline int bch2_bucket_backpointer_mod(struct btree_trans *trans, + struct bpos bucket, + struct bch_backpointer bp, + struct bkey_s_c orig_k, + bool insert) +{ + struct bch_fs *c = trans->c; + struct bkey_i_backpointer *bp_k; + int ret; + + bp_k = bch2_trans_kmalloc_nomemzero(trans, sizeof(struct bkey_i_backpointer)); + ret = PTR_ERR_OR_ZERO(bp_k); + if (ret) + return ret; + + bkey_backpointer_init(&bp_k->k_i); + bp_k->k.p = bucket_pos_to_bp(c, bucket, bp.bucket_offset); + bp_k->v = bp; + + if (!insert) { + bp_k->k.type = KEY_TYPE_deleted; + set_bkey_val_u64s(&bp_k->k, 0); + } + + if (unlikely(bch2_backpointers_no_use_write_buffer)) + return bch2_bucket_backpointer_mod_nowritebuffer(trans, bp_k, bp, orig_k, insert); + + return bch2_trans_update_buffered(trans, BTREE_ID_backpointers, &bp_k->k_i); +} + +static inline enum bch_data_type bkey_ptr_data_type(enum btree_id btree_id, unsigned level, + struct bkey_s_c k, struct extent_ptr_decoded p) +{ + return level ? BCH_DATA_btree : + p.has_ec ? BCH_DATA_stripe : + BCH_DATA_user; +} + +static inline void bch2_extent_ptr_to_bp(struct bch_fs *c, + enum btree_id btree_id, unsigned level, + struct bkey_s_c k, struct extent_ptr_decoded p, + struct bpos *bucket_pos, struct bch_backpointer *bp) +{ + enum bch_data_type data_type = bkey_ptr_data_type(btree_id, level, k, p); + s64 sectors = level ? btree_sectors(c) : k.k->size; + u32 bucket_offset; + + *bucket_pos = PTR_BUCKET_POS_OFFSET(c, &p.ptr, &bucket_offset); + *bp = (struct bch_backpointer) { + .btree_id = btree_id, + .level = level, + .data_type = data_type, + .bucket_offset = ((u64) bucket_offset << MAX_EXTENT_COMPRESS_RATIO_SHIFT) + + p.crc.offset, + .bucket_len = ptr_disk_sectors(sectors, p), + .pos = k.k->p, + }; +} + +int bch2_get_next_backpointer(struct btree_trans *, struct bpos, int, + struct bpos *, struct bch_backpointer *, unsigned); +struct bkey_s_c bch2_backpointer_get_key(struct btree_trans *, struct btree_iter *, + struct bpos, struct bch_backpointer, + unsigned); +struct btree *bch2_backpointer_get_node(struct btree_trans *, struct btree_iter *, + struct bpos, struct bch_backpointer); + +int bch2_check_btree_backpointers(struct bch_fs *); +int bch2_check_extents_to_backpointers(struct bch_fs *); +int bch2_check_backpointers_to_extents(struct bch_fs *); + +#endif /* _BCACHEFS_BACKPOINTERS_BACKGROUND_H */ diff --git a/fs/bcachefs/bbpos.h b/fs/bcachefs/bbpos.h new file mode 100644 index 000000000000..be2edced5213 --- /dev/null +++ b/fs/bcachefs/bbpos.h @@ -0,0 +1,37 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_BBPOS_H +#define _BCACHEFS_BBPOS_H + +#include "bbpos_types.h" +#include "bkey_methods.h" +#include "btree_cache.h" + +static inline int bbpos_cmp(struct bbpos l, struct bbpos r) +{ + return cmp_int(l.btree, r.btree) ?: bpos_cmp(l.pos, r.pos); +} + +static inline struct bbpos bbpos_successor(struct bbpos pos) +{ + if (bpos_cmp(pos.pos, SPOS_MAX)) { + pos.pos = bpos_successor(pos.pos); + return pos; + } + + if (pos.btree != BTREE_ID_NR) { + pos.btree++; + pos.pos = POS_MIN; + return pos; + } + + BUG(); +} + +static inline void bch2_bbpos_to_text(struct printbuf *out, struct bbpos pos) +{ + prt_str(out, bch2_btree_id_str(pos.btree)); + prt_char(out, ':'); + bch2_bpos_to_text(out, pos.pos); +} + +#endif /* _BCACHEFS_BBPOS_H */ diff --git a/fs/bcachefs/bbpos_types.h b/fs/bcachefs/bbpos_types.h new file mode 100644 index 000000000000..5198e94cf3b8 --- /dev/null +++ b/fs/bcachefs/bbpos_types.h @@ -0,0 +1,18 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_BBPOS_TYPES_H +#define _BCACHEFS_BBPOS_TYPES_H + +struct bbpos { + enum btree_id btree; + struct bpos pos; +}; + +static inline struct bbpos BBPOS(enum btree_id btree, struct bpos pos) +{ + return (struct bbpos) { btree, pos }; +} + +#define BBPOS_MIN BBPOS(0, POS_MIN) +#define BBPOS_MAX BBPOS(BTREE_ID_NR - 1, POS_MAX) + +#endif /* _BCACHEFS_BBPOS_TYPES_H */ diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h new file mode 100644 index 000000000000..dfa22f9d9a1d --- /dev/null +++ b/fs/bcachefs/bcachefs.h @@ -0,0 +1,1163 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_H +#define _BCACHEFS_H + +/* + * SOME HIGH LEVEL CODE DOCUMENTATION: + * + * Bcache mostly works with cache sets, cache devices, and backing devices. + * + * Support for multiple cache devices hasn't quite been finished off yet, but + * it's about 95% plumbed through. A cache set and its cache devices is sort of + * like a md raid array and its component devices. Most of the code doesn't care + * about individual cache devices, the main abstraction is the cache set. + * + * Multiple cache devices is intended to give us the ability to mirror dirty + * cached data and metadata, without mirroring clean cached data. + * + * Backing devices are different, in that they have a lifetime independent of a + * cache set. When you register a newly formatted backing device it'll come up + * in passthrough mode, and then you can attach and detach a backing device from + * a cache set at runtime - while it's mounted and in use. Detaching implicitly + * invalidates any cached data for that backing device. + * + * A cache set can have multiple (many) backing devices attached to it. + * + * There's also flash only volumes - this is the reason for the distinction + * between struct cached_dev and struct bcache_device. A flash only volume + * works much like a bcache device that has a backing device, except the + * "cached" data is always dirty. The end result is that we get thin + * provisioning with very little additional code. + * + * Flash only volumes work but they're not production ready because the moving + * garbage collector needs more work. More on that later. + * + * BUCKETS/ALLOCATION: + * + * Bcache is primarily designed for caching, which means that in normal + * operation all of our available space will be allocated. Thus, we need an + * efficient way of deleting things from the cache so we can write new things to + * it. + * + * To do this, we first divide the cache device up into buckets. A bucket is the + * unit of allocation; they're typically around 1 mb - anywhere from 128k to 2M+ + * works efficiently. + * + * Each bucket has a 16 bit priority, and an 8 bit generation associated with + * it. The gens and priorities for all the buckets are stored contiguously and + * packed on disk (in a linked list of buckets - aside from the superblock, all + * of bcache's metadata is stored in buckets). + * + * The priority is used to implement an LRU. We reset a bucket's priority when + * we allocate it or on cache it, and every so often we decrement the priority + * of each bucket. It could be used to implement something more sophisticated, + * if anyone ever gets around to it. + * + * The generation is used for invalidating buckets. Each pointer also has an 8 + * bit generation embedded in it; for a pointer to be considered valid, its gen + * must match the gen of the bucket it points into. Thus, to reuse a bucket all + * we have to do is increment its gen (and write its new gen to disk; we batch + * this up). + * + * Bcache is entirely COW - we never write twice to a bucket, even buckets that + * contain metadata (including btree nodes). + * + * THE BTREE: + * + * Bcache is in large part design around the btree. + * + * At a high level, the btree is just an index of key -> ptr tuples. + * + * Keys represent extents, and thus have a size field. Keys also have a variable + * number of pointers attached to them (potentially zero, which is handy for + * invalidating the cache). + * + * The key itself is an inode:offset pair. The inode number corresponds to a + * backing device or a flash only volume. The offset is the ending offset of the + * extent within the inode - not the starting offset; this makes lookups + * slightly more convenient. + * + * Pointers contain the cache device id, the offset on that device, and an 8 bit + * generation number. More on the gen later. + * + * Index lookups are not fully abstracted - cache lookups in particular are + * still somewhat mixed in with the btree code, but things are headed in that + * direction. + * + * Updates are fairly well abstracted, though. There are two different ways of + * updating the btree; insert and replace. + * + * BTREE_INSERT will just take a list of keys and insert them into the btree - + * overwriting (possibly only partially) any extents they overlap with. This is + * used to update the index after a write. + * + * BTREE_REPLACE is really cmpxchg(); it inserts a key into the btree iff it is + * overwriting a key that matches another given key. This is used for inserting + * data into the cache after a cache miss, and for background writeback, and for + * the moving garbage collector. + * + * There is no "delete" operation; deleting things from the index is + * accomplished by either by invalidating pointers (by incrementing a bucket's + * gen) or by inserting a key with 0 pointers - which will overwrite anything + * previously present at that location in the index. + * + * This means that there are always stale/invalid keys in the btree. They're + * filtered out by the code that iterates through a btree node, and removed when + * a btree node is rewritten. + * + * BTREE NODES: + * + * Our unit of allocation is a bucket, and we can't arbitrarily allocate and + * free smaller than a bucket - so, that's how big our btree nodes are. + * + * (If buckets are really big we'll only use part of the bucket for a btree node + * - no less than 1/4th - but a bucket still contains no more than a single + * btree node. I'd actually like to change this, but for now we rely on the + * bucket's gen for deleting btree nodes when we rewrite/split a node.) + * + * Anyways, btree nodes are big - big enough to be inefficient with a textbook + * btree implementation. + * + * The way this is solved is that btree nodes are internally log structured; we + * can append new keys to an existing btree node without rewriting it. This + * means each set of keys we write is sorted, but the node is not. + * + * We maintain this log structure in memory - keeping 1Mb of keys sorted would + * be expensive, and we have to distinguish between the keys we have written and + * the keys we haven't. So to do a lookup in a btree node, we have to search + * each sorted set. But we do merge written sets together lazily, so the cost of + * these extra searches is quite low (normally most of the keys in a btree node + * will be in one big set, and then there'll be one or two sets that are much + * smaller). + * + * This log structure makes bcache's btree more of a hybrid between a + * conventional btree and a compacting data structure, with some of the + * advantages of both. + * + * GARBAGE COLLECTION: + * + * We can't just invalidate any bucket - it might contain dirty data or + * metadata. If it once contained dirty data, other writes might overwrite it + * later, leaving no valid pointers into that bucket in the index. + * + * Thus, the primary purpose of garbage collection is to find buckets to reuse. + * It also counts how much valid data it each bucket currently contains, so that + * allocation can reuse buckets sooner when they've been mostly overwritten. + * + * It also does some things that are really internal to the btree + * implementation. If a btree node contains pointers that are stale by more than + * some threshold, it rewrites the btree node to avoid the bucket's generation + * wrapping around. It also merges adjacent btree nodes if they're empty enough. + * + * THE JOURNAL: + * + * Bcache's journal is not necessary for consistency; we always strictly + * order metadata writes so that the btree and everything else is consistent on + * disk in the event of an unclean shutdown, and in fact bcache had writeback + * caching (with recovery from unclean shutdown) before journalling was + * implemented. + * + * Rather, the journal is purely a performance optimization; we can't complete a + * write until we've updated the index on disk, otherwise the cache would be + * inconsistent in the event of an unclean shutdown. This means that without the + * journal, on random write workloads we constantly have to update all the leaf + * nodes in the btree, and those writes will be mostly empty (appending at most + * a few keys each) - highly inefficient in terms of amount of metadata writes, + * and it puts more strain on the various btree resorting/compacting code. + * + * The journal is just a log of keys we've inserted; on startup we just reinsert + * all the keys in the open journal entries. That means that when we're updating + * a node in the btree, we can wait until a 4k block of keys fills up before + * writing them out. + * + * For simplicity, we only journal updates to leaf nodes; updates to parent + * nodes are rare enough (since our leaf nodes are huge) that it wasn't worth + * the complexity to deal with journalling them (in particular, journal replay) + * - updates to non leaf nodes just happen synchronously (see btree_split()). + */ + +#undef pr_fmt +#ifdef __KERNEL__ +#define pr_fmt(fmt) "bcachefs: %s() " fmt "\n", __func__ +#else +#define pr_fmt(fmt) "%s() " fmt "\n", __func__ +#endif + +#include <linux/backing-dev-defs.h> +#include <linux/bug.h> +#include <linux/bio.h> +#include <linux/closure.h> +#include <linux/kobject.h> +#include <linux/list.h> +#include <linux/math64.h> +#include <linux/mutex.h> +#include <linux/percpu-refcount.h> +#include <linux/percpu-rwsem.h> +#include <linux/rhashtable.h> +#include <linux/rwsem.h> +#include <linux/semaphore.h> +#include <linux/seqlock.h> +#include <linux/shrinker.h> +#include <linux/srcu.h> +#include <linux/types.h> +#include <linux/workqueue.h> +#include <linux/zstd.h> + +#include "bcachefs_format.h" +#include "errcode.h" +#include "fifo.h" +#include "nocow_locking_types.h" +#include "opts.h" +#include "recovery_types.h" +#include "sb-errors_types.h" +#include "seqmutex.h" +#include "util.h" + +#ifdef CONFIG_BCACHEFS_DEBUG +#define BCH_WRITE_REF_DEBUG +#endif + +#ifndef dynamic_fault +#define dynamic_fault(...) 0 +#endif + +#define race_fault(...) dynamic_fault("bcachefs:race") + +#define trace_and_count(_c, _name, ...) \ +do { \ + this_cpu_inc((_c)->counters[BCH_COUNTER_##_name]); \ + trace_##_name(__VA_ARGS__); \ +} while (0) + +#define bch2_fs_init_fault(name) \ + dynamic_fault("bcachefs:bch_fs_init:" name) +#define bch2_meta_read_fault(name) \ + dynamic_fault("bcachefs:meta:read:" name) +#define bch2_meta_write_fault(name) \ + dynamic_fault("bcachefs:meta:write:" name) + +#ifdef __KERNEL__ +#define BCACHEFS_LOG_PREFIX +#endif + +#ifdef BCACHEFS_LOG_PREFIX + +#define bch2_log_msg(_c, fmt) "bcachefs (%s): " fmt, ((_c)->name) +#define bch2_fmt_dev(_ca, fmt) "bcachefs (%s): " fmt "\n", ((_ca)->name) +#define bch2_fmt_dev_offset(_ca, _offset, fmt) "bcachefs (%s sector %llu): " fmt "\n", ((_ca)->name), (_offset) +#define bch2_fmt_inum(_c, _inum, fmt) "bcachefs (%s inum %llu): " fmt "\n", ((_c)->name), (_inum) +#define bch2_fmt_inum_offset(_c, _inum, _offset, fmt) \ + "bcachefs (%s inum %llu offset %llu): " fmt "\n", ((_c)->name), (_inum), (_offset) + +#else + +#define bch2_log_msg(_c, fmt) fmt +#define bch2_fmt_dev(_ca, fmt) "%s: " fmt "\n", ((_ca)->name) +#define bch2_fmt_dev_offset(_ca, _offset, fmt) "%s sector %llu: " fmt "\n", ((_ca)->name), (_offset) +#define bch2_fmt_inum(_c, _inum, fmt) "inum %llu: " fmt "\n", (_inum) +#define bch2_fmt_inum_offset(_c, _inum, _offset, fmt) \ + "inum %llu offset %llu: " fmt "\n", (_inum), (_offset) + +#endif + +#define bch2_fmt(_c, fmt) bch2_log_msg(_c, fmt "\n") + +#define bch_info(c, fmt, ...) \ + printk(KERN_INFO bch2_fmt(c, fmt), ##__VA_ARGS__) +#define bch_notice(c, fmt, ...) \ + printk(KERN_NOTICE bch2_fmt(c, fmt), ##__VA_ARGS__) +#define bch_warn(c, fmt, ...) \ + printk(KERN_WARNING bch2_fmt(c, fmt), ##__VA_ARGS__) +#define bch_warn_ratelimited(c, fmt, ...) \ + printk_ratelimited(KERN_WARNING bch2_fmt(c, fmt), ##__VA_ARGS__) + +#define bch_err(c, fmt, ...) \ + printk(KERN_ERR bch2_fmt(c, fmt), ##__VA_ARGS__) +#define bch_err_dev(ca, fmt, ...) \ + printk(KERN_ERR bch2_fmt_dev(ca, fmt), ##__VA_ARGS__) +#define bch_err_dev_offset(ca, _offset, fmt, ...) \ + printk(KERN_ERR bch2_fmt_dev_offset(ca, _offset, fmt), ##__VA_ARGS__) +#define bch_err_inum(c, _inum, fmt, ...) \ + printk(KERN_ERR bch2_fmt_inum(c, _inum, fmt), ##__VA_ARGS__) +#define bch_err_inum_offset(c, _inum, _offset, fmt, ...) \ + printk(KERN_ERR bch2_fmt_inum_offset(c, _inum, _offset, fmt), ##__VA_ARGS__) + +#define bch_err_ratelimited(c, fmt, ...) \ + printk_ratelimited(KERN_ERR bch2_fmt(c, fmt), ##__VA_ARGS__) +#define bch_err_dev_ratelimited(ca, fmt, ...) \ + printk_ratelimited(KERN_ERR bch2_fmt_dev(ca, fmt), ##__VA_ARGS__) +#define bch_err_dev_offset_ratelimited(ca, _offset, fmt, ...) \ + printk_ratelimited(KERN_ERR bch2_fmt_dev_offset(ca, _offset, fmt), ##__VA_ARGS__) +#define bch_err_inum_ratelimited(c, _inum, fmt, ...) \ + printk_ratelimited(KERN_ERR bch2_fmt_inum(c, _inum, fmt), ##__VA_ARGS__) +#define bch_err_inum_offset_ratelimited(c, _inum, _offset, fmt, ...) \ + printk_ratelimited(KERN_ERR bch2_fmt_inum_offset(c, _inum, _offset, fmt), ##__VA_ARGS__) + +#define bch_err_fn(_c, _ret) \ +do { \ + if (_ret && !bch2_err_matches(_ret, BCH_ERR_transaction_restart))\ + bch_err(_c, "%s(): error %s", __func__, bch2_err_str(_ret));\ +} while (0) + +#define bch_err_msg(_c, _ret, _msg, ...) \ +do { \ + if (_ret && !bch2_err_matches(_ret, BCH_ERR_transaction_restart))\ + bch_err(_c, "%s(): error " _msg " %s", __func__, \ + ##__VA_ARGS__, bch2_err_str(_ret)); \ +} while (0) + +#define bch_verbose(c, fmt, ...) \ +do { \ + if ((c)->opts.verbose) \ + bch_info(c, fmt, ##__VA_ARGS__); \ +} while (0) + +#define pr_verbose_init(opts, fmt, ...) \ +do { \ + if (opt_get(opts, verbose)) \ + pr_info(fmt, ##__VA_ARGS__); \ +} while (0) + +/* Parameters that are useful for debugging, but should always be compiled in: */ +#define BCH_DEBUG_PARAMS_ALWAYS() \ + BCH_DEBUG_PARAM(key_merging_disabled, \ + "Disables merging of extents") \ + BCH_DEBUG_PARAM(btree_gc_always_rewrite, \ + "Causes mark and sweep to compact and rewrite every " \ + "btree node it traverses") \ + BCH_DEBUG_PARAM(btree_gc_rewrite_disabled, \ + "Disables rewriting of btree nodes during mark and sweep")\ + BCH_DEBUG_PARAM(btree_shrinker_disabled, \ + "Disables the shrinker callback for the btree node cache")\ + BCH_DEBUG_PARAM(verify_btree_ondisk, \ + "Reread btree nodes at various points to verify the " \ + "mergesort in the read path against modifications " \ + "done in memory") \ + BCH_DEBUG_PARAM(verify_all_btree_replicas, \ + "When reading btree nodes, read all replicas and " \ + "compare them") \ + BCH_DEBUG_PARAM(backpointers_no_use_write_buffer, \ + "Don't use the write buffer for backpointers, enabling "\ + "extra runtime checks") + +/* Parameters that should only be compiled in debug mode: */ +#define BCH_DEBUG_PARAMS_DEBUG() \ + BCH_DEBUG_PARAM(expensive_debug_checks, \ + "Enables various runtime debugging checks that " \ + "significantly affect performance") \ + BCH_DEBUG_PARAM(debug_check_iterators, \ + "Enables extra verification for btree iterators") \ + BCH_DEBUG_PARAM(debug_check_btree_accounting, \ + "Verify btree accounting for keys within a node") \ + BCH_DEBUG_PARAM(journal_seq_verify, \ + "Store the journal sequence number in the version " \ + "number of every btree key, and verify that btree " \ + "update ordering is preserved during recovery") \ + BCH_DEBUG_PARAM(inject_invalid_keys, \ + "Store the journal sequence number in the version " \ + "number of every btree key, and verify that btree " \ + "update ordering is preserved during recovery") \ + BCH_DEBUG_PARAM(test_alloc_startup, \ + "Force allocator startup to use the slowpath where it" \ + "can't find enough free buckets without invalidating" \ + "cached data") \ + BCH_DEBUG_PARAM(force_reconstruct_read, \ + "Force reads to use the reconstruct path, when reading" \ + "from erasure coded extents") \ + BCH_DEBUG_PARAM(test_restart_gc, \ + "Test restarting mark and sweep gc when bucket gens change") + +#define BCH_DEBUG_PARAMS_ALL() BCH_DEBUG_PARAMS_ALWAYS() BCH_DEBUG_PARAMS_DEBUG() + +#ifdef CONFIG_BCACHEFS_DEBUG +#define BCH_DEBUG_PARAMS() BCH_DEBUG_PARAMS_ALL() +#else +#define BCH_DEBUG_PARAMS() BCH_DEBUG_PARAMS_ALWAYS() +#endif + +#define BCH_DEBUG_PARAM(name, description) extern bool bch2_##name; +BCH_DEBUG_PARAMS() +#undef BCH_DEBUG_PARAM + +#ifndef CONFIG_BCACHEFS_DEBUG +#define BCH_DEBUG_PARAM(name, description) static const __maybe_unused bool bch2_##name; +BCH_DEBUG_PARAMS_DEBUG() +#undef BCH_DEBUG_PARAM +#endif + +#define BCH_TIME_STATS() \ + x(btree_node_mem_alloc) \ + x(btree_node_split) \ + x(btree_node_compact) \ + x(btree_node_merge) \ + x(btree_node_sort) \ + x(btree_node_read) \ + x(btree_interior_update_foreground) \ + x(btree_interior_update_total) \ + x(btree_gc) \ + x(data_write) \ + x(data_read) \ + x(data_promote) \ + x(journal_flush_write) \ + x(journal_noflush_write) \ + x(journal_flush_seq) \ + x(blocked_journal) \ + x(blocked_allocate) \ + x(blocked_allocate_open_bucket) \ + x(nocow_lock_contended) + +enum bch_time_stats { +#define x(name) BCH_TIME_##name, + BCH_TIME_STATS() +#undef x + BCH_TIME_STAT_NR +}; + +#include "alloc_types.h" +#include "btree_types.h" +#include "btree_write_buffer_types.h" +#include "buckets_types.h" +#include "buckets_waiting_for_journal_types.h" +#include "clock_types.h" +#include "disk_groups_types.h" +#include "ec_types.h" +#include "journal_types.h" +#include "keylist_types.h" +#include "quota_types.h" +#include "rebalance_types.h" +#include "replicas_types.h" +#include "subvolume_types.h" +#include "super_types.h" + +/* Number of nodes btree coalesce will try to coalesce at once */ +#define GC_MERGE_NODES 4U + +/* Maximum number of nodes we might need to allocate atomically: */ +#define BTREE_RESERVE_MAX (BTREE_MAX_DEPTH + (BTREE_MAX_DEPTH - 1)) + +/* Size of the freelist we allocate btree nodes from: */ +#define BTREE_NODE_RESERVE (BTREE_RESERVE_MAX * 4) + +#define BTREE_NODE_OPEN_BUCKET_RESERVE (BTREE_RESERVE_MAX * BCH_REPLICAS_MAX) + +struct btree; + +enum gc_phase { + GC_PHASE_NOT_RUNNING, + GC_PHASE_START, + GC_PHASE_SB, + + GC_PHASE_BTREE_stripes, + GC_PHASE_BTREE_extents, + GC_PHASE_BTREE_inodes, + GC_PHASE_BTREE_dirents, + GC_PHASE_BTREE_xattrs, + GC_PHASE_BTREE_alloc, + GC_PHASE_BTREE_quotas, + GC_PHASE_BTREE_reflink, + GC_PHASE_BTREE_subvolumes, + GC_PHASE_BTREE_snapshots, + GC_PHASE_BTREE_lru, + GC_PHASE_BTREE_freespace, + GC_PHASE_BTREE_need_discard, + GC_PHASE_BTREE_backpointers, + GC_PHASE_BTREE_bucket_gens, + GC_PHASE_BTREE_snapshot_trees, + GC_PHASE_BTREE_deleted_inodes, + GC_PHASE_BTREE_logged_ops, + GC_PHASE_BTREE_rebalance_work, + + GC_PHASE_PENDING_DELETE, +}; + +struct gc_pos { + enum gc_phase phase; + struct bpos pos; + unsigned level; +}; + +struct reflink_gc { + u64 offset; + u32 size; + u32 refcount; +}; + +typedef GENRADIX(struct reflink_gc) reflink_gc_table; + +struct io_count { + u64 sectors[2][BCH_DATA_NR]; +}; + +struct bch_dev { + struct kobject kobj; + struct percpu_ref ref; + struct completion ref_completion; + struct percpu_ref io_ref; + struct completion io_ref_completion; + + struct bch_fs *fs; + + u8 dev_idx; + /* + * Cached version of this device's member info from superblock + * Committed by bch2_write_super() -> bch_fs_mi_update() + */ + struct bch_member_cpu mi; + atomic64_t errors[BCH_MEMBER_ERROR_NR]; + + __uuid_t uuid; + char name[BDEVNAME_SIZE]; + + struct bch_sb_handle disk_sb; + struct bch_sb *sb_read_scratch; + int sb_write_error; + dev_t dev; + atomic_t flush_seq; + + struct bch_devs_mask self; + + /* biosets used in cloned bios for writing multiple replicas */ + struct bio_set replica_set; + + /* + * Buckets: + * Per-bucket arrays are protected by c->mark_lock, bucket_lock and + * gc_lock, for device resize - holding any is sufficient for access: + * Or rcu_read_lock(), but only for ptr_stale(): + */ + struct bucket_array __rcu *buckets_gc; + struct bucket_gens __rcu *bucket_gens; + u8 *oldest_gen; + unsigned long *buckets_nouse; + struct rw_semaphore bucket_lock; + + struct bch_dev_usage *usage_base; + struct bch_dev_usage __percpu *usage[JOURNAL_BUF_NR]; + struct bch_dev_usage __percpu *usage_gc; + + /* Allocator: */ + u64 new_fs_bucket_idx; + u64 alloc_cursor; + + unsigned nr_open_buckets; + unsigned nr_btree_reserve; + + size_t inc_gen_needs_gc; + size_t inc_gen_really_needs_gc; + size_t buckets_waiting_on_journal; + + atomic64_t rebalance_work; + + struct journal_device journal; + u64 prev_journal_sector; + + struct work_struct io_error_work; + + /* The rest of this all shows up in sysfs */ + atomic64_t cur_latency[2]; + struct bch2_time_stats io_latency[2]; + +#define CONGESTED_MAX 1024 + atomic_t congested; + u64 congested_last; + + struct io_count __percpu *io_done; +}; + +enum { + /* startup: */ + BCH_FS_STARTED, + BCH_FS_MAY_GO_RW, + BCH_FS_RW, + BCH_FS_WAS_RW, + + /* shutdown: */ + BCH_FS_STOPPING, + BCH_FS_EMERGENCY_RO, + BCH_FS_GOING_RO, + BCH_FS_WRITE_DISABLE_COMPLETE, + BCH_FS_CLEAN_SHUTDOWN, + + /* fsck passes: */ + BCH_FS_FSCK_DONE, + BCH_FS_INITIAL_GC_UNFIXED, /* kill when we enumerate fsck errors */ + BCH_FS_NEED_ANOTHER_GC, + + BCH_FS_NEED_DELETE_DEAD_SNAPSHOTS, + + /* errors: */ + BCH_FS_ERROR, + BCH_FS_TOPOLOGY_ERROR, + BCH_FS_ERRORS_FIXED, + BCH_FS_ERRORS_NOT_FIXED, +}; + +struct btree_debug { + unsigned id; +}; + +#define BCH_TRANSACTIONS_NR 128 + +struct btree_transaction_stats { + struct bch2_time_stats lock_hold_times; + struct mutex lock; + unsigned nr_max_paths; + unsigned wb_updates_size; + unsigned max_mem; + char *max_paths_text; +}; + +struct bch_fs_pcpu { + u64 sectors_available; +}; + +struct journal_seq_blacklist_table { + size_t nr; + struct journal_seq_blacklist_table_entry { + u64 start; + u64 end; + bool dirty; + } entries[]; +}; + +struct journal_keys { + struct journal_key { + u64 journal_seq; + u32 journal_offset; + enum btree_id btree_id:8; + unsigned level:8; + bool allocated; + bool overwritten; + struct bkey_i *k; + } *d; + /* + * Gap buffer: instead of all the empty space in the array being at the + * end of the buffer - from @nr to @size - the empty space is at @gap. + * This means that sequential insertions are O(n) instead of O(n^2). + */ + size_t gap; + size_t nr; + size_t size; + atomic_t ref; + bool initial_ref_held; +}; + +struct btree_trans_buf { + struct btree_trans *trans; +}; + +#define REPLICAS_DELTA_LIST_MAX (1U << 16) + +#define BCACHEFS_ROOT_SUBVOL_INUM \ + ((subvol_inum) { BCACHEFS_ROOT_SUBVOL, BCACHEFS_ROOT_INO }) + +#define BCH_WRITE_REFS() \ + x(trans) \ + x(write) \ + x(promote) \ + x(node_rewrite) \ + x(stripe_create) \ + x(stripe_delete) \ + x(reflink) \ + x(fallocate) \ + x(discard) \ + x(invalidate) \ + x(delete_dead_snapshots) \ + x(snapshot_delete_pagecache) \ + x(sysfs) + +enum bch_write_ref { +#define x(n) BCH_WRITE_REF_##n, + BCH_WRITE_REFS() +#undef x + BCH_WRITE_REF_NR, +}; + +struct bch_fs { + struct closure cl; + + struct list_head list; + struct kobject kobj; + struct kobject counters_kobj; + struct kobject internal; + struct kobject opts_dir; + struct kobject time_stats; + unsigned long flags; + + int minor; + struct device *chardev; + struct super_block *vfs_sb; + dev_t dev; + char name[40]; + + /* ro/rw, add/remove/resize devices: */ + struct rw_semaphore state_lock; + + /* Counts outstanding writes, for clean transition to read-only */ +#ifdef BCH_WRITE_REF_DEBUG + atomic_long_t writes[BCH_WRITE_REF_NR]; +#else + struct percpu_ref writes; +#endif + struct work_struct read_only_work; + + struct bch_dev __rcu *devs[BCH_SB_MEMBERS_MAX]; + + struct bch_replicas_cpu replicas; + struct bch_replicas_cpu replicas_gc; + struct mutex replicas_gc_lock; + mempool_t replicas_delta_pool; + + struct journal_entry_res btree_root_journal_res; + struct journal_entry_res replicas_journal_res; + struct journal_entry_res clock_journal_res; + struct journal_entry_res dev_usage_journal_res; + + struct bch_disk_groups_cpu __rcu *disk_groups; + + struct bch_opts opts; + + /* Updated by bch2_sb_update():*/ + struct { + __uuid_t uuid; + __uuid_t user_uuid; + + u16 version; + u16 version_min; + u16 version_upgrade_complete; + + u8 nr_devices; + u8 clean; + + u8 encryption_type; + + u64 time_base_lo; + u32 time_base_hi; + unsigned time_units_per_sec; + unsigned nsec_per_time_unit; + u64 features; + u64 compat; + } sb; + + + struct bch_sb_handle disk_sb; + + unsigned short block_bits; /* ilog2(block_size) */ + + u16 btree_foreground_merge_threshold; + + struct closure sb_write; + struct mutex sb_lock; + + /* snapshot.c: */ + struct snapshot_table __rcu *snapshots; + size_t snapshot_table_size; + struct mutex snapshot_table_lock; + struct rw_semaphore snapshot_create_lock; + + struct work_struct snapshot_delete_work; + struct work_struct snapshot_wait_for_pagecache_and_delete_work; + snapshot_id_list snapshots_unlinked; + struct mutex snapshots_unlinked_lock; + + /* BTREE CACHE */ + struct bio_set btree_bio; + struct workqueue_struct *io_complete_wq; + + struct btree_root btree_roots_known[BTREE_ID_NR]; + DARRAY(struct btree_root) btree_roots_extra; + struct mutex btree_root_lock; + + struct btree_cache btree_cache; + + /* + * Cache of allocated btree nodes - if we allocate a btree node and + * don't use it, if we free it that space can't be reused until going + * _all_ the way through the allocator (which exposes us to a livelock + * when allocating btree reserves fail halfway through) - instead, we + * can stick them here: + */ + struct btree_alloc btree_reserve_cache[BTREE_NODE_RESERVE * 2]; + unsigned btree_reserve_cache_nr; + struct mutex btree_reserve_cache_lock; + + mempool_t btree_interior_update_pool; + struct list_head btree_interior_update_list; + struct list_head btree_interior_updates_unwritten; + struct mutex btree_interior_update_lock; + struct closure_waitlist btree_interior_update_wait; + + struct workqueue_struct *btree_interior_update_worker; + struct work_struct btree_interior_update_work; + + struct list_head pending_node_rewrites; + struct mutex pending_node_rewrites_lock; + + /* btree_io.c: */ + spinlock_t btree_write_error_lock; + struct btree_write_stats { + atomic64_t nr; + atomic64_t bytes; + } btree_write_stats[BTREE_WRITE_TYPE_NR]; + + /* btree_iter.c: */ + struct seqmutex btree_trans_lock; + struct list_head btree_trans_list; + mempool_t btree_trans_pool; + mempool_t btree_trans_mem_pool; + struct btree_trans_buf __percpu *btree_trans_bufs; + + struct srcu_struct btree_trans_barrier; + bool btree_trans_barrier_initialized; + + struct btree_key_cache btree_key_cache; + unsigned btree_key_cache_btrees; + + struct btree_write_buffer btree_write_buffer; + + struct workqueue_struct *btree_update_wq; + struct workqueue_struct *btree_io_complete_wq; + /* copygc needs its own workqueue for index updates.. */ + struct workqueue_struct *copygc_wq; + /* + * Use a dedicated wq for write ref holder tasks. Required to avoid + * dependency problems with other wq tasks that can block on ref + * draining, such as read-only transition. + */ + struct workqueue_struct *write_ref_wq; + + /* ALLOCATION */ + struct bch_devs_mask rw_devs[BCH_DATA_NR]; + + u64 capacity; /* sectors */ + + /* + * When capacity _decreases_ (due to a disk being removed), we + * increment capacity_gen - this invalidates outstanding reservations + * and forces them to be revalidated + */ + u32 capacity_gen; + unsigned bucket_size_max; + + atomic64_t sectors_available; + struct mutex sectors_available_lock; + + struct bch_fs_pcpu __percpu *pcpu; + + struct percpu_rw_semaphore mark_lock; + + seqcount_t usage_lock; + struct bch_fs_usage *usage_base; + struct bch_fs_usage __percpu *usage[JOURNAL_BUF_NR]; + struct bch_fs_usage __percpu *usage_gc; + u64 __percpu *online_reserved; + + /* single element mempool: */ + struct mutex usage_scratch_lock; + struct bch_fs_usage_online *usage_scratch; + + struct io_clock io_clock[2]; + + /* JOURNAL SEQ BLACKLIST */ + struct journal_seq_blacklist_table * + journal_seq_blacklist_table; + struct work_struct journal_seq_blacklist_gc_work; + + /* ALLOCATOR */ + spinlock_t freelist_lock; + struct closure_waitlist freelist_wait; + u64 blocked_allocate; + u64 blocked_allocate_open_bucket; + + open_bucket_idx_t open_buckets_freelist; + open_bucket_idx_t open_buckets_nr_free; + struct closure_waitlist open_buckets_wait; + struct open_bucket open_buckets[OPEN_BUCKETS_COUNT]; + open_bucket_idx_t open_buckets_hash[OPEN_BUCKETS_COUNT]; + + open_bucket_idx_t open_buckets_partial[OPEN_BUCKETS_COUNT]; + open_bucket_idx_t open_buckets_partial_nr; + + struct write_point btree_write_point; + struct write_point rebalance_write_point; + + struct write_point write_points[WRITE_POINT_MAX]; + struct hlist_head write_points_hash[WRITE_POINT_HASH_NR]; + struct mutex write_points_hash_lock; + unsigned write_points_nr; + + struct buckets_waiting_for_journal buckets_waiting_for_journal; + struct work_struct discard_work; + struct work_struct invalidate_work; + + /* GARBAGE COLLECTION */ + struct task_struct *gc_thread; + atomic_t kick_gc; + unsigned long gc_count; + + enum btree_id gc_gens_btree; + struct bpos gc_gens_pos; + + /* + * Tracks GC's progress - everything in the range [ZERO_KEY..gc_cur_pos] + * has been marked by GC. + * + * gc_cur_phase is a superset of btree_ids (BTREE_ID_extents etc.) + * + * Protected by gc_pos_lock. Only written to by GC thread, so GC thread + * can read without a lock. + */ + seqcount_t gc_pos_lock; + struct gc_pos gc_pos; + + /* + * The allocation code needs gc_mark in struct bucket to be correct, but + * it's not while a gc is in progress. + */ + struct rw_semaphore gc_lock; + struct mutex gc_gens_lock; + + /* IO PATH */ + struct semaphore io_in_flight; + struct bio_set bio_read; + struct bio_set bio_read_split; + struct bio_set bio_write; + struct mutex bio_bounce_pages_lock; + mempool_t bio_bounce_pages; + struct bucket_nocow_lock_table + nocow_locks; + struct rhashtable promote_table; + + mempool_t compression_bounce[2]; + mempool_t compress_workspace[BCH_COMPRESSION_TYPE_NR]; + mempool_t decompress_workspace; + size_t zstd_workspace_size; + + struct crypto_shash *sha256; + struct crypto_sync_skcipher *chacha20; + struct crypto_shash *poly1305; + + atomic64_t key_version; + + mempool_t large_bkey_pool; + + /* MOVE.C */ + struct list_head moving_context_list; + struct mutex moving_context_lock; + + /* REBALANCE */ + struct bch_fs_rebalance rebalance; + + /* COPYGC */ + struct task_struct *copygc_thread; + struct write_point copygc_write_point; + s64 copygc_wait_at; + s64 copygc_wait; + bool copygc_running; + wait_queue_head_t copygc_running_wq; + + /* STRIPES: */ + GENRADIX(struct stripe) stripes; + GENRADIX(struct gc_stripe) gc_stripes; + + struct hlist_head ec_stripes_new[32]; + spinlock_t ec_stripes_new_lock; + + ec_stripes_heap ec_stripes_heap; + struct mutex ec_stripes_heap_lock; + + /* ERASURE CODING */ + struct list_head ec_stripe_head_list; + struct mutex ec_stripe_head_lock; + + struct list_head ec_stripe_new_list; + struct mutex ec_stripe_new_lock; + wait_queue_head_t ec_stripe_new_wait; + + struct work_struct ec_stripe_create_work; + u64 ec_stripe_hint; + + struct work_struct ec_stripe_delete_work; + + struct bio_set ec_bioset; + + /* REFLINK */ + reflink_gc_table reflink_gc_table; + size_t reflink_gc_nr; + + /* fs.c */ + struct list_head vfs_inodes_list; + struct mutex vfs_inodes_lock; + + /* VFS IO PATH - fs-io.c */ + struct bio_set writepage_bioset; + struct bio_set dio_write_bioset; + struct bio_set dio_read_bioset; + struct bio_set nocow_flush_bioset; + + /* QUOTAS */ + struct bch_memquota_type quotas[QTYP_NR]; + + /* RECOVERY */ + u64 journal_replay_seq_start; + u64 journal_replay_seq_end; + enum bch_recovery_pass curr_recovery_pass; + /* bitmap of explicitly enabled recovery passes: */ + u64 recovery_passes_explicit; + u64 recovery_passes_complete; + + /* DEBUG JUNK */ + struct dentry *fs_debug_dir; + struct dentry *btree_debug_dir; + struct btree_debug btree_debug[BTREE_ID_NR]; + struct btree *verify_data; + struct btree_node *verify_ondisk; + struct mutex verify_lock; + + u64 *unused_inode_hints; + unsigned inode_shard_bits; + + /* + * A btree node on disk could have too many bsets for an iterator to fit + * on the stack - have to dynamically allocate them + */ + mempool_t fill_iter; + + mempool_t btree_bounce_pool; + + struct journal journal; + GENRADIX(struct journal_replay *) journal_entries; + u64 journal_entries_base_seq; + struct journal_keys journal_keys; + struct list_head journal_iters; + + u64 last_bucket_seq_cleanup; + + u64 counters_on_mount[BCH_COUNTER_NR]; + u64 __percpu *counters; + + unsigned btree_gc_periodic:1; + unsigned copy_gc_enabled:1; + bool promote_whole_extents; + + struct bch2_time_stats times[BCH_TIME_STAT_NR]; + + struct btree_transaction_stats btree_transaction_stats[BCH_TRANSACTIONS_NR]; + + /* ERRORS */ + struct list_head fsck_error_msgs; + struct mutex fsck_error_msgs_lock; + bool fsck_alloc_msgs_err; + + bch_sb_errors_cpu fsck_error_counts; + struct mutex fsck_error_counts_lock; +}; + +extern struct wait_queue_head bch2_read_only_wait; + +static inline void bch2_write_ref_get(struct bch_fs *c, enum bch_write_ref ref) +{ +#ifdef BCH_WRITE_REF_DEBUG + atomic_long_inc(&c->writes[ref]); +#else + percpu_ref_get(&c->writes); +#endif +} + +static inline bool bch2_write_ref_tryget(struct bch_fs *c, enum bch_write_ref ref) +{ +#ifdef BCH_WRITE_REF_DEBUG + return !test_bit(BCH_FS_GOING_RO, &c->flags) && + atomic_long_inc_not_zero(&c->writes[ref]); +#else + return percpu_ref_tryget_live(&c->writes); +#endif +} + +static inline void bch2_write_ref_put(struct bch_fs *c, enum bch_write_ref ref) +{ +#ifdef BCH_WRITE_REF_DEBUG + long v = atomic_long_dec_return(&c->writes[ref]); + + BUG_ON(v < 0); + if (v) + return; + for (unsigned i = 0; i < BCH_WRITE_REF_NR; i++) + if (atomic_long_read(&c->writes[i])) + return; + + set_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags); + wake_up(&bch2_read_only_wait); +#else + percpu_ref_put(&c->writes); +#endif +} + +static inline void bch2_set_ra_pages(struct bch_fs *c, unsigned ra_pages) +{ +#ifndef NO_BCACHEFS_FS + if (c->vfs_sb) + c->vfs_sb->s_bdi->ra_pages = ra_pages; +#endif +} + +static inline unsigned bucket_bytes(const struct bch_dev *ca) +{ + return ca->mi.bucket_size << 9; +} + +static inline unsigned block_bytes(const struct bch_fs *c) +{ + return c->opts.block_size; +} + +static inline unsigned block_sectors(const struct bch_fs *c) +{ + return c->opts.block_size >> 9; +} + +static inline size_t btree_sectors(const struct bch_fs *c) +{ + return c->opts.btree_node_size >> 9; +} + +static inline bool btree_id_cached(const struct bch_fs *c, enum btree_id btree) +{ + return c->btree_key_cache_btrees & (1U << btree); +} + +static inline struct timespec64 bch2_time_to_timespec(const struct bch_fs *c, s64 time) +{ + struct timespec64 t; + s32 rem; + + time += c->sb.time_base_lo; + + t.tv_sec = div_s64_rem(time, c->sb.time_units_per_sec, &rem); + t.tv_nsec = rem * c->sb.nsec_per_time_unit; + return t; +} + +static inline s64 timespec_to_bch2_time(const struct bch_fs *c, struct timespec64 ts) +{ + return (ts.tv_sec * c->sb.time_units_per_sec + + (int) ts.tv_nsec / c->sb.nsec_per_time_unit) - c->sb.time_base_lo; +} + +static inline s64 bch2_current_time(const struct bch_fs *c) +{ + struct timespec64 now; + + ktime_get_coarse_real_ts64(&now); + return timespec_to_bch2_time(c, now); +} + +static inline bool bch2_dev_exists2(const struct bch_fs *c, unsigned dev) +{ + return dev < c->sb.nr_devices && c->devs[dev]; +} + +#define BKEY_PADDED_ONSTACK(key, pad) \ + struct { struct bkey_i key; __u64 key ## _pad[pad]; } + +#endif /* _BCACHEFS_H */ diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h new file mode 100644 index 000000000000..1ab1f08d763b --- /dev/null +++ b/fs/bcachefs/bcachefs_format.h @@ -0,0 +1,2429 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_FORMAT_H +#define _BCACHEFS_FORMAT_H + +/* + * bcachefs on disk data structures + * + * OVERVIEW: + * + * There are three main types of on disk data structures in bcachefs (this is + * reduced from 5 in bcache) + * + * - superblock + * - journal + * - btree + * + * The btree is the primary structure; most metadata exists as keys in the + * various btrees. There are only a small number of btrees, they're not + * sharded - we have one btree for extents, another for inodes, et cetera. + * + * SUPERBLOCK: + * + * The superblock contains the location of the journal, the list of devices in + * the filesystem, and in general any metadata we need in order to decide + * whether we can start a filesystem or prior to reading the journal/btree + * roots. + * + * The superblock is extensible, and most of the contents of the superblock are + * in variable length, type tagged fields; see struct bch_sb_field. + * + * Backup superblocks do not reside in a fixed location; also, superblocks do + * not have a fixed size. To locate backup superblocks we have struct + * bch_sb_layout; we store a copy of this inside every superblock, and also + * before the first superblock. + * + * JOURNAL: + * + * The journal primarily records btree updates in the order they occurred; + * journal replay consists of just iterating over all the keys in the open + * journal entries and re-inserting them into the btrees. + * + * The journal also contains entry types for the btree roots, and blacklisted + * journal sequence numbers (see journal_seq_blacklist.c). + * + * BTREE: + * + * bcachefs btrees are copy on write b+ trees, where nodes are big (typically + * 128k-256k) and log structured. We use struct btree_node for writing the first + * entry in a given node (offset 0), and struct btree_node_entry for all + * subsequent writes. + * + * After the header, btree node entries contain a list of keys in sorted order. + * Values are stored inline with the keys; since values are variable length (and + * keys effectively are variable length too, due to packing) we can't do random + * access without building up additional in memory tables in the btree node read + * path. + * + * BTREE KEYS (struct bkey): + * + * The various btrees share a common format for the key - so as to avoid + * switching in fastpath lookup/comparison code - but define their own + * structures for the key values. + * + * The size of a key/value pair is stored as a u8 in units of u64s, so the max + * size is just under 2k. The common part also contains a type tag for the + * value, and a format field indicating whether the key is packed or not (and + * also meant to allow adding new key fields in the future, if desired). + * + * bkeys, when stored within a btree node, may also be packed. In that case, the + * bkey_format in that node is used to unpack it. Packed bkeys mean that we can + * be generous with field sizes in the common part of the key format (64 bit + * inode number, 64 bit offset, 96 bit version field, etc.) for negligible cost. + */ + +#include <asm/types.h> +#include <asm/byteorder.h> +#include <linux/kernel.h> +#include <linux/uuid.h> +#include "vstructs.h" + +#ifdef __KERNEL__ +typedef uuid_t __uuid_t; +#endif + +#define BITMASK(name, type, field, offset, end) \ +static const __maybe_unused unsigned name##_OFFSET = offset; \ +static const __maybe_unused unsigned name##_BITS = (end - offset); \ + \ +static inline __u64 name(const type *k) \ +{ \ + return (k->field >> offset) & ~(~0ULL << (end - offset)); \ +} \ + \ +static inline void SET_##name(type *k, __u64 v) \ +{ \ + k->field &= ~(~(~0ULL << (end - offset)) << offset); \ + k->field |= (v & ~(~0ULL << (end - offset))) << offset; \ +} + +#define LE_BITMASK(_bits, name, type, field, offset, end) \ +static const __maybe_unused unsigned name##_OFFSET = offset; \ +static const __maybe_unused unsigned name##_BITS = (end - offset); \ +static const __maybe_unused __u##_bits name##_MAX = (1ULL << (end - offset)) - 1;\ + \ +static inline __u64 name(const type *k) \ +{ \ + return (__le##_bits##_to_cpu(k->field) >> offset) & \ + ~(~0ULL << (end - offset)); \ +} \ + \ +static inline void SET_##name(type *k, __u64 v) \ +{ \ + __u##_bits new = __le##_bits##_to_cpu(k->field); \ + \ + new &= ~(~(~0ULL << (end - offset)) << offset); \ + new |= (v & ~(~0ULL << (end - offset))) << offset; \ + k->field = __cpu_to_le##_bits(new); \ +} + +#define LE16_BITMASK(n, t, f, o, e) LE_BITMASK(16, n, t, f, o, e) +#define LE32_BITMASK(n, t, f, o, e) LE_BITMASK(32, n, t, f, o, e) +#define LE64_BITMASK(n, t, f, o, e) LE_BITMASK(64, n, t, f, o, e) + +struct bkey_format { + __u8 key_u64s; + __u8 nr_fields; + /* One unused slot for now: */ + __u8 bits_per_field[6]; + __le64 field_offset[6]; +}; + +/* Btree keys - all units are in sectors */ + +struct bpos { + /* + * Word order matches machine byte order - btree code treats a bpos as a + * single large integer, for search/comparison purposes + * + * Note that wherever a bpos is embedded in another on disk data + * structure, it has to be byte swabbed when reading in metadata that + * wasn't written in native endian order: + */ +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ + __u32 snapshot; + __u64 offset; + __u64 inode; +#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ + __u64 inode; + __u64 offset; /* Points to end of extent - sectors */ + __u32 snapshot; +#else +#error edit for your odd byteorder. +#endif +} __packed +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ +__aligned(4) +#endif +; + +#define KEY_INODE_MAX ((__u64)~0ULL) +#define KEY_OFFSET_MAX ((__u64)~0ULL) +#define KEY_SNAPSHOT_MAX ((__u32)~0U) +#define KEY_SIZE_MAX ((__u32)~0U) + +static inline struct bpos SPOS(__u64 inode, __u64 offset, __u32 snapshot) +{ + return (struct bpos) { + .inode = inode, + .offset = offset, + .snapshot = snapshot, + }; +} + +#define POS_MIN SPOS(0, 0, 0) +#define POS_MAX SPOS(KEY_INODE_MAX, KEY_OFFSET_MAX, 0) +#define SPOS_MAX SPOS(KEY_INODE_MAX, KEY_OFFSET_MAX, KEY_SNAPSHOT_MAX) +#define POS(_inode, _offset) SPOS(_inode, _offset, 0) + +/* Empty placeholder struct, for container_of() */ +struct bch_val { + __u64 __nothing[0]; +}; + +struct bversion { +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ + __u64 lo; + __u32 hi; +#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ + __u32 hi; + __u64 lo; +#endif +} __packed __aligned(4); + +struct bkey { + /* Size of combined key and value, in u64s */ + __u8 u64s; + + /* Format of key (0 for format local to btree node) */ +#if defined(__LITTLE_ENDIAN_BITFIELD) + __u8 format:7, + needs_whiteout:1; +#elif defined (__BIG_ENDIAN_BITFIELD) + __u8 needs_whiteout:1, + format:7; +#else +#error edit for your odd byteorder. +#endif + + /* Type of the value */ + __u8 type; + +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ + __u8 pad[1]; + + struct bversion version; + __u32 size; /* extent size, in sectors */ + struct bpos p; +#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ + struct bpos p; + __u32 size; /* extent size, in sectors */ + struct bversion version; + + __u8 pad[1]; +#endif +} __packed __aligned(8); + +struct bkey_packed { + __u64 _data[0]; + + /* Size of combined key and value, in u64s */ + __u8 u64s; + + /* Format of key (0 for format local to btree node) */ + + /* + * XXX: next incompat on disk format change, switch format and + * needs_whiteout - bkey_packed() will be cheaper if format is the high + * bits of the bitfield + */ +#if defined(__LITTLE_ENDIAN_BITFIELD) + __u8 format:7, + needs_whiteout:1; +#elif defined (__BIG_ENDIAN_BITFIELD) + __u8 needs_whiteout:1, + format:7; +#endif + + /* Type of the value */ + __u8 type; + __u8 key_start[0]; + + /* + * We copy bkeys with struct assignment in various places, and while + * that shouldn't be done with packed bkeys we can't disallow it in C, + * and it's legal to cast a bkey to a bkey_packed - so padding it out + * to the same size as struct bkey should hopefully be safest. + */ + __u8 pad[sizeof(struct bkey) - 3]; +} __packed __aligned(8); + +typedef struct { + __le64 lo; + __le64 hi; +} bch_le128; + +#define BKEY_U64s (sizeof(struct bkey) / sizeof(__u64)) +#define BKEY_U64s_MAX U8_MAX +#define BKEY_VAL_U64s_MAX (BKEY_U64s_MAX - BKEY_U64s) + +#define KEY_PACKED_BITS_START 24 + +#define KEY_FORMAT_LOCAL_BTREE 0 +#define KEY_FORMAT_CURRENT 1 + +enum bch_bkey_fields { + BKEY_FIELD_INODE, + BKEY_FIELD_OFFSET, + BKEY_FIELD_SNAPSHOT, + BKEY_FIELD_SIZE, + BKEY_FIELD_VERSION_HI, + BKEY_FIELD_VERSION_LO, + BKEY_NR_FIELDS, +}; + +#define bkey_format_field(name, field) \ + [BKEY_FIELD_##name] = (sizeof(((struct bkey *) NULL)->field) * 8) + +#define BKEY_FORMAT_CURRENT \ +((struct bkey_format) { \ + .key_u64s = BKEY_U64s, \ + .nr_fields = BKEY_NR_FIELDS, \ + .bits_per_field = { \ + bkey_format_field(INODE, p.inode), \ + bkey_format_field(OFFSET, p.offset), \ + bkey_format_field(SNAPSHOT, p.snapshot), \ + bkey_format_field(SIZE, size), \ + bkey_format_field(VERSION_HI, version.hi), \ + bkey_format_field(VERSION_LO, version.lo), \ + }, \ +}) + +/* bkey with inline value */ +struct bkey_i { + __u64 _data[0]; + + struct bkey k; + struct bch_val v; +}; + +#define KEY(_inode, _offset, _size) \ +((struct bkey) { \ + .u64s = BKEY_U64s, \ + .format = KEY_FORMAT_CURRENT, \ + .p = POS(_inode, _offset), \ + .size = _size, \ +}) + +static inline void bkey_init(struct bkey *k) +{ + *k = KEY(0, 0, 0); +} + +#define bkey_bytes(_k) ((_k)->u64s * sizeof(__u64)) + +#define __BKEY_PADDED(key, pad) \ + struct bkey_i key; __u64 key ## _pad[pad] + +/* + * - DELETED keys are used internally to mark keys that should be ignored but + * override keys in composition order. Their version number is ignored. + * + * - DISCARDED keys indicate that the data is all 0s because it has been + * discarded. DISCARDs may have a version; if the version is nonzero the key + * will be persistent, otherwise the key will be dropped whenever the btree + * node is rewritten (like DELETED keys). + * + * - ERROR: any read of the data returns a read error, as the data was lost due + * to a failing device. Like DISCARDED keys, they can be removed (overridden) + * by new writes or cluster-wide GC. Node repair can also overwrite them with + * the same or a more recent version number, but not with an older version + * number. + * + * - WHITEOUT: for hash table btrees + */ +#define BCH_BKEY_TYPES() \ + x(deleted, 0) \ + x(whiteout, 1) \ + x(error, 2) \ + x(cookie, 3) \ + x(hash_whiteout, 4) \ + x(btree_ptr, 5) \ + x(extent, 6) \ + x(reservation, 7) \ + x(inode, 8) \ + x(inode_generation, 9) \ + x(dirent, 10) \ + x(xattr, 11) \ + x(alloc, 12) \ + x(quota, 13) \ + x(stripe, 14) \ + x(reflink_p, 15) \ + x(reflink_v, 16) \ + x(inline_data, 17) \ + x(btree_ptr_v2, 18) \ + x(indirect_inline_data, 19) \ + x(alloc_v2, 20) \ + x(subvolume, 21) \ + x(snapshot, 22) \ + x(inode_v2, 23) \ + x(alloc_v3, 24) \ + x(set, 25) \ + x(lru, 26) \ + x(alloc_v4, 27) \ + x(backpointer, 28) \ + x(inode_v3, 29) \ + x(bucket_gens, 30) \ + x(snapshot_tree, 31) \ + x(logged_op_truncate, 32) \ + x(logged_op_finsert, 33) + +enum bch_bkey_type { +#define x(name, nr) KEY_TYPE_##name = nr, + BCH_BKEY_TYPES() +#undef x + KEY_TYPE_MAX, +}; + +struct bch_deleted { + struct bch_val v; +}; + +struct bch_whiteout { + struct bch_val v; +}; + +struct bch_error { + struct bch_val v; +}; + +struct bch_cookie { + struct bch_val v; + __le64 cookie; +}; + +struct bch_hash_whiteout { + struct bch_val v; +}; + +struct bch_set { + struct bch_val v; +}; + +/* Extents */ + +/* + * In extent bkeys, the value is a list of pointers (bch_extent_ptr), optionally + * preceded by checksum/compression information (bch_extent_crc32 or + * bch_extent_crc64). + * + * One major determining factor in the format of extents is how we handle and + * represent extents that have been partially overwritten and thus trimmed: + * + * If an extent is not checksummed or compressed, when the extent is trimmed we + * don't have to remember the extent we originally allocated and wrote: we can + * merely adjust ptr->offset to point to the start of the data that is currently + * live. The size field in struct bkey records the current (live) size of the + * extent, and is also used to mean "size of region on disk that we point to" in + * this case. + * + * Thus an extent that is not checksummed or compressed will consist only of a + * list of bch_extent_ptrs, with none of the fields in + * bch_extent_crc32/bch_extent_crc64. + * + * When an extent is checksummed or compressed, it's not possible to read only + * the data that is currently live: we have to read the entire extent that was + * originally written, and then return only the part of the extent that is + * currently live. + * + * Thus, in addition to the current size of the extent in struct bkey, we need + * to store the size of the originally allocated space - this is the + * compressed_size and uncompressed_size fields in bch_extent_crc32/64. Also, + * when the extent is trimmed, instead of modifying the offset field of the + * pointer, we keep a second smaller offset field - "offset into the original + * extent of the currently live region". + * + * The other major determining factor is replication and data migration: + * + * Each pointer may have its own bch_extent_crc32/64. When doing a replicated + * write, we will initially write all the replicas in the same format, with the + * same checksum type and compression format - however, when copygc runs later (or + * tiering/cache promotion, anything that moves data), it is not in general + * going to rewrite all the pointers at once - one of the replicas may be in a + * bucket on one device that has very little fragmentation while another lives + * in a bucket that has become heavily fragmented, and thus is being rewritten + * sooner than the rest. + * + * Thus it will only move a subset of the pointers (or in the case of + * tiering/cache promotion perhaps add a single pointer without dropping any + * current pointers), and if the extent has been partially overwritten it must + * write only the currently live portion (or copygc would not be able to reduce + * fragmentation!) - which necessitates a different bch_extent_crc format for + * the new pointer. + * + * But in the interests of space efficiency, we don't want to store one + * bch_extent_crc for each pointer if we don't have to. + * + * Thus, a bch_extent consists of bch_extent_crc32s, bch_extent_crc64s, and + * bch_extent_ptrs appended arbitrarily one after the other. We determine the + * type of a given entry with a scheme similar to utf8 (except we're encoding a + * type, not a size), encoding the type in the position of the first set bit: + * + * bch_extent_crc32 - 0b1 + * bch_extent_ptr - 0b10 + * bch_extent_crc64 - 0b100 + * + * We do it this way because bch_extent_crc32 is _very_ constrained on bits (and + * bch_extent_crc64 is the least constrained). + * + * Then, each bch_extent_crc32/64 applies to the pointers that follow after it, + * until the next bch_extent_crc32/64. + * + * If there are no bch_extent_crcs preceding a bch_extent_ptr, then that pointer + * is neither checksummed nor compressed. + */ + +/* 128 bits, sufficient for cryptographic MACs: */ +struct bch_csum { + __le64 lo; + __le64 hi; +} __packed __aligned(8); + +#define BCH_EXTENT_ENTRY_TYPES() \ + x(ptr, 0) \ + x(crc32, 1) \ + x(crc64, 2) \ + x(crc128, 3) \ + x(stripe_ptr, 4) \ + x(rebalance, 5) +#define BCH_EXTENT_ENTRY_MAX 6 + +enum bch_extent_entry_type { +#define x(f, n) BCH_EXTENT_ENTRY_##f = n, + BCH_EXTENT_ENTRY_TYPES() +#undef x +}; + +/* Compressed/uncompressed size are stored biased by 1: */ +struct bch_extent_crc32 { +#if defined(__LITTLE_ENDIAN_BITFIELD) + __u32 type:2, + _compressed_size:7, + _uncompressed_size:7, + offset:7, + _unused:1, + csum_type:4, + compression_type:4; + __u32 csum; +#elif defined (__BIG_ENDIAN_BITFIELD) + __u32 csum; + __u32 compression_type:4, + csum_type:4, + _unused:1, + offset:7, + _uncompressed_size:7, + _compressed_size:7, + type:2; +#endif +} __packed __aligned(8); + +#define CRC32_SIZE_MAX (1U << 7) +#define CRC32_NONCE_MAX 0 + +struct bch_extent_crc64 { +#if defined(__LITTLE_ENDIAN_BITFIELD) + __u64 type:3, + _compressed_size:9, + _uncompressed_size:9, + offset:9, + nonce:10, + csum_type:4, + compression_type:4, + csum_hi:16; +#elif defined (__BIG_ENDIAN_BITFIELD) + __u64 csum_hi:16, + compression_type:4, + csum_type:4, + nonce:10, + offset:9, + _uncompressed_size:9, + _compressed_size:9, + type:3; +#endif + __u64 csum_lo; +} __packed __aligned(8); + +#define CRC64_SIZE_MAX (1U << 9) +#define CRC64_NONCE_MAX ((1U << 10) - 1) + +struct bch_extent_crc128 { +#if defined(__LITTLE_ENDIAN_BITFIELD) + __u64 type:4, + _compressed_size:13, + _uncompressed_size:13, + offset:13, + nonce:13, + csum_type:4, + compression_type:4; +#elif defined (__BIG_ENDIAN_BITFIELD) + __u64 compression_type:4, + csum_type:4, + nonce:13, + offset:13, + _uncompressed_size:13, + _compressed_size:13, + type:4; +#endif + struct bch_csum csum; +} __packed __aligned(8); + +#define CRC128_SIZE_MAX (1U << 13) +#define CRC128_NONCE_MAX ((1U << 13) - 1) + +/* + * @reservation - pointer hasn't been written to, just reserved + */ +struct bch_extent_ptr { +#if defined(__LITTLE_ENDIAN_BITFIELD) + __u64 type:1, + cached:1, + unused:1, + unwritten:1, + offset:44, /* 8 petabytes */ + dev:8, + gen:8; +#elif defined (__BIG_ENDIAN_BITFIELD) + __u64 gen:8, + dev:8, + offset:44, + unwritten:1, + unused:1, + cached:1, + type:1; +#endif +} __packed __aligned(8); + +struct bch_extent_stripe_ptr { +#if defined(__LITTLE_ENDIAN_BITFIELD) + __u64 type:5, + block:8, + redundancy:4, + idx:47; +#elif defined (__BIG_ENDIAN_BITFIELD) + __u64 idx:47, + redundancy:4, + block:8, + type:5; +#endif +}; + +struct bch_extent_rebalance { +#if defined(__LITTLE_ENDIAN_BITFIELD) + __u64 type:6, + unused:34, + compression:8, /* enum bch_compression_opt */ + target:16; +#elif defined (__BIG_ENDIAN_BITFIELD) + __u64 target:16, + compression:8, + unused:34, + type:6; +#endif +}; + +union bch_extent_entry { +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ || __BITS_PER_LONG == 64 + unsigned long type; +#elif __BITS_PER_LONG == 32 + struct { + unsigned long pad; + unsigned long type; + }; +#else +#error edit for your odd byteorder. +#endif + +#define x(f, n) struct bch_extent_##f f; + BCH_EXTENT_ENTRY_TYPES() +#undef x +}; + +struct bch_btree_ptr { + struct bch_val v; + + __u64 _data[0]; + struct bch_extent_ptr start[]; +} __packed __aligned(8); + +struct bch_btree_ptr_v2 { + struct bch_val v; + + __u64 mem_ptr; + __le64 seq; + __le16 sectors_written; + __le16 flags; + struct bpos min_key; + __u64 _data[0]; + struct bch_extent_ptr start[]; +} __packed __aligned(8); + +LE16_BITMASK(BTREE_PTR_RANGE_UPDATED, struct bch_btree_ptr_v2, flags, 0, 1); + +struct bch_extent { + struct bch_val v; + + __u64 _data[0]; + union bch_extent_entry start[]; +} __packed __aligned(8); + +struct bch_reservation { + struct bch_val v; + + __le32 generation; + __u8 nr_replicas; + __u8 pad[3]; +} __packed __aligned(8); + +/* Maximum size (in u64s) a single pointer could be: */ +#define BKEY_EXTENT_PTR_U64s_MAX\ + ((sizeof(struct bch_extent_crc128) + \ + sizeof(struct bch_extent_ptr)) / sizeof(__u64)) + +/* Maximum possible size of an entire extent value: */ +#define BKEY_EXTENT_VAL_U64s_MAX \ + (1 + BKEY_EXTENT_PTR_U64s_MAX * (BCH_REPLICAS_MAX + 1)) + +/* * Maximum possible size of an entire extent, key + value: */ +#define BKEY_EXTENT_U64s_MAX (BKEY_U64s + BKEY_EXTENT_VAL_U64s_MAX) + +/* Btree pointers don't carry around checksums: */ +#define BKEY_BTREE_PTR_VAL_U64s_MAX \ + ((sizeof(struct bch_btree_ptr_v2) + \ + sizeof(struct bch_extent_ptr) * BCH_REPLICAS_MAX) / sizeof(__u64)) +#define BKEY_BTREE_PTR_U64s_MAX \ + (BKEY_U64s + BKEY_BTREE_PTR_VAL_U64s_MAX) + +/* Inodes */ + +#define BLOCKDEV_INODE_MAX 4096 + +#define BCACHEFS_ROOT_INO 4096 + +struct bch_inode { + struct bch_val v; + + __le64 bi_hash_seed; + __le32 bi_flags; + __le16 bi_mode; + __u8 fields[]; +} __packed __aligned(8); + +struct bch_inode_v2 { + struct bch_val v; + + __le64 bi_journal_seq; + __le64 bi_hash_seed; + __le64 bi_flags; + __le16 bi_mode; + __u8 fields[]; +} __packed __aligned(8); + +struct bch_inode_v3 { + struct bch_val v; + + __le64 bi_journal_seq; + __le64 bi_hash_seed; + __le64 bi_flags; + __le64 bi_sectors; + __le64 bi_size; + __le64 bi_version; + __u8 fields[]; +} __packed __aligned(8); + +#define INODEv3_FIELDS_START_INITIAL 6 +#define INODEv3_FIELDS_START_CUR (offsetof(struct bch_inode_v3, fields) / sizeof(__u64)) + +struct bch_inode_generation { + struct bch_val v; + + __le32 bi_generation; + __le32 pad; +} __packed __aligned(8); + +/* + * bi_subvol and bi_parent_subvol are only set for subvolume roots: + */ + +#define BCH_INODE_FIELDS_v2() \ + x(bi_atime, 96) \ + x(bi_ctime, 96) \ + x(bi_mtime, 96) \ + x(bi_otime, 96) \ + x(bi_size, 64) \ + x(bi_sectors, 64) \ + x(bi_uid, 32) \ + x(bi_gid, 32) \ + x(bi_nlink, 32) \ + x(bi_generation, 32) \ + x(bi_dev, 32) \ + x(bi_data_checksum, 8) \ + x(bi_compression, 8) \ + x(bi_project, 32) \ + x(bi_background_compression, 8) \ + x(bi_data_replicas, 8) \ + x(bi_promote_target, 16) \ + x(bi_foreground_target, 16) \ + x(bi_background_target, 16) \ + x(bi_erasure_code, 16) \ + x(bi_fields_set, 16) \ + x(bi_dir, 64) \ + x(bi_dir_offset, 64) \ + x(bi_subvol, 32) \ + x(bi_parent_subvol, 32) + +#define BCH_INODE_FIELDS_v3() \ + x(bi_atime, 96) \ + x(bi_ctime, 96) \ + x(bi_mtime, 96) \ + x(bi_otime, 96) \ + x(bi_uid, 32) \ + x(bi_gid, 32) \ + x(bi_nlink, 32) \ + x(bi_generation, 32) \ + x(bi_dev, 32) \ + x(bi_data_checksum, 8) \ + x(bi_compression, 8) \ + x(bi_project, 32) \ + x(bi_background_compression, 8) \ + x(bi_data_replicas, 8) \ + x(bi_promote_target, 16) \ + x(bi_foreground_target, 16) \ + x(bi_background_target, 16) \ + x(bi_erasure_code, 16) \ + x(bi_fields_set, 16) \ + x(bi_dir, 64) \ + x(bi_dir_offset, 64) \ + x(bi_subvol, 32) \ + x(bi_parent_subvol, 32) \ + x(bi_nocow, 8) + +/* subset of BCH_INODE_FIELDS */ +#define BCH_INODE_OPTS() \ + x(data_checksum, 8) \ + x(compression, 8) \ + x(project, 32) \ + x(background_compression, 8) \ + x(data_replicas, 8) \ + x(promote_target, 16) \ + x(foreground_target, 16) \ + x(background_target, 16) \ + x(erasure_code, 16) \ + x(nocow, 8) + +enum inode_opt_id { +#define x(name, ...) \ + Inode_opt_##name, + BCH_INODE_OPTS() +#undef x + Inode_opt_nr, +}; + +#define BCH_INODE_FLAGS() \ + x(sync, 0) \ + x(immutable, 1) \ + x(append, 2) \ + x(nodump, 3) \ + x(noatime, 4) \ + x(i_size_dirty, 5) \ + x(i_sectors_dirty, 6) \ + x(unlinked, 7) \ + x(backptr_untrusted, 8) + +/* bits 20+ reserved for packed fields below: */ + +enum bch_inode_flags { +#define x(t, n) BCH_INODE_##t = 1U << n, + BCH_INODE_FLAGS() +#undef x +}; + +enum __bch_inode_flags { +#define x(t, n) __BCH_INODE_##t = n, + BCH_INODE_FLAGS() +#undef x +}; + +LE32_BITMASK(INODE_STR_HASH, struct bch_inode, bi_flags, 20, 24); +LE32_BITMASK(INODE_NR_FIELDS, struct bch_inode, bi_flags, 24, 31); +LE32_BITMASK(INODE_NEW_VARINT, struct bch_inode, bi_flags, 31, 32); + +LE64_BITMASK(INODEv2_STR_HASH, struct bch_inode_v2, bi_flags, 20, 24); +LE64_BITMASK(INODEv2_NR_FIELDS, struct bch_inode_v2, bi_flags, 24, 31); + +LE64_BITMASK(INODEv3_STR_HASH, struct bch_inode_v3, bi_flags, 20, 24); +LE64_BITMASK(INODEv3_NR_FIELDS, struct bch_inode_v3, bi_flags, 24, 31); + +LE64_BITMASK(INODEv3_FIELDS_START, + struct bch_inode_v3, bi_flags, 31, 36); +LE64_BITMASK(INODEv3_MODE, struct bch_inode_v3, bi_flags, 36, 52); + +/* Dirents */ + +/* + * Dirents (and xattrs) have to implement string lookups; since our b-tree + * doesn't support arbitrary length strings for the key, we instead index by a + * 64 bit hash (currently truncated sha1) of the string, stored in the offset + * field of the key - using linear probing to resolve hash collisions. This also + * provides us with the readdir cookie posix requires. + * + * Linear probing requires us to use whiteouts for deletions, in the event of a + * collision: + */ + +struct bch_dirent { + struct bch_val v; + + /* Target inode number: */ + union { + __le64 d_inum; + struct { /* DT_SUBVOL */ + __le32 d_child_subvol; + __le32 d_parent_subvol; + }; + }; + + /* + * Copy of mode bits 12-15 from the target inode - so userspace can get + * the filetype without having to do a stat() + */ + __u8 d_type; + + __u8 d_name[]; +} __packed __aligned(8); + +#define DT_SUBVOL 16 +#define BCH_DT_MAX 17 + +#define BCH_NAME_MAX 512 + +/* Xattrs */ + +#define KEY_TYPE_XATTR_INDEX_USER 0 +#define KEY_TYPE_XATTR_INDEX_POSIX_ACL_ACCESS 1 +#define KEY_TYPE_XATTR_INDEX_POSIX_ACL_DEFAULT 2 +#define KEY_TYPE_XATTR_INDEX_TRUSTED 3 +#define KEY_TYPE_XATTR_INDEX_SECURITY 4 + +struct bch_xattr { + struct bch_val v; + __u8 x_type; + __u8 x_name_len; + __le16 x_val_len; + __u8 x_name[]; +} __packed __aligned(8); + +/* Bucket/allocation information: */ + +struct bch_alloc { + struct bch_val v; + __u8 fields; + __u8 gen; + __u8 data[]; +} __packed __aligned(8); + +#define BCH_ALLOC_FIELDS_V1() \ + x(read_time, 16) \ + x(write_time, 16) \ + x(data_type, 8) \ + x(dirty_sectors, 16) \ + x(cached_sectors, 16) \ + x(oldest_gen, 8) \ + x(stripe, 32) \ + x(stripe_redundancy, 8) + +enum { +#define x(name, _bits) BCH_ALLOC_FIELD_V1_##name, + BCH_ALLOC_FIELDS_V1() +#undef x +}; + +struct bch_alloc_v2 { + struct bch_val v; + __u8 nr_fields; + __u8 gen; + __u8 oldest_gen; + __u8 data_type; + __u8 data[]; +} __packed __aligned(8); + +#define BCH_ALLOC_FIELDS_V2() \ + x(read_time, 64) \ + x(write_time, 64) \ + x(dirty_sectors, 32) \ + x(cached_sectors, 32) \ + x(stripe, 32) \ + x(stripe_redundancy, 8) + +struct bch_alloc_v3 { + struct bch_val v; + __le64 journal_seq; + __le32 flags; + __u8 nr_fields; + __u8 gen; + __u8 oldest_gen; + __u8 data_type; + __u8 data[]; +} __packed __aligned(8); + +LE32_BITMASK(BCH_ALLOC_V3_NEED_DISCARD,struct bch_alloc_v3, flags, 0, 1) +LE32_BITMASK(BCH_ALLOC_V3_NEED_INC_GEN,struct bch_alloc_v3, flags, 1, 2) + +struct bch_alloc_v4 { + struct bch_val v; + __u64 journal_seq; + __u32 flags; + __u8 gen; + __u8 oldest_gen; + __u8 data_type; + __u8 stripe_redundancy; + __u32 dirty_sectors; + __u32 cached_sectors; + __u64 io_time[2]; + __u32 stripe; + __u32 nr_external_backpointers; + __u64 fragmentation_lru; +} __packed __aligned(8); + +#define BCH_ALLOC_V4_U64s_V0 6 +#define BCH_ALLOC_V4_U64s (sizeof(struct bch_alloc_v4) / sizeof(__u64)) + +BITMASK(BCH_ALLOC_V4_NEED_DISCARD, struct bch_alloc_v4, flags, 0, 1) +BITMASK(BCH_ALLOC_V4_NEED_INC_GEN, struct bch_alloc_v4, flags, 1, 2) +BITMASK(BCH_ALLOC_V4_BACKPOINTERS_START,struct bch_alloc_v4, flags, 2, 8) +BITMASK(BCH_ALLOC_V4_NR_BACKPOINTERS, struct bch_alloc_v4, flags, 8, 14) + +#define BCH_ALLOC_V4_NR_BACKPOINTERS_MAX 40 + +struct bch_backpointer { + struct bch_val v; + __u8 btree_id; + __u8 level; + __u8 data_type; + __u64 bucket_offset:40; + __u32 bucket_len; + struct bpos pos; +} __packed __aligned(8); + +#define KEY_TYPE_BUCKET_GENS_BITS 8 +#define KEY_TYPE_BUCKET_GENS_NR (1U << KEY_TYPE_BUCKET_GENS_BITS) +#define KEY_TYPE_BUCKET_GENS_MASK (KEY_TYPE_BUCKET_GENS_NR - 1) + +struct bch_bucket_gens { + struct bch_val v; + u8 gens[KEY_TYPE_BUCKET_GENS_NR]; +} __packed __aligned(8); + +/* Quotas: */ + +enum quota_types { + QTYP_USR = 0, + QTYP_GRP = 1, + QTYP_PRJ = 2, + QTYP_NR = 3, +}; + +enum quota_counters { + Q_SPC = 0, + Q_INO = 1, + Q_COUNTERS = 2, +}; + +struct bch_quota_counter { + __le64 hardlimit; + __le64 softlimit; +}; + +struct bch_quota { + struct bch_val v; + struct bch_quota_counter c[Q_COUNTERS]; +} __packed __aligned(8); + +/* Erasure coding */ + +struct bch_stripe { + struct bch_val v; + __le16 sectors; + __u8 algorithm; + __u8 nr_blocks; + __u8 nr_redundant; + + __u8 csum_granularity_bits; + __u8 csum_type; + __u8 pad; + + struct bch_extent_ptr ptrs[]; +} __packed __aligned(8); + +/* Reflink: */ + +struct bch_reflink_p { + struct bch_val v; + __le64 idx; + /* + * A reflink pointer might point to an indirect extent which is then + * later split (by copygc or rebalance). If we only pointed to part of + * the original indirect extent, and then one of the fragments is + * outside the range we point to, we'd leak a refcount: so when creating + * reflink pointers, we need to store pad values to remember the full + * range we were taking a reference on. + */ + __le32 front_pad; + __le32 back_pad; +} __packed __aligned(8); + +struct bch_reflink_v { + struct bch_val v; + __le64 refcount; + union bch_extent_entry start[0]; + __u64 _data[]; +} __packed __aligned(8); + +struct bch_indirect_inline_data { + struct bch_val v; + __le64 refcount; + u8 data[]; +}; + +/* Inline data */ + +struct bch_inline_data { + struct bch_val v; + u8 data[]; +}; + +/* Subvolumes: */ + +#define SUBVOL_POS_MIN POS(0, 1) +#define SUBVOL_POS_MAX POS(0, S32_MAX) +#define BCACHEFS_ROOT_SUBVOL 1 + +struct bch_subvolume { + struct bch_val v; + __le32 flags; + __le32 snapshot; + __le64 inode; + /* + * Snapshot subvolumes form a tree, separate from the snapshot nodes + * tree - if this subvolume is a snapshot, this is the ID of the + * subvolume it was created from: + */ + __le32 parent; + __le32 pad; + bch_le128 otime; +}; + +LE32_BITMASK(BCH_SUBVOLUME_RO, struct bch_subvolume, flags, 0, 1) +/* + * We need to know whether a subvolume is a snapshot so we can know whether we + * can delete it (or whether it should just be rm -rf'd) + */ +LE32_BITMASK(BCH_SUBVOLUME_SNAP, struct bch_subvolume, flags, 1, 2) +LE32_BITMASK(BCH_SUBVOLUME_UNLINKED, struct bch_subvolume, flags, 2, 3) + +/* Snapshots */ + +struct bch_snapshot { + struct bch_val v; + __le32 flags; + __le32 parent; + __le32 children[2]; + __le32 subvol; + /* corresponds to a bch_snapshot_tree in BTREE_ID_snapshot_trees */ + __le32 tree; + __le32 depth; + __le32 skip[3]; +}; + +LE32_BITMASK(BCH_SNAPSHOT_DELETED, struct bch_snapshot, flags, 0, 1) + +/* True if a subvolume points to this snapshot node: */ +LE32_BITMASK(BCH_SNAPSHOT_SUBVOL, struct bch_snapshot, flags, 1, 2) + +/* + * Snapshot trees: + * + * The snapshot_trees btree gives us persistent indentifier for each tree of + * bch_snapshot nodes, and allow us to record and easily find the root/master + * subvolume that other snapshots were created from: + */ +struct bch_snapshot_tree { + struct bch_val v; + __le32 master_subvol; + __le32 root_snapshot; +}; + +/* LRU btree: */ + +struct bch_lru { + struct bch_val v; + __le64 idx; +} __packed __aligned(8); + +#define LRU_ID_STRIPES (1U << 16) + +/* Logged operations btree: */ + +struct bch_logged_op_truncate { + struct bch_val v; + __le32 subvol; + __le32 pad; + __le64 inum; + __le64 new_i_size; +}; + +enum logged_op_finsert_state { + LOGGED_OP_FINSERT_start, + LOGGED_OP_FINSERT_shift_extents, + LOGGED_OP_FINSERT_finish, +}; + +struct bch_logged_op_finsert { + struct bch_val v; + __u8 state; + __u8 pad[3]; + __le32 subvol; + __le64 inum; + __le64 dst_offset; + __le64 src_offset; + __le64 pos; +}; + +/* Optional/variable size superblock sections: */ + +struct bch_sb_field { + __u64 _data[0]; + __le32 u64s; + __le32 type; +}; + +#define BCH_SB_FIELDS() \ + x(journal, 0) \ + x(members_v1, 1) \ + x(crypt, 2) \ + x(replicas_v0, 3) \ + x(quota, 4) \ + x(disk_groups, 5) \ + x(clean, 6) \ + x(replicas, 7) \ + x(journal_seq_blacklist, 8) \ + x(journal_v2, 9) \ + x(counters, 10) \ + x(members_v2, 11) \ + x(errors, 12) + +enum bch_sb_field_type { +#define x(f, nr) BCH_SB_FIELD_##f = nr, + BCH_SB_FIELDS() +#undef x + BCH_SB_FIELD_NR +}; + +/* + * Most superblock fields are replicated in all device's superblocks - a few are + * not: + */ +#define BCH_SINGLE_DEVICE_SB_FIELDS \ + ((1U << BCH_SB_FIELD_journal)| \ + (1U << BCH_SB_FIELD_journal_v2)) + +/* BCH_SB_FIELD_journal: */ + +struct bch_sb_field_journal { + struct bch_sb_field field; + __le64 buckets[]; +}; + +struct bch_sb_field_journal_v2 { + struct bch_sb_field field; + + struct bch_sb_field_journal_v2_entry { + __le64 start; + __le64 nr; + } d[]; +}; + +/* BCH_SB_FIELD_members_v1: */ + +#define BCH_MIN_NR_NBUCKETS (1 << 6) + +#define BCH_IOPS_MEASUREMENTS() \ + x(seqread, 0) \ + x(seqwrite, 1) \ + x(randread, 2) \ + x(randwrite, 3) + +enum bch_iops_measurement { +#define x(t, n) BCH_IOPS_##t = n, + BCH_IOPS_MEASUREMENTS() +#undef x + BCH_IOPS_NR +}; + +#define BCH_MEMBER_ERROR_TYPES() \ + x(read, 0) \ + x(write, 1) \ + x(checksum, 2) + +enum bch_member_error_type { +#define x(t, n) BCH_MEMBER_ERROR_##t = n, + BCH_MEMBER_ERROR_TYPES() +#undef x + BCH_MEMBER_ERROR_NR +}; + +struct bch_member { + __uuid_t uuid; + __le64 nbuckets; /* device size */ + __le16 first_bucket; /* index of first bucket used */ + __le16 bucket_size; /* sectors */ + __le32 pad; + __le64 last_mount; /* time_t */ + + __le64 flags; + __le32 iops[4]; + __le64 errors[BCH_MEMBER_ERROR_NR]; + __le64 errors_at_reset[BCH_MEMBER_ERROR_NR]; + __le64 errors_reset_time; +}; + +#define BCH_MEMBER_V1_BYTES 56 + +LE64_BITMASK(BCH_MEMBER_STATE, struct bch_member, flags, 0, 4) +/* 4-14 unused, was TIER, HAS_(META)DATA, REPLACEMENT */ +LE64_BITMASK(BCH_MEMBER_DISCARD, struct bch_member, flags, 14, 15) +LE64_BITMASK(BCH_MEMBER_DATA_ALLOWED, struct bch_member, flags, 15, 20) +LE64_BITMASK(BCH_MEMBER_GROUP, struct bch_member, flags, 20, 28) +LE64_BITMASK(BCH_MEMBER_DURABILITY, struct bch_member, flags, 28, 30) +LE64_BITMASK(BCH_MEMBER_FREESPACE_INITIALIZED, + struct bch_member, flags, 30, 31) + +#if 0 +LE64_BITMASK(BCH_MEMBER_NR_READ_ERRORS, struct bch_member, flags[1], 0, 20); +LE64_BITMASK(BCH_MEMBER_NR_WRITE_ERRORS,struct bch_member, flags[1], 20, 40); +#endif + +#define BCH_MEMBER_STATES() \ + x(rw, 0) \ + x(ro, 1) \ + x(failed, 2) \ + x(spare, 3) + +enum bch_member_state { +#define x(t, n) BCH_MEMBER_STATE_##t = n, + BCH_MEMBER_STATES() +#undef x + BCH_MEMBER_STATE_NR +}; + +struct bch_sb_field_members_v1 { + struct bch_sb_field field; + struct bch_member _members[]; //Members are now variable size +}; + +struct bch_sb_field_members_v2 { + struct bch_sb_field field; + __le16 member_bytes; //size of single member entry + u8 pad[6]; + struct bch_member _members[]; +}; + +/* BCH_SB_FIELD_crypt: */ + +struct nonce { + __le32 d[4]; +}; + +struct bch_key { + __le64 key[4]; +}; + +#define BCH_KEY_MAGIC \ + (((__u64) 'b' << 0)|((__u64) 'c' << 8)| \ + ((__u64) 'h' << 16)|((__u64) '*' << 24)| \ + ((__u64) '*' << 32)|((__u64) 'k' << 40)| \ + ((__u64) 'e' << 48)|((__u64) 'y' << 56)) + +struct bch_encrypted_key { + __le64 magic; + struct bch_key key; +}; + +/* + * If this field is present in the superblock, it stores an encryption key which + * is used encrypt all other data/metadata. The key will normally be encrypted + * with the key userspace provides, but if encryption has been turned off we'll + * just store the master key unencrypted in the superblock so we can access the + * previously encrypted data. + */ +struct bch_sb_field_crypt { + struct bch_sb_field field; + + __le64 flags; + __le64 kdf_flags; + struct bch_encrypted_key key; +}; + +LE64_BITMASK(BCH_CRYPT_KDF_TYPE, struct bch_sb_field_crypt, flags, 0, 4); + +enum bch_kdf_types { + BCH_KDF_SCRYPT = 0, + BCH_KDF_NR = 1, +}; + +/* stored as base 2 log of scrypt params: */ +LE64_BITMASK(BCH_KDF_SCRYPT_N, struct bch_sb_field_crypt, kdf_flags, 0, 16); +LE64_BITMASK(BCH_KDF_SCRYPT_R, struct bch_sb_field_crypt, kdf_flags, 16, 32); +LE64_BITMASK(BCH_KDF_SCRYPT_P, struct bch_sb_field_crypt, kdf_flags, 32, 48); + +/* BCH_SB_FIELD_replicas: */ + +#define BCH_DATA_TYPES() \ + x(free, 0) \ + x(sb, 1) \ + x(journal, 2) \ + x(btree, 3) \ + x(user, 4) \ + x(cached, 5) \ + x(parity, 6) \ + x(stripe, 7) \ + x(need_gc_gens, 8) \ + x(need_discard, 9) + +enum bch_data_type { +#define x(t, n) BCH_DATA_##t, + BCH_DATA_TYPES() +#undef x + BCH_DATA_NR +}; + +static inline bool data_type_is_empty(enum bch_data_type type) +{ + switch (type) { + case BCH_DATA_free: + case BCH_DATA_need_gc_gens: + case BCH_DATA_need_discard: + return true; + default: + return false; + } +} + +static inline bool data_type_is_hidden(enum bch_data_type type) +{ + switch (type) { + case BCH_DATA_sb: + case BCH_DATA_journal: + return true; + default: + return false; + } +} + +struct bch_replicas_entry_v0 { + __u8 data_type; + __u8 nr_devs; + __u8 devs[]; +} __packed; + +struct bch_sb_field_replicas_v0 { + struct bch_sb_field field; + struct bch_replicas_entry_v0 entries[]; +} __packed __aligned(8); + +struct bch_replicas_entry { + __u8 data_type; + __u8 nr_devs; + __u8 nr_required; + __u8 devs[]; +} __packed; + +#define replicas_entry_bytes(_i) \ + (offsetof(typeof(*(_i)), devs) + (_i)->nr_devs) + +struct bch_sb_field_replicas { + struct bch_sb_field field; + struct bch_replicas_entry entries[]; +} __packed __aligned(8); + +/* BCH_SB_FIELD_quota: */ + +struct bch_sb_quota_counter { + __le32 timelimit; + __le32 warnlimit; +}; + +struct bch_sb_quota_type { + __le64 flags; + struct bch_sb_quota_counter c[Q_COUNTERS]; +}; + +struct bch_sb_field_quota { + struct bch_sb_field field; + struct bch_sb_quota_type q[QTYP_NR]; +} __packed __aligned(8); + +/* BCH_SB_FIELD_disk_groups: */ + +#define BCH_SB_LABEL_SIZE 32 + +struct bch_disk_group { + __u8 label[BCH_SB_LABEL_SIZE]; + __le64 flags[2]; +} __packed __aligned(8); + +LE64_BITMASK(BCH_GROUP_DELETED, struct bch_disk_group, flags[0], 0, 1) +LE64_BITMASK(BCH_GROUP_DATA_ALLOWED, struct bch_disk_group, flags[0], 1, 6) +LE64_BITMASK(BCH_GROUP_PARENT, struct bch_disk_group, flags[0], 6, 24) + +struct bch_sb_field_disk_groups { + struct bch_sb_field field; + struct bch_disk_group entries[]; +} __packed __aligned(8); + +/* BCH_SB_FIELD_counters */ + +#define BCH_PERSISTENT_COUNTERS() \ + x(io_read, 0) \ + x(io_write, 1) \ + x(io_move, 2) \ + x(bucket_invalidate, 3) \ + x(bucket_discard, 4) \ + x(bucket_alloc, 5) \ + x(bucket_alloc_fail, 6) \ + x(btree_cache_scan, 7) \ + x(btree_cache_reap, 8) \ + x(btree_cache_cannibalize, 9) \ + x(btree_cache_cannibalize_lock, 10) \ + x(btree_cache_cannibalize_lock_fail, 11) \ + x(btree_cache_cannibalize_unlock, 12) \ + x(btree_node_write, 13) \ + x(btree_node_read, 14) \ + x(btree_node_compact, 15) \ + x(btree_node_merge, 16) \ + x(btree_node_split, 17) \ + x(btree_node_rewrite, 18) \ + x(btree_node_alloc, 19) \ + x(btree_node_free, 20) \ + x(btree_node_set_root, 21) \ + x(btree_path_relock_fail, 22) \ + x(btree_path_upgrade_fail, 23) \ + x(btree_reserve_get_fail, 24) \ + x(journal_entry_full, 25) \ + x(journal_full, 26) \ + x(journal_reclaim_finish, 27) \ + x(journal_reclaim_start, 28) \ + x(journal_write, 29) \ + x(read_promote, 30) \ + x(read_bounce, 31) \ + x(read_split, 33) \ + x(read_retry, 32) \ + x(read_reuse_race, 34) \ + x(move_extent_read, 35) \ + x(move_extent_write, 36) \ + x(move_extent_finish, 37) \ + x(move_extent_fail, 38) \ + x(move_extent_start_fail, 39) \ + x(copygc, 40) \ + x(copygc_wait, 41) \ + x(gc_gens_end, 42) \ + x(gc_gens_start, 43) \ + x(trans_blocked_journal_reclaim, 44) \ + x(trans_restart_btree_node_reused, 45) \ + x(trans_restart_btree_node_split, 46) \ + x(trans_restart_fault_inject, 47) \ + x(trans_restart_iter_upgrade, 48) \ + x(trans_restart_journal_preres_get, 49) \ + x(trans_restart_journal_reclaim, 50) \ + x(trans_restart_journal_res_get, 51) \ + x(trans_restart_key_cache_key_realloced, 52) \ + x(trans_restart_key_cache_raced, 53) \ + x(trans_restart_mark_replicas, 54) \ + x(trans_restart_mem_realloced, 55) \ + x(trans_restart_memory_allocation_failure, 56) \ + x(trans_restart_relock, 57) \ + x(trans_restart_relock_after_fill, 58) \ + x(trans_restart_relock_key_cache_fill, 59) \ + x(trans_restart_relock_next_node, 60) \ + x(trans_restart_relock_parent_for_fill, 61) \ + x(trans_restart_relock_path, 62) \ + x(trans_restart_relock_path_intent, 63) \ + x(trans_restart_too_many_iters, 64) \ + x(trans_restart_traverse, 65) \ + x(trans_restart_upgrade, 66) \ + x(trans_restart_would_deadlock, 67) \ + x(trans_restart_would_deadlock_write, 68) \ + x(trans_restart_injected, 69) \ + x(trans_restart_key_cache_upgrade, 70) \ + x(trans_traverse_all, 71) \ + x(transaction_commit, 72) \ + x(write_super, 73) \ + x(trans_restart_would_deadlock_recursion_limit, 74) \ + x(trans_restart_write_buffer_flush, 75) \ + x(trans_restart_split_race, 76) + +enum bch_persistent_counters { +#define x(t, n, ...) BCH_COUNTER_##t, + BCH_PERSISTENT_COUNTERS() +#undef x + BCH_COUNTER_NR +}; + +struct bch_sb_field_counters { + struct bch_sb_field field; + __le64 d[]; +}; + +/* + * On clean shutdown, store btree roots and current journal sequence number in + * the superblock: + */ +struct jset_entry { + __le16 u64s; + __u8 btree_id; + __u8 level; + __u8 type; /* designates what this jset holds */ + __u8 pad[3]; + + struct bkey_i start[0]; + __u64 _data[]; +}; + +struct bch_sb_field_clean { + struct bch_sb_field field; + + __le32 flags; + __le16 _read_clock; /* no longer used */ + __le16 _write_clock; + __le64 journal_seq; + + struct jset_entry start[0]; + __u64 _data[]; +}; + +struct journal_seq_blacklist_entry { + __le64 start; + __le64 end; +}; + +struct bch_sb_field_journal_seq_blacklist { + struct bch_sb_field field; + struct journal_seq_blacklist_entry start[]; +}; + +struct bch_sb_field_errors { + struct bch_sb_field field; + struct bch_sb_field_error_entry { + __le64 v; + __le64 last_error_time; + } entries[]; +}; + +LE64_BITMASK(BCH_SB_ERROR_ENTRY_ID, struct bch_sb_field_error_entry, v, 0, 16); +LE64_BITMASK(BCH_SB_ERROR_ENTRY_NR, struct bch_sb_field_error_entry, v, 16, 64); + +/* Superblock: */ + +/* + * New versioning scheme: + * One common version number for all on disk data structures - superblock, btree + * nodes, journal entries + */ +#define BCH_VERSION_MAJOR(_v) ((__u16) ((_v) >> 10)) +#define BCH_VERSION_MINOR(_v) ((__u16) ((_v) & ~(~0U << 10))) +#define BCH_VERSION(_major, _minor) (((_major) << 10)|(_minor) << 0) + +#define RECOVERY_PASS_ALL_FSCK (1ULL << 63) + +#define BCH_METADATA_VERSIONS() \ + x(bkey_renumber, BCH_VERSION(0, 10), \ + RECOVERY_PASS_ALL_FSCK) \ + x(inode_btree_change, BCH_VERSION(0, 11), \ + RECOVERY_PASS_ALL_FSCK) \ + x(snapshot, BCH_VERSION(0, 12), \ + RECOVERY_PASS_ALL_FSCK) \ + x(inode_backpointers, BCH_VERSION(0, 13), \ + RECOVERY_PASS_ALL_FSCK) \ + x(btree_ptr_sectors_written, BCH_VERSION(0, 14), \ + RECOVERY_PASS_ALL_FSCK) \ + x(snapshot_2, BCH_VERSION(0, 15), \ + BIT_ULL(BCH_RECOVERY_PASS_fs_upgrade_for_subvolumes)| \ + BIT_ULL(BCH_RECOVERY_PASS_initialize_subvolumes)| \ + RECOVERY_PASS_ALL_FSCK) \ + x(reflink_p_fix, BCH_VERSION(0, 16), \ + BIT_ULL(BCH_RECOVERY_PASS_fix_reflink_p)) \ + x(subvol_dirent, BCH_VERSION(0, 17), \ + RECOVERY_PASS_ALL_FSCK) \ + x(inode_v2, BCH_VERSION(0, 18), \ + RECOVERY_PASS_ALL_FSCK) \ + x(freespace, BCH_VERSION(0, 19), \ + RECOVERY_PASS_ALL_FSCK) \ + x(alloc_v4, BCH_VERSION(0, 20), \ + RECOVERY_PASS_ALL_FSCK) \ + x(new_data_types, BCH_VERSION(0, 21), \ + RECOVERY_PASS_ALL_FSCK) \ + x(backpointers, BCH_VERSION(0, 22), \ + RECOVERY_PASS_ALL_FSCK) \ + x(inode_v3, BCH_VERSION(0, 23), \ + RECOVERY_PASS_ALL_FSCK) \ + x(unwritten_extents, BCH_VERSION(0, 24), \ + RECOVERY_PASS_ALL_FSCK) \ + x(bucket_gens, BCH_VERSION(0, 25), \ + BIT_ULL(BCH_RECOVERY_PASS_bucket_gens_init)| \ + RECOVERY_PASS_ALL_FSCK) \ + x(lru_v2, BCH_VERSION(0, 26), \ + RECOVERY_PASS_ALL_FSCK) \ + x(fragmentation_lru, BCH_VERSION(0, 27), \ + RECOVERY_PASS_ALL_FSCK) \ + x(no_bps_in_alloc_keys, BCH_VERSION(0, 28), \ + RECOVERY_PASS_ALL_FSCK) \ + x(snapshot_trees, BCH_VERSION(0, 29), \ + RECOVERY_PASS_ALL_FSCK) \ + x(major_minor, BCH_VERSION(1, 0), \ + 0) \ + x(snapshot_skiplists, BCH_VERSION(1, 1), \ + BIT_ULL(BCH_RECOVERY_PASS_check_snapshots)) \ + x(deleted_inodes, BCH_VERSION(1, 2), \ + BIT_ULL(BCH_RECOVERY_PASS_check_inodes)) \ + x(rebalance_work, BCH_VERSION(1, 3), \ + BIT_ULL(BCH_RECOVERY_PASS_set_fs_needs_rebalance)) + +enum bcachefs_metadata_version { + bcachefs_metadata_version_min = 9, +#define x(t, n, upgrade_passes) bcachefs_metadata_version_##t = n, + BCH_METADATA_VERSIONS() +#undef x + bcachefs_metadata_version_max +}; + +static const __maybe_unused +unsigned bcachefs_metadata_required_upgrade_below = bcachefs_metadata_version_rebalance_work; + +#define bcachefs_metadata_version_current (bcachefs_metadata_version_max - 1) + +#define BCH_SB_SECTOR 8 +#define BCH_SB_MEMBERS_MAX 64 /* XXX kill */ + +struct bch_sb_layout { + __uuid_t magic; /* bcachefs superblock UUID */ + __u8 layout_type; + __u8 sb_max_size_bits; /* base 2 of 512 byte sectors */ + __u8 nr_superblocks; + __u8 pad[5]; + __le64 sb_offset[61]; +} __packed __aligned(8); + +#define BCH_SB_LAYOUT_SECTOR 7 + +/* + * @offset - sector where this sb was written + * @version - on disk format version + * @version_min - Oldest metadata version this filesystem contains; so we can + * safely drop compatibility code and refuse to mount filesystems + * we'd need it for + * @magic - identifies as a bcachefs superblock (BCHFS_MAGIC) + * @seq - incremented each time superblock is written + * @uuid - used for generating various magic numbers and identifying + * member devices, never changes + * @user_uuid - user visible UUID, may be changed + * @label - filesystem label + * @seq - identifies most recent superblock, incremented each time + * superblock is written + * @features - enabled incompatible features + */ +struct bch_sb { + struct bch_csum csum; + __le16 version; + __le16 version_min; + __le16 pad[2]; + __uuid_t magic; + __uuid_t uuid; + __uuid_t user_uuid; + __u8 label[BCH_SB_LABEL_SIZE]; + __le64 offset; + __le64 seq; + + __le16 block_size; + __u8 dev_idx; + __u8 nr_devices; + __le32 u64s; + + __le64 time_base_lo; + __le32 time_base_hi; + __le32 time_precision; + + __le64 flags[8]; + __le64 features[2]; + __le64 compat[2]; + + struct bch_sb_layout layout; + + struct bch_sb_field start[0]; + __le64 _data[]; +} __packed __aligned(8); + +/* + * Flags: + * BCH_SB_INITALIZED - set on first mount + * BCH_SB_CLEAN - did we shut down cleanly? Just a hint, doesn't affect + * behaviour of mount/recovery path: + * BCH_SB_INODE_32BIT - limit inode numbers to 32 bits + * BCH_SB_128_BIT_MACS - 128 bit macs instead of 80 + * BCH_SB_ENCRYPTION_TYPE - if nonzero encryption is enabled; overrides + * DATA/META_CSUM_TYPE. Also indicates encryption + * algorithm in use, if/when we get more than one + */ + +LE16_BITMASK(BCH_SB_BLOCK_SIZE, struct bch_sb, block_size, 0, 16); + +LE64_BITMASK(BCH_SB_INITIALIZED, struct bch_sb, flags[0], 0, 1); +LE64_BITMASK(BCH_SB_CLEAN, struct bch_sb, flags[0], 1, 2); +LE64_BITMASK(BCH_SB_CSUM_TYPE, struct bch_sb, flags[0], 2, 8); +LE64_BITMASK(BCH_SB_ERROR_ACTION, struct bch_sb, flags[0], 8, 12); + +LE64_BITMASK(BCH_SB_BTREE_NODE_SIZE, struct bch_sb, flags[0], 12, 28); + +LE64_BITMASK(BCH_SB_GC_RESERVE, struct bch_sb, flags[0], 28, 33); +LE64_BITMASK(BCH_SB_ROOT_RESERVE, struct bch_sb, flags[0], 33, 40); + +LE64_BITMASK(BCH_SB_META_CSUM_TYPE, struct bch_sb, flags[0], 40, 44); +LE64_BITMASK(BCH_SB_DATA_CSUM_TYPE, struct bch_sb, flags[0], 44, 48); + +LE64_BITMASK(BCH_SB_META_REPLICAS_WANT, struct bch_sb, flags[0], 48, 52); +LE64_BITMASK(BCH_SB_DATA_REPLICAS_WANT, struct bch_sb, flags[0], 52, 56); + +LE64_BITMASK(BCH_SB_POSIX_ACL, struct bch_sb, flags[0], 56, 57); +LE64_BITMASK(BCH_SB_USRQUOTA, struct bch_sb, flags[0], 57, 58); +LE64_BITMASK(BCH_SB_GRPQUOTA, struct bch_sb, flags[0], 58, 59); +LE64_BITMASK(BCH_SB_PRJQUOTA, struct bch_sb, flags[0], 59, 60); + +LE64_BITMASK(BCH_SB_HAS_ERRORS, struct bch_sb, flags[0], 60, 61); +LE64_BITMASK(BCH_SB_HAS_TOPOLOGY_ERRORS,struct bch_sb, flags[0], 61, 62); + +LE64_BITMASK(BCH_SB_BIG_ENDIAN, struct bch_sb, flags[0], 62, 63); + +LE64_BITMASK(BCH_SB_STR_HASH_TYPE, struct bch_sb, flags[1], 0, 4); +LE64_BITMASK(BCH_SB_COMPRESSION_TYPE_LO,struct bch_sb, flags[1], 4, 8); +LE64_BITMASK(BCH_SB_INODE_32BIT, struct bch_sb, flags[1], 8, 9); + +LE64_BITMASK(BCH_SB_128_BIT_MACS, struct bch_sb, flags[1], 9, 10); +LE64_BITMASK(BCH_SB_ENCRYPTION_TYPE, struct bch_sb, flags[1], 10, 14); + +/* + * Max size of an extent that may require bouncing to read or write + * (checksummed, compressed): 64k + */ +LE64_BITMASK(BCH_SB_ENCODED_EXTENT_MAX_BITS, + struct bch_sb, flags[1], 14, 20); + +LE64_BITMASK(BCH_SB_META_REPLICAS_REQ, struct bch_sb, flags[1], 20, 24); +LE64_BITMASK(BCH_SB_DATA_REPLICAS_REQ, struct bch_sb, flags[1], 24, 28); + +LE64_BITMASK(BCH_SB_PROMOTE_TARGET, struct bch_sb, flags[1], 28, 40); +LE64_BITMASK(BCH_SB_FOREGROUND_TARGET, struct bch_sb, flags[1], 40, 52); +LE64_BITMASK(BCH_SB_BACKGROUND_TARGET, struct bch_sb, flags[1], 52, 64); + +LE64_BITMASK(BCH_SB_BACKGROUND_COMPRESSION_TYPE_LO, + struct bch_sb, flags[2], 0, 4); +LE64_BITMASK(BCH_SB_GC_RESERVE_BYTES, struct bch_sb, flags[2], 4, 64); + +LE64_BITMASK(BCH_SB_ERASURE_CODE, struct bch_sb, flags[3], 0, 16); +LE64_BITMASK(BCH_SB_METADATA_TARGET, struct bch_sb, flags[3], 16, 28); +LE64_BITMASK(BCH_SB_SHARD_INUMS, struct bch_sb, flags[3], 28, 29); +LE64_BITMASK(BCH_SB_INODES_USE_KEY_CACHE,struct bch_sb, flags[3], 29, 30); +LE64_BITMASK(BCH_SB_JOURNAL_FLUSH_DELAY,struct bch_sb, flags[3], 30, 62); +LE64_BITMASK(BCH_SB_JOURNAL_FLUSH_DISABLED,struct bch_sb, flags[3], 62, 63); +LE64_BITMASK(BCH_SB_JOURNAL_RECLAIM_DELAY,struct bch_sb, flags[4], 0, 32); +LE64_BITMASK(BCH_SB_JOURNAL_TRANSACTION_NAMES,struct bch_sb, flags[4], 32, 33); +LE64_BITMASK(BCH_SB_NOCOW, struct bch_sb, flags[4], 33, 34); +LE64_BITMASK(BCH_SB_WRITE_BUFFER_SIZE, struct bch_sb, flags[4], 34, 54); +LE64_BITMASK(BCH_SB_VERSION_UPGRADE, struct bch_sb, flags[4], 54, 56); + +LE64_BITMASK(BCH_SB_COMPRESSION_TYPE_HI,struct bch_sb, flags[4], 56, 60); +LE64_BITMASK(BCH_SB_BACKGROUND_COMPRESSION_TYPE_HI, + struct bch_sb, flags[4], 60, 64); + +LE64_BITMASK(BCH_SB_VERSION_UPGRADE_COMPLETE, + struct bch_sb, flags[5], 0, 16); + +static inline __u64 BCH_SB_COMPRESSION_TYPE(const struct bch_sb *sb) +{ + return BCH_SB_COMPRESSION_TYPE_LO(sb) | (BCH_SB_COMPRESSION_TYPE_HI(sb) << 4); +} + +static inline void SET_BCH_SB_COMPRESSION_TYPE(struct bch_sb *sb, __u64 v) +{ + SET_BCH_SB_COMPRESSION_TYPE_LO(sb, v); + SET_BCH_SB_COMPRESSION_TYPE_HI(sb, v >> 4); +} + +static inline __u64 BCH_SB_BACKGROUND_COMPRESSION_TYPE(const struct bch_sb *sb) +{ + return BCH_SB_BACKGROUND_COMPRESSION_TYPE_LO(sb) | + (BCH_SB_BACKGROUND_COMPRESSION_TYPE_HI(sb) << 4); +} + +static inline void SET_BCH_SB_BACKGROUND_COMPRESSION_TYPE(struct bch_sb *sb, __u64 v) +{ + SET_BCH_SB_BACKGROUND_COMPRESSION_TYPE_LO(sb, v); + SET_BCH_SB_BACKGROUND_COMPRESSION_TYPE_HI(sb, v >> 4); +} + +/* + * Features: + * + * journal_seq_blacklist_v3: gates BCH_SB_FIELD_journal_seq_blacklist + * reflink: gates KEY_TYPE_reflink + * inline_data: gates KEY_TYPE_inline_data + * new_siphash: gates BCH_STR_HASH_siphash + * new_extent_overwrite: gates BTREE_NODE_NEW_EXTENT_OVERWRITE + */ +#define BCH_SB_FEATURES() \ + x(lz4, 0) \ + x(gzip, 1) \ + x(zstd, 2) \ + x(atomic_nlink, 3) \ + x(ec, 4) \ + x(journal_seq_blacklist_v3, 5) \ + x(reflink, 6) \ + x(new_siphash, 7) \ + x(inline_data, 8) \ + x(new_extent_overwrite, 9) \ + x(incompressible, 10) \ + x(btree_ptr_v2, 11) \ + x(extents_above_btree_updates, 12) \ + x(btree_updates_journalled, 13) \ + x(reflink_inline_data, 14) \ + x(new_varint, 15) \ + x(journal_no_flush, 16) \ + x(alloc_v2, 17) \ + x(extents_across_btree_nodes, 18) + +#define BCH_SB_FEATURES_ALWAYS \ + ((1ULL << BCH_FEATURE_new_extent_overwrite)| \ + (1ULL << BCH_FEATURE_extents_above_btree_updates)|\ + (1ULL << BCH_FEATURE_btree_updates_journalled)|\ + (1ULL << BCH_FEATURE_alloc_v2)|\ + (1ULL << BCH_FEATURE_extents_across_btree_nodes)) + +#define BCH_SB_FEATURES_ALL \ + (BCH_SB_FEATURES_ALWAYS| \ + (1ULL << BCH_FEATURE_new_siphash)| \ + (1ULL << BCH_FEATURE_btree_ptr_v2)| \ + (1ULL << BCH_FEATURE_new_varint)| \ + (1ULL << BCH_FEATURE_journal_no_flush)) + +enum bch_sb_feature { +#define x(f, n) BCH_FEATURE_##f, + BCH_SB_FEATURES() +#undef x + BCH_FEATURE_NR, +}; + +#define BCH_SB_COMPAT() \ + x(alloc_info, 0) \ + x(alloc_metadata, 1) \ + x(extents_above_btree_updates_done, 2) \ + x(bformat_overflow_done, 3) + +enum bch_sb_compat { +#define x(f, n) BCH_COMPAT_##f, + BCH_SB_COMPAT() +#undef x + BCH_COMPAT_NR, +}; + +/* options: */ + +#define BCH_VERSION_UPGRADE_OPTS() \ + x(compatible, 0) \ + x(incompatible, 1) \ + x(none, 2) + +enum bch_version_upgrade_opts { +#define x(t, n) BCH_VERSION_UPGRADE_##t = n, + BCH_VERSION_UPGRADE_OPTS() +#undef x +}; + +#define BCH_REPLICAS_MAX 4U + +#define BCH_BKEY_PTRS_MAX 16U + +#define BCH_ERROR_ACTIONS() \ + x(continue, 0) \ + x(ro, 1) \ + x(panic, 2) + +enum bch_error_actions { +#define x(t, n) BCH_ON_ERROR_##t = n, + BCH_ERROR_ACTIONS() +#undef x + BCH_ON_ERROR_NR +}; + +#define BCH_STR_HASH_TYPES() \ + x(crc32c, 0) \ + x(crc64, 1) \ + x(siphash_old, 2) \ + x(siphash, 3) + +enum bch_str_hash_type { +#define x(t, n) BCH_STR_HASH_##t = n, + BCH_STR_HASH_TYPES() +#undef x + BCH_STR_HASH_NR +}; + +#define BCH_STR_HASH_OPTS() \ + x(crc32c, 0) \ + x(crc64, 1) \ + x(siphash, 2) + +enum bch_str_hash_opts { +#define x(t, n) BCH_STR_HASH_OPT_##t = n, + BCH_STR_HASH_OPTS() +#undef x + BCH_STR_HASH_OPT_NR +}; + +#define BCH_CSUM_TYPES() \ + x(none, 0) \ + x(crc32c_nonzero, 1) \ + x(crc64_nonzero, 2) \ + x(chacha20_poly1305_80, 3) \ + x(chacha20_poly1305_128, 4) \ + x(crc32c, 5) \ + x(crc64, 6) \ + x(xxhash, 7) + +enum bch_csum_type { +#define x(t, n) BCH_CSUM_##t = n, + BCH_CSUM_TYPES() +#undef x + BCH_CSUM_NR +}; + +static const __maybe_unused unsigned bch_crc_bytes[] = { + [BCH_CSUM_none] = 0, + [BCH_CSUM_crc32c_nonzero] = 4, + [BCH_CSUM_crc32c] = 4, + [BCH_CSUM_crc64_nonzero] = 8, + [BCH_CSUM_crc64] = 8, + [BCH_CSUM_xxhash] = 8, + [BCH_CSUM_chacha20_poly1305_80] = 10, + [BCH_CSUM_chacha20_poly1305_128] = 16, +}; + +static inline _Bool bch2_csum_type_is_encryption(enum bch_csum_type type) +{ + switch (type) { + case BCH_CSUM_chacha20_poly1305_80: + case BCH_CSUM_chacha20_poly1305_128: + return true; + default: + return false; + } +} + +#define BCH_CSUM_OPTS() \ + x(none, 0) \ + x(crc32c, 1) \ + x(crc64, 2) \ + x(xxhash, 3) + +enum bch_csum_opts { +#define x(t, n) BCH_CSUM_OPT_##t = n, + BCH_CSUM_OPTS() +#undef x + BCH_CSUM_OPT_NR +}; + +#define BCH_COMPRESSION_TYPES() \ + x(none, 0) \ + x(lz4_old, 1) \ + x(gzip, 2) \ + x(lz4, 3) \ + x(zstd, 4) \ + x(incompressible, 5) + +enum bch_compression_type { +#define x(t, n) BCH_COMPRESSION_TYPE_##t = n, + BCH_COMPRESSION_TYPES() +#undef x + BCH_COMPRESSION_TYPE_NR +}; + +#define BCH_COMPRESSION_OPTS() \ + x(none, 0) \ + x(lz4, 1) \ + x(gzip, 2) \ + x(zstd, 3) + +enum bch_compression_opts { +#define x(t, n) BCH_COMPRESSION_OPT_##t = n, + BCH_COMPRESSION_OPTS() +#undef x + BCH_COMPRESSION_OPT_NR +}; + +/* + * Magic numbers + * + * The various other data structures have their own magic numbers, which are + * xored with the first part of the cache set's UUID + */ + +#define BCACHE_MAGIC \ + UUID_INIT(0xc68573f6, 0x4e1a, 0x45ca, \ + 0x82, 0x65, 0xf5, 0x7f, 0x48, 0xba, 0x6d, 0x81) +#define BCHFS_MAGIC \ + UUID_INIT(0xc68573f6, 0x66ce, 0x90a9, \ + 0xd9, 0x6a, 0x60, 0xcf, 0x80, 0x3d, 0xf7, 0xef) + +#define BCACHEFS_STATFS_MAGIC 0xca451a4e + +#define JSET_MAGIC __cpu_to_le64(0x245235c1a3625032ULL) +#define BSET_MAGIC __cpu_to_le64(0x90135c78b99e07f5ULL) + +static inline __le64 __bch2_sb_magic(struct bch_sb *sb) +{ + __le64 ret; + + memcpy(&ret, &sb->uuid, sizeof(ret)); + return ret; +} + +static inline __u64 __jset_magic(struct bch_sb *sb) +{ + return __le64_to_cpu(__bch2_sb_magic(sb) ^ JSET_MAGIC); +} + +static inline __u64 __bset_magic(struct bch_sb *sb) +{ + return __le64_to_cpu(__bch2_sb_magic(sb) ^ BSET_MAGIC); +} + +/* Journal */ + +#define JSET_KEYS_U64s (sizeof(struct jset_entry) / sizeof(__u64)) + +#define BCH_JSET_ENTRY_TYPES() \ + x(btree_keys, 0) \ + x(btree_root, 1) \ + x(prio_ptrs, 2) \ + x(blacklist, 3) \ + x(blacklist_v2, 4) \ + x(usage, 5) \ + x(data_usage, 6) \ + x(clock, 7) \ + x(dev_usage, 8) \ + x(log, 9) \ + x(overwrite, 10) + +enum { +#define x(f, nr) BCH_JSET_ENTRY_##f = nr, + BCH_JSET_ENTRY_TYPES() +#undef x + BCH_JSET_ENTRY_NR +}; + +/* + * Journal sequence numbers can be blacklisted: bsets record the max sequence + * number of all the journal entries they contain updates for, so that on + * recovery we can ignore those bsets that contain index updates newer that what + * made it into the journal. + * + * This means that we can't reuse that journal_seq - we have to skip it, and + * then record that we skipped it so that the next time we crash and recover we + * don't think there was a missing journal entry. + */ +struct jset_entry_blacklist { + struct jset_entry entry; + __le64 seq; +}; + +struct jset_entry_blacklist_v2 { + struct jset_entry entry; + __le64 start; + __le64 end; +}; + +#define BCH_FS_USAGE_TYPES() \ + x(reserved, 0) \ + x(inodes, 1) \ + x(key_version, 2) + +enum { +#define x(f, nr) BCH_FS_USAGE_##f = nr, + BCH_FS_USAGE_TYPES() +#undef x + BCH_FS_USAGE_NR +}; + +struct jset_entry_usage { + struct jset_entry entry; + __le64 v; +} __packed; + +struct jset_entry_data_usage { + struct jset_entry entry; + __le64 v; + struct bch_replicas_entry r; +} __packed; + +struct jset_entry_clock { + struct jset_entry entry; + __u8 rw; + __u8 pad[7]; + __le64 time; +} __packed; + +struct jset_entry_dev_usage_type { + __le64 buckets; + __le64 sectors; + __le64 fragmented; +} __packed; + +struct jset_entry_dev_usage { + struct jset_entry entry; + __le32 dev; + __u32 pad; + + __le64 buckets_ec; + __le64 _buckets_unavailable; /* No longer used */ + + struct jset_entry_dev_usage_type d[]; +}; + +static inline unsigned jset_entry_dev_usage_nr_types(struct jset_entry_dev_usage *u) +{ + return (vstruct_bytes(&u->entry) - sizeof(struct jset_entry_dev_usage)) / + sizeof(struct jset_entry_dev_usage_type); +} + +struct jset_entry_log { + struct jset_entry entry; + u8 d[]; +} __packed; + +/* + * On disk format for a journal entry: + * seq is monotonically increasing; every journal entry has its own unique + * sequence number. + * + * last_seq is the oldest journal entry that still has keys the btree hasn't + * flushed to disk yet. + * + * version is for on disk format changes. + */ +struct jset { + struct bch_csum csum; + + __le64 magic; + __le64 seq; + __le32 version; + __le32 flags; + + __le32 u64s; /* size of d[] in u64s */ + + __u8 encrypted_start[0]; + + __le16 _read_clock; /* no longer used */ + __le16 _write_clock; + + /* Sequence number of oldest dirty journal entry */ + __le64 last_seq; + + + struct jset_entry start[0]; + __u64 _data[]; +} __packed __aligned(8); + +LE32_BITMASK(JSET_CSUM_TYPE, struct jset, flags, 0, 4); +LE32_BITMASK(JSET_BIG_ENDIAN, struct jset, flags, 4, 5); +LE32_BITMASK(JSET_NO_FLUSH, struct jset, flags, 5, 6); + +#define BCH_JOURNAL_BUCKETS_MIN 8 + +/* Btree: */ + +enum btree_id_flags { + BTREE_ID_EXTENTS = BIT(0), + BTREE_ID_SNAPSHOTS = BIT(1), + BTREE_ID_SNAPSHOT_FIELD = BIT(2), + BTREE_ID_DATA = BIT(3), +}; + +#define BCH_BTREE_IDS() \ + x(extents, 0, BTREE_ID_EXTENTS|BTREE_ID_SNAPSHOTS|BTREE_ID_DATA,\ + BIT_ULL(KEY_TYPE_whiteout)| \ + BIT_ULL(KEY_TYPE_error)| \ + BIT_ULL(KEY_TYPE_cookie)| \ + BIT_ULL(KEY_TYPE_extent)| \ + BIT_ULL(KEY_TYPE_reservation)| \ + BIT_ULL(KEY_TYPE_reflink_p)| \ + BIT_ULL(KEY_TYPE_inline_data)) \ + x(inodes, 1, BTREE_ID_SNAPSHOTS, \ + BIT_ULL(KEY_TYPE_whiteout)| \ + BIT_ULL(KEY_TYPE_inode)| \ + BIT_ULL(KEY_TYPE_inode_v2)| \ + BIT_ULL(KEY_TYPE_inode_v3)| \ + BIT_ULL(KEY_TYPE_inode_generation)) \ + x(dirents, 2, BTREE_ID_SNAPSHOTS, \ + BIT_ULL(KEY_TYPE_whiteout)| \ + BIT_ULL(KEY_TYPE_hash_whiteout)| \ + BIT_ULL(KEY_TYPE_dirent)) \ + x(xattrs, 3, BTREE_ID_SNAPSHOTS, \ + BIT_ULL(KEY_TYPE_whiteout)| \ + BIT_ULL(KEY_TYPE_cookie)| \ + BIT_ULL(KEY_TYPE_hash_whiteout)| \ + BIT_ULL(KEY_TYPE_xattr)) \ + x(alloc, 4, 0, \ + BIT_ULL(KEY_TYPE_alloc)| \ + BIT_ULL(KEY_TYPE_alloc_v2)| \ + BIT_ULL(KEY_TYPE_alloc_v3)| \ + BIT_ULL(KEY_TYPE_alloc_v4)) \ + x(quotas, 5, 0, \ + BIT_ULL(KEY_TYPE_quota)) \ + x(stripes, 6, 0, \ + BIT_ULL(KEY_TYPE_stripe)) \ + x(reflink, 7, BTREE_ID_EXTENTS|BTREE_ID_DATA, \ + BIT_ULL(KEY_TYPE_reflink_v)| \ + BIT_ULL(KEY_TYPE_indirect_inline_data)) \ + x(subvolumes, 8, 0, \ + BIT_ULL(KEY_TYPE_subvolume)) \ + x(snapshots, 9, 0, \ + BIT_ULL(KEY_TYPE_snapshot)) \ + x(lru, 10, 0, \ + BIT_ULL(KEY_TYPE_set)) \ + x(freespace, 11, BTREE_ID_EXTENTS, \ + BIT_ULL(KEY_TYPE_set)) \ + x(need_discard, 12, 0, \ + BIT_ULL(KEY_TYPE_set)) \ + x(backpointers, 13, 0, \ + BIT_ULL(KEY_TYPE_backpointer)) \ + x(bucket_gens, 14, 0, \ + BIT_ULL(KEY_TYPE_bucket_gens)) \ + x(snapshot_trees, 15, 0, \ + BIT_ULL(KEY_TYPE_snapshot_tree)) \ + x(deleted_inodes, 16, BTREE_ID_SNAPSHOT_FIELD, \ + BIT_ULL(KEY_TYPE_set)) \ + x(logged_ops, 17, 0, \ + BIT_ULL(KEY_TYPE_logged_op_truncate)| \ + BIT_ULL(KEY_TYPE_logged_op_finsert)) \ + x(rebalance_work, 18, BTREE_ID_SNAPSHOT_FIELD, \ + BIT_ULL(KEY_TYPE_set)|BIT_ULL(KEY_TYPE_cookie)) + +enum btree_id { +#define x(name, nr, ...) BTREE_ID_##name = nr, + BCH_BTREE_IDS() +#undef x + BTREE_ID_NR +}; + +#define BTREE_MAX_DEPTH 4U + +/* Btree nodes */ + +/* + * Btree nodes + * + * On disk a btree node is a list/log of these; within each set the keys are + * sorted + */ +struct bset { + __le64 seq; + + /* + * Highest journal entry this bset contains keys for. + * If on recovery we don't see that journal entry, this bset is ignored: + * this allows us to preserve the order of all index updates after a + * crash, since the journal records a total order of all index updates + * and anything that didn't make it to the journal doesn't get used. + */ + __le64 journal_seq; + + __le32 flags; + __le16 version; + __le16 u64s; /* count of d[] in u64s */ + + struct bkey_packed start[0]; + __u64 _data[]; +} __packed __aligned(8); + +LE32_BITMASK(BSET_CSUM_TYPE, struct bset, flags, 0, 4); + +LE32_BITMASK(BSET_BIG_ENDIAN, struct bset, flags, 4, 5); +LE32_BITMASK(BSET_SEPARATE_WHITEOUTS, + struct bset, flags, 5, 6); + +/* Sector offset within the btree node: */ +LE32_BITMASK(BSET_OFFSET, struct bset, flags, 16, 32); + +struct btree_node { + struct bch_csum csum; + __le64 magic; + + /* this flags field is encrypted, unlike bset->flags: */ + __le64 flags; + + /* Closed interval: */ + struct bpos min_key; + struct bpos max_key; + struct bch_extent_ptr _ptr; /* not used anymore */ + struct bkey_format format; + + union { + struct bset keys; + struct { + __u8 pad[22]; + __le16 u64s; + __u64 _data[0]; + + }; + }; +} __packed __aligned(8); + +LE64_BITMASK(BTREE_NODE_ID_LO, struct btree_node, flags, 0, 4); +LE64_BITMASK(BTREE_NODE_LEVEL, struct btree_node, flags, 4, 8); +LE64_BITMASK(BTREE_NODE_NEW_EXTENT_OVERWRITE, + struct btree_node, flags, 8, 9); +LE64_BITMASK(BTREE_NODE_ID_HI, struct btree_node, flags, 9, 25); +/* 25-32 unused */ +LE64_BITMASK(BTREE_NODE_SEQ, struct btree_node, flags, 32, 64); + +static inline __u64 BTREE_NODE_ID(struct btree_node *n) +{ + return BTREE_NODE_ID_LO(n) | (BTREE_NODE_ID_HI(n) << 4); +} + +static inline void SET_BTREE_NODE_ID(struct btree_node *n, __u64 v) +{ + SET_BTREE_NODE_ID_LO(n, v); + SET_BTREE_NODE_ID_HI(n, v >> 4); +} + +struct btree_node_entry { + struct bch_csum csum; + + union { + struct bset keys; + struct { + __u8 pad[22]; + __le16 u64s; + __u64 _data[0]; + }; + }; +} __packed __aligned(8); + +#endif /* _BCACHEFS_FORMAT_H */ diff --git a/fs/bcachefs/bcachefs_ioctl.h b/fs/bcachefs/bcachefs_ioctl.h new file mode 100644 index 000000000000..f05881f7e113 --- /dev/null +++ b/fs/bcachefs/bcachefs_ioctl.h @@ -0,0 +1,368 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_IOCTL_H +#define _BCACHEFS_IOCTL_H + +#include <linux/uuid.h> +#include <asm/ioctl.h> +#include "bcachefs_format.h" + +/* + * Flags common to multiple ioctls: + */ +#define BCH_FORCE_IF_DATA_LOST (1 << 0) +#define BCH_FORCE_IF_METADATA_LOST (1 << 1) +#define BCH_FORCE_IF_DATA_DEGRADED (1 << 2) +#define BCH_FORCE_IF_METADATA_DEGRADED (1 << 3) + +#define BCH_FORCE_IF_LOST \ + (BCH_FORCE_IF_DATA_LOST| \ + BCH_FORCE_IF_METADATA_LOST) +#define BCH_FORCE_IF_DEGRADED \ + (BCH_FORCE_IF_DATA_DEGRADED| \ + BCH_FORCE_IF_METADATA_DEGRADED) + +/* + * If cleared, ioctl that refer to a device pass it as a pointer to a pathname + * (e.g. /dev/sda1); if set, the dev field is the device's index within the + * filesystem: + */ +#define BCH_BY_INDEX (1 << 4) + +/* + * For BCH_IOCTL_READ_SUPER: get superblock of a specific device, not filesystem + * wide superblock: + */ +#define BCH_READ_DEV (1 << 5) + +/* global control dev: */ + +/* These are currently broken, and probably unnecessary: */ +#if 0 +#define BCH_IOCTL_ASSEMBLE _IOW(0xbc, 1, struct bch_ioctl_assemble) +#define BCH_IOCTL_INCREMENTAL _IOW(0xbc, 2, struct bch_ioctl_incremental) + +struct bch_ioctl_assemble { + __u32 flags; + __u32 nr_devs; + __u64 pad; + __u64 devs[]; +}; + +struct bch_ioctl_incremental { + __u32 flags; + __u64 pad; + __u64 dev; +}; +#endif + +/* filesystem ioctls: */ + +#define BCH_IOCTL_QUERY_UUID _IOR(0xbc, 1, struct bch_ioctl_query_uuid) + +/* These only make sense when we also have incremental assembly */ +#if 0 +#define BCH_IOCTL_START _IOW(0xbc, 2, struct bch_ioctl_start) +#define BCH_IOCTL_STOP _IO(0xbc, 3) +#endif + +#define BCH_IOCTL_DISK_ADD _IOW(0xbc, 4, struct bch_ioctl_disk) +#define BCH_IOCTL_DISK_REMOVE _IOW(0xbc, 5, struct bch_ioctl_disk) +#define BCH_IOCTL_DISK_ONLINE _IOW(0xbc, 6, struct bch_ioctl_disk) +#define BCH_IOCTL_DISK_OFFLINE _IOW(0xbc, 7, struct bch_ioctl_disk) +#define BCH_IOCTL_DISK_SET_STATE _IOW(0xbc, 8, struct bch_ioctl_disk_set_state) +#define BCH_IOCTL_DATA _IOW(0xbc, 10, struct bch_ioctl_data) +#define BCH_IOCTL_FS_USAGE _IOWR(0xbc, 11, struct bch_ioctl_fs_usage) +#define BCH_IOCTL_DEV_USAGE _IOWR(0xbc, 11, struct bch_ioctl_dev_usage) +#define BCH_IOCTL_READ_SUPER _IOW(0xbc, 12, struct bch_ioctl_read_super) +#define BCH_IOCTL_DISK_GET_IDX _IOW(0xbc, 13, struct bch_ioctl_disk_get_idx) +#define BCH_IOCTL_DISK_RESIZE _IOW(0xbc, 14, struct bch_ioctl_disk_resize) +#define BCH_IOCTL_DISK_RESIZE_JOURNAL _IOW(0xbc,15, struct bch_ioctl_disk_resize_journal) + +#define BCH_IOCTL_SUBVOLUME_CREATE _IOW(0xbc, 16, struct bch_ioctl_subvolume) +#define BCH_IOCTL_SUBVOLUME_DESTROY _IOW(0xbc, 17, struct bch_ioctl_subvolume) + +/* ioctl below act on a particular file, not the filesystem as a whole: */ + +#define BCHFS_IOC_REINHERIT_ATTRS _IOR(0xbc, 64, const char __user *) + +/* + * BCH_IOCTL_QUERY_UUID: get filesystem UUID + * + * Returns user visible UUID, not internal UUID (which may not ever be changed); + * the filesystem's sysfs directory may be found under /sys/fs/bcachefs with + * this UUID. + */ +struct bch_ioctl_query_uuid { + __uuid_t uuid; +}; + +#if 0 +struct bch_ioctl_start { + __u32 flags; + __u32 pad; +}; +#endif + +/* + * BCH_IOCTL_DISK_ADD: add a new device to an existing filesystem + * + * The specified device must not be open or in use. On success, the new device + * will be an online member of the filesystem just like any other member. + * + * The device must first be prepared by userspace by formatting with a bcachefs + * superblock, which is only used for passing in superblock options/parameters + * for that device (in struct bch_member). The new device's superblock should + * not claim to be a member of any existing filesystem - UUIDs on it will be + * ignored. + */ + +/* + * BCH_IOCTL_DISK_REMOVE: permanently remove a member device from a filesystem + * + * Any data present on @dev will be permanently deleted, and @dev will be + * removed from its slot in the filesystem's list of member devices. The device + * may be either offline or offline. + * + * Will fail removing @dev would leave us with insufficient read write devices + * or degraded/unavailable data, unless the approprate BCH_FORCE_IF_* flags are + * set. + */ + +/* + * BCH_IOCTL_DISK_ONLINE: given a disk that is already a member of a filesystem + * but is not open (e.g. because we started in degraded mode), bring it online + * + * all existing data on @dev will be available once the device is online, + * exactly as if @dev was present when the filesystem was first mounted + */ + +/* + * BCH_IOCTL_DISK_OFFLINE: offline a disk, causing the kernel to close that + * block device, without removing it from the filesystem (so it can be brought + * back online later) + * + * Data present on @dev will be unavailable while @dev is offline (unless + * replicated), but will still be intact and untouched if @dev is brought back + * online + * + * Will fail (similarly to BCH_IOCTL_DISK_SET_STATE) if offlining @dev would + * leave us with insufficient read write devices or degraded/unavailable data, + * unless the approprate BCH_FORCE_IF_* flags are set. + */ + +struct bch_ioctl_disk { + __u32 flags; + __u32 pad; + __u64 dev; +}; + +/* + * BCH_IOCTL_DISK_SET_STATE: modify state of a member device of a filesystem + * + * @new_state - one of the bch_member_state states (rw, ro, failed, + * spare) + * + * Will refuse to change member state if we would then have insufficient devices + * to write to, or if it would result in degraded data (when @new_state is + * failed or spare) unless the appropriate BCH_FORCE_IF_* flags are set. + */ +struct bch_ioctl_disk_set_state { + __u32 flags; + __u8 new_state; + __u8 pad[3]; + __u64 dev; +}; + +enum bch_data_ops { + BCH_DATA_OP_SCRUB = 0, + BCH_DATA_OP_REREPLICATE = 1, + BCH_DATA_OP_MIGRATE = 2, + BCH_DATA_OP_REWRITE_OLD_NODES = 3, + BCH_DATA_OP_NR = 4, +}; + +/* + * BCH_IOCTL_DATA: operations that walk and manipulate filesystem data (e.g. + * scrub, rereplicate, migrate). + * + * This ioctl kicks off a job in the background, and returns a file descriptor. + * Reading from the file descriptor returns a struct bch_ioctl_data_event, + * indicating current progress, and closing the file descriptor will stop the + * job. The file descriptor is O_CLOEXEC. + */ +struct bch_ioctl_data { + __u16 op; + __u8 start_btree; + __u8 end_btree; + __u32 flags; + + struct bpos start_pos; + struct bpos end_pos; + + union { + struct { + __u32 dev; + __u32 pad; + } migrate; + struct { + __u64 pad[8]; + }; + }; +} __packed __aligned(8); + +enum bch_data_event { + BCH_DATA_EVENT_PROGRESS = 0, + /* XXX: add an event for reporting errors */ + BCH_DATA_EVENT_NR = 1, +}; + +struct bch_ioctl_data_progress { + __u8 data_type; + __u8 btree_id; + __u8 pad[2]; + struct bpos pos; + + __u64 sectors_done; + __u64 sectors_total; +} __packed __aligned(8); + +struct bch_ioctl_data_event { + __u8 type; + __u8 pad[7]; + union { + struct bch_ioctl_data_progress p; + __u64 pad2[15]; + }; +} __packed __aligned(8); + +struct bch_replicas_usage { + __u64 sectors; + struct bch_replicas_entry r; +} __packed; + +static inline struct bch_replicas_usage * +replicas_usage_next(struct bch_replicas_usage *u) +{ + return (void *) u + replicas_entry_bytes(&u->r) + 8; +} + +/* + * BCH_IOCTL_FS_USAGE: query filesystem disk space usage + * + * Returns disk space usage broken out by data type, number of replicas, and + * by component device + * + * @replica_entries_bytes - size, in bytes, allocated for replica usage entries + * + * On success, @replica_entries_bytes will be changed to indicate the number of + * bytes actually used. + * + * Returns -ERANGE if @replica_entries_bytes was too small + */ +struct bch_ioctl_fs_usage { + __u64 capacity; + __u64 used; + __u64 online_reserved; + __u64 persistent_reserved[BCH_REPLICAS_MAX]; + + __u32 replica_entries_bytes; + __u32 pad; + + struct bch_replicas_usage replicas[0]; +}; + +/* + * BCH_IOCTL_DEV_USAGE: query device disk space usage + * + * Returns disk space usage broken out by data type - both by buckets and + * sectors. + */ +struct bch_ioctl_dev_usage { + __u64 dev; + __u32 flags; + __u8 state; + __u8 pad[7]; + + __u32 bucket_size; + __u64 nr_buckets; + + __u64 buckets_ec; + + struct bch_ioctl_dev_usage_type { + __u64 buckets; + __u64 sectors; + __u64 fragmented; + } d[BCH_DATA_NR]; +}; + +/* + * BCH_IOCTL_READ_SUPER: read filesystem superblock + * + * Equivalent to reading the superblock directly from the block device, except + * avoids racing with the kernel writing the superblock or having to figure out + * which block device to read + * + * @sb - buffer to read into + * @size - size of userspace allocated buffer + * @dev - device to read superblock for, if BCH_READ_DEV flag is + * specified + * + * Returns -ERANGE if buffer provided is too small + */ +struct bch_ioctl_read_super { + __u32 flags; + __u32 pad; + __u64 dev; + __u64 size; + __u64 sb; +}; + +/* + * BCH_IOCTL_DISK_GET_IDX: give a path to a block device, query filesystem to + * determine if disk is a (online) member - if so, returns device's index + * + * Returns -ENOENT if not found + */ +struct bch_ioctl_disk_get_idx { + __u64 dev; +}; + +/* + * BCH_IOCTL_DISK_RESIZE: resize filesystem on a device + * + * @dev - member to resize + * @nbuckets - new number of buckets + */ +struct bch_ioctl_disk_resize { + __u32 flags; + __u32 pad; + __u64 dev; + __u64 nbuckets; +}; + +/* + * BCH_IOCTL_DISK_RESIZE_JOURNAL: resize journal on a device + * + * @dev - member to resize + * @nbuckets - new number of buckets + */ +struct bch_ioctl_disk_resize_journal { + __u32 flags; + __u32 pad; + __u64 dev; + __u64 nbuckets; +}; + +struct bch_ioctl_subvolume { + __u32 flags; + __u32 dirfd; + __u16 mode; + __u16 pad[3]; + __u64 dst_ptr; + __u64 src_ptr; +}; + +#define BCH_SUBVOL_SNAPSHOT_CREATE (1U << 0) +#define BCH_SUBVOL_SNAPSHOT_RO (1U << 1) + +#endif /* _BCACHEFS_IOCTL_H */ diff --git a/fs/bcachefs/bkey.c b/fs/bcachefs/bkey.c new file mode 100644 index 000000000000..abdb05507d16 --- /dev/null +++ b/fs/bcachefs/bkey.c @@ -0,0 +1,1120 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" +#include "bkey.h" +#include "bkey_cmp.h" +#include "bkey_methods.h" +#include "bset.h" +#include "util.h" + +const struct bkey_format bch2_bkey_format_current = BKEY_FORMAT_CURRENT; + +void bch2_bkey_packed_to_binary_text(struct printbuf *out, + const struct bkey_format *f, + const struct bkey_packed *k) +{ + const u64 *p = high_word(f, k); + unsigned word_bits = 64 - high_bit_offset; + unsigned nr_key_bits = bkey_format_key_bits(f) + high_bit_offset; + u64 v = *p & (~0ULL >> high_bit_offset); + + if (!nr_key_bits) { + prt_str(out, "(empty)"); + return; + } + + while (1) { + unsigned next_key_bits = nr_key_bits; + + if (nr_key_bits < 64) { + v >>= 64 - nr_key_bits; + next_key_bits = 0; + } else { + next_key_bits -= 64; + } + + bch2_prt_u64_binary(out, v, min(word_bits, nr_key_bits)); + + if (!next_key_bits) + break; + + prt_char(out, ' '); + + p = next_word(p); + v = *p; + word_bits = 64; + nr_key_bits = next_key_bits; + } +} + +#ifdef CONFIG_BCACHEFS_DEBUG + +static void bch2_bkey_pack_verify(const struct bkey_packed *packed, + const struct bkey *unpacked, + const struct bkey_format *format) +{ + struct bkey tmp; + + BUG_ON(bkeyp_val_u64s(format, packed) != + bkey_val_u64s(unpacked)); + + BUG_ON(packed->u64s < bkeyp_key_u64s(format, packed)); + + tmp = __bch2_bkey_unpack_key(format, packed); + + if (memcmp(&tmp, unpacked, sizeof(struct bkey))) { + struct printbuf buf = PRINTBUF; + + prt_printf(&buf, "keys differ: format u64s %u fields %u %u %u %u %u\n", + format->key_u64s, + format->bits_per_field[0], + format->bits_per_field[1], + format->bits_per_field[2], + format->bits_per_field[3], + format->bits_per_field[4]); + + prt_printf(&buf, "compiled unpack: "); + bch2_bkey_to_text(&buf, unpacked); + prt_newline(&buf); + + prt_printf(&buf, "c unpack: "); + bch2_bkey_to_text(&buf, &tmp); + prt_newline(&buf); + + prt_printf(&buf, "compiled unpack: "); + bch2_bkey_packed_to_binary_text(&buf, &bch2_bkey_format_current, + (struct bkey_packed *) unpacked); + prt_newline(&buf); + + prt_printf(&buf, "c unpack: "); + bch2_bkey_packed_to_binary_text(&buf, &bch2_bkey_format_current, + (struct bkey_packed *) &tmp); + prt_newline(&buf); + + panic("%s", buf.buf); + } +} + +#else +static inline void bch2_bkey_pack_verify(const struct bkey_packed *packed, + const struct bkey *unpacked, + const struct bkey_format *format) {} +#endif + +struct pack_state { + const struct bkey_format *format; + unsigned bits; /* bits remaining in current word */ + u64 w; /* current word */ + u64 *p; /* pointer to next word */ +}; + +__always_inline +static struct pack_state pack_state_init(const struct bkey_format *format, + struct bkey_packed *k) +{ + u64 *p = high_word(format, k); + + return (struct pack_state) { + .format = format, + .bits = 64 - high_bit_offset, + .w = 0, + .p = p, + }; +} + +__always_inline +static void pack_state_finish(struct pack_state *state, + struct bkey_packed *k) +{ + EBUG_ON(state->p < k->_data); + EBUG_ON(state->p >= (u64 *) k->_data + state->format->key_u64s); + + *state->p = state->w; +} + +struct unpack_state { + const struct bkey_format *format; + unsigned bits; /* bits remaining in current word */ + u64 w; /* current word */ + const u64 *p; /* pointer to next word */ +}; + +__always_inline +static struct unpack_state unpack_state_init(const struct bkey_format *format, + const struct bkey_packed *k) +{ + const u64 *p = high_word(format, k); + + return (struct unpack_state) { + .format = format, + .bits = 64 - high_bit_offset, + .w = *p << high_bit_offset, + .p = p, + }; +} + +__always_inline +static u64 get_inc_field(struct unpack_state *state, unsigned field) +{ + unsigned bits = state->format->bits_per_field[field]; + u64 v = 0, offset = le64_to_cpu(state->format->field_offset[field]); + + if (bits >= state->bits) { + v = state->w >> (64 - bits); + bits -= state->bits; + + state->p = next_word(state->p); + state->w = *state->p; + state->bits = 64; + } + + /* avoid shift by 64 if bits is 0 - bits is never 64 here: */ + v |= (state->w >> 1) >> (63 - bits); + state->w <<= bits; + state->bits -= bits; + + return v + offset; +} + +__always_inline +static void __set_inc_field(struct pack_state *state, unsigned field, u64 v) +{ + unsigned bits = state->format->bits_per_field[field]; + + if (bits) { + if (bits > state->bits) { + bits -= state->bits; + /* avoid shift by 64 if bits is 64 - bits is never 0 here: */ + state->w |= (v >> 1) >> (bits - 1); + + *state->p = state->w; + state->p = next_word(state->p); + state->w = 0; + state->bits = 64; + } + + state->bits -= bits; + state->w |= v << state->bits; + } +} + +__always_inline +static bool set_inc_field(struct pack_state *state, unsigned field, u64 v) +{ + unsigned bits = state->format->bits_per_field[field]; + u64 offset = le64_to_cpu(state->format->field_offset[field]); + + if (v < offset) + return false; + + v -= offset; + + if (fls64(v) > bits) + return false; + + __set_inc_field(state, field, v); + return true; +} + +/* + * Note: does NOT set out->format (we don't know what it should be here!) + * + * Also: doesn't work on extents - it doesn't preserve the invariant that + * if k is packed bkey_start_pos(k) will successfully pack + */ +static bool bch2_bkey_transform_key(const struct bkey_format *out_f, + struct bkey_packed *out, + const struct bkey_format *in_f, + const struct bkey_packed *in) +{ + struct pack_state out_s = pack_state_init(out_f, out); + struct unpack_state in_s = unpack_state_init(in_f, in); + u64 *w = out->_data; + unsigned i; + + *w = 0; + + for (i = 0; i < BKEY_NR_FIELDS; i++) + if (!set_inc_field(&out_s, i, get_inc_field(&in_s, i))) + return false; + + /* Can't happen because the val would be too big to unpack: */ + EBUG_ON(in->u64s - in_f->key_u64s + out_f->key_u64s > U8_MAX); + + pack_state_finish(&out_s, out); + out->u64s = out_f->key_u64s + in->u64s - in_f->key_u64s; + out->needs_whiteout = in->needs_whiteout; + out->type = in->type; + + return true; +} + +bool bch2_bkey_transform(const struct bkey_format *out_f, + struct bkey_packed *out, + const struct bkey_format *in_f, + const struct bkey_packed *in) +{ + if (!bch2_bkey_transform_key(out_f, out, in_f, in)) + return false; + + memcpy_u64s((u64 *) out + out_f->key_u64s, + (u64 *) in + in_f->key_u64s, + (in->u64s - in_f->key_u64s)); + return true; +} + +struct bkey __bch2_bkey_unpack_key(const struct bkey_format *format, + const struct bkey_packed *in) +{ + struct unpack_state state = unpack_state_init(format, in); + struct bkey out; + + EBUG_ON(format->nr_fields != BKEY_NR_FIELDS); + EBUG_ON(in->u64s < format->key_u64s); + EBUG_ON(in->format != KEY_FORMAT_LOCAL_BTREE); + EBUG_ON(in->u64s - format->key_u64s + BKEY_U64s > U8_MAX); + + out.u64s = BKEY_U64s + in->u64s - format->key_u64s; + out.format = KEY_FORMAT_CURRENT; + out.needs_whiteout = in->needs_whiteout; + out.type = in->type; + out.pad[0] = 0; + +#define x(id, field) out.field = get_inc_field(&state, id); + bkey_fields() +#undef x + + return out; +} + +#ifndef HAVE_BCACHEFS_COMPILED_UNPACK +struct bpos __bkey_unpack_pos(const struct bkey_format *format, + const struct bkey_packed *in) +{ + struct unpack_state state = unpack_state_init(format, in); + struct bpos out; + + EBUG_ON(format->nr_fields != BKEY_NR_FIELDS); + EBUG_ON(in->u64s < format->key_u64s); + EBUG_ON(in->format != KEY_FORMAT_LOCAL_BTREE); + + out.inode = get_inc_field(&state, BKEY_FIELD_INODE); + out.offset = get_inc_field(&state, BKEY_FIELD_OFFSET); + out.snapshot = get_inc_field(&state, BKEY_FIELD_SNAPSHOT); + + return out; +} +#endif + +/** + * bch2_bkey_pack_key -- pack just the key, not the value + * @out: packed result + * @in: key to pack + * @format: format of packed result + * + * Returns: true on success, false on failure + */ +bool bch2_bkey_pack_key(struct bkey_packed *out, const struct bkey *in, + const struct bkey_format *format) +{ + struct pack_state state = pack_state_init(format, out); + u64 *w = out->_data; + + EBUG_ON((void *) in == (void *) out); + EBUG_ON(format->nr_fields != BKEY_NR_FIELDS); + EBUG_ON(in->format != KEY_FORMAT_CURRENT); + + *w = 0; + +#define x(id, field) if (!set_inc_field(&state, id, in->field)) return false; + bkey_fields() +#undef x + pack_state_finish(&state, out); + out->u64s = format->key_u64s + in->u64s - BKEY_U64s; + out->format = KEY_FORMAT_LOCAL_BTREE; + out->needs_whiteout = in->needs_whiteout; + out->type = in->type; + + bch2_bkey_pack_verify(out, in, format); + return true; +} + +/** + * bch2_bkey_unpack -- unpack the key and the value + * @b: btree node of @src key (for packed format) + * @dst: unpacked result + * @src: packed input + */ +void bch2_bkey_unpack(const struct btree *b, struct bkey_i *dst, + const struct bkey_packed *src) +{ + __bkey_unpack_key(b, &dst->k, src); + + memcpy_u64s(&dst->v, + bkeyp_val(&b->format, src), + bkeyp_val_u64s(&b->format, src)); +} + +/** + * bch2_bkey_pack -- pack the key and the value + * @dst: packed result + * @src: unpacked input + * @format: format of packed result + * + * Returns: true on success, false on failure + */ +bool bch2_bkey_pack(struct bkey_packed *dst, const struct bkey_i *src, + const struct bkey_format *format) +{ + struct bkey_packed tmp; + + if (!bch2_bkey_pack_key(&tmp, &src->k, format)) + return false; + + memmove_u64s((u64 *) dst + format->key_u64s, + &src->v, + bkey_val_u64s(&src->k)); + memcpy_u64s_small(dst, &tmp, format->key_u64s); + + return true; +} + +__always_inline +static bool set_inc_field_lossy(struct pack_state *state, unsigned field, u64 v) +{ + unsigned bits = state->format->bits_per_field[field]; + u64 offset = le64_to_cpu(state->format->field_offset[field]); + bool ret = true; + + EBUG_ON(v < offset); + v -= offset; + + if (fls64(v) > bits) { + v = ~(~0ULL << bits); + ret = false; + } + + __set_inc_field(state, field, v); + return ret; +} + +#ifdef CONFIG_BCACHEFS_DEBUG +static bool bkey_packed_successor(struct bkey_packed *out, + const struct btree *b, + struct bkey_packed k) +{ + const struct bkey_format *f = &b->format; + unsigned nr_key_bits = b->nr_key_bits; + unsigned first_bit, offset; + u64 *p; + + EBUG_ON(b->nr_key_bits != bkey_format_key_bits(f)); + + if (!nr_key_bits) + return false; + + *out = k; + + first_bit = high_bit_offset + nr_key_bits - 1; + p = nth_word(high_word(f, out), first_bit >> 6); + offset = 63 - (first_bit & 63); + + while (nr_key_bits) { + unsigned bits = min(64 - offset, nr_key_bits); + u64 mask = (~0ULL >> (64 - bits)) << offset; + + if ((*p & mask) != mask) { + *p += 1ULL << offset; + EBUG_ON(bch2_bkey_cmp_packed(b, out, &k) <= 0); + return true; + } + + *p &= ~mask; + p = prev_word(p); + nr_key_bits -= bits; + offset = 0; + } + + return false; +} + +static bool bkey_format_has_too_big_fields(const struct bkey_format *f) +{ + for (unsigned i = 0; i < f->nr_fields; i++) { + unsigned unpacked_bits = bch2_bkey_format_current.bits_per_field[i]; + u64 unpacked_max = ~((~0ULL << 1) << (unpacked_bits - 1)); + u64 packed_max = f->bits_per_field[i] + ? ~((~0ULL << 1) << (f->bits_per_field[i] - 1)) + : 0; + u64 field_offset = le64_to_cpu(f->field_offset[i]); + + if (packed_max + field_offset < packed_max || + packed_max + field_offset > unpacked_max) + return true; + } + + return false; +} +#endif + +/* + * Returns a packed key that compares <= in + * + * This is used in bset_search_tree(), where we need a packed pos in order to be + * able to compare against the keys in the auxiliary search tree - and it's + * legal to use a packed pos that isn't equivalent to the original pos, + * _provided_ it compares <= to the original pos. + */ +enum bkey_pack_pos_ret bch2_bkey_pack_pos_lossy(struct bkey_packed *out, + struct bpos in, + const struct btree *b) +{ + const struct bkey_format *f = &b->format; + struct pack_state state = pack_state_init(f, out); + u64 *w = out->_data; +#ifdef CONFIG_BCACHEFS_DEBUG + struct bpos orig = in; +#endif + bool exact = true; + unsigned i; + + /* + * bch2_bkey_pack_key() will write to all of f->key_u64s, minus the 3 + * byte header, but pack_pos() won't if the len/version fields are big + * enough - we need to make sure to zero them out: + */ + for (i = 0; i < f->key_u64s; i++) + w[i] = 0; + + if (unlikely(in.snapshot < + le64_to_cpu(f->field_offset[BKEY_FIELD_SNAPSHOT]))) { + if (!in.offset-- && + !in.inode--) + return BKEY_PACK_POS_FAIL; + in.snapshot = KEY_SNAPSHOT_MAX; + exact = false; + } + + if (unlikely(in.offset < + le64_to_cpu(f->field_offset[BKEY_FIELD_OFFSET]))) { + if (!in.inode--) + return BKEY_PACK_POS_FAIL; + in.offset = KEY_OFFSET_MAX; + in.snapshot = KEY_SNAPSHOT_MAX; + exact = false; + } + + if (unlikely(in.inode < + le64_to_cpu(f->field_offset[BKEY_FIELD_INODE]))) + return BKEY_PACK_POS_FAIL; + + if (unlikely(!set_inc_field_lossy(&state, BKEY_FIELD_INODE, in.inode))) { + in.offset = KEY_OFFSET_MAX; + in.snapshot = KEY_SNAPSHOT_MAX; + exact = false; + } + + if (unlikely(!set_inc_field_lossy(&state, BKEY_FIELD_OFFSET, in.offset))) { + in.snapshot = KEY_SNAPSHOT_MAX; + exact = false; + } + + if (unlikely(!set_inc_field_lossy(&state, BKEY_FIELD_SNAPSHOT, in.snapshot))) + exact = false; + + pack_state_finish(&state, out); + out->u64s = f->key_u64s; + out->format = KEY_FORMAT_LOCAL_BTREE; + out->type = KEY_TYPE_deleted; + +#ifdef CONFIG_BCACHEFS_DEBUG + if (exact) { + BUG_ON(bkey_cmp_left_packed(b, out, &orig)); + } else { + struct bkey_packed successor; + + BUG_ON(bkey_cmp_left_packed(b, out, &orig) >= 0); + BUG_ON(bkey_packed_successor(&successor, b, *out) && + bkey_cmp_left_packed(b, &successor, &orig) < 0 && + !bkey_format_has_too_big_fields(f)); + } +#endif + + return exact ? BKEY_PACK_POS_EXACT : BKEY_PACK_POS_SMALLER; +} + +void bch2_bkey_format_init(struct bkey_format_state *s) +{ + unsigned i; + + for (i = 0; i < ARRAY_SIZE(s->field_min); i++) + s->field_min[i] = U64_MAX; + + for (i = 0; i < ARRAY_SIZE(s->field_max); i++) + s->field_max[i] = 0; + + /* Make sure we can store a size of 0: */ + s->field_min[BKEY_FIELD_SIZE] = 0; +} + +void bch2_bkey_format_add_pos(struct bkey_format_state *s, struct bpos p) +{ + unsigned field = 0; + + __bkey_format_add(s, field++, p.inode); + __bkey_format_add(s, field++, p.offset); + __bkey_format_add(s, field++, p.snapshot); +} + +/* + * We don't want it to be possible for the packed format to represent fields + * bigger than a u64... that will cause confusion and issues (like with + * bkey_packed_successor()) + */ +static void set_format_field(struct bkey_format *f, enum bch_bkey_fields i, + unsigned bits, u64 offset) +{ + unsigned unpacked_bits = bch2_bkey_format_current.bits_per_field[i]; + u64 unpacked_max = ~((~0ULL << 1) << (unpacked_bits - 1)); + + bits = min(bits, unpacked_bits); + + offset = bits == unpacked_bits ? 0 : min(offset, unpacked_max - ((1ULL << bits) - 1)); + + f->bits_per_field[i] = bits; + f->field_offset[i] = cpu_to_le64(offset); +} + +struct bkey_format bch2_bkey_format_done(struct bkey_format_state *s) +{ + unsigned i, bits = KEY_PACKED_BITS_START; + struct bkey_format ret = { + .nr_fields = BKEY_NR_FIELDS, + }; + + for (i = 0; i < ARRAY_SIZE(s->field_min); i++) { + s->field_min[i] = min(s->field_min[i], s->field_max[i]); + + set_format_field(&ret, i, + fls64(s->field_max[i] - s->field_min[i]), + s->field_min[i]); + + bits += ret.bits_per_field[i]; + } + + /* allow for extent merging: */ + if (ret.bits_per_field[BKEY_FIELD_SIZE]) { + unsigned b = min(4U, 32U - ret.bits_per_field[BKEY_FIELD_SIZE]); + + ret.bits_per_field[BKEY_FIELD_SIZE] += b; + bits += b; + } + + ret.key_u64s = DIV_ROUND_UP(bits, 64); + + /* if we have enough spare bits, round fields up to nearest byte */ + bits = ret.key_u64s * 64 - bits; + + for (i = 0; i < ARRAY_SIZE(ret.bits_per_field); i++) { + unsigned r = round_up(ret.bits_per_field[i], 8) - + ret.bits_per_field[i]; + + if (r <= bits) { + set_format_field(&ret, i, + ret.bits_per_field[i] + r, + le64_to_cpu(ret.field_offset[i])); + bits -= r; + } + } + +#ifdef CONFIG_BCACHEFS_DEBUG + { + struct printbuf buf = PRINTBUF; + + BUG_ON(bch2_bkey_format_invalid(NULL, &ret, 0, &buf)); + printbuf_exit(&buf); + } +#endif + return ret; +} + +int bch2_bkey_format_invalid(struct bch_fs *c, + struct bkey_format *f, + enum bkey_invalid_flags flags, + struct printbuf *err) +{ + unsigned i, bits = KEY_PACKED_BITS_START; + + if (f->nr_fields != BKEY_NR_FIELDS) { + prt_printf(err, "incorrect number of fields: got %u, should be %u", + f->nr_fields, BKEY_NR_FIELDS); + return -BCH_ERR_invalid; + } + + /* + * Verify that the packed format can't represent fields larger than the + * unpacked format: + */ + for (i = 0; i < f->nr_fields; i++) { + if (!c || c->sb.version_min >= bcachefs_metadata_version_snapshot) { + unsigned unpacked_bits = bch2_bkey_format_current.bits_per_field[i]; + u64 unpacked_max = ~((~0ULL << 1) << (unpacked_bits - 1)); + u64 packed_max = f->bits_per_field[i] + ? ~((~0ULL << 1) << (f->bits_per_field[i] - 1)) + : 0; + u64 field_offset = le64_to_cpu(f->field_offset[i]); + + if (packed_max + field_offset < packed_max || + packed_max + field_offset > unpacked_max) { + prt_printf(err, "field %u too large: %llu + %llu > %llu", + i, packed_max, field_offset, unpacked_max); + return -BCH_ERR_invalid; + } + } + + bits += f->bits_per_field[i]; + } + + if (f->key_u64s != DIV_ROUND_UP(bits, 64)) { + prt_printf(err, "incorrect key_u64s: got %u, should be %u", + f->key_u64s, DIV_ROUND_UP(bits, 64)); + return -BCH_ERR_invalid; + } + + return 0; +} + +void bch2_bkey_format_to_text(struct printbuf *out, const struct bkey_format *f) +{ + prt_printf(out, "u64s %u fields ", f->key_u64s); + + for (unsigned i = 0; i < ARRAY_SIZE(f->bits_per_field); i++) { + if (i) + prt_str(out, ", "); + prt_printf(out, "%u:%llu", + f->bits_per_field[i], + le64_to_cpu(f->field_offset[i])); + } +} + +/* + * Most significant differing bit + * Bits are indexed from 0 - return is [0, nr_key_bits) + */ +__pure +unsigned bch2_bkey_greatest_differing_bit(const struct btree *b, + const struct bkey_packed *l_k, + const struct bkey_packed *r_k) +{ + const u64 *l = high_word(&b->format, l_k); + const u64 *r = high_word(&b->format, r_k); + unsigned nr_key_bits = b->nr_key_bits; + unsigned word_bits = 64 - high_bit_offset; + u64 l_v, r_v; + + EBUG_ON(b->nr_key_bits != bkey_format_key_bits(&b->format)); + + /* for big endian, skip past header */ + l_v = *l & (~0ULL >> high_bit_offset); + r_v = *r & (~0ULL >> high_bit_offset); + + while (nr_key_bits) { + if (nr_key_bits < word_bits) { + l_v >>= word_bits - nr_key_bits; + r_v >>= word_bits - nr_key_bits; + nr_key_bits = 0; + } else { + nr_key_bits -= word_bits; + } + + if (l_v != r_v) + return fls64(l_v ^ r_v) - 1 + nr_key_bits; + + l = next_word(l); + r = next_word(r); + + l_v = *l; + r_v = *r; + word_bits = 64; + } + + return 0; +} + +/* + * First set bit + * Bits are indexed from 0 - return is [0, nr_key_bits) + */ +__pure +unsigned bch2_bkey_ffs(const struct btree *b, const struct bkey_packed *k) +{ + const u64 *p = high_word(&b->format, k); + unsigned nr_key_bits = b->nr_key_bits; + unsigned ret = 0, offset; + + EBUG_ON(b->nr_key_bits != bkey_format_key_bits(&b->format)); + + offset = nr_key_bits; + while (offset > 64) { + p = next_word(p); + offset -= 64; + } + + offset = 64 - offset; + + while (nr_key_bits) { + unsigned bits = nr_key_bits + offset < 64 + ? nr_key_bits + : 64 - offset; + + u64 mask = (~0ULL >> (64 - bits)) << offset; + + if (*p & mask) + return ret + __ffs64(*p & mask) - offset; + + p = prev_word(p); + nr_key_bits -= bits; + ret += bits; + offset = 0; + } + + return 0; +} + +#ifdef HAVE_BCACHEFS_COMPILED_UNPACK + +#define I(_x) (*(out)++ = (_x)) +#define I1(i0) I(i0) +#define I2(i0, i1) (I1(i0), I(i1)) +#define I3(i0, i1, i2) (I2(i0, i1), I(i2)) +#define I4(i0, i1, i2, i3) (I3(i0, i1, i2), I(i3)) +#define I5(i0, i1, i2, i3, i4) (I4(i0, i1, i2, i3), I(i4)) + +static u8 *compile_bkey_field(const struct bkey_format *format, u8 *out, + enum bch_bkey_fields field, + unsigned dst_offset, unsigned dst_size, + bool *eax_zeroed) +{ + unsigned bits = format->bits_per_field[field]; + u64 offset = le64_to_cpu(format->field_offset[field]); + unsigned i, byte, bit_offset, align, shl, shr; + + if (!bits && !offset) { + if (!*eax_zeroed) { + /* xor eax, eax */ + I2(0x31, 0xc0); + } + + *eax_zeroed = true; + goto set_field; + } + + if (!bits) { + /* just return offset: */ + + switch (dst_size) { + case 8: + if (offset > S32_MAX) { + /* mov [rdi + dst_offset], offset */ + I3(0xc7, 0x47, dst_offset); + memcpy(out, &offset, 4); + out += 4; + + I3(0xc7, 0x47, dst_offset + 4); + memcpy(out, (void *) &offset + 4, 4); + out += 4; + } else { + /* mov [rdi + dst_offset], offset */ + /* sign extended */ + I4(0x48, 0xc7, 0x47, dst_offset); + memcpy(out, &offset, 4); + out += 4; + } + break; + case 4: + /* mov [rdi + dst_offset], offset */ + I3(0xc7, 0x47, dst_offset); + memcpy(out, &offset, 4); + out += 4; + break; + default: + BUG(); + } + + return out; + } + + bit_offset = format->key_u64s * 64; + for (i = 0; i <= field; i++) + bit_offset -= format->bits_per_field[i]; + + byte = bit_offset / 8; + bit_offset -= byte * 8; + + *eax_zeroed = false; + + if (bit_offset == 0 && bits == 8) { + /* movzx eax, BYTE PTR [rsi + imm8] */ + I4(0x0f, 0xb6, 0x46, byte); + } else if (bit_offset == 0 && bits == 16) { + /* movzx eax, WORD PTR [rsi + imm8] */ + I4(0x0f, 0xb7, 0x46, byte); + } else if (bit_offset + bits <= 32) { + align = min(4 - DIV_ROUND_UP(bit_offset + bits, 8), byte & 3); + byte -= align; + bit_offset += align * 8; + + BUG_ON(bit_offset + bits > 32); + + /* mov eax, [rsi + imm8] */ + I3(0x8b, 0x46, byte); + + if (bit_offset) { + /* shr eax, imm8 */ + I3(0xc1, 0xe8, bit_offset); + } + + if (bit_offset + bits < 32) { + unsigned mask = ~0U >> (32 - bits); + + /* and eax, imm32 */ + I1(0x25); + memcpy(out, &mask, 4); + out += 4; + } + } else if (bit_offset + bits <= 64) { + align = min(8 - DIV_ROUND_UP(bit_offset + bits, 8), byte & 7); + byte -= align; + bit_offset += align * 8; + + BUG_ON(bit_offset + bits > 64); + + /* mov rax, [rsi + imm8] */ + I4(0x48, 0x8b, 0x46, byte); + + shl = 64 - bit_offset - bits; + shr = bit_offset + shl; + + if (shl) { + /* shl rax, imm8 */ + I4(0x48, 0xc1, 0xe0, shl); + } + + if (shr) { + /* shr rax, imm8 */ + I4(0x48, 0xc1, 0xe8, shr); + } + } else { + align = min(4 - DIV_ROUND_UP(bit_offset + bits, 8), byte & 3); + byte -= align; + bit_offset += align * 8; + + BUG_ON(bit_offset + bits > 96); + + /* mov rax, [rsi + byte] */ + I4(0x48, 0x8b, 0x46, byte); + + /* mov edx, [rsi + byte + 8] */ + I3(0x8b, 0x56, byte + 8); + + /* bits from next word: */ + shr = bit_offset + bits - 64; + BUG_ON(shr > bit_offset); + + /* shr rax, bit_offset */ + I4(0x48, 0xc1, 0xe8, shr); + + /* shl rdx, imm8 */ + I4(0x48, 0xc1, 0xe2, 64 - shr); + + /* or rax, rdx */ + I3(0x48, 0x09, 0xd0); + + shr = bit_offset - shr; + + if (shr) { + /* shr rax, imm8 */ + I4(0x48, 0xc1, 0xe8, shr); + } + } + + /* rax += offset: */ + if (offset > S32_MAX) { + /* mov rdx, imm64 */ + I2(0x48, 0xba); + memcpy(out, &offset, 8); + out += 8; + /* add %rdx, %rax */ + I3(0x48, 0x01, 0xd0); + } else if (offset + (~0ULL >> (64 - bits)) > U32_MAX) { + /* add rax, imm32 */ + I2(0x48, 0x05); + memcpy(out, &offset, 4); + out += 4; + } else if (offset) { + /* add eax, imm32 */ + I1(0x05); + memcpy(out, &offset, 4); + out += 4; + } +set_field: + switch (dst_size) { + case 8: + /* mov [rdi + dst_offset], rax */ + I4(0x48, 0x89, 0x47, dst_offset); + break; + case 4: + /* mov [rdi + dst_offset], eax */ + I3(0x89, 0x47, dst_offset); + break; + default: + BUG(); + } + + return out; +} + +int bch2_compile_bkey_format(const struct bkey_format *format, void *_out) +{ + bool eax_zeroed = false; + u8 *out = _out; + + /* + * rdi: dst - unpacked key + * rsi: src - packed key + */ + + /* k->u64s, k->format, k->type */ + + /* mov eax, [rsi] */ + I2(0x8b, 0x06); + + /* add eax, BKEY_U64s - format->key_u64s */ + I5(0x05, BKEY_U64s - format->key_u64s, KEY_FORMAT_CURRENT, 0, 0); + + /* and eax, imm32: mask out k->pad: */ + I5(0x25, 0xff, 0xff, 0xff, 0); + + /* mov [rdi], eax */ + I2(0x89, 0x07); + +#define x(id, field) \ + out = compile_bkey_field(format, out, id, \ + offsetof(struct bkey, field), \ + sizeof(((struct bkey *) NULL)->field), \ + &eax_zeroed); + bkey_fields() +#undef x + + /* retq */ + I1(0xc3); + + return (void *) out - _out; +} + +#else +#endif + +__pure +int __bch2_bkey_cmp_packed_format_checked(const struct bkey_packed *l, + const struct bkey_packed *r, + const struct btree *b) +{ + return __bch2_bkey_cmp_packed_format_checked_inlined(l, r, b); +} + +__pure __flatten +int __bch2_bkey_cmp_left_packed_format_checked(const struct btree *b, + const struct bkey_packed *l, + const struct bpos *r) +{ + return bpos_cmp(bkey_unpack_pos_format_checked(b, l), *r); +} + +__pure __flatten +int bch2_bkey_cmp_packed(const struct btree *b, + const struct bkey_packed *l, + const struct bkey_packed *r) +{ + return bch2_bkey_cmp_packed_inlined(b, l, r); +} + +__pure __flatten +int __bch2_bkey_cmp_left_packed(const struct btree *b, + const struct bkey_packed *l, + const struct bpos *r) +{ + const struct bkey *l_unpacked; + + return unlikely(l_unpacked = packed_to_bkey_c(l)) + ? bpos_cmp(l_unpacked->p, *r) + : __bch2_bkey_cmp_left_packed_format_checked(b, l, r); +} + +void bch2_bpos_swab(struct bpos *p) +{ + u8 *l = (u8 *) p; + u8 *h = ((u8 *) &p[1]) - 1; + + while (l < h) { + swap(*l, *h); + l++; + --h; + } +} + +void bch2_bkey_swab_key(const struct bkey_format *_f, struct bkey_packed *k) +{ + const struct bkey_format *f = bkey_packed(k) ? _f : &bch2_bkey_format_current; + u8 *l = k->key_start; + u8 *h = (u8 *) (k->_data + f->key_u64s) - 1; + + while (l < h) { + swap(*l, *h); + l++; + --h; + } +} + +#ifdef CONFIG_BCACHEFS_DEBUG +void bch2_bkey_pack_test(void) +{ + struct bkey t = KEY(4134ULL, 1250629070527416633ULL, 0); + struct bkey_packed p; + + struct bkey_format test_format = { + .key_u64s = 3, + .nr_fields = BKEY_NR_FIELDS, + .bits_per_field = { + 13, + 64, + 32, + }, + }; + + struct unpack_state in_s = + unpack_state_init(&bch2_bkey_format_current, (void *) &t); + struct pack_state out_s = pack_state_init(&test_format, &p); + unsigned i; + + for (i = 0; i < out_s.format->nr_fields; i++) { + u64 a, v = get_inc_field(&in_s, i); + + switch (i) { +#define x(id, field) case id: a = t.field; break; + bkey_fields() +#undef x + default: + BUG(); + } + + if (a != v) + panic("got %llu actual %llu i %u\n", v, a, i); + + if (!set_inc_field(&out_s, i, v)) + panic("failed at %u\n", i); + } + + BUG_ON(!bch2_bkey_pack_key(&p, &t, &test_format)); +} +#endif diff --git a/fs/bcachefs/bkey.h b/fs/bcachefs/bkey.h new file mode 100644 index 000000000000..831be01809f2 --- /dev/null +++ b/fs/bcachefs/bkey.h @@ -0,0 +1,778 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_BKEY_H +#define _BCACHEFS_BKEY_H + +#include <linux/bug.h> +#include "bcachefs_format.h" + +#include "btree_types.h" +#include "util.h" +#include "vstructs.h" + +enum bkey_invalid_flags { + BKEY_INVALID_WRITE = (1U << 0), + BKEY_INVALID_COMMIT = (1U << 1), + BKEY_INVALID_JOURNAL = (1U << 2), +}; + +#if 0 + +/* + * compiled unpack functions are disabled, pending a new interface for + * dynamically allocating executable memory: + */ + +#ifdef CONFIG_X86_64 +#define HAVE_BCACHEFS_COMPILED_UNPACK 1 +#endif +#endif + +void bch2_bkey_packed_to_binary_text(struct printbuf *, + const struct bkey_format *, + const struct bkey_packed *); + +/* bkey with split value, const */ +struct bkey_s_c { + const struct bkey *k; + const struct bch_val *v; +}; + +/* bkey with split value */ +struct bkey_s { + union { + struct { + struct bkey *k; + struct bch_val *v; + }; + struct bkey_s_c s_c; + }; +}; + +#define bkey_p_next(_k) vstruct_next(_k) + +static inline struct bkey_i *bkey_next(struct bkey_i *k) +{ + return (struct bkey_i *) ((u64 *) k->_data + k->k.u64s); +} + +#define bkey_val_u64s(_k) ((_k)->u64s - BKEY_U64s) + +static inline size_t bkey_val_bytes(const struct bkey *k) +{ + return bkey_val_u64s(k) * sizeof(u64); +} + +static inline void set_bkey_val_u64s(struct bkey *k, unsigned val_u64s) +{ + unsigned u64s = BKEY_U64s + val_u64s; + + BUG_ON(u64s > U8_MAX); + k->u64s = u64s; +} + +static inline void set_bkey_val_bytes(struct bkey *k, unsigned bytes) +{ + set_bkey_val_u64s(k, DIV_ROUND_UP(bytes, sizeof(u64))); +} + +#define bkey_val_end(_k) ((void *) (((u64 *) (_k).v) + bkey_val_u64s((_k).k))) + +#define bkey_deleted(_k) ((_k)->type == KEY_TYPE_deleted) + +#define bkey_whiteout(_k) \ + ((_k)->type == KEY_TYPE_deleted || (_k)->type == KEY_TYPE_whiteout) + +enum bkey_lr_packed { + BKEY_PACKED_BOTH, + BKEY_PACKED_RIGHT, + BKEY_PACKED_LEFT, + BKEY_PACKED_NONE, +}; + +#define bkey_lr_packed(_l, _r) \ + ((_l)->format + ((_r)->format << 1)) + +static inline void bkey_p_copy(struct bkey_packed *dst, const struct bkey_packed *src) +{ + memcpy_u64s_small(dst, src, src->u64s); +} + +static inline void bkey_copy(struct bkey_i *dst, const struct bkey_i *src) +{ + memcpy_u64s_small(dst, src, src->k.u64s); +} + +struct btree; + +__pure +unsigned bch2_bkey_greatest_differing_bit(const struct btree *, + const struct bkey_packed *, + const struct bkey_packed *); +__pure +unsigned bch2_bkey_ffs(const struct btree *, const struct bkey_packed *); + +__pure +int __bch2_bkey_cmp_packed_format_checked(const struct bkey_packed *, + const struct bkey_packed *, + const struct btree *); + +__pure +int __bch2_bkey_cmp_left_packed_format_checked(const struct btree *, + const struct bkey_packed *, + const struct bpos *); + +__pure +int bch2_bkey_cmp_packed(const struct btree *, + const struct bkey_packed *, + const struct bkey_packed *); + +__pure +int __bch2_bkey_cmp_left_packed(const struct btree *, + const struct bkey_packed *, + const struct bpos *); + +static inline __pure +int bkey_cmp_left_packed(const struct btree *b, + const struct bkey_packed *l, const struct bpos *r) +{ + return __bch2_bkey_cmp_left_packed(b, l, r); +} + +/* + * The compiler generates better code when we pass bpos by ref, but it's often + * enough terribly convenient to pass it by val... as much as I hate c++, const + * ref would be nice here: + */ +__pure __flatten +static inline int bkey_cmp_left_packed_byval(const struct btree *b, + const struct bkey_packed *l, + struct bpos r) +{ + return bkey_cmp_left_packed(b, l, &r); +} + +static __always_inline bool bpos_eq(struct bpos l, struct bpos r) +{ + return !((l.inode ^ r.inode) | + (l.offset ^ r.offset) | + (l.snapshot ^ r.snapshot)); +} + +static __always_inline bool bpos_lt(struct bpos l, struct bpos r) +{ + return l.inode != r.inode ? l.inode < r.inode : + l.offset != r.offset ? l.offset < r.offset : + l.snapshot != r.snapshot ? l.snapshot < r.snapshot : false; +} + +static __always_inline bool bpos_le(struct bpos l, struct bpos r) +{ + return l.inode != r.inode ? l.inode < r.inode : + l.offset != r.offset ? l.offset < r.offset : + l.snapshot != r.snapshot ? l.snapshot < r.snapshot : true; +} + +static __always_inline bool bpos_gt(struct bpos l, struct bpos r) +{ + return bpos_lt(r, l); +} + +static __always_inline bool bpos_ge(struct bpos l, struct bpos r) +{ + return bpos_le(r, l); +} + +static __always_inline int bpos_cmp(struct bpos l, struct bpos r) +{ + return cmp_int(l.inode, r.inode) ?: + cmp_int(l.offset, r.offset) ?: + cmp_int(l.snapshot, r.snapshot); +} + +static inline struct bpos bpos_min(struct bpos l, struct bpos r) +{ + return bpos_lt(l, r) ? l : r; +} + +static inline struct bpos bpos_max(struct bpos l, struct bpos r) +{ + return bpos_gt(l, r) ? l : r; +} + +static __always_inline bool bkey_eq(struct bpos l, struct bpos r) +{ + return !((l.inode ^ r.inode) | + (l.offset ^ r.offset)); +} + +static __always_inline bool bkey_lt(struct bpos l, struct bpos r) +{ + return l.inode != r.inode + ? l.inode < r.inode + : l.offset < r.offset; +} + +static __always_inline bool bkey_le(struct bpos l, struct bpos r) +{ + return l.inode != r.inode + ? l.inode < r.inode + : l.offset <= r.offset; +} + +static __always_inline bool bkey_gt(struct bpos l, struct bpos r) +{ + return bkey_lt(r, l); +} + +static __always_inline bool bkey_ge(struct bpos l, struct bpos r) +{ + return bkey_le(r, l); +} + +static __always_inline int bkey_cmp(struct bpos l, struct bpos r) +{ + return cmp_int(l.inode, r.inode) ?: + cmp_int(l.offset, r.offset); +} + +static inline struct bpos bkey_min(struct bpos l, struct bpos r) +{ + return bkey_lt(l, r) ? l : r; +} + +static inline struct bpos bkey_max(struct bpos l, struct bpos r) +{ + return bkey_gt(l, r) ? l : r; +} + +void bch2_bpos_swab(struct bpos *); +void bch2_bkey_swab_key(const struct bkey_format *, struct bkey_packed *); + +static __always_inline int bversion_cmp(struct bversion l, struct bversion r) +{ + return cmp_int(l.hi, r.hi) ?: + cmp_int(l.lo, r.lo); +} + +#define ZERO_VERSION ((struct bversion) { .hi = 0, .lo = 0 }) +#define MAX_VERSION ((struct bversion) { .hi = ~0, .lo = ~0ULL }) + +static __always_inline int bversion_zero(struct bversion v) +{ + return !bversion_cmp(v, ZERO_VERSION); +} + +#ifdef CONFIG_BCACHEFS_DEBUG +/* statement expressions confusing unlikely()? */ +#define bkey_packed(_k) \ + ({ EBUG_ON((_k)->format > KEY_FORMAT_CURRENT); \ + (_k)->format != KEY_FORMAT_CURRENT; }) +#else +#define bkey_packed(_k) ((_k)->format != KEY_FORMAT_CURRENT) +#endif + +/* + * It's safe to treat an unpacked bkey as a packed one, but not the reverse + */ +static inline struct bkey_packed *bkey_to_packed(struct bkey_i *k) +{ + return (struct bkey_packed *) k; +} + +static inline const struct bkey_packed *bkey_to_packed_c(const struct bkey_i *k) +{ + return (const struct bkey_packed *) k; +} + +static inline struct bkey_i *packed_to_bkey(struct bkey_packed *k) +{ + return bkey_packed(k) ? NULL : (struct bkey_i *) k; +} + +static inline const struct bkey *packed_to_bkey_c(const struct bkey_packed *k) +{ + return bkey_packed(k) ? NULL : (const struct bkey *) k; +} + +static inline unsigned bkey_format_key_bits(const struct bkey_format *format) +{ + return format->bits_per_field[BKEY_FIELD_INODE] + + format->bits_per_field[BKEY_FIELD_OFFSET] + + format->bits_per_field[BKEY_FIELD_SNAPSHOT]; +} + +static inline struct bpos bpos_successor(struct bpos p) +{ + if (!++p.snapshot && + !++p.offset && + !++p.inode) + BUG(); + + return p; +} + +static inline struct bpos bpos_predecessor(struct bpos p) +{ + if (!p.snapshot-- && + !p.offset-- && + !p.inode--) + BUG(); + + return p; +} + +static inline struct bpos bpos_nosnap_successor(struct bpos p) +{ + p.snapshot = 0; + + if (!++p.offset && + !++p.inode) + BUG(); + + return p; +} + +static inline struct bpos bpos_nosnap_predecessor(struct bpos p) +{ + p.snapshot = 0; + + if (!p.offset-- && + !p.inode--) + BUG(); + + return p; +} + +static inline u64 bkey_start_offset(const struct bkey *k) +{ + return k->p.offset - k->size; +} + +static inline struct bpos bkey_start_pos(const struct bkey *k) +{ + return (struct bpos) { + .inode = k->p.inode, + .offset = bkey_start_offset(k), + .snapshot = k->p.snapshot, + }; +} + +/* Packed helpers */ + +static inline unsigned bkeyp_key_u64s(const struct bkey_format *format, + const struct bkey_packed *k) +{ + unsigned ret = bkey_packed(k) ? format->key_u64s : BKEY_U64s; + + EBUG_ON(k->u64s < ret); + return ret; +} + +static inline unsigned bkeyp_key_bytes(const struct bkey_format *format, + const struct bkey_packed *k) +{ + return bkeyp_key_u64s(format, k) * sizeof(u64); +} + +static inline unsigned bkeyp_val_u64s(const struct bkey_format *format, + const struct bkey_packed *k) +{ + return k->u64s - bkeyp_key_u64s(format, k); +} + +static inline size_t bkeyp_val_bytes(const struct bkey_format *format, + const struct bkey_packed *k) +{ + return bkeyp_val_u64s(format, k) * sizeof(u64); +} + +static inline void set_bkeyp_val_u64s(const struct bkey_format *format, + struct bkey_packed *k, unsigned val_u64s) +{ + k->u64s = bkeyp_key_u64s(format, k) + val_u64s; +} + +#define bkeyp_val(_format, _k) \ + ((struct bch_val *) ((u64 *) (_k)->_data + bkeyp_key_u64s(_format, _k))) + +extern const struct bkey_format bch2_bkey_format_current; + +bool bch2_bkey_transform(const struct bkey_format *, + struct bkey_packed *, + const struct bkey_format *, + const struct bkey_packed *); + +struct bkey __bch2_bkey_unpack_key(const struct bkey_format *, + const struct bkey_packed *); + +#ifndef HAVE_BCACHEFS_COMPILED_UNPACK +struct bpos __bkey_unpack_pos(const struct bkey_format *, + const struct bkey_packed *); +#endif + +bool bch2_bkey_pack_key(struct bkey_packed *, const struct bkey *, + const struct bkey_format *); + +enum bkey_pack_pos_ret { + BKEY_PACK_POS_EXACT, + BKEY_PACK_POS_SMALLER, + BKEY_PACK_POS_FAIL, +}; + +enum bkey_pack_pos_ret bch2_bkey_pack_pos_lossy(struct bkey_packed *, struct bpos, + const struct btree *); + +static inline bool bkey_pack_pos(struct bkey_packed *out, struct bpos in, + const struct btree *b) +{ + return bch2_bkey_pack_pos_lossy(out, in, b) == BKEY_PACK_POS_EXACT; +} + +void bch2_bkey_unpack(const struct btree *, struct bkey_i *, + const struct bkey_packed *); +bool bch2_bkey_pack(struct bkey_packed *, const struct bkey_i *, + const struct bkey_format *); + +typedef void (*compiled_unpack_fn)(struct bkey *, const struct bkey_packed *); + +static inline void +__bkey_unpack_key_format_checked(const struct btree *b, + struct bkey *dst, + const struct bkey_packed *src) +{ + if (IS_ENABLED(HAVE_BCACHEFS_COMPILED_UNPACK)) { + compiled_unpack_fn unpack_fn = b->aux_data; + unpack_fn(dst, src); + + if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG) && + bch2_expensive_debug_checks) { + struct bkey dst2 = __bch2_bkey_unpack_key(&b->format, src); + + BUG_ON(memcmp(dst, &dst2, sizeof(*dst))); + } + } else { + *dst = __bch2_bkey_unpack_key(&b->format, src); + } +} + +static inline struct bkey +bkey_unpack_key_format_checked(const struct btree *b, + const struct bkey_packed *src) +{ + struct bkey dst; + + __bkey_unpack_key_format_checked(b, &dst, src); + return dst; +} + +static inline void __bkey_unpack_key(const struct btree *b, + struct bkey *dst, + const struct bkey_packed *src) +{ + if (likely(bkey_packed(src))) + __bkey_unpack_key_format_checked(b, dst, src); + else + *dst = *packed_to_bkey_c(src); +} + +/** + * bkey_unpack_key -- unpack just the key, not the value + */ +static inline struct bkey bkey_unpack_key(const struct btree *b, + const struct bkey_packed *src) +{ + return likely(bkey_packed(src)) + ? bkey_unpack_key_format_checked(b, src) + : *packed_to_bkey_c(src); +} + +static inline struct bpos +bkey_unpack_pos_format_checked(const struct btree *b, + const struct bkey_packed *src) +{ +#ifdef HAVE_BCACHEFS_COMPILED_UNPACK + return bkey_unpack_key_format_checked(b, src).p; +#else + return __bkey_unpack_pos(&b->format, src); +#endif +} + +static inline struct bpos bkey_unpack_pos(const struct btree *b, + const struct bkey_packed *src) +{ + return likely(bkey_packed(src)) + ? bkey_unpack_pos_format_checked(b, src) + : packed_to_bkey_c(src)->p; +} + +/* Disassembled bkeys */ + +static inline struct bkey_s_c bkey_disassemble(const struct btree *b, + const struct bkey_packed *k, + struct bkey *u) +{ + __bkey_unpack_key(b, u, k); + + return (struct bkey_s_c) { u, bkeyp_val(&b->format, k), }; +} + +/* non const version: */ +static inline struct bkey_s __bkey_disassemble(const struct btree *b, + struct bkey_packed *k, + struct bkey *u) +{ + __bkey_unpack_key(b, u, k); + + return (struct bkey_s) { .k = u, .v = bkeyp_val(&b->format, k), }; +} + +static inline u64 bkey_field_max(const struct bkey_format *f, + enum bch_bkey_fields nr) +{ + return f->bits_per_field[nr] < 64 + ? (le64_to_cpu(f->field_offset[nr]) + + ~(~0ULL << f->bits_per_field[nr])) + : U64_MAX; +} + +#ifdef HAVE_BCACHEFS_COMPILED_UNPACK + +int bch2_compile_bkey_format(const struct bkey_format *, void *); + +#else + +static inline int bch2_compile_bkey_format(const struct bkey_format *format, + void *out) { return 0; } + +#endif + +static inline void bkey_reassemble(struct bkey_i *dst, + struct bkey_s_c src) +{ + dst->k = *src.k; + memcpy_u64s_small(&dst->v, src.v, bkey_val_u64s(src.k)); +} + +#define bkey_s_null ((struct bkey_s) { .k = NULL }) +#define bkey_s_c_null ((struct bkey_s_c) { .k = NULL }) + +#define bkey_s_err(err) ((struct bkey_s) { .k = ERR_PTR(err) }) +#define bkey_s_c_err(err) ((struct bkey_s_c) { .k = ERR_PTR(err) }) + +static inline struct bkey_s bkey_to_s(struct bkey *k) +{ + return (struct bkey_s) { .k = k, .v = NULL }; +} + +static inline struct bkey_s_c bkey_to_s_c(const struct bkey *k) +{ + return (struct bkey_s_c) { .k = k, .v = NULL }; +} + +static inline struct bkey_s bkey_i_to_s(struct bkey_i *k) +{ + return (struct bkey_s) { .k = &k->k, .v = &k->v }; +} + +static inline struct bkey_s_c bkey_i_to_s_c(const struct bkey_i *k) +{ + return (struct bkey_s_c) { .k = &k->k, .v = &k->v }; +} + +/* + * For a given type of value (e.g. struct bch_extent), generates the types for + * bkey + bch_extent - inline, split, split const - and also all the conversion + * functions, which also check that the value is of the correct type. + * + * We use anonymous unions for upcasting - e.g. converting from e.g. a + * bkey_i_extent to a bkey_i - since that's always safe, instead of conversion + * functions. + */ +#define x(name, ...) \ +struct bkey_i_##name { \ + union { \ + struct bkey k; \ + struct bkey_i k_i; \ + }; \ + struct bch_##name v; \ +}; \ + \ +struct bkey_s_c_##name { \ + union { \ + struct { \ + const struct bkey *k; \ + const struct bch_##name *v; \ + }; \ + struct bkey_s_c s_c; \ + }; \ +}; \ + \ +struct bkey_s_##name { \ + union { \ + struct { \ + struct bkey *k; \ + struct bch_##name *v; \ + }; \ + struct bkey_s_c_##name c; \ + struct bkey_s s; \ + struct bkey_s_c s_c; \ + }; \ +}; \ + \ +static inline struct bkey_i_##name *bkey_i_to_##name(struct bkey_i *k) \ +{ \ + EBUG_ON(!IS_ERR_OR_NULL(k) && k->k.type != KEY_TYPE_##name); \ + return container_of(&k->k, struct bkey_i_##name, k); \ +} \ + \ +static inline const struct bkey_i_##name * \ +bkey_i_to_##name##_c(const struct bkey_i *k) \ +{ \ + EBUG_ON(!IS_ERR_OR_NULL(k) && k->k.type != KEY_TYPE_##name); \ + return container_of(&k->k, struct bkey_i_##name, k); \ +} \ + \ +static inline struct bkey_s_##name bkey_s_to_##name(struct bkey_s k) \ +{ \ + EBUG_ON(!IS_ERR_OR_NULL(k.k) && k.k->type != KEY_TYPE_##name); \ + return (struct bkey_s_##name) { \ + .k = k.k, \ + .v = container_of(k.v, struct bch_##name, v), \ + }; \ +} \ + \ +static inline struct bkey_s_c_##name bkey_s_c_to_##name(struct bkey_s_c k)\ +{ \ + EBUG_ON(!IS_ERR_OR_NULL(k.k) && k.k->type != KEY_TYPE_##name); \ + return (struct bkey_s_c_##name) { \ + .k = k.k, \ + .v = container_of(k.v, struct bch_##name, v), \ + }; \ +} \ + \ +static inline struct bkey_s_##name name##_i_to_s(struct bkey_i_##name *k)\ +{ \ + return (struct bkey_s_##name) { \ + .k = &k->k, \ + .v = &k->v, \ + }; \ +} \ + \ +static inline struct bkey_s_c_##name \ +name##_i_to_s_c(const struct bkey_i_##name *k) \ +{ \ + return (struct bkey_s_c_##name) { \ + .k = &k->k, \ + .v = &k->v, \ + }; \ +} \ + \ +static inline struct bkey_s_##name bkey_i_to_s_##name(struct bkey_i *k) \ +{ \ + EBUG_ON(!IS_ERR_OR_NULL(k) && k->k.type != KEY_TYPE_##name); \ + return (struct bkey_s_##name) { \ + .k = &k->k, \ + .v = container_of(&k->v, struct bch_##name, v), \ + }; \ +} \ + \ +static inline struct bkey_s_c_##name \ +bkey_i_to_s_c_##name(const struct bkey_i *k) \ +{ \ + EBUG_ON(!IS_ERR_OR_NULL(k) && k->k.type != KEY_TYPE_##name); \ + return (struct bkey_s_c_##name) { \ + .k = &k->k, \ + .v = container_of(&k->v, struct bch_##name, v), \ + }; \ +} \ + \ +static inline struct bkey_i_##name *bkey_##name##_init(struct bkey_i *_k)\ +{ \ + struct bkey_i_##name *k = \ + container_of(&_k->k, struct bkey_i_##name, k); \ + \ + bkey_init(&k->k); \ + memset(&k->v, 0, sizeof(k->v)); \ + k->k.type = KEY_TYPE_##name; \ + set_bkey_val_bytes(&k->k, sizeof(k->v)); \ + \ + return k; \ +} + +BCH_BKEY_TYPES(); +#undef x + +/* byte order helpers */ + +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ + +static inline unsigned high_word_offset(const struct bkey_format *f) +{ + return f->key_u64s - 1; +} + +#define high_bit_offset 0 +#define nth_word(p, n) ((p) - (n)) + +#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ + +static inline unsigned high_word_offset(const struct bkey_format *f) +{ + return 0; +} + +#define high_bit_offset KEY_PACKED_BITS_START +#define nth_word(p, n) ((p) + (n)) + +#else +#error edit for your odd byteorder. +#endif + +#define high_word(f, k) ((u64 *) (k)->_data + high_word_offset(f)) +#define next_word(p) nth_word(p, 1) +#define prev_word(p) nth_word(p, -1) + +#ifdef CONFIG_BCACHEFS_DEBUG +void bch2_bkey_pack_test(void); +#else +static inline void bch2_bkey_pack_test(void) {} +#endif + +#define bkey_fields() \ + x(BKEY_FIELD_INODE, p.inode) \ + x(BKEY_FIELD_OFFSET, p.offset) \ + x(BKEY_FIELD_SNAPSHOT, p.snapshot) \ + x(BKEY_FIELD_SIZE, size) \ + x(BKEY_FIELD_VERSION_HI, version.hi) \ + x(BKEY_FIELD_VERSION_LO, version.lo) + +struct bkey_format_state { + u64 field_min[BKEY_NR_FIELDS]; + u64 field_max[BKEY_NR_FIELDS]; +}; + +void bch2_bkey_format_init(struct bkey_format_state *); + +static inline void __bkey_format_add(struct bkey_format_state *s, unsigned field, u64 v) +{ + s->field_min[field] = min(s->field_min[field], v); + s->field_max[field] = max(s->field_max[field], v); +} + +/* + * Changes @format so that @k can be successfully packed with @format + */ +static inline void bch2_bkey_format_add_key(struct bkey_format_state *s, const struct bkey *k) +{ +#define x(id, field) __bkey_format_add(s, id, k->field); + bkey_fields() +#undef x +} + +void bch2_bkey_format_add_pos(struct bkey_format_state *, struct bpos); +struct bkey_format bch2_bkey_format_done(struct bkey_format_state *); +int bch2_bkey_format_invalid(struct bch_fs *, struct bkey_format *, + enum bkey_invalid_flags, struct printbuf *); +void bch2_bkey_format_to_text(struct printbuf *, const struct bkey_format *); + +#endif /* _BCACHEFS_BKEY_H */ diff --git a/fs/bcachefs/bkey_buf.h b/fs/bcachefs/bkey_buf.h new file mode 100644 index 000000000000..a30c4ae8eb36 --- /dev/null +++ b/fs/bcachefs/bkey_buf.h @@ -0,0 +1,61 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_BKEY_BUF_H +#define _BCACHEFS_BKEY_BUF_H + +#include "bcachefs.h" +#include "bkey.h" + +struct bkey_buf { + struct bkey_i *k; + u64 onstack[12]; +}; + +static inline void bch2_bkey_buf_realloc(struct bkey_buf *s, + struct bch_fs *c, unsigned u64s) +{ + if (s->k == (void *) s->onstack && + u64s > ARRAY_SIZE(s->onstack)) { + s->k = mempool_alloc(&c->large_bkey_pool, GFP_NOFS); + memcpy(s->k, s->onstack, sizeof(s->onstack)); + } +} + +static inline void bch2_bkey_buf_reassemble(struct bkey_buf *s, + struct bch_fs *c, + struct bkey_s_c k) +{ + bch2_bkey_buf_realloc(s, c, k.k->u64s); + bkey_reassemble(s->k, k); +} + +static inline void bch2_bkey_buf_copy(struct bkey_buf *s, + struct bch_fs *c, + struct bkey_i *src) +{ + bch2_bkey_buf_realloc(s, c, src->k.u64s); + bkey_copy(s->k, src); +} + +static inline void bch2_bkey_buf_unpack(struct bkey_buf *s, + struct bch_fs *c, + struct btree *b, + struct bkey_packed *src) +{ + bch2_bkey_buf_realloc(s, c, BKEY_U64s + + bkeyp_val_u64s(&b->format, src)); + bch2_bkey_unpack(b, s->k, src); +} + +static inline void bch2_bkey_buf_init(struct bkey_buf *s) +{ + s->k = (void *) s->onstack; +} + +static inline void bch2_bkey_buf_exit(struct bkey_buf *s, struct bch_fs *c) +{ + if (s->k != (void *) s->onstack) + mempool_free(s->k, &c->large_bkey_pool); + s->k = NULL; +} + +#endif /* _BCACHEFS_BKEY_BUF_H */ diff --git a/fs/bcachefs/bkey_cmp.h b/fs/bcachefs/bkey_cmp.h new file mode 100644 index 000000000000..5f42a6e69360 --- /dev/null +++ b/fs/bcachefs/bkey_cmp.h @@ -0,0 +1,129 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_BKEY_CMP_H +#define _BCACHEFS_BKEY_CMP_H + +#include "bkey.h" + +#ifdef CONFIG_X86_64 +static inline int __bkey_cmp_bits(const u64 *l, const u64 *r, + unsigned nr_key_bits) +{ + long d0, d1, d2, d3; + int cmp; + + /* we shouldn't need asm for this, but gcc is being retarded: */ + + asm(".intel_syntax noprefix;" + "xor eax, eax;" + "xor edx, edx;" + "1:;" + "mov r8, [rdi];" + "mov r9, [rsi];" + "sub ecx, 64;" + "jl 2f;" + + "cmp r8, r9;" + "jnz 3f;" + + "lea rdi, [rdi - 8];" + "lea rsi, [rsi - 8];" + "jmp 1b;" + + "2:;" + "not ecx;" + "shr r8, 1;" + "shr r9, 1;" + "shr r8, cl;" + "shr r9, cl;" + "cmp r8, r9;" + + "3:\n" + "seta al;" + "setb dl;" + "sub eax, edx;" + ".att_syntax prefix;" + : "=&D" (d0), "=&S" (d1), "=&d" (d2), "=&c" (d3), "=&a" (cmp) + : "0" (l), "1" (r), "3" (nr_key_bits) + : "r8", "r9", "cc", "memory"); + + return cmp; +} +#else +static inline int __bkey_cmp_bits(const u64 *l, const u64 *r, + unsigned nr_key_bits) +{ + u64 l_v, r_v; + + if (!nr_key_bits) + return 0; + + /* for big endian, skip past header */ + nr_key_bits += high_bit_offset; + l_v = *l & (~0ULL >> high_bit_offset); + r_v = *r & (~0ULL >> high_bit_offset); + + while (1) { + if (nr_key_bits < 64) { + l_v >>= 64 - nr_key_bits; + r_v >>= 64 - nr_key_bits; + nr_key_bits = 0; + } else { + nr_key_bits -= 64; + } + + if (!nr_key_bits || l_v != r_v) + break; + + l = next_word(l); + r = next_word(r); + + l_v = *l; + r_v = *r; + } + + return cmp_int(l_v, r_v); +} +#endif + +static inline __pure __flatten +int __bch2_bkey_cmp_packed_format_checked_inlined(const struct bkey_packed *l, + const struct bkey_packed *r, + const struct btree *b) +{ + const struct bkey_format *f = &b->format; + int ret; + + EBUG_ON(!bkey_packed(l) || !bkey_packed(r)); + EBUG_ON(b->nr_key_bits != bkey_format_key_bits(f)); + + ret = __bkey_cmp_bits(high_word(f, l), + high_word(f, r), + b->nr_key_bits); + + EBUG_ON(ret != bpos_cmp(bkey_unpack_pos(b, l), + bkey_unpack_pos(b, r))); + return ret; +} + +static inline __pure __flatten +int bch2_bkey_cmp_packed_inlined(const struct btree *b, + const struct bkey_packed *l, + const struct bkey_packed *r) +{ + struct bkey unpacked; + + if (likely(bkey_packed(l) && bkey_packed(r))) + return __bch2_bkey_cmp_packed_format_checked_inlined(l, r, b); + + if (bkey_packed(l)) { + __bkey_unpack_key_format_checked(b, &unpacked, l); + l = (void *) &unpacked; + } else if (bkey_packed(r)) { + __bkey_unpack_key_format_checked(b, &unpacked, r); + r = (void *) &unpacked; + } + + return bpos_cmp(((struct bkey *) l)->p, ((struct bkey *) r)->p); +} + +#endif /* _BCACHEFS_BKEY_CMP_H */ diff --git a/fs/bcachefs/bkey_methods.c b/fs/bcachefs/bkey_methods.c new file mode 100644 index 000000000000..761f5e33b1e6 --- /dev/null +++ b/fs/bcachefs/bkey_methods.c @@ -0,0 +1,459 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" +#include "backpointers.h" +#include "bkey_methods.h" +#include "btree_cache.h" +#include "btree_types.h" +#include "alloc_background.h" +#include "dirent.h" +#include "ec.h" +#include "error.h" +#include "extents.h" +#include "inode.h" +#include "io_misc.h" +#include "lru.h" +#include "quota.h" +#include "reflink.h" +#include "snapshot.h" +#include "subvolume.h" +#include "xattr.h" + +const char * const bch2_bkey_types[] = { +#define x(name, nr) #name, + BCH_BKEY_TYPES() +#undef x + NULL +}; + +static int deleted_key_invalid(struct bch_fs *c, struct bkey_s_c k, + enum bkey_invalid_flags flags, struct printbuf *err) +{ + return 0; +} + +#define bch2_bkey_ops_deleted ((struct bkey_ops) { \ + .key_invalid = deleted_key_invalid, \ +}) + +#define bch2_bkey_ops_whiteout ((struct bkey_ops) { \ + .key_invalid = deleted_key_invalid, \ +}) + +static int empty_val_key_invalid(struct bch_fs *c, struct bkey_s_c k, + enum bkey_invalid_flags flags, struct printbuf *err) +{ + int ret = 0; + + bkey_fsck_err_on(bkey_val_bytes(k.k), c, err, + bkey_val_size_nonzero, + "incorrect value size (%zu != 0)", + bkey_val_bytes(k.k)); +fsck_err: + return ret; +} + +#define bch2_bkey_ops_error ((struct bkey_ops) { \ + .key_invalid = empty_val_key_invalid, \ +}) + +static int key_type_cookie_invalid(struct bch_fs *c, struct bkey_s_c k, + enum bkey_invalid_flags flags, struct printbuf *err) +{ + return 0; +} + +#define bch2_bkey_ops_cookie ((struct bkey_ops) { \ + .key_invalid = key_type_cookie_invalid, \ + .min_val_size = 8, \ +}) + +#define bch2_bkey_ops_hash_whiteout ((struct bkey_ops) {\ + .key_invalid = empty_val_key_invalid, \ +}) + +static int key_type_inline_data_invalid(struct bch_fs *c, struct bkey_s_c k, + enum bkey_invalid_flags flags, struct printbuf *err) +{ + return 0; +} + +static void key_type_inline_data_to_text(struct printbuf *out, struct bch_fs *c, + struct bkey_s_c k) +{ + struct bkey_s_c_inline_data d = bkey_s_c_to_inline_data(k); + unsigned datalen = bkey_inline_data_bytes(k.k); + + prt_printf(out, "datalen %u: %*phN", + datalen, min(datalen, 32U), d.v->data); +} + +#define bch2_bkey_ops_inline_data ((struct bkey_ops) { \ + .key_invalid = key_type_inline_data_invalid, \ + .val_to_text = key_type_inline_data_to_text, \ +}) + +static bool key_type_set_merge(struct bch_fs *c, struct bkey_s l, struct bkey_s_c r) +{ + bch2_key_resize(l.k, l.k->size + r.k->size); + return true; +} + +#define bch2_bkey_ops_set ((struct bkey_ops) { \ + .key_invalid = empty_val_key_invalid, \ + .key_merge = key_type_set_merge, \ +}) + +const struct bkey_ops bch2_bkey_ops[] = { +#define x(name, nr) [KEY_TYPE_##name] = bch2_bkey_ops_##name, + BCH_BKEY_TYPES() +#undef x +}; + +const struct bkey_ops bch2_bkey_null_ops = { +}; + +int bch2_bkey_val_invalid(struct bch_fs *c, struct bkey_s_c k, + enum bkey_invalid_flags flags, + struct printbuf *err) +{ + const struct bkey_ops *ops = bch2_bkey_type_ops(k.k->type); + int ret = 0; + + bkey_fsck_err_on(bkey_val_bytes(k.k) < ops->min_val_size, c, err, + bkey_val_size_too_small, + "bad val size (%zu < %u)", + bkey_val_bytes(k.k), ops->min_val_size); + + if (!ops->key_invalid) + return 0; + + ret = ops->key_invalid(c, k, flags, err); +fsck_err: + return ret; +} + +static u64 bch2_key_types_allowed[] = { + [BKEY_TYPE_btree] = + BIT_ULL(KEY_TYPE_deleted)| + BIT_ULL(KEY_TYPE_btree_ptr)| + BIT_ULL(KEY_TYPE_btree_ptr_v2), +#define x(name, nr, flags, keys) [BKEY_TYPE_##name] = BIT_ULL(KEY_TYPE_deleted)|keys, + BCH_BTREE_IDS() +#undef x +}; + +const char *bch2_btree_node_type_str(enum btree_node_type type) +{ + return type == BKEY_TYPE_btree ? "internal btree node" : bch2_btree_id_str(type - 1); +} + +int __bch2_bkey_invalid(struct bch_fs *c, struct bkey_s_c k, + enum btree_node_type type, + enum bkey_invalid_flags flags, + struct printbuf *err) +{ + int ret = 0; + + bkey_fsck_err_on(k.k->u64s < BKEY_U64s, c, err, + bkey_u64s_too_small, + "u64s too small (%u < %zu)", k.k->u64s, BKEY_U64s); + + if (type >= BKEY_TYPE_NR) + return 0; + + bkey_fsck_err_on((flags & BKEY_INVALID_COMMIT) && + !(bch2_key_types_allowed[type] & BIT_ULL(k.k->type)), c, err, + bkey_invalid_type_for_btree, + "invalid key type for btree %s (%s)", + bch2_btree_node_type_str(type), bch2_bkey_types[k.k->type]); + + if (btree_node_type_is_extents(type) && !bkey_whiteout(k.k)) { + bkey_fsck_err_on(k.k->size == 0, c, err, + bkey_extent_size_zero, + "size == 0"); + + bkey_fsck_err_on(k.k->size > k.k->p.offset, c, err, + bkey_extent_size_greater_than_offset, + "size greater than offset (%u > %llu)", + k.k->size, k.k->p.offset); + } else { + bkey_fsck_err_on(k.k->size, c, err, + bkey_size_nonzero, + "size != 0"); + } + + if (type != BKEY_TYPE_btree) { + enum btree_id btree = type - 1; + + if (btree_type_has_snapshots(btree)) { + bkey_fsck_err_on(!k.k->p.snapshot, c, err, + bkey_snapshot_zero, + "snapshot == 0"); + } else if (!btree_type_has_snapshot_field(btree)) { + bkey_fsck_err_on(k.k->p.snapshot, c, err, + bkey_snapshot_nonzero, + "nonzero snapshot"); + } else { + /* + * btree uses snapshot field but it's not required to be + * nonzero + */ + } + + bkey_fsck_err_on(bkey_eq(k.k->p, POS_MAX), c, err, + bkey_at_pos_max, + "key at POS_MAX"); + } +fsck_err: + return ret; +} + +int bch2_bkey_invalid(struct bch_fs *c, struct bkey_s_c k, + enum btree_node_type type, + enum bkey_invalid_flags flags, + struct printbuf *err) +{ + return __bch2_bkey_invalid(c, k, type, flags, err) ?: + bch2_bkey_val_invalid(c, k, flags, err); +} + +int bch2_bkey_in_btree_node(struct bch_fs *c, struct btree *b, + struct bkey_s_c k, struct printbuf *err) +{ + int ret = 0; + + bkey_fsck_err_on(bpos_lt(k.k->p, b->data->min_key), c, err, + bkey_before_start_of_btree_node, + "key before start of btree node"); + + bkey_fsck_err_on(bpos_gt(k.k->p, b->data->max_key), c, err, + bkey_after_end_of_btree_node, + "key past end of btree node"); +fsck_err: + return ret; +} + +void bch2_bpos_to_text(struct printbuf *out, struct bpos pos) +{ + if (bpos_eq(pos, POS_MIN)) + prt_printf(out, "POS_MIN"); + else if (bpos_eq(pos, POS_MAX)) + prt_printf(out, "POS_MAX"); + else if (bpos_eq(pos, SPOS_MAX)) + prt_printf(out, "SPOS_MAX"); + else { + if (pos.inode == U64_MAX) + prt_printf(out, "U64_MAX"); + else + prt_printf(out, "%llu", pos.inode); + prt_printf(out, ":"); + if (pos.offset == U64_MAX) + prt_printf(out, "U64_MAX"); + else + prt_printf(out, "%llu", pos.offset); + prt_printf(out, ":"); + if (pos.snapshot == U32_MAX) + prt_printf(out, "U32_MAX"); + else + prt_printf(out, "%u", pos.snapshot); + } +} + +void bch2_bkey_to_text(struct printbuf *out, const struct bkey *k) +{ + if (k) { + prt_printf(out, "u64s %u type ", k->u64s); + + if (k->type < KEY_TYPE_MAX) + prt_printf(out, "%s ", bch2_bkey_types[k->type]); + else + prt_printf(out, "%u ", k->type); + + bch2_bpos_to_text(out, k->p); + + prt_printf(out, " len %u ver %llu", k->size, k->version.lo); + } else { + prt_printf(out, "(null)"); + } +} + +void bch2_val_to_text(struct printbuf *out, struct bch_fs *c, + struct bkey_s_c k) +{ + const struct bkey_ops *ops = bch2_bkey_type_ops(k.k->type); + + if (likely(ops->val_to_text)) + ops->val_to_text(out, c, k); +} + +void bch2_bkey_val_to_text(struct printbuf *out, struct bch_fs *c, + struct bkey_s_c k) +{ + bch2_bkey_to_text(out, k.k); + + if (bkey_val_bytes(k.k)) { + prt_printf(out, ": "); + bch2_val_to_text(out, c, k); + } +} + +void bch2_bkey_swab_val(struct bkey_s k) +{ + const struct bkey_ops *ops = bch2_bkey_type_ops(k.k->type); + + if (ops->swab) + ops->swab(k); +} + +bool bch2_bkey_normalize(struct bch_fs *c, struct bkey_s k) +{ + const struct bkey_ops *ops = bch2_bkey_type_ops(k.k->type); + + return ops->key_normalize + ? ops->key_normalize(c, k) + : false; +} + +bool bch2_bkey_merge(struct bch_fs *c, struct bkey_s l, struct bkey_s_c r) +{ + const struct bkey_ops *ops = bch2_bkey_type_ops(l.k->type); + + return ops->key_merge && + bch2_bkey_maybe_mergable(l.k, r.k) && + (u64) l.k->size + r.k->size <= KEY_SIZE_MAX && + !bch2_key_merging_disabled && + ops->key_merge(c, l, r); +} + +static const struct old_bkey_type { + u8 btree_node_type; + u8 old; + u8 new; +} bkey_renumber_table[] = { + {BKEY_TYPE_btree, 128, KEY_TYPE_btree_ptr }, + {BKEY_TYPE_extents, 128, KEY_TYPE_extent }, + {BKEY_TYPE_extents, 129, KEY_TYPE_extent }, + {BKEY_TYPE_extents, 130, KEY_TYPE_reservation }, + {BKEY_TYPE_inodes, 128, KEY_TYPE_inode }, + {BKEY_TYPE_inodes, 130, KEY_TYPE_inode_generation }, + {BKEY_TYPE_dirents, 128, KEY_TYPE_dirent }, + {BKEY_TYPE_dirents, 129, KEY_TYPE_hash_whiteout }, + {BKEY_TYPE_xattrs, 128, KEY_TYPE_xattr }, + {BKEY_TYPE_xattrs, 129, KEY_TYPE_hash_whiteout }, + {BKEY_TYPE_alloc, 128, KEY_TYPE_alloc }, + {BKEY_TYPE_quotas, 128, KEY_TYPE_quota }, +}; + +void bch2_bkey_renumber(enum btree_node_type btree_node_type, + struct bkey_packed *k, + int write) +{ + const struct old_bkey_type *i; + + for (i = bkey_renumber_table; + i < bkey_renumber_table + ARRAY_SIZE(bkey_renumber_table); + i++) + if (btree_node_type == i->btree_node_type && + k->type == (write ? i->new : i->old)) { + k->type = write ? i->old : i->new; + break; + } +} + +void __bch2_bkey_compat(unsigned level, enum btree_id btree_id, + unsigned version, unsigned big_endian, + int write, + struct bkey_format *f, + struct bkey_packed *k) +{ + const struct bkey_ops *ops; + struct bkey uk; + unsigned nr_compat = 5; + int i; + + /* + * Do these operations in reverse order in the write path: + */ + + for (i = 0; i < nr_compat; i++) + switch (!write ? i : nr_compat - 1 - i) { + case 0: + if (big_endian != CPU_BIG_ENDIAN) + bch2_bkey_swab_key(f, k); + break; + case 1: + if (version < bcachefs_metadata_version_bkey_renumber) + bch2_bkey_renumber(__btree_node_type(level, btree_id), k, write); + break; + case 2: + if (version < bcachefs_metadata_version_inode_btree_change && + btree_id == BTREE_ID_inodes) { + if (!bkey_packed(k)) { + struct bkey_i *u = packed_to_bkey(k); + + swap(u->k.p.inode, u->k.p.offset); + } else if (f->bits_per_field[BKEY_FIELD_INODE] && + f->bits_per_field[BKEY_FIELD_OFFSET]) { + struct bkey_format tmp = *f, *in = f, *out = &tmp; + + swap(tmp.bits_per_field[BKEY_FIELD_INODE], + tmp.bits_per_field[BKEY_FIELD_OFFSET]); + swap(tmp.field_offset[BKEY_FIELD_INODE], + tmp.field_offset[BKEY_FIELD_OFFSET]); + + if (!write) + swap(in, out); + + uk = __bch2_bkey_unpack_key(in, k); + swap(uk.p.inode, uk.p.offset); + BUG_ON(!bch2_bkey_pack_key(k, &uk, out)); + } + } + break; + case 3: + if (version < bcachefs_metadata_version_snapshot && + (level || btree_type_has_snapshots(btree_id))) { + struct bkey_i *u = packed_to_bkey(k); + + if (u) { + u->k.p.snapshot = write + ? 0 : U32_MAX; + } else { + u64 min_packed = le64_to_cpu(f->field_offset[BKEY_FIELD_SNAPSHOT]); + u64 max_packed = min_packed + + ~(~0ULL << f->bits_per_field[BKEY_FIELD_SNAPSHOT]); + + uk = __bch2_bkey_unpack_key(f, k); + uk.p.snapshot = write + ? min_packed : min_t(u64, U32_MAX, max_packed); + + BUG_ON(!bch2_bkey_pack_key(k, &uk, f)); + } + } + + break; + case 4: { + struct bkey_s u; + + if (!bkey_packed(k)) { + u = bkey_i_to_s(packed_to_bkey(k)); + } else { + uk = __bch2_bkey_unpack_key(f, k); + u.k = &uk; + u.v = bkeyp_val(f, k); + } + + if (big_endian != CPU_BIG_ENDIAN) + bch2_bkey_swab_val(u); + + ops = bch2_bkey_type_ops(k->type); + + if (ops->compat) + ops->compat(btree_id, version, big_endian, write, u); + break; + } + default: + BUG(); + } +} diff --git a/fs/bcachefs/bkey_methods.h b/fs/bcachefs/bkey_methods.h new file mode 100644 index 000000000000..3a370b7087ac --- /dev/null +++ b/fs/bcachefs/bkey_methods.h @@ -0,0 +1,179 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_BKEY_METHODS_H +#define _BCACHEFS_BKEY_METHODS_H + +#include "bkey.h" + +struct bch_fs; +struct btree; +struct btree_trans; +struct bkey; +enum btree_node_type; + +extern const char * const bch2_bkey_types[]; +extern const struct bkey_ops bch2_bkey_null_ops; + +/* + * key_invalid: checks validity of @k, returns 0 if good or -EINVAL if bad. If + * invalid, entire key will be deleted. + * + * When invalid, error string is returned via @err. @rw indicates whether key is + * being read or written; more aggressive checks can be enabled when rw == WRITE. + */ +struct bkey_ops { + int (*key_invalid)(struct bch_fs *c, struct bkey_s_c k, + enum bkey_invalid_flags flags, struct printbuf *err); + void (*val_to_text)(struct printbuf *, struct bch_fs *, + struct bkey_s_c); + void (*swab)(struct bkey_s); + bool (*key_normalize)(struct bch_fs *, struct bkey_s); + bool (*key_merge)(struct bch_fs *, struct bkey_s, struct bkey_s_c); + int (*trans_trigger)(struct btree_trans *, enum btree_id, unsigned, + struct bkey_s_c, struct bkey_i *, unsigned); + int (*atomic_trigger)(struct btree_trans *, enum btree_id, unsigned, + struct bkey_s_c, struct bkey_s_c, unsigned); + void (*compat)(enum btree_id id, unsigned version, + unsigned big_endian, int write, + struct bkey_s); + + /* Size of value type when first created: */ + unsigned min_val_size; +}; + +extern const struct bkey_ops bch2_bkey_ops[]; + +static inline const struct bkey_ops *bch2_bkey_type_ops(enum bch_bkey_type type) +{ + return likely(type < KEY_TYPE_MAX) + ? &bch2_bkey_ops[type] + : &bch2_bkey_null_ops; +} + +int bch2_bkey_val_invalid(struct bch_fs *, struct bkey_s_c, + enum bkey_invalid_flags, struct printbuf *); +int __bch2_bkey_invalid(struct bch_fs *, struct bkey_s_c, enum btree_node_type, + enum bkey_invalid_flags, struct printbuf *); +int bch2_bkey_invalid(struct bch_fs *, struct bkey_s_c, enum btree_node_type, + enum bkey_invalid_flags, struct printbuf *); +int bch2_bkey_in_btree_node(struct bch_fs *, struct btree *, + struct bkey_s_c, struct printbuf *); + +void bch2_bpos_to_text(struct printbuf *, struct bpos); +void bch2_bkey_to_text(struct printbuf *, const struct bkey *); +void bch2_val_to_text(struct printbuf *, struct bch_fs *, + struct bkey_s_c); +void bch2_bkey_val_to_text(struct printbuf *, struct bch_fs *, + struct bkey_s_c); + +void bch2_bkey_swab_val(struct bkey_s); + +bool bch2_bkey_normalize(struct bch_fs *, struct bkey_s); + +static inline bool bch2_bkey_maybe_mergable(const struct bkey *l, const struct bkey *r) +{ + return l->type == r->type && + !bversion_cmp(l->version, r->version) && + bpos_eq(l->p, bkey_start_pos(r)); +} + +bool bch2_bkey_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c); + +static inline int bch2_mark_key(struct btree_trans *trans, + enum btree_id btree, unsigned level, + struct bkey_s_c old, struct bkey_s_c new, + unsigned flags) +{ + const struct bkey_ops *ops = bch2_bkey_type_ops(old.k->type ?: new.k->type); + + return ops->atomic_trigger + ? ops->atomic_trigger(trans, btree, level, old, new, flags) + : 0; +} + +enum btree_update_flags { + __BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE = __BTREE_ITER_FLAGS_END, + __BTREE_UPDATE_NOJOURNAL, + __BTREE_UPDATE_PREJOURNAL, + __BTREE_UPDATE_KEY_CACHE_RECLAIM, + + __BTREE_TRIGGER_NORUN, /* Don't run triggers at all */ + + __BTREE_TRIGGER_INSERT, + __BTREE_TRIGGER_OVERWRITE, + + __BTREE_TRIGGER_GC, + __BTREE_TRIGGER_BUCKET_INVALIDATE, + __BTREE_TRIGGER_NOATOMIC, +}; + +#define BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE (1U << __BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) +#define BTREE_UPDATE_NOJOURNAL (1U << __BTREE_UPDATE_NOJOURNAL) +#define BTREE_UPDATE_PREJOURNAL (1U << __BTREE_UPDATE_PREJOURNAL) +#define BTREE_UPDATE_KEY_CACHE_RECLAIM (1U << __BTREE_UPDATE_KEY_CACHE_RECLAIM) + +#define BTREE_TRIGGER_NORUN (1U << __BTREE_TRIGGER_NORUN) + +#define BTREE_TRIGGER_INSERT (1U << __BTREE_TRIGGER_INSERT) +#define BTREE_TRIGGER_OVERWRITE (1U << __BTREE_TRIGGER_OVERWRITE) + +#define BTREE_TRIGGER_GC (1U << __BTREE_TRIGGER_GC) +#define BTREE_TRIGGER_BUCKET_INVALIDATE (1U << __BTREE_TRIGGER_BUCKET_INVALIDATE) +#define BTREE_TRIGGER_NOATOMIC (1U << __BTREE_TRIGGER_NOATOMIC) + +static inline int bch2_trans_mark_key(struct btree_trans *trans, + enum btree_id btree_id, unsigned level, + struct bkey_s_c old, struct bkey_i *new, + unsigned flags) +{ + const struct bkey_ops *ops = bch2_bkey_type_ops(old.k->type ?: new->k.type); + + return ops->trans_trigger + ? ops->trans_trigger(trans, btree_id, level, old, new, flags) + : 0; +} + +static inline int bch2_trans_mark_old(struct btree_trans *trans, + enum btree_id btree_id, unsigned level, + struct bkey_s_c old, unsigned flags) +{ + struct bkey_i deleted; + + bkey_init(&deleted.k); + deleted.k.p = old.k->p; + + return bch2_trans_mark_key(trans, btree_id, level, old, &deleted, + BTREE_TRIGGER_OVERWRITE|flags); +} + +static inline int bch2_trans_mark_new(struct btree_trans *trans, + enum btree_id btree_id, unsigned level, + struct bkey_i *new, unsigned flags) +{ + struct bkey_i deleted; + + bkey_init(&deleted.k); + deleted.k.p = new->k.p; + + return bch2_trans_mark_key(trans, btree_id, level, bkey_i_to_s_c(&deleted), new, + BTREE_TRIGGER_INSERT|flags); +} + +void bch2_bkey_renumber(enum btree_node_type, struct bkey_packed *, int); + +void __bch2_bkey_compat(unsigned, enum btree_id, unsigned, unsigned, + int, struct bkey_format *, struct bkey_packed *); + +static inline void bch2_bkey_compat(unsigned level, enum btree_id btree_id, + unsigned version, unsigned big_endian, + int write, + struct bkey_format *f, + struct bkey_packed *k) +{ + if (version < bcachefs_metadata_version_current || + big_endian != CPU_BIG_ENDIAN) + __bch2_bkey_compat(level, btree_id, version, + big_endian, write, f, k); + +} + +#endif /* _BCACHEFS_BKEY_METHODS_H */ diff --git a/fs/bcachefs/bkey_sort.c b/fs/bcachefs/bkey_sort.c new file mode 100644 index 000000000000..bcca9e76a0b4 --- /dev/null +++ b/fs/bcachefs/bkey_sort.c @@ -0,0 +1,201 @@ +// SPDX-License-Identifier: GPL-2.0 +#include "bcachefs.h" +#include "bkey_buf.h" +#include "bkey_cmp.h" +#include "bkey_sort.h" +#include "bset.h" +#include "extents.h" + +typedef int (*sort_cmp_fn)(struct btree *, + struct bkey_packed *, + struct bkey_packed *); + +static inline bool sort_iter_end(struct sort_iter *iter) +{ + return !iter->used; +} + +static inline void sort_iter_sift(struct sort_iter *iter, unsigned from, + sort_cmp_fn cmp) +{ + unsigned i; + + for (i = from; + i + 1 < iter->used && + cmp(iter->b, iter->data[i].k, iter->data[i + 1].k) > 0; + i++) + swap(iter->data[i], iter->data[i + 1]); +} + +static inline void sort_iter_sort(struct sort_iter *iter, sort_cmp_fn cmp) +{ + unsigned i = iter->used; + + while (i--) + sort_iter_sift(iter, i, cmp); +} + +static inline struct bkey_packed *sort_iter_peek(struct sort_iter *iter) +{ + return !sort_iter_end(iter) ? iter->data->k : NULL; +} + +static inline void sort_iter_advance(struct sort_iter *iter, sort_cmp_fn cmp) +{ + struct sort_iter_set *i = iter->data; + + BUG_ON(!iter->used); + + i->k = bkey_p_next(i->k); + + BUG_ON(i->k > i->end); + + if (i->k == i->end) + array_remove_item(iter->data, iter->used, 0); + else + sort_iter_sift(iter, 0, cmp); +} + +static inline struct bkey_packed *sort_iter_next(struct sort_iter *iter, + sort_cmp_fn cmp) +{ + struct bkey_packed *ret = sort_iter_peek(iter); + + if (ret) + sort_iter_advance(iter, cmp); + + return ret; +} + +/* + * If keys compare equal, compare by pointer order: + */ +static inline int key_sort_fix_overlapping_cmp(struct btree *b, + struct bkey_packed *l, + struct bkey_packed *r) +{ + return bch2_bkey_cmp_packed(b, l, r) ?: + cmp_int((unsigned long) l, (unsigned long) r); +} + +static inline bool should_drop_next_key(struct sort_iter *iter) +{ + /* + * key_sort_cmp() ensures that when keys compare equal the older key + * comes first; so if l->k compares equal to r->k then l->k is older + * and should be dropped. + */ + return iter->used >= 2 && + !bch2_bkey_cmp_packed(iter->b, + iter->data[0].k, + iter->data[1].k); +} + +struct btree_nr_keys +bch2_key_sort_fix_overlapping(struct bch_fs *c, struct bset *dst, + struct sort_iter *iter) +{ + struct bkey_packed *out = dst->start; + struct bkey_packed *k; + struct btree_nr_keys nr; + + memset(&nr, 0, sizeof(nr)); + + sort_iter_sort(iter, key_sort_fix_overlapping_cmp); + + while ((k = sort_iter_peek(iter))) { + if (!bkey_deleted(k) && + !should_drop_next_key(iter)) { + bkey_p_copy(out, k); + btree_keys_account_key_add(&nr, 0, out); + out = bkey_p_next(out); + } + + sort_iter_advance(iter, key_sort_fix_overlapping_cmp); + } + + dst->u64s = cpu_to_le16((u64 *) out - dst->_data); + return nr; +} + +/* Sort + repack in a new format: */ +struct btree_nr_keys +bch2_sort_repack(struct bset *dst, struct btree *src, + struct btree_node_iter *src_iter, + struct bkey_format *out_f, + bool filter_whiteouts) +{ + struct bkey_format *in_f = &src->format; + struct bkey_packed *in, *out = vstruct_last(dst); + struct btree_nr_keys nr; + bool transform = memcmp(out_f, &src->format, sizeof(*out_f)); + + memset(&nr, 0, sizeof(nr)); + + while ((in = bch2_btree_node_iter_next_all(src_iter, src))) { + if (filter_whiteouts && bkey_deleted(in)) + continue; + + if (!transform) + bkey_p_copy(out, in); + else if (bch2_bkey_transform(out_f, out, bkey_packed(in) + ? in_f : &bch2_bkey_format_current, in)) + out->format = KEY_FORMAT_LOCAL_BTREE; + else + bch2_bkey_unpack(src, (void *) out, in); + + out->needs_whiteout = false; + + btree_keys_account_key_add(&nr, 0, out); + out = bkey_p_next(out); + } + + dst->u64s = cpu_to_le16((u64 *) out - dst->_data); + return nr; +} + +static inline int sort_keys_cmp(struct btree *b, + struct bkey_packed *l, + struct bkey_packed *r) +{ + return bch2_bkey_cmp_packed_inlined(b, l, r) ?: + (int) bkey_deleted(r) - (int) bkey_deleted(l) ?: + (int) l->needs_whiteout - (int) r->needs_whiteout; +} + +unsigned bch2_sort_keys(struct bkey_packed *dst, + struct sort_iter *iter, + bool filter_whiteouts) +{ + const struct bkey_format *f = &iter->b->format; + struct bkey_packed *in, *next, *out = dst; + + sort_iter_sort(iter, sort_keys_cmp); + + while ((in = sort_iter_next(iter, sort_keys_cmp))) { + bool needs_whiteout = false; + + if (bkey_deleted(in) && + (filter_whiteouts || !in->needs_whiteout)) + continue; + + while ((next = sort_iter_peek(iter)) && + !bch2_bkey_cmp_packed_inlined(iter->b, in, next)) { + BUG_ON(in->needs_whiteout && + next->needs_whiteout); + needs_whiteout |= in->needs_whiteout; + in = sort_iter_next(iter, sort_keys_cmp); + } + + if (bkey_deleted(in)) { + memcpy_u64s_small(out, in, bkeyp_key_u64s(f, in)); + set_bkeyp_val_u64s(f, out, 0); + } else { + bkey_p_copy(out, in); + } + out->needs_whiteout |= needs_whiteout; + out = bkey_p_next(out); + } + + return (u64 *) out - (u64 *) dst; +} diff --git a/fs/bcachefs/bkey_sort.h b/fs/bcachefs/bkey_sort.h new file mode 100644 index 000000000000..7c0f0b160f18 --- /dev/null +++ b/fs/bcachefs/bkey_sort.h @@ -0,0 +1,54 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_BKEY_SORT_H +#define _BCACHEFS_BKEY_SORT_H + +struct sort_iter { + struct btree *b; + unsigned used; + unsigned size; + + struct sort_iter_set { + struct bkey_packed *k, *end; + } data[]; +}; + +static inline void sort_iter_init(struct sort_iter *iter, struct btree *b, unsigned size) +{ + iter->b = b; + iter->used = 0; + iter->size = size; +} + +struct sort_iter_stack { + struct sort_iter iter; + struct sort_iter_set sets[MAX_BSETS + 1]; +}; + +static inline void sort_iter_stack_init(struct sort_iter_stack *iter, struct btree *b) +{ + sort_iter_init(&iter->iter, b, ARRAY_SIZE(iter->sets)); +} + +static inline void sort_iter_add(struct sort_iter *iter, + struct bkey_packed *k, + struct bkey_packed *end) +{ + BUG_ON(iter->used >= iter->size); + + if (k != end) + iter->data[iter->used++] = (struct sort_iter_set) { k, end }; +} + +struct btree_nr_keys +bch2_key_sort_fix_overlapping(struct bch_fs *, struct bset *, + struct sort_iter *); + +struct btree_nr_keys +bch2_sort_repack(struct bset *, struct btree *, + struct btree_node_iter *, + struct bkey_format *, bool); + +unsigned bch2_sort_keys(struct bkey_packed *, + struct sort_iter *, bool); + +#endif /* _BCACHEFS_BKEY_SORT_H */ diff --git a/fs/bcachefs/bset.c b/fs/bcachefs/bset.c new file mode 100644 index 000000000000..bb73ba9017b0 --- /dev/null +++ b/fs/bcachefs/bset.c @@ -0,0 +1,1592 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Code for working with individual keys, and sorted sets of keys with in a + * btree node + * + * Copyright 2012 Google, Inc. + */ + +#include "bcachefs.h" +#include "btree_cache.h" +#include "bset.h" +#include "eytzinger.h" +#include "trace.h" +#include "util.h" + +#include <asm/unaligned.h> +#include <linux/console.h> +#include <linux/random.h> +#include <linux/prefetch.h> + +static inline void __bch2_btree_node_iter_advance(struct btree_node_iter *, + struct btree *); + +static inline unsigned __btree_node_iter_used(struct btree_node_iter *iter) +{ + unsigned n = ARRAY_SIZE(iter->data); + + while (n && __btree_node_iter_set_end(iter, n - 1)) + --n; + + return n; +} + +struct bset_tree *bch2_bkey_to_bset(struct btree *b, struct bkey_packed *k) +{ + return bch2_bkey_to_bset_inlined(b, k); +} + +/* + * There are never duplicate live keys in the btree - but including keys that + * have been flagged as deleted (and will be cleaned up later) we _will_ see + * duplicates. + * + * Thus the sort order is: usual key comparison first, but for keys that compare + * equal the deleted key(s) come first, and the (at most one) live version comes + * last. + * + * The main reason for this is insertion: to handle overwrites, we first iterate + * over keys that compare equal to our insert key, and then insert immediately + * prior to the first key greater than the key we're inserting - our insert + * position will be after all keys that compare equal to our insert key, which + * by the time we actually do the insert will all be deleted. + */ + +void bch2_dump_bset(struct bch_fs *c, struct btree *b, + struct bset *i, unsigned set) +{ + struct bkey_packed *_k, *_n; + struct bkey uk, n; + struct bkey_s_c k; + struct printbuf buf = PRINTBUF; + + if (!i->u64s) + return; + + for (_k = i->start; + _k < vstruct_last(i); + _k = _n) { + _n = bkey_p_next(_k); + + k = bkey_disassemble(b, _k, &uk); + + printbuf_reset(&buf); + if (c) + bch2_bkey_val_to_text(&buf, c, k); + else + bch2_bkey_to_text(&buf, k.k); + printk(KERN_ERR "block %u key %5zu: %s\n", set, + _k->_data - i->_data, buf.buf); + + if (_n == vstruct_last(i)) + continue; + + n = bkey_unpack_key(b, _n); + + if (bpos_lt(n.p, k.k->p)) { + printk(KERN_ERR "Key skipped backwards\n"); + continue; + } + + if (!bkey_deleted(k.k) && bpos_eq(n.p, k.k->p)) + printk(KERN_ERR "Duplicate keys\n"); + } + + printbuf_exit(&buf); +} + +void bch2_dump_btree_node(struct bch_fs *c, struct btree *b) +{ + struct bset_tree *t; + + console_lock(); + for_each_bset(b, t) + bch2_dump_bset(c, b, bset(b, t), t - b->set); + console_unlock(); +} + +void bch2_dump_btree_node_iter(struct btree *b, + struct btree_node_iter *iter) +{ + struct btree_node_iter_set *set; + struct printbuf buf = PRINTBUF; + + printk(KERN_ERR "btree node iter with %u/%u sets:\n", + __btree_node_iter_used(iter), b->nsets); + + btree_node_iter_for_each(iter, set) { + struct bkey_packed *k = __btree_node_offset_to_key(b, set->k); + struct bset_tree *t = bch2_bkey_to_bset(b, k); + struct bkey uk = bkey_unpack_key(b, k); + + printbuf_reset(&buf); + bch2_bkey_to_text(&buf, &uk); + printk(KERN_ERR "set %zu key %u: %s\n", + t - b->set, set->k, buf.buf); + } + + printbuf_exit(&buf); +} + +#ifdef CONFIG_BCACHEFS_DEBUG + +void __bch2_verify_btree_nr_keys(struct btree *b) +{ + struct bset_tree *t; + struct bkey_packed *k; + struct btree_nr_keys nr = { 0 }; + + for_each_bset(b, t) + bset_tree_for_each_key(b, t, k) + if (!bkey_deleted(k)) + btree_keys_account_key_add(&nr, t - b->set, k); + + BUG_ON(memcmp(&nr, &b->nr, sizeof(nr))); +} + +static void bch2_btree_node_iter_next_check(struct btree_node_iter *_iter, + struct btree *b) +{ + struct btree_node_iter iter = *_iter; + const struct bkey_packed *k, *n; + + k = bch2_btree_node_iter_peek_all(&iter, b); + __bch2_btree_node_iter_advance(&iter, b); + n = bch2_btree_node_iter_peek_all(&iter, b); + + bkey_unpack_key(b, k); + + if (n && + bkey_iter_cmp(b, k, n) > 0) { + struct btree_node_iter_set *set; + struct bkey ku = bkey_unpack_key(b, k); + struct bkey nu = bkey_unpack_key(b, n); + struct printbuf buf1 = PRINTBUF; + struct printbuf buf2 = PRINTBUF; + + bch2_dump_btree_node(NULL, b); + bch2_bkey_to_text(&buf1, &ku); + bch2_bkey_to_text(&buf2, &nu); + printk(KERN_ERR "out of order/overlapping:\n%s\n%s\n", + buf1.buf, buf2.buf); + printk(KERN_ERR "iter was:"); + + btree_node_iter_for_each(_iter, set) { + struct bkey_packed *k2 = __btree_node_offset_to_key(b, set->k); + struct bset_tree *t = bch2_bkey_to_bset(b, k2); + printk(" [%zi %zi]", t - b->set, + k2->_data - bset(b, t)->_data); + } + panic("\n"); + } +} + +void bch2_btree_node_iter_verify(struct btree_node_iter *iter, + struct btree *b) +{ + struct btree_node_iter_set *set, *s2; + struct bkey_packed *k, *p; + struct bset_tree *t; + + if (bch2_btree_node_iter_end(iter)) + return; + + /* Verify no duplicates: */ + btree_node_iter_for_each(iter, set) { + BUG_ON(set->k > set->end); + btree_node_iter_for_each(iter, s2) + BUG_ON(set != s2 && set->end == s2->end); + } + + /* Verify that set->end is correct: */ + btree_node_iter_for_each(iter, set) { + for_each_bset(b, t) + if (set->end == t->end_offset) + goto found; + BUG(); +found: + BUG_ON(set->k < btree_bkey_first_offset(t) || + set->k >= t->end_offset); + } + + /* Verify iterator is sorted: */ + btree_node_iter_for_each(iter, set) + BUG_ON(set != iter->data && + btree_node_iter_cmp(b, set[-1], set[0]) > 0); + + k = bch2_btree_node_iter_peek_all(iter, b); + + for_each_bset(b, t) { + if (iter->data[0].end == t->end_offset) + continue; + + p = bch2_bkey_prev_all(b, t, + bch2_btree_node_iter_bset_pos(iter, b, t)); + + BUG_ON(p && bkey_iter_cmp(b, k, p) < 0); + } +} + +void bch2_verify_insert_pos(struct btree *b, struct bkey_packed *where, + struct bkey_packed *insert, unsigned clobber_u64s) +{ + struct bset_tree *t = bch2_bkey_to_bset(b, where); + struct bkey_packed *prev = bch2_bkey_prev_all(b, t, where); + struct bkey_packed *next = (void *) ((u64 *) where->_data + clobber_u64s); + struct printbuf buf1 = PRINTBUF; + struct printbuf buf2 = PRINTBUF; +#if 0 + BUG_ON(prev && + bkey_iter_cmp(b, prev, insert) > 0); +#else + if (prev && + bkey_iter_cmp(b, prev, insert) > 0) { + struct bkey k1 = bkey_unpack_key(b, prev); + struct bkey k2 = bkey_unpack_key(b, insert); + + bch2_dump_btree_node(NULL, b); + bch2_bkey_to_text(&buf1, &k1); + bch2_bkey_to_text(&buf2, &k2); + + panic("prev > insert:\n" + "prev key %s\n" + "insert key %s\n", + buf1.buf, buf2.buf); + } +#endif +#if 0 + BUG_ON(next != btree_bkey_last(b, t) && + bkey_iter_cmp(b, insert, next) > 0); +#else + if (next != btree_bkey_last(b, t) && + bkey_iter_cmp(b, insert, next) > 0) { + struct bkey k1 = bkey_unpack_key(b, insert); + struct bkey k2 = bkey_unpack_key(b, next); + + bch2_dump_btree_node(NULL, b); + bch2_bkey_to_text(&buf1, &k1); + bch2_bkey_to_text(&buf2, &k2); + + panic("insert > next:\n" + "insert key %s\n" + "next key %s\n", + buf1.buf, buf2.buf); + } +#endif +} + +#else + +static inline void bch2_btree_node_iter_next_check(struct btree_node_iter *iter, + struct btree *b) {} + +#endif + +/* Auxiliary search trees */ + +#define BFLOAT_FAILED_UNPACKED U8_MAX +#define BFLOAT_FAILED U8_MAX + +struct bkey_float { + u8 exponent; + u8 key_offset; + u16 mantissa; +}; +#define BKEY_MANTISSA_BITS 16 + +static unsigned bkey_float_byte_offset(unsigned idx) +{ + return idx * sizeof(struct bkey_float); +} + +struct ro_aux_tree { + u8 nothing[0]; + struct bkey_float f[]; +}; + +struct rw_aux_tree { + u16 offset; + struct bpos k; +}; + +static unsigned bset_aux_tree_buf_end(const struct bset_tree *t) +{ + BUG_ON(t->aux_data_offset == U16_MAX); + + switch (bset_aux_tree_type(t)) { + case BSET_NO_AUX_TREE: + return t->aux_data_offset; + case BSET_RO_AUX_TREE: + return t->aux_data_offset + + DIV_ROUND_UP(t->size * sizeof(struct bkey_float) + + t->size * sizeof(u8), 8); + case BSET_RW_AUX_TREE: + return t->aux_data_offset + + DIV_ROUND_UP(sizeof(struct rw_aux_tree) * t->size, 8); + default: + BUG(); + } +} + +static unsigned bset_aux_tree_buf_start(const struct btree *b, + const struct bset_tree *t) +{ + return t == b->set + ? DIV_ROUND_UP(b->unpack_fn_len, 8) + : bset_aux_tree_buf_end(t - 1); +} + +static void *__aux_tree_base(const struct btree *b, + const struct bset_tree *t) +{ + return b->aux_data + t->aux_data_offset * 8; +} + +static struct ro_aux_tree *ro_aux_tree_base(const struct btree *b, + const struct bset_tree *t) +{ + EBUG_ON(bset_aux_tree_type(t) != BSET_RO_AUX_TREE); + + return __aux_tree_base(b, t); +} + +static u8 *ro_aux_tree_prev(const struct btree *b, + const struct bset_tree *t) +{ + EBUG_ON(bset_aux_tree_type(t) != BSET_RO_AUX_TREE); + + return __aux_tree_base(b, t) + bkey_float_byte_offset(t->size); +} + +static struct bkey_float *bkey_float(const struct btree *b, + const struct bset_tree *t, + unsigned idx) +{ + return ro_aux_tree_base(b, t)->f + idx; +} + +static void bset_aux_tree_verify(const struct btree *b) +{ +#ifdef CONFIG_BCACHEFS_DEBUG + const struct bset_tree *t; + + for_each_bset(b, t) { + if (t->aux_data_offset == U16_MAX) + continue; + + BUG_ON(t != b->set && + t[-1].aux_data_offset == U16_MAX); + + BUG_ON(t->aux_data_offset < bset_aux_tree_buf_start(b, t)); + BUG_ON(t->aux_data_offset > btree_aux_data_u64s(b)); + BUG_ON(bset_aux_tree_buf_end(t) > btree_aux_data_u64s(b)); + } +#endif +} + +void bch2_btree_keys_init(struct btree *b) +{ + unsigned i; + + b->nsets = 0; + memset(&b->nr, 0, sizeof(b->nr)); + + for (i = 0; i < MAX_BSETS; i++) + b->set[i].data_offset = U16_MAX; + + bch2_bset_set_no_aux_tree(b, b->set); +} + +/* Binary tree stuff for auxiliary search trees */ + +/* + * Cacheline/offset <-> bkey pointer arithmetic: + * + * t->tree is a binary search tree in an array; each node corresponds to a key + * in one cacheline in t->set (BSET_CACHELINE bytes). + * + * This means we don't have to store the full index of the key that a node in + * the binary tree points to; eytzinger1_to_inorder() gives us the cacheline, and + * then bkey_float->m gives us the offset within that cacheline, in units of 8 + * bytes. + * + * cacheline_to_bkey() and friends abstract out all the pointer arithmetic to + * make this work. + * + * To construct the bfloat for an arbitrary key we need to know what the key + * immediately preceding it is: we have to check if the two keys differ in the + * bits we're going to store in bkey_float->mantissa. t->prev[j] stores the size + * of the previous key so we can walk backwards to it from t->tree[j]'s key. + */ + +static inline void *bset_cacheline(const struct btree *b, + const struct bset_tree *t, + unsigned cacheline) +{ + return (void *) round_down((unsigned long) btree_bkey_first(b, t), + L1_CACHE_BYTES) + + cacheline * BSET_CACHELINE; +} + +static struct bkey_packed *cacheline_to_bkey(const struct btree *b, + const struct bset_tree *t, + unsigned cacheline, + unsigned offset) +{ + return bset_cacheline(b, t, cacheline) + offset * 8; +} + +static unsigned bkey_to_cacheline(const struct btree *b, + const struct bset_tree *t, + const struct bkey_packed *k) +{ + return ((void *) k - bset_cacheline(b, t, 0)) / BSET_CACHELINE; +} + +static ssize_t __bkey_to_cacheline_offset(const struct btree *b, + const struct bset_tree *t, + unsigned cacheline, + const struct bkey_packed *k) +{ + return (u64 *) k - (u64 *) bset_cacheline(b, t, cacheline); +} + +static unsigned bkey_to_cacheline_offset(const struct btree *b, + const struct bset_tree *t, + unsigned cacheline, + const struct bkey_packed *k) +{ + size_t m = __bkey_to_cacheline_offset(b, t, cacheline, k); + + EBUG_ON(m > U8_MAX); + return m; +} + +static inline struct bkey_packed *tree_to_bkey(const struct btree *b, + const struct bset_tree *t, + unsigned j) +{ + return cacheline_to_bkey(b, t, + __eytzinger1_to_inorder(j, t->size - 1, t->extra), + bkey_float(b, t, j)->key_offset); +} + +static struct bkey_packed *tree_to_prev_bkey(const struct btree *b, + const struct bset_tree *t, + unsigned j) +{ + unsigned prev_u64s = ro_aux_tree_prev(b, t)[j]; + + return (void *) ((u64 *) tree_to_bkey(b, t, j)->_data - prev_u64s); +} + +static struct rw_aux_tree *rw_aux_tree(const struct btree *b, + const struct bset_tree *t) +{ + EBUG_ON(bset_aux_tree_type(t) != BSET_RW_AUX_TREE); + + return __aux_tree_base(b, t); +} + +/* + * For the write set - the one we're currently inserting keys into - we don't + * maintain a full search tree, we just keep a simple lookup table in t->prev. + */ +static struct bkey_packed *rw_aux_to_bkey(const struct btree *b, + struct bset_tree *t, + unsigned j) +{ + return __btree_node_offset_to_key(b, rw_aux_tree(b, t)[j].offset); +} + +static void rw_aux_tree_set(const struct btree *b, struct bset_tree *t, + unsigned j, struct bkey_packed *k) +{ + EBUG_ON(k >= btree_bkey_last(b, t)); + + rw_aux_tree(b, t)[j] = (struct rw_aux_tree) { + .offset = __btree_node_key_to_offset(b, k), + .k = bkey_unpack_pos(b, k), + }; +} + +static void bch2_bset_verify_rw_aux_tree(struct btree *b, + struct bset_tree *t) +{ + struct bkey_packed *k = btree_bkey_first(b, t); + unsigned j = 0; + + if (!bch2_expensive_debug_checks) + return; + + BUG_ON(bset_has_ro_aux_tree(t)); + + if (!bset_has_rw_aux_tree(t)) + return; + + BUG_ON(t->size < 1); + BUG_ON(rw_aux_to_bkey(b, t, j) != k); + + goto start; + while (1) { + if (rw_aux_to_bkey(b, t, j) == k) { + BUG_ON(!bpos_eq(rw_aux_tree(b, t)[j].k, + bkey_unpack_pos(b, k))); +start: + if (++j == t->size) + break; + + BUG_ON(rw_aux_tree(b, t)[j].offset <= + rw_aux_tree(b, t)[j - 1].offset); + } + + k = bkey_p_next(k); + BUG_ON(k >= btree_bkey_last(b, t)); + } +} + +/* returns idx of first entry >= offset: */ +static unsigned rw_aux_tree_bsearch(struct btree *b, + struct bset_tree *t, + unsigned offset) +{ + unsigned bset_offs = offset - btree_bkey_first_offset(t); + unsigned bset_u64s = t->end_offset - btree_bkey_first_offset(t); + unsigned idx = bset_u64s ? bset_offs * t->size / bset_u64s : 0; + + EBUG_ON(bset_aux_tree_type(t) != BSET_RW_AUX_TREE); + EBUG_ON(!t->size); + EBUG_ON(idx > t->size); + + while (idx < t->size && + rw_aux_tree(b, t)[idx].offset < offset) + idx++; + + while (idx && + rw_aux_tree(b, t)[idx - 1].offset >= offset) + idx--; + + EBUG_ON(idx < t->size && + rw_aux_tree(b, t)[idx].offset < offset); + EBUG_ON(idx && rw_aux_tree(b, t)[idx - 1].offset >= offset); + EBUG_ON(idx + 1 < t->size && + rw_aux_tree(b, t)[idx].offset == + rw_aux_tree(b, t)[idx + 1].offset); + + return idx; +} + +static inline unsigned bkey_mantissa(const struct bkey_packed *k, + const struct bkey_float *f, + unsigned idx) +{ + u64 v; + + EBUG_ON(!bkey_packed(k)); + + v = get_unaligned((u64 *) (((u8 *) k->_data) + (f->exponent >> 3))); + + /* + * In little endian, we're shifting off low bits (and then the bits we + * want are at the low end), in big endian we're shifting off high bits + * (and then the bits we want are at the high end, so we shift them + * back down): + */ +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ + v >>= f->exponent & 7; +#else + v >>= 64 - (f->exponent & 7) - BKEY_MANTISSA_BITS; +#endif + return (u16) v; +} + +static __always_inline void make_bfloat(struct btree *b, struct bset_tree *t, + unsigned j, + struct bkey_packed *min_key, + struct bkey_packed *max_key) +{ + struct bkey_float *f = bkey_float(b, t, j); + struct bkey_packed *m = tree_to_bkey(b, t, j); + struct bkey_packed *l = is_power_of_2(j) + ? min_key + : tree_to_prev_bkey(b, t, j >> ffs(j)); + struct bkey_packed *r = is_power_of_2(j + 1) + ? max_key + : tree_to_bkey(b, t, j >> (ffz(j) + 1)); + unsigned mantissa; + int shift, exponent, high_bit; + + /* + * for failed bfloats, the lookup code falls back to comparing against + * the original key. + */ + + if (!bkey_packed(l) || !bkey_packed(r) || !bkey_packed(m) || + !b->nr_key_bits) { + f->exponent = BFLOAT_FAILED_UNPACKED; + return; + } + + /* + * The greatest differing bit of l and r is the first bit we must + * include in the bfloat mantissa we're creating in order to do + * comparisons - that bit always becomes the high bit of + * bfloat->mantissa, and thus the exponent we're calculating here is + * the position of what will become the low bit in bfloat->mantissa: + * + * Note that this may be negative - we may be running off the low end + * of the key: we handle this later: + */ + high_bit = max(bch2_bkey_greatest_differing_bit(b, l, r), + min_t(unsigned, BKEY_MANTISSA_BITS, b->nr_key_bits) - 1); + exponent = high_bit - (BKEY_MANTISSA_BITS - 1); + + /* + * Then we calculate the actual shift value, from the start of the key + * (k->_data), to get the key bits starting at exponent: + */ +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ + shift = (int) (b->format.key_u64s * 64 - b->nr_key_bits) + exponent; + + EBUG_ON(shift + BKEY_MANTISSA_BITS > b->format.key_u64s * 64); +#else + shift = high_bit_offset + + b->nr_key_bits - + exponent - + BKEY_MANTISSA_BITS; + + EBUG_ON(shift < KEY_PACKED_BITS_START); +#endif + EBUG_ON(shift < 0 || shift >= BFLOAT_FAILED); + + f->exponent = shift; + mantissa = bkey_mantissa(m, f, j); + + /* + * If we've got garbage bits, set them to all 1s - it's legal for the + * bfloat to compare larger than the original key, but not smaller: + */ + if (exponent < 0) + mantissa |= ~(~0U << -exponent); + + f->mantissa = mantissa; +} + +/* bytes remaining - only valid for last bset: */ +static unsigned __bset_tree_capacity(const struct btree *b, const struct bset_tree *t) +{ + bset_aux_tree_verify(b); + + return btree_aux_data_bytes(b) - t->aux_data_offset * sizeof(u64); +} + +static unsigned bset_ro_tree_capacity(const struct btree *b, const struct bset_tree *t) +{ + return __bset_tree_capacity(b, t) / + (sizeof(struct bkey_float) + sizeof(u8)); +} + +static unsigned bset_rw_tree_capacity(const struct btree *b, const struct bset_tree *t) +{ + return __bset_tree_capacity(b, t) / sizeof(struct rw_aux_tree); +} + +static noinline void __build_rw_aux_tree(struct btree *b, struct bset_tree *t) +{ + struct bkey_packed *k; + + t->size = 1; + t->extra = BSET_RW_AUX_TREE_VAL; + rw_aux_tree(b, t)[0].offset = + __btree_node_key_to_offset(b, btree_bkey_first(b, t)); + + bset_tree_for_each_key(b, t, k) { + if (t->size == bset_rw_tree_capacity(b, t)) + break; + + if ((void *) k - (void *) rw_aux_to_bkey(b, t, t->size - 1) > + L1_CACHE_BYTES) + rw_aux_tree_set(b, t, t->size++, k); + } +} + +static noinline void __build_ro_aux_tree(struct btree *b, struct bset_tree *t) +{ + struct bkey_packed *prev = NULL, *k = btree_bkey_first(b, t); + struct bkey_i min_key, max_key; + unsigned j, cacheline = 1; + + t->size = min(bkey_to_cacheline(b, t, btree_bkey_last(b, t)), + bset_ro_tree_capacity(b, t)); +retry: + if (t->size < 2) { + t->size = 0; + t->extra = BSET_NO_AUX_TREE_VAL; + return; + } + + t->extra = (t->size - rounddown_pow_of_two(t->size - 1)) << 1; + + /* First we figure out where the first key in each cacheline is */ + eytzinger1_for_each(j, t->size - 1) { + while (bkey_to_cacheline(b, t, k) < cacheline) + prev = k, k = bkey_p_next(k); + + if (k >= btree_bkey_last(b, t)) { + /* XXX: this path sucks */ + t->size--; + goto retry; + } + + ro_aux_tree_prev(b, t)[j] = prev->u64s; + bkey_float(b, t, j)->key_offset = + bkey_to_cacheline_offset(b, t, cacheline++, k); + + EBUG_ON(tree_to_prev_bkey(b, t, j) != prev); + EBUG_ON(tree_to_bkey(b, t, j) != k); + } + + while (k != btree_bkey_last(b, t)) + prev = k, k = bkey_p_next(k); + + if (!bkey_pack_pos(bkey_to_packed(&min_key), b->data->min_key, b)) { + bkey_init(&min_key.k); + min_key.k.p = b->data->min_key; + } + + if (!bkey_pack_pos(bkey_to_packed(&max_key), b->data->max_key, b)) { + bkey_init(&max_key.k); + max_key.k.p = b->data->max_key; + } + + /* Then we build the tree */ + eytzinger1_for_each(j, t->size - 1) + make_bfloat(b, t, j, + bkey_to_packed(&min_key), + bkey_to_packed(&max_key)); +} + +static void bset_alloc_tree(struct btree *b, struct bset_tree *t) +{ + struct bset_tree *i; + + for (i = b->set; i != t; i++) + BUG_ON(bset_has_rw_aux_tree(i)); + + bch2_bset_set_no_aux_tree(b, t); + + /* round up to next cacheline: */ + t->aux_data_offset = round_up(bset_aux_tree_buf_start(b, t), + SMP_CACHE_BYTES / sizeof(u64)); + + bset_aux_tree_verify(b); +} + +void bch2_bset_build_aux_tree(struct btree *b, struct bset_tree *t, + bool writeable) +{ + if (writeable + ? bset_has_rw_aux_tree(t) + : bset_has_ro_aux_tree(t)) + return; + + bset_alloc_tree(b, t); + + if (!__bset_tree_capacity(b, t)) + return; + + if (writeable) + __build_rw_aux_tree(b, t); + else + __build_ro_aux_tree(b, t); + + bset_aux_tree_verify(b); +} + +void bch2_bset_init_first(struct btree *b, struct bset *i) +{ + struct bset_tree *t; + + BUG_ON(b->nsets); + + memset(i, 0, sizeof(*i)); + get_random_bytes(&i->seq, sizeof(i->seq)); + SET_BSET_BIG_ENDIAN(i, CPU_BIG_ENDIAN); + + t = &b->set[b->nsets++]; + set_btree_bset(b, t, i); +} + +void bch2_bset_init_next(struct bch_fs *c, struct btree *b, + struct btree_node_entry *bne) +{ + struct bset *i = &bne->keys; + struct bset_tree *t; + + BUG_ON(bset_byte_offset(b, bne) >= btree_bytes(c)); + BUG_ON((void *) bne < (void *) btree_bkey_last(b, bset_tree_last(b))); + BUG_ON(b->nsets >= MAX_BSETS); + + memset(i, 0, sizeof(*i)); + i->seq = btree_bset_first(b)->seq; + SET_BSET_BIG_ENDIAN(i, CPU_BIG_ENDIAN); + + t = &b->set[b->nsets++]; + set_btree_bset(b, t, i); +} + +/* + * find _some_ key in the same bset as @k that precedes @k - not necessarily the + * immediate predecessor: + */ +static struct bkey_packed *__bkey_prev(struct btree *b, struct bset_tree *t, + struct bkey_packed *k) +{ + struct bkey_packed *p; + unsigned offset; + int j; + + EBUG_ON(k < btree_bkey_first(b, t) || + k > btree_bkey_last(b, t)); + + if (k == btree_bkey_first(b, t)) + return NULL; + + switch (bset_aux_tree_type(t)) { + case BSET_NO_AUX_TREE: + p = btree_bkey_first(b, t); + break; + case BSET_RO_AUX_TREE: + j = min_t(unsigned, t->size - 1, bkey_to_cacheline(b, t, k)); + + do { + p = j ? tree_to_bkey(b, t, + __inorder_to_eytzinger1(j--, + t->size - 1, t->extra)) + : btree_bkey_first(b, t); + } while (p >= k); + break; + case BSET_RW_AUX_TREE: + offset = __btree_node_key_to_offset(b, k); + j = rw_aux_tree_bsearch(b, t, offset); + p = j ? rw_aux_to_bkey(b, t, j - 1) + : btree_bkey_first(b, t); + break; + } + + return p; +} + +struct bkey_packed *bch2_bkey_prev_filter(struct btree *b, + struct bset_tree *t, + struct bkey_packed *k, + unsigned min_key_type) +{ + struct bkey_packed *p, *i, *ret = NULL, *orig_k = k; + + while ((p = __bkey_prev(b, t, k)) && !ret) { + for (i = p; i != k; i = bkey_p_next(i)) + if (i->type >= min_key_type) + ret = i; + + k = p; + } + + if (bch2_expensive_debug_checks) { + BUG_ON(ret >= orig_k); + + for (i = ret + ? bkey_p_next(ret) + : btree_bkey_first(b, t); + i != orig_k; + i = bkey_p_next(i)) + BUG_ON(i->type >= min_key_type); + } + + return ret; +} + +/* Insert */ + +static void bch2_bset_fix_lookup_table(struct btree *b, + struct bset_tree *t, + struct bkey_packed *_where, + unsigned clobber_u64s, + unsigned new_u64s) +{ + int shift = new_u64s - clobber_u64s; + unsigned l, j, where = __btree_node_key_to_offset(b, _where); + + EBUG_ON(bset_has_ro_aux_tree(t)); + + if (!bset_has_rw_aux_tree(t)) + return; + + /* returns first entry >= where */ + l = rw_aux_tree_bsearch(b, t, where); + + if (!l) /* never delete first entry */ + l++; + else if (l < t->size && + where < t->end_offset && + rw_aux_tree(b, t)[l].offset == where) + rw_aux_tree_set(b, t, l++, _where); + + /* l now > where */ + + for (j = l; + j < t->size && + rw_aux_tree(b, t)[j].offset < where + clobber_u64s; + j++) + ; + + if (j < t->size && + rw_aux_tree(b, t)[j].offset + shift == + rw_aux_tree(b, t)[l - 1].offset) + j++; + + memmove(&rw_aux_tree(b, t)[l], + &rw_aux_tree(b, t)[j], + (void *) &rw_aux_tree(b, t)[t->size] - + (void *) &rw_aux_tree(b, t)[j]); + t->size -= j - l; + + for (j = l; j < t->size; j++) + rw_aux_tree(b, t)[j].offset += shift; + + EBUG_ON(l < t->size && + rw_aux_tree(b, t)[l].offset == + rw_aux_tree(b, t)[l - 1].offset); + + if (t->size < bset_rw_tree_capacity(b, t) && + (l < t->size + ? rw_aux_tree(b, t)[l].offset + : t->end_offset) - + rw_aux_tree(b, t)[l - 1].offset > + L1_CACHE_BYTES / sizeof(u64)) { + struct bkey_packed *start = rw_aux_to_bkey(b, t, l - 1); + struct bkey_packed *end = l < t->size + ? rw_aux_to_bkey(b, t, l) + : btree_bkey_last(b, t); + struct bkey_packed *k = start; + + while (1) { + k = bkey_p_next(k); + if (k == end) + break; + + if ((void *) k - (void *) start >= L1_CACHE_BYTES) { + memmove(&rw_aux_tree(b, t)[l + 1], + &rw_aux_tree(b, t)[l], + (void *) &rw_aux_tree(b, t)[t->size] - + (void *) &rw_aux_tree(b, t)[l]); + t->size++; + rw_aux_tree_set(b, t, l, k); + break; + } + } + } + + bch2_bset_verify_rw_aux_tree(b, t); + bset_aux_tree_verify(b); +} + +void bch2_bset_insert(struct btree *b, + struct btree_node_iter *iter, + struct bkey_packed *where, + struct bkey_i *insert, + unsigned clobber_u64s) +{ + struct bkey_format *f = &b->format; + struct bset_tree *t = bset_tree_last(b); + struct bkey_packed packed, *src = bkey_to_packed(insert); + + bch2_bset_verify_rw_aux_tree(b, t); + bch2_verify_insert_pos(b, where, bkey_to_packed(insert), clobber_u64s); + + if (bch2_bkey_pack_key(&packed, &insert->k, f)) + src = &packed; + + if (!bkey_deleted(&insert->k)) + btree_keys_account_key_add(&b->nr, t - b->set, src); + + if (src->u64s != clobber_u64s) { + u64 *src_p = (u64 *) where->_data + clobber_u64s; + u64 *dst_p = (u64 *) where->_data + src->u64s; + + EBUG_ON((int) le16_to_cpu(bset(b, t)->u64s) < + (int) clobber_u64s - src->u64s); + + memmove_u64s(dst_p, src_p, btree_bkey_last(b, t)->_data - src_p); + le16_add_cpu(&bset(b, t)->u64s, src->u64s - clobber_u64s); + set_btree_bset_end(b, t); + } + + memcpy_u64s_small(where, src, + bkeyp_key_u64s(f, src)); + memcpy_u64s(bkeyp_val(f, where), &insert->v, + bkeyp_val_u64s(f, src)); + + if (src->u64s != clobber_u64s) + bch2_bset_fix_lookup_table(b, t, where, clobber_u64s, src->u64s); + + bch2_verify_btree_nr_keys(b); +} + +void bch2_bset_delete(struct btree *b, + struct bkey_packed *where, + unsigned clobber_u64s) +{ + struct bset_tree *t = bset_tree_last(b); + u64 *src_p = (u64 *) where->_data + clobber_u64s; + u64 *dst_p = where->_data; + + bch2_bset_verify_rw_aux_tree(b, t); + + EBUG_ON(le16_to_cpu(bset(b, t)->u64s) < clobber_u64s); + + memmove_u64s_down(dst_p, src_p, btree_bkey_last(b, t)->_data - src_p); + le16_add_cpu(&bset(b, t)->u64s, -clobber_u64s); + set_btree_bset_end(b, t); + + bch2_bset_fix_lookup_table(b, t, where, clobber_u64s, 0); +} + +/* Lookup */ + +__flatten +static struct bkey_packed *bset_search_write_set(const struct btree *b, + struct bset_tree *t, + struct bpos *search) +{ + unsigned l = 0, r = t->size; + + while (l + 1 != r) { + unsigned m = (l + r) >> 1; + + if (bpos_lt(rw_aux_tree(b, t)[m].k, *search)) + l = m; + else + r = m; + } + + return rw_aux_to_bkey(b, t, l); +} + +static inline void prefetch_four_cachelines(void *p) +{ +#ifdef CONFIG_X86_64 + asm("prefetcht0 (-127 + 64 * 0)(%0);" + "prefetcht0 (-127 + 64 * 1)(%0);" + "prefetcht0 (-127 + 64 * 2)(%0);" + "prefetcht0 (-127 + 64 * 3)(%0);" + : + : "r" (p + 127)); +#else + prefetch(p + L1_CACHE_BYTES * 0); + prefetch(p + L1_CACHE_BYTES * 1); + prefetch(p + L1_CACHE_BYTES * 2); + prefetch(p + L1_CACHE_BYTES * 3); +#endif +} + +static inline bool bkey_mantissa_bits_dropped(const struct btree *b, + const struct bkey_float *f, + unsigned idx) +{ +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ + unsigned key_bits_start = b->format.key_u64s * 64 - b->nr_key_bits; + + return f->exponent > key_bits_start; +#else + unsigned key_bits_end = high_bit_offset + b->nr_key_bits; + + return f->exponent + BKEY_MANTISSA_BITS < key_bits_end; +#endif +} + +__flatten +static struct bkey_packed *bset_search_tree(const struct btree *b, + const struct bset_tree *t, + const struct bpos *search, + const struct bkey_packed *packed_search) +{ + struct ro_aux_tree *base = ro_aux_tree_base(b, t); + struct bkey_float *f; + struct bkey_packed *k; + unsigned inorder, n = 1, l, r; + int cmp; + + do { + if (likely(n << 4 < t->size)) + prefetch(&base->f[n << 4]); + + f = &base->f[n]; + if (unlikely(f->exponent >= BFLOAT_FAILED)) + goto slowpath; + + l = f->mantissa; + r = bkey_mantissa(packed_search, f, n); + + if (unlikely(l == r) && bkey_mantissa_bits_dropped(b, f, n)) + goto slowpath; + + n = n * 2 + (l < r); + continue; +slowpath: + k = tree_to_bkey(b, t, n); + cmp = bkey_cmp_p_or_unp(b, k, packed_search, search); + if (!cmp) + return k; + + n = n * 2 + (cmp < 0); + } while (n < t->size); + + inorder = __eytzinger1_to_inorder(n >> 1, t->size - 1, t->extra); + + /* + * n would have been the node we recursed to - the low bit tells us if + * we recursed left or recursed right. + */ + if (likely(!(n & 1))) { + --inorder; + if (unlikely(!inorder)) + return btree_bkey_first(b, t); + + f = &base->f[eytzinger1_prev(n >> 1, t->size - 1)]; + } + + return cacheline_to_bkey(b, t, inorder, f->key_offset); +} + +static __always_inline __flatten +struct bkey_packed *__bch2_bset_search(struct btree *b, + struct bset_tree *t, + struct bpos *search, + const struct bkey_packed *lossy_packed_search) +{ + + /* + * First, we search for a cacheline, then lastly we do a linear search + * within that cacheline. + * + * To search for the cacheline, there's three different possibilities: + * * The set is too small to have a search tree, so we just do a linear + * search over the whole set. + * * The set is the one we're currently inserting into; keeping a full + * auxiliary search tree up to date would be too expensive, so we + * use a much simpler lookup table to do a binary search - + * bset_search_write_set(). + * * Or we use the auxiliary search tree we constructed earlier - + * bset_search_tree() + */ + + switch (bset_aux_tree_type(t)) { + case BSET_NO_AUX_TREE: + return btree_bkey_first(b, t); + case BSET_RW_AUX_TREE: + return bset_search_write_set(b, t, search); + case BSET_RO_AUX_TREE: + return bset_search_tree(b, t, search, lossy_packed_search); + default: + BUG(); + } +} + +static __always_inline __flatten +struct bkey_packed *bch2_bset_search_linear(struct btree *b, + struct bset_tree *t, + struct bpos *search, + struct bkey_packed *packed_search, + const struct bkey_packed *lossy_packed_search, + struct bkey_packed *m) +{ + if (lossy_packed_search) + while (m != btree_bkey_last(b, t) && + bkey_iter_cmp_p_or_unp(b, m, + lossy_packed_search, search) < 0) + m = bkey_p_next(m); + + if (!packed_search) + while (m != btree_bkey_last(b, t) && + bkey_iter_pos_cmp(b, m, search) < 0) + m = bkey_p_next(m); + + if (bch2_expensive_debug_checks) { + struct bkey_packed *prev = bch2_bkey_prev_all(b, t, m); + + BUG_ON(prev && + bkey_iter_cmp_p_or_unp(b, prev, + packed_search, search) >= 0); + } + + return m; +} + +/* Btree node iterator */ + +static inline void __bch2_btree_node_iter_push(struct btree_node_iter *iter, + struct btree *b, + const struct bkey_packed *k, + const struct bkey_packed *end) +{ + if (k != end) { + struct btree_node_iter_set *pos; + + btree_node_iter_for_each(iter, pos) + ; + + BUG_ON(pos >= iter->data + ARRAY_SIZE(iter->data)); + *pos = (struct btree_node_iter_set) { + __btree_node_key_to_offset(b, k), + __btree_node_key_to_offset(b, end) + }; + } +} + +void bch2_btree_node_iter_push(struct btree_node_iter *iter, + struct btree *b, + const struct bkey_packed *k, + const struct bkey_packed *end) +{ + __bch2_btree_node_iter_push(iter, b, k, end); + bch2_btree_node_iter_sort(iter, b); +} + +noinline __flatten __cold +static void btree_node_iter_init_pack_failed(struct btree_node_iter *iter, + struct btree *b, struct bpos *search) +{ + struct bkey_packed *k; + + trace_bkey_pack_pos_fail(search); + + bch2_btree_node_iter_init_from_start(iter, b); + + while ((k = bch2_btree_node_iter_peek(iter, b)) && + bkey_iter_pos_cmp(b, k, search) < 0) + bch2_btree_node_iter_advance(iter, b); +} + +/** + * bch2_btree_node_iter_init - initialize a btree node iterator, starting from a + * given position + * + * @iter: iterator to initialize + * @b: btree node to search + * @search: search key + * + * Main entry point to the lookup code for individual btree nodes: + * + * NOTE: + * + * When you don't filter out deleted keys, btree nodes _do_ contain duplicate + * keys. This doesn't matter for most code, but it does matter for lookups. + * + * Some adjacent keys with a string of equal keys: + * i j k k k k l m + * + * If you search for k, the lookup code isn't guaranteed to return you any + * specific k. The lookup code is conceptually doing a binary search and + * iterating backwards is very expensive so if the pivot happens to land at the + * last k that's what you'll get. + * + * This works out ok, but it's something to be aware of: + * + * - For non extents, we guarantee that the live key comes last - see + * btree_node_iter_cmp(), keys_out_of_order(). So the duplicates you don't + * see will only be deleted keys you don't care about. + * + * - For extents, deleted keys sort last (see the comment at the top of this + * file). But when you're searching for extents, you actually want the first + * key strictly greater than your search key - an extent that compares equal + * to the search key is going to have 0 sectors after the search key. + * + * But this does mean that we can't just search for + * bpos_successor(start_of_range) to get the first extent that overlaps with + * the range we want - if we're unlucky and there's an extent that ends + * exactly where we searched, then there could be a deleted key at the same + * position and we'd get that when we search instead of the preceding extent + * we needed. + * + * So we've got to search for start_of_range, then after the lookup iterate + * past any extents that compare equal to the position we searched for. + */ +__flatten +void bch2_btree_node_iter_init(struct btree_node_iter *iter, + struct btree *b, struct bpos *search) +{ + struct bkey_packed p, *packed_search = NULL; + struct btree_node_iter_set *pos = iter->data; + struct bkey_packed *k[MAX_BSETS]; + unsigned i; + + EBUG_ON(bpos_lt(*search, b->data->min_key)); + EBUG_ON(bpos_gt(*search, b->data->max_key)); + bset_aux_tree_verify(b); + + memset(iter, 0, sizeof(*iter)); + + switch (bch2_bkey_pack_pos_lossy(&p, *search, b)) { + case BKEY_PACK_POS_EXACT: + packed_search = &p; + break; + case BKEY_PACK_POS_SMALLER: + packed_search = NULL; + break; + case BKEY_PACK_POS_FAIL: + btree_node_iter_init_pack_failed(iter, b, search); + return; + } + + for (i = 0; i < b->nsets; i++) { + k[i] = __bch2_bset_search(b, b->set + i, search, &p); + prefetch_four_cachelines(k[i]); + } + + for (i = 0; i < b->nsets; i++) { + struct bset_tree *t = b->set + i; + struct bkey_packed *end = btree_bkey_last(b, t); + + k[i] = bch2_bset_search_linear(b, t, search, + packed_search, &p, k[i]); + if (k[i] != end) + *pos++ = (struct btree_node_iter_set) { + __btree_node_key_to_offset(b, k[i]), + __btree_node_key_to_offset(b, end) + }; + } + + bch2_btree_node_iter_sort(iter, b); +} + +void bch2_btree_node_iter_init_from_start(struct btree_node_iter *iter, + struct btree *b) +{ + struct bset_tree *t; + + memset(iter, 0, sizeof(*iter)); + + for_each_bset(b, t) + __bch2_btree_node_iter_push(iter, b, + btree_bkey_first(b, t), + btree_bkey_last(b, t)); + bch2_btree_node_iter_sort(iter, b); +} + +struct bkey_packed *bch2_btree_node_iter_bset_pos(struct btree_node_iter *iter, + struct btree *b, + struct bset_tree *t) +{ + struct btree_node_iter_set *set; + + btree_node_iter_for_each(iter, set) + if (set->end == t->end_offset) + return __btree_node_offset_to_key(b, set->k); + + return btree_bkey_last(b, t); +} + +static inline bool btree_node_iter_sort_two(struct btree_node_iter *iter, + struct btree *b, + unsigned first) +{ + bool ret; + + if ((ret = (btree_node_iter_cmp(b, + iter->data[first], + iter->data[first + 1]) > 0))) + swap(iter->data[first], iter->data[first + 1]); + return ret; +} + +void bch2_btree_node_iter_sort(struct btree_node_iter *iter, + struct btree *b) +{ + /* unrolled bubble sort: */ + + if (!__btree_node_iter_set_end(iter, 2)) { + btree_node_iter_sort_two(iter, b, 0); + btree_node_iter_sort_two(iter, b, 1); + } + + if (!__btree_node_iter_set_end(iter, 1)) + btree_node_iter_sort_two(iter, b, 0); +} + +void bch2_btree_node_iter_set_drop(struct btree_node_iter *iter, + struct btree_node_iter_set *set) +{ + struct btree_node_iter_set *last = + iter->data + ARRAY_SIZE(iter->data) - 1; + + memmove(&set[0], &set[1], (void *) last - (void *) set); + *last = (struct btree_node_iter_set) { 0, 0 }; +} + +static inline void __bch2_btree_node_iter_advance(struct btree_node_iter *iter, + struct btree *b) +{ + iter->data->k += __bch2_btree_node_iter_peek_all(iter, b)->u64s; + + EBUG_ON(iter->data->k > iter->data->end); + + if (unlikely(__btree_node_iter_set_end(iter, 0))) { + /* avoid an expensive memmove call: */ + iter->data[0] = iter->data[1]; + iter->data[1] = iter->data[2]; + iter->data[2] = (struct btree_node_iter_set) { 0, 0 }; + return; + } + + if (__btree_node_iter_set_end(iter, 1)) + return; + + if (!btree_node_iter_sort_two(iter, b, 0)) + return; + + if (__btree_node_iter_set_end(iter, 2)) + return; + + btree_node_iter_sort_two(iter, b, 1); +} + +void bch2_btree_node_iter_advance(struct btree_node_iter *iter, + struct btree *b) +{ + if (bch2_expensive_debug_checks) { + bch2_btree_node_iter_verify(iter, b); + bch2_btree_node_iter_next_check(iter, b); + } + + __bch2_btree_node_iter_advance(iter, b); +} + +/* + * Expensive: + */ +struct bkey_packed *bch2_btree_node_iter_prev_all(struct btree_node_iter *iter, + struct btree *b) +{ + struct bkey_packed *k, *prev = NULL; + struct btree_node_iter_set *set; + struct bset_tree *t; + unsigned end = 0; + + if (bch2_expensive_debug_checks) + bch2_btree_node_iter_verify(iter, b); + + for_each_bset(b, t) { + k = bch2_bkey_prev_all(b, t, + bch2_btree_node_iter_bset_pos(iter, b, t)); + if (k && + (!prev || bkey_iter_cmp(b, k, prev) > 0)) { + prev = k; + end = t->end_offset; + } + } + + if (!prev) + return NULL; + + /* + * We're manually memmoving instead of just calling sort() to ensure the + * prev we picked ends up in slot 0 - sort won't necessarily put it + * there because of duplicate deleted keys: + */ + btree_node_iter_for_each(iter, set) + if (set->end == end) + goto found; + + BUG_ON(set != &iter->data[__btree_node_iter_used(iter)]); +found: + BUG_ON(set >= iter->data + ARRAY_SIZE(iter->data)); + + memmove(&iter->data[1], + &iter->data[0], + (void *) set - (void *) &iter->data[0]); + + iter->data[0].k = __btree_node_key_to_offset(b, prev); + iter->data[0].end = end; + + if (bch2_expensive_debug_checks) + bch2_btree_node_iter_verify(iter, b); + return prev; +} + +struct bkey_packed *bch2_btree_node_iter_prev(struct btree_node_iter *iter, + struct btree *b) +{ + struct bkey_packed *prev; + + do { + prev = bch2_btree_node_iter_prev_all(iter, b); + } while (prev && bkey_deleted(prev)); + + return prev; +} + +struct bkey_s_c bch2_btree_node_iter_peek_unpack(struct btree_node_iter *iter, + struct btree *b, + struct bkey *u) +{ + struct bkey_packed *k = bch2_btree_node_iter_peek(iter, b); + + return k ? bkey_disassemble(b, k, u) : bkey_s_c_null; +} + +/* Mergesort */ + +void bch2_btree_keys_stats(const struct btree *b, struct bset_stats *stats) +{ + const struct bset_tree *t; + + for_each_bset(b, t) { + enum bset_aux_tree_type type = bset_aux_tree_type(t); + size_t j; + + stats->sets[type].nr++; + stats->sets[type].bytes += le16_to_cpu(bset(b, t)->u64s) * + sizeof(u64); + + if (bset_has_ro_aux_tree(t)) { + stats->floats += t->size - 1; + + for (j = 1; j < t->size; j++) + stats->failed += + bkey_float(b, t, j)->exponent == + BFLOAT_FAILED; + } + } +} + +void bch2_bfloat_to_text(struct printbuf *out, struct btree *b, + struct bkey_packed *k) +{ + struct bset_tree *t = bch2_bkey_to_bset(b, k); + struct bkey uk; + unsigned j, inorder; + + if (!bset_has_ro_aux_tree(t)) + return; + + inorder = bkey_to_cacheline(b, t, k); + if (!inorder || inorder >= t->size) + return; + + j = __inorder_to_eytzinger1(inorder, t->size - 1, t->extra); + if (k != tree_to_bkey(b, t, j)) + return; + + switch (bkey_float(b, t, j)->exponent) { + case BFLOAT_FAILED: + uk = bkey_unpack_key(b, k); + prt_printf(out, + " failed unpacked at depth %u\n" + "\t", + ilog2(j)); + bch2_bpos_to_text(out, uk.p); + prt_printf(out, "\n"); + break; + } +} diff --git a/fs/bcachefs/bset.h b/fs/bcachefs/bset.h new file mode 100644 index 000000000000..632c2b8c5460 --- /dev/null +++ b/fs/bcachefs/bset.h @@ -0,0 +1,541 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_BSET_H +#define _BCACHEFS_BSET_H + +#include <linux/kernel.h> +#include <linux/types.h> + +#include "bcachefs.h" +#include "bkey.h" +#include "bkey_methods.h" +#include "btree_types.h" +#include "util.h" /* for time_stats */ +#include "vstructs.h" + +/* + * BKEYS: + * + * A bkey contains a key, a size field, a variable number of pointers, and some + * ancillary flag bits. + * + * We use two different functions for validating bkeys, bkey_invalid and + * bkey_deleted(). + * + * The one exception to the rule that ptr_invalid() filters out invalid keys is + * that it also filters out keys of size 0 - these are keys that have been + * completely overwritten. It'd be safe to delete these in memory while leaving + * them on disk, just unnecessary work - so we filter them out when resorting + * instead. + * + * We can't filter out stale keys when we're resorting, because garbage + * collection needs to find them to ensure bucket gens don't wrap around - + * unless we're rewriting the btree node those stale keys still exist on disk. + * + * We also implement functions here for removing some number of sectors from the + * front or the back of a bkey - this is mainly used for fixing overlapping + * extents, by removing the overlapping sectors from the older key. + * + * BSETS: + * + * A bset is an array of bkeys laid out contiguously in memory in sorted order, + * along with a header. A btree node is made up of a number of these, written at + * different times. + * + * There could be many of them on disk, but we never allow there to be more than + * 4 in memory - we lazily resort as needed. + * + * We implement code here for creating and maintaining auxiliary search trees + * (described below) for searching an individial bset, and on top of that we + * implement a btree iterator. + * + * BTREE ITERATOR: + * + * Most of the code in bcache doesn't care about an individual bset - it needs + * to search entire btree nodes and iterate over them in sorted order. + * + * The btree iterator code serves both functions; it iterates through the keys + * in a btree node in sorted order, starting from either keys after a specific + * point (if you pass it a search key) or the start of the btree node. + * + * AUXILIARY SEARCH TREES: + * + * Since keys are variable length, we can't use a binary search on a bset - we + * wouldn't be able to find the start of the next key. But binary searches are + * slow anyways, due to terrible cache behaviour; bcache originally used binary + * searches and that code topped out at under 50k lookups/second. + * + * So we need to construct some sort of lookup table. Since we only insert keys + * into the last (unwritten) set, most of the keys within a given btree node are + * usually in sets that are mostly constant. We use two different types of + * lookup tables to take advantage of this. + * + * Both lookup tables share in common that they don't index every key in the + * set; they index one key every BSET_CACHELINE bytes, and then a linear search + * is used for the rest. + * + * For sets that have been written to disk and are no longer being inserted + * into, we construct a binary search tree in an array - traversing a binary + * search tree in an array gives excellent locality of reference and is very + * fast, since both children of any node are adjacent to each other in memory + * (and their grandchildren, and great grandchildren...) - this means + * prefetching can be used to great effect. + * + * It's quite useful performance wise to keep these nodes small - not just + * because they're more likely to be in L2, but also because we can prefetch + * more nodes on a single cacheline and thus prefetch more iterations in advance + * when traversing this tree. + * + * Nodes in the auxiliary search tree must contain both a key to compare against + * (we don't want to fetch the key from the set, that would defeat the purpose), + * and a pointer to the key. We use a few tricks to compress both of these. + * + * To compress the pointer, we take advantage of the fact that one node in the + * search tree corresponds to precisely BSET_CACHELINE bytes in the set. We have + * a function (to_inorder()) that takes the index of a node in a binary tree and + * returns what its index would be in an inorder traversal, so we only have to + * store the low bits of the offset. + * + * The key is 84 bits (KEY_DEV + key->key, the offset on the device). To + * compress that, we take advantage of the fact that when we're traversing the + * search tree at every iteration we know that both our search key and the key + * we're looking for lie within some range - bounded by our previous + * comparisons. (We special case the start of a search so that this is true even + * at the root of the tree). + * + * So we know the key we're looking for is between a and b, and a and b don't + * differ higher than bit 50, we don't need to check anything higher than bit + * 50. + * + * We don't usually need the rest of the bits, either; we only need enough bits + * to partition the key range we're currently checking. Consider key n - the + * key our auxiliary search tree node corresponds to, and key p, the key + * immediately preceding n. The lowest bit we need to store in the auxiliary + * search tree is the highest bit that differs between n and p. + * + * Note that this could be bit 0 - we might sometimes need all 80 bits to do the + * comparison. But we'd really like our nodes in the auxiliary search tree to be + * of fixed size. + * + * The solution is to make them fixed size, and when we're constructing a node + * check if p and n differed in the bits we needed them to. If they don't we + * flag that node, and when doing lookups we fallback to comparing against the + * real key. As long as this doesn't happen to often (and it seems to reliably + * happen a bit less than 1% of the time), we win - even on failures, that key + * is then more likely to be in cache than if we were doing binary searches all + * the way, since we're touching so much less memory. + * + * The keys in the auxiliary search tree are stored in (software) floating + * point, with an exponent and a mantissa. The exponent needs to be big enough + * to address all the bits in the original key, but the number of bits in the + * mantissa is somewhat arbitrary; more bits just gets us fewer failures. + * + * We need 7 bits for the exponent and 3 bits for the key's offset (since keys + * are 8 byte aligned); using 22 bits for the mantissa means a node is 4 bytes. + * We need one node per 128 bytes in the btree node, which means the auxiliary + * search trees take up 3% as much memory as the btree itself. + * + * Constructing these auxiliary search trees is moderately expensive, and we + * don't want to be constantly rebuilding the search tree for the last set + * whenever we insert another key into it. For the unwritten set, we use a much + * simpler lookup table - it's just a flat array, so index i in the lookup table + * corresponds to the i range of BSET_CACHELINE bytes in the set. Indexing + * within each byte range works the same as with the auxiliary search trees. + * + * These are much easier to keep up to date when we insert a key - we do it + * somewhat lazily; when we shift a key up we usually just increment the pointer + * to it, only when it would overflow do we go to the trouble of finding the + * first key in that range of bytes again. + */ + +enum bset_aux_tree_type { + BSET_NO_AUX_TREE, + BSET_RO_AUX_TREE, + BSET_RW_AUX_TREE, +}; + +#define BSET_TREE_NR_TYPES 3 + +#define BSET_NO_AUX_TREE_VAL (U16_MAX) +#define BSET_RW_AUX_TREE_VAL (U16_MAX - 1) + +static inline enum bset_aux_tree_type bset_aux_tree_type(const struct bset_tree *t) +{ + switch (t->extra) { + case BSET_NO_AUX_TREE_VAL: + EBUG_ON(t->size); + return BSET_NO_AUX_TREE; + case BSET_RW_AUX_TREE_VAL: + EBUG_ON(!t->size); + return BSET_RW_AUX_TREE; + default: + EBUG_ON(!t->size); + return BSET_RO_AUX_TREE; + } +} + +/* + * BSET_CACHELINE was originally intended to match the hardware cacheline size - + * it used to be 64, but I realized the lookup code would touch slightly less + * memory if it was 128. + * + * It definites the number of bytes (in struct bset) per struct bkey_float in + * the auxiliar search tree - when we're done searching the bset_float tree we + * have this many bytes left that we do a linear search over. + * + * Since (after level 5) every level of the bset_tree is on a new cacheline, + * we're touching one fewer cacheline in the bset tree in exchange for one more + * cacheline in the linear search - but the linear search might stop before it + * gets to the second cacheline. + */ + +#define BSET_CACHELINE 256 + +static inline size_t btree_keys_cachelines(const struct btree *b) +{ + return (1U << b->byte_order) / BSET_CACHELINE; +} + +static inline size_t btree_aux_data_bytes(const struct btree *b) +{ + return btree_keys_cachelines(b) * 8; +} + +static inline size_t btree_aux_data_u64s(const struct btree *b) +{ + return btree_aux_data_bytes(b) / sizeof(u64); +} + +#define for_each_bset(_b, _t) \ + for (_t = (_b)->set; _t < (_b)->set + (_b)->nsets; _t++) + +#define bset_tree_for_each_key(_b, _t, _k) \ + for (_k = btree_bkey_first(_b, _t); \ + _k != btree_bkey_last(_b, _t); \ + _k = bkey_p_next(_k)) + +static inline bool bset_has_ro_aux_tree(const struct bset_tree *t) +{ + return bset_aux_tree_type(t) == BSET_RO_AUX_TREE; +} + +static inline bool bset_has_rw_aux_tree(struct bset_tree *t) +{ + return bset_aux_tree_type(t) == BSET_RW_AUX_TREE; +} + +static inline void bch2_bset_set_no_aux_tree(struct btree *b, + struct bset_tree *t) +{ + BUG_ON(t < b->set); + + for (; t < b->set + ARRAY_SIZE(b->set); t++) { + t->size = 0; + t->extra = BSET_NO_AUX_TREE_VAL; + t->aux_data_offset = U16_MAX; + } +} + +static inline void btree_node_set_format(struct btree *b, + struct bkey_format f) +{ + int len; + + b->format = f; + b->nr_key_bits = bkey_format_key_bits(&f); + + len = bch2_compile_bkey_format(&b->format, b->aux_data); + BUG_ON(len < 0 || len > U8_MAX); + + b->unpack_fn_len = len; + + bch2_bset_set_no_aux_tree(b, b->set); +} + +static inline struct bset *bset_next_set(struct btree *b, + unsigned block_bytes) +{ + struct bset *i = btree_bset_last(b); + + EBUG_ON(!is_power_of_2(block_bytes)); + + return ((void *) i) + round_up(vstruct_bytes(i), block_bytes); +} + +void bch2_btree_keys_init(struct btree *); + +void bch2_bset_init_first(struct btree *, struct bset *); +void bch2_bset_init_next(struct bch_fs *, struct btree *, + struct btree_node_entry *); +void bch2_bset_build_aux_tree(struct btree *, struct bset_tree *, bool); + +void bch2_bset_insert(struct btree *, struct btree_node_iter *, + struct bkey_packed *, struct bkey_i *, unsigned); +void bch2_bset_delete(struct btree *, struct bkey_packed *, unsigned); + +/* Bkey utility code */ + +/* packed or unpacked */ +static inline int bkey_cmp_p_or_unp(const struct btree *b, + const struct bkey_packed *l, + const struct bkey_packed *r_packed, + const struct bpos *r) +{ + EBUG_ON(r_packed && !bkey_packed(r_packed)); + + if (unlikely(!bkey_packed(l))) + return bpos_cmp(packed_to_bkey_c(l)->p, *r); + + if (likely(r_packed)) + return __bch2_bkey_cmp_packed_format_checked(l, r_packed, b); + + return __bch2_bkey_cmp_left_packed_format_checked(b, l, r); +} + +static inline struct bset_tree * +bch2_bkey_to_bset_inlined(struct btree *b, struct bkey_packed *k) +{ + unsigned offset = __btree_node_key_to_offset(b, k); + struct bset_tree *t; + + for_each_bset(b, t) + if (offset <= t->end_offset) { + EBUG_ON(offset < btree_bkey_first_offset(t)); + return t; + } + + BUG(); +} + +struct bset_tree *bch2_bkey_to_bset(struct btree *, struct bkey_packed *); + +struct bkey_packed *bch2_bkey_prev_filter(struct btree *, struct bset_tree *, + struct bkey_packed *, unsigned); + +static inline struct bkey_packed * +bch2_bkey_prev_all(struct btree *b, struct bset_tree *t, struct bkey_packed *k) +{ + return bch2_bkey_prev_filter(b, t, k, 0); +} + +static inline struct bkey_packed * +bch2_bkey_prev(struct btree *b, struct bset_tree *t, struct bkey_packed *k) +{ + return bch2_bkey_prev_filter(b, t, k, 1); +} + +/* Btree key iteration */ + +void bch2_btree_node_iter_push(struct btree_node_iter *, struct btree *, + const struct bkey_packed *, + const struct bkey_packed *); +void bch2_btree_node_iter_init(struct btree_node_iter *, struct btree *, + struct bpos *); +void bch2_btree_node_iter_init_from_start(struct btree_node_iter *, + struct btree *); +struct bkey_packed *bch2_btree_node_iter_bset_pos(struct btree_node_iter *, + struct btree *, + struct bset_tree *); + +void bch2_btree_node_iter_sort(struct btree_node_iter *, struct btree *); +void bch2_btree_node_iter_set_drop(struct btree_node_iter *, + struct btree_node_iter_set *); +void bch2_btree_node_iter_advance(struct btree_node_iter *, struct btree *); + +#define btree_node_iter_for_each(_iter, _set) \ + for (_set = (_iter)->data; \ + _set < (_iter)->data + ARRAY_SIZE((_iter)->data) && \ + (_set)->k != (_set)->end; \ + _set++) + +static inline bool __btree_node_iter_set_end(struct btree_node_iter *iter, + unsigned i) +{ + return iter->data[i].k == iter->data[i].end; +} + +static inline bool bch2_btree_node_iter_end(struct btree_node_iter *iter) +{ + return __btree_node_iter_set_end(iter, 0); +} + +/* + * When keys compare equal, deleted keys compare first: + * + * XXX: only need to compare pointers for keys that are both within a + * btree_node_iterator - we need to break ties for prev() to work correctly + */ +static inline int bkey_iter_cmp(const struct btree *b, + const struct bkey_packed *l, + const struct bkey_packed *r) +{ + return bch2_bkey_cmp_packed(b, l, r) + ?: (int) bkey_deleted(r) - (int) bkey_deleted(l) + ?: cmp_int(l, r); +} + +static inline int btree_node_iter_cmp(const struct btree *b, + struct btree_node_iter_set l, + struct btree_node_iter_set r) +{ + return bkey_iter_cmp(b, + __btree_node_offset_to_key(b, l.k), + __btree_node_offset_to_key(b, r.k)); +} + +/* These assume r (the search key) is not a deleted key: */ +static inline int bkey_iter_pos_cmp(const struct btree *b, + const struct bkey_packed *l, + const struct bpos *r) +{ + return bkey_cmp_left_packed(b, l, r) + ?: -((int) bkey_deleted(l)); +} + +static inline int bkey_iter_cmp_p_or_unp(const struct btree *b, + const struct bkey_packed *l, + const struct bkey_packed *r_packed, + const struct bpos *r) +{ + return bkey_cmp_p_or_unp(b, l, r_packed, r) + ?: -((int) bkey_deleted(l)); +} + +static inline struct bkey_packed * +__bch2_btree_node_iter_peek_all(struct btree_node_iter *iter, + struct btree *b) +{ + return __btree_node_offset_to_key(b, iter->data->k); +} + +static inline struct bkey_packed * +bch2_btree_node_iter_peek_all(struct btree_node_iter *iter, struct btree *b) +{ + return !bch2_btree_node_iter_end(iter) + ? __btree_node_offset_to_key(b, iter->data->k) + : NULL; +} + +static inline struct bkey_packed * +bch2_btree_node_iter_peek(struct btree_node_iter *iter, struct btree *b) +{ + struct bkey_packed *k; + + while ((k = bch2_btree_node_iter_peek_all(iter, b)) && + bkey_deleted(k)) + bch2_btree_node_iter_advance(iter, b); + + return k; +} + +static inline struct bkey_packed * +bch2_btree_node_iter_next_all(struct btree_node_iter *iter, struct btree *b) +{ + struct bkey_packed *ret = bch2_btree_node_iter_peek_all(iter, b); + + if (ret) + bch2_btree_node_iter_advance(iter, b); + + return ret; +} + +struct bkey_packed *bch2_btree_node_iter_prev_all(struct btree_node_iter *, + struct btree *); +struct bkey_packed *bch2_btree_node_iter_prev(struct btree_node_iter *, + struct btree *); + +struct bkey_s_c bch2_btree_node_iter_peek_unpack(struct btree_node_iter *, + struct btree *, + struct bkey *); + +#define for_each_btree_node_key(b, k, iter) \ + for (bch2_btree_node_iter_init_from_start((iter), (b)); \ + (k = bch2_btree_node_iter_peek((iter), (b))); \ + bch2_btree_node_iter_advance(iter, b)) + +#define for_each_btree_node_key_unpack(b, k, iter, unpacked) \ + for (bch2_btree_node_iter_init_from_start((iter), (b)); \ + (k = bch2_btree_node_iter_peek_unpack((iter), (b), (unpacked))).k;\ + bch2_btree_node_iter_advance(iter, b)) + +/* Accounting: */ + +static inline void btree_keys_account_key(struct btree_nr_keys *n, + unsigned bset, + struct bkey_packed *k, + int sign) +{ + n->live_u64s += k->u64s * sign; + n->bset_u64s[bset] += k->u64s * sign; + + if (bkey_packed(k)) + n->packed_keys += sign; + else + n->unpacked_keys += sign; +} + +static inline void btree_keys_account_val_delta(struct btree *b, + struct bkey_packed *k, + int delta) +{ + struct bset_tree *t = bch2_bkey_to_bset(b, k); + + b->nr.live_u64s += delta; + b->nr.bset_u64s[t - b->set] += delta; +} + +#define btree_keys_account_key_add(_nr, _bset_idx, _k) \ + btree_keys_account_key(_nr, _bset_idx, _k, 1) +#define btree_keys_account_key_drop(_nr, _bset_idx, _k) \ + btree_keys_account_key(_nr, _bset_idx, _k, -1) + +#define btree_account_key_add(_b, _k) \ + btree_keys_account_key(&(_b)->nr, \ + bch2_bkey_to_bset(_b, _k) - (_b)->set, _k, 1) +#define btree_account_key_drop(_b, _k) \ + btree_keys_account_key(&(_b)->nr, \ + bch2_bkey_to_bset(_b, _k) - (_b)->set, _k, -1) + +struct bset_stats { + struct { + size_t nr, bytes; + } sets[BSET_TREE_NR_TYPES]; + + size_t floats; + size_t failed; +}; + +void bch2_btree_keys_stats(const struct btree *, struct bset_stats *); +void bch2_bfloat_to_text(struct printbuf *, struct btree *, + struct bkey_packed *); + +/* Debug stuff */ + +void bch2_dump_bset(struct bch_fs *, struct btree *, struct bset *, unsigned); +void bch2_dump_btree_node(struct bch_fs *, struct btree *); +void bch2_dump_btree_node_iter(struct btree *, struct btree_node_iter *); + +#ifdef CONFIG_BCACHEFS_DEBUG + +void __bch2_verify_btree_nr_keys(struct btree *); +void bch2_btree_node_iter_verify(struct btree_node_iter *, struct btree *); +void bch2_verify_insert_pos(struct btree *, struct bkey_packed *, + struct bkey_packed *, unsigned); + +#else + +static inline void __bch2_verify_btree_nr_keys(struct btree *b) {} +static inline void bch2_btree_node_iter_verify(struct btree_node_iter *iter, + struct btree *b) {} +static inline void bch2_verify_insert_pos(struct btree *b, + struct bkey_packed *where, + struct bkey_packed *insert, + unsigned clobber_u64s) {} +#endif + +static inline void bch2_verify_btree_nr_keys(struct btree *b) +{ + if (bch2_debug_check_btree_accounting) + __bch2_verify_btree_nr_keys(b); +} + +#endif /* _BCACHEFS_BSET_H */ diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c new file mode 100644 index 000000000000..79495cd7a794 --- /dev/null +++ b/fs/bcachefs/btree_cache.c @@ -0,0 +1,1215 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" +#include "bkey_buf.h" +#include "btree_cache.h" +#include "btree_io.h" +#include "btree_iter.h" +#include "btree_locking.h" +#include "debug.h" +#include "errcode.h" +#include "error.h" +#include "journal.h" +#include "trace.h" + +#include <linux/prefetch.h> +#include <linux/sched/mm.h> + +const char * const bch2_btree_node_flags[] = { +#define x(f) #f, + BTREE_FLAGS() +#undef x + NULL +}; + +void bch2_recalc_btree_reserve(struct bch_fs *c) +{ + unsigned i, reserve = 16; + + if (!c->btree_roots_known[0].b) + reserve += 8; + + for (i = 0; i < btree_id_nr_alive(c); i++) { + struct btree_root *r = bch2_btree_id_root(c, i); + + if (r->b) + reserve += min_t(unsigned, 1, r->b->c.level) * 8; + } + + c->btree_cache.reserve = reserve; +} + +static inline unsigned btree_cache_can_free(struct btree_cache *bc) +{ + return max_t(int, 0, bc->used - bc->reserve); +} + +static void btree_node_to_freedlist(struct btree_cache *bc, struct btree *b) +{ + if (b->c.lock.readers) + list_move(&b->list, &bc->freed_pcpu); + else + list_move(&b->list, &bc->freed_nonpcpu); +} + +static void btree_node_data_free(struct bch_fs *c, struct btree *b) +{ + struct btree_cache *bc = &c->btree_cache; + + EBUG_ON(btree_node_write_in_flight(b)); + + clear_btree_node_just_written(b); + + kvpfree(b->data, btree_bytes(c)); + b->data = NULL; +#ifdef __KERNEL__ + kvfree(b->aux_data); +#else + munmap(b->aux_data, btree_aux_data_bytes(b)); +#endif + b->aux_data = NULL; + + bc->used--; + + btree_node_to_freedlist(bc, b); +} + +static int bch2_btree_cache_cmp_fn(struct rhashtable_compare_arg *arg, + const void *obj) +{ + const struct btree *b = obj; + const u64 *v = arg->key; + + return b->hash_val == *v ? 0 : 1; +} + +static const struct rhashtable_params bch_btree_cache_params = { + .head_offset = offsetof(struct btree, hash), + .key_offset = offsetof(struct btree, hash_val), + .key_len = sizeof(u64), + .obj_cmpfn = bch2_btree_cache_cmp_fn, +}; + +static int btree_node_data_alloc(struct bch_fs *c, struct btree *b, gfp_t gfp) +{ + BUG_ON(b->data || b->aux_data); + + b->data = kvpmalloc(btree_bytes(c), gfp); + if (!b->data) + return -BCH_ERR_ENOMEM_btree_node_mem_alloc; +#ifdef __KERNEL__ + b->aux_data = kvmalloc(btree_aux_data_bytes(b), gfp); +#else + b->aux_data = mmap(NULL, btree_aux_data_bytes(b), + PROT_READ|PROT_WRITE|PROT_EXEC, + MAP_PRIVATE|MAP_ANONYMOUS, 0, 0); + if (b->aux_data == MAP_FAILED) + b->aux_data = NULL; +#endif + if (!b->aux_data) { + kvpfree(b->data, btree_bytes(c)); + b->data = NULL; + return -BCH_ERR_ENOMEM_btree_node_mem_alloc; + } + + return 0; +} + +static struct btree *__btree_node_mem_alloc(struct bch_fs *c, gfp_t gfp) +{ + struct btree *b; + + b = kzalloc(sizeof(struct btree), gfp); + if (!b) + return NULL; + + bkey_btree_ptr_init(&b->key); + INIT_LIST_HEAD(&b->list); + INIT_LIST_HEAD(&b->write_blocked); + b->byte_order = ilog2(btree_bytes(c)); + return b; +} + +struct btree *__bch2_btree_node_mem_alloc(struct bch_fs *c) +{ + struct btree_cache *bc = &c->btree_cache; + struct btree *b; + + b = __btree_node_mem_alloc(c, GFP_KERNEL); + if (!b) + return NULL; + + if (btree_node_data_alloc(c, b, GFP_KERNEL)) { + kfree(b); + return NULL; + } + + bch2_btree_lock_init(&b->c, 0); + + bc->used++; + list_add(&b->list, &bc->freeable); + return b; +} + +/* Btree in memory cache - hash table */ + +void bch2_btree_node_hash_remove(struct btree_cache *bc, struct btree *b) +{ + int ret = rhashtable_remove_fast(&bc->table, &b->hash, bch_btree_cache_params); + + BUG_ON(ret); + + /* Cause future lookups for this node to fail: */ + b->hash_val = 0; +} + +int __bch2_btree_node_hash_insert(struct btree_cache *bc, struct btree *b) +{ + BUG_ON(b->hash_val); + b->hash_val = btree_ptr_hash_val(&b->key); + + return rhashtable_lookup_insert_fast(&bc->table, &b->hash, + bch_btree_cache_params); +} + +int bch2_btree_node_hash_insert(struct btree_cache *bc, struct btree *b, + unsigned level, enum btree_id id) +{ + int ret; + + b->c.level = level; + b->c.btree_id = id; + + mutex_lock(&bc->lock); + ret = __bch2_btree_node_hash_insert(bc, b); + if (!ret) + list_add_tail(&b->list, &bc->live); + mutex_unlock(&bc->lock); + + return ret; +} + +__flatten +static inline struct btree *btree_cache_find(struct btree_cache *bc, + const struct bkey_i *k) +{ + u64 v = btree_ptr_hash_val(k); + + return rhashtable_lookup_fast(&bc->table, &v, bch_btree_cache_params); +} + +/* + * this version is for btree nodes that have already been freed (we're not + * reaping a real btree node) + */ +static int __btree_node_reclaim(struct bch_fs *c, struct btree *b, bool flush) +{ + struct btree_cache *bc = &c->btree_cache; + int ret = 0; + + lockdep_assert_held(&bc->lock); +wait_on_io: + if (b->flags & ((1U << BTREE_NODE_dirty)| + (1U << BTREE_NODE_read_in_flight)| + (1U << BTREE_NODE_write_in_flight))) { + if (!flush) + return -BCH_ERR_ENOMEM_btree_node_reclaim; + + /* XXX: waiting on IO with btree cache lock held */ + bch2_btree_node_wait_on_read(b); + bch2_btree_node_wait_on_write(b); + } + + if (!six_trylock_intent(&b->c.lock)) + return -BCH_ERR_ENOMEM_btree_node_reclaim; + + if (!six_trylock_write(&b->c.lock)) + goto out_unlock_intent; + + /* recheck under lock */ + if (b->flags & ((1U << BTREE_NODE_read_in_flight)| + (1U << BTREE_NODE_write_in_flight))) { + if (!flush) + goto out_unlock; + six_unlock_write(&b->c.lock); + six_unlock_intent(&b->c.lock); + goto wait_on_io; + } + + if (btree_node_noevict(b) || + btree_node_write_blocked(b) || + btree_node_will_make_reachable(b)) + goto out_unlock; + + if (btree_node_dirty(b)) { + if (!flush) + goto out_unlock; + /* + * Using the underscore version because we don't want to compact + * bsets after the write, since this node is about to be evicted + * - unless btree verify mode is enabled, since it runs out of + * the post write cleanup: + */ + if (bch2_verify_btree_ondisk) + bch2_btree_node_write(c, b, SIX_LOCK_intent, + BTREE_WRITE_cache_reclaim); + else + __bch2_btree_node_write(c, b, + BTREE_WRITE_cache_reclaim); + + six_unlock_write(&b->c.lock); + six_unlock_intent(&b->c.lock); + goto wait_on_io; + } +out: + if (b->hash_val && !ret) + trace_and_count(c, btree_cache_reap, c, b); + return ret; +out_unlock: + six_unlock_write(&b->c.lock); +out_unlock_intent: + six_unlock_intent(&b->c.lock); + ret = -BCH_ERR_ENOMEM_btree_node_reclaim; + goto out; +} + +static int btree_node_reclaim(struct bch_fs *c, struct btree *b) +{ + return __btree_node_reclaim(c, b, false); +} + +static int btree_node_write_and_reclaim(struct bch_fs *c, struct btree *b) +{ + return __btree_node_reclaim(c, b, true); +} + +static unsigned long bch2_btree_cache_scan(struct shrinker *shrink, + struct shrink_control *sc) +{ + struct bch_fs *c = shrink->private_data; + struct btree_cache *bc = &c->btree_cache; + struct btree *b, *t; + unsigned long nr = sc->nr_to_scan; + unsigned long can_free = 0; + unsigned long freed = 0; + unsigned long touched = 0; + unsigned i, flags; + unsigned long ret = SHRINK_STOP; + bool trigger_writes = atomic_read(&bc->dirty) + nr >= + bc->used * 3 / 4; + + if (bch2_btree_shrinker_disabled) + return SHRINK_STOP; + + mutex_lock(&bc->lock); + flags = memalloc_nofs_save(); + + /* + * It's _really_ critical that we don't free too many btree nodes - we + * have to always leave ourselves a reserve. The reserve is how we + * guarantee that allocating memory for a new btree node can always + * succeed, so that inserting keys into the btree can always succeed and + * IO can always make forward progress: + */ + can_free = btree_cache_can_free(bc); + nr = min_t(unsigned long, nr, can_free); + + i = 0; + list_for_each_entry_safe(b, t, &bc->freeable, list) { + /* + * Leave a few nodes on the freeable list, so that a btree split + * won't have to hit the system allocator: + */ + if (++i <= 3) + continue; + + touched++; + + if (touched >= nr) + goto out; + + if (!btree_node_reclaim(c, b)) { + btree_node_data_free(c, b); + six_unlock_write(&b->c.lock); + six_unlock_intent(&b->c.lock); + freed++; + } + } +restart: + list_for_each_entry_safe(b, t, &bc->live, list) { + touched++; + + if (btree_node_accessed(b)) { + clear_btree_node_accessed(b); + } else if (!btree_node_reclaim(c, b)) { + freed++; + btree_node_data_free(c, b); + + bch2_btree_node_hash_remove(bc, b); + six_unlock_write(&b->c.lock); + six_unlock_intent(&b->c.lock); + + if (freed == nr) + goto out_rotate; + } else if (trigger_writes && + btree_node_dirty(b) && + !btree_node_will_make_reachable(b) && + !btree_node_write_blocked(b) && + six_trylock_read(&b->c.lock)) { + list_move(&bc->live, &b->list); + mutex_unlock(&bc->lock); + __bch2_btree_node_write(c, b, BTREE_WRITE_cache_reclaim); + six_unlock_read(&b->c.lock); + if (touched >= nr) + goto out_nounlock; + mutex_lock(&bc->lock); + goto restart; + } + + if (touched >= nr) + break; + } +out_rotate: + if (&t->list != &bc->live) + list_move_tail(&bc->live, &t->list); +out: + mutex_unlock(&bc->lock); +out_nounlock: + ret = freed; + memalloc_nofs_restore(flags); + trace_and_count(c, btree_cache_scan, sc->nr_to_scan, can_free, ret); + return ret; +} + +static unsigned long bch2_btree_cache_count(struct shrinker *shrink, + struct shrink_control *sc) +{ + struct bch_fs *c = shrink->private_data; + struct btree_cache *bc = &c->btree_cache; + + if (bch2_btree_shrinker_disabled) + return 0; + + return btree_cache_can_free(bc); +} + +void bch2_fs_btree_cache_exit(struct bch_fs *c) +{ + struct btree_cache *bc = &c->btree_cache; + struct btree *b; + unsigned i, flags; + + shrinker_free(bc->shrink); + + /* vfree() can allocate memory: */ + flags = memalloc_nofs_save(); + mutex_lock(&bc->lock); + + if (c->verify_data) + list_move(&c->verify_data->list, &bc->live); + + kvpfree(c->verify_ondisk, btree_bytes(c)); + + for (i = 0; i < btree_id_nr_alive(c); i++) { + struct btree_root *r = bch2_btree_id_root(c, i); + + if (r->b) + list_add(&r->b->list, &bc->live); + } + + list_splice(&bc->freeable, &bc->live); + + while (!list_empty(&bc->live)) { + b = list_first_entry(&bc->live, struct btree, list); + + BUG_ON(btree_node_read_in_flight(b) || + btree_node_write_in_flight(b)); + + btree_node_data_free(c, b); + } + + BUG_ON(!bch2_journal_error(&c->journal) && + atomic_read(&c->btree_cache.dirty)); + + list_splice(&bc->freed_pcpu, &bc->freed_nonpcpu); + + while (!list_empty(&bc->freed_nonpcpu)) { + b = list_first_entry(&bc->freed_nonpcpu, struct btree, list); + list_del(&b->list); + six_lock_exit(&b->c.lock); + kfree(b); + } + + mutex_unlock(&bc->lock); + memalloc_nofs_restore(flags); + + if (bc->table_init_done) + rhashtable_destroy(&bc->table); +} + +int bch2_fs_btree_cache_init(struct bch_fs *c) +{ + struct btree_cache *bc = &c->btree_cache; + struct shrinker *shrink; + unsigned i; + int ret = 0; + + ret = rhashtable_init(&bc->table, &bch_btree_cache_params); + if (ret) + goto err; + + bc->table_init_done = true; + + bch2_recalc_btree_reserve(c); + + for (i = 0; i < bc->reserve; i++) + if (!__bch2_btree_node_mem_alloc(c)) + goto err; + + list_splice_init(&bc->live, &bc->freeable); + + mutex_init(&c->verify_lock); + + shrink = shrinker_alloc(0, "%s-btree_cache", c->name); + if (!shrink) + goto err; + bc->shrink = shrink; + shrink->count_objects = bch2_btree_cache_count; + shrink->scan_objects = bch2_btree_cache_scan; + shrink->seeks = 4; + shrink->private_data = c; + shrinker_register(shrink); + + return 0; +err: + return -BCH_ERR_ENOMEM_fs_btree_cache_init; +} + +void bch2_fs_btree_cache_init_early(struct btree_cache *bc) +{ + mutex_init(&bc->lock); + INIT_LIST_HEAD(&bc->live); + INIT_LIST_HEAD(&bc->freeable); + INIT_LIST_HEAD(&bc->freed_pcpu); + INIT_LIST_HEAD(&bc->freed_nonpcpu); +} + +/* + * We can only have one thread cannibalizing other cached btree nodes at a time, + * or we'll deadlock. We use an open coded mutex to ensure that, which a + * cannibalize_bucket() will take. This means every time we unlock the root of + * the btree, we need to release this lock if we have it held. + */ +void bch2_btree_cache_cannibalize_unlock(struct bch_fs *c) +{ + struct btree_cache *bc = &c->btree_cache; + + if (bc->alloc_lock == current) { + trace_and_count(c, btree_cache_cannibalize_unlock, c); + bc->alloc_lock = NULL; + closure_wake_up(&bc->alloc_wait); + } +} + +int bch2_btree_cache_cannibalize_lock(struct bch_fs *c, struct closure *cl) +{ + struct btree_cache *bc = &c->btree_cache; + struct task_struct *old; + + old = cmpxchg(&bc->alloc_lock, NULL, current); + if (old == NULL || old == current) + goto success; + + if (!cl) { + trace_and_count(c, btree_cache_cannibalize_lock_fail, c); + return -BCH_ERR_ENOMEM_btree_cache_cannibalize_lock; + } + + closure_wait(&bc->alloc_wait, cl); + + /* Try again, after adding ourselves to waitlist */ + old = cmpxchg(&bc->alloc_lock, NULL, current); + if (old == NULL || old == current) { + /* We raced */ + closure_wake_up(&bc->alloc_wait); + goto success; + } + + trace_and_count(c, btree_cache_cannibalize_lock_fail, c); + return -BCH_ERR_btree_cache_cannibalize_lock_blocked; + +success: + trace_and_count(c, btree_cache_cannibalize_lock, c); + return 0; +} + +static struct btree *btree_node_cannibalize(struct bch_fs *c) +{ + struct btree_cache *bc = &c->btree_cache; + struct btree *b; + + list_for_each_entry_reverse(b, &bc->live, list) + if (!btree_node_reclaim(c, b)) + return b; + + while (1) { + list_for_each_entry_reverse(b, &bc->live, list) + if (!btree_node_write_and_reclaim(c, b)) + return b; + + /* + * Rare case: all nodes were intent-locked. + * Just busy-wait. + */ + WARN_ONCE(1, "btree cache cannibalize failed\n"); + cond_resched(); + } +} + +struct btree *bch2_btree_node_mem_alloc(struct btree_trans *trans, bool pcpu_read_locks) +{ + struct bch_fs *c = trans->c; + struct btree_cache *bc = &c->btree_cache; + struct list_head *freed = pcpu_read_locks + ? &bc->freed_pcpu + : &bc->freed_nonpcpu; + struct btree *b, *b2; + u64 start_time = local_clock(); + unsigned flags; + + flags = memalloc_nofs_save(); + mutex_lock(&bc->lock); + + /* + * We never free struct btree itself, just the memory that holds the on + * disk node. Check the freed list before allocating a new one: + */ + list_for_each_entry(b, freed, list) + if (!btree_node_reclaim(c, b)) { + list_del_init(&b->list); + goto got_node; + } + + b = __btree_node_mem_alloc(c, GFP_NOWAIT|__GFP_NOWARN); + if (!b) { + mutex_unlock(&bc->lock); + bch2_trans_unlock(trans); + b = __btree_node_mem_alloc(c, GFP_KERNEL); + if (!b) + goto err; + mutex_lock(&bc->lock); + } + + bch2_btree_lock_init(&b->c, pcpu_read_locks ? SIX_LOCK_INIT_PCPU : 0); + + BUG_ON(!six_trylock_intent(&b->c.lock)); + BUG_ON(!six_trylock_write(&b->c.lock)); +got_node: + + /* + * btree_free() doesn't free memory; it sticks the node on the end of + * the list. Check if there's any freed nodes there: + */ + list_for_each_entry(b2, &bc->freeable, list) + if (!btree_node_reclaim(c, b2)) { + swap(b->data, b2->data); + swap(b->aux_data, b2->aux_data); + btree_node_to_freedlist(bc, b2); + six_unlock_write(&b2->c.lock); + six_unlock_intent(&b2->c.lock); + goto got_mem; + } + + mutex_unlock(&bc->lock); + + if (btree_node_data_alloc(c, b, GFP_NOWAIT|__GFP_NOWARN)) { + bch2_trans_unlock(trans); + if (btree_node_data_alloc(c, b, GFP_KERNEL|__GFP_NOWARN)) + goto err; + } + + mutex_lock(&bc->lock); + bc->used++; +got_mem: + mutex_unlock(&bc->lock); + + BUG_ON(btree_node_hashed(b)); + BUG_ON(btree_node_dirty(b)); + BUG_ON(btree_node_write_in_flight(b)); +out: + b->flags = 0; + b->written = 0; + b->nsets = 0; + b->sib_u64s[0] = 0; + b->sib_u64s[1] = 0; + b->whiteout_u64s = 0; + bch2_btree_keys_init(b); + set_btree_node_accessed(b); + + bch2_time_stats_update(&c->times[BCH_TIME_btree_node_mem_alloc], + start_time); + + memalloc_nofs_restore(flags); + return b; +err: + mutex_lock(&bc->lock); + + /* Try to cannibalize another cached btree node: */ + if (bc->alloc_lock == current) { + b2 = btree_node_cannibalize(c); + clear_btree_node_just_written(b2); + bch2_btree_node_hash_remove(bc, b2); + + if (b) { + swap(b->data, b2->data); + swap(b->aux_data, b2->aux_data); + btree_node_to_freedlist(bc, b2); + six_unlock_write(&b2->c.lock); + six_unlock_intent(&b2->c.lock); + } else { + b = b2; + list_del_init(&b->list); + } + + mutex_unlock(&bc->lock); + + trace_and_count(c, btree_cache_cannibalize, c); + goto out; + } + + mutex_unlock(&bc->lock); + memalloc_nofs_restore(flags); + return ERR_PTR(-BCH_ERR_ENOMEM_btree_node_mem_alloc); +} + +/* Slowpath, don't want it inlined into btree_iter_traverse() */ +static noinline struct btree *bch2_btree_node_fill(struct btree_trans *trans, + struct btree_path *path, + const struct bkey_i *k, + enum btree_id btree_id, + unsigned level, + enum six_lock_type lock_type, + bool sync) +{ + struct bch_fs *c = trans->c; + struct btree_cache *bc = &c->btree_cache; + struct btree *b; + u32 seq; + + BUG_ON(level + 1 >= BTREE_MAX_DEPTH); + /* + * Parent node must be locked, else we could read in a btree node that's + * been freed: + */ + if (path && !bch2_btree_node_relock(trans, path, level + 1)) { + trace_and_count(c, trans_restart_relock_parent_for_fill, trans, _THIS_IP_, path); + return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_fill_relock)); + } + + b = bch2_btree_node_mem_alloc(trans, level != 0); + + if (bch2_err_matches(PTR_ERR_OR_ZERO(b), ENOMEM)) { + trans->memory_allocation_failure = true; + trace_and_count(c, trans_restart_memory_allocation_failure, trans, _THIS_IP_, path); + return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_fill_mem_alloc_fail)); + } + + if (IS_ERR(b)) + return b; + + /* + * Btree nodes read in from disk should not have the accessed bit set + * initially, so that linear scans don't thrash the cache: + */ + clear_btree_node_accessed(b); + + bkey_copy(&b->key, k); + if (bch2_btree_node_hash_insert(bc, b, level, btree_id)) { + /* raced with another fill: */ + + /* mark as unhashed... */ + b->hash_val = 0; + + mutex_lock(&bc->lock); + list_add(&b->list, &bc->freeable); + mutex_unlock(&bc->lock); + + six_unlock_write(&b->c.lock); + six_unlock_intent(&b->c.lock); + return NULL; + } + + set_btree_node_read_in_flight(b); + + six_unlock_write(&b->c.lock); + seq = six_lock_seq(&b->c.lock); + six_unlock_intent(&b->c.lock); + + /* Unlock before doing IO: */ + if (path && sync) + bch2_trans_unlock_noassert(trans); + + bch2_btree_node_read(c, b, sync); + + if (!sync) + return NULL; + + if (path) { + int ret = bch2_trans_relock(trans) ?: + bch2_btree_path_relock_intent(trans, path); + if (ret) { + BUG_ON(!trans->restarted); + return ERR_PTR(ret); + } + } + + if (!six_relock_type(&b->c.lock, lock_type, seq)) { + if (path) + trace_and_count(c, trans_restart_relock_after_fill, trans, _THIS_IP_, path); + return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_relock_after_fill)); + } + + return b; +} + +static noinline void btree_bad_header(struct bch_fs *c, struct btree *b) +{ + struct printbuf buf = PRINTBUF; + + if (c->curr_recovery_pass <= BCH_RECOVERY_PASS_check_allocations) + return; + + prt_printf(&buf, + "btree node header doesn't match ptr\n" + "btree %s level %u\n" + "ptr: ", + bch2_btree_id_str(b->c.btree_id), b->c.level); + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key)); + + prt_printf(&buf, "\nheader: btree %s level %llu\n" + "min ", + bch2_btree_id_str(BTREE_NODE_ID(b->data)), + BTREE_NODE_LEVEL(b->data)); + bch2_bpos_to_text(&buf, b->data->min_key); + + prt_printf(&buf, "\nmax "); + bch2_bpos_to_text(&buf, b->data->max_key); + + bch2_fs_inconsistent(c, "%s", buf.buf); + printbuf_exit(&buf); +} + +static inline void btree_check_header(struct bch_fs *c, struct btree *b) +{ + if (b->c.btree_id != BTREE_NODE_ID(b->data) || + b->c.level != BTREE_NODE_LEVEL(b->data) || + !bpos_eq(b->data->max_key, b->key.k.p) || + (b->key.k.type == KEY_TYPE_btree_ptr_v2 && + !bpos_eq(b->data->min_key, + bkey_i_to_btree_ptr_v2(&b->key)->v.min_key))) + btree_bad_header(c, b); +} + +static struct btree *__bch2_btree_node_get(struct btree_trans *trans, struct btree_path *path, + const struct bkey_i *k, unsigned level, + enum six_lock_type lock_type, + unsigned long trace_ip) +{ + struct bch_fs *c = trans->c; + struct btree_cache *bc = &c->btree_cache; + struct btree *b; + struct bset_tree *t; + bool need_relock = false; + int ret; + + EBUG_ON(level >= BTREE_MAX_DEPTH); +retry: + b = btree_cache_find(bc, k); + if (unlikely(!b)) { + /* + * We must have the parent locked to call bch2_btree_node_fill(), + * else we could read in a btree node from disk that's been + * freed: + */ + b = bch2_btree_node_fill(trans, path, k, path->btree_id, + level, lock_type, true); + need_relock = true; + + /* We raced and found the btree node in the cache */ + if (!b) + goto retry; + + if (IS_ERR(b)) + return b; + } else { + if (btree_node_read_locked(path, level + 1)) + btree_node_unlock(trans, path, level + 1); + + ret = btree_node_lock(trans, path, &b->c, level, lock_type, trace_ip); + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + return ERR_PTR(ret); + + BUG_ON(ret); + + if (unlikely(b->hash_val != btree_ptr_hash_val(k) || + b->c.level != level || + race_fault())) { + six_unlock_type(&b->c.lock, lock_type); + if (bch2_btree_node_relock(trans, path, level + 1)) + goto retry; + + trace_and_count(c, trans_restart_btree_node_reused, trans, trace_ip, path); + return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_lock_node_reused)); + } + + /* avoid atomic set bit if it's not needed: */ + if (!btree_node_accessed(b)) + set_btree_node_accessed(b); + } + + if (unlikely(btree_node_read_in_flight(b))) { + u32 seq = six_lock_seq(&b->c.lock); + + six_unlock_type(&b->c.lock, lock_type); + bch2_trans_unlock(trans); + need_relock = true; + + bch2_btree_node_wait_on_read(b); + + /* + * should_be_locked is not set on this path yet, so we need to + * relock it specifically: + */ + if (!six_relock_type(&b->c.lock, lock_type, seq)) + goto retry; + } + + if (unlikely(need_relock)) { + ret = bch2_trans_relock(trans) ?: + bch2_btree_path_relock_intent(trans, path); + if (ret) { + six_unlock_type(&b->c.lock, lock_type); + return ERR_PTR(ret); + } + } + + prefetch(b->aux_data); + + for_each_bset(b, t) { + void *p = (u64 *) b->aux_data + t->aux_data_offset; + + prefetch(p + L1_CACHE_BYTES * 0); + prefetch(p + L1_CACHE_BYTES * 1); + prefetch(p + L1_CACHE_BYTES * 2); + } + + if (unlikely(btree_node_read_error(b))) { + six_unlock_type(&b->c.lock, lock_type); + return ERR_PTR(-EIO); + } + + EBUG_ON(b->c.btree_id != path->btree_id); + EBUG_ON(BTREE_NODE_LEVEL(b->data) != level); + btree_check_header(c, b); + + return b; +} + +/** + * bch2_btree_node_get - find a btree node in the cache and lock it, reading it + * in from disk if necessary. + * + * @trans: btree transaction object + * @path: btree_path being traversed + * @k: pointer to btree node (generally KEY_TYPE_btree_ptr_v2) + * @level: level of btree node being looked up (0 == leaf node) + * @lock_type: SIX_LOCK_read or SIX_LOCK_intent + * @trace_ip: ip of caller of btree iterator code (i.e. caller of bch2_btree_iter_peek()) + * + * The btree node will have either a read or a write lock held, depending on + * the @write parameter. + * + * Returns: btree node or ERR_PTR() + */ +struct btree *bch2_btree_node_get(struct btree_trans *trans, struct btree_path *path, + const struct bkey_i *k, unsigned level, + enum six_lock_type lock_type, + unsigned long trace_ip) +{ + struct bch_fs *c = trans->c; + struct btree *b; + struct bset_tree *t; + int ret; + + EBUG_ON(level >= BTREE_MAX_DEPTH); + + b = btree_node_mem_ptr(k); + + /* + * Check b->hash_val _before_ calling btree_node_lock() - this might not + * be the node we want anymore, and trying to lock the wrong node could + * cause an unneccessary transaction restart: + */ + if (unlikely(!c->opts.btree_node_mem_ptr_optimization || + !b || + b->hash_val != btree_ptr_hash_val(k))) + return __bch2_btree_node_get(trans, path, k, level, lock_type, trace_ip); + + if (btree_node_read_locked(path, level + 1)) + btree_node_unlock(trans, path, level + 1); + + ret = btree_node_lock(trans, path, &b->c, level, lock_type, trace_ip); + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + return ERR_PTR(ret); + + BUG_ON(ret); + + if (unlikely(b->hash_val != btree_ptr_hash_val(k) || + b->c.level != level || + race_fault())) { + six_unlock_type(&b->c.lock, lock_type); + if (bch2_btree_node_relock(trans, path, level + 1)) + return __bch2_btree_node_get(trans, path, k, level, lock_type, trace_ip); + + trace_and_count(c, trans_restart_btree_node_reused, trans, trace_ip, path); + return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_lock_node_reused)); + } + + if (unlikely(btree_node_read_in_flight(b))) { + six_unlock_type(&b->c.lock, lock_type); + return __bch2_btree_node_get(trans, path, k, level, lock_type, trace_ip); + } + + prefetch(b->aux_data); + + for_each_bset(b, t) { + void *p = (u64 *) b->aux_data + t->aux_data_offset; + + prefetch(p + L1_CACHE_BYTES * 0); + prefetch(p + L1_CACHE_BYTES * 1); + prefetch(p + L1_CACHE_BYTES * 2); + } + + /* avoid atomic set bit if it's not needed: */ + if (!btree_node_accessed(b)) + set_btree_node_accessed(b); + + if (unlikely(btree_node_read_error(b))) { + six_unlock_type(&b->c.lock, lock_type); + return ERR_PTR(-EIO); + } + + EBUG_ON(b->c.btree_id != path->btree_id); + EBUG_ON(BTREE_NODE_LEVEL(b->data) != level); + btree_check_header(c, b); + + return b; +} + +struct btree *bch2_btree_node_get_noiter(struct btree_trans *trans, + const struct bkey_i *k, + enum btree_id btree_id, + unsigned level, + bool nofill) +{ + struct bch_fs *c = trans->c; + struct btree_cache *bc = &c->btree_cache; + struct btree *b; + struct bset_tree *t; + int ret; + + EBUG_ON(level >= BTREE_MAX_DEPTH); + + if (c->opts.btree_node_mem_ptr_optimization) { + b = btree_node_mem_ptr(k); + if (b) + goto lock_node; + } +retry: + b = btree_cache_find(bc, k); + if (unlikely(!b)) { + if (nofill) + goto out; + + b = bch2_btree_node_fill(trans, NULL, k, btree_id, + level, SIX_LOCK_read, true); + + /* We raced and found the btree node in the cache */ + if (!b) + goto retry; + + if (IS_ERR(b) && + !bch2_btree_cache_cannibalize_lock(c, NULL)) + goto retry; + + if (IS_ERR(b)) + goto out; + } else { +lock_node: + ret = btree_node_lock_nopath(trans, &b->c, SIX_LOCK_read, _THIS_IP_); + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + return ERR_PTR(ret); + + BUG_ON(ret); + + if (unlikely(b->hash_val != btree_ptr_hash_val(k) || + b->c.btree_id != btree_id || + b->c.level != level)) { + six_unlock_read(&b->c.lock); + goto retry; + } + } + + /* XXX: waiting on IO with btree locks held: */ + __bch2_btree_node_wait_on_read(b); + + prefetch(b->aux_data); + + for_each_bset(b, t) { + void *p = (u64 *) b->aux_data + t->aux_data_offset; + + prefetch(p + L1_CACHE_BYTES * 0); + prefetch(p + L1_CACHE_BYTES * 1); + prefetch(p + L1_CACHE_BYTES * 2); + } + + /* avoid atomic set bit if it's not needed: */ + if (!btree_node_accessed(b)) + set_btree_node_accessed(b); + + if (unlikely(btree_node_read_error(b))) { + six_unlock_read(&b->c.lock); + b = ERR_PTR(-EIO); + goto out; + } + + EBUG_ON(b->c.btree_id != btree_id); + EBUG_ON(BTREE_NODE_LEVEL(b->data) != level); + btree_check_header(c, b); +out: + bch2_btree_cache_cannibalize_unlock(c); + return b; +} + +int bch2_btree_node_prefetch(struct btree_trans *trans, + struct btree_path *path, + const struct bkey_i *k, + enum btree_id btree_id, unsigned level) +{ + struct bch_fs *c = trans->c; + struct btree_cache *bc = &c->btree_cache; + struct btree *b; + + BUG_ON(trans && !btree_node_locked(path, level + 1)); + BUG_ON(level >= BTREE_MAX_DEPTH); + + b = btree_cache_find(bc, k); + if (b) + return 0; + + b = bch2_btree_node_fill(trans, path, k, btree_id, + level, SIX_LOCK_read, false); + return PTR_ERR_OR_ZERO(b); +} + +void bch2_btree_node_evict(struct btree_trans *trans, const struct bkey_i *k) +{ + struct bch_fs *c = trans->c; + struct btree_cache *bc = &c->btree_cache; + struct btree *b; + + b = btree_cache_find(bc, k); + if (!b) + return; +wait_on_io: + /* not allowed to wait on io with btree locks held: */ + + /* XXX we're called from btree_gc which will be holding other btree + * nodes locked + */ + __bch2_btree_node_wait_on_read(b); + __bch2_btree_node_wait_on_write(b); + + btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_intent); + btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_write); + + if (btree_node_dirty(b)) { + __bch2_btree_node_write(c, b, BTREE_WRITE_cache_reclaim); + six_unlock_write(&b->c.lock); + six_unlock_intent(&b->c.lock); + goto wait_on_io; + } + + BUG_ON(btree_node_dirty(b)); + + mutex_lock(&bc->lock); + btree_node_data_free(c, b); + bch2_btree_node_hash_remove(bc, b); + mutex_unlock(&bc->lock); + + six_unlock_write(&b->c.lock); + six_unlock_intent(&b->c.lock); +} + +const char *bch2_btree_id_str(enum btree_id btree) +{ + return btree < BTREE_ID_NR ? __bch2_btree_ids[btree] : "(unknown)"; +} + +void bch2_btree_pos_to_text(struct printbuf *out, struct bch_fs *c, const struct btree *b) +{ + prt_printf(out, "%s level %u/%u\n ", + bch2_btree_id_str(b->c.btree_id), + b->c.level, + bch2_btree_id_root(c, b->c.btree_id)->level); + bch2_bkey_val_to_text(out, c, bkey_i_to_s_c(&b->key)); +} + +void bch2_btree_node_to_text(struct printbuf *out, struct bch_fs *c, const struct btree *b) +{ + struct bset_stats stats; + + memset(&stats, 0, sizeof(stats)); + + bch2_btree_keys_stats(b, &stats); + + prt_printf(out, "l %u ", b->c.level); + bch2_bpos_to_text(out, b->data->min_key); + prt_printf(out, " - "); + bch2_bpos_to_text(out, b->data->max_key); + prt_printf(out, ":\n" + " ptrs: "); + bch2_val_to_text(out, c, bkey_i_to_s_c(&b->key)); + prt_newline(out); + + prt_printf(out, + " format: "); + bch2_bkey_format_to_text(out, &b->format); + + prt_printf(out, + " unpack fn len: %u\n" + " bytes used %zu/%zu (%zu%% full)\n" + " sib u64s: %u, %u (merge threshold %u)\n" + " nr packed keys %u\n" + " nr unpacked keys %u\n" + " floats %zu\n" + " failed unpacked %zu\n", + b->unpack_fn_len, + b->nr.live_u64s * sizeof(u64), + btree_bytes(c) - sizeof(struct btree_node), + b->nr.live_u64s * 100 / btree_max_u64s(c), + b->sib_u64s[0], + b->sib_u64s[1], + c->btree_foreground_merge_threshold, + b->nr.packed_keys, + b->nr.unpacked_keys, + stats.floats, + stats.failed); +} + +void bch2_btree_cache_to_text(struct printbuf *out, const struct bch_fs *c) +{ + prt_printf(out, "nr nodes:\t\t%u\n", c->btree_cache.used); + prt_printf(out, "nr dirty:\t\t%u\n", atomic_read(&c->btree_cache.dirty)); + prt_printf(out, "cannibalize lock:\t%p\n", c->btree_cache.alloc_lock); +} diff --git a/fs/bcachefs/btree_cache.h b/fs/bcachefs/btree_cache.h new file mode 100644 index 000000000000..cfb80b201d61 --- /dev/null +++ b/fs/bcachefs/btree_cache.h @@ -0,0 +1,131 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_BTREE_CACHE_H +#define _BCACHEFS_BTREE_CACHE_H + +#include "bcachefs.h" +#include "btree_types.h" +#include "bkey_methods.h" + +extern const char * const bch2_btree_node_flags[]; + +struct btree_iter; + +void bch2_recalc_btree_reserve(struct bch_fs *); + +void bch2_btree_node_hash_remove(struct btree_cache *, struct btree *); +int __bch2_btree_node_hash_insert(struct btree_cache *, struct btree *); +int bch2_btree_node_hash_insert(struct btree_cache *, struct btree *, + unsigned, enum btree_id); + +void bch2_btree_cache_cannibalize_unlock(struct bch_fs *); +int bch2_btree_cache_cannibalize_lock(struct bch_fs *, struct closure *); + +struct btree *__bch2_btree_node_mem_alloc(struct bch_fs *); +struct btree *bch2_btree_node_mem_alloc(struct btree_trans *, bool); + +struct btree *bch2_btree_node_get(struct btree_trans *, struct btree_path *, + const struct bkey_i *, unsigned, + enum six_lock_type, unsigned long); + +struct btree *bch2_btree_node_get_noiter(struct btree_trans *, const struct bkey_i *, + enum btree_id, unsigned, bool); + +int bch2_btree_node_prefetch(struct btree_trans *, struct btree_path *, + const struct bkey_i *, enum btree_id, unsigned); + +void bch2_btree_node_evict(struct btree_trans *, const struct bkey_i *); + +void bch2_fs_btree_cache_exit(struct bch_fs *); +int bch2_fs_btree_cache_init(struct bch_fs *); +void bch2_fs_btree_cache_init_early(struct btree_cache *); + +static inline u64 btree_ptr_hash_val(const struct bkey_i *k) +{ + switch (k->k.type) { + case KEY_TYPE_btree_ptr: + return *((u64 *) bkey_i_to_btree_ptr_c(k)->v.start); + case KEY_TYPE_btree_ptr_v2: + /* + * The cast/deref is only necessary to avoid sparse endianness + * warnings: + */ + return *((u64 *) &bkey_i_to_btree_ptr_v2_c(k)->v.seq); + default: + return 0; + } +} + +static inline struct btree *btree_node_mem_ptr(const struct bkey_i *k) +{ + return k->k.type == KEY_TYPE_btree_ptr_v2 + ? (void *)(unsigned long)bkey_i_to_btree_ptr_v2_c(k)->v.mem_ptr + : NULL; +} + +/* is btree node in hash table? */ +static inline bool btree_node_hashed(struct btree *b) +{ + return b->hash_val != 0; +} + +#define for_each_cached_btree(_b, _c, _tbl, _iter, _pos) \ + for ((_tbl) = rht_dereference_rcu((_c)->btree_cache.table.tbl, \ + &(_c)->btree_cache.table), \ + _iter = 0; _iter < (_tbl)->size; _iter++) \ + rht_for_each_entry_rcu((_b), (_pos), _tbl, _iter, hash) + +static inline size_t btree_bytes(struct bch_fs *c) +{ + return c->opts.btree_node_size; +} + +static inline size_t btree_max_u64s(struct bch_fs *c) +{ + return (btree_bytes(c) - sizeof(struct btree_node)) / sizeof(u64); +} + +static inline size_t btree_pages(struct bch_fs *c) +{ + return btree_bytes(c) / PAGE_SIZE; +} + +static inline unsigned btree_blocks(struct bch_fs *c) +{ + return btree_sectors(c) >> c->block_bits; +} + +#define BTREE_SPLIT_THRESHOLD(c) (btree_max_u64s(c) * 2 / 3) + +#define BTREE_FOREGROUND_MERGE_THRESHOLD(c) (btree_max_u64s(c) * 1 / 3) +#define BTREE_FOREGROUND_MERGE_HYSTERESIS(c) \ + (BTREE_FOREGROUND_MERGE_THRESHOLD(c) + \ + (BTREE_FOREGROUND_MERGE_THRESHOLD(c) >> 2)) + +static inline unsigned btree_id_nr_alive(struct bch_fs *c) +{ + return BTREE_ID_NR + c->btree_roots_extra.nr; +} + +static inline struct btree_root *bch2_btree_id_root(struct bch_fs *c, unsigned id) +{ + if (likely(id < BTREE_ID_NR)) { + return &c->btree_roots_known[id]; + } else { + unsigned idx = id - BTREE_ID_NR; + + EBUG_ON(idx >= c->btree_roots_extra.nr); + return &c->btree_roots_extra.data[idx]; + } +} + +static inline struct btree *btree_node_root(struct bch_fs *c, struct btree *b) +{ + return bch2_btree_id_root(c, b->c.btree_id)->b; +} + +const char *bch2_btree_id_str(enum btree_id); +void bch2_btree_pos_to_text(struct printbuf *, struct bch_fs *, const struct btree *); +void bch2_btree_node_to_text(struct printbuf *, struct bch_fs *, const struct btree *); +void bch2_btree_cache_to_text(struct printbuf *, const struct bch_fs *); + +#endif /* _BCACHEFS_BTREE_CACHE_H */ diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c new file mode 100644 index 000000000000..30ab78a24517 --- /dev/null +++ b/fs/bcachefs/btree_gc.c @@ -0,0 +1,2146 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2010 Kent Overstreet <kent.overstreet@gmail.com> + * Copyright (C) 2014 Datera Inc. + */ + +#include "bcachefs.h" +#include "alloc_background.h" +#include "alloc_foreground.h" +#include "bkey_methods.h" +#include "bkey_buf.h" +#include "btree_journal_iter.h" +#include "btree_key_cache.h" +#include "btree_locking.h" +#include "btree_update_interior.h" +#include "btree_io.h" +#include "btree_gc.h" +#include "buckets.h" +#include "clock.h" +#include "debug.h" +#include "ec.h" +#include "error.h" +#include "extents.h" +#include "journal.h" +#include "keylist.h" +#include "move.h" +#include "recovery.h" +#include "reflink.h" +#include "replicas.h" +#include "super-io.h" +#include "trace.h" + +#include <linux/slab.h> +#include <linux/bitops.h> +#include <linux/freezer.h> +#include <linux/kthread.h> +#include <linux/preempt.h> +#include <linux/rcupdate.h> +#include <linux/sched/task.h> + +#define DROP_THIS_NODE 10 +#define DROP_PREV_NODE 11 + +static bool should_restart_for_topology_repair(struct bch_fs *c) +{ + return c->opts.fix_errors != FSCK_FIX_no && + !(c->recovery_passes_complete & BIT_ULL(BCH_RECOVERY_PASS_check_topology)); +} + +static inline void __gc_pos_set(struct bch_fs *c, struct gc_pos new_pos) +{ + preempt_disable(); + write_seqcount_begin(&c->gc_pos_lock); + c->gc_pos = new_pos; + write_seqcount_end(&c->gc_pos_lock); + preempt_enable(); +} + +static inline void gc_pos_set(struct bch_fs *c, struct gc_pos new_pos) +{ + BUG_ON(gc_pos_cmp(new_pos, c->gc_pos) <= 0); + __gc_pos_set(c, new_pos); +} + +/* + * Missing: if an interior btree node is empty, we need to do something - + * perhaps just kill it + */ +static int bch2_gc_check_topology(struct bch_fs *c, + struct btree *b, + struct bkey_buf *prev, + struct bkey_buf cur, + bool is_last) +{ + struct bpos node_start = b->data->min_key; + struct bpos node_end = b->data->max_key; + struct bpos expected_start = bkey_deleted(&prev->k->k) + ? node_start + : bpos_successor(prev->k->k.p); + struct printbuf buf1 = PRINTBUF, buf2 = PRINTBUF; + int ret = 0; + + if (cur.k->k.type == KEY_TYPE_btree_ptr_v2) { + struct bkey_i_btree_ptr_v2 *bp = bkey_i_to_btree_ptr_v2(cur.k); + + if (!bpos_eq(expected_start, bp->v.min_key)) { + bch2_topology_error(c); + + if (bkey_deleted(&prev->k->k)) { + prt_printf(&buf1, "start of node: "); + bch2_bpos_to_text(&buf1, node_start); + } else { + bch2_bkey_val_to_text(&buf1, c, bkey_i_to_s_c(prev->k)); + } + bch2_bkey_val_to_text(&buf2, c, bkey_i_to_s_c(cur.k)); + + if (__fsck_err(c, + FSCK_CAN_FIX| + FSCK_CAN_IGNORE| + FSCK_NO_RATELIMIT, + btree_node_topology_bad_min_key, + "btree node with incorrect min_key at btree %s level %u:\n" + " prev %s\n" + " cur %s", + bch2_btree_id_str(b->c.btree_id), b->c.level, + buf1.buf, buf2.buf) && should_restart_for_topology_repair(c)) { + bch_info(c, "Halting mark and sweep to start topology repair pass"); + ret = bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_check_topology); + goto err; + } else { + set_bit(BCH_FS_INITIAL_GC_UNFIXED, &c->flags); + } + } + } + + if (is_last && !bpos_eq(cur.k->k.p, node_end)) { + bch2_topology_error(c); + + printbuf_reset(&buf1); + printbuf_reset(&buf2); + + bch2_bkey_val_to_text(&buf1, c, bkey_i_to_s_c(cur.k)); + bch2_bpos_to_text(&buf2, node_end); + + if (__fsck_err(c, FSCK_CAN_FIX|FSCK_CAN_IGNORE|FSCK_NO_RATELIMIT, + btree_node_topology_bad_max_key, + "btree node with incorrect max_key at btree %s level %u:\n" + " %s\n" + " expected %s", + bch2_btree_id_str(b->c.btree_id), b->c.level, + buf1.buf, buf2.buf) && + should_restart_for_topology_repair(c)) { + bch_info(c, "Halting mark and sweep to start topology repair pass"); + ret = bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_check_topology); + goto err; + } else { + set_bit(BCH_FS_INITIAL_GC_UNFIXED, &c->flags); + } + } + + bch2_bkey_buf_copy(prev, c, cur.k); +err: +fsck_err: + printbuf_exit(&buf2); + printbuf_exit(&buf1); + return ret; +} + +static void btree_ptr_to_v2(struct btree *b, struct bkey_i_btree_ptr_v2 *dst) +{ + switch (b->key.k.type) { + case KEY_TYPE_btree_ptr: { + struct bkey_i_btree_ptr *src = bkey_i_to_btree_ptr(&b->key); + + dst->k.p = src->k.p; + dst->v.mem_ptr = 0; + dst->v.seq = b->data->keys.seq; + dst->v.sectors_written = 0; + dst->v.flags = 0; + dst->v.min_key = b->data->min_key; + set_bkey_val_bytes(&dst->k, sizeof(dst->v) + bkey_val_bytes(&src->k)); + memcpy(dst->v.start, src->v.start, bkey_val_bytes(&src->k)); + break; + } + case KEY_TYPE_btree_ptr_v2: + bkey_copy(&dst->k_i, &b->key); + break; + default: + BUG(); + } +} + +static void bch2_btree_node_update_key_early(struct btree_trans *trans, + enum btree_id btree, unsigned level, + struct bkey_s_c old, struct bkey_i *new) +{ + struct bch_fs *c = trans->c; + struct btree *b; + struct bkey_buf tmp; + int ret; + + bch2_bkey_buf_init(&tmp); + bch2_bkey_buf_reassemble(&tmp, c, old); + + b = bch2_btree_node_get_noiter(trans, tmp.k, btree, level, true); + if (!IS_ERR_OR_NULL(b)) { + mutex_lock(&c->btree_cache.lock); + + bch2_btree_node_hash_remove(&c->btree_cache, b); + + bkey_copy(&b->key, new); + ret = __bch2_btree_node_hash_insert(&c->btree_cache, b); + BUG_ON(ret); + + mutex_unlock(&c->btree_cache.lock); + six_unlock_read(&b->c.lock); + } + + bch2_bkey_buf_exit(&tmp, c); +} + +static int set_node_min(struct bch_fs *c, struct btree *b, struct bpos new_min) +{ + struct bkey_i_btree_ptr_v2 *new; + int ret; + + new = kmalloc_array(BKEY_BTREE_PTR_U64s_MAX, sizeof(u64), GFP_KERNEL); + if (!new) + return -BCH_ERR_ENOMEM_gc_repair_key; + + btree_ptr_to_v2(b, new); + b->data->min_key = new_min; + new->v.min_key = new_min; + SET_BTREE_PTR_RANGE_UPDATED(&new->v, true); + + ret = bch2_journal_key_insert_take(c, b->c.btree_id, b->c.level + 1, &new->k_i); + if (ret) { + kfree(new); + return ret; + } + + bch2_btree_node_drop_keys_outside_node(b); + bkey_copy(&b->key, &new->k_i); + return 0; +} + +static int set_node_max(struct bch_fs *c, struct btree *b, struct bpos new_max) +{ + struct bkey_i_btree_ptr_v2 *new; + int ret; + + ret = bch2_journal_key_delete(c, b->c.btree_id, b->c.level + 1, b->key.k.p); + if (ret) + return ret; + + new = kmalloc_array(BKEY_BTREE_PTR_U64s_MAX, sizeof(u64), GFP_KERNEL); + if (!new) + return -BCH_ERR_ENOMEM_gc_repair_key; + + btree_ptr_to_v2(b, new); + b->data->max_key = new_max; + new->k.p = new_max; + SET_BTREE_PTR_RANGE_UPDATED(&new->v, true); + + ret = bch2_journal_key_insert_take(c, b->c.btree_id, b->c.level + 1, &new->k_i); + if (ret) { + kfree(new); + return ret; + } + + bch2_btree_node_drop_keys_outside_node(b); + + mutex_lock(&c->btree_cache.lock); + bch2_btree_node_hash_remove(&c->btree_cache, b); + + bkey_copy(&b->key, &new->k_i); + ret = __bch2_btree_node_hash_insert(&c->btree_cache, b); + BUG_ON(ret); + mutex_unlock(&c->btree_cache.lock); + return 0; +} + +static int btree_repair_node_boundaries(struct bch_fs *c, struct btree *b, + struct btree *prev, struct btree *cur) +{ + struct bpos expected_start = !prev + ? b->data->min_key + : bpos_successor(prev->key.k.p); + struct printbuf buf1 = PRINTBUF, buf2 = PRINTBUF; + int ret = 0; + + if (!prev) { + prt_printf(&buf1, "start of node: "); + bch2_bpos_to_text(&buf1, b->data->min_key); + } else { + bch2_bkey_val_to_text(&buf1, c, bkey_i_to_s_c(&prev->key)); + } + + bch2_bkey_val_to_text(&buf2, c, bkey_i_to_s_c(&cur->key)); + + if (prev && + bpos_gt(expected_start, cur->data->min_key) && + BTREE_NODE_SEQ(cur->data) > BTREE_NODE_SEQ(prev->data)) { + /* cur overwrites prev: */ + + if (mustfix_fsck_err_on(bpos_ge(prev->data->min_key, + cur->data->min_key), c, + btree_node_topology_overwritten_by_next_node, + "btree node overwritten by next node at btree %s level %u:\n" + " node %s\n" + " next %s", + bch2_btree_id_str(b->c.btree_id), b->c.level, + buf1.buf, buf2.buf)) { + ret = DROP_PREV_NODE; + goto out; + } + + if (mustfix_fsck_err_on(!bpos_eq(prev->key.k.p, + bpos_predecessor(cur->data->min_key)), c, + btree_node_topology_bad_max_key, + "btree node with incorrect max_key at btree %s level %u:\n" + " node %s\n" + " next %s", + bch2_btree_id_str(b->c.btree_id), b->c.level, + buf1.buf, buf2.buf)) + ret = set_node_max(c, prev, + bpos_predecessor(cur->data->min_key)); + } else { + /* prev overwrites cur: */ + + if (mustfix_fsck_err_on(bpos_ge(expected_start, + cur->data->max_key), c, + btree_node_topology_overwritten_by_prev_node, + "btree node overwritten by prev node at btree %s level %u:\n" + " prev %s\n" + " node %s", + bch2_btree_id_str(b->c.btree_id), b->c.level, + buf1.buf, buf2.buf)) { + ret = DROP_THIS_NODE; + goto out; + } + + if (mustfix_fsck_err_on(!bpos_eq(expected_start, cur->data->min_key), c, + btree_node_topology_bad_min_key, + "btree node with incorrect min_key at btree %s level %u:\n" + " prev %s\n" + " node %s", + bch2_btree_id_str(b->c.btree_id), b->c.level, + buf1.buf, buf2.buf)) + ret = set_node_min(c, cur, expected_start); + } +out: +fsck_err: + printbuf_exit(&buf2); + printbuf_exit(&buf1); + return ret; +} + +static int btree_repair_node_end(struct bch_fs *c, struct btree *b, + struct btree *child) +{ + struct printbuf buf1 = PRINTBUF, buf2 = PRINTBUF; + int ret = 0; + + bch2_bkey_val_to_text(&buf1, c, bkey_i_to_s_c(&child->key)); + bch2_bpos_to_text(&buf2, b->key.k.p); + + if (mustfix_fsck_err_on(!bpos_eq(child->key.k.p, b->key.k.p), c, + btree_node_topology_bad_max_key, + "btree node with incorrect max_key at btree %s level %u:\n" + " %s\n" + " expected %s", + bch2_btree_id_str(b->c.btree_id), b->c.level, + buf1.buf, buf2.buf)) { + ret = set_node_max(c, child, b->key.k.p); + if (ret) + goto err; + } +err: +fsck_err: + printbuf_exit(&buf2); + printbuf_exit(&buf1); + return ret; +} + +static int bch2_btree_repair_topology_recurse(struct btree_trans *trans, struct btree *b) +{ + struct bch_fs *c = trans->c; + struct btree_and_journal_iter iter; + struct bkey_s_c k; + struct bkey_buf prev_k, cur_k; + struct btree *prev = NULL, *cur = NULL; + bool have_child, dropped_children = false; + struct printbuf buf = PRINTBUF; + int ret = 0; + + if (!b->c.level) + return 0; +again: + prev = NULL; + have_child = dropped_children = false; + bch2_bkey_buf_init(&prev_k); + bch2_bkey_buf_init(&cur_k); + bch2_btree_and_journal_iter_init_node_iter(&iter, c, b); + + while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) { + BUG_ON(bpos_lt(k.k->p, b->data->min_key)); + BUG_ON(bpos_gt(k.k->p, b->data->max_key)); + + bch2_btree_and_journal_iter_advance(&iter); + bch2_bkey_buf_reassemble(&cur_k, c, k); + + cur = bch2_btree_node_get_noiter(trans, cur_k.k, + b->c.btree_id, b->c.level - 1, + false); + ret = PTR_ERR_OR_ZERO(cur); + + printbuf_reset(&buf); + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(cur_k.k)); + + if (mustfix_fsck_err_on(ret == -EIO, c, + btree_node_unreadable, + "Topology repair: unreadable btree node at btree %s level %u:\n" + " %s", + bch2_btree_id_str(b->c.btree_id), + b->c.level - 1, + buf.buf)) { + bch2_btree_node_evict(trans, cur_k.k); + ret = bch2_journal_key_delete(c, b->c.btree_id, + b->c.level, cur_k.k->k.p); + cur = NULL; + if (ret) + break; + continue; + } + + if (ret) { + bch_err_msg(c, ret, "getting btree node"); + break; + } + + ret = btree_repair_node_boundaries(c, b, prev, cur); + + if (ret == DROP_THIS_NODE) { + six_unlock_read(&cur->c.lock); + bch2_btree_node_evict(trans, cur_k.k); + ret = bch2_journal_key_delete(c, b->c.btree_id, + b->c.level, cur_k.k->k.p); + cur = NULL; + if (ret) + break; + continue; + } + + if (prev) + six_unlock_read(&prev->c.lock); + prev = NULL; + + if (ret == DROP_PREV_NODE) { + bch2_btree_node_evict(trans, prev_k.k); + ret = bch2_journal_key_delete(c, b->c.btree_id, + b->c.level, prev_k.k->k.p); + if (ret) + break; + + bch2_btree_and_journal_iter_exit(&iter); + bch2_bkey_buf_exit(&prev_k, c); + bch2_bkey_buf_exit(&cur_k, c); + goto again; + } else if (ret) + break; + + prev = cur; + cur = NULL; + bch2_bkey_buf_copy(&prev_k, c, cur_k.k); + } + + if (!ret && !IS_ERR_OR_NULL(prev)) { + BUG_ON(cur); + ret = btree_repair_node_end(c, b, prev); + } + + if (!IS_ERR_OR_NULL(prev)) + six_unlock_read(&prev->c.lock); + prev = NULL; + if (!IS_ERR_OR_NULL(cur)) + six_unlock_read(&cur->c.lock); + cur = NULL; + + if (ret) + goto err; + + bch2_btree_and_journal_iter_exit(&iter); + bch2_btree_and_journal_iter_init_node_iter(&iter, c, b); + + while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) { + bch2_bkey_buf_reassemble(&cur_k, c, k); + bch2_btree_and_journal_iter_advance(&iter); + + cur = bch2_btree_node_get_noiter(trans, cur_k.k, + b->c.btree_id, b->c.level - 1, + false); + ret = PTR_ERR_OR_ZERO(cur); + + if (ret) { + bch_err_msg(c, ret, "getting btree node"); + goto err; + } + + ret = bch2_btree_repair_topology_recurse(trans, cur); + six_unlock_read(&cur->c.lock); + cur = NULL; + + if (ret == DROP_THIS_NODE) { + bch2_btree_node_evict(trans, cur_k.k); + ret = bch2_journal_key_delete(c, b->c.btree_id, + b->c.level, cur_k.k->k.p); + dropped_children = true; + } + + if (ret) + goto err; + + have_child = true; + } + + printbuf_reset(&buf); + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key)); + + if (mustfix_fsck_err_on(!have_child, c, + btree_node_topology_interior_node_empty, + "empty interior btree node at btree %s level %u\n" + " %s", + bch2_btree_id_str(b->c.btree_id), + b->c.level, buf.buf)) + ret = DROP_THIS_NODE; +err: +fsck_err: + if (!IS_ERR_OR_NULL(prev)) + six_unlock_read(&prev->c.lock); + if (!IS_ERR_OR_NULL(cur)) + six_unlock_read(&cur->c.lock); + + bch2_btree_and_journal_iter_exit(&iter); + bch2_bkey_buf_exit(&prev_k, c); + bch2_bkey_buf_exit(&cur_k, c); + + if (!ret && dropped_children) + goto again; + + printbuf_exit(&buf); + return ret; +} + +int bch2_check_topology(struct bch_fs *c) +{ + struct btree_trans *trans = bch2_trans_get(c); + struct btree *b; + unsigned i; + int ret = 0; + + for (i = 0; i < btree_id_nr_alive(c) && !ret; i++) { + struct btree_root *r = bch2_btree_id_root(c, i); + + if (!r->alive) + continue; + + b = r->b; + if (btree_node_fake(b)) + continue; + + btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_read); + ret = bch2_btree_repair_topology_recurse(trans, b); + six_unlock_read(&b->c.lock); + + if (ret == DROP_THIS_NODE) { + bch_err(c, "empty btree root - repair unimplemented"); + ret = -BCH_ERR_fsck_repair_unimplemented; + } + } + + bch2_trans_put(trans); + + return ret; +} + +static int bch2_check_fix_ptrs(struct btree_trans *trans, enum btree_id btree_id, + unsigned level, bool is_root, + struct bkey_s_c *k) +{ + struct bch_fs *c = trans->c; + struct bkey_ptrs_c ptrs_c = bch2_bkey_ptrs_c(*k); + const union bch_extent_entry *entry_c; + struct extent_ptr_decoded p = { 0 }; + bool do_update = false; + struct printbuf buf = PRINTBUF; + int ret = 0; + + /* + * XXX + * use check_bucket_ref here + */ + bkey_for_each_ptr_decode(k->k, ptrs_c, p, entry_c) { + struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev); + struct bucket *g = PTR_GC_BUCKET(ca, &p.ptr); + enum bch_data_type data_type = bch2_bkey_ptr_data_type(*k, &entry_c->ptr); + + if (!g->gen_valid && + (c->opts.reconstruct_alloc || + fsck_err(c, ptr_to_missing_alloc_key, + "bucket %u:%zu data type %s ptr gen %u missing in alloc btree\n" + "while marking %s", + p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), + bch2_data_types[ptr_data_type(k->k, &p.ptr)], + p.ptr.gen, + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, *k), buf.buf)))) { + if (!p.ptr.cached) { + g->gen_valid = true; + g->gen = p.ptr.gen; + } else { + do_update = true; + } + } + + if (gen_cmp(p.ptr.gen, g->gen) > 0 && + (c->opts.reconstruct_alloc || + fsck_err(c, ptr_gen_newer_than_bucket_gen, + "bucket %u:%zu data type %s ptr gen in the future: %u > %u\n" + "while marking %s", + p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), + bch2_data_types[ptr_data_type(k->k, &p.ptr)], + p.ptr.gen, g->gen, + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, *k), buf.buf)))) { + if (!p.ptr.cached) { + g->gen_valid = true; + g->gen = p.ptr.gen; + g->data_type = 0; + g->dirty_sectors = 0; + g->cached_sectors = 0; + set_bit(BCH_FS_NEED_ANOTHER_GC, &c->flags); + } else { + do_update = true; + } + } + + if (gen_cmp(g->gen, p.ptr.gen) > BUCKET_GC_GEN_MAX && + (c->opts.reconstruct_alloc || + fsck_err(c, ptr_gen_newer_than_bucket_gen, + "bucket %u:%zu gen %u data type %s: ptr gen %u too stale\n" + "while marking %s", + p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), g->gen, + bch2_data_types[ptr_data_type(k->k, &p.ptr)], + p.ptr.gen, + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, *k), buf.buf)))) + do_update = true; + + if (!p.ptr.cached && gen_cmp(p.ptr.gen, g->gen) < 0 && + (c->opts.reconstruct_alloc || + fsck_err(c, stale_dirty_ptr, + "bucket %u:%zu data type %s stale dirty ptr: %u < %u\n" + "while marking %s", + p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), + bch2_data_types[ptr_data_type(k->k, &p.ptr)], + p.ptr.gen, g->gen, + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, *k), buf.buf)))) + do_update = true; + + if (data_type != BCH_DATA_btree && p.ptr.gen != g->gen) + continue; + + if (fsck_err_on(bucket_data_type(g->data_type) && + bucket_data_type(g->data_type) != data_type, c, + ptr_bucket_data_type_mismatch, + "bucket %u:%zu different types of data in same bucket: %s, %s\n" + "while marking %s", + p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), + bch2_data_types[g->data_type], + bch2_data_types[data_type], + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, *k), buf.buf))) { + if (data_type == BCH_DATA_btree) { + g->data_type = data_type; + set_bit(BCH_FS_NEED_ANOTHER_GC, &c->flags); + } else { + do_update = true; + } + } + + if (p.has_ec) { + struct gc_stripe *m = genradix_ptr(&c->gc_stripes, p.ec.idx); + + if (fsck_err_on(!m || !m->alive, c, + ptr_to_missing_stripe, + "pointer to nonexistent stripe %llu\n" + "while marking %s", + (u64) p.ec.idx, + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, *k), buf.buf))) + do_update = true; + + if (fsck_err_on(m && m->alive && !bch2_ptr_matches_stripe_m(m, p), c, + ptr_to_incorrect_stripe, + "pointer does not match stripe %llu\n" + "while marking %s", + (u64) p.ec.idx, + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, *k), buf.buf))) + do_update = true; + } + } + + if (do_update) { + struct bkey_ptrs ptrs; + union bch_extent_entry *entry; + struct bch_extent_ptr *ptr; + struct bkey_i *new; + + if (is_root) { + bch_err(c, "cannot update btree roots yet"); + ret = -EINVAL; + goto err; + } + + new = kmalloc(bkey_bytes(k->k), GFP_KERNEL); + if (!new) { + bch_err_msg(c, ret, "allocating new key"); + ret = -BCH_ERR_ENOMEM_gc_repair_key; + goto err; + } + + bkey_reassemble(new, *k); + + if (level) { + /* + * We don't want to drop btree node pointers - if the + * btree node isn't there anymore, the read path will + * sort it out: + */ + ptrs = bch2_bkey_ptrs(bkey_i_to_s(new)); + bkey_for_each_ptr(ptrs, ptr) { + struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); + struct bucket *g = PTR_GC_BUCKET(ca, ptr); + + ptr->gen = g->gen; + } + } else { + bch2_bkey_drop_ptrs(bkey_i_to_s(new), ptr, ({ + struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); + struct bucket *g = PTR_GC_BUCKET(ca, ptr); + enum bch_data_type data_type = bch2_bkey_ptr_data_type(*k, ptr); + + (ptr->cached && + (!g->gen_valid || gen_cmp(ptr->gen, g->gen) > 0)) || + (!ptr->cached && + gen_cmp(ptr->gen, g->gen) < 0) || + gen_cmp(g->gen, ptr->gen) > BUCKET_GC_GEN_MAX || + (g->data_type && + g->data_type != data_type); + })); +again: + ptrs = bch2_bkey_ptrs(bkey_i_to_s(new)); + bkey_extent_entry_for_each(ptrs, entry) { + if (extent_entry_type(entry) == BCH_EXTENT_ENTRY_stripe_ptr) { + struct gc_stripe *m = genradix_ptr(&c->gc_stripes, + entry->stripe_ptr.idx); + union bch_extent_entry *next_ptr; + + bkey_extent_entry_for_each_from(ptrs, next_ptr, entry) + if (extent_entry_type(next_ptr) == BCH_EXTENT_ENTRY_ptr) + goto found; + next_ptr = NULL; +found: + if (!next_ptr) { + bch_err(c, "aieee, found stripe ptr with no data ptr"); + continue; + } + + if (!m || !m->alive || + !__bch2_ptr_matches_stripe(&m->ptrs[entry->stripe_ptr.block], + &next_ptr->ptr, + m->sectors)) { + bch2_bkey_extent_entry_drop(new, entry); + goto again; + } + } + } + } + + ret = bch2_journal_key_insert_take(c, btree_id, level, new); + if (ret) { + kfree(new); + goto err; + } + + if (level) + bch2_btree_node_update_key_early(trans, btree_id, level - 1, *k, new); + + if (0) { + printbuf_reset(&buf); + bch2_bkey_val_to_text(&buf, c, *k); + bch_info(c, "updated %s", buf.buf); + + printbuf_reset(&buf); + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(new)); + bch_info(c, "new key %s", buf.buf); + } + + *k = bkey_i_to_s_c(new); + } +err: +fsck_err: + printbuf_exit(&buf); + return ret; +} + +/* marking of btree keys/nodes: */ + +static int bch2_gc_mark_key(struct btree_trans *trans, enum btree_id btree_id, + unsigned level, bool is_root, + struct bkey_s_c *k, + bool initial) +{ + struct bch_fs *c = trans->c; + struct bkey deleted = KEY(0, 0, 0); + struct bkey_s_c old = (struct bkey_s_c) { &deleted, NULL }; + unsigned flags = + BTREE_TRIGGER_GC| + (initial ? BTREE_TRIGGER_NOATOMIC : 0); + int ret = 0; + + deleted.p = k->k->p; + + if (initial) { + BUG_ON(bch2_journal_seq_verify && + k->k->version.lo > atomic64_read(&c->journal.seq)); + + ret = bch2_check_fix_ptrs(trans, btree_id, level, is_root, k); + if (ret) + goto err; + + if (fsck_err_on(k->k->version.lo > atomic64_read(&c->key_version), c, + bkey_version_in_future, + "key version number higher than recorded: %llu > %llu", + k->k->version.lo, + atomic64_read(&c->key_version))) + atomic64_set(&c->key_version, k->k->version.lo); + } + + ret = commit_do(trans, NULL, NULL, 0, + bch2_mark_key(trans, btree_id, level, old, *k, flags)); +fsck_err: +err: + if (ret) + bch_err_fn(c, ret); + return ret; +} + +static int btree_gc_mark_node(struct btree_trans *trans, struct btree *b, bool initial) +{ + struct bch_fs *c = trans->c; + struct btree_node_iter iter; + struct bkey unpacked; + struct bkey_s_c k; + struct bkey_buf prev, cur; + int ret = 0; + + if (!btree_node_type_needs_gc(btree_node_type(b))) + return 0; + + bch2_btree_node_iter_init_from_start(&iter, b); + bch2_bkey_buf_init(&prev); + bch2_bkey_buf_init(&cur); + bkey_init(&prev.k->k); + + while ((k = bch2_btree_node_iter_peek_unpack(&iter, b, &unpacked)).k) { + ret = bch2_gc_mark_key(trans, b->c.btree_id, b->c.level, false, + &k, initial); + if (ret) + break; + + bch2_btree_node_iter_advance(&iter, b); + + if (b->c.level) { + bch2_bkey_buf_reassemble(&cur, c, k); + + ret = bch2_gc_check_topology(c, b, &prev, cur, + bch2_btree_node_iter_end(&iter)); + if (ret) + break; + } + } + + bch2_bkey_buf_exit(&cur, c); + bch2_bkey_buf_exit(&prev, c); + return ret; +} + +static int bch2_gc_btree(struct btree_trans *trans, enum btree_id btree_id, + bool initial, bool metadata_only) +{ + struct bch_fs *c = trans->c; + struct btree_iter iter; + struct btree *b; + unsigned depth = metadata_only ? 1 : 0; + int ret = 0; + + gc_pos_set(c, gc_pos_btree(btree_id, POS_MIN, 0)); + + __for_each_btree_node(trans, iter, btree_id, POS_MIN, + 0, depth, BTREE_ITER_PREFETCH, b, ret) { + bch2_verify_btree_nr_keys(b); + + gc_pos_set(c, gc_pos_btree_node(b)); + + ret = btree_gc_mark_node(trans, b, initial); + if (ret) + break; + } + bch2_trans_iter_exit(trans, &iter); + + if (ret) + return ret; + + mutex_lock(&c->btree_root_lock); + b = bch2_btree_id_root(c, btree_id)->b; + if (!btree_node_fake(b)) { + struct bkey_s_c k = bkey_i_to_s_c(&b->key); + + ret = bch2_gc_mark_key(trans, b->c.btree_id, b->c.level + 1, + true, &k, initial); + } + gc_pos_set(c, gc_pos_btree_root(b->c.btree_id)); + mutex_unlock(&c->btree_root_lock); + + return ret; +} + +static int bch2_gc_btree_init_recurse(struct btree_trans *trans, struct btree *b, + unsigned target_depth) +{ + struct bch_fs *c = trans->c; + struct btree_and_journal_iter iter; + struct bkey_s_c k; + struct bkey_buf cur, prev; + struct printbuf buf = PRINTBUF; + int ret = 0; + + bch2_btree_and_journal_iter_init_node_iter(&iter, c, b); + bch2_bkey_buf_init(&prev); + bch2_bkey_buf_init(&cur); + bkey_init(&prev.k->k); + + while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) { + BUG_ON(bpos_lt(k.k->p, b->data->min_key)); + BUG_ON(bpos_gt(k.k->p, b->data->max_key)); + + ret = bch2_gc_mark_key(trans, b->c.btree_id, b->c.level, + false, &k, true); + if (ret) + goto fsck_err; + + if (b->c.level) { + bch2_bkey_buf_reassemble(&cur, c, k); + k = bkey_i_to_s_c(cur.k); + + bch2_btree_and_journal_iter_advance(&iter); + + ret = bch2_gc_check_topology(c, b, + &prev, cur, + !bch2_btree_and_journal_iter_peek(&iter).k); + if (ret) + goto fsck_err; + } else { + bch2_btree_and_journal_iter_advance(&iter); + } + } + + if (b->c.level > target_depth) { + bch2_btree_and_journal_iter_exit(&iter); + bch2_btree_and_journal_iter_init_node_iter(&iter, c, b); + + while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) { + struct btree *child; + + bch2_bkey_buf_reassemble(&cur, c, k); + bch2_btree_and_journal_iter_advance(&iter); + + child = bch2_btree_node_get_noiter(trans, cur.k, + b->c.btree_id, b->c.level - 1, + false); + ret = PTR_ERR_OR_ZERO(child); + + if (ret == -EIO) { + bch2_topology_error(c); + + if (__fsck_err(c, + FSCK_CAN_FIX| + FSCK_CAN_IGNORE| + FSCK_NO_RATELIMIT, + btree_node_read_error, + "Unreadable btree node at btree %s level %u:\n" + " %s", + bch2_btree_id_str(b->c.btree_id), + b->c.level - 1, + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(cur.k)), buf.buf)) && + should_restart_for_topology_repair(c)) { + bch_info(c, "Halting mark and sweep to start topology repair pass"); + ret = bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_check_topology); + goto fsck_err; + } else { + /* Continue marking when opted to not + * fix the error: */ + ret = 0; + set_bit(BCH_FS_INITIAL_GC_UNFIXED, &c->flags); + continue; + } + } else if (ret) { + bch_err_msg(c, ret, "getting btree node"); + break; + } + + ret = bch2_gc_btree_init_recurse(trans, child, + target_depth); + six_unlock_read(&child->c.lock); + + if (ret) + break; + } + } +fsck_err: + bch2_bkey_buf_exit(&cur, c); + bch2_bkey_buf_exit(&prev, c); + bch2_btree_and_journal_iter_exit(&iter); + printbuf_exit(&buf); + return ret; +} + +static int bch2_gc_btree_init(struct btree_trans *trans, + enum btree_id btree_id, + bool metadata_only) +{ + struct bch_fs *c = trans->c; + struct btree *b; + unsigned target_depth = metadata_only ? 1 : 0; + struct printbuf buf = PRINTBUF; + int ret = 0; + + b = bch2_btree_id_root(c, btree_id)->b; + + if (btree_node_fake(b)) + return 0; + + six_lock_read(&b->c.lock, NULL, NULL); + printbuf_reset(&buf); + bch2_bpos_to_text(&buf, b->data->min_key); + if (mustfix_fsck_err_on(!bpos_eq(b->data->min_key, POS_MIN), c, + btree_root_bad_min_key, + "btree root with incorrect min_key: %s", buf.buf)) { + bch_err(c, "repair unimplemented"); + ret = -BCH_ERR_fsck_repair_unimplemented; + goto fsck_err; + } + + printbuf_reset(&buf); + bch2_bpos_to_text(&buf, b->data->max_key); + if (mustfix_fsck_err_on(!bpos_eq(b->data->max_key, SPOS_MAX), c, + btree_root_bad_max_key, + "btree root with incorrect max_key: %s", buf.buf)) { + bch_err(c, "repair unimplemented"); + ret = -BCH_ERR_fsck_repair_unimplemented; + goto fsck_err; + } + + if (b->c.level >= target_depth) + ret = bch2_gc_btree_init_recurse(trans, b, target_depth); + + if (!ret) { + struct bkey_s_c k = bkey_i_to_s_c(&b->key); + + ret = bch2_gc_mark_key(trans, b->c.btree_id, b->c.level + 1, true, + &k, true); + } +fsck_err: + six_unlock_read(&b->c.lock); + + if (ret < 0) + bch_err_fn(c, ret); + printbuf_exit(&buf); + return ret; +} + +static inline int btree_id_gc_phase_cmp(enum btree_id l, enum btree_id r) +{ + return (int) btree_id_to_gc_phase(l) - + (int) btree_id_to_gc_phase(r); +} + +static int bch2_gc_btrees(struct bch_fs *c, bool initial, bool metadata_only) +{ + struct btree_trans *trans = bch2_trans_get(c); + enum btree_id ids[BTREE_ID_NR]; + unsigned i; + int ret = 0; + + for (i = 0; i < BTREE_ID_NR; i++) + ids[i] = i; + bubble_sort(ids, BTREE_ID_NR, btree_id_gc_phase_cmp); + + for (i = 0; i < BTREE_ID_NR && !ret; i++) + ret = initial + ? bch2_gc_btree_init(trans, ids[i], metadata_only) + : bch2_gc_btree(trans, ids[i], initial, metadata_only); + + for (i = BTREE_ID_NR; i < btree_id_nr_alive(c) && !ret; i++) { + if (!bch2_btree_id_root(c, i)->alive) + continue; + + ret = initial + ? bch2_gc_btree_init(trans, i, metadata_only) + : bch2_gc_btree(trans, i, initial, metadata_only); + } + + if (ret < 0) + bch_err_fn(c, ret); + + bch2_trans_put(trans); + return ret; +} + +static void mark_metadata_sectors(struct bch_fs *c, struct bch_dev *ca, + u64 start, u64 end, + enum bch_data_type type, + unsigned flags) +{ + u64 b = sector_to_bucket(ca, start); + + do { + unsigned sectors = + min_t(u64, bucket_to_sector(ca, b + 1), end) - start; + + bch2_mark_metadata_bucket(c, ca, b, type, sectors, + gc_phase(GC_PHASE_SB), flags); + b++; + start += sectors; + } while (start < end); +} + +static void bch2_mark_dev_superblock(struct bch_fs *c, struct bch_dev *ca, + unsigned flags) +{ + struct bch_sb_layout *layout = &ca->disk_sb.sb->layout; + unsigned i; + u64 b; + + for (i = 0; i < layout->nr_superblocks; i++) { + u64 offset = le64_to_cpu(layout->sb_offset[i]); + + if (offset == BCH_SB_SECTOR) + mark_metadata_sectors(c, ca, 0, BCH_SB_SECTOR, + BCH_DATA_sb, flags); + + mark_metadata_sectors(c, ca, offset, + offset + (1 << layout->sb_max_size_bits), + BCH_DATA_sb, flags); + } + + for (i = 0; i < ca->journal.nr; i++) { + b = ca->journal.buckets[i]; + bch2_mark_metadata_bucket(c, ca, b, BCH_DATA_journal, + ca->mi.bucket_size, + gc_phase(GC_PHASE_SB), flags); + } +} + +static void bch2_mark_superblocks(struct bch_fs *c) +{ + struct bch_dev *ca; + unsigned i; + + mutex_lock(&c->sb_lock); + gc_pos_set(c, gc_phase(GC_PHASE_SB)); + + for_each_online_member(ca, c, i) + bch2_mark_dev_superblock(c, ca, BTREE_TRIGGER_GC); + mutex_unlock(&c->sb_lock); +} + +#if 0 +/* Also see bch2_pending_btree_node_free_insert_done() */ +static void bch2_mark_pending_btree_node_frees(struct bch_fs *c) +{ + struct btree_update *as; + struct pending_btree_node_free *d; + + mutex_lock(&c->btree_interior_update_lock); + gc_pos_set(c, gc_phase(GC_PHASE_PENDING_DELETE)); + + for_each_pending_btree_node_free(c, as, d) + if (d->index_update_done) + bch2_mark_key(c, bkey_i_to_s_c(&d->key), BTREE_TRIGGER_GC); + + mutex_unlock(&c->btree_interior_update_lock); +} +#endif + +static void bch2_gc_free(struct bch_fs *c) +{ + struct bch_dev *ca; + unsigned i; + + genradix_free(&c->reflink_gc_table); + genradix_free(&c->gc_stripes); + + for_each_member_device(ca, c, i) { + kvpfree(rcu_dereference_protected(ca->buckets_gc, 1), + sizeof(struct bucket_array) + + ca->mi.nbuckets * sizeof(struct bucket)); + ca->buckets_gc = NULL; + + free_percpu(ca->usage_gc); + ca->usage_gc = NULL; + } + + free_percpu(c->usage_gc); + c->usage_gc = NULL; +} + +static int bch2_gc_done(struct bch_fs *c, + bool initial, bool metadata_only) +{ + struct bch_dev *ca = NULL; + struct printbuf buf = PRINTBUF; + bool verify = !metadata_only && + !c->opts.reconstruct_alloc && + (!initial || (c->sb.compat & (1ULL << BCH_COMPAT_alloc_info))); + unsigned i, dev; + int ret = 0; + + percpu_down_write(&c->mark_lock); + +#define copy_field(_err, _f, _msg, ...) \ + if (dst->_f != src->_f && \ + (!verify || \ + fsck_err(c, _err, _msg ": got %llu, should be %llu" \ + , ##__VA_ARGS__, dst->_f, src->_f))) \ + dst->_f = src->_f +#define copy_dev_field(_err, _f, _msg, ...) \ + copy_field(_err, _f, "dev %u has wrong " _msg, dev, ##__VA_ARGS__) +#define copy_fs_field(_err, _f, _msg, ...) \ + copy_field(_err, _f, "fs has wrong " _msg, ##__VA_ARGS__) + + for (i = 0; i < ARRAY_SIZE(c->usage); i++) + bch2_fs_usage_acc_to_base(c, i); + + for_each_member_device(ca, c, dev) { + struct bch_dev_usage *dst = ca->usage_base; + struct bch_dev_usage *src = (void *) + bch2_acc_percpu_u64s((u64 __percpu *) ca->usage_gc, + dev_usage_u64s()); + + for (i = 0; i < BCH_DATA_NR; i++) { + copy_dev_field(dev_usage_buckets_wrong, + d[i].buckets, "%s buckets", bch2_data_types[i]); + copy_dev_field(dev_usage_sectors_wrong, + d[i].sectors, "%s sectors", bch2_data_types[i]); + copy_dev_field(dev_usage_fragmented_wrong, + d[i].fragmented, "%s fragmented", bch2_data_types[i]); + } + + copy_dev_field(dev_usage_buckets_ec_wrong, + buckets_ec, "buckets_ec"); + } + + { + unsigned nr = fs_usage_u64s(c); + struct bch_fs_usage *dst = c->usage_base; + struct bch_fs_usage *src = (void *) + bch2_acc_percpu_u64s((u64 __percpu *) c->usage_gc, nr); + + copy_fs_field(fs_usage_hidden_wrong, + hidden, "hidden"); + copy_fs_field(fs_usage_btree_wrong, + btree, "btree"); + + if (!metadata_only) { + copy_fs_field(fs_usage_data_wrong, + data, "data"); + copy_fs_field(fs_usage_cached_wrong, + cached, "cached"); + copy_fs_field(fs_usage_reserved_wrong, + reserved, "reserved"); + copy_fs_field(fs_usage_nr_inodes_wrong, + nr_inodes,"nr_inodes"); + + for (i = 0; i < BCH_REPLICAS_MAX; i++) + copy_fs_field(fs_usage_persistent_reserved_wrong, + persistent_reserved[i], + "persistent_reserved[%i]", i); + } + + for (i = 0; i < c->replicas.nr; i++) { + struct bch_replicas_entry *e = + cpu_replicas_entry(&c->replicas, i); + + if (metadata_only && + (e->data_type == BCH_DATA_user || + e->data_type == BCH_DATA_cached)) + continue; + + printbuf_reset(&buf); + bch2_replicas_entry_to_text(&buf, e); + + copy_fs_field(fs_usage_replicas_wrong, + replicas[i], "%s", buf.buf); + } + } + +#undef copy_fs_field +#undef copy_dev_field +#undef copy_stripe_field +#undef copy_field +fsck_err: + if (ca) + percpu_ref_put(&ca->ref); + if (ret) + bch_err_fn(c, ret); + + percpu_up_write(&c->mark_lock); + printbuf_exit(&buf); + return ret; +} + +static int bch2_gc_start(struct bch_fs *c) +{ + struct bch_dev *ca = NULL; + unsigned i; + + BUG_ON(c->usage_gc); + + c->usage_gc = __alloc_percpu_gfp(fs_usage_u64s(c) * sizeof(u64), + sizeof(u64), GFP_KERNEL); + if (!c->usage_gc) { + bch_err(c, "error allocating c->usage_gc"); + return -BCH_ERR_ENOMEM_gc_start; + } + + for_each_member_device(ca, c, i) { + BUG_ON(ca->usage_gc); + + ca->usage_gc = alloc_percpu(struct bch_dev_usage); + if (!ca->usage_gc) { + bch_err(c, "error allocating ca->usage_gc"); + percpu_ref_put(&ca->ref); + return -BCH_ERR_ENOMEM_gc_start; + } + + this_cpu_write(ca->usage_gc->d[BCH_DATA_free].buckets, + ca->mi.nbuckets - ca->mi.first_bucket); + } + + return 0; +} + +static int bch2_gc_reset(struct bch_fs *c) +{ + struct bch_dev *ca; + unsigned i; + + for_each_member_device(ca, c, i) { + free_percpu(ca->usage_gc); + ca->usage_gc = NULL; + } + + free_percpu(c->usage_gc); + c->usage_gc = NULL; + + return bch2_gc_start(c); +} + +/* returns true if not equal */ +static inline bool bch2_alloc_v4_cmp(struct bch_alloc_v4 l, + struct bch_alloc_v4 r) +{ + return l.gen != r.gen || + l.oldest_gen != r.oldest_gen || + l.data_type != r.data_type || + l.dirty_sectors != r.dirty_sectors || + l.cached_sectors != r.cached_sectors || + l.stripe_redundancy != r.stripe_redundancy || + l.stripe != r.stripe; +} + +static int bch2_alloc_write_key(struct btree_trans *trans, + struct btree_iter *iter, + struct bkey_s_c k, + bool metadata_only) +{ + struct bch_fs *c = trans->c; + struct bch_dev *ca = bch_dev_bkey_exists(c, iter->pos.inode); + struct bucket gc, *b; + struct bkey_i_alloc_v4 *a; + struct bch_alloc_v4 old_convert, new; + const struct bch_alloc_v4 *old; + enum bch_data_type type; + int ret; + + if (bkey_ge(iter->pos, POS(ca->dev_idx, ca->mi.nbuckets))) + return 1; + + old = bch2_alloc_to_v4(k, &old_convert); + new = *old; + + percpu_down_read(&c->mark_lock); + b = gc_bucket(ca, iter->pos.offset); + + /* + * b->data_type doesn't yet include need_discard & need_gc_gen states - + * fix that here: + */ + type = __alloc_data_type(b->dirty_sectors, + b->cached_sectors, + b->stripe, + *old, + b->data_type); + if (b->data_type != type) { + struct bch_dev_usage *u; + + preempt_disable(); + u = this_cpu_ptr(ca->usage_gc); + u->d[b->data_type].buckets--; + b->data_type = type; + u->d[b->data_type].buckets++; + preempt_enable(); + } + + gc = *b; + percpu_up_read(&c->mark_lock); + + if (metadata_only && + gc.data_type != BCH_DATA_sb && + gc.data_type != BCH_DATA_journal && + gc.data_type != BCH_DATA_btree) + return 0; + + if (gen_after(old->gen, gc.gen)) + return 0; + + if (c->opts.reconstruct_alloc || + fsck_err_on(new.data_type != gc.data_type, c, + alloc_key_data_type_wrong, + "bucket %llu:%llu gen %u has wrong data_type" + ": got %s, should be %s", + iter->pos.inode, iter->pos.offset, + gc.gen, + bch2_data_types[new.data_type], + bch2_data_types[gc.data_type])) + new.data_type = gc.data_type; + +#define copy_bucket_field(_errtype, _f) \ + if (c->opts.reconstruct_alloc || \ + fsck_err_on(new._f != gc._f, c, _errtype, \ + "bucket %llu:%llu gen %u data type %s has wrong " #_f \ + ": got %u, should be %u", \ + iter->pos.inode, iter->pos.offset, \ + gc.gen, \ + bch2_data_types[gc.data_type], \ + new._f, gc._f)) \ + new._f = gc._f; \ + + copy_bucket_field(alloc_key_gen_wrong, + gen); + copy_bucket_field(alloc_key_dirty_sectors_wrong, + dirty_sectors); + copy_bucket_field(alloc_key_cached_sectors_wrong, + cached_sectors); + copy_bucket_field(alloc_key_stripe_wrong, + stripe); + copy_bucket_field(alloc_key_stripe_redundancy_wrong, + stripe_redundancy); +#undef copy_bucket_field + + if (!bch2_alloc_v4_cmp(*old, new)) + return 0; + + a = bch2_alloc_to_v4_mut(trans, k); + ret = PTR_ERR_OR_ZERO(a); + if (ret) + return ret; + + a->v = new; + + /* + * The trigger normally makes sure this is set, but we're not running + * triggers: + */ + if (a->v.data_type == BCH_DATA_cached && !a->v.io_time[READ]) + a->v.io_time[READ] = max_t(u64, 1, atomic64_read(&c->io_clock[READ].now)); + + ret = bch2_trans_update(trans, iter, &a->k_i, BTREE_TRIGGER_NORUN); +fsck_err: + return ret; +} + +static int bch2_gc_alloc_done(struct bch_fs *c, bool metadata_only) +{ + struct btree_trans *trans = bch2_trans_get(c); + struct btree_iter iter; + struct bkey_s_c k; + struct bch_dev *ca; + unsigned i; + int ret = 0; + + for_each_member_device(ca, c, i) { + ret = for_each_btree_key_commit(trans, iter, BTREE_ID_alloc, + POS(ca->dev_idx, ca->mi.first_bucket), + BTREE_ITER_SLOTS|BTREE_ITER_PREFETCH, k, + NULL, NULL, BTREE_INSERT_LAZY_RW, + bch2_alloc_write_key(trans, &iter, k, metadata_only)); + + if (ret < 0) { + bch_err_fn(c, ret); + percpu_ref_put(&ca->ref); + break; + } + } + + bch2_trans_put(trans); + return ret < 0 ? ret : 0; +} + +static int bch2_gc_alloc_start(struct bch_fs *c, bool metadata_only) +{ + struct bch_dev *ca; + struct btree_trans *trans = bch2_trans_get(c); + struct btree_iter iter; + struct bkey_s_c k; + struct bucket *g; + struct bch_alloc_v4 a_convert; + const struct bch_alloc_v4 *a; + unsigned i; + int ret; + + for_each_member_device(ca, c, i) { + struct bucket_array *buckets = kvpmalloc(sizeof(struct bucket_array) + + ca->mi.nbuckets * sizeof(struct bucket), + GFP_KERNEL|__GFP_ZERO); + if (!buckets) { + percpu_ref_put(&ca->ref); + bch_err(c, "error allocating ca->buckets[gc]"); + ret = -BCH_ERR_ENOMEM_gc_alloc_start; + goto err; + } + + buckets->first_bucket = ca->mi.first_bucket; + buckets->nbuckets = ca->mi.nbuckets; + rcu_assign_pointer(ca->buckets_gc, buckets); + } + + ret = for_each_btree_key2(trans, iter, BTREE_ID_alloc, POS_MIN, + BTREE_ITER_PREFETCH, k, ({ + ca = bch_dev_bkey_exists(c, k.k->p.inode); + g = gc_bucket(ca, k.k->p.offset); + + a = bch2_alloc_to_v4(k, &a_convert); + + g->gen_valid = 1; + g->gen = a->gen; + + if (metadata_only && + (a->data_type == BCH_DATA_user || + a->data_type == BCH_DATA_cached || + a->data_type == BCH_DATA_parity)) { + g->data_type = a->data_type; + g->dirty_sectors = a->dirty_sectors; + g->cached_sectors = a->cached_sectors; + g->stripe = a->stripe; + g->stripe_redundancy = a->stripe_redundancy; + } + + 0; + })); +err: + bch2_trans_put(trans); + if (ret) + bch_err_fn(c, ret); + return ret; +} + +static void bch2_gc_alloc_reset(struct bch_fs *c, bool metadata_only) +{ + struct bch_dev *ca; + unsigned i; + + for_each_member_device(ca, c, i) { + struct bucket_array *buckets = gc_bucket_array(ca); + struct bucket *g; + + for_each_bucket(g, buckets) { + if (metadata_only && + (g->data_type == BCH_DATA_user || + g->data_type == BCH_DATA_cached || + g->data_type == BCH_DATA_parity)) + continue; + g->data_type = 0; + g->dirty_sectors = 0; + g->cached_sectors = 0; + } + } +} + +static int bch2_gc_write_reflink_key(struct btree_trans *trans, + struct btree_iter *iter, + struct bkey_s_c k, + size_t *idx) +{ + struct bch_fs *c = trans->c; + const __le64 *refcount = bkey_refcount_c(k); + struct printbuf buf = PRINTBUF; + struct reflink_gc *r; + int ret = 0; + + if (!refcount) + return 0; + + while ((r = genradix_ptr(&c->reflink_gc_table, *idx)) && + r->offset < k.k->p.offset) + ++*idx; + + if (!r || + r->offset != k.k->p.offset || + r->size != k.k->size) { + bch_err(c, "unexpected inconsistency walking reflink table at gc finish"); + return -EINVAL; + } + + if (fsck_err_on(r->refcount != le64_to_cpu(*refcount), c, + reflink_v_refcount_wrong, + "reflink key has wrong refcount:\n" + " %s\n" + " should be %u", + (bch2_bkey_val_to_text(&buf, c, k), buf.buf), + r->refcount)) { + struct bkey_i *new = bch2_bkey_make_mut(trans, iter, &k, 0); + + ret = PTR_ERR_OR_ZERO(new); + if (ret) + return ret; + + if (!r->refcount) + new->k.type = KEY_TYPE_deleted; + else + *bkey_refcount(new) = cpu_to_le64(r->refcount); + } +fsck_err: + printbuf_exit(&buf); + return ret; +} + +static int bch2_gc_reflink_done(struct bch_fs *c, bool metadata_only) +{ + struct btree_trans *trans; + struct btree_iter iter; + struct bkey_s_c k; + size_t idx = 0; + int ret = 0; + + if (metadata_only) + return 0; + + trans = bch2_trans_get(c); + + ret = for_each_btree_key_commit(trans, iter, + BTREE_ID_reflink, POS_MIN, + BTREE_ITER_PREFETCH, k, + NULL, NULL, BTREE_INSERT_NOFAIL, + bch2_gc_write_reflink_key(trans, &iter, k, &idx)); + + c->reflink_gc_nr = 0; + bch2_trans_put(trans); + return ret; +} + +static int bch2_gc_reflink_start(struct bch_fs *c, + bool metadata_only) +{ + struct btree_trans *trans; + struct btree_iter iter; + struct bkey_s_c k; + struct reflink_gc *r; + int ret = 0; + + if (metadata_only) + return 0; + + trans = bch2_trans_get(c); + c->reflink_gc_nr = 0; + + for_each_btree_key(trans, iter, BTREE_ID_reflink, POS_MIN, + BTREE_ITER_PREFETCH, k, ret) { + const __le64 *refcount = bkey_refcount_c(k); + + if (!refcount) + continue; + + r = genradix_ptr_alloc(&c->reflink_gc_table, c->reflink_gc_nr++, + GFP_KERNEL); + if (!r) { + ret = -BCH_ERR_ENOMEM_gc_reflink_start; + break; + } + + r->offset = k.k->p.offset; + r->size = k.k->size; + r->refcount = 0; + } + bch2_trans_iter_exit(trans, &iter); + + bch2_trans_put(trans); + return ret; +} + +static void bch2_gc_reflink_reset(struct bch_fs *c, bool metadata_only) +{ + struct genradix_iter iter; + struct reflink_gc *r; + + genradix_for_each(&c->reflink_gc_table, iter, r) + r->refcount = 0; +} + +static int bch2_gc_write_stripes_key(struct btree_trans *trans, + struct btree_iter *iter, + struct bkey_s_c k) +{ + struct bch_fs *c = trans->c; + struct printbuf buf = PRINTBUF; + const struct bch_stripe *s; + struct gc_stripe *m; + bool bad = false; + unsigned i; + int ret = 0; + + if (k.k->type != KEY_TYPE_stripe) + return 0; + + s = bkey_s_c_to_stripe(k).v; + m = genradix_ptr(&c->gc_stripes, k.k->p.offset); + + for (i = 0; i < s->nr_blocks; i++) { + u32 old = stripe_blockcount_get(s, i); + u32 new = (m ? m->block_sectors[i] : 0); + + if (old != new) { + prt_printf(&buf, "stripe block %u has wrong sector count: got %u, should be %u\n", + i, old, new); + bad = true; + } + } + + if (bad) + bch2_bkey_val_to_text(&buf, c, k); + + if (fsck_err_on(bad, c, stripe_sector_count_wrong, + "%s", buf.buf)) { + struct bkey_i_stripe *new; + + new = bch2_trans_kmalloc(trans, bkey_bytes(k.k)); + ret = PTR_ERR_OR_ZERO(new); + if (ret) + return ret; + + bkey_reassemble(&new->k_i, k); + + for (i = 0; i < new->v.nr_blocks; i++) + stripe_blockcount_set(&new->v, i, m ? m->block_sectors[i] : 0); + + ret = bch2_trans_update(trans, iter, &new->k_i, 0); + } +fsck_err: + printbuf_exit(&buf); + return ret; +} + +static int bch2_gc_stripes_done(struct bch_fs *c, bool metadata_only) +{ + struct btree_trans *trans; + struct btree_iter iter; + struct bkey_s_c k; + int ret = 0; + + if (metadata_only) + return 0; + + trans = bch2_trans_get(c); + + ret = for_each_btree_key_commit(trans, iter, + BTREE_ID_stripes, POS_MIN, + BTREE_ITER_PREFETCH, k, + NULL, NULL, BTREE_INSERT_NOFAIL, + bch2_gc_write_stripes_key(trans, &iter, k)); + + bch2_trans_put(trans); + return ret; +} + +static void bch2_gc_stripes_reset(struct bch_fs *c, bool metadata_only) +{ + genradix_free(&c->gc_stripes); +} + +/** + * bch2_gc - walk _all_ references to buckets, and recompute them: + * + * @c: filesystem object + * @initial: are we in recovery? + * @metadata_only: are we just checking metadata references, or everything? + * + * Returns: 0 on success, or standard errcode on failure + * + * Order matters here: + * - Concurrent GC relies on the fact that we have a total ordering for + * everything that GC walks - see gc_will_visit_node(), + * gc_will_visit_root() + * + * - also, references move around in the course of index updates and + * various other crap: everything needs to agree on the ordering + * references are allowed to move around in - e.g., we're allowed to + * start with a reference owned by an open_bucket (the allocator) and + * move it to the btree, but not the reverse. + * + * This is necessary to ensure that gc doesn't miss references that + * move around - if references move backwards in the ordering GC + * uses, GC could skip past them + */ +int bch2_gc(struct bch_fs *c, bool initial, bool metadata_only) +{ + unsigned iter = 0; + int ret; + + lockdep_assert_held(&c->state_lock); + + down_write(&c->gc_lock); + + bch2_btree_interior_updates_flush(c); + + ret = bch2_gc_start(c) ?: + bch2_gc_alloc_start(c, metadata_only) ?: + bch2_gc_reflink_start(c, metadata_only); + if (ret) + goto out; +again: + gc_pos_set(c, gc_phase(GC_PHASE_START)); + + bch2_mark_superblocks(c); + + ret = bch2_gc_btrees(c, initial, metadata_only); + + if (ret) + goto out; + +#if 0 + bch2_mark_pending_btree_node_frees(c); +#endif + c->gc_count++; + + if (test_bit(BCH_FS_NEED_ANOTHER_GC, &c->flags) || + (!iter && bch2_test_restart_gc)) { + if (iter++ > 2) { + bch_info(c, "Unable to fix bucket gens, looping"); + ret = -EINVAL; + goto out; + } + + /* + * XXX: make sure gens we fixed got saved + */ + bch_info(c, "Second GC pass needed, restarting:"); + clear_bit(BCH_FS_NEED_ANOTHER_GC, &c->flags); + __gc_pos_set(c, gc_phase(GC_PHASE_NOT_RUNNING)); + + bch2_gc_stripes_reset(c, metadata_only); + bch2_gc_alloc_reset(c, metadata_only); + bch2_gc_reflink_reset(c, metadata_only); + ret = bch2_gc_reset(c); + if (ret) + goto out; + + /* flush fsck errors, reset counters */ + bch2_flush_fsck_errs(c); + goto again; + } +out: + if (!ret) { + bch2_journal_block(&c->journal); + + ret = bch2_gc_stripes_done(c, metadata_only) ?: + bch2_gc_reflink_done(c, metadata_only) ?: + bch2_gc_alloc_done(c, metadata_only) ?: + bch2_gc_done(c, initial, metadata_only); + + bch2_journal_unblock(&c->journal); + } + + percpu_down_write(&c->mark_lock); + /* Indicates that gc is no longer in progress: */ + __gc_pos_set(c, gc_phase(GC_PHASE_NOT_RUNNING)); + + bch2_gc_free(c); + percpu_up_write(&c->mark_lock); + + up_write(&c->gc_lock); + + /* + * At startup, allocations can happen directly instead of via the + * allocator thread - issue wakeup in case they blocked on gc_lock: + */ + closure_wake_up(&c->freelist_wait); + + if (ret) + bch_err_fn(c, ret); + return ret; +} + +static int gc_btree_gens_key(struct btree_trans *trans, + struct btree_iter *iter, + struct bkey_s_c k) +{ + struct bch_fs *c = trans->c; + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + const struct bch_extent_ptr *ptr; + struct bkey_i *u; + int ret; + + percpu_down_read(&c->mark_lock); + bkey_for_each_ptr(ptrs, ptr) { + struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); + + if (ptr_stale(ca, ptr) > 16) { + percpu_up_read(&c->mark_lock); + goto update; + } + } + + bkey_for_each_ptr(ptrs, ptr) { + struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); + u8 *gen = &ca->oldest_gen[PTR_BUCKET_NR(ca, ptr)]; + + if (gen_after(*gen, ptr->gen)) + *gen = ptr->gen; + } + percpu_up_read(&c->mark_lock); + return 0; +update: + u = bch2_bkey_make_mut(trans, iter, &k, 0); + ret = PTR_ERR_OR_ZERO(u); + if (ret) + return ret; + + bch2_extent_normalize(c, bkey_i_to_s(u)); + return 0; +} + +static int bch2_alloc_write_oldest_gen(struct btree_trans *trans, struct btree_iter *iter, + struct bkey_s_c k) +{ + struct bch_dev *ca = bch_dev_bkey_exists(trans->c, iter->pos.inode); + struct bch_alloc_v4 a_convert; + const struct bch_alloc_v4 *a = bch2_alloc_to_v4(k, &a_convert); + struct bkey_i_alloc_v4 *a_mut; + int ret; + + if (a->oldest_gen == ca->oldest_gen[iter->pos.offset]) + return 0; + + a_mut = bch2_alloc_to_v4_mut(trans, k); + ret = PTR_ERR_OR_ZERO(a_mut); + if (ret) + return ret; + + a_mut->v.oldest_gen = ca->oldest_gen[iter->pos.offset]; + a_mut->v.data_type = alloc_data_type(a_mut->v, a_mut->v.data_type); + + return bch2_trans_update(trans, iter, &a_mut->k_i, 0); +} + +int bch2_gc_gens(struct bch_fs *c) +{ + struct btree_trans *trans; + struct btree_iter iter; + struct bkey_s_c k; + struct bch_dev *ca; + u64 b, start_time = local_clock(); + unsigned i; + int ret; + + /* + * Ideally we would be using state_lock and not gc_lock here, but that + * introduces a deadlock in the RO path - we currently take the state + * lock at the start of going RO, thus the gc thread may get stuck: + */ + if (!mutex_trylock(&c->gc_gens_lock)) + return 0; + + trace_and_count(c, gc_gens_start, c); + down_read(&c->gc_lock); + trans = bch2_trans_get(c); + + for_each_member_device(ca, c, i) { + struct bucket_gens *gens = bucket_gens(ca); + + BUG_ON(ca->oldest_gen); + + ca->oldest_gen = kvmalloc(gens->nbuckets, GFP_KERNEL); + if (!ca->oldest_gen) { + percpu_ref_put(&ca->ref); + ret = -BCH_ERR_ENOMEM_gc_gens; + goto err; + } + + for (b = gens->first_bucket; + b < gens->nbuckets; b++) + ca->oldest_gen[b] = gens->b[b]; + } + + for (i = 0; i < BTREE_ID_NR; i++) + if (btree_type_has_ptrs(i)) { + c->gc_gens_btree = i; + c->gc_gens_pos = POS_MIN; + + ret = for_each_btree_key_commit(trans, iter, i, + POS_MIN, + BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, + k, + NULL, NULL, + BTREE_INSERT_NOFAIL, + gc_btree_gens_key(trans, &iter, k)); + if (ret && !bch2_err_matches(ret, EROFS)) + bch_err_fn(c, ret); + if (ret) + goto err; + } + + ret = for_each_btree_key_commit(trans, iter, BTREE_ID_alloc, + POS_MIN, + BTREE_ITER_PREFETCH, + k, + NULL, NULL, + BTREE_INSERT_NOFAIL, + bch2_alloc_write_oldest_gen(trans, &iter, k)); + if (ret && !bch2_err_matches(ret, EROFS)) + bch_err_fn(c, ret); + if (ret) + goto err; + + c->gc_gens_btree = 0; + c->gc_gens_pos = POS_MIN; + + c->gc_count++; + + bch2_time_stats_update(&c->times[BCH_TIME_btree_gc], start_time); + trace_and_count(c, gc_gens_end, c); +err: + for_each_member_device(ca, c, i) { + kvfree(ca->oldest_gen); + ca->oldest_gen = NULL; + } + + bch2_trans_put(trans); + up_read(&c->gc_lock); + mutex_unlock(&c->gc_gens_lock); + return ret; +} + +static int bch2_gc_thread(void *arg) +{ + struct bch_fs *c = arg; + struct io_clock *clock = &c->io_clock[WRITE]; + unsigned long last = atomic64_read(&clock->now); + unsigned last_kick = atomic_read(&c->kick_gc); + int ret; + + set_freezable(); + + while (1) { + while (1) { + set_current_state(TASK_INTERRUPTIBLE); + + if (kthread_should_stop()) { + __set_current_state(TASK_RUNNING); + return 0; + } + + if (atomic_read(&c->kick_gc) != last_kick) + break; + + if (c->btree_gc_periodic) { + unsigned long next = last + c->capacity / 16; + + if (atomic64_read(&clock->now) >= next) + break; + + bch2_io_clock_schedule_timeout(clock, next); + } else { + schedule(); + } + + try_to_freeze(); + } + __set_current_state(TASK_RUNNING); + + last = atomic64_read(&clock->now); + last_kick = atomic_read(&c->kick_gc); + + /* + * Full gc is currently incompatible with btree key cache: + */ +#if 0 + ret = bch2_gc(c, false, false); +#else + ret = bch2_gc_gens(c); +#endif + if (ret < 0) + bch_err_fn(c, ret); + + debug_check_no_locks_held(); + } + + return 0; +} + +void bch2_gc_thread_stop(struct bch_fs *c) +{ + struct task_struct *p; + + p = c->gc_thread; + c->gc_thread = NULL; + + if (p) { + kthread_stop(p); + put_task_struct(p); + } +} + +int bch2_gc_thread_start(struct bch_fs *c) +{ + struct task_struct *p; + + if (c->gc_thread) + return 0; + + p = kthread_create(bch2_gc_thread, c, "bch-gc/%s", c->name); + if (IS_ERR(p)) { + bch_err_fn(c, PTR_ERR(p)); + return PTR_ERR(p); + } + + get_task_struct(p); + c->gc_thread = p; + wake_up_process(p); + return 0; +} diff --git a/fs/bcachefs/btree_gc.h b/fs/bcachefs/btree_gc.h new file mode 100644 index 000000000000..607575f83a00 --- /dev/null +++ b/fs/bcachefs/btree_gc.h @@ -0,0 +1,114 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_BTREE_GC_H +#define _BCACHEFS_BTREE_GC_H + +#include "bkey.h" +#include "btree_types.h" + +int bch2_check_topology(struct bch_fs *); +int bch2_gc(struct bch_fs *, bool, bool); +int bch2_gc_gens(struct bch_fs *); +void bch2_gc_thread_stop(struct bch_fs *); +int bch2_gc_thread_start(struct bch_fs *); + +/* + * For concurrent mark and sweep (with other index updates), we define a total + * ordering of _all_ references GC walks: + * + * Note that some references will have the same GC position as others - e.g. + * everything within the same btree node; in those cases we're relying on + * whatever locking exists for where those references live, i.e. the write lock + * on a btree node. + * + * That locking is also required to ensure GC doesn't pass the updater in + * between the updater adding/removing the reference and updating the GC marks; + * without that, we would at best double count sometimes. + * + * That part is important - whenever calling bch2_mark_pointers(), a lock _must_ + * be held that prevents GC from passing the position the updater is at. + * + * (What about the start of gc, when we're clearing all the marks? GC clears the + * mark with the gc pos seqlock held, and bch_mark_bucket checks against the gc + * position inside its cmpxchg loop, so crap magically works). + */ + +/* Position of (the start of) a gc phase: */ +static inline struct gc_pos gc_phase(enum gc_phase phase) +{ + return (struct gc_pos) { + .phase = phase, + .pos = POS_MIN, + .level = 0, + }; +} + +static inline int gc_pos_cmp(struct gc_pos l, struct gc_pos r) +{ + return cmp_int(l.phase, r.phase) ?: + bpos_cmp(l.pos, r.pos) ?: + cmp_int(l.level, r.level); +} + +static inline enum gc_phase btree_id_to_gc_phase(enum btree_id id) +{ + switch (id) { +#define x(name, v, ...) case BTREE_ID_##name: return GC_PHASE_BTREE_##name; + BCH_BTREE_IDS() +#undef x + default: + BUG(); + } +} + +static inline struct gc_pos gc_pos_btree(enum btree_id id, + struct bpos pos, unsigned level) +{ + return (struct gc_pos) { + .phase = btree_id_to_gc_phase(id), + .pos = pos, + .level = level, + }; +} + +/* + * GC position of the pointers within a btree node: note, _not_ for &b->key + * itself, that lives in the parent node: + */ +static inline struct gc_pos gc_pos_btree_node(struct btree *b) +{ + return gc_pos_btree(b->c.btree_id, b->key.k.p, b->c.level); +} + +/* + * GC position of the pointer to a btree root: we don't use + * gc_pos_pointer_to_btree_node() here to avoid a potential race with + * btree_split() increasing the tree depth - the new root will have level > the + * old root and thus have a greater gc position than the old root, but that + * would be incorrect since once gc has marked the root it's not coming back. + */ +static inline struct gc_pos gc_pos_btree_root(enum btree_id id) +{ + return gc_pos_btree(id, SPOS_MAX, BTREE_MAX_DEPTH); +} + +static inline bool gc_visited(struct bch_fs *c, struct gc_pos pos) +{ + unsigned seq; + bool ret; + + do { + seq = read_seqcount_begin(&c->gc_pos_lock); + ret = gc_pos_cmp(pos, c->gc_pos) <= 0; + } while (read_seqcount_retry(&c->gc_pos_lock, seq)); + + return ret; +} + +static inline void bch2_do_gc_gens(struct bch_fs *c) +{ + atomic_inc(&c->kick_gc); + if (c->gc_thread) + wake_up_process(c->gc_thread); +} + +#endif /* _BCACHEFS_BTREE_GC_H */ diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c new file mode 100644 index 000000000000..5a720f0cd5a6 --- /dev/null +++ b/fs/bcachefs/btree_io.c @@ -0,0 +1,2297 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" +#include "bkey_methods.h" +#include "bkey_sort.h" +#include "btree_cache.h" +#include "btree_io.h" +#include "btree_iter.h" +#include "btree_locking.h" +#include "btree_update.h" +#include "btree_update_interior.h" +#include "buckets.h" +#include "checksum.h" +#include "debug.h" +#include "error.h" +#include "extents.h" +#include "io_write.h" +#include "journal_reclaim.h" +#include "journal_seq_blacklist.h" +#include "recovery.h" +#include "super-io.h" +#include "trace.h" + +#include <linux/sched/mm.h> + +void bch2_btree_node_io_unlock(struct btree *b) +{ + EBUG_ON(!btree_node_write_in_flight(b)); + + clear_btree_node_write_in_flight_inner(b); + clear_btree_node_write_in_flight(b); + wake_up_bit(&b->flags, BTREE_NODE_write_in_flight); +} + +void bch2_btree_node_io_lock(struct btree *b) +{ + bch2_assert_btree_nodes_not_locked(); + + wait_on_bit_lock_io(&b->flags, BTREE_NODE_write_in_flight, + TASK_UNINTERRUPTIBLE); +} + +void __bch2_btree_node_wait_on_read(struct btree *b) +{ + wait_on_bit_io(&b->flags, BTREE_NODE_read_in_flight, + TASK_UNINTERRUPTIBLE); +} + +void __bch2_btree_node_wait_on_write(struct btree *b) +{ + wait_on_bit_io(&b->flags, BTREE_NODE_write_in_flight, + TASK_UNINTERRUPTIBLE); +} + +void bch2_btree_node_wait_on_read(struct btree *b) +{ + bch2_assert_btree_nodes_not_locked(); + + wait_on_bit_io(&b->flags, BTREE_NODE_read_in_flight, + TASK_UNINTERRUPTIBLE); +} + +void bch2_btree_node_wait_on_write(struct btree *b) +{ + bch2_assert_btree_nodes_not_locked(); + + wait_on_bit_io(&b->flags, BTREE_NODE_write_in_flight, + TASK_UNINTERRUPTIBLE); +} + +static void verify_no_dups(struct btree *b, + struct bkey_packed *start, + struct bkey_packed *end) +{ +#ifdef CONFIG_BCACHEFS_DEBUG + struct bkey_packed *k, *p; + + if (start == end) + return; + + for (p = start, k = bkey_p_next(start); + k != end; + p = k, k = bkey_p_next(k)) { + struct bkey l = bkey_unpack_key(b, p); + struct bkey r = bkey_unpack_key(b, k); + + BUG_ON(bpos_ge(l.p, bkey_start_pos(&r))); + } +#endif +} + +static void set_needs_whiteout(struct bset *i, int v) +{ + struct bkey_packed *k; + + for (k = i->start; k != vstruct_last(i); k = bkey_p_next(k)) + k->needs_whiteout = v; +} + +static void btree_bounce_free(struct bch_fs *c, size_t size, + bool used_mempool, void *p) +{ + if (used_mempool) + mempool_free(p, &c->btree_bounce_pool); + else + vpfree(p, size); +} + +static void *btree_bounce_alloc(struct bch_fs *c, size_t size, + bool *used_mempool) +{ + unsigned flags = memalloc_nofs_save(); + void *p; + + BUG_ON(size > btree_bytes(c)); + + *used_mempool = false; + p = vpmalloc(size, __GFP_NOWARN|GFP_NOWAIT); + if (!p) { + *used_mempool = true; + p = mempool_alloc(&c->btree_bounce_pool, GFP_NOFS); + } + memalloc_nofs_restore(flags); + return p; +} + +static void sort_bkey_ptrs(const struct btree *bt, + struct bkey_packed **ptrs, unsigned nr) +{ + unsigned n = nr, a = nr / 2, b, c, d; + + if (!a) + return; + + /* Heap sort: see lib/sort.c: */ + while (1) { + if (a) + a--; + else if (--n) + swap(ptrs[0], ptrs[n]); + else + break; + + for (b = a; c = 2 * b + 1, (d = c + 1) < n;) + b = bch2_bkey_cmp_packed(bt, + ptrs[c], + ptrs[d]) >= 0 ? c : d; + if (d == n) + b = c; + + while (b != a && + bch2_bkey_cmp_packed(bt, + ptrs[a], + ptrs[b]) >= 0) + b = (b - 1) / 2; + c = b; + while (b != a) { + b = (b - 1) / 2; + swap(ptrs[b], ptrs[c]); + } + } +} + +static void bch2_sort_whiteouts(struct bch_fs *c, struct btree *b) +{ + struct bkey_packed *new_whiteouts, **ptrs, **ptrs_end, *k; + bool used_mempool = false; + size_t bytes = b->whiteout_u64s * sizeof(u64); + + if (!b->whiteout_u64s) + return; + + new_whiteouts = btree_bounce_alloc(c, bytes, &used_mempool); + + ptrs = ptrs_end = ((void *) new_whiteouts + bytes); + + for (k = unwritten_whiteouts_start(c, b); + k != unwritten_whiteouts_end(c, b); + k = bkey_p_next(k)) + *--ptrs = k; + + sort_bkey_ptrs(b, ptrs, ptrs_end - ptrs); + + k = new_whiteouts; + + while (ptrs != ptrs_end) { + bkey_p_copy(k, *ptrs); + k = bkey_p_next(k); + ptrs++; + } + + verify_no_dups(b, new_whiteouts, + (void *) ((u64 *) new_whiteouts + b->whiteout_u64s)); + + memcpy_u64s(unwritten_whiteouts_start(c, b), + new_whiteouts, b->whiteout_u64s); + + btree_bounce_free(c, bytes, used_mempool, new_whiteouts); +} + +static bool should_compact_bset(struct btree *b, struct bset_tree *t, + bool compacting, enum compact_mode mode) +{ + if (!bset_dead_u64s(b, t)) + return false; + + switch (mode) { + case COMPACT_LAZY: + return should_compact_bset_lazy(b, t) || + (compacting && !bset_written(b, bset(b, t))); + case COMPACT_ALL: + return true; + default: + BUG(); + } +} + +static bool bch2_drop_whiteouts(struct btree *b, enum compact_mode mode) +{ + struct bset_tree *t; + bool ret = false; + + for_each_bset(b, t) { + struct bset *i = bset(b, t); + struct bkey_packed *k, *n, *out, *start, *end; + struct btree_node_entry *src = NULL, *dst = NULL; + + if (t != b->set && !bset_written(b, i)) { + src = container_of(i, struct btree_node_entry, keys); + dst = max(write_block(b), + (void *) btree_bkey_last(b, t - 1)); + } + + if (src != dst) + ret = true; + + if (!should_compact_bset(b, t, ret, mode)) { + if (src != dst) { + memmove(dst, src, sizeof(*src) + + le16_to_cpu(src->keys.u64s) * + sizeof(u64)); + i = &dst->keys; + set_btree_bset(b, t, i); + } + continue; + } + + start = btree_bkey_first(b, t); + end = btree_bkey_last(b, t); + + if (src != dst) { + memmove(dst, src, sizeof(*src)); + i = &dst->keys; + set_btree_bset(b, t, i); + } + + out = i->start; + + for (k = start; k != end; k = n) { + n = bkey_p_next(k); + + if (!bkey_deleted(k)) { + bkey_p_copy(out, k); + out = bkey_p_next(out); + } else { + BUG_ON(k->needs_whiteout); + } + } + + i->u64s = cpu_to_le16((u64 *) out - i->_data); + set_btree_bset_end(b, t); + bch2_bset_set_no_aux_tree(b, t); + ret = true; + } + + bch2_verify_btree_nr_keys(b); + + bch2_btree_build_aux_trees(b); + + return ret; +} + +bool bch2_compact_whiteouts(struct bch_fs *c, struct btree *b, + enum compact_mode mode) +{ + return bch2_drop_whiteouts(b, mode); +} + +static void btree_node_sort(struct bch_fs *c, struct btree *b, + unsigned start_idx, + unsigned end_idx, + bool filter_whiteouts) +{ + struct btree_node *out; + struct sort_iter_stack sort_iter; + struct bset_tree *t; + struct bset *start_bset = bset(b, &b->set[start_idx]); + bool used_mempool = false; + u64 start_time, seq = 0; + unsigned i, u64s = 0, bytes, shift = end_idx - start_idx - 1; + bool sorting_entire_node = start_idx == 0 && + end_idx == b->nsets; + + sort_iter_stack_init(&sort_iter, b); + + for (t = b->set + start_idx; + t < b->set + end_idx; + t++) { + u64s += le16_to_cpu(bset(b, t)->u64s); + sort_iter_add(&sort_iter.iter, + btree_bkey_first(b, t), + btree_bkey_last(b, t)); + } + + bytes = sorting_entire_node + ? btree_bytes(c) + : __vstruct_bytes(struct btree_node, u64s); + + out = btree_bounce_alloc(c, bytes, &used_mempool); + + start_time = local_clock(); + + u64s = bch2_sort_keys(out->keys.start, &sort_iter.iter, filter_whiteouts); + + out->keys.u64s = cpu_to_le16(u64s); + + BUG_ON(vstruct_end(&out->keys) > (void *) out + bytes); + + if (sorting_entire_node) + bch2_time_stats_update(&c->times[BCH_TIME_btree_node_sort], + start_time); + + /* Make sure we preserve bset journal_seq: */ + for (t = b->set + start_idx; t < b->set + end_idx; t++) + seq = max(seq, le64_to_cpu(bset(b, t)->journal_seq)); + start_bset->journal_seq = cpu_to_le64(seq); + + if (sorting_entire_node) { + u64s = le16_to_cpu(out->keys.u64s); + + BUG_ON(bytes != btree_bytes(c)); + + /* + * Our temporary buffer is the same size as the btree node's + * buffer, we can just swap buffers instead of doing a big + * memcpy() + */ + *out = *b->data; + out->keys.u64s = cpu_to_le16(u64s); + swap(out, b->data); + set_btree_bset(b, b->set, &b->data->keys); + } else { + start_bset->u64s = out->keys.u64s; + memcpy_u64s(start_bset->start, + out->keys.start, + le16_to_cpu(out->keys.u64s)); + } + + for (i = start_idx + 1; i < end_idx; i++) + b->nr.bset_u64s[start_idx] += + b->nr.bset_u64s[i]; + + b->nsets -= shift; + + for (i = start_idx + 1; i < b->nsets; i++) { + b->nr.bset_u64s[i] = b->nr.bset_u64s[i + shift]; + b->set[i] = b->set[i + shift]; + } + + for (i = b->nsets; i < MAX_BSETS; i++) + b->nr.bset_u64s[i] = 0; + + set_btree_bset_end(b, &b->set[start_idx]); + bch2_bset_set_no_aux_tree(b, &b->set[start_idx]); + + btree_bounce_free(c, bytes, used_mempool, out); + + bch2_verify_btree_nr_keys(b); +} + +void bch2_btree_sort_into(struct bch_fs *c, + struct btree *dst, + struct btree *src) +{ + struct btree_nr_keys nr; + struct btree_node_iter src_iter; + u64 start_time = local_clock(); + + BUG_ON(dst->nsets != 1); + + bch2_bset_set_no_aux_tree(dst, dst->set); + + bch2_btree_node_iter_init_from_start(&src_iter, src); + + nr = bch2_sort_repack(btree_bset_first(dst), + src, &src_iter, + &dst->format, + true); + + bch2_time_stats_update(&c->times[BCH_TIME_btree_node_sort], + start_time); + + set_btree_bset_end(dst, dst->set); + + dst->nr.live_u64s += nr.live_u64s; + dst->nr.bset_u64s[0] += nr.bset_u64s[0]; + dst->nr.packed_keys += nr.packed_keys; + dst->nr.unpacked_keys += nr.unpacked_keys; + + bch2_verify_btree_nr_keys(dst); +} + +/* + * We're about to add another bset to the btree node, so if there's currently + * too many bsets - sort some of them together: + */ +static bool btree_node_compact(struct bch_fs *c, struct btree *b) +{ + unsigned unwritten_idx; + bool ret = false; + + for (unwritten_idx = 0; + unwritten_idx < b->nsets; + unwritten_idx++) + if (!bset_written(b, bset(b, &b->set[unwritten_idx]))) + break; + + if (b->nsets - unwritten_idx > 1) { + btree_node_sort(c, b, unwritten_idx, + b->nsets, false); + ret = true; + } + + if (unwritten_idx > 1) { + btree_node_sort(c, b, 0, unwritten_idx, false); + ret = true; + } + + return ret; +} + +void bch2_btree_build_aux_trees(struct btree *b) +{ + struct bset_tree *t; + + for_each_bset(b, t) + bch2_bset_build_aux_tree(b, t, + !bset_written(b, bset(b, t)) && + t == bset_tree_last(b)); +} + +/* + * If we have MAX_BSETS (3) bsets, should we sort them all down to just one? + * + * The first bset is going to be of similar order to the size of the node, the + * last bset is bounded by btree_write_set_buffer(), which is set to keep the + * memmove on insert from being too expensive: the middle bset should, ideally, + * be the geometric mean of the first and the last. + * + * Returns true if the middle bset is greater than that geometric mean: + */ +static inline bool should_compact_all(struct bch_fs *c, struct btree *b) +{ + unsigned mid_u64s_bits = + (ilog2(btree_max_u64s(c)) + BTREE_WRITE_SET_U64s_BITS) / 2; + + return bset_u64s(&b->set[1]) > 1U << mid_u64s_bits; +} + +/* + * @bch_btree_init_next - initialize a new (unwritten) bset that can then be + * inserted into + * + * Safe to call if there already is an unwritten bset - will only add a new bset + * if @b doesn't already have one. + * + * Returns true if we sorted (i.e. invalidated iterators + */ +void bch2_btree_init_next(struct btree_trans *trans, struct btree *b) +{ + struct bch_fs *c = trans->c; + struct btree_node_entry *bne; + bool reinit_iter = false; + + EBUG_ON(!six_lock_counts(&b->c.lock).n[SIX_LOCK_write]); + BUG_ON(bset_written(b, bset(b, &b->set[1]))); + BUG_ON(btree_node_just_written(b)); + + if (b->nsets == MAX_BSETS && + !btree_node_write_in_flight(b) && + should_compact_all(c, b)) { + bch2_btree_node_write(c, b, SIX_LOCK_write, + BTREE_WRITE_init_next_bset); + reinit_iter = true; + } + + if (b->nsets == MAX_BSETS && + btree_node_compact(c, b)) + reinit_iter = true; + + BUG_ON(b->nsets >= MAX_BSETS); + + bne = want_new_bset(c, b); + if (bne) + bch2_bset_init_next(c, b, bne); + + bch2_btree_build_aux_trees(b); + + if (reinit_iter) + bch2_trans_node_reinit_iter(trans, b); +} + +static void btree_err_msg(struct printbuf *out, struct bch_fs *c, + struct bch_dev *ca, + struct btree *b, struct bset *i, + unsigned offset, int write) +{ + prt_printf(out, bch2_log_msg(c, "%s"), + write == READ + ? "error validating btree node " + : "corrupt btree node before write "); + if (ca) + prt_printf(out, "on %s ", ca->name); + prt_printf(out, "at btree "); + bch2_btree_pos_to_text(out, c, b); + + prt_printf(out, "\n node offset %u", b->written); + if (i) + prt_printf(out, " bset u64s %u", le16_to_cpu(i->u64s)); + prt_str(out, ": "); +} + +__printf(9, 10) +static int __btree_err(int ret, + struct bch_fs *c, + struct bch_dev *ca, + struct btree *b, + struct bset *i, + int write, + bool have_retry, + enum bch_sb_error_id err_type, + const char *fmt, ...) +{ + struct printbuf out = PRINTBUF; + va_list args; + + btree_err_msg(&out, c, ca, b, i, b->written, write); + + va_start(args, fmt); + prt_vprintf(&out, fmt, args); + va_end(args); + + if (write == WRITE) { + bch2_print_string_as_lines(KERN_ERR, out.buf); + ret = c->opts.errors == BCH_ON_ERROR_continue + ? 0 + : -BCH_ERR_fsck_errors_not_fixed; + goto out; + } + + if (!have_retry && ret == -BCH_ERR_btree_node_read_err_want_retry) + ret = -BCH_ERR_btree_node_read_err_fixable; + if (!have_retry && ret == -BCH_ERR_btree_node_read_err_must_retry) + ret = -BCH_ERR_btree_node_read_err_bad_node; + + if (ret != -BCH_ERR_btree_node_read_err_fixable) + bch2_sb_error_count(c, err_type); + + switch (ret) { + case -BCH_ERR_btree_node_read_err_fixable: + ret = bch2_fsck_err(c, FSCK_CAN_FIX, err_type, "%s", out.buf); + if (ret != -BCH_ERR_fsck_fix && + ret != -BCH_ERR_fsck_ignore) + goto fsck_err; + ret = -BCH_ERR_fsck_fix; + break; + case -BCH_ERR_btree_node_read_err_want_retry: + case -BCH_ERR_btree_node_read_err_must_retry: + bch2_print_string_as_lines(KERN_ERR, out.buf); + break; + case -BCH_ERR_btree_node_read_err_bad_node: + bch2_print_string_as_lines(KERN_ERR, out.buf); + bch2_topology_error(c); + ret = bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_check_topology) ?: -EIO; + break; + case -BCH_ERR_btree_node_read_err_incompatible: + bch2_print_string_as_lines(KERN_ERR, out.buf); + ret = -BCH_ERR_fsck_errors_not_fixed; + break; + default: + BUG(); + } +out: +fsck_err: + printbuf_exit(&out); + return ret; +} + +#define btree_err(type, c, ca, b, i, _err_type, msg, ...) \ +({ \ + int _ret = __btree_err(type, c, ca, b, i, write, have_retry, \ + BCH_FSCK_ERR_##_err_type, \ + msg, ##__VA_ARGS__); \ + \ + if (_ret != -BCH_ERR_fsck_fix) { \ + ret = _ret; \ + goto fsck_err; \ + } \ + \ + *saw_error = true; \ +}) + +#define btree_err_on(cond, ...) ((cond) ? btree_err(__VA_ARGS__) : false) + +/* + * When btree topology repair changes the start or end of a node, that might + * mean we have to drop keys that are no longer inside the node: + */ +__cold +void bch2_btree_node_drop_keys_outside_node(struct btree *b) +{ + struct bset_tree *t; + + for_each_bset(b, t) { + struct bset *i = bset(b, t); + struct bkey_packed *k; + + for (k = i->start; k != vstruct_last(i); k = bkey_p_next(k)) + if (bkey_cmp_left_packed(b, k, &b->data->min_key) >= 0) + break; + + if (k != i->start) { + unsigned shift = (u64 *) k - (u64 *) i->start; + + memmove_u64s_down(i->start, k, + (u64 *) vstruct_end(i) - (u64 *) k); + i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - shift); + set_btree_bset_end(b, t); + } + + for (k = i->start; k != vstruct_last(i); k = bkey_p_next(k)) + if (bkey_cmp_left_packed(b, k, &b->data->max_key) > 0) + break; + + if (k != vstruct_last(i)) { + i->u64s = cpu_to_le16((u64 *) k - (u64 *) i->start); + set_btree_bset_end(b, t); + } + } + + /* + * Always rebuild search trees: eytzinger search tree nodes directly + * depend on the values of min/max key: + */ + bch2_bset_set_no_aux_tree(b, b->set); + bch2_btree_build_aux_trees(b); + + struct bkey_s_c k; + struct bkey unpacked; + struct btree_node_iter iter; + for_each_btree_node_key_unpack(b, k, &iter, &unpacked) { + BUG_ON(bpos_lt(k.k->p, b->data->min_key)); + BUG_ON(bpos_gt(k.k->p, b->data->max_key)); + } +} + +static int validate_bset(struct bch_fs *c, struct bch_dev *ca, + struct btree *b, struct bset *i, + unsigned offset, unsigned sectors, + int write, bool have_retry, bool *saw_error) +{ + unsigned version = le16_to_cpu(i->version); + struct printbuf buf1 = PRINTBUF; + struct printbuf buf2 = PRINTBUF; + int ret = 0; + + btree_err_on(!bch2_version_compatible(version), + -BCH_ERR_btree_node_read_err_incompatible, + c, ca, b, i, + btree_node_unsupported_version, + "unsupported bset version %u.%u", + BCH_VERSION_MAJOR(version), + BCH_VERSION_MINOR(version)); + + if (btree_err_on(version < c->sb.version_min, + -BCH_ERR_btree_node_read_err_fixable, + c, NULL, b, i, + btree_node_bset_older_than_sb_min, + "bset version %u older than superblock version_min %u", + version, c->sb.version_min)) { + mutex_lock(&c->sb_lock); + c->disk_sb.sb->version_min = cpu_to_le16(version); + bch2_write_super(c); + mutex_unlock(&c->sb_lock); + } + + if (btree_err_on(BCH_VERSION_MAJOR(version) > + BCH_VERSION_MAJOR(c->sb.version), + -BCH_ERR_btree_node_read_err_fixable, + c, NULL, b, i, + btree_node_bset_newer_than_sb, + "bset version %u newer than superblock version %u", + version, c->sb.version)) { + mutex_lock(&c->sb_lock); + c->disk_sb.sb->version = cpu_to_le16(version); + bch2_write_super(c); + mutex_unlock(&c->sb_lock); + } + + btree_err_on(BSET_SEPARATE_WHITEOUTS(i), + -BCH_ERR_btree_node_read_err_incompatible, + c, ca, b, i, + btree_node_unsupported_version, + "BSET_SEPARATE_WHITEOUTS no longer supported"); + + if (btree_err_on(offset + sectors > btree_sectors(c), + -BCH_ERR_btree_node_read_err_fixable, + c, ca, b, i, + bset_past_end_of_btree_node, + "bset past end of btree node")) { + i->u64s = 0; + ret = 0; + goto out; + } + + btree_err_on(offset && !i->u64s, + -BCH_ERR_btree_node_read_err_fixable, + c, ca, b, i, + bset_empty, + "empty bset"); + + btree_err_on(BSET_OFFSET(i) && BSET_OFFSET(i) != offset, + -BCH_ERR_btree_node_read_err_want_retry, + c, ca, b, i, + bset_wrong_sector_offset, + "bset at wrong sector offset"); + + if (!offset) { + struct btree_node *bn = + container_of(i, struct btree_node, keys); + /* These indicate that we read the wrong btree node: */ + + if (b->key.k.type == KEY_TYPE_btree_ptr_v2) { + struct bch_btree_ptr_v2 *bp = + &bkey_i_to_btree_ptr_v2(&b->key)->v; + + /* XXX endianness */ + btree_err_on(bp->seq != bn->keys.seq, + -BCH_ERR_btree_node_read_err_must_retry, + c, ca, b, NULL, + bset_bad_seq, + "incorrect sequence number (wrong btree node)"); + } + + btree_err_on(BTREE_NODE_ID(bn) != b->c.btree_id, + -BCH_ERR_btree_node_read_err_must_retry, + c, ca, b, i, + btree_node_bad_btree, + "incorrect btree id"); + + btree_err_on(BTREE_NODE_LEVEL(bn) != b->c.level, + -BCH_ERR_btree_node_read_err_must_retry, + c, ca, b, i, + btree_node_bad_level, + "incorrect level"); + + if (!write) + compat_btree_node(b->c.level, b->c.btree_id, version, + BSET_BIG_ENDIAN(i), write, bn); + + if (b->key.k.type == KEY_TYPE_btree_ptr_v2) { + struct bch_btree_ptr_v2 *bp = + &bkey_i_to_btree_ptr_v2(&b->key)->v; + + if (BTREE_PTR_RANGE_UPDATED(bp)) { + b->data->min_key = bp->min_key; + b->data->max_key = b->key.k.p; + } + + btree_err_on(!bpos_eq(b->data->min_key, bp->min_key), + -BCH_ERR_btree_node_read_err_must_retry, + c, ca, b, NULL, + btree_node_bad_min_key, + "incorrect min_key: got %s should be %s", + (printbuf_reset(&buf1), + bch2_bpos_to_text(&buf1, bn->min_key), buf1.buf), + (printbuf_reset(&buf2), + bch2_bpos_to_text(&buf2, bp->min_key), buf2.buf)); + } + + btree_err_on(!bpos_eq(bn->max_key, b->key.k.p), + -BCH_ERR_btree_node_read_err_must_retry, + c, ca, b, i, + btree_node_bad_max_key, + "incorrect max key %s", + (printbuf_reset(&buf1), + bch2_bpos_to_text(&buf1, bn->max_key), buf1.buf)); + + if (write) + compat_btree_node(b->c.level, b->c.btree_id, version, + BSET_BIG_ENDIAN(i), write, bn); + + btree_err_on(bch2_bkey_format_invalid(c, &bn->format, write, &buf1), + -BCH_ERR_btree_node_read_err_bad_node, + c, ca, b, i, + btree_node_bad_format, + "invalid bkey format: %s\n %s", buf1.buf, + (printbuf_reset(&buf2), + bch2_bkey_format_to_text(&buf2, &bn->format), buf2.buf)); + printbuf_reset(&buf1); + + compat_bformat(b->c.level, b->c.btree_id, version, + BSET_BIG_ENDIAN(i), write, + &bn->format); + } +out: +fsck_err: + printbuf_exit(&buf2); + printbuf_exit(&buf1); + return ret; +} + +static int bset_key_invalid(struct bch_fs *c, struct btree *b, + struct bkey_s_c k, + bool updated_range, int rw, + struct printbuf *err) +{ + return __bch2_bkey_invalid(c, k, btree_node_type(b), READ, err) ?: + (!updated_range ? bch2_bkey_in_btree_node(c, b, k, err) : 0) ?: + (rw == WRITE ? bch2_bkey_val_invalid(c, k, READ, err) : 0); +} + +static int validate_bset_keys(struct bch_fs *c, struct btree *b, + struct bset *i, int write, + bool have_retry, bool *saw_error) +{ + unsigned version = le16_to_cpu(i->version); + struct bkey_packed *k, *prev = NULL; + struct printbuf buf = PRINTBUF; + bool updated_range = b->key.k.type == KEY_TYPE_btree_ptr_v2 && + BTREE_PTR_RANGE_UPDATED(&bkey_i_to_btree_ptr_v2(&b->key)->v); + int ret = 0; + + for (k = i->start; + k != vstruct_last(i);) { + struct bkey_s u; + struct bkey tmp; + + if (btree_err_on(bkey_p_next(k) > vstruct_last(i), + -BCH_ERR_btree_node_read_err_fixable, + c, NULL, b, i, + btree_node_bkey_past_bset_end, + "key extends past end of bset")) { + i->u64s = cpu_to_le16((u64 *) k - i->_data); + break; + } + + if (btree_err_on(k->format > KEY_FORMAT_CURRENT, + -BCH_ERR_btree_node_read_err_fixable, + c, NULL, b, i, + btree_node_bkey_bad_format, + "invalid bkey format %u", k->format)) { + i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s); + memmove_u64s_down(k, bkey_p_next(k), + (u64 *) vstruct_end(i) - (u64 *) k); + continue; + } + + /* XXX: validate k->u64s */ + if (!write) + bch2_bkey_compat(b->c.level, b->c.btree_id, version, + BSET_BIG_ENDIAN(i), write, + &b->format, k); + + u = __bkey_disassemble(b, k, &tmp); + + printbuf_reset(&buf); + if (bset_key_invalid(c, b, u.s_c, updated_range, write, &buf)) { + printbuf_reset(&buf); + bset_key_invalid(c, b, u.s_c, updated_range, write, &buf); + prt_printf(&buf, "\n "); + bch2_bkey_val_to_text(&buf, c, u.s_c); + + btree_err(-BCH_ERR_btree_node_read_err_fixable, + c, NULL, b, i, + btree_node_bad_bkey, + "invalid bkey: %s", buf.buf); + + i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s); + memmove_u64s_down(k, bkey_p_next(k), + (u64 *) vstruct_end(i) - (u64 *) k); + continue; + } + + if (write) + bch2_bkey_compat(b->c.level, b->c.btree_id, version, + BSET_BIG_ENDIAN(i), write, + &b->format, k); + + if (prev && bkey_iter_cmp(b, prev, k) > 0) { + struct bkey up = bkey_unpack_key(b, prev); + + printbuf_reset(&buf); + prt_printf(&buf, "keys out of order: "); + bch2_bkey_to_text(&buf, &up); + prt_printf(&buf, " > "); + bch2_bkey_to_text(&buf, u.k); + + bch2_dump_bset(c, b, i, 0); + + if (btree_err(-BCH_ERR_btree_node_read_err_fixable, + c, NULL, b, i, + btree_node_bkey_out_of_order, + "%s", buf.buf)) { + i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s); + memmove_u64s_down(k, bkey_p_next(k), + (u64 *) vstruct_end(i) - (u64 *) k); + continue; + } + } + + prev = k; + k = bkey_p_next(k); + } +fsck_err: + printbuf_exit(&buf); + return ret; +} + +int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, + struct btree *b, bool have_retry, bool *saw_error) +{ + struct btree_node_entry *bne; + struct sort_iter *iter; + struct btree_node *sorted; + struct bkey_packed *k; + struct bch_extent_ptr *ptr; + struct bset *i; + bool used_mempool, blacklisted; + bool updated_range = b->key.k.type == KEY_TYPE_btree_ptr_v2 && + BTREE_PTR_RANGE_UPDATED(&bkey_i_to_btree_ptr_v2(&b->key)->v); + unsigned u64s; + unsigned ptr_written = btree_ptr_sectors_written(&b->key); + struct printbuf buf = PRINTBUF; + int ret = 0, retry_read = 0, write = READ; + + b->version_ondisk = U16_MAX; + /* We might get called multiple times on read retry: */ + b->written = 0; + + iter = mempool_alloc(&c->fill_iter, GFP_NOFS); + sort_iter_init(iter, b, (btree_blocks(c) + 1) * 2); + + if (bch2_meta_read_fault("btree")) + btree_err(-BCH_ERR_btree_node_read_err_must_retry, + c, ca, b, NULL, + btree_node_fault_injected, + "dynamic fault"); + + btree_err_on(le64_to_cpu(b->data->magic) != bset_magic(c), + -BCH_ERR_btree_node_read_err_must_retry, + c, ca, b, NULL, + btree_node_bad_magic, + "bad magic: want %llx, got %llx", + bset_magic(c), le64_to_cpu(b->data->magic)); + + if (b->key.k.type == KEY_TYPE_btree_ptr_v2) { + struct bch_btree_ptr_v2 *bp = + &bkey_i_to_btree_ptr_v2(&b->key)->v; + + btree_err_on(b->data->keys.seq != bp->seq, + -BCH_ERR_btree_node_read_err_must_retry, + c, ca, b, NULL, + btree_node_bad_seq, + "got wrong btree node (seq %llx want %llx)", + b->data->keys.seq, bp->seq); + } else { + btree_err_on(!b->data->keys.seq, + -BCH_ERR_btree_node_read_err_must_retry, + c, ca, b, NULL, + btree_node_bad_seq, + "bad btree header: seq 0"); + } + + while (b->written < (ptr_written ?: btree_sectors(c))) { + unsigned sectors; + struct nonce nonce; + bool first = !b->written; + bool csum_bad; + + if (!b->written) { + i = &b->data->keys; + + btree_err_on(!bch2_checksum_type_valid(c, BSET_CSUM_TYPE(i)), + -BCH_ERR_btree_node_read_err_want_retry, + c, ca, b, i, + bset_unknown_csum, + "unknown checksum type %llu", BSET_CSUM_TYPE(i)); + + nonce = btree_nonce(i, b->written << 9); + + csum_bad = bch2_crc_cmp(b->data->csum, + csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, b->data)); + if (csum_bad) + bch2_io_error(ca, BCH_MEMBER_ERROR_checksum); + + btree_err_on(csum_bad, + -BCH_ERR_btree_node_read_err_want_retry, + c, ca, b, i, + bset_bad_csum, + "invalid checksum"); + + ret = bset_encrypt(c, i, b->written << 9); + if (bch2_fs_fatal_err_on(ret, c, + "error decrypting btree node: %i", ret)) + goto fsck_err; + + btree_err_on(btree_node_type_is_extents(btree_node_type(b)) && + !BTREE_NODE_NEW_EXTENT_OVERWRITE(b->data), + -BCH_ERR_btree_node_read_err_incompatible, + c, NULL, b, NULL, + btree_node_unsupported_version, + "btree node does not have NEW_EXTENT_OVERWRITE set"); + + sectors = vstruct_sectors(b->data, c->block_bits); + } else { + bne = write_block(b); + i = &bne->keys; + + if (i->seq != b->data->keys.seq) + break; + + btree_err_on(!bch2_checksum_type_valid(c, BSET_CSUM_TYPE(i)), + -BCH_ERR_btree_node_read_err_want_retry, + c, ca, b, i, + bset_unknown_csum, + "unknown checksum type %llu", BSET_CSUM_TYPE(i)); + + nonce = btree_nonce(i, b->written << 9); + csum_bad = bch2_crc_cmp(bne->csum, + csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, bne)); + if (csum_bad) + bch2_io_error(ca, BCH_MEMBER_ERROR_checksum); + + btree_err_on(csum_bad, + -BCH_ERR_btree_node_read_err_want_retry, + c, ca, b, i, + bset_bad_csum, + "invalid checksum"); + + ret = bset_encrypt(c, i, b->written << 9); + if (bch2_fs_fatal_err_on(ret, c, + "error decrypting btree node: %i\n", ret)) + goto fsck_err; + + sectors = vstruct_sectors(bne, c->block_bits); + } + + b->version_ondisk = min(b->version_ondisk, + le16_to_cpu(i->version)); + + ret = validate_bset(c, ca, b, i, b->written, sectors, + READ, have_retry, saw_error); + if (ret) + goto fsck_err; + + if (!b->written) + btree_node_set_format(b, b->data->format); + + ret = validate_bset_keys(c, b, i, READ, have_retry, saw_error); + if (ret) + goto fsck_err; + + SET_BSET_BIG_ENDIAN(i, CPU_BIG_ENDIAN); + + blacklisted = bch2_journal_seq_is_blacklisted(c, + le64_to_cpu(i->journal_seq), + true); + + btree_err_on(blacklisted && first, + -BCH_ERR_btree_node_read_err_fixable, + c, ca, b, i, + bset_blacklisted_journal_seq, + "first btree node bset has blacklisted journal seq (%llu)", + le64_to_cpu(i->journal_seq)); + + btree_err_on(blacklisted && ptr_written, + -BCH_ERR_btree_node_read_err_fixable, + c, ca, b, i, + first_bset_blacklisted_journal_seq, + "found blacklisted bset (journal seq %llu) in btree node at offset %u-%u/%u", + le64_to_cpu(i->journal_seq), + b->written, b->written + sectors, ptr_written); + + b->written += sectors; + + if (blacklisted && !first) + continue; + + sort_iter_add(iter, + vstruct_idx(i, 0), + vstruct_last(i)); + } + + if (ptr_written) { + btree_err_on(b->written < ptr_written, + -BCH_ERR_btree_node_read_err_want_retry, + c, ca, b, NULL, + btree_node_data_missing, + "btree node data missing: expected %u sectors, found %u", + ptr_written, b->written); + } else { + for (bne = write_block(b); + bset_byte_offset(b, bne) < btree_bytes(c); + bne = (void *) bne + block_bytes(c)) + btree_err_on(bne->keys.seq == b->data->keys.seq && + !bch2_journal_seq_is_blacklisted(c, + le64_to_cpu(bne->keys.journal_seq), + true), + -BCH_ERR_btree_node_read_err_want_retry, + c, ca, b, NULL, + btree_node_bset_after_end, + "found bset signature after last bset"); + } + + sorted = btree_bounce_alloc(c, btree_bytes(c), &used_mempool); + sorted->keys.u64s = 0; + + set_btree_bset(b, b->set, &b->data->keys); + + b->nr = bch2_key_sort_fix_overlapping(c, &sorted->keys, iter); + + u64s = le16_to_cpu(sorted->keys.u64s); + *sorted = *b->data; + sorted->keys.u64s = cpu_to_le16(u64s); + swap(sorted, b->data); + set_btree_bset(b, b->set, &b->data->keys); + b->nsets = 1; + + BUG_ON(b->nr.live_u64s != u64s); + + btree_bounce_free(c, btree_bytes(c), used_mempool, sorted); + + if (updated_range) + bch2_btree_node_drop_keys_outside_node(b); + + i = &b->data->keys; + for (k = i->start; k != vstruct_last(i);) { + struct bkey tmp; + struct bkey_s u = __bkey_disassemble(b, k, &tmp); + + printbuf_reset(&buf); + + if (bch2_bkey_val_invalid(c, u.s_c, READ, &buf) || + (bch2_inject_invalid_keys && + !bversion_cmp(u.k->version, MAX_VERSION))) { + printbuf_reset(&buf); + + prt_printf(&buf, "invalid bkey: "); + bch2_bkey_val_invalid(c, u.s_c, READ, &buf); + prt_printf(&buf, "\n "); + bch2_bkey_val_to_text(&buf, c, u.s_c); + + btree_err(-BCH_ERR_btree_node_read_err_fixable, + c, NULL, b, i, + btree_node_bad_bkey, + "%s", buf.buf); + + btree_keys_account_key_drop(&b->nr, 0, k); + + i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s); + memmove_u64s_down(k, bkey_p_next(k), + (u64 *) vstruct_end(i) - (u64 *) k); + set_btree_bset_end(b, b->set); + continue; + } + + if (u.k->type == KEY_TYPE_btree_ptr_v2) { + struct bkey_s_btree_ptr_v2 bp = bkey_s_to_btree_ptr_v2(u); + + bp.v->mem_ptr = 0; + } + + k = bkey_p_next(k); + } + + bch2_bset_build_aux_tree(b, b->set, false); + + set_needs_whiteout(btree_bset_first(b), true); + + btree_node_reset_sib_u64s(b); + + bkey_for_each_ptr(bch2_bkey_ptrs(bkey_i_to_s(&b->key)), ptr) { + struct bch_dev *ca2 = bch_dev_bkey_exists(c, ptr->dev); + + if (ca2->mi.state != BCH_MEMBER_STATE_rw) + set_btree_node_need_rewrite(b); + } + + if (!ptr_written) + set_btree_node_need_rewrite(b); +out: + mempool_free(iter, &c->fill_iter); + printbuf_exit(&buf); + return retry_read; +fsck_err: + if (ret == -BCH_ERR_btree_node_read_err_want_retry || + ret == -BCH_ERR_btree_node_read_err_must_retry) + retry_read = 1; + else + set_btree_node_read_error(b); + goto out; +} + +static void btree_node_read_work(struct work_struct *work) +{ + struct btree_read_bio *rb = + container_of(work, struct btree_read_bio, work); + struct bch_fs *c = rb->c; + struct btree *b = rb->b; + struct bch_dev *ca = bch_dev_bkey_exists(c, rb->pick.ptr.dev); + struct bio *bio = &rb->bio; + struct bch_io_failures failed = { .nr = 0 }; + struct printbuf buf = PRINTBUF; + bool saw_error = false; + bool retry = false; + bool can_retry; + + goto start; + while (1) { + retry = true; + bch_info(c, "retrying read"); + ca = bch_dev_bkey_exists(c, rb->pick.ptr.dev); + rb->have_ioref = bch2_dev_get_ioref(ca, READ); + bio_reset(bio, NULL, REQ_OP_READ|REQ_SYNC|REQ_META); + bio->bi_iter.bi_sector = rb->pick.ptr.offset; + bio->bi_iter.bi_size = btree_bytes(c); + + if (rb->have_ioref) { + bio_set_dev(bio, ca->disk_sb.bdev); + submit_bio_wait(bio); + } else { + bio->bi_status = BLK_STS_REMOVED; + } +start: + printbuf_reset(&buf); + bch2_btree_pos_to_text(&buf, c, b); + bch2_dev_io_err_on(bio->bi_status, ca, BCH_MEMBER_ERROR_read, + "btree read error %s for %s", + bch2_blk_status_to_str(bio->bi_status), buf.buf); + if (rb->have_ioref) + percpu_ref_put(&ca->io_ref); + rb->have_ioref = false; + + bch2_mark_io_failure(&failed, &rb->pick); + + can_retry = bch2_bkey_pick_read_device(c, + bkey_i_to_s_c(&b->key), + &failed, &rb->pick) > 0; + + if (!bio->bi_status && + !bch2_btree_node_read_done(c, ca, b, can_retry, &saw_error)) { + if (retry) + bch_info(c, "retry success"); + break; + } + + saw_error = true; + + if (!can_retry) { + set_btree_node_read_error(b); + break; + } + } + + bch2_time_stats_update(&c->times[BCH_TIME_btree_node_read], + rb->start_time); + bio_put(&rb->bio); + + if (saw_error && !btree_node_read_error(b)) { + printbuf_reset(&buf); + bch2_bpos_to_text(&buf, b->key.k.p); + bch_info(c, "%s: rewriting btree node at btree=%s level=%u %s due to error", + __func__, bch2_btree_id_str(b->c.btree_id), b->c.level, buf.buf); + + bch2_btree_node_rewrite_async(c, b); + } + + printbuf_exit(&buf); + clear_btree_node_read_in_flight(b); + wake_up_bit(&b->flags, BTREE_NODE_read_in_flight); +} + +static void btree_node_read_endio(struct bio *bio) +{ + struct btree_read_bio *rb = + container_of(bio, struct btree_read_bio, bio); + struct bch_fs *c = rb->c; + + if (rb->have_ioref) { + struct bch_dev *ca = bch_dev_bkey_exists(c, rb->pick.ptr.dev); + + bch2_latency_acct(ca, rb->start_time, READ); + } + + queue_work(c->io_complete_wq, &rb->work); +} + +struct btree_node_read_all { + struct closure cl; + struct bch_fs *c; + struct btree *b; + unsigned nr; + void *buf[BCH_REPLICAS_MAX]; + struct bio *bio[BCH_REPLICAS_MAX]; + blk_status_t err[BCH_REPLICAS_MAX]; +}; + +static unsigned btree_node_sectors_written(struct bch_fs *c, void *data) +{ + struct btree_node *bn = data; + struct btree_node_entry *bne; + unsigned offset = 0; + + if (le64_to_cpu(bn->magic) != bset_magic(c)) + return 0; + + while (offset < btree_sectors(c)) { + if (!offset) { + offset += vstruct_sectors(bn, c->block_bits); + } else { + bne = data + (offset << 9); + if (bne->keys.seq != bn->keys.seq) + break; + offset += vstruct_sectors(bne, c->block_bits); + } + } + + return offset; +} + +static bool btree_node_has_extra_bsets(struct bch_fs *c, unsigned offset, void *data) +{ + struct btree_node *bn = data; + struct btree_node_entry *bne; + + if (!offset) + return false; + + while (offset < btree_sectors(c)) { + bne = data + (offset << 9); + if (bne->keys.seq == bn->keys.seq) + return true; + offset++; + } + + return false; + return offset; +} + +static CLOSURE_CALLBACK(btree_node_read_all_replicas_done) +{ + closure_type(ra, struct btree_node_read_all, cl); + struct bch_fs *c = ra->c; + struct btree *b = ra->b; + struct printbuf buf = PRINTBUF; + bool dump_bset_maps = false; + bool have_retry = false; + int ret = 0, best = -1, write = READ; + unsigned i, written = 0, written2 = 0; + __le64 seq = b->key.k.type == KEY_TYPE_btree_ptr_v2 + ? bkey_i_to_btree_ptr_v2(&b->key)->v.seq : 0; + bool _saw_error = false, *saw_error = &_saw_error; + + for (i = 0; i < ra->nr; i++) { + struct btree_node *bn = ra->buf[i]; + + if (ra->err[i]) + continue; + + if (le64_to_cpu(bn->magic) != bset_magic(c) || + (seq && seq != bn->keys.seq)) + continue; + + if (best < 0) { + best = i; + written = btree_node_sectors_written(c, bn); + continue; + } + + written2 = btree_node_sectors_written(c, ra->buf[i]); + if (btree_err_on(written2 != written, -BCH_ERR_btree_node_read_err_fixable, + c, NULL, b, NULL, + btree_node_replicas_sectors_written_mismatch, + "btree node sectors written mismatch: %u != %u", + written, written2) || + btree_err_on(btree_node_has_extra_bsets(c, written2, ra->buf[i]), + -BCH_ERR_btree_node_read_err_fixable, + c, NULL, b, NULL, + btree_node_bset_after_end, + "found bset signature after last bset") || + btree_err_on(memcmp(ra->buf[best], ra->buf[i], written << 9), + -BCH_ERR_btree_node_read_err_fixable, + c, NULL, b, NULL, + btree_node_replicas_data_mismatch, + "btree node replicas content mismatch")) + dump_bset_maps = true; + + if (written2 > written) { + written = written2; + best = i; + } + } +fsck_err: + if (dump_bset_maps) { + for (i = 0; i < ra->nr; i++) { + struct btree_node *bn = ra->buf[i]; + struct btree_node_entry *bne = NULL; + unsigned offset = 0, sectors; + bool gap = false; + + if (ra->err[i]) + continue; + + printbuf_reset(&buf); + + while (offset < btree_sectors(c)) { + if (!offset) { + sectors = vstruct_sectors(bn, c->block_bits); + } else { + bne = ra->buf[i] + (offset << 9); + if (bne->keys.seq != bn->keys.seq) + break; + sectors = vstruct_sectors(bne, c->block_bits); + } + + prt_printf(&buf, " %u-%u", offset, offset + sectors); + if (bne && bch2_journal_seq_is_blacklisted(c, + le64_to_cpu(bne->keys.journal_seq), false)) + prt_printf(&buf, "*"); + offset += sectors; + } + + while (offset < btree_sectors(c)) { + bne = ra->buf[i] + (offset << 9); + if (bne->keys.seq == bn->keys.seq) { + if (!gap) + prt_printf(&buf, " GAP"); + gap = true; + + sectors = vstruct_sectors(bne, c->block_bits); + prt_printf(&buf, " %u-%u", offset, offset + sectors); + if (bch2_journal_seq_is_blacklisted(c, + le64_to_cpu(bne->keys.journal_seq), false)) + prt_printf(&buf, "*"); + } + offset++; + } + + bch_err(c, "replica %u:%s", i, buf.buf); + } + } + + if (best >= 0) { + memcpy(b->data, ra->buf[best], btree_bytes(c)); + ret = bch2_btree_node_read_done(c, NULL, b, false, saw_error); + } else { + ret = -1; + } + + if (ret) + set_btree_node_read_error(b); + else if (*saw_error) + bch2_btree_node_rewrite_async(c, b); + + for (i = 0; i < ra->nr; i++) { + mempool_free(ra->buf[i], &c->btree_bounce_pool); + bio_put(ra->bio[i]); + } + + closure_debug_destroy(&ra->cl); + kfree(ra); + printbuf_exit(&buf); + + clear_btree_node_read_in_flight(b); + wake_up_bit(&b->flags, BTREE_NODE_read_in_flight); +} + +static void btree_node_read_all_replicas_endio(struct bio *bio) +{ + struct btree_read_bio *rb = + container_of(bio, struct btree_read_bio, bio); + struct bch_fs *c = rb->c; + struct btree_node_read_all *ra = rb->ra; + + if (rb->have_ioref) { + struct bch_dev *ca = bch_dev_bkey_exists(c, rb->pick.ptr.dev); + + bch2_latency_acct(ca, rb->start_time, READ); + } + + ra->err[rb->idx] = bio->bi_status; + closure_put(&ra->cl); +} + +/* + * XXX This allocates multiple times from the same mempools, and can deadlock + * under sufficient memory pressure (but is only a debug path) + */ +static int btree_node_read_all_replicas(struct bch_fs *c, struct btree *b, bool sync) +{ + struct bkey_s_c k = bkey_i_to_s_c(&b->key); + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + const union bch_extent_entry *entry; + struct extent_ptr_decoded pick; + struct btree_node_read_all *ra; + unsigned i; + + ra = kzalloc(sizeof(*ra), GFP_NOFS); + if (!ra) + return -BCH_ERR_ENOMEM_btree_node_read_all_replicas; + + closure_init(&ra->cl, NULL); + ra->c = c; + ra->b = b; + ra->nr = bch2_bkey_nr_ptrs(k); + + for (i = 0; i < ra->nr; i++) { + ra->buf[i] = mempool_alloc(&c->btree_bounce_pool, GFP_NOFS); + ra->bio[i] = bio_alloc_bioset(NULL, + buf_pages(ra->buf[i], btree_bytes(c)), + REQ_OP_READ|REQ_SYNC|REQ_META, + GFP_NOFS, + &c->btree_bio); + } + + i = 0; + bkey_for_each_ptr_decode(k.k, ptrs, pick, entry) { + struct bch_dev *ca = bch_dev_bkey_exists(c, pick.ptr.dev); + struct btree_read_bio *rb = + container_of(ra->bio[i], struct btree_read_bio, bio); + rb->c = c; + rb->b = b; + rb->ra = ra; + rb->start_time = local_clock(); + rb->have_ioref = bch2_dev_get_ioref(ca, READ); + rb->idx = i; + rb->pick = pick; + rb->bio.bi_iter.bi_sector = pick.ptr.offset; + rb->bio.bi_end_io = btree_node_read_all_replicas_endio; + bch2_bio_map(&rb->bio, ra->buf[i], btree_bytes(c)); + + if (rb->have_ioref) { + this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_btree], + bio_sectors(&rb->bio)); + bio_set_dev(&rb->bio, ca->disk_sb.bdev); + + closure_get(&ra->cl); + submit_bio(&rb->bio); + } else { + ra->err[i] = BLK_STS_REMOVED; + } + + i++; + } + + if (sync) { + closure_sync(&ra->cl); + btree_node_read_all_replicas_done(&ra->cl.work); + } else { + continue_at(&ra->cl, btree_node_read_all_replicas_done, + c->io_complete_wq); + } + + return 0; +} + +void bch2_btree_node_read(struct bch_fs *c, struct btree *b, + bool sync) +{ + struct extent_ptr_decoded pick; + struct btree_read_bio *rb; + struct bch_dev *ca; + struct bio *bio; + int ret; + + trace_and_count(c, btree_node_read, c, b); + + if (bch2_verify_all_btree_replicas && + !btree_node_read_all_replicas(c, b, sync)) + return; + + ret = bch2_bkey_pick_read_device(c, bkey_i_to_s_c(&b->key), + NULL, &pick); + + if (ret <= 0) { + struct printbuf buf = PRINTBUF; + + prt_str(&buf, "btree node read error: no device to read from\n at "); + bch2_btree_pos_to_text(&buf, c, b); + bch_err(c, "%s", buf.buf); + + if (c->recovery_passes_explicit & BIT_ULL(BCH_RECOVERY_PASS_check_topology) && + c->curr_recovery_pass > BCH_RECOVERY_PASS_check_topology) + bch2_fatal_error(c); + + set_btree_node_read_error(b); + clear_btree_node_read_in_flight(b); + wake_up_bit(&b->flags, BTREE_NODE_read_in_flight); + printbuf_exit(&buf); + return; + } + + ca = bch_dev_bkey_exists(c, pick.ptr.dev); + + bio = bio_alloc_bioset(NULL, + buf_pages(b->data, btree_bytes(c)), + REQ_OP_READ|REQ_SYNC|REQ_META, + GFP_NOFS, + &c->btree_bio); + rb = container_of(bio, struct btree_read_bio, bio); + rb->c = c; + rb->b = b; + rb->ra = NULL; + rb->start_time = local_clock(); + rb->have_ioref = bch2_dev_get_ioref(ca, READ); + rb->pick = pick; + INIT_WORK(&rb->work, btree_node_read_work); + bio->bi_iter.bi_sector = pick.ptr.offset; + bio->bi_end_io = btree_node_read_endio; + bch2_bio_map(bio, b->data, btree_bytes(c)); + + if (rb->have_ioref) { + this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_btree], + bio_sectors(bio)); + bio_set_dev(bio, ca->disk_sb.bdev); + + if (sync) { + submit_bio_wait(bio); + + btree_node_read_work(&rb->work); + } else { + submit_bio(bio); + } + } else { + bio->bi_status = BLK_STS_REMOVED; + + if (sync) + btree_node_read_work(&rb->work); + else + queue_work(c->io_complete_wq, &rb->work); + } +} + +static int __bch2_btree_root_read(struct btree_trans *trans, enum btree_id id, + const struct bkey_i *k, unsigned level) +{ + struct bch_fs *c = trans->c; + struct closure cl; + struct btree *b; + int ret; + + closure_init_stack(&cl); + + do { + ret = bch2_btree_cache_cannibalize_lock(c, &cl); + closure_sync(&cl); + } while (ret); + + b = bch2_btree_node_mem_alloc(trans, level != 0); + bch2_btree_cache_cannibalize_unlock(c); + + BUG_ON(IS_ERR(b)); + + bkey_copy(&b->key, k); + BUG_ON(bch2_btree_node_hash_insert(&c->btree_cache, b, level, id)); + + set_btree_node_read_in_flight(b); + + bch2_btree_node_read(c, b, true); + + if (btree_node_read_error(b)) { + bch2_btree_node_hash_remove(&c->btree_cache, b); + + mutex_lock(&c->btree_cache.lock); + list_move(&b->list, &c->btree_cache.freeable); + mutex_unlock(&c->btree_cache.lock); + + ret = -EIO; + goto err; + } + + bch2_btree_set_root_for_read(c, b); +err: + six_unlock_write(&b->c.lock); + six_unlock_intent(&b->c.lock); + + return ret; +} + +int bch2_btree_root_read(struct bch_fs *c, enum btree_id id, + const struct bkey_i *k, unsigned level) +{ + return bch2_trans_run(c, __bch2_btree_root_read(trans, id, k, level)); +} + +static void bch2_btree_complete_write(struct bch_fs *c, struct btree *b, + struct btree_write *w) +{ + unsigned long old, new, v = READ_ONCE(b->will_make_reachable); + + do { + old = new = v; + if (!(old & 1)) + break; + + new &= ~1UL; + } while ((v = cmpxchg(&b->will_make_reachable, old, new)) != old); + + if (old & 1) + closure_put(&((struct btree_update *) new)->cl); + + bch2_journal_pin_drop(&c->journal, &w->journal); +} + +static void __btree_node_write_done(struct bch_fs *c, struct btree *b) +{ + struct btree_write *w = btree_prev_write(b); + unsigned long old, new, v; + unsigned type = 0; + + bch2_btree_complete_write(c, b, w); + + v = READ_ONCE(b->flags); + do { + old = new = v; + + if ((old & (1U << BTREE_NODE_dirty)) && + (old & (1U << BTREE_NODE_need_write)) && + !(old & (1U << BTREE_NODE_never_write)) && + !(old & (1U << BTREE_NODE_write_blocked)) && + !(old & (1U << BTREE_NODE_will_make_reachable))) { + new &= ~(1U << BTREE_NODE_dirty); + new &= ~(1U << BTREE_NODE_need_write); + new |= (1U << BTREE_NODE_write_in_flight); + new |= (1U << BTREE_NODE_write_in_flight_inner); + new |= (1U << BTREE_NODE_just_written); + new ^= (1U << BTREE_NODE_write_idx); + + type = new & BTREE_WRITE_TYPE_MASK; + new &= ~BTREE_WRITE_TYPE_MASK; + } else { + new &= ~(1U << BTREE_NODE_write_in_flight); + new &= ~(1U << BTREE_NODE_write_in_flight_inner); + } + } while ((v = cmpxchg(&b->flags, old, new)) != old); + + if (new & (1U << BTREE_NODE_write_in_flight)) + __bch2_btree_node_write(c, b, BTREE_WRITE_ALREADY_STARTED|type); + else + wake_up_bit(&b->flags, BTREE_NODE_write_in_flight); +} + +static void btree_node_write_done(struct bch_fs *c, struct btree *b) +{ + struct btree_trans *trans = bch2_trans_get(c); + + btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_read); + __btree_node_write_done(c, b); + six_unlock_read(&b->c.lock); + + bch2_trans_put(trans); +} + +static void btree_node_write_work(struct work_struct *work) +{ + struct btree_write_bio *wbio = + container_of(work, struct btree_write_bio, work); + struct bch_fs *c = wbio->wbio.c; + struct btree *b = wbio->wbio.bio.bi_private; + struct bch_extent_ptr *ptr; + int ret = 0; + + btree_bounce_free(c, + wbio->data_bytes, + wbio->wbio.used_mempool, + wbio->data); + + bch2_bkey_drop_ptrs(bkey_i_to_s(&wbio->key), ptr, + bch2_dev_list_has_dev(wbio->wbio.failed, ptr->dev)); + + if (!bch2_bkey_nr_ptrs(bkey_i_to_s_c(&wbio->key))) + goto err; + + if (wbio->wbio.first_btree_write) { + if (wbio->wbio.failed.nr) { + + } + } else { + ret = bch2_trans_do(c, NULL, NULL, 0, + bch2_btree_node_update_key_get_iter(trans, b, &wbio->key, + BCH_WATERMARK_reclaim| + BTREE_INSERT_JOURNAL_RECLAIM| + BTREE_INSERT_NOFAIL| + BTREE_INSERT_NOCHECK_RW, + !wbio->wbio.failed.nr)); + if (ret) + goto err; + } +out: + bio_put(&wbio->wbio.bio); + btree_node_write_done(c, b); + return; +err: + set_btree_node_noevict(b); + if (!bch2_err_matches(ret, EROFS)) + bch2_fs_fatal_error(c, "fatal error writing btree node: %s", bch2_err_str(ret)); + goto out; +} + +static void btree_node_write_endio(struct bio *bio) +{ + struct bch_write_bio *wbio = to_wbio(bio); + struct bch_write_bio *parent = wbio->split ? wbio->parent : NULL; + struct bch_write_bio *orig = parent ?: wbio; + struct btree_write_bio *wb = container_of(orig, struct btree_write_bio, wbio); + struct bch_fs *c = wbio->c; + struct btree *b = wbio->bio.bi_private; + struct bch_dev *ca = bch_dev_bkey_exists(c, wbio->dev); + unsigned long flags; + + if (wbio->have_ioref) + bch2_latency_acct(ca, wbio->submit_time, WRITE); + + if (bch2_dev_io_err_on(bio->bi_status, ca, BCH_MEMBER_ERROR_write, + "btree write error: %s", + bch2_blk_status_to_str(bio->bi_status)) || + bch2_meta_write_fault("btree")) { + spin_lock_irqsave(&c->btree_write_error_lock, flags); + bch2_dev_list_add_dev(&orig->failed, wbio->dev); + spin_unlock_irqrestore(&c->btree_write_error_lock, flags); + } + + if (wbio->have_ioref) + percpu_ref_put(&ca->io_ref); + + if (parent) { + bio_put(bio); + bio_endio(&parent->bio); + return; + } + + clear_btree_node_write_in_flight_inner(b); + wake_up_bit(&b->flags, BTREE_NODE_write_in_flight_inner); + INIT_WORK(&wb->work, btree_node_write_work); + queue_work(c->btree_io_complete_wq, &wb->work); +} + +static int validate_bset_for_write(struct bch_fs *c, struct btree *b, + struct bset *i, unsigned sectors) +{ + struct printbuf buf = PRINTBUF; + bool saw_error; + int ret; + + ret = bch2_bkey_invalid(c, bkey_i_to_s_c(&b->key), + BKEY_TYPE_btree, WRITE, &buf); + + if (ret) + bch2_fs_inconsistent(c, "invalid btree node key before write: %s", buf.buf); + printbuf_exit(&buf); + if (ret) + return ret; + + ret = validate_bset_keys(c, b, i, WRITE, false, &saw_error) ?: + validate_bset(c, NULL, b, i, b->written, sectors, WRITE, false, &saw_error); + if (ret) { + bch2_inconsistent_error(c); + dump_stack(); + } + + return ret; +} + +static void btree_write_submit(struct work_struct *work) +{ + struct btree_write_bio *wbio = container_of(work, struct btree_write_bio, work); + struct bch_extent_ptr *ptr; + BKEY_PADDED_ONSTACK(k, BKEY_BTREE_PTR_VAL_U64s_MAX) tmp; + + bkey_copy(&tmp.k, &wbio->key); + + bkey_for_each_ptr(bch2_bkey_ptrs(bkey_i_to_s(&tmp.k)), ptr) + ptr->offset += wbio->sector_offset; + + bch2_submit_wbio_replicas(&wbio->wbio, wbio->wbio.c, BCH_DATA_btree, + &tmp.k, false); +} + +void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, unsigned flags) +{ + struct btree_write_bio *wbio; + struct bset_tree *t; + struct bset *i; + struct btree_node *bn = NULL; + struct btree_node_entry *bne = NULL; + struct sort_iter_stack sort_iter; + struct nonce nonce; + unsigned bytes_to_write, sectors_to_write, bytes, u64s; + u64 seq = 0; + bool used_mempool; + unsigned long old, new; + bool validate_before_checksum = false; + enum btree_write_type type = flags & BTREE_WRITE_TYPE_MASK; + void *data; + int ret; + + if (flags & BTREE_WRITE_ALREADY_STARTED) + goto do_write; + + /* + * We may only have a read lock on the btree node - the dirty bit is our + * "lock" against racing with other threads that may be trying to start + * a write, we do a write iff we clear the dirty bit. Since setting the + * dirty bit requires a write lock, we can't race with other threads + * redirtying it: + */ + do { + old = new = READ_ONCE(b->flags); + + if (!(old & (1 << BTREE_NODE_dirty))) + return; + + if ((flags & BTREE_WRITE_ONLY_IF_NEED) && + !(old & (1 << BTREE_NODE_need_write))) + return; + + if (old & + ((1 << BTREE_NODE_never_write)| + (1 << BTREE_NODE_write_blocked))) + return; + + if (b->written && + (old & (1 << BTREE_NODE_will_make_reachable))) + return; + + if (old & (1 << BTREE_NODE_write_in_flight)) + return; + + if (flags & BTREE_WRITE_ONLY_IF_NEED) + type = new & BTREE_WRITE_TYPE_MASK; + new &= ~BTREE_WRITE_TYPE_MASK; + + new &= ~(1 << BTREE_NODE_dirty); + new &= ~(1 << BTREE_NODE_need_write); + new |= (1 << BTREE_NODE_write_in_flight); + new |= (1 << BTREE_NODE_write_in_flight_inner); + new |= (1 << BTREE_NODE_just_written); + new ^= (1 << BTREE_NODE_write_idx); + } while (cmpxchg_acquire(&b->flags, old, new) != old); + + if (new & (1U << BTREE_NODE_need_write)) + return; +do_write: + BUG_ON((type == BTREE_WRITE_initial) != (b->written == 0)); + + atomic_dec(&c->btree_cache.dirty); + + BUG_ON(btree_node_fake(b)); + BUG_ON((b->will_make_reachable != 0) != !b->written); + + BUG_ON(b->written >= btree_sectors(c)); + BUG_ON(b->written & (block_sectors(c) - 1)); + BUG_ON(bset_written(b, btree_bset_last(b))); + BUG_ON(le64_to_cpu(b->data->magic) != bset_magic(c)); + BUG_ON(memcmp(&b->data->format, &b->format, sizeof(b->format))); + + bch2_sort_whiteouts(c, b); + + sort_iter_stack_init(&sort_iter, b); + + bytes = !b->written + ? sizeof(struct btree_node) + : sizeof(struct btree_node_entry); + + bytes += b->whiteout_u64s * sizeof(u64); + + for_each_bset(b, t) { + i = bset(b, t); + + if (bset_written(b, i)) + continue; + + bytes += le16_to_cpu(i->u64s) * sizeof(u64); + sort_iter_add(&sort_iter.iter, + btree_bkey_first(b, t), + btree_bkey_last(b, t)); + seq = max(seq, le64_to_cpu(i->journal_seq)); + } + + BUG_ON(b->written && !seq); + + /* bch2_varint_decode may read up to 7 bytes past the end of the buffer: */ + bytes += 8; + + /* buffer must be a multiple of the block size */ + bytes = round_up(bytes, block_bytes(c)); + + data = btree_bounce_alloc(c, bytes, &used_mempool); + + if (!b->written) { + bn = data; + *bn = *b->data; + i = &bn->keys; + } else { + bne = data; + bne->keys = b->data->keys; + i = &bne->keys; + } + + i->journal_seq = cpu_to_le64(seq); + i->u64s = 0; + + sort_iter_add(&sort_iter.iter, + unwritten_whiteouts_start(c, b), + unwritten_whiteouts_end(c, b)); + SET_BSET_SEPARATE_WHITEOUTS(i, false); + + b->whiteout_u64s = 0; + + u64s = bch2_sort_keys(i->start, &sort_iter.iter, false); + le16_add_cpu(&i->u64s, u64s); + + BUG_ON(!b->written && i->u64s != b->data->keys.u64s); + + set_needs_whiteout(i, false); + + /* do we have data to write? */ + if (b->written && !i->u64s) + goto nowrite; + + bytes_to_write = vstruct_end(i) - data; + sectors_to_write = round_up(bytes_to_write, block_bytes(c)) >> 9; + + if (!b->written && + b->key.k.type == KEY_TYPE_btree_ptr_v2) + BUG_ON(btree_ptr_sectors_written(&b->key) != sectors_to_write); + + memset(data + bytes_to_write, 0, + (sectors_to_write << 9) - bytes_to_write); + + BUG_ON(b->written + sectors_to_write > btree_sectors(c)); + BUG_ON(BSET_BIG_ENDIAN(i) != CPU_BIG_ENDIAN); + BUG_ON(i->seq != b->data->keys.seq); + + i->version = cpu_to_le16(c->sb.version); + SET_BSET_OFFSET(i, b->written); + SET_BSET_CSUM_TYPE(i, bch2_meta_checksum_type(c)); + + if (bch2_csum_type_is_encryption(BSET_CSUM_TYPE(i))) + validate_before_checksum = true; + + /* validate_bset will be modifying: */ + if (le16_to_cpu(i->version) < bcachefs_metadata_version_current) + validate_before_checksum = true; + + /* if we're going to be encrypting, check metadata validity first: */ + if (validate_before_checksum && + validate_bset_for_write(c, b, i, sectors_to_write)) + goto err; + + ret = bset_encrypt(c, i, b->written << 9); + if (bch2_fs_fatal_err_on(ret, c, + "error encrypting btree node: %i\n", ret)) + goto err; + + nonce = btree_nonce(i, b->written << 9); + + if (bn) + bn->csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, bn); + else + bne->csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, bne); + + /* if we're not encrypting, check metadata after checksumming: */ + if (!validate_before_checksum && + validate_bset_for_write(c, b, i, sectors_to_write)) + goto err; + + /* + * We handle btree write errors by immediately halting the journal - + * after we've done that, we can't issue any subsequent btree writes + * because they might have pointers to new nodes that failed to write. + * + * Furthermore, there's no point in doing any more btree writes because + * with the journal stopped, we're never going to update the journal to + * reflect that those writes were done and the data flushed from the + * journal: + * + * Also on journal error, the pending write may have updates that were + * never journalled (interior nodes, see btree_update_nodes_written()) - + * it's critical that we don't do the write in that case otherwise we + * will have updates visible that weren't in the journal: + * + * Make sure to update b->written so bch2_btree_init_next() doesn't + * break: + */ + if (bch2_journal_error(&c->journal) || + c->opts.nochanges) + goto err; + + trace_and_count(c, btree_node_write, b, bytes_to_write, sectors_to_write); + + wbio = container_of(bio_alloc_bioset(NULL, + buf_pages(data, sectors_to_write << 9), + REQ_OP_WRITE|REQ_META, + GFP_NOFS, + &c->btree_bio), + struct btree_write_bio, wbio.bio); + wbio_init(&wbio->wbio.bio); + wbio->data = data; + wbio->data_bytes = bytes; + wbio->sector_offset = b->written; + wbio->wbio.c = c; + wbio->wbio.used_mempool = used_mempool; + wbio->wbio.first_btree_write = !b->written; + wbio->wbio.bio.bi_end_io = btree_node_write_endio; + wbio->wbio.bio.bi_private = b; + + bch2_bio_map(&wbio->wbio.bio, data, sectors_to_write << 9); + + bkey_copy(&wbio->key, &b->key); + + b->written += sectors_to_write; + + if (wbio->key.k.type == KEY_TYPE_btree_ptr_v2) + bkey_i_to_btree_ptr_v2(&wbio->key)->v.sectors_written = + cpu_to_le16(b->written); + + atomic64_inc(&c->btree_write_stats[type].nr); + atomic64_add(bytes_to_write, &c->btree_write_stats[type].bytes); + + INIT_WORK(&wbio->work, btree_write_submit); + queue_work(c->io_complete_wq, &wbio->work); + return; +err: + set_btree_node_noevict(b); + b->written += sectors_to_write; +nowrite: + btree_bounce_free(c, bytes, used_mempool, data); + __btree_node_write_done(c, b); +} + +/* + * Work that must be done with write lock held: + */ +bool bch2_btree_post_write_cleanup(struct bch_fs *c, struct btree *b) +{ + bool invalidated_iter = false; + struct btree_node_entry *bne; + struct bset_tree *t; + + if (!btree_node_just_written(b)) + return false; + + BUG_ON(b->whiteout_u64s); + + clear_btree_node_just_written(b); + + /* + * Note: immediately after write, bset_written() doesn't work - the + * amount of data we had to write after compaction might have been + * smaller than the offset of the last bset. + * + * However, we know that all bsets have been written here, as long as + * we're still holding the write lock: + */ + + /* + * XXX: decide if we really want to unconditionally sort down to a + * single bset: + */ + if (b->nsets > 1) { + btree_node_sort(c, b, 0, b->nsets, true); + invalidated_iter = true; + } else { + invalidated_iter = bch2_drop_whiteouts(b, COMPACT_ALL); + } + + for_each_bset(b, t) + set_needs_whiteout(bset(b, t), true); + + bch2_btree_verify(c, b); + + /* + * If later we don't unconditionally sort down to a single bset, we have + * to ensure this is still true: + */ + BUG_ON((void *) btree_bkey_last(b, bset_tree_last(b)) > write_block(b)); + + bne = want_new_bset(c, b); + if (bne) + bch2_bset_init_next(c, b, bne); + + bch2_btree_build_aux_trees(b); + + return invalidated_iter; +} + +/* + * Use this one if the node is intent locked: + */ +void bch2_btree_node_write(struct bch_fs *c, struct btree *b, + enum six_lock_type lock_type_held, + unsigned flags) +{ + if (lock_type_held == SIX_LOCK_intent || + (lock_type_held == SIX_LOCK_read && + six_lock_tryupgrade(&b->c.lock))) { + __bch2_btree_node_write(c, b, flags); + + /* don't cycle lock unnecessarily: */ + if (btree_node_just_written(b) && + six_trylock_write(&b->c.lock)) { + bch2_btree_post_write_cleanup(c, b); + six_unlock_write(&b->c.lock); + } + + if (lock_type_held == SIX_LOCK_read) + six_lock_downgrade(&b->c.lock); + } else { + __bch2_btree_node_write(c, b, flags); + if (lock_type_held == SIX_LOCK_write && + btree_node_just_written(b)) + bch2_btree_post_write_cleanup(c, b); + } +} + +static bool __bch2_btree_flush_all(struct bch_fs *c, unsigned flag) +{ + struct bucket_table *tbl; + struct rhash_head *pos; + struct btree *b; + unsigned i; + bool ret = false; +restart: + rcu_read_lock(); + for_each_cached_btree(b, c, tbl, i, pos) + if (test_bit(flag, &b->flags)) { + rcu_read_unlock(); + wait_on_bit_io(&b->flags, flag, TASK_UNINTERRUPTIBLE); + ret = true; + goto restart; + } + rcu_read_unlock(); + + return ret; +} + +bool bch2_btree_flush_all_reads(struct bch_fs *c) +{ + return __bch2_btree_flush_all(c, BTREE_NODE_read_in_flight); +} + +bool bch2_btree_flush_all_writes(struct bch_fs *c) +{ + return __bch2_btree_flush_all(c, BTREE_NODE_write_in_flight); +} + +static const char * const bch2_btree_write_types[] = { +#define x(t, n) [n] = #t, + BCH_BTREE_WRITE_TYPES() + NULL +}; + +void bch2_btree_write_stats_to_text(struct printbuf *out, struct bch_fs *c) +{ + printbuf_tabstop_push(out, 20); + printbuf_tabstop_push(out, 10); + + prt_tab(out); + prt_str(out, "nr"); + prt_tab(out); + prt_str(out, "size"); + prt_newline(out); + + for (unsigned i = 0; i < BTREE_WRITE_TYPE_NR; i++) { + u64 nr = atomic64_read(&c->btree_write_stats[i].nr); + u64 bytes = atomic64_read(&c->btree_write_stats[i].bytes); + + prt_printf(out, "%s:", bch2_btree_write_types[i]); + prt_tab(out); + prt_u64(out, nr); + prt_tab(out); + prt_human_readable_u64(out, nr ? div64_u64(bytes, nr) : 0); + prt_newline(out); + } +} diff --git a/fs/bcachefs/btree_io.h b/fs/bcachefs/btree_io.h new file mode 100644 index 000000000000..e0d7fa5b1dfb --- /dev/null +++ b/fs/bcachefs/btree_io.h @@ -0,0 +1,225 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_BTREE_IO_H +#define _BCACHEFS_BTREE_IO_H + +#include "bkey_methods.h" +#include "bset.h" +#include "btree_locking.h" +#include "checksum.h" +#include "extents.h" +#include "io_write_types.h" + +struct bch_fs; +struct btree_write; +struct btree; +struct btree_iter; +struct btree_node_read_all; + +static inline void set_btree_node_dirty_acct(struct bch_fs *c, struct btree *b) +{ + if (!test_and_set_bit(BTREE_NODE_dirty, &b->flags)) + atomic_inc(&c->btree_cache.dirty); +} + +static inline void clear_btree_node_dirty_acct(struct bch_fs *c, struct btree *b) +{ + if (test_and_clear_bit(BTREE_NODE_dirty, &b->flags)) + atomic_dec(&c->btree_cache.dirty); +} + +static inline unsigned btree_ptr_sectors_written(struct bkey_i *k) +{ + return k->k.type == KEY_TYPE_btree_ptr_v2 + ? le16_to_cpu(bkey_i_to_btree_ptr_v2(k)->v.sectors_written) + : 0; +} + +struct btree_read_bio { + struct bch_fs *c; + struct btree *b; + struct btree_node_read_all *ra; + u64 start_time; + unsigned have_ioref:1; + unsigned idx:7; + struct extent_ptr_decoded pick; + struct work_struct work; + struct bio bio; +}; + +struct btree_write_bio { + struct work_struct work; + __BKEY_PADDED(key, BKEY_BTREE_PTR_VAL_U64s_MAX); + void *data; + unsigned data_bytes; + unsigned sector_offset; + struct bch_write_bio wbio; +}; + +void bch2_btree_node_io_unlock(struct btree *); +void bch2_btree_node_io_lock(struct btree *); +void __bch2_btree_node_wait_on_read(struct btree *); +void __bch2_btree_node_wait_on_write(struct btree *); +void bch2_btree_node_wait_on_read(struct btree *); +void bch2_btree_node_wait_on_write(struct btree *); + +enum compact_mode { + COMPACT_LAZY, + COMPACT_ALL, +}; + +bool bch2_compact_whiteouts(struct bch_fs *, struct btree *, + enum compact_mode); + +static inline bool should_compact_bset_lazy(struct btree *b, + struct bset_tree *t) +{ + unsigned total_u64s = bset_u64s(t); + unsigned dead_u64s = bset_dead_u64s(b, t); + + return dead_u64s > 64 && dead_u64s * 3 > total_u64s; +} + +static inline bool bch2_maybe_compact_whiteouts(struct bch_fs *c, struct btree *b) +{ + struct bset_tree *t; + + for_each_bset(b, t) + if (should_compact_bset_lazy(b, t)) + return bch2_compact_whiteouts(c, b, COMPACT_LAZY); + + return false; +} + +static inline struct nonce btree_nonce(struct bset *i, unsigned offset) +{ + return (struct nonce) {{ + [0] = cpu_to_le32(offset), + [1] = ((__le32 *) &i->seq)[0], + [2] = ((__le32 *) &i->seq)[1], + [3] = ((__le32 *) &i->journal_seq)[0]^BCH_NONCE_BTREE, + }}; +} + +static inline int bset_encrypt(struct bch_fs *c, struct bset *i, unsigned offset) +{ + struct nonce nonce = btree_nonce(i, offset); + int ret; + + if (!offset) { + struct btree_node *bn = container_of(i, struct btree_node, keys); + unsigned bytes = (void *) &bn->keys - (void *) &bn->flags; + + ret = bch2_encrypt(c, BSET_CSUM_TYPE(i), nonce, + &bn->flags, bytes); + if (ret) + return ret; + + nonce = nonce_add(nonce, round_up(bytes, CHACHA_BLOCK_SIZE)); + } + + return bch2_encrypt(c, BSET_CSUM_TYPE(i), nonce, i->_data, + vstruct_end(i) - (void *) i->_data); +} + +void bch2_btree_sort_into(struct bch_fs *, struct btree *, struct btree *); + +void bch2_btree_node_drop_keys_outside_node(struct btree *); + +void bch2_btree_build_aux_trees(struct btree *); +void bch2_btree_init_next(struct btree_trans *, struct btree *); + +int bch2_btree_node_read_done(struct bch_fs *, struct bch_dev *, + struct btree *, bool, bool *); +void bch2_btree_node_read(struct bch_fs *, struct btree *, bool); +int bch2_btree_root_read(struct bch_fs *, enum btree_id, + const struct bkey_i *, unsigned); + +bool bch2_btree_post_write_cleanup(struct bch_fs *, struct btree *); + +enum btree_write_flags { + __BTREE_WRITE_ONLY_IF_NEED = BTREE_WRITE_TYPE_BITS, + __BTREE_WRITE_ALREADY_STARTED, +}; +#define BTREE_WRITE_ONLY_IF_NEED BIT(__BTREE_WRITE_ONLY_IF_NEED) +#define BTREE_WRITE_ALREADY_STARTED BIT(__BTREE_WRITE_ALREADY_STARTED) + +void __bch2_btree_node_write(struct bch_fs *, struct btree *, unsigned); +void bch2_btree_node_write(struct bch_fs *, struct btree *, + enum six_lock_type, unsigned); + +static inline void btree_node_write_if_need(struct bch_fs *c, struct btree *b, + enum six_lock_type lock_held) +{ + bch2_btree_node_write(c, b, lock_held, BTREE_WRITE_ONLY_IF_NEED); +} + +bool bch2_btree_flush_all_reads(struct bch_fs *); +bool bch2_btree_flush_all_writes(struct bch_fs *); + +static inline void compat_bformat(unsigned level, enum btree_id btree_id, + unsigned version, unsigned big_endian, + int write, struct bkey_format *f) +{ + if (version < bcachefs_metadata_version_inode_btree_change && + btree_id == BTREE_ID_inodes) { + swap(f->bits_per_field[BKEY_FIELD_INODE], + f->bits_per_field[BKEY_FIELD_OFFSET]); + swap(f->field_offset[BKEY_FIELD_INODE], + f->field_offset[BKEY_FIELD_OFFSET]); + } + + if (version < bcachefs_metadata_version_snapshot && + (level || btree_type_has_snapshots(btree_id))) { + u64 max_packed = + ~(~0ULL << f->bits_per_field[BKEY_FIELD_SNAPSHOT]); + + f->field_offset[BKEY_FIELD_SNAPSHOT] = write + ? 0 + : cpu_to_le64(U32_MAX - max_packed); + } +} + +static inline void compat_bpos(unsigned level, enum btree_id btree_id, + unsigned version, unsigned big_endian, + int write, struct bpos *p) +{ + if (big_endian != CPU_BIG_ENDIAN) + bch2_bpos_swab(p); + + if (version < bcachefs_metadata_version_inode_btree_change && + btree_id == BTREE_ID_inodes) + swap(p->inode, p->offset); +} + +static inline void compat_btree_node(unsigned level, enum btree_id btree_id, + unsigned version, unsigned big_endian, + int write, + struct btree_node *bn) +{ + if (version < bcachefs_metadata_version_inode_btree_change && + btree_id_is_extents(btree_id) && + !bpos_eq(bn->min_key, POS_MIN) && + write) + bn->min_key = bpos_nosnap_predecessor(bn->min_key); + + if (version < bcachefs_metadata_version_snapshot && + write) + bn->max_key.snapshot = 0; + + compat_bpos(level, btree_id, version, big_endian, write, &bn->min_key); + compat_bpos(level, btree_id, version, big_endian, write, &bn->max_key); + + if (version < bcachefs_metadata_version_snapshot && + !write) + bn->max_key.snapshot = U32_MAX; + + if (version < bcachefs_metadata_version_inode_btree_change && + btree_id_is_extents(btree_id) && + !bpos_eq(bn->min_key, POS_MIN) && + !write) + bn->min_key = bpos_nosnap_successor(bn->min_key); +} + +void bch2_btree_write_stats_to_text(struct printbuf *, struct bch_fs *); + +#endif /* _BCACHEFS_BTREE_IO_H */ diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c new file mode 100644 index 000000000000..6be79129738d --- /dev/null +++ b/fs/bcachefs/btree_iter.c @@ -0,0 +1,3248 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" +#include "bkey_methods.h" +#include "bkey_buf.h" +#include "btree_cache.h" +#include "btree_iter.h" +#include "btree_journal_iter.h" +#include "btree_key_cache.h" +#include "btree_locking.h" +#include "btree_update.h" +#include "debug.h" +#include "error.h" +#include "extents.h" +#include "journal.h" +#include "replicas.h" +#include "snapshot.h" +#include "trace.h" + +#include <linux/random.h> +#include <linux/prefetch.h> + +static inline void btree_path_list_remove(struct btree_trans *, struct btree_path *); +static inline void btree_path_list_add(struct btree_trans *, struct btree_path *, + struct btree_path *); + +static inline unsigned long btree_iter_ip_allocated(struct btree_iter *iter) +{ +#ifdef TRACK_PATH_ALLOCATED + return iter->ip_allocated; +#else + return 0; +#endif +} + +static struct btree_path *btree_path_alloc(struct btree_trans *, struct btree_path *); + +static inline int __btree_path_cmp(const struct btree_path *l, + enum btree_id r_btree_id, + bool r_cached, + struct bpos r_pos, + unsigned r_level) +{ + /* + * Must match lock ordering as defined by __bch2_btree_node_lock: + */ + return cmp_int(l->btree_id, r_btree_id) ?: + cmp_int((int) l->cached, (int) r_cached) ?: + bpos_cmp(l->pos, r_pos) ?: + -cmp_int(l->level, r_level); +} + +static inline int btree_path_cmp(const struct btree_path *l, + const struct btree_path *r) +{ + return __btree_path_cmp(l, r->btree_id, r->cached, r->pos, r->level); +} + +static inline struct bpos bkey_successor(struct btree_iter *iter, struct bpos p) +{ + /* Are we iterating over keys in all snapshots? */ + if (iter->flags & BTREE_ITER_ALL_SNAPSHOTS) { + p = bpos_successor(p); + } else { + p = bpos_nosnap_successor(p); + p.snapshot = iter->snapshot; + } + + return p; +} + +static inline struct bpos bkey_predecessor(struct btree_iter *iter, struct bpos p) +{ + /* Are we iterating over keys in all snapshots? */ + if (iter->flags & BTREE_ITER_ALL_SNAPSHOTS) { + p = bpos_predecessor(p); + } else { + p = bpos_nosnap_predecessor(p); + p.snapshot = iter->snapshot; + } + + return p; +} + +static inline struct bpos btree_iter_search_key(struct btree_iter *iter) +{ + struct bpos pos = iter->pos; + + if ((iter->flags & BTREE_ITER_IS_EXTENTS) && + !bkey_eq(pos, POS_MAX)) + pos = bkey_successor(iter, pos); + return pos; +} + +static inline bool btree_path_pos_before_node(struct btree_path *path, + struct btree *b) +{ + return bpos_lt(path->pos, b->data->min_key); +} + +static inline bool btree_path_pos_after_node(struct btree_path *path, + struct btree *b) +{ + return bpos_gt(path->pos, b->key.k.p); +} + +static inline bool btree_path_pos_in_node(struct btree_path *path, + struct btree *b) +{ + return path->btree_id == b->c.btree_id && + !btree_path_pos_before_node(path, b) && + !btree_path_pos_after_node(path, b); +} + +/* Btree iterator: */ + +#ifdef CONFIG_BCACHEFS_DEBUG + +static void bch2_btree_path_verify_cached(struct btree_trans *trans, + struct btree_path *path) +{ + struct bkey_cached *ck; + bool locked = btree_node_locked(path, 0); + + if (!bch2_btree_node_relock(trans, path, 0)) + return; + + ck = (void *) path->l[0].b; + BUG_ON(ck->key.btree_id != path->btree_id || + !bkey_eq(ck->key.pos, path->pos)); + + if (!locked) + btree_node_unlock(trans, path, 0); +} + +static void bch2_btree_path_verify_level(struct btree_trans *trans, + struct btree_path *path, unsigned level) +{ + struct btree_path_level *l; + struct btree_node_iter tmp; + bool locked; + struct bkey_packed *p, *k; + struct printbuf buf1 = PRINTBUF; + struct printbuf buf2 = PRINTBUF; + struct printbuf buf3 = PRINTBUF; + const char *msg; + + if (!bch2_debug_check_iterators) + return; + + l = &path->l[level]; + tmp = l->iter; + locked = btree_node_locked(path, level); + + if (path->cached) { + if (!level) + bch2_btree_path_verify_cached(trans, path); + return; + } + + if (!btree_path_node(path, level)) + return; + + if (!bch2_btree_node_relock_notrace(trans, path, level)) + return; + + BUG_ON(!btree_path_pos_in_node(path, l->b)); + + bch2_btree_node_iter_verify(&l->iter, l->b); + + /* + * For interior nodes, the iterator will have skipped past deleted keys: + */ + p = level + ? bch2_btree_node_iter_prev(&tmp, l->b) + : bch2_btree_node_iter_prev_all(&tmp, l->b); + k = bch2_btree_node_iter_peek_all(&l->iter, l->b); + + if (p && bkey_iter_pos_cmp(l->b, p, &path->pos) >= 0) { + msg = "before"; + goto err; + } + + if (k && bkey_iter_pos_cmp(l->b, k, &path->pos) < 0) { + msg = "after"; + goto err; + } + + if (!locked) + btree_node_unlock(trans, path, level); + return; +err: + bch2_bpos_to_text(&buf1, path->pos); + + if (p) { + struct bkey uk = bkey_unpack_key(l->b, p); + + bch2_bkey_to_text(&buf2, &uk); + } else { + prt_printf(&buf2, "(none)"); + } + + if (k) { + struct bkey uk = bkey_unpack_key(l->b, k); + + bch2_bkey_to_text(&buf3, &uk); + } else { + prt_printf(&buf3, "(none)"); + } + + panic("path should be %s key at level %u:\n" + "path pos %s\n" + "prev key %s\n" + "cur key %s\n", + msg, level, buf1.buf, buf2.buf, buf3.buf); +} + +static void bch2_btree_path_verify(struct btree_trans *trans, + struct btree_path *path) +{ + struct bch_fs *c = trans->c; + unsigned i; + + EBUG_ON(path->btree_id >= BTREE_ID_NR); + + for (i = 0; i < (!path->cached ? BTREE_MAX_DEPTH : 1); i++) { + if (!path->l[i].b) { + BUG_ON(!path->cached && + bch2_btree_id_root(c, path->btree_id)->b->c.level > i); + break; + } + + bch2_btree_path_verify_level(trans, path, i); + } + + bch2_btree_path_verify_locks(path); +} + +void bch2_trans_verify_paths(struct btree_trans *trans) +{ + struct btree_path *path; + + trans_for_each_path(trans, path) + bch2_btree_path_verify(trans, path); +} + +static void bch2_btree_iter_verify(struct btree_iter *iter) +{ + struct btree_trans *trans = iter->trans; + + BUG_ON(iter->btree_id >= BTREE_ID_NR); + + BUG_ON(!!(iter->flags & BTREE_ITER_CACHED) != iter->path->cached); + + BUG_ON((iter->flags & BTREE_ITER_IS_EXTENTS) && + (iter->flags & BTREE_ITER_ALL_SNAPSHOTS)); + + BUG_ON(!(iter->flags & __BTREE_ITER_ALL_SNAPSHOTS) && + (iter->flags & BTREE_ITER_ALL_SNAPSHOTS) && + !btree_type_has_snapshot_field(iter->btree_id)); + + if (iter->update_path) + bch2_btree_path_verify(trans, iter->update_path); + bch2_btree_path_verify(trans, iter->path); +} + +static void bch2_btree_iter_verify_entry_exit(struct btree_iter *iter) +{ + BUG_ON((iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) && + !iter->pos.snapshot); + + BUG_ON(!(iter->flags & BTREE_ITER_ALL_SNAPSHOTS) && + iter->pos.snapshot != iter->snapshot); + + BUG_ON(bkey_lt(iter->pos, bkey_start_pos(&iter->k)) || + bkey_gt(iter->pos, iter->k.p)); +} + +static int bch2_btree_iter_verify_ret(struct btree_iter *iter, struct bkey_s_c k) +{ + struct btree_trans *trans = iter->trans; + struct btree_iter copy; + struct bkey_s_c prev; + int ret = 0; + + if (!bch2_debug_check_iterators) + return 0; + + if (!(iter->flags & BTREE_ITER_FILTER_SNAPSHOTS)) + return 0; + + if (bkey_err(k) || !k.k) + return 0; + + BUG_ON(!bch2_snapshot_is_ancestor(trans->c, + iter->snapshot, + k.k->p.snapshot)); + + bch2_trans_iter_init(trans, ©, iter->btree_id, iter->pos, + BTREE_ITER_NOPRESERVE| + BTREE_ITER_ALL_SNAPSHOTS); + prev = bch2_btree_iter_prev(©); + if (!prev.k) + goto out; + + ret = bkey_err(prev); + if (ret) + goto out; + + if (bkey_eq(prev.k->p, k.k->p) && + bch2_snapshot_is_ancestor(trans->c, iter->snapshot, + prev.k->p.snapshot) > 0) { + struct printbuf buf1 = PRINTBUF, buf2 = PRINTBUF; + + bch2_bkey_to_text(&buf1, k.k); + bch2_bkey_to_text(&buf2, prev.k); + + panic("iter snap %u\n" + "k %s\n" + "prev %s\n", + iter->snapshot, + buf1.buf, buf2.buf); + } +out: + bch2_trans_iter_exit(trans, ©); + return ret; +} + +void bch2_assert_pos_locked(struct btree_trans *trans, enum btree_id id, + struct bpos pos, bool key_cache) +{ + struct btree_path *path; + unsigned idx; + struct printbuf buf = PRINTBUF; + + btree_trans_sort_paths(trans); + + trans_for_each_path_inorder(trans, path, idx) { + int cmp = cmp_int(path->btree_id, id) ?: + cmp_int(path->cached, key_cache); + + if (cmp > 0) + break; + if (cmp < 0) + continue; + + if (!btree_node_locked(path, 0) || + !path->should_be_locked) + continue; + + if (!key_cache) { + if (bkey_ge(pos, path->l[0].b->data->min_key) && + bkey_le(pos, path->l[0].b->key.k.p)) + return; + } else { + if (bkey_eq(pos, path->pos)) + return; + } + } + + bch2_dump_trans_paths_updates(trans); + bch2_bpos_to_text(&buf, pos); + + panic("not locked: %s %s%s\n", + bch2_btree_id_str(id), buf.buf, + key_cache ? " cached" : ""); +} + +#else + +static inline void bch2_btree_path_verify_level(struct btree_trans *trans, + struct btree_path *path, unsigned l) {} +static inline void bch2_btree_path_verify(struct btree_trans *trans, + struct btree_path *path) {} +static inline void bch2_btree_iter_verify(struct btree_iter *iter) {} +static inline void bch2_btree_iter_verify_entry_exit(struct btree_iter *iter) {} +static inline int bch2_btree_iter_verify_ret(struct btree_iter *iter, struct bkey_s_c k) { return 0; } + +#endif + +/* Btree path: fixups after btree updates */ + +static void btree_node_iter_set_set_pos(struct btree_node_iter *iter, + struct btree *b, + struct bset_tree *t, + struct bkey_packed *k) +{ + struct btree_node_iter_set *set; + + btree_node_iter_for_each(iter, set) + if (set->end == t->end_offset) { + set->k = __btree_node_key_to_offset(b, k); + bch2_btree_node_iter_sort(iter, b); + return; + } + + bch2_btree_node_iter_push(iter, b, k, btree_bkey_last(b, t)); +} + +static void __bch2_btree_path_fix_key_modified(struct btree_path *path, + struct btree *b, + struct bkey_packed *where) +{ + struct btree_path_level *l = &path->l[b->c.level]; + + if (where != bch2_btree_node_iter_peek_all(&l->iter, l->b)) + return; + + if (bkey_iter_pos_cmp(l->b, where, &path->pos) < 0) + bch2_btree_node_iter_advance(&l->iter, l->b); +} + +void bch2_btree_path_fix_key_modified(struct btree_trans *trans, + struct btree *b, + struct bkey_packed *where) +{ + struct btree_path *path; + + trans_for_each_path_with_node(trans, b, path) { + __bch2_btree_path_fix_key_modified(path, b, where); + bch2_btree_path_verify_level(trans, path, b->c.level); + } +} + +static void __bch2_btree_node_iter_fix(struct btree_path *path, + struct btree *b, + struct btree_node_iter *node_iter, + struct bset_tree *t, + struct bkey_packed *where, + unsigned clobber_u64s, + unsigned new_u64s) +{ + const struct bkey_packed *end = btree_bkey_last(b, t); + struct btree_node_iter_set *set; + unsigned offset = __btree_node_key_to_offset(b, where); + int shift = new_u64s - clobber_u64s; + unsigned old_end = t->end_offset - shift; + unsigned orig_iter_pos = node_iter->data[0].k; + bool iter_current_key_modified = + orig_iter_pos >= offset && + orig_iter_pos <= offset + clobber_u64s; + + btree_node_iter_for_each(node_iter, set) + if (set->end == old_end) + goto found; + + /* didn't find the bset in the iterator - might have to readd it: */ + if (new_u64s && + bkey_iter_pos_cmp(b, where, &path->pos) >= 0) { + bch2_btree_node_iter_push(node_iter, b, where, end); + goto fixup_done; + } else { + /* Iterator is after key that changed */ + return; + } +found: + set->end = t->end_offset; + + /* Iterator hasn't gotten to the key that changed yet: */ + if (set->k < offset) + return; + + if (new_u64s && + bkey_iter_pos_cmp(b, where, &path->pos) >= 0) { + set->k = offset; + } else if (set->k < offset + clobber_u64s) { + set->k = offset + new_u64s; + if (set->k == set->end) + bch2_btree_node_iter_set_drop(node_iter, set); + } else { + /* Iterator is after key that changed */ + set->k = (int) set->k + shift; + return; + } + + bch2_btree_node_iter_sort(node_iter, b); +fixup_done: + if (node_iter->data[0].k != orig_iter_pos) + iter_current_key_modified = true; + + /* + * When a new key is added, and the node iterator now points to that + * key, the iterator might have skipped past deleted keys that should + * come after the key the iterator now points to. We have to rewind to + * before those deleted keys - otherwise + * bch2_btree_node_iter_prev_all() breaks: + */ + if (!bch2_btree_node_iter_end(node_iter) && + iter_current_key_modified && + b->c.level) { + struct bkey_packed *k, *k2, *p; + + k = bch2_btree_node_iter_peek_all(node_iter, b); + + for_each_bset(b, t) { + bool set_pos = false; + + if (node_iter->data[0].end == t->end_offset) + continue; + + k2 = bch2_btree_node_iter_bset_pos(node_iter, b, t); + + while ((p = bch2_bkey_prev_all(b, t, k2)) && + bkey_iter_cmp(b, k, p) < 0) { + k2 = p; + set_pos = true; + } + + if (set_pos) + btree_node_iter_set_set_pos(node_iter, + b, t, k2); + } + } +} + +void bch2_btree_node_iter_fix(struct btree_trans *trans, + struct btree_path *path, + struct btree *b, + struct btree_node_iter *node_iter, + struct bkey_packed *where, + unsigned clobber_u64s, + unsigned new_u64s) +{ + struct bset_tree *t = bch2_bkey_to_bset_inlined(b, where); + struct btree_path *linked; + + if (node_iter != &path->l[b->c.level].iter) { + __bch2_btree_node_iter_fix(path, b, node_iter, t, + where, clobber_u64s, new_u64s); + + if (bch2_debug_check_iterators) + bch2_btree_node_iter_verify(node_iter, b); + } + + trans_for_each_path_with_node(trans, b, linked) { + __bch2_btree_node_iter_fix(linked, b, + &linked->l[b->c.level].iter, t, + where, clobber_u64s, new_u64s); + bch2_btree_path_verify_level(trans, linked, b->c.level); + } +} + +/* Btree path level: pointer to a particular btree node and node iter */ + +static inline struct bkey_s_c __btree_iter_unpack(struct bch_fs *c, + struct btree_path_level *l, + struct bkey *u, + struct bkey_packed *k) +{ + if (unlikely(!k)) { + /* + * signal to bch2_btree_iter_peek_slot() that we're currently at + * a hole + */ + u->type = KEY_TYPE_deleted; + return bkey_s_c_null; + } + + return bkey_disassemble(l->b, k, u); +} + +static inline struct bkey_s_c btree_path_level_peek_all(struct bch_fs *c, + struct btree_path_level *l, + struct bkey *u) +{ + return __btree_iter_unpack(c, l, u, + bch2_btree_node_iter_peek_all(&l->iter, l->b)); +} + +static inline struct bkey_s_c btree_path_level_peek(struct btree_trans *trans, + struct btree_path *path, + struct btree_path_level *l, + struct bkey *u) +{ + struct bkey_s_c k = __btree_iter_unpack(trans->c, l, u, + bch2_btree_node_iter_peek(&l->iter, l->b)); + + path->pos = k.k ? k.k->p : l->b->key.k.p; + trans->paths_sorted = false; + bch2_btree_path_verify_level(trans, path, l - path->l); + return k; +} + +static inline struct bkey_s_c btree_path_level_prev(struct btree_trans *trans, + struct btree_path *path, + struct btree_path_level *l, + struct bkey *u) +{ + struct bkey_s_c k = __btree_iter_unpack(trans->c, l, u, + bch2_btree_node_iter_prev(&l->iter, l->b)); + + path->pos = k.k ? k.k->p : l->b->data->min_key; + trans->paths_sorted = false; + bch2_btree_path_verify_level(trans, path, l - path->l); + return k; +} + +static inline bool btree_path_advance_to_pos(struct btree_path *path, + struct btree_path_level *l, + int max_advance) +{ + struct bkey_packed *k; + int nr_advanced = 0; + + while ((k = bch2_btree_node_iter_peek_all(&l->iter, l->b)) && + bkey_iter_pos_cmp(l->b, k, &path->pos) < 0) { + if (max_advance > 0 && nr_advanced >= max_advance) + return false; + + bch2_btree_node_iter_advance(&l->iter, l->b); + nr_advanced++; + } + + return true; +} + +static inline void __btree_path_level_init(struct btree_path *path, + unsigned level) +{ + struct btree_path_level *l = &path->l[level]; + + bch2_btree_node_iter_init(&l->iter, l->b, &path->pos); + + /* + * Iterators to interior nodes should always be pointed at the first non + * whiteout: + */ + if (level) + bch2_btree_node_iter_peek(&l->iter, l->b); +} + +void bch2_btree_path_level_init(struct btree_trans *trans, + struct btree_path *path, + struct btree *b) +{ + BUG_ON(path->cached); + + EBUG_ON(!btree_path_pos_in_node(path, b)); + + path->l[b->c.level].lock_seq = six_lock_seq(&b->c.lock); + path->l[b->c.level].b = b; + __btree_path_level_init(path, b->c.level); +} + +/* Btree path: fixups after btree node updates: */ + +static void bch2_trans_revalidate_updates_in_node(struct btree_trans *trans, struct btree *b) +{ + struct bch_fs *c = trans->c; + struct btree_insert_entry *i; + + trans_for_each_update(trans, i) + if (!i->cached && + i->level == b->c.level && + i->btree_id == b->c.btree_id && + bpos_cmp(i->k->k.p, b->data->min_key) >= 0 && + bpos_cmp(i->k->k.p, b->data->max_key) <= 0) { + i->old_v = bch2_btree_path_peek_slot(i->path, &i->old_k).v; + + if (unlikely(trans->journal_replay_not_finished)) { + struct bkey_i *j_k = + bch2_journal_keys_peek_slot(c, i->btree_id, i->level, + i->k->k.p); + + if (j_k) { + i->old_k = j_k->k; + i->old_v = &j_k->v; + } + } + } +} + +/* + * A btree node is being replaced - update the iterator to point to the new + * node: + */ +void bch2_trans_node_add(struct btree_trans *trans, struct btree *b) +{ + struct btree_path *path; + + trans_for_each_path(trans, path) + if (path->uptodate == BTREE_ITER_UPTODATE && + !path->cached && + btree_path_pos_in_node(path, b)) { + enum btree_node_locked_type t = + btree_lock_want(path, b->c.level); + + if (t != BTREE_NODE_UNLOCKED) { + btree_node_unlock(trans, path, b->c.level); + six_lock_increment(&b->c.lock, (enum six_lock_type) t); + mark_btree_node_locked(trans, path, b->c.level, t); + } + + bch2_btree_path_level_init(trans, path, b); + } + + bch2_trans_revalidate_updates_in_node(trans, b); +} + +/* + * A btree node has been modified in such a way as to invalidate iterators - fix + * them: + */ +void bch2_trans_node_reinit_iter(struct btree_trans *trans, struct btree *b) +{ + struct btree_path *path; + + trans_for_each_path_with_node(trans, b, path) + __btree_path_level_init(path, b->c.level); + + bch2_trans_revalidate_updates_in_node(trans, b); +} + +/* Btree path: traverse, set_pos: */ + +static inline int btree_path_lock_root(struct btree_trans *trans, + struct btree_path *path, + unsigned depth_want, + unsigned long trace_ip) +{ + struct bch_fs *c = trans->c; + struct btree *b, **rootp = &bch2_btree_id_root(c, path->btree_id)->b; + enum six_lock_type lock_type; + unsigned i; + int ret; + + EBUG_ON(path->nodes_locked); + + while (1) { + b = READ_ONCE(*rootp); + path->level = READ_ONCE(b->c.level); + + if (unlikely(path->level < depth_want)) { + /* + * the root is at a lower depth than the depth we want: + * got to the end of the btree, or we're walking nodes + * greater than some depth and there are no nodes >= + * that depth + */ + path->level = depth_want; + for (i = path->level; i < BTREE_MAX_DEPTH; i++) + path->l[i].b = NULL; + return 1; + } + + lock_type = __btree_lock_want(path, path->level); + ret = btree_node_lock(trans, path, &b->c, + path->level, lock_type, trace_ip); + if (unlikely(ret)) { + if (bch2_err_matches(ret, BCH_ERR_lock_fail_root_changed)) + continue; + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + return ret; + BUG(); + } + + if (likely(b == READ_ONCE(*rootp) && + b->c.level == path->level && + !race_fault())) { + for (i = 0; i < path->level; i++) + path->l[i].b = ERR_PTR(-BCH_ERR_no_btree_node_lock_root); + path->l[path->level].b = b; + for (i = path->level + 1; i < BTREE_MAX_DEPTH; i++) + path->l[i].b = NULL; + + mark_btree_node_locked(trans, path, path->level, + (enum btree_node_locked_type) lock_type); + bch2_btree_path_level_init(trans, path, b); + return 0; + } + + six_unlock_type(&b->c.lock, lock_type); + } +} + +noinline +static int btree_path_prefetch(struct btree_trans *trans, struct btree_path *path) +{ + struct bch_fs *c = trans->c; + struct btree_path_level *l = path_l(path); + struct btree_node_iter node_iter = l->iter; + struct bkey_packed *k; + struct bkey_buf tmp; + unsigned nr = test_bit(BCH_FS_STARTED, &c->flags) + ? (path->level > 1 ? 0 : 2) + : (path->level > 1 ? 1 : 16); + bool was_locked = btree_node_locked(path, path->level); + int ret = 0; + + bch2_bkey_buf_init(&tmp); + + while (nr-- && !ret) { + if (!bch2_btree_node_relock(trans, path, path->level)) + break; + + bch2_btree_node_iter_advance(&node_iter, l->b); + k = bch2_btree_node_iter_peek(&node_iter, l->b); + if (!k) + break; + + bch2_bkey_buf_unpack(&tmp, c, l->b, k); + ret = bch2_btree_node_prefetch(trans, path, tmp.k, path->btree_id, + path->level - 1); + } + + if (!was_locked) + btree_node_unlock(trans, path, path->level); + + bch2_bkey_buf_exit(&tmp, c); + return ret; +} + +static int btree_path_prefetch_j(struct btree_trans *trans, struct btree_path *path, + struct btree_and_journal_iter *jiter) +{ + struct bch_fs *c = trans->c; + struct bkey_s_c k; + struct bkey_buf tmp; + unsigned nr = test_bit(BCH_FS_STARTED, &c->flags) + ? (path->level > 1 ? 0 : 2) + : (path->level > 1 ? 1 : 16); + bool was_locked = btree_node_locked(path, path->level); + int ret = 0; + + bch2_bkey_buf_init(&tmp); + + while (nr-- && !ret) { + if (!bch2_btree_node_relock(trans, path, path->level)) + break; + + bch2_btree_and_journal_iter_advance(jiter); + k = bch2_btree_and_journal_iter_peek(jiter); + if (!k.k) + break; + + bch2_bkey_buf_reassemble(&tmp, c, k); + ret = bch2_btree_node_prefetch(trans, path, tmp.k, path->btree_id, + path->level - 1); + } + + if (!was_locked) + btree_node_unlock(trans, path, path->level); + + bch2_bkey_buf_exit(&tmp, c); + return ret; +} + +static noinline void btree_node_mem_ptr_set(struct btree_trans *trans, + struct btree_path *path, + unsigned plevel, struct btree *b) +{ + struct btree_path_level *l = &path->l[plevel]; + bool locked = btree_node_locked(path, plevel); + struct bkey_packed *k; + struct bch_btree_ptr_v2 *bp; + + if (!bch2_btree_node_relock(trans, path, plevel)) + return; + + k = bch2_btree_node_iter_peek_all(&l->iter, l->b); + BUG_ON(k->type != KEY_TYPE_btree_ptr_v2); + + bp = (void *) bkeyp_val(&l->b->format, k); + bp->mem_ptr = (unsigned long)b; + + if (!locked) + btree_node_unlock(trans, path, plevel); +} + +static noinline int btree_node_iter_and_journal_peek(struct btree_trans *trans, + struct btree_path *path, + unsigned flags, + struct bkey_buf *out) +{ + struct bch_fs *c = trans->c; + struct btree_path_level *l = path_l(path); + struct btree_and_journal_iter jiter; + struct bkey_s_c k; + int ret = 0; + + __bch2_btree_and_journal_iter_init_node_iter(&jiter, c, l->b, l->iter, path->pos); + + k = bch2_btree_and_journal_iter_peek(&jiter); + + bch2_bkey_buf_reassemble(out, c, k); + + if (flags & BTREE_ITER_PREFETCH) + ret = btree_path_prefetch_j(trans, path, &jiter); + + bch2_btree_and_journal_iter_exit(&jiter); + return ret; +} + +static __always_inline int btree_path_down(struct btree_trans *trans, + struct btree_path *path, + unsigned flags, + unsigned long trace_ip) +{ + struct bch_fs *c = trans->c; + struct btree_path_level *l = path_l(path); + struct btree *b; + unsigned level = path->level - 1; + enum six_lock_type lock_type = __btree_lock_want(path, level); + struct bkey_buf tmp; + int ret; + + EBUG_ON(!btree_node_locked(path, path->level)); + + bch2_bkey_buf_init(&tmp); + + if (unlikely(trans->journal_replay_not_finished)) { + ret = btree_node_iter_and_journal_peek(trans, path, flags, &tmp); + if (ret) + goto err; + } else { + bch2_bkey_buf_unpack(&tmp, c, l->b, + bch2_btree_node_iter_peek(&l->iter, l->b)); + + if (flags & BTREE_ITER_PREFETCH) { + ret = btree_path_prefetch(trans, path); + if (ret) + goto err; + } + } + + b = bch2_btree_node_get(trans, path, tmp.k, level, lock_type, trace_ip); + ret = PTR_ERR_OR_ZERO(b); + if (unlikely(ret)) + goto err; + + if (likely(!trans->journal_replay_not_finished && + tmp.k->k.type == KEY_TYPE_btree_ptr_v2) && + unlikely(b != btree_node_mem_ptr(tmp.k))) + btree_node_mem_ptr_set(trans, path, level + 1, b); + + if (btree_node_read_locked(path, level + 1)) + btree_node_unlock(trans, path, level + 1); + + mark_btree_node_locked(trans, path, level, + (enum btree_node_locked_type) lock_type); + path->level = level; + bch2_btree_path_level_init(trans, path, b); + + bch2_btree_path_verify_locks(path); +err: + bch2_bkey_buf_exit(&tmp, c); + return ret; +} + + +static int bch2_btree_path_traverse_all(struct btree_trans *trans) +{ + struct bch_fs *c = trans->c; + struct btree_path *path; + unsigned long trace_ip = _RET_IP_; + int i, ret = 0; + + if (trans->in_traverse_all) + return -BCH_ERR_transaction_restart_in_traverse_all; + + trans->in_traverse_all = true; +retry_all: + trans->restarted = 0; + trans->last_restarted_ip = 0; + + trans_for_each_path(trans, path) + path->should_be_locked = false; + + btree_trans_sort_paths(trans); + + bch2_trans_unlock(trans); + cond_resched(); + + if (unlikely(trans->memory_allocation_failure)) { + struct closure cl; + + closure_init_stack(&cl); + + do { + ret = bch2_btree_cache_cannibalize_lock(c, &cl); + closure_sync(&cl); + } while (ret); + } + + /* Now, redo traversals in correct order: */ + i = 0; + while (i < trans->nr_sorted) { + path = trans->paths + trans->sorted[i]; + + /* + * Traversing a path can cause another path to be added at about + * the same position: + */ + if (path->uptodate) { + __btree_path_get(path, false); + ret = bch2_btree_path_traverse_one(trans, path, 0, _THIS_IP_); + __btree_path_put(path, false); + + if (bch2_err_matches(ret, BCH_ERR_transaction_restart) || + bch2_err_matches(ret, ENOMEM)) + goto retry_all; + if (ret) + goto err; + } else { + i++; + } + } + + /* + * We used to assert that all paths had been traversed here + * (path->uptodate < BTREE_ITER_NEED_TRAVERSE); however, since + * path->should_be_locked is not set yet, we might have unlocked and + * then failed to relock a path - that's fine. + */ +err: + bch2_btree_cache_cannibalize_unlock(c); + + trans->in_traverse_all = false; + + trace_and_count(c, trans_traverse_all, trans, trace_ip); + return ret; +} + +static inline bool btree_path_check_pos_in_node(struct btree_path *path, + unsigned l, int check_pos) +{ + if (check_pos < 0 && btree_path_pos_before_node(path, path->l[l].b)) + return false; + if (check_pos > 0 && btree_path_pos_after_node(path, path->l[l].b)) + return false; + return true; +} + +static inline bool btree_path_good_node(struct btree_trans *trans, + struct btree_path *path, + unsigned l, int check_pos) +{ + return is_btree_node(path, l) && + bch2_btree_node_relock(trans, path, l) && + btree_path_check_pos_in_node(path, l, check_pos); +} + +static void btree_path_set_level_down(struct btree_trans *trans, + struct btree_path *path, + unsigned new_level) +{ + unsigned l; + + path->level = new_level; + + for (l = path->level + 1; l < BTREE_MAX_DEPTH; l++) + if (btree_lock_want(path, l) == BTREE_NODE_UNLOCKED) + btree_node_unlock(trans, path, l); + + btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE); + bch2_btree_path_verify(trans, path); +} + +static noinline unsigned __btree_path_up_until_good_node(struct btree_trans *trans, + struct btree_path *path, + int check_pos) +{ + unsigned i, l = path->level; +again: + while (btree_path_node(path, l) && + !btree_path_good_node(trans, path, l, check_pos)) + __btree_path_set_level_up(trans, path, l++); + + /* If we need intent locks, take them too: */ + for (i = l + 1; + i < path->locks_want && btree_path_node(path, i); + i++) + if (!bch2_btree_node_relock(trans, path, i)) { + while (l <= i) + __btree_path_set_level_up(trans, path, l++); + goto again; + } + + return l; +} + +static inline unsigned btree_path_up_until_good_node(struct btree_trans *trans, + struct btree_path *path, + int check_pos) +{ + return likely(btree_node_locked(path, path->level) && + btree_path_check_pos_in_node(path, path->level, check_pos)) + ? path->level + : __btree_path_up_until_good_node(trans, path, check_pos); +} + +/* + * This is the main state machine for walking down the btree - walks down to a + * specified depth + * + * Returns 0 on success, -EIO on error (error reading in a btree node). + * + * On error, caller (peek_node()/peek_key()) must return NULL; the error is + * stashed in the iterator and returned from bch2_trans_exit(). + */ +int bch2_btree_path_traverse_one(struct btree_trans *trans, + struct btree_path *path, + unsigned flags, + unsigned long trace_ip) +{ + unsigned depth_want = path->level; + int ret = -((int) trans->restarted); + + if (unlikely(ret)) + goto out; + + if (unlikely(!trans->srcu_held)) + bch2_trans_srcu_lock(trans); + + /* + * Ensure we obey path->should_be_locked: if it's set, we can't unlock + * and re-traverse the path without a transaction restart: + */ + if (path->should_be_locked) { + ret = bch2_btree_path_relock(trans, path, trace_ip); + goto out; + } + + if (path->cached) { + ret = bch2_btree_path_traverse_cached(trans, path, flags); + goto out; + } + + if (unlikely(path->level >= BTREE_MAX_DEPTH)) + goto out; + + path->level = btree_path_up_until_good_node(trans, path, 0); + + EBUG_ON(btree_path_node(path, path->level) && + !btree_node_locked(path, path->level)); + + /* + * Note: path->nodes[path->level] may be temporarily NULL here - that + * would indicate to other code that we got to the end of the btree, + * here it indicates that relocking the root failed - it's critical that + * btree_path_lock_root() comes next and that it can't fail + */ + while (path->level > depth_want) { + ret = btree_path_node(path, path->level) + ? btree_path_down(trans, path, flags, trace_ip) + : btree_path_lock_root(trans, path, depth_want, trace_ip); + if (unlikely(ret)) { + if (ret == 1) { + /* + * No nodes at this level - got to the end of + * the btree: + */ + ret = 0; + goto out; + } + + __bch2_btree_path_unlock(trans, path); + path->level = depth_want; + path->l[path->level].b = ERR_PTR(ret); + goto out; + } + } + + path->uptodate = BTREE_ITER_UPTODATE; +out: + if (bch2_err_matches(ret, BCH_ERR_transaction_restart) != !!trans->restarted) + panic("ret %s (%i) trans->restarted %s (%i)\n", + bch2_err_str(ret), ret, + bch2_err_str(trans->restarted), trans->restarted); + bch2_btree_path_verify(trans, path); + return ret; +} + +static inline void btree_path_copy(struct btree_trans *trans, struct btree_path *dst, + struct btree_path *src) +{ + unsigned i, offset = offsetof(struct btree_path, pos); + + memcpy((void *) dst + offset, + (void *) src + offset, + sizeof(struct btree_path) - offset); + + for (i = 0; i < BTREE_MAX_DEPTH; i++) { + unsigned t = btree_node_locked_type(dst, i); + + if (t != BTREE_NODE_UNLOCKED) + six_lock_increment(&dst->l[i].b->c.lock, t); + } +} + +static struct btree_path *btree_path_clone(struct btree_trans *trans, struct btree_path *src, + bool intent) +{ + struct btree_path *new = btree_path_alloc(trans, src); + + btree_path_copy(trans, new, src); + __btree_path_get(new, intent); + return new; +} + +__flatten +struct btree_path *__bch2_btree_path_make_mut(struct btree_trans *trans, + struct btree_path *path, bool intent, + unsigned long ip) +{ + __btree_path_put(path, intent); + path = btree_path_clone(trans, path, intent); + path->preserve = false; + return path; +} + +struct btree_path * __must_check +__bch2_btree_path_set_pos(struct btree_trans *trans, + struct btree_path *path, struct bpos new_pos, + bool intent, unsigned long ip, int cmp) +{ + unsigned level = path->level; + + bch2_trans_verify_not_in_restart(trans); + EBUG_ON(!path->ref); + + path = bch2_btree_path_make_mut(trans, path, intent, ip); + + path->pos = new_pos; + trans->paths_sorted = false; + + if (unlikely(path->cached)) { + btree_node_unlock(trans, path, 0); + path->l[0].b = ERR_PTR(-BCH_ERR_no_btree_node_up); + btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE); + goto out; + } + + level = btree_path_up_until_good_node(trans, path, cmp); + + if (btree_path_node(path, level)) { + struct btree_path_level *l = &path->l[level]; + + BUG_ON(!btree_node_locked(path, level)); + /* + * We might have to skip over many keys, or just a few: try + * advancing the node iterator, and if we have to skip over too + * many keys just reinit it (or if we're rewinding, since that + * is expensive). + */ + if (cmp < 0 || + !btree_path_advance_to_pos(path, l, 8)) + bch2_btree_node_iter_init(&l->iter, l->b, &path->pos); + + /* + * Iterators to interior nodes should always be pointed at the first non + * whiteout: + */ + if (unlikely(level)) + bch2_btree_node_iter_peek(&l->iter, l->b); + } + + if (unlikely(level != path->level)) { + btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE); + __bch2_btree_path_unlock(trans, path); + } +out: + bch2_btree_path_verify(trans, path); + return path; +} + +/* Btree path: main interface: */ + +static struct btree_path *have_path_at_pos(struct btree_trans *trans, struct btree_path *path) +{ + struct btree_path *sib; + + sib = prev_btree_path(trans, path); + if (sib && !btree_path_cmp(sib, path)) + return sib; + + sib = next_btree_path(trans, path); + if (sib && !btree_path_cmp(sib, path)) + return sib; + + return NULL; +} + +static struct btree_path *have_node_at_pos(struct btree_trans *trans, struct btree_path *path) +{ + struct btree_path *sib; + + sib = prev_btree_path(trans, path); + if (sib && sib->level == path->level && path_l(sib)->b == path_l(path)->b) + return sib; + + sib = next_btree_path(trans, path); + if (sib && sib->level == path->level && path_l(sib)->b == path_l(path)->b) + return sib; + + return NULL; +} + +static inline void __bch2_path_free(struct btree_trans *trans, struct btree_path *path) +{ + __bch2_btree_path_unlock(trans, path); + btree_path_list_remove(trans, path); + trans->paths_allocated &= ~(1ULL << path->idx); +} + +void bch2_path_put(struct btree_trans *trans, struct btree_path *path, bool intent) +{ + struct btree_path *dup; + + EBUG_ON(trans->paths + path->idx != path); + EBUG_ON(!path->ref); + + if (!__btree_path_put(path, intent)) + return; + + dup = path->preserve + ? have_path_at_pos(trans, path) + : have_node_at_pos(trans, path); + + if (!dup && !(!path->preserve && !is_btree_node(path, path->level))) + return; + + if (path->should_be_locked && + !trans->restarted && + (!dup || !bch2_btree_path_relock_norestart(trans, dup, _THIS_IP_))) + return; + + if (dup) { + dup->preserve |= path->preserve; + dup->should_be_locked |= path->should_be_locked; + } + + __bch2_path_free(trans, path); +} + +static void bch2_path_put_nokeep(struct btree_trans *trans, struct btree_path *path, + bool intent) +{ + EBUG_ON(trans->paths + path->idx != path); + EBUG_ON(!path->ref); + + if (!__btree_path_put(path, intent)) + return; + + __bch2_path_free(trans, path); +} + +void __noreturn bch2_trans_restart_error(struct btree_trans *trans, u32 restart_count) +{ + panic("trans->restart_count %u, should be %u, last restarted by %pS\n", + trans->restart_count, restart_count, + (void *) trans->last_begin_ip); +} + +void __noreturn bch2_trans_in_restart_error(struct btree_trans *trans) +{ + panic("in transaction restart: %s, last restarted by %pS\n", + bch2_err_str(trans->restarted), + (void *) trans->last_restarted_ip); +} + +noinline __cold +void bch2_trans_updates_to_text(struct printbuf *buf, struct btree_trans *trans) +{ + struct btree_insert_entry *i; + struct btree_write_buffered_key *wb; + + prt_printf(buf, "transaction updates for %s journal seq %llu", + trans->fn, trans->journal_res.seq); + prt_newline(buf); + printbuf_indent_add(buf, 2); + + trans_for_each_update(trans, i) { + struct bkey_s_c old = { &i->old_k, i->old_v }; + + prt_printf(buf, "update: btree=%s cached=%u %pS", + bch2_btree_id_str(i->btree_id), + i->cached, + (void *) i->ip_allocated); + prt_newline(buf); + + prt_printf(buf, " old "); + bch2_bkey_val_to_text(buf, trans->c, old); + prt_newline(buf); + + prt_printf(buf, " new "); + bch2_bkey_val_to_text(buf, trans->c, bkey_i_to_s_c(i->k)); + prt_newline(buf); + } + + trans_for_each_wb_update(trans, wb) { + prt_printf(buf, "update: btree=%s wb=1 %pS", + bch2_btree_id_str(wb->btree), + (void *) i->ip_allocated); + prt_newline(buf); + + prt_printf(buf, " new "); + bch2_bkey_val_to_text(buf, trans->c, bkey_i_to_s_c(&wb->k)); + prt_newline(buf); + } + + printbuf_indent_sub(buf, 2); +} + +noinline __cold +void bch2_dump_trans_updates(struct btree_trans *trans) +{ + struct printbuf buf = PRINTBUF; + + bch2_trans_updates_to_text(&buf, trans); + bch2_print_string_as_lines(KERN_ERR, buf.buf); + printbuf_exit(&buf); +} + +noinline __cold +void bch2_btree_path_to_text(struct printbuf *out, struct btree_path *path) +{ + prt_printf(out, "path: idx %2u ref %u:%u %c %c btree=%s l=%u pos ", + path->idx, path->ref, path->intent_ref, + path->preserve ? 'P' : ' ', + path->should_be_locked ? 'S' : ' ', + bch2_btree_id_str(path->btree_id), + path->level); + bch2_bpos_to_text(out, path->pos); + + prt_printf(out, " locks %u", path->nodes_locked); +#ifdef TRACK_PATH_ALLOCATED + prt_printf(out, " %pS", (void *) path->ip_allocated); +#endif + prt_newline(out); +} + +static noinline __cold +void __bch2_trans_paths_to_text(struct printbuf *out, struct btree_trans *trans, + bool nosort) +{ + struct btree_path *path; + unsigned idx; + + if (!nosort) + btree_trans_sort_paths(trans); + + trans_for_each_path_inorder(trans, path, idx) + bch2_btree_path_to_text(out, path); +} + +noinline __cold +void bch2_trans_paths_to_text(struct printbuf *out, struct btree_trans *trans) +{ + __bch2_trans_paths_to_text(out, trans, false); +} + +static noinline __cold +void __bch2_dump_trans_paths_updates(struct btree_trans *trans, bool nosort) +{ + struct printbuf buf = PRINTBUF; + + __bch2_trans_paths_to_text(&buf, trans, nosort); + bch2_trans_updates_to_text(&buf, trans); + + bch2_print_string_as_lines(KERN_ERR, buf.buf); + printbuf_exit(&buf); +} + +noinline __cold +void bch2_dump_trans_paths_updates(struct btree_trans *trans) +{ + __bch2_dump_trans_paths_updates(trans, false); +} + +noinline __cold +static void bch2_trans_update_max_paths(struct btree_trans *trans) +{ + struct btree_transaction_stats *s = btree_trans_stats(trans); + struct printbuf buf = PRINTBUF; + + if (!s) + return; + + bch2_trans_paths_to_text(&buf, trans); + + if (!buf.allocation_failure) { + mutex_lock(&s->lock); + if (s->nr_max_paths < hweight64(trans->paths_allocated)) { + s->nr_max_paths = trans->nr_max_paths = + hweight64(trans->paths_allocated); + swap(s->max_paths_text, buf.buf); + } + mutex_unlock(&s->lock); + } + + printbuf_exit(&buf); + + trans->nr_max_paths = hweight64(trans->paths_allocated); +} + +static noinline void btree_path_overflow(struct btree_trans *trans) +{ + bch2_dump_trans_paths_updates(trans); + panic("trans path overflow\n"); +} + +static inline struct btree_path *btree_path_alloc(struct btree_trans *trans, + struct btree_path *pos) +{ + struct btree_path *path; + unsigned idx; + + if (unlikely(trans->paths_allocated == + ~((~0ULL << 1) << (BTREE_ITER_MAX - 1)))) + btree_path_overflow(trans); + + idx = __ffs64(~trans->paths_allocated); + + /* + * Do this before marking the new path as allocated, since it won't be + * initialized yet: + */ + if (unlikely(idx > trans->nr_max_paths)) + bch2_trans_update_max_paths(trans); + + trans->paths_allocated |= 1ULL << idx; + + path = &trans->paths[idx]; + path->idx = idx; + path->ref = 0; + path->intent_ref = 0; + path->nodes_locked = 0; + path->alloc_seq++; + + btree_path_list_add(trans, pos, path); + trans->paths_sorted = false; + return path; +} + +struct btree_path *bch2_path_get(struct btree_trans *trans, + enum btree_id btree_id, struct bpos pos, + unsigned locks_want, unsigned level, + unsigned flags, unsigned long ip) +{ + struct btree_path *path, *path_pos = NULL; + bool cached = flags & BTREE_ITER_CACHED; + bool intent = flags & BTREE_ITER_INTENT; + int i; + + bch2_trans_verify_not_in_restart(trans); + bch2_trans_verify_locks(trans); + + btree_trans_sort_paths(trans); + + trans_for_each_path_inorder(trans, path, i) { + if (__btree_path_cmp(path, + btree_id, + cached, + pos, + level) > 0) + break; + + path_pos = path; + } + + if (path_pos && + path_pos->cached == cached && + path_pos->btree_id == btree_id && + path_pos->level == level) { + __btree_path_get(path_pos, intent); + path = bch2_btree_path_set_pos(trans, path_pos, pos, intent, ip); + } else { + path = btree_path_alloc(trans, path_pos); + path_pos = NULL; + + __btree_path_get(path, intent); + path->pos = pos; + path->btree_id = btree_id; + path->cached = cached; + path->uptodate = BTREE_ITER_NEED_TRAVERSE; + path->should_be_locked = false; + path->level = level; + path->locks_want = locks_want; + path->nodes_locked = 0; + for (i = 0; i < ARRAY_SIZE(path->l); i++) + path->l[i].b = ERR_PTR(-BCH_ERR_no_btree_node_init); +#ifdef TRACK_PATH_ALLOCATED + path->ip_allocated = ip; +#endif + trans->paths_sorted = false; + } + + if (!(flags & BTREE_ITER_NOPRESERVE)) + path->preserve = true; + + if (path->intent_ref) + locks_want = max(locks_want, level + 1); + + /* + * If the path has locks_want greater than requested, we don't downgrade + * it here - on transaction restart because btree node split needs to + * upgrade locks, we might be putting/getting the iterator again. + * Downgrading iterators only happens via bch2_trans_downgrade(), after + * a successful transaction commit. + */ + + locks_want = min(locks_want, BTREE_MAX_DEPTH); + if (locks_want > path->locks_want) + bch2_btree_path_upgrade_noupgrade_sibs(trans, path, locks_want, NULL); + + return path; +} + +struct bkey_s_c bch2_btree_path_peek_slot(struct btree_path *path, struct bkey *u) +{ + + struct btree_path_level *l = path_l(path); + struct bkey_packed *_k; + struct bkey_s_c k; + + if (unlikely(!l->b)) + return bkey_s_c_null; + + EBUG_ON(path->uptodate != BTREE_ITER_UPTODATE); + EBUG_ON(!btree_node_locked(path, path->level)); + + if (!path->cached) { + _k = bch2_btree_node_iter_peek_all(&l->iter, l->b); + k = _k ? bkey_disassemble(l->b, _k, u) : bkey_s_c_null; + + EBUG_ON(k.k && bkey_deleted(k.k) && bpos_eq(k.k->p, path->pos)); + + if (!k.k || !bpos_eq(path->pos, k.k->p)) + goto hole; + } else { + struct bkey_cached *ck = (void *) path->l[0].b; + + EBUG_ON(ck && + (path->btree_id != ck->key.btree_id || + !bkey_eq(path->pos, ck->key.pos))); + if (!ck || !ck->valid) + return bkey_s_c_null; + + *u = ck->k->k; + k = bkey_i_to_s_c(ck->k); + } + + return k; +hole: + bkey_init(u); + u->p = path->pos; + return (struct bkey_s_c) { u, NULL }; +} + +/* Btree iterators: */ + +int __must_check +__bch2_btree_iter_traverse(struct btree_iter *iter) +{ + return bch2_btree_path_traverse(iter->trans, iter->path, iter->flags); +} + +int __must_check +bch2_btree_iter_traverse(struct btree_iter *iter) +{ + int ret; + + iter->path = bch2_btree_path_set_pos(iter->trans, iter->path, + btree_iter_search_key(iter), + iter->flags & BTREE_ITER_INTENT, + btree_iter_ip_allocated(iter)); + + ret = bch2_btree_path_traverse(iter->trans, iter->path, iter->flags); + if (ret) + return ret; + + btree_path_set_should_be_locked(iter->path); + return 0; +} + +/* Iterate across nodes (leaf and interior nodes) */ + +struct btree *bch2_btree_iter_peek_node(struct btree_iter *iter) +{ + struct btree_trans *trans = iter->trans; + struct btree *b = NULL; + int ret; + + EBUG_ON(iter->path->cached); + bch2_btree_iter_verify(iter); + + ret = bch2_btree_path_traverse(trans, iter->path, iter->flags); + if (ret) + goto err; + + b = btree_path_node(iter->path, iter->path->level); + if (!b) + goto out; + + BUG_ON(bpos_lt(b->key.k.p, iter->pos)); + + bkey_init(&iter->k); + iter->k.p = iter->pos = b->key.k.p; + + iter->path = bch2_btree_path_set_pos(trans, iter->path, b->key.k.p, + iter->flags & BTREE_ITER_INTENT, + btree_iter_ip_allocated(iter)); + btree_path_set_should_be_locked(iter->path); +out: + bch2_btree_iter_verify_entry_exit(iter); + bch2_btree_iter_verify(iter); + + return b; +err: + b = ERR_PTR(ret); + goto out; +} + +struct btree *bch2_btree_iter_peek_node_and_restart(struct btree_iter *iter) +{ + struct btree *b; + + while (b = bch2_btree_iter_peek_node(iter), + bch2_err_matches(PTR_ERR_OR_ZERO(b), BCH_ERR_transaction_restart)) + bch2_trans_begin(iter->trans); + + return b; +} + +struct btree *bch2_btree_iter_next_node(struct btree_iter *iter) +{ + struct btree_trans *trans = iter->trans; + struct btree_path *path = iter->path; + struct btree *b = NULL; + int ret; + + bch2_trans_verify_not_in_restart(trans); + EBUG_ON(iter->path->cached); + bch2_btree_iter_verify(iter); + + /* already at end? */ + if (!btree_path_node(path, path->level)) + return NULL; + + /* got to end? */ + if (!btree_path_node(path, path->level + 1)) { + btree_path_set_level_up(trans, path); + return NULL; + } + + if (!bch2_btree_node_relock(trans, path, path->level + 1)) { + __bch2_btree_path_unlock(trans, path); + path->l[path->level].b = ERR_PTR(-BCH_ERR_no_btree_node_relock); + path->l[path->level + 1].b = ERR_PTR(-BCH_ERR_no_btree_node_relock); + btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE); + trace_and_count(trans->c, trans_restart_relock_next_node, trans, _THIS_IP_, path); + ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_relock); + goto err; + } + + b = btree_path_node(path, path->level + 1); + + if (bpos_eq(iter->pos, b->key.k.p)) { + __btree_path_set_level_up(trans, path, path->level++); + } else { + /* + * Haven't gotten to the end of the parent node: go back down to + * the next child node + */ + path = iter->path = + bch2_btree_path_set_pos(trans, path, bpos_successor(iter->pos), + iter->flags & BTREE_ITER_INTENT, + btree_iter_ip_allocated(iter)); + + btree_path_set_level_down(trans, path, iter->min_depth); + + ret = bch2_btree_path_traverse(trans, path, iter->flags); + if (ret) + goto err; + + b = path->l[path->level].b; + } + + bkey_init(&iter->k); + iter->k.p = iter->pos = b->key.k.p; + + iter->path = bch2_btree_path_set_pos(trans, iter->path, b->key.k.p, + iter->flags & BTREE_ITER_INTENT, + btree_iter_ip_allocated(iter)); + btree_path_set_should_be_locked(iter->path); + BUG_ON(iter->path->uptodate); +out: + bch2_btree_iter_verify_entry_exit(iter); + bch2_btree_iter_verify(iter); + + return b; +err: + b = ERR_PTR(ret); + goto out; +} + +/* Iterate across keys (in leaf nodes only) */ + +inline bool bch2_btree_iter_advance(struct btree_iter *iter) +{ + if (likely(!(iter->flags & BTREE_ITER_ALL_LEVELS))) { + struct bpos pos = iter->k.p; + bool ret = !(iter->flags & BTREE_ITER_ALL_SNAPSHOTS + ? bpos_eq(pos, SPOS_MAX) + : bkey_eq(pos, SPOS_MAX)); + + if (ret && !(iter->flags & BTREE_ITER_IS_EXTENTS)) + pos = bkey_successor(iter, pos); + bch2_btree_iter_set_pos(iter, pos); + return ret; + } else { + if (!btree_path_node(iter->path, iter->path->level)) + return true; + + iter->advanced = true; + return false; + } +} + +inline bool bch2_btree_iter_rewind(struct btree_iter *iter) +{ + struct bpos pos = bkey_start_pos(&iter->k); + bool ret = !(iter->flags & BTREE_ITER_ALL_SNAPSHOTS + ? bpos_eq(pos, POS_MIN) + : bkey_eq(pos, POS_MIN)); + + if (ret && !(iter->flags & BTREE_ITER_IS_EXTENTS)) + pos = bkey_predecessor(iter, pos); + bch2_btree_iter_set_pos(iter, pos); + return ret; +} + +static noinline +struct bkey_i *__bch2_btree_trans_peek_updates(struct btree_iter *iter) +{ + struct btree_insert_entry *i; + struct bkey_i *ret = NULL; + + trans_for_each_update(iter->trans, i) { + if (i->btree_id < iter->btree_id) + continue; + if (i->btree_id > iter->btree_id) + break; + if (bpos_lt(i->k->k.p, iter->path->pos)) + continue; + if (i->key_cache_already_flushed) + continue; + if (!ret || bpos_lt(i->k->k.p, ret->k.p)) + ret = i->k; + } + + return ret; +} + +static inline struct bkey_i *btree_trans_peek_updates(struct btree_iter *iter) +{ + return iter->flags & BTREE_ITER_WITH_UPDATES + ? __bch2_btree_trans_peek_updates(iter) + : NULL; +} + +static struct bkey_i *bch2_btree_journal_peek(struct btree_trans *trans, + struct btree_iter *iter, + struct bpos end_pos) +{ + struct bkey_i *k; + + if (bpos_lt(iter->path->pos, iter->journal_pos)) + iter->journal_idx = 0; + + k = bch2_journal_keys_peek_upto(trans->c, iter->btree_id, + iter->path->level, + iter->path->pos, + end_pos, + &iter->journal_idx); + + iter->journal_pos = k ? k->k.p : end_pos; + return k; +} + +static noinline +struct bkey_s_c btree_trans_peek_slot_journal(struct btree_trans *trans, + struct btree_iter *iter) +{ + struct bkey_i *k = bch2_btree_journal_peek(trans, iter, iter->path->pos); + + if (k) { + iter->k = k->k; + return bkey_i_to_s_c(k); + } else { + return bkey_s_c_null; + } +} + +static noinline +struct bkey_s_c btree_trans_peek_journal(struct btree_trans *trans, + struct btree_iter *iter, + struct bkey_s_c k) +{ + struct bkey_i *next_journal = + bch2_btree_journal_peek(trans, iter, + k.k ? k.k->p : path_l(iter->path)->b->key.k.p); + + if (next_journal) { + iter->k = next_journal->k; + k = bkey_i_to_s_c(next_journal); + } + + return k; +} + +/* + * Checks btree key cache for key at iter->pos and returns it if present, or + * bkey_s_c_null: + */ +static noinline +struct bkey_s_c btree_trans_peek_key_cache(struct btree_iter *iter, struct bpos pos) +{ + struct btree_trans *trans = iter->trans; + struct bch_fs *c = trans->c; + struct bkey u; + struct bkey_s_c k; + int ret; + + if ((iter->flags & BTREE_ITER_KEY_CACHE_FILL) && + bpos_eq(iter->pos, pos)) + return bkey_s_c_null; + + if (!bch2_btree_key_cache_find(c, iter->btree_id, pos)) + return bkey_s_c_null; + + if (!iter->key_cache_path) + iter->key_cache_path = bch2_path_get(trans, iter->btree_id, pos, + iter->flags & BTREE_ITER_INTENT, 0, + iter->flags|BTREE_ITER_CACHED| + BTREE_ITER_CACHED_NOFILL, + _THIS_IP_); + + iter->key_cache_path = bch2_btree_path_set_pos(trans, iter->key_cache_path, pos, + iter->flags & BTREE_ITER_INTENT, + btree_iter_ip_allocated(iter)); + + ret = bch2_btree_path_traverse(trans, iter->key_cache_path, + iter->flags|BTREE_ITER_CACHED) ?: + bch2_btree_path_relock(trans, iter->path, _THIS_IP_); + if (unlikely(ret)) + return bkey_s_c_err(ret); + + btree_path_set_should_be_locked(iter->key_cache_path); + + k = bch2_btree_path_peek_slot(iter->key_cache_path, &u); + if (k.k && !bkey_err(k)) { + iter->k = u; + k.k = &iter->k; + } + return k; +} + +static struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter, struct bpos search_key) +{ + struct btree_trans *trans = iter->trans; + struct bkey_i *next_update; + struct bkey_s_c k, k2; + int ret; + + EBUG_ON(iter->path->cached); + bch2_btree_iter_verify(iter); + + while (1) { + struct btree_path_level *l; + + iter->path = bch2_btree_path_set_pos(trans, iter->path, search_key, + iter->flags & BTREE_ITER_INTENT, + btree_iter_ip_allocated(iter)); + + ret = bch2_btree_path_traverse(trans, iter->path, iter->flags); + if (unlikely(ret)) { + /* ensure that iter->k is consistent with iter->pos: */ + bch2_btree_iter_set_pos(iter, iter->pos); + k = bkey_s_c_err(ret); + goto out; + } + + l = path_l(iter->path); + + if (unlikely(!l->b)) { + /* No btree nodes at requested level: */ + bch2_btree_iter_set_pos(iter, SPOS_MAX); + k = bkey_s_c_null; + goto out; + } + + btree_path_set_should_be_locked(iter->path); + + k = btree_path_level_peek_all(trans->c, l, &iter->k); + + if (unlikely(iter->flags & BTREE_ITER_WITH_KEY_CACHE) && + k.k && + (k2 = btree_trans_peek_key_cache(iter, k.k->p)).k) { + k = k2; + ret = bkey_err(k); + if (ret) { + bch2_btree_iter_set_pos(iter, iter->pos); + goto out; + } + } + + if (unlikely(iter->flags & BTREE_ITER_WITH_JOURNAL)) + k = btree_trans_peek_journal(trans, iter, k); + + next_update = btree_trans_peek_updates(iter); + + if (next_update && + bpos_le(next_update->k.p, + k.k ? k.k->p : l->b->key.k.p)) { + iter->k = next_update->k; + k = bkey_i_to_s_c(next_update); + } + + if (k.k && bkey_deleted(k.k)) { + /* + * If we've got a whiteout, and it's after the search + * key, advance the search key to the whiteout instead + * of just after the whiteout - it might be a btree + * whiteout, with a real key at the same position, since + * in the btree deleted keys sort before non deleted. + */ + search_key = !bpos_eq(search_key, k.k->p) + ? k.k->p + : bpos_successor(k.k->p); + continue; + } + + if (likely(k.k)) { + break; + } else if (likely(!bpos_eq(l->b->key.k.p, SPOS_MAX))) { + /* Advance to next leaf node: */ + search_key = bpos_successor(l->b->key.k.p); + } else { + /* End of btree: */ + bch2_btree_iter_set_pos(iter, SPOS_MAX); + k = bkey_s_c_null; + goto out; + } + } +out: + bch2_btree_iter_verify(iter); + + return k; +} + +/** + * bch2_btree_iter_peek_upto() - returns first key greater than or equal to + * iterator's current position + * @iter: iterator to peek from + * @end: search limit: returns keys less than or equal to @end + * + * Returns: key if found, or an error extractable with bkey_err(). + */ +struct bkey_s_c bch2_btree_iter_peek_upto(struct btree_iter *iter, struct bpos end) +{ + struct btree_trans *trans = iter->trans; + struct bpos search_key = btree_iter_search_key(iter); + struct bkey_s_c k; + struct bpos iter_pos; + int ret; + + EBUG_ON(iter->flags & BTREE_ITER_ALL_LEVELS); + EBUG_ON((iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) && bkey_eq(end, POS_MAX)); + + if (iter->update_path) { + bch2_path_put_nokeep(trans, iter->update_path, + iter->flags & BTREE_ITER_INTENT); + iter->update_path = NULL; + } + + bch2_btree_iter_verify_entry_exit(iter); + + while (1) { + k = __bch2_btree_iter_peek(iter, search_key); + if (unlikely(!k.k)) + goto end; + if (unlikely(bkey_err(k))) + goto out_no_locked; + + /* + * iter->pos should be mononotically increasing, and always be + * equal to the key we just returned - except extents can + * straddle iter->pos: + */ + if (!(iter->flags & BTREE_ITER_IS_EXTENTS)) + iter_pos = k.k->p; + else + iter_pos = bkey_max(iter->pos, bkey_start_pos(k.k)); + + if (unlikely(!(iter->flags & BTREE_ITER_IS_EXTENTS) + ? bkey_gt(iter_pos, end) + : bkey_ge(iter_pos, end))) + goto end; + + if (iter->update_path && + !bkey_eq(iter->update_path->pos, k.k->p)) { + bch2_path_put_nokeep(trans, iter->update_path, + iter->flags & BTREE_ITER_INTENT); + iter->update_path = NULL; + } + + if ((iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) && + (iter->flags & BTREE_ITER_INTENT) && + !(iter->flags & BTREE_ITER_IS_EXTENTS) && + !iter->update_path) { + struct bpos pos = k.k->p; + + if (pos.snapshot < iter->snapshot) { + search_key = bpos_successor(k.k->p); + continue; + } + + pos.snapshot = iter->snapshot; + + /* + * advance, same as on exit for iter->path, but only up + * to snapshot + */ + __btree_path_get(iter->path, iter->flags & BTREE_ITER_INTENT); + iter->update_path = iter->path; + + iter->update_path = bch2_btree_path_set_pos(trans, + iter->update_path, pos, + iter->flags & BTREE_ITER_INTENT, + _THIS_IP_); + ret = bch2_btree_path_traverse(trans, iter->update_path, iter->flags); + if (unlikely(ret)) { + k = bkey_s_c_err(ret); + goto out_no_locked; + } + } + + /* + * We can never have a key in a leaf node at POS_MAX, so + * we don't have to check these successor() calls: + */ + if ((iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) && + !bch2_snapshot_is_ancestor(trans->c, + iter->snapshot, + k.k->p.snapshot)) { + search_key = bpos_successor(k.k->p); + continue; + } + + if (bkey_whiteout(k.k) && + !(iter->flags & BTREE_ITER_ALL_SNAPSHOTS)) { + search_key = bkey_successor(iter, k.k->p); + continue; + } + + break; + } + + iter->pos = iter_pos; + + iter->path = bch2_btree_path_set_pos(trans, iter->path, k.k->p, + iter->flags & BTREE_ITER_INTENT, + btree_iter_ip_allocated(iter)); + + btree_path_set_should_be_locked(iter->path); +out_no_locked: + if (iter->update_path) { + ret = bch2_btree_path_relock(trans, iter->update_path, _THIS_IP_); + if (unlikely(ret)) + k = bkey_s_c_err(ret); + else + btree_path_set_should_be_locked(iter->update_path); + } + + if (!(iter->flags & BTREE_ITER_ALL_SNAPSHOTS)) + iter->pos.snapshot = iter->snapshot; + + ret = bch2_btree_iter_verify_ret(iter, k); + if (unlikely(ret)) { + bch2_btree_iter_set_pos(iter, iter->pos); + k = bkey_s_c_err(ret); + } + + bch2_btree_iter_verify_entry_exit(iter); + + return k; +end: + bch2_btree_iter_set_pos(iter, end); + k = bkey_s_c_null; + goto out_no_locked; +} + +/** + * bch2_btree_iter_peek_all_levels() - returns the first key greater than or + * equal to iterator's current position, returning keys from every level of the + * btree. For keys at different levels of the btree that compare equal, the key + * from the lower level (leaf) is returned first. + * @iter: iterator to peek from + * + * Returns: key if found, or an error extractable with bkey_err(). + */ +struct bkey_s_c bch2_btree_iter_peek_all_levels(struct btree_iter *iter) +{ + struct btree_trans *trans = iter->trans; + struct bkey_s_c k; + int ret; + + EBUG_ON(iter->path->cached); + bch2_btree_iter_verify(iter); + BUG_ON(iter->path->level < iter->min_depth); + BUG_ON(!(iter->flags & BTREE_ITER_ALL_SNAPSHOTS)); + EBUG_ON(!(iter->flags & BTREE_ITER_ALL_LEVELS)); + + while (1) { + iter->path = bch2_btree_path_set_pos(trans, iter->path, iter->pos, + iter->flags & BTREE_ITER_INTENT, + btree_iter_ip_allocated(iter)); + + ret = bch2_btree_path_traverse(trans, iter->path, iter->flags); + if (unlikely(ret)) { + /* ensure that iter->k is consistent with iter->pos: */ + bch2_btree_iter_set_pos(iter, iter->pos); + k = bkey_s_c_err(ret); + goto out_no_locked; + } + + /* Already at end? */ + if (!btree_path_node(iter->path, iter->path->level)) { + k = bkey_s_c_null; + goto out_no_locked; + } + + k = btree_path_level_peek_all(trans->c, + &iter->path->l[iter->path->level], &iter->k); + + /* Check if we should go up to the parent node: */ + if (!k.k || + (iter->advanced && + bpos_eq(path_l(iter->path)->b->key.k.p, iter->pos))) { + iter->pos = path_l(iter->path)->b->key.k.p; + btree_path_set_level_up(trans, iter->path); + iter->advanced = false; + continue; + } + + /* + * Check if we should go back down to a leaf: + * If we're not in a leaf node, we only return the current key + * if it exactly matches iter->pos - otherwise we first have to + * go back to the leaf: + */ + if (iter->path->level != iter->min_depth && + (iter->advanced || + !k.k || + !bpos_eq(iter->pos, k.k->p))) { + btree_path_set_level_down(trans, iter->path, iter->min_depth); + iter->pos = bpos_successor(iter->pos); + iter->advanced = false; + continue; + } + + /* Check if we should go to the next key: */ + if (iter->path->level == iter->min_depth && + iter->advanced && + k.k && + bpos_eq(iter->pos, k.k->p)) { + iter->pos = bpos_successor(iter->pos); + iter->advanced = false; + continue; + } + + if (iter->advanced && + iter->path->level == iter->min_depth && + !bpos_eq(k.k->p, iter->pos)) + iter->advanced = false; + + BUG_ON(iter->advanced); + BUG_ON(!k.k); + break; + } + + iter->pos = k.k->p; + btree_path_set_should_be_locked(iter->path); +out_no_locked: + bch2_btree_iter_verify(iter); + + return k; +} + +/** + * bch2_btree_iter_next() - returns first key greater than iterator's current + * position + * @iter: iterator to peek from + * + * Returns: key if found, or an error extractable with bkey_err(). + */ +struct bkey_s_c bch2_btree_iter_next(struct btree_iter *iter) +{ + if (!bch2_btree_iter_advance(iter)) + return bkey_s_c_null; + + return bch2_btree_iter_peek(iter); +} + +/** + * bch2_btree_iter_peek_prev() - returns first key less than or equal to + * iterator's current position + * @iter: iterator to peek from + * + * Returns: key if found, or an error extractable with bkey_err(). + */ +struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter) +{ + struct btree_trans *trans = iter->trans; + struct bpos search_key = iter->pos; + struct btree_path *saved_path = NULL; + struct bkey_s_c k; + struct bkey saved_k; + const struct bch_val *saved_v; + int ret; + + EBUG_ON(iter->path->cached || iter->path->level); + EBUG_ON(iter->flags & BTREE_ITER_WITH_UPDATES); + + if (iter->flags & BTREE_ITER_WITH_JOURNAL) + return bkey_s_c_err(-EIO); + + bch2_btree_iter_verify(iter); + bch2_btree_iter_verify_entry_exit(iter); + + if (iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) + search_key.snapshot = U32_MAX; + + while (1) { + iter->path = bch2_btree_path_set_pos(trans, iter->path, search_key, + iter->flags & BTREE_ITER_INTENT, + btree_iter_ip_allocated(iter)); + + ret = bch2_btree_path_traverse(trans, iter->path, iter->flags); + if (unlikely(ret)) { + /* ensure that iter->k is consistent with iter->pos: */ + bch2_btree_iter_set_pos(iter, iter->pos); + k = bkey_s_c_err(ret); + goto out_no_locked; + } + + k = btree_path_level_peek(trans, iter->path, + &iter->path->l[0], &iter->k); + if (!k.k || + ((iter->flags & BTREE_ITER_IS_EXTENTS) + ? bpos_ge(bkey_start_pos(k.k), search_key) + : bpos_gt(k.k->p, search_key))) + k = btree_path_level_prev(trans, iter->path, + &iter->path->l[0], &iter->k); + + if (likely(k.k)) { + if (iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) { + if (k.k->p.snapshot == iter->snapshot) + goto got_key; + + /* + * If we have a saved candidate, and we're no + * longer at the same _key_ (not pos), return + * that candidate + */ + if (saved_path && !bkey_eq(k.k->p, saved_k.p)) { + bch2_path_put_nokeep(trans, iter->path, + iter->flags & BTREE_ITER_INTENT); + iter->path = saved_path; + saved_path = NULL; + iter->k = saved_k; + k.v = saved_v; + goto got_key; + } + + if (bch2_snapshot_is_ancestor(iter->trans->c, + iter->snapshot, + k.k->p.snapshot)) { + if (saved_path) + bch2_path_put_nokeep(trans, saved_path, + iter->flags & BTREE_ITER_INTENT); + saved_path = btree_path_clone(trans, iter->path, + iter->flags & BTREE_ITER_INTENT); + saved_k = *k.k; + saved_v = k.v; + } + + search_key = bpos_predecessor(k.k->p); + continue; + } +got_key: + if (bkey_whiteout(k.k) && + !(iter->flags & BTREE_ITER_ALL_SNAPSHOTS)) { + search_key = bkey_predecessor(iter, k.k->p); + if (iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) + search_key.snapshot = U32_MAX; + continue; + } + + break; + } else if (likely(!bpos_eq(iter->path->l[0].b->data->min_key, POS_MIN))) { + /* Advance to previous leaf node: */ + search_key = bpos_predecessor(iter->path->l[0].b->data->min_key); + } else { + /* Start of btree: */ + bch2_btree_iter_set_pos(iter, POS_MIN); + k = bkey_s_c_null; + goto out_no_locked; + } + } + + EBUG_ON(bkey_gt(bkey_start_pos(k.k), iter->pos)); + + /* Extents can straddle iter->pos: */ + if (bkey_lt(k.k->p, iter->pos)) + iter->pos = k.k->p; + + if (iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) + iter->pos.snapshot = iter->snapshot; + + btree_path_set_should_be_locked(iter->path); +out_no_locked: + if (saved_path) + bch2_path_put_nokeep(trans, saved_path, iter->flags & BTREE_ITER_INTENT); + + bch2_btree_iter_verify_entry_exit(iter); + bch2_btree_iter_verify(iter); + + return k; +} + +/** + * bch2_btree_iter_prev() - returns first key less than iterator's current + * position + * @iter: iterator to peek from + * + * Returns: key if found, or an error extractable with bkey_err(). + */ +struct bkey_s_c bch2_btree_iter_prev(struct btree_iter *iter) +{ + if (!bch2_btree_iter_rewind(iter)) + return bkey_s_c_null; + + return bch2_btree_iter_peek_prev(iter); +} + +struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter) +{ + struct btree_trans *trans = iter->trans; + struct bpos search_key; + struct bkey_s_c k; + int ret; + + bch2_btree_iter_verify(iter); + bch2_btree_iter_verify_entry_exit(iter); + EBUG_ON(iter->flags & BTREE_ITER_ALL_LEVELS); + EBUG_ON(iter->path->level && (iter->flags & BTREE_ITER_WITH_KEY_CACHE)); + + /* extents can't span inode numbers: */ + if ((iter->flags & BTREE_ITER_IS_EXTENTS) && + unlikely(iter->pos.offset == KEY_OFFSET_MAX)) { + if (iter->pos.inode == KEY_INODE_MAX) + return bkey_s_c_null; + + bch2_btree_iter_set_pos(iter, bpos_nosnap_successor(iter->pos)); + } + + search_key = btree_iter_search_key(iter); + iter->path = bch2_btree_path_set_pos(trans, iter->path, search_key, + iter->flags & BTREE_ITER_INTENT, + btree_iter_ip_allocated(iter)); + + ret = bch2_btree_path_traverse(trans, iter->path, iter->flags); + if (unlikely(ret)) { + k = bkey_s_c_err(ret); + goto out_no_locked; + } + + if ((iter->flags & BTREE_ITER_CACHED) || + !(iter->flags & (BTREE_ITER_IS_EXTENTS|BTREE_ITER_FILTER_SNAPSHOTS))) { + struct bkey_i *next_update; + + if ((next_update = btree_trans_peek_updates(iter)) && + bpos_eq(next_update->k.p, iter->pos)) { + iter->k = next_update->k; + k = bkey_i_to_s_c(next_update); + goto out; + } + + if (unlikely(iter->flags & BTREE_ITER_WITH_JOURNAL) && + (k = btree_trans_peek_slot_journal(trans, iter)).k) + goto out; + + if (unlikely(iter->flags & BTREE_ITER_WITH_KEY_CACHE) && + (k = btree_trans_peek_key_cache(iter, iter->pos)).k) { + if (!bkey_err(k)) + iter->k = *k.k; + /* We're not returning a key from iter->path: */ + goto out_no_locked; + } + + k = bch2_btree_path_peek_slot(iter->path, &iter->k); + if (unlikely(!k.k)) + goto out_no_locked; + } else { + struct bpos next; + struct bpos end = iter->pos; + + if (iter->flags & BTREE_ITER_IS_EXTENTS) + end.offset = U64_MAX; + + EBUG_ON(iter->path->level); + + if (iter->flags & BTREE_ITER_INTENT) { + struct btree_iter iter2; + + bch2_trans_copy_iter(&iter2, iter); + k = bch2_btree_iter_peek_upto(&iter2, end); + + if (k.k && !bkey_err(k)) { + iter->k = iter2.k; + k.k = &iter->k; + } + bch2_trans_iter_exit(trans, &iter2); + } else { + struct bpos pos = iter->pos; + + k = bch2_btree_iter_peek_upto(iter, end); + if (unlikely(bkey_err(k))) + bch2_btree_iter_set_pos(iter, pos); + else + iter->pos = pos; + } + + if (unlikely(bkey_err(k))) + goto out_no_locked; + + next = k.k ? bkey_start_pos(k.k) : POS_MAX; + + if (bkey_lt(iter->pos, next)) { + bkey_init(&iter->k); + iter->k.p = iter->pos; + + if (iter->flags & BTREE_ITER_IS_EXTENTS) { + bch2_key_resize(&iter->k, + min_t(u64, KEY_SIZE_MAX, + (next.inode == iter->pos.inode + ? next.offset + : KEY_OFFSET_MAX) - + iter->pos.offset)); + EBUG_ON(!iter->k.size); + } + + k = (struct bkey_s_c) { &iter->k, NULL }; + } + } +out: + btree_path_set_should_be_locked(iter->path); +out_no_locked: + bch2_btree_iter_verify_entry_exit(iter); + bch2_btree_iter_verify(iter); + ret = bch2_btree_iter_verify_ret(iter, k); + if (unlikely(ret)) + return bkey_s_c_err(ret); + + return k; +} + +struct bkey_s_c bch2_btree_iter_next_slot(struct btree_iter *iter) +{ + if (!bch2_btree_iter_advance(iter)) + return bkey_s_c_null; + + return bch2_btree_iter_peek_slot(iter); +} + +struct bkey_s_c bch2_btree_iter_prev_slot(struct btree_iter *iter) +{ + if (!bch2_btree_iter_rewind(iter)) + return bkey_s_c_null; + + return bch2_btree_iter_peek_slot(iter); +} + +struct bkey_s_c bch2_btree_iter_peek_and_restart_outlined(struct btree_iter *iter) +{ + struct bkey_s_c k; + + while (btree_trans_too_many_iters(iter->trans) || + (k = bch2_btree_iter_peek_type(iter, iter->flags), + bch2_err_matches(bkey_err(k), BCH_ERR_transaction_restart))) + bch2_trans_begin(iter->trans); + + return k; +} + +/* new transactional stuff: */ + +#ifdef CONFIG_BCACHEFS_DEBUG +static void btree_trans_verify_sorted_refs(struct btree_trans *trans) +{ + struct btree_path *path; + unsigned i; + + BUG_ON(trans->nr_sorted != hweight64(trans->paths_allocated)); + + trans_for_each_path(trans, path) { + BUG_ON(path->sorted_idx >= trans->nr_sorted); + BUG_ON(trans->sorted[path->sorted_idx] != path->idx); + } + + for (i = 0; i < trans->nr_sorted; i++) { + unsigned idx = trans->sorted[i]; + + EBUG_ON(!(trans->paths_allocated & (1ULL << idx))); + BUG_ON(trans->paths[idx].sorted_idx != i); + } +} + +static void btree_trans_verify_sorted(struct btree_trans *trans) +{ + struct btree_path *path, *prev = NULL; + unsigned i; + + if (!bch2_debug_check_iterators) + return; + + trans_for_each_path_inorder(trans, path, i) { + if (prev && btree_path_cmp(prev, path) > 0) { + __bch2_dump_trans_paths_updates(trans, true); + panic("trans paths out of order!\n"); + } + prev = path; + } +} +#else +static inline void btree_trans_verify_sorted_refs(struct btree_trans *trans) {} +static inline void btree_trans_verify_sorted(struct btree_trans *trans) {} +#endif + +void __bch2_btree_trans_sort_paths(struct btree_trans *trans) +{ + int i, l = 0, r = trans->nr_sorted, inc = 1; + bool swapped; + + btree_trans_verify_sorted_refs(trans); + + if (trans->paths_sorted) + goto out; + + /* + * Cocktail shaker sort: this is efficient because iterators will be + * mostly sorted. + */ + do { + swapped = false; + + for (i = inc > 0 ? l : r - 2; + i + 1 < r && i >= l; + i += inc) { + if (btree_path_cmp(trans->paths + trans->sorted[i], + trans->paths + trans->sorted[i + 1]) > 0) { + swap(trans->sorted[i], trans->sorted[i + 1]); + trans->paths[trans->sorted[i]].sorted_idx = i; + trans->paths[trans->sorted[i + 1]].sorted_idx = i + 1; + swapped = true; + } + } + + if (inc > 0) + --r; + else + l++; + inc = -inc; + } while (swapped); + + trans->paths_sorted = true; +out: + btree_trans_verify_sorted(trans); +} + +static inline void btree_path_list_remove(struct btree_trans *trans, + struct btree_path *path) +{ + unsigned i; + + EBUG_ON(path->sorted_idx >= trans->nr_sorted); +#ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS + trans->nr_sorted--; + memmove_u64s_down_small(trans->sorted + path->sorted_idx, + trans->sorted + path->sorted_idx + 1, + DIV_ROUND_UP(trans->nr_sorted - path->sorted_idx, 8)); +#else + array_remove_item(trans->sorted, trans->nr_sorted, path->sorted_idx); +#endif + for (i = path->sorted_idx; i < trans->nr_sorted; i++) + trans->paths[trans->sorted[i]].sorted_idx = i; + + path->sorted_idx = U8_MAX; +} + +static inline void btree_path_list_add(struct btree_trans *trans, + struct btree_path *pos, + struct btree_path *path) +{ + unsigned i; + + path->sorted_idx = pos ? pos->sorted_idx + 1 : trans->nr_sorted; + +#ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS + memmove_u64s_up_small(trans->sorted + path->sorted_idx + 1, + trans->sorted + path->sorted_idx, + DIV_ROUND_UP(trans->nr_sorted - path->sorted_idx, 8)); + trans->nr_sorted++; + trans->sorted[path->sorted_idx] = path->idx; +#else + array_insert_item(trans->sorted, trans->nr_sorted, path->sorted_idx, path->idx); +#endif + + for (i = path->sorted_idx; i < trans->nr_sorted; i++) + trans->paths[trans->sorted[i]].sorted_idx = i; + + btree_trans_verify_sorted_refs(trans); +} + +void bch2_trans_iter_exit(struct btree_trans *trans, struct btree_iter *iter) +{ + if (iter->update_path) + bch2_path_put_nokeep(trans, iter->update_path, + iter->flags & BTREE_ITER_INTENT); + if (iter->path) + bch2_path_put(trans, iter->path, + iter->flags & BTREE_ITER_INTENT); + if (iter->key_cache_path) + bch2_path_put(trans, iter->key_cache_path, + iter->flags & BTREE_ITER_INTENT); + iter->path = NULL; + iter->update_path = NULL; + iter->key_cache_path = NULL; +} + +void bch2_trans_iter_init_outlined(struct btree_trans *trans, + struct btree_iter *iter, + enum btree_id btree_id, struct bpos pos, + unsigned flags) +{ + bch2_trans_iter_init_common(trans, iter, btree_id, pos, 0, 0, + bch2_btree_iter_flags(trans, btree_id, flags), + _RET_IP_); +} + +void bch2_trans_node_iter_init(struct btree_trans *trans, + struct btree_iter *iter, + enum btree_id btree_id, + struct bpos pos, + unsigned locks_want, + unsigned depth, + unsigned flags) +{ + flags |= BTREE_ITER_NOT_EXTENTS; + flags |= __BTREE_ITER_ALL_SNAPSHOTS; + flags |= BTREE_ITER_ALL_SNAPSHOTS; + + bch2_trans_iter_init_common(trans, iter, btree_id, pos, locks_want, depth, + __bch2_btree_iter_flags(trans, btree_id, flags), + _RET_IP_); + + iter->min_depth = depth; + + BUG_ON(iter->path->locks_want < min(locks_want, BTREE_MAX_DEPTH)); + BUG_ON(iter->path->level != depth); + BUG_ON(iter->min_depth != depth); +} + +void bch2_trans_copy_iter(struct btree_iter *dst, struct btree_iter *src) +{ + *dst = *src; + if (src->path) + __btree_path_get(src->path, src->flags & BTREE_ITER_INTENT); + if (src->update_path) + __btree_path_get(src->update_path, src->flags & BTREE_ITER_INTENT); + dst->key_cache_path = NULL; +} + +void *__bch2_trans_kmalloc(struct btree_trans *trans, size_t size) +{ + unsigned new_top = trans->mem_top + size; + size_t old_bytes = trans->mem_bytes; + size_t new_bytes = roundup_pow_of_two(new_top); + int ret; + void *new_mem; + void *p; + + trans->mem_max = max(trans->mem_max, new_top); + + WARN_ON_ONCE(new_bytes > BTREE_TRANS_MEM_MAX); + + new_mem = krealloc(trans->mem, new_bytes, GFP_NOWAIT|__GFP_NOWARN); + if (unlikely(!new_mem)) { + bch2_trans_unlock(trans); + + new_mem = krealloc(trans->mem, new_bytes, GFP_KERNEL); + if (!new_mem && new_bytes <= BTREE_TRANS_MEM_MAX) { + new_mem = mempool_alloc(&trans->c->btree_trans_mem_pool, GFP_KERNEL); + new_bytes = BTREE_TRANS_MEM_MAX; + kfree(trans->mem); + } + + if (!new_mem) + return ERR_PTR(-BCH_ERR_ENOMEM_trans_kmalloc); + + trans->mem = new_mem; + trans->mem_bytes = new_bytes; + + ret = bch2_trans_relock(trans); + if (ret) + return ERR_PTR(ret); + } + + trans->mem = new_mem; + trans->mem_bytes = new_bytes; + + if (old_bytes) { + trace_and_count(trans->c, trans_restart_mem_realloced, trans, _RET_IP_, new_bytes); + return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_mem_realloced)); + } + + p = trans->mem + trans->mem_top; + trans->mem_top += size; + memset(p, 0, size); + return p; +} + +static inline void check_srcu_held_too_long(struct btree_trans *trans) +{ + WARN(trans->srcu_held && time_after(jiffies, trans->srcu_lock_time + HZ * 10), + "btree trans held srcu lock (delaying memory reclaim) for %lu seconds", + (jiffies - trans->srcu_lock_time) / HZ); +} + +void bch2_trans_srcu_unlock(struct btree_trans *trans) +{ + if (trans->srcu_held) { + struct bch_fs *c = trans->c; + struct btree_path *path; + + trans_for_each_path(trans, path) + if (path->cached && !btree_node_locked(path, 0)) + path->l[0].b = ERR_PTR(-BCH_ERR_no_btree_node_srcu_reset); + + check_srcu_held_too_long(trans); + srcu_read_unlock(&c->btree_trans_barrier, trans->srcu_idx); + trans->srcu_held = false; + } +} + +void bch2_trans_srcu_lock(struct btree_trans *trans) +{ + if (!trans->srcu_held) { + trans->srcu_idx = srcu_read_lock(&trans->c->btree_trans_barrier); + trans->srcu_lock_time = jiffies; + trans->srcu_held = true; + } +} + +/** + * bch2_trans_begin() - reset a transaction after a interrupted attempt + * @trans: transaction to reset + * + * Returns: current restart counter, to be used with trans_was_restarted() + * + * While iterating over nodes or updating nodes a attempt to lock a btree node + * may return BCH_ERR_transaction_restart when the trylock fails. When this + * occurs bch2_trans_begin() should be called and the transaction retried. + */ +u32 bch2_trans_begin(struct btree_trans *trans) +{ + struct btree_path *path; + u64 now; + + bch2_trans_reset_updates(trans); + + trans->restart_count++; + trans->mem_top = 0; + + trans_for_each_path(trans, path) { + path->should_be_locked = false; + + /* + * If the transaction wasn't restarted, we're presuming to be + * doing something new: dont keep iterators excpt the ones that + * are in use - except for the subvolumes btree: + */ + if (!trans->restarted && path->btree_id != BTREE_ID_subvolumes) + path->preserve = false; + + /* + * XXX: we probably shouldn't be doing this if the transaction + * was restarted, but currently we still overflow transaction + * iterators if we do that + */ + if (!path->ref && !path->preserve) + __bch2_path_free(trans, path); + else + path->preserve = false; + } + + now = local_clock(); + if (!trans->restarted && + (need_resched() || + now - trans->last_begin_time > BTREE_TRANS_MAX_LOCK_HOLD_TIME_NS)) { + drop_locks_do(trans, (cond_resched(), 0)); + now = local_clock(); + } + trans->last_begin_time = now; + + if (unlikely(trans->srcu_held && + time_after(jiffies, trans->srcu_lock_time + msecs_to_jiffies(10)))) + bch2_trans_srcu_unlock(trans); + + trans->last_begin_ip = _RET_IP_; + if (trans->restarted) { + bch2_btree_path_traverse_all(trans); + trans->notrace_relock_fail = false; + } + + return trans->restart_count; +} + +static struct btree_trans *bch2_trans_alloc(struct bch_fs *c) +{ + struct btree_trans *trans; + + if (IS_ENABLED(__KERNEL__)) { + trans = this_cpu_xchg(c->btree_trans_bufs->trans, NULL); + if (trans) + return trans; + } + + trans = mempool_alloc(&c->btree_trans_pool, GFP_NOFS); + /* + * paths need to be zeroed, bch2_check_for_deadlock looks at + * paths in other threads + */ + memset(&trans->paths, 0, sizeof(trans->paths)); + return trans; +} + +const char *bch2_btree_transaction_fns[BCH_TRANSACTIONS_NR]; + +unsigned bch2_trans_get_fn_idx(const char *fn) +{ + unsigned i; + + for (i = 0; i < ARRAY_SIZE(bch2_btree_transaction_fns); i++) + if (!bch2_btree_transaction_fns[i] || + bch2_btree_transaction_fns[i] == fn) { + bch2_btree_transaction_fns[i] = fn; + return i; + } + + pr_warn_once("BCH_TRANSACTIONS_NR not big enough!"); + return i; +} + +struct btree_trans *__bch2_trans_get(struct bch_fs *c, unsigned fn_idx) + __acquires(&c->btree_trans_barrier) +{ + struct btree_trans *trans; + struct btree_transaction_stats *s; + + trans = bch2_trans_alloc(c); + + memset(trans, 0, sizeof(*trans)); + trans->c = c; + trans->fn = fn_idx < ARRAY_SIZE(bch2_btree_transaction_fns) + ? bch2_btree_transaction_fns[fn_idx] : NULL; + trans->last_begin_time = local_clock(); + trans->fn_idx = fn_idx; + trans->locking_wait.task = current; + trans->journal_replay_not_finished = + unlikely(!test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags)) && + atomic_inc_not_zero(&c->journal_keys.ref); + closure_init_stack(&trans->ref); + + s = btree_trans_stats(trans); + if (s && s->max_mem) { + unsigned expected_mem_bytes = roundup_pow_of_two(s->max_mem); + + trans->mem = kmalloc(expected_mem_bytes, GFP_KERNEL); + + if (!unlikely(trans->mem)) { + trans->mem = mempool_alloc(&c->btree_trans_mem_pool, GFP_KERNEL); + trans->mem_bytes = BTREE_TRANS_MEM_MAX; + } else { + trans->mem_bytes = expected_mem_bytes; + } + } + + if (s) { + trans->nr_max_paths = s->nr_max_paths; + trans->wb_updates_size = s->wb_updates_size; + } + + trans->srcu_idx = srcu_read_lock(&c->btree_trans_barrier); + trans->srcu_lock_time = jiffies; + trans->srcu_held = true; + + if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG_TRANSACTIONS)) { + struct btree_trans *pos; + + seqmutex_lock(&c->btree_trans_lock); + list_for_each_entry(pos, &c->btree_trans_list, list) { + /* + * We'd much prefer to be stricter here and completely + * disallow multiple btree_trans in the same thread - + * but the data move path calls bch2_write when we + * already have a btree_trans initialized. + */ + BUG_ON(trans->locking_wait.task->pid == pos->locking_wait.task->pid && + bch2_trans_locked(pos)); + + if (trans->locking_wait.task->pid < pos->locking_wait.task->pid) { + list_add_tail(&trans->list, &pos->list); + goto list_add_done; + } + } + list_add_tail(&trans->list, &c->btree_trans_list); +list_add_done: + seqmutex_unlock(&c->btree_trans_lock); + } + + return trans; +} + +static void check_btree_paths_leaked(struct btree_trans *trans) +{ +#ifdef CONFIG_BCACHEFS_DEBUG + struct bch_fs *c = trans->c; + struct btree_path *path; + + trans_for_each_path(trans, path) + if (path->ref) + goto leaked; + return; +leaked: + bch_err(c, "btree paths leaked from %s!", trans->fn); + trans_for_each_path(trans, path) + if (path->ref) + printk(KERN_ERR " btree %s %pS\n", + bch2_btree_id_str(path->btree_id), + (void *) path->ip_allocated); + /* Be noisy about this: */ + bch2_fatal_error(c); +#endif +} + +void bch2_trans_put(struct btree_trans *trans) + __releases(&c->btree_trans_barrier) +{ + struct btree_insert_entry *i; + struct bch_fs *c = trans->c; + struct btree_transaction_stats *s = btree_trans_stats(trans); + + bch2_trans_unlock(trans); + + if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG_TRANSACTIONS)) { + seqmutex_lock(&c->btree_trans_lock); + list_del(&trans->list); + seqmutex_unlock(&c->btree_trans_lock); + } + + closure_sync(&trans->ref); + + if (s) + s->max_mem = max(s->max_mem, trans->mem_max); + + trans_for_each_update(trans, i) + __btree_path_put(i->path, true); + trans->nr_updates = 0; + + check_btree_paths_leaked(trans); + + if (trans->srcu_held) { + check_srcu_held_too_long(trans); + srcu_read_unlock(&c->btree_trans_barrier, trans->srcu_idx); + } + + kfree(trans->extra_journal_entries.data); + + if (trans->fs_usage_deltas) { + if (trans->fs_usage_deltas->size + sizeof(trans->fs_usage_deltas) == + REPLICAS_DELTA_LIST_MAX) + mempool_free(trans->fs_usage_deltas, + &c->replicas_delta_pool); + else + kfree(trans->fs_usage_deltas); + } + + if (unlikely(trans->journal_replay_not_finished)) + bch2_journal_keys_put(c); + + if (trans->mem_bytes == BTREE_TRANS_MEM_MAX) + mempool_free(trans->mem, &c->btree_trans_mem_pool); + else + kfree(trans->mem); + + /* Userspace doesn't have a real percpu implementation: */ + if (IS_ENABLED(__KERNEL__)) + trans = this_cpu_xchg(c->btree_trans_bufs->trans, trans); + if (trans) + mempool_free(trans, &c->btree_trans_pool); +} + +static void __maybe_unused +bch2_btree_bkey_cached_common_to_text(struct printbuf *out, + struct btree_bkey_cached_common *b) +{ + struct six_lock_count c = six_lock_counts(&b->lock); + struct task_struct *owner; + pid_t pid; + + rcu_read_lock(); + owner = READ_ONCE(b->lock.owner); + pid = owner ? owner->pid : 0; + rcu_read_unlock(); + + prt_tab(out); + prt_printf(out, "%px %c l=%u %s:", b, b->cached ? 'c' : 'b', + b->level, bch2_btree_id_str(b->btree_id)); + bch2_bpos_to_text(out, btree_node_pos(b)); + + prt_tab(out); + prt_printf(out, " locks %u:%u:%u held by pid %u", + c.n[0], c.n[1], c.n[2], pid); +} + +void bch2_btree_trans_to_text(struct printbuf *out, struct btree_trans *trans) +{ + struct btree_path *path; + struct btree_bkey_cached_common *b; + static char lock_types[] = { 'r', 'i', 'w' }; + unsigned l, idx; + + if (!out->nr_tabstops) { + printbuf_tabstop_push(out, 16); + printbuf_tabstop_push(out, 32); + } + + prt_printf(out, "%i %s\n", trans->locking_wait.task->pid, trans->fn); + + trans_for_each_path_safe(trans, path, idx) { + if (!path->nodes_locked) + continue; + + prt_printf(out, " path %u %c l=%u %s:", + path->idx, + path->cached ? 'c' : 'b', + path->level, + bch2_btree_id_str(path->btree_id)); + bch2_bpos_to_text(out, path->pos); + prt_newline(out); + + for (l = 0; l < BTREE_MAX_DEPTH; l++) { + if (btree_node_locked(path, l) && + !IS_ERR_OR_NULL(b = (void *) READ_ONCE(path->l[l].b))) { + prt_printf(out, " %c l=%u ", + lock_types[btree_node_locked_type(path, l)], l); + bch2_btree_bkey_cached_common_to_text(out, b); + prt_newline(out); + } + } + } + + b = READ_ONCE(trans->locking); + if (b) { + prt_printf(out, " blocked for %lluus on", + div_u64(local_clock() - trans->locking_wait.start_time, + 1000)); + prt_newline(out); + prt_printf(out, " %c", lock_types[trans->locking_wait.lock_want]); + bch2_btree_bkey_cached_common_to_text(out, b); + prt_newline(out); + } +} + +void bch2_fs_btree_iter_exit(struct bch_fs *c) +{ + struct btree_transaction_stats *s; + struct btree_trans *trans; + int cpu; + + trans = list_first_entry_or_null(&c->btree_trans_list, struct btree_trans, list); + if (trans) + panic("%s leaked btree_trans\n", trans->fn); + + if (c->btree_trans_bufs) + for_each_possible_cpu(cpu) + kfree(per_cpu_ptr(c->btree_trans_bufs, cpu)->trans); + free_percpu(c->btree_trans_bufs); + + for (s = c->btree_transaction_stats; + s < c->btree_transaction_stats + ARRAY_SIZE(c->btree_transaction_stats); + s++) { + kfree(s->max_paths_text); + bch2_time_stats_exit(&s->lock_hold_times); + } + + if (c->btree_trans_barrier_initialized) + cleanup_srcu_struct(&c->btree_trans_barrier); + mempool_exit(&c->btree_trans_mem_pool); + mempool_exit(&c->btree_trans_pool); +} + +void bch2_fs_btree_iter_init_early(struct bch_fs *c) +{ + struct btree_transaction_stats *s; + + for (s = c->btree_transaction_stats; + s < c->btree_transaction_stats + ARRAY_SIZE(c->btree_transaction_stats); + s++) { + bch2_time_stats_init(&s->lock_hold_times); + mutex_init(&s->lock); + } + + INIT_LIST_HEAD(&c->btree_trans_list); + seqmutex_init(&c->btree_trans_lock); +} + +int bch2_fs_btree_iter_init(struct bch_fs *c) +{ + int ret; + + c->btree_trans_bufs = alloc_percpu(struct btree_trans_buf); + if (!c->btree_trans_bufs) + return -ENOMEM; + + ret = mempool_init_kmalloc_pool(&c->btree_trans_pool, 1, + sizeof(struct btree_trans)) ?: + mempool_init_kmalloc_pool(&c->btree_trans_mem_pool, 1, + BTREE_TRANS_MEM_MAX) ?: + init_srcu_struct(&c->btree_trans_barrier); + if (!ret) + c->btree_trans_barrier_initialized = true; + return ret; +} diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h new file mode 100644 index 000000000000..eaffced4c132 --- /dev/null +++ b/fs/bcachefs/btree_iter.h @@ -0,0 +1,944 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_BTREE_ITER_H +#define _BCACHEFS_BTREE_ITER_H + +#include "bset.h" +#include "btree_types.h" +#include "trace.h" + +static inline int __bkey_err(const struct bkey *k) +{ + return PTR_ERR_OR_ZERO(k); +} + +#define bkey_err(_k) __bkey_err((_k).k) + +static inline void __btree_path_get(struct btree_path *path, bool intent) +{ + path->ref++; + path->intent_ref += intent; +} + +static inline bool __btree_path_put(struct btree_path *path, bool intent) +{ + EBUG_ON(!path->ref); + EBUG_ON(!path->intent_ref && intent); + path->intent_ref -= intent; + return --path->ref == 0; +} + +static inline void btree_path_set_dirty(struct btree_path *path, + enum btree_path_uptodate u) +{ + path->uptodate = max_t(unsigned, path->uptodate, u); +} + +static inline struct btree *btree_path_node(struct btree_path *path, + unsigned level) +{ + return level < BTREE_MAX_DEPTH ? path->l[level].b : NULL; +} + +static inline bool btree_node_lock_seq_matches(const struct btree_path *path, + const struct btree *b, unsigned level) +{ + return path->l[level].lock_seq == six_lock_seq(&b->c.lock); +} + +static inline struct btree *btree_node_parent(struct btree_path *path, + struct btree *b) +{ + return btree_path_node(path, b->c.level + 1); +} + +/* Iterate over paths within a transaction: */ + +void __bch2_btree_trans_sort_paths(struct btree_trans *); + +static inline void btree_trans_sort_paths(struct btree_trans *trans) +{ + if (!IS_ENABLED(CONFIG_BCACHEFS_DEBUG) && + trans->paths_sorted) + return; + __bch2_btree_trans_sort_paths(trans); +} + +static inline struct btree_path * +__trans_next_path(struct btree_trans *trans, unsigned idx) +{ + u64 l; + + if (idx == BTREE_ITER_MAX) + return NULL; + + l = trans->paths_allocated >> idx; + if (!l) + return NULL; + + idx += __ffs64(l); + EBUG_ON(idx >= BTREE_ITER_MAX); + EBUG_ON(trans->paths[idx].idx != idx); + return &trans->paths[idx]; +} + +#define trans_for_each_path_from(_trans, _path, _start) \ + for (_path = __trans_next_path((_trans), _start); \ + (_path); \ + _path = __trans_next_path((_trans), (_path)->idx + 1)) + +#define trans_for_each_path(_trans, _path) \ + trans_for_each_path_from(_trans, _path, 0) + +static inline struct btree_path * +__trans_next_path_safe(struct btree_trans *trans, unsigned *idx) +{ + u64 l; + + if (*idx == BTREE_ITER_MAX) + return NULL; + + l = trans->paths_allocated >> *idx; + if (!l) + return NULL; + + *idx += __ffs64(l); + EBUG_ON(*idx >= BTREE_ITER_MAX); + return &trans->paths[*idx]; +} + +/* + * This version is intended to be safe for use on a btree_trans that is owned by + * another thread, for bch2_btree_trans_to_text(); + */ +#define trans_for_each_path_safe_from(_trans, _path, _idx, _start) \ + for (_idx = _start; \ + (_path = __trans_next_path_safe((_trans), &_idx)); \ + _idx++) + +#define trans_for_each_path_safe(_trans, _path, _idx) \ + trans_for_each_path_safe_from(_trans, _path, _idx, 0) + +static inline struct btree_path *next_btree_path(struct btree_trans *trans, struct btree_path *path) +{ + unsigned idx = path ? path->sorted_idx + 1 : 0; + + EBUG_ON(idx > trans->nr_sorted); + + return idx < trans->nr_sorted + ? trans->paths + trans->sorted[idx] + : NULL; +} + +static inline struct btree_path *prev_btree_path(struct btree_trans *trans, struct btree_path *path) +{ + unsigned idx = path ? path->sorted_idx : trans->nr_sorted; + + return idx + ? trans->paths + trans->sorted[idx - 1] + : NULL; +} + +#define trans_for_each_path_inorder(_trans, _path, _i) \ + for (_i = 0; \ + ((_path) = (_trans)->paths + trans->sorted[_i]), (_i) < (_trans)->nr_sorted;\ + _i++) + +#define trans_for_each_path_inorder_reverse(_trans, _path, _i) \ + for (_i = trans->nr_sorted - 1; \ + ((_path) = (_trans)->paths + trans->sorted[_i]), (_i) >= 0;\ + --_i) + +static inline bool __path_has_node(const struct btree_path *path, + const struct btree *b) +{ + return path->l[b->c.level].b == b && + btree_node_lock_seq_matches(path, b, b->c.level); +} + +static inline struct btree_path * +__trans_next_path_with_node(struct btree_trans *trans, struct btree *b, + unsigned idx) +{ + struct btree_path *path = __trans_next_path(trans, idx); + + while (path && !__path_has_node(path, b)) + path = __trans_next_path(trans, path->idx + 1); + + return path; +} + +#define trans_for_each_path_with_node(_trans, _b, _path) \ + for (_path = __trans_next_path_with_node((_trans), (_b), 0); \ + (_path); \ + _path = __trans_next_path_with_node((_trans), (_b), \ + (_path)->idx + 1)) + +struct btree_path *__bch2_btree_path_make_mut(struct btree_trans *, struct btree_path *, + bool, unsigned long); + +static inline struct btree_path * __must_check +bch2_btree_path_make_mut(struct btree_trans *trans, + struct btree_path *path, bool intent, + unsigned long ip) +{ + if (path->ref > 1 || path->preserve) + path = __bch2_btree_path_make_mut(trans, path, intent, ip); + path->should_be_locked = false; + return path; +} + +struct btree_path * __must_check +__bch2_btree_path_set_pos(struct btree_trans *, struct btree_path *, + struct bpos, bool, unsigned long, int); + +static inline struct btree_path * __must_check +bch2_btree_path_set_pos(struct btree_trans *trans, + struct btree_path *path, struct bpos new_pos, + bool intent, unsigned long ip) +{ + int cmp = bpos_cmp(new_pos, path->pos); + + return cmp + ? __bch2_btree_path_set_pos(trans, path, new_pos, intent, ip, cmp) + : path; +} + +int __must_check bch2_btree_path_traverse_one(struct btree_trans *, struct btree_path *, + unsigned, unsigned long); + +static inline int __must_check bch2_btree_path_traverse(struct btree_trans *trans, + struct btree_path *path, unsigned flags) +{ + if (path->uptodate < BTREE_ITER_NEED_RELOCK) + return 0; + + return bch2_btree_path_traverse_one(trans, path, flags, _RET_IP_); +} + +int __must_check bch2_btree_path_traverse(struct btree_trans *, + struct btree_path *, unsigned); +struct btree_path *bch2_path_get(struct btree_trans *, enum btree_id, struct bpos, + unsigned, unsigned, unsigned, unsigned long); +struct bkey_s_c bch2_btree_path_peek_slot(struct btree_path *, struct bkey *); + +/* + * bch2_btree_path_peek_slot() for a cached iterator might return a key in a + * different snapshot: + */ +static inline struct bkey_s_c bch2_btree_path_peek_slot_exact(struct btree_path *path, struct bkey *u) +{ + struct bkey_s_c k = bch2_btree_path_peek_slot(path, u); + + if (k.k && bpos_eq(path->pos, k.k->p)) + return k; + + bkey_init(u); + u->p = path->pos; + return (struct bkey_s_c) { u, NULL }; +} + +struct bkey_i *bch2_btree_journal_peek_slot(struct btree_trans *, + struct btree_iter *, struct bpos); + +void bch2_btree_path_level_init(struct btree_trans *, struct btree_path *, struct btree *); + +int __bch2_trans_mutex_lock(struct btree_trans *, struct mutex *); + +static inline int bch2_trans_mutex_lock(struct btree_trans *trans, struct mutex *lock) +{ + return mutex_trylock(lock) + ? 0 + : __bch2_trans_mutex_lock(trans, lock); +} + +#ifdef CONFIG_BCACHEFS_DEBUG +void bch2_trans_verify_paths(struct btree_trans *); +void bch2_assert_pos_locked(struct btree_trans *, enum btree_id, + struct bpos, bool); +#else +static inline void bch2_trans_verify_paths(struct btree_trans *trans) {} +static inline void bch2_assert_pos_locked(struct btree_trans *trans, enum btree_id id, + struct bpos pos, bool key_cache) {} +#endif + +void bch2_btree_path_fix_key_modified(struct btree_trans *trans, + struct btree *, struct bkey_packed *); +void bch2_btree_node_iter_fix(struct btree_trans *trans, struct btree_path *, + struct btree *, struct btree_node_iter *, + struct bkey_packed *, unsigned, unsigned); + +int bch2_btree_path_relock_intent(struct btree_trans *, struct btree_path *); + +void bch2_path_put(struct btree_trans *, struct btree_path *, bool); + +int bch2_trans_relock(struct btree_trans *); +int bch2_trans_relock_notrace(struct btree_trans *); +void bch2_trans_unlock(struct btree_trans *); +void bch2_trans_unlock_long(struct btree_trans *); +bool bch2_trans_locked(struct btree_trans *); + +static inline int trans_was_restarted(struct btree_trans *trans, u32 restart_count) +{ + return restart_count != trans->restart_count + ? -BCH_ERR_transaction_restart_nested + : 0; +} + +void __noreturn bch2_trans_restart_error(struct btree_trans *, u32); + +static inline void bch2_trans_verify_not_restarted(struct btree_trans *trans, + u32 restart_count) +{ + if (trans_was_restarted(trans, restart_count)) + bch2_trans_restart_error(trans, restart_count); +} + +void __noreturn bch2_trans_in_restart_error(struct btree_trans *); + +static inline void bch2_trans_verify_not_in_restart(struct btree_trans *trans) +{ + if (trans->restarted) + bch2_trans_in_restart_error(trans); +} + +__always_inline +static int btree_trans_restart_nounlock(struct btree_trans *trans, int err) +{ + BUG_ON(err <= 0); + BUG_ON(!bch2_err_matches(-err, BCH_ERR_transaction_restart)); + + trans->restarted = err; + trans->last_restarted_ip = _THIS_IP_; + return -err; +} + +__always_inline +static int btree_trans_restart(struct btree_trans *trans, int err) +{ + btree_trans_restart_nounlock(trans, err); + return -err; +} + +bool bch2_btree_node_upgrade(struct btree_trans *, + struct btree_path *, unsigned); + +void __bch2_btree_path_downgrade(struct btree_trans *, struct btree_path *, unsigned); + +static inline void bch2_btree_path_downgrade(struct btree_trans *trans, + struct btree_path *path) +{ + unsigned new_locks_want = path->level + !!path->intent_ref; + + if (path->locks_want > new_locks_want) + __bch2_btree_path_downgrade(trans, path, new_locks_want); +} + +void bch2_trans_downgrade(struct btree_trans *); + +void bch2_trans_node_add(struct btree_trans *trans, struct btree *); +void bch2_trans_node_reinit_iter(struct btree_trans *, struct btree *); + +int __must_check __bch2_btree_iter_traverse(struct btree_iter *iter); +int __must_check bch2_btree_iter_traverse(struct btree_iter *); + +struct btree *bch2_btree_iter_peek_node(struct btree_iter *); +struct btree *bch2_btree_iter_peek_node_and_restart(struct btree_iter *); +struct btree *bch2_btree_iter_next_node(struct btree_iter *); + +struct bkey_s_c bch2_btree_iter_peek_upto(struct btree_iter *, struct bpos); +struct bkey_s_c bch2_btree_iter_next(struct btree_iter *); + +struct bkey_s_c bch2_btree_iter_peek_all_levels(struct btree_iter *); + +static inline struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter) +{ + return bch2_btree_iter_peek_upto(iter, SPOS_MAX); +} + +struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *); +struct bkey_s_c bch2_btree_iter_prev(struct btree_iter *); + +struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *); +struct bkey_s_c bch2_btree_iter_next_slot(struct btree_iter *); +struct bkey_s_c bch2_btree_iter_prev_slot(struct btree_iter *); + +bool bch2_btree_iter_advance(struct btree_iter *); +bool bch2_btree_iter_rewind(struct btree_iter *); + +static inline void __bch2_btree_iter_set_pos(struct btree_iter *iter, struct bpos new_pos) +{ + iter->k.type = KEY_TYPE_deleted; + iter->k.p.inode = iter->pos.inode = new_pos.inode; + iter->k.p.offset = iter->pos.offset = new_pos.offset; + iter->k.p.snapshot = iter->pos.snapshot = new_pos.snapshot; + iter->k.size = 0; +} + +static inline void bch2_btree_iter_set_pos(struct btree_iter *iter, struct bpos new_pos) +{ + if (unlikely(iter->update_path)) + bch2_path_put(iter->trans, iter->update_path, + iter->flags & BTREE_ITER_INTENT); + iter->update_path = NULL; + + if (!(iter->flags & BTREE_ITER_ALL_SNAPSHOTS)) + new_pos.snapshot = iter->snapshot; + + __bch2_btree_iter_set_pos(iter, new_pos); +} + +static inline void bch2_btree_iter_set_pos_to_extent_start(struct btree_iter *iter) +{ + BUG_ON(!(iter->flags & BTREE_ITER_IS_EXTENTS)); + iter->pos = bkey_start_pos(&iter->k); +} + +static inline void bch2_btree_iter_set_snapshot(struct btree_iter *iter, u32 snapshot) +{ + struct bpos pos = iter->pos; + + iter->snapshot = snapshot; + pos.snapshot = snapshot; + bch2_btree_iter_set_pos(iter, pos); +} + +void bch2_trans_iter_exit(struct btree_trans *, struct btree_iter *); + +static inline unsigned __bch2_btree_iter_flags(struct btree_trans *trans, + unsigned btree_id, + unsigned flags) +{ + if (flags & BTREE_ITER_ALL_LEVELS) + flags |= BTREE_ITER_ALL_SNAPSHOTS|__BTREE_ITER_ALL_SNAPSHOTS; + + if (!(flags & (BTREE_ITER_ALL_SNAPSHOTS|BTREE_ITER_NOT_EXTENTS)) && + btree_id_is_extents(btree_id)) + flags |= BTREE_ITER_IS_EXTENTS; + + if (!(flags & __BTREE_ITER_ALL_SNAPSHOTS) && + !btree_type_has_snapshot_field(btree_id)) + flags &= ~BTREE_ITER_ALL_SNAPSHOTS; + + if (!(flags & BTREE_ITER_ALL_SNAPSHOTS) && + btree_type_has_snapshots(btree_id)) + flags |= BTREE_ITER_FILTER_SNAPSHOTS; + + if (trans->journal_replay_not_finished) + flags |= BTREE_ITER_WITH_JOURNAL; + + return flags; +} + +static inline unsigned bch2_btree_iter_flags(struct btree_trans *trans, + unsigned btree_id, + unsigned flags) +{ + if (!btree_id_cached(trans->c, btree_id)) { + flags &= ~BTREE_ITER_CACHED; + flags &= ~BTREE_ITER_WITH_KEY_CACHE; + } else if (!(flags & BTREE_ITER_CACHED)) + flags |= BTREE_ITER_WITH_KEY_CACHE; + + return __bch2_btree_iter_flags(trans, btree_id, flags); +} + +static inline void bch2_trans_iter_init_common(struct btree_trans *trans, + struct btree_iter *iter, + unsigned btree_id, struct bpos pos, + unsigned locks_want, + unsigned depth, + unsigned flags, + unsigned long ip) +{ + memset(iter, 0, sizeof(*iter)); + iter->trans = trans; + iter->btree_id = btree_id; + iter->flags = flags; + iter->snapshot = pos.snapshot; + iter->pos = pos; + iter->k.p = pos; + +#ifdef CONFIG_BCACHEFS_DEBUG + iter->ip_allocated = ip; +#endif + iter->path = bch2_path_get(trans, btree_id, iter->pos, + locks_want, depth, flags, ip); +} + +void bch2_trans_iter_init_outlined(struct btree_trans *, struct btree_iter *, + enum btree_id, struct bpos, unsigned); + +static inline void bch2_trans_iter_init(struct btree_trans *trans, + struct btree_iter *iter, + unsigned btree_id, struct bpos pos, + unsigned flags) +{ + if (__builtin_constant_p(btree_id) && + __builtin_constant_p(flags)) + bch2_trans_iter_init_common(trans, iter, btree_id, pos, 0, 0, + bch2_btree_iter_flags(trans, btree_id, flags), + _THIS_IP_); + else + bch2_trans_iter_init_outlined(trans, iter, btree_id, pos, flags); +} + +void bch2_trans_node_iter_init(struct btree_trans *, struct btree_iter *, + enum btree_id, struct bpos, + unsigned, unsigned, unsigned); +void bch2_trans_copy_iter(struct btree_iter *, struct btree_iter *); + +static inline void set_btree_iter_dontneed(struct btree_iter *iter) +{ + if (!iter->trans->restarted) + iter->path->preserve = false; +} + +void *__bch2_trans_kmalloc(struct btree_trans *, size_t); + +static inline void *bch2_trans_kmalloc(struct btree_trans *trans, size_t size) +{ + size = roundup(size, 8); + + if (likely(trans->mem_top + size <= trans->mem_bytes)) { + void *p = trans->mem + trans->mem_top; + + trans->mem_top += size; + memset(p, 0, size); + return p; + } else { + return __bch2_trans_kmalloc(trans, size); + } +} + +static inline void *bch2_trans_kmalloc_nomemzero(struct btree_trans *trans, size_t size) +{ + size = roundup(size, 8); + + if (likely(trans->mem_top + size <= trans->mem_bytes)) { + void *p = trans->mem + trans->mem_top; + + trans->mem_top += size; + return p; + } else { + return __bch2_trans_kmalloc(trans, size); + } +} + +static inline struct bkey_s_c __bch2_bkey_get_iter(struct btree_trans *trans, + struct btree_iter *iter, + unsigned btree_id, struct bpos pos, + unsigned flags, unsigned type) +{ + struct bkey_s_c k; + + bch2_trans_iter_init(trans, iter, btree_id, pos, flags); + k = bch2_btree_iter_peek_slot(iter); + + if (!bkey_err(k) && type && k.k->type != type) + k = bkey_s_c_err(-BCH_ERR_ENOENT_bkey_type_mismatch); + if (unlikely(bkey_err(k))) + bch2_trans_iter_exit(trans, iter); + return k; +} + +static inline struct bkey_s_c bch2_bkey_get_iter(struct btree_trans *trans, + struct btree_iter *iter, + unsigned btree_id, struct bpos pos, + unsigned flags) +{ + return __bch2_bkey_get_iter(trans, iter, btree_id, pos, flags, 0); +} + +#define bch2_bkey_get_iter_typed(_trans, _iter, _btree_id, _pos, _flags, _type)\ + bkey_s_c_to_##_type(__bch2_bkey_get_iter(_trans, _iter, \ + _btree_id, _pos, _flags, KEY_TYPE_##_type)) + +static inline int __bch2_bkey_get_val_typed(struct btree_trans *trans, + unsigned btree_id, struct bpos pos, + unsigned flags, unsigned type, + unsigned val_size, void *val) +{ + struct btree_iter iter; + struct bkey_s_c k; + int ret; + + k = __bch2_bkey_get_iter(trans, &iter, btree_id, pos, flags, type); + ret = bkey_err(k); + if (!ret) { + unsigned b = min_t(unsigned, bkey_val_bytes(k.k), val_size); + + memcpy(val, k.v, b); + if (unlikely(b < sizeof(*val))) + memset((void *) val + b, 0, sizeof(*val) - b); + bch2_trans_iter_exit(trans, &iter); + } + + return ret; +} + +#define bch2_bkey_get_val_typed(_trans, _btree_id, _pos, _flags, _type, _val)\ + __bch2_bkey_get_val_typed(_trans, _btree_id, _pos, _flags, \ + KEY_TYPE_##_type, sizeof(*_val), _val) + +void bch2_trans_srcu_unlock(struct btree_trans *); +void bch2_trans_srcu_lock(struct btree_trans *); + +u32 bch2_trans_begin(struct btree_trans *); + +/* + * XXX + * this does not handle transaction restarts from bch2_btree_iter_next_node() + * correctly + */ +#define __for_each_btree_node(_trans, _iter, _btree_id, _start, \ + _locks_want, _depth, _flags, _b, _ret) \ + for (bch2_trans_node_iter_init((_trans), &(_iter), (_btree_id), \ + _start, _locks_want, _depth, _flags); \ + (_b) = bch2_btree_iter_peek_node_and_restart(&(_iter)), \ + !((_ret) = PTR_ERR_OR_ZERO(_b)) && (_b); \ + (_b) = bch2_btree_iter_next_node(&(_iter))) + +#define for_each_btree_node(_trans, _iter, _btree_id, _start, \ + _flags, _b, _ret) \ + __for_each_btree_node(_trans, _iter, _btree_id, _start, \ + 0, 0, _flags, _b, _ret) + +static inline struct bkey_s_c bch2_btree_iter_peek_prev_type(struct btree_iter *iter, + unsigned flags) +{ + BUG_ON(flags & BTREE_ITER_ALL_LEVELS); + + return flags & BTREE_ITER_SLOTS ? bch2_btree_iter_peek_slot(iter) : + bch2_btree_iter_peek_prev(iter); +} + +static inline struct bkey_s_c bch2_btree_iter_peek_type(struct btree_iter *iter, + unsigned flags) +{ + return flags & BTREE_ITER_ALL_LEVELS ? bch2_btree_iter_peek_all_levels(iter) : + flags & BTREE_ITER_SLOTS ? bch2_btree_iter_peek_slot(iter) : + bch2_btree_iter_peek(iter); +} + +static inline struct bkey_s_c bch2_btree_iter_peek_upto_type(struct btree_iter *iter, + struct bpos end, + unsigned flags) +{ + if (!(flags & BTREE_ITER_SLOTS)) + return bch2_btree_iter_peek_upto(iter, end); + + if (bkey_gt(iter->pos, end)) + return bkey_s_c_null; + + return bch2_btree_iter_peek_slot(iter); +} + +static inline int btree_trans_too_many_iters(struct btree_trans *trans) +{ + if (hweight64(trans->paths_allocated) > BTREE_ITER_MAX - 8) { + trace_and_count(trans->c, trans_restart_too_many_iters, trans, _THIS_IP_); + return btree_trans_restart(trans, BCH_ERR_transaction_restart_too_many_iters); + } + + return 0; +} + +struct bkey_s_c bch2_btree_iter_peek_and_restart_outlined(struct btree_iter *); + +static inline struct bkey_s_c +__bch2_btree_iter_peek_and_restart(struct btree_trans *trans, + struct btree_iter *iter, unsigned flags) +{ + struct bkey_s_c k; + + while (btree_trans_too_many_iters(trans) || + (k = bch2_btree_iter_peek_type(iter, flags), + bch2_err_matches(bkey_err(k), BCH_ERR_transaction_restart))) + bch2_trans_begin(trans); + + return k; +} + +static inline struct bkey_s_c +__bch2_btree_iter_peek_upto_and_restart(struct btree_trans *trans, + struct btree_iter *iter, + struct bpos end, + unsigned flags) +{ + struct bkey_s_c k; + + while (btree_trans_too_many_iters(trans) || + (k = bch2_btree_iter_peek_upto_type(iter, end, flags), + bch2_err_matches(bkey_err(k), BCH_ERR_transaction_restart))) + bch2_trans_begin(trans); + + return k; +} + +#define lockrestart_do(_trans, _do) \ +({ \ + u32 _restart_count; \ + int _ret2; \ + \ + do { \ + _restart_count = bch2_trans_begin(_trans); \ + _ret2 = (_do); \ + } while (bch2_err_matches(_ret2, BCH_ERR_transaction_restart)); \ + \ + if (!_ret2) \ + bch2_trans_verify_not_restarted(_trans, _restart_count);\ + \ + _ret2; \ +}) + +/* + * nested_lockrestart_do(), nested_commit_do(): + * + * These are like lockrestart_do() and commit_do(), with two differences: + * + * - We don't call bch2_trans_begin() unless we had a transaction restart + * - We return -BCH_ERR_transaction_restart_nested if we succeeded after a + * transaction restart + */ +#define nested_lockrestart_do(_trans, _do) \ +({ \ + u32 _restart_count, _orig_restart_count; \ + int _ret2; \ + \ + _restart_count = _orig_restart_count = (_trans)->restart_count; \ + \ + while (bch2_err_matches(_ret2 = (_do), BCH_ERR_transaction_restart))\ + _restart_count = bch2_trans_begin(_trans); \ + \ + if (!_ret2) \ + bch2_trans_verify_not_restarted(_trans, _restart_count);\ + \ + _ret2 ?: trans_was_restarted(_trans, _restart_count); \ +}) + +#define for_each_btree_key2(_trans, _iter, _btree_id, \ + _start, _flags, _k, _do) \ +({ \ + int _ret3 = 0; \ + \ + bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \ + (_start), (_flags)); \ + \ + while (1) { \ + u32 _restart_count = bch2_trans_begin(_trans); \ + \ + _ret3 = 0; \ + (_k) = bch2_btree_iter_peek_type(&(_iter), (_flags)); \ + if (!(_k).k) \ + break; \ + \ + _ret3 = bkey_err(_k) ?: (_do); \ + if (bch2_err_matches(_ret3, BCH_ERR_transaction_restart))\ + continue; \ + if (_ret3) \ + break; \ + bch2_trans_verify_not_restarted(_trans, _restart_count);\ + if (!bch2_btree_iter_advance(&(_iter))) \ + break; \ + } \ + \ + bch2_trans_iter_exit((_trans), &(_iter)); \ + _ret3; \ +}) + +#define for_each_btree_key2_upto(_trans, _iter, _btree_id, \ + _start, _end, _flags, _k, _do) \ +({ \ + int _ret3 = 0; \ + \ + bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \ + (_start), (_flags)); \ + \ + while (1) { \ + u32 _restart_count = bch2_trans_begin(_trans); \ + \ + _ret3 = 0; \ + (_k) = bch2_btree_iter_peek_upto_type(&(_iter), _end, (_flags));\ + if (!(_k).k) \ + break; \ + \ + _ret3 = bkey_err(_k) ?: (_do); \ + if (bch2_err_matches(_ret3, BCH_ERR_transaction_restart))\ + continue; \ + if (_ret3) \ + break; \ + bch2_trans_verify_not_restarted(_trans, _restart_count);\ + if (!bch2_btree_iter_advance(&(_iter))) \ + break; \ + } \ + \ + bch2_trans_iter_exit((_trans), &(_iter)); \ + _ret3; \ +}) + +#define for_each_btree_key_reverse(_trans, _iter, _btree_id, \ + _start, _flags, _k, _do) \ +({ \ + int _ret3 = 0; \ + \ + bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \ + (_start), (_flags)); \ + \ + while (1) { \ + u32 _restart_count = bch2_trans_begin(_trans); \ + (_k) = bch2_btree_iter_peek_prev_type(&(_iter), (_flags));\ + if (!(_k).k) { \ + _ret3 = 0; \ + break; \ + } \ + \ + _ret3 = bkey_err(_k) ?: (_do); \ + if (bch2_err_matches(_ret3, BCH_ERR_transaction_restart))\ + continue; \ + if (_ret3) \ + break; \ + bch2_trans_verify_not_restarted(_trans, _restart_count);\ + if (!bch2_btree_iter_rewind(&(_iter))) \ + break; \ + } \ + \ + bch2_trans_iter_exit((_trans), &(_iter)); \ + _ret3; \ +}) + +#define for_each_btree_key_commit(_trans, _iter, _btree_id, \ + _start, _iter_flags, _k, \ + _disk_res, _journal_seq, _commit_flags,\ + _do) \ + for_each_btree_key2(_trans, _iter, _btree_id, _start, _iter_flags, _k,\ + (_do) ?: bch2_trans_commit(_trans, (_disk_res),\ + (_journal_seq), (_commit_flags))) + +#define for_each_btree_key_reverse_commit(_trans, _iter, _btree_id, \ + _start, _iter_flags, _k, \ + _disk_res, _journal_seq, _commit_flags,\ + _do) \ + for_each_btree_key_reverse(_trans, _iter, _btree_id, _start, _iter_flags, _k,\ + (_do) ?: bch2_trans_commit(_trans, (_disk_res),\ + (_journal_seq), (_commit_flags))) + +#define for_each_btree_key_upto_commit(_trans, _iter, _btree_id, \ + _start, _end, _iter_flags, _k, \ + _disk_res, _journal_seq, _commit_flags,\ + _do) \ + for_each_btree_key2_upto(_trans, _iter, _btree_id, _start, _end, _iter_flags, _k,\ + (_do) ?: bch2_trans_commit(_trans, (_disk_res),\ + (_journal_seq), (_commit_flags))) + +#define for_each_btree_key(_trans, _iter, _btree_id, \ + _start, _flags, _k, _ret) \ + for (bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \ + (_start), (_flags)); \ + (_k) = __bch2_btree_iter_peek_and_restart((_trans), &(_iter), _flags),\ + !((_ret) = bkey_err(_k)) && (_k).k; \ + bch2_btree_iter_advance(&(_iter))) + +#define for_each_btree_key_upto(_trans, _iter, _btree_id, \ + _start, _end, _flags, _k, _ret) \ + for (bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \ + (_start), (_flags)); \ + (_k) = __bch2_btree_iter_peek_upto_and_restart((_trans), \ + &(_iter), _end, _flags),\ + !((_ret) = bkey_err(_k)) && (_k).k; \ + bch2_btree_iter_advance(&(_iter))) + +#define for_each_btree_key_norestart(_trans, _iter, _btree_id, \ + _start, _flags, _k, _ret) \ + for (bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \ + (_start), (_flags)); \ + (_k) = bch2_btree_iter_peek_type(&(_iter), _flags), \ + !((_ret) = bkey_err(_k)) && (_k).k; \ + bch2_btree_iter_advance(&(_iter))) + +#define for_each_btree_key_upto_norestart(_trans, _iter, _btree_id, \ + _start, _end, _flags, _k, _ret) \ + for (bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \ + (_start), (_flags)); \ + (_k) = bch2_btree_iter_peek_upto_type(&(_iter), _end, _flags),\ + !((_ret) = bkey_err(_k)) && (_k).k; \ + bch2_btree_iter_advance(&(_iter))) + +#define for_each_btree_key_continue(_trans, _iter, _flags, _k, _ret) \ + for (; \ + (_k) = __bch2_btree_iter_peek_and_restart((_trans), &(_iter), _flags),\ + !((_ret) = bkey_err(_k)) && (_k).k; \ + bch2_btree_iter_advance(&(_iter))) + +#define for_each_btree_key_continue_norestart(_iter, _flags, _k, _ret) \ + for (; \ + (_k) = bch2_btree_iter_peek_type(&(_iter), _flags), \ + !((_ret) = bkey_err(_k)) && (_k).k; \ + bch2_btree_iter_advance(&(_iter))) + +#define for_each_btree_key_upto_continue_norestart(_iter, _end, _flags, _k, _ret)\ + for (; \ + (_k) = bch2_btree_iter_peek_upto_type(&(_iter), _end, _flags), \ + !((_ret) = bkey_err(_k)) && (_k).k; \ + bch2_btree_iter_advance(&(_iter))) + +#define drop_locks_do(_trans, _do) \ +({ \ + bch2_trans_unlock(_trans); \ + _do ?: bch2_trans_relock(_trans); \ +}) + +#define allocate_dropping_locks_errcode(_trans, _do) \ +({ \ + gfp_t _gfp = GFP_NOWAIT|__GFP_NOWARN; \ + int _ret = _do; \ + \ + if (bch2_err_matches(_ret, ENOMEM)) { \ + _gfp = GFP_KERNEL; \ + _ret = drop_locks_do(trans, _do); \ + } \ + _ret; \ +}) + +#define allocate_dropping_locks(_trans, _ret, _do) \ +({ \ + gfp_t _gfp = GFP_NOWAIT|__GFP_NOWARN; \ + typeof(_do) _p = _do; \ + \ + _ret = 0; \ + if (unlikely(!_p)) { \ + _gfp = GFP_KERNEL; \ + _ret = drop_locks_do(trans, ((_p = _do), 0)); \ + } \ + _p; \ +}) + +/* new multiple iterator interface: */ + +void bch2_trans_updates_to_text(struct printbuf *, struct btree_trans *); +void bch2_btree_path_to_text(struct printbuf *, struct btree_path *); +void bch2_trans_paths_to_text(struct printbuf *, struct btree_trans *); +void bch2_dump_trans_updates(struct btree_trans *); +void bch2_dump_trans_paths_updates(struct btree_trans *); + +struct btree_trans *__bch2_trans_get(struct bch_fs *, unsigned); +void bch2_trans_put(struct btree_trans *); + +extern const char *bch2_btree_transaction_fns[BCH_TRANSACTIONS_NR]; +unsigned bch2_trans_get_fn_idx(const char *); + +#define bch2_trans_get(_c) \ +({ \ + static unsigned trans_fn_idx; \ + \ + if (unlikely(!trans_fn_idx)) \ + trans_fn_idx = bch2_trans_get_fn_idx(__func__); \ + __bch2_trans_get(_c, trans_fn_idx); \ +}) + +void bch2_btree_trans_to_text(struct printbuf *, struct btree_trans *); + +void bch2_fs_btree_iter_exit(struct bch_fs *); +void bch2_fs_btree_iter_init_early(struct bch_fs *); +int bch2_fs_btree_iter_init(struct bch_fs *); + +#endif /* _BCACHEFS_BTREE_ITER_H */ diff --git a/fs/bcachefs/btree_journal_iter.c b/fs/bcachefs/btree_journal_iter.c new file mode 100644 index 000000000000..ec52f50d249d --- /dev/null +++ b/fs/bcachefs/btree_journal_iter.c @@ -0,0 +1,543 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" +#include "bset.h" +#include "btree_journal_iter.h" +#include "journal_io.h" + +#include <linux/sort.h> + +/* + * For managing keys we read from the journal: until journal replay works normal + * btree lookups need to be able to find and return keys from the journal where + * they overwrite what's in the btree, so we have a special iterator and + * operations for the regular btree iter code to use: + */ + +static int __journal_key_cmp(enum btree_id l_btree_id, + unsigned l_level, + struct bpos l_pos, + const struct journal_key *r) +{ + return (cmp_int(l_btree_id, r->btree_id) ?: + cmp_int(l_level, r->level) ?: + bpos_cmp(l_pos, r->k->k.p)); +} + +static int journal_key_cmp(const struct journal_key *l, const struct journal_key *r) +{ + return __journal_key_cmp(l->btree_id, l->level, l->k->k.p, r); +} + +static inline size_t idx_to_pos(struct journal_keys *keys, size_t idx) +{ + size_t gap_size = keys->size - keys->nr; + + if (idx >= keys->gap) + idx += gap_size; + return idx; +} + +static inline struct journal_key *idx_to_key(struct journal_keys *keys, size_t idx) +{ + return keys->d + idx_to_pos(keys, idx); +} + +static size_t __bch2_journal_key_search(struct journal_keys *keys, + enum btree_id id, unsigned level, + struct bpos pos) +{ + size_t l = 0, r = keys->nr, m; + + while (l < r) { + m = l + ((r - l) >> 1); + if (__journal_key_cmp(id, level, pos, idx_to_key(keys, m)) > 0) + l = m + 1; + else + r = m; + } + + BUG_ON(l < keys->nr && + __journal_key_cmp(id, level, pos, idx_to_key(keys, l)) > 0); + + BUG_ON(l && + __journal_key_cmp(id, level, pos, idx_to_key(keys, l - 1)) <= 0); + + return l; +} + +static size_t bch2_journal_key_search(struct journal_keys *keys, + enum btree_id id, unsigned level, + struct bpos pos) +{ + return idx_to_pos(keys, __bch2_journal_key_search(keys, id, level, pos)); +} + +struct bkey_i *bch2_journal_keys_peek_upto(struct bch_fs *c, enum btree_id btree_id, + unsigned level, struct bpos pos, + struct bpos end_pos, size_t *idx) +{ + struct journal_keys *keys = &c->journal_keys; + unsigned iters = 0; + struct journal_key *k; + + BUG_ON(*idx > keys->nr); +search: + if (!*idx) + *idx = __bch2_journal_key_search(keys, btree_id, level, pos); + + while ((k = *idx < keys->nr ? idx_to_key(keys, *idx) : NULL)) { + if (__journal_key_cmp(btree_id, level, end_pos, k) < 0) + return NULL; + + if (__journal_key_cmp(btree_id, level, pos, k) <= 0 && + !k->overwritten) + return k->k; + + (*idx)++; + iters++; + if (iters == 10) { + *idx = 0; + goto search; + } + } + + return NULL; +} + +struct bkey_i *bch2_journal_keys_peek_slot(struct bch_fs *c, enum btree_id btree_id, + unsigned level, struct bpos pos) +{ + size_t idx = 0; + + return bch2_journal_keys_peek_upto(c, btree_id, level, pos, pos, &idx); +} + +static void journal_iters_fix(struct bch_fs *c) +{ + struct journal_keys *keys = &c->journal_keys; + /* The key we just inserted is immediately before the gap: */ + size_t gap_end = keys->gap + (keys->size - keys->nr); + struct btree_and_journal_iter *iter; + + /* + * If an iterator points one after the key we just inserted, decrement + * the iterator so it points at the key we just inserted - if the + * decrement was unnecessary, bch2_btree_and_journal_iter_peek() will + * handle that: + */ + list_for_each_entry(iter, &c->journal_iters, journal.list) + if (iter->journal.idx == gap_end) + iter->journal.idx = keys->gap - 1; +} + +static void journal_iters_move_gap(struct bch_fs *c, size_t old_gap, size_t new_gap) +{ + struct journal_keys *keys = &c->journal_keys; + struct journal_iter *iter; + size_t gap_size = keys->size - keys->nr; + + list_for_each_entry(iter, &c->journal_iters, list) { + if (iter->idx > old_gap) + iter->idx -= gap_size; + if (iter->idx >= new_gap) + iter->idx += gap_size; + } +} + +int bch2_journal_key_insert_take(struct bch_fs *c, enum btree_id id, + unsigned level, struct bkey_i *k) +{ + struct journal_key n = { + .btree_id = id, + .level = level, + .k = k, + .allocated = true, + /* + * Ensure these keys are done last by journal replay, to unblock + * journal reclaim: + */ + .journal_seq = U32_MAX, + }; + struct journal_keys *keys = &c->journal_keys; + size_t idx = bch2_journal_key_search(keys, id, level, k->k.p); + + BUG_ON(test_bit(BCH_FS_RW, &c->flags)); + + if (idx < keys->size && + journal_key_cmp(&n, &keys->d[idx]) == 0) { + if (keys->d[idx].allocated) + kfree(keys->d[idx].k); + keys->d[idx] = n; + return 0; + } + + if (idx > keys->gap) + idx -= keys->size - keys->nr; + + if (keys->nr == keys->size) { + struct journal_keys new_keys = { + .nr = keys->nr, + .size = max_t(size_t, keys->size, 8) * 2, + }; + + new_keys.d = kvmalloc_array(new_keys.size, sizeof(new_keys.d[0]), GFP_KERNEL); + if (!new_keys.d) { + bch_err(c, "%s: error allocating new key array (size %zu)", + __func__, new_keys.size); + return -BCH_ERR_ENOMEM_journal_key_insert; + } + + /* Since @keys was full, there was no gap: */ + memcpy(new_keys.d, keys->d, sizeof(keys->d[0]) * keys->nr); + kvfree(keys->d); + keys->d = new_keys.d; + keys->nr = new_keys.nr; + keys->size = new_keys.size; + + /* And now the gap is at the end: */ + keys->gap = keys->nr; + } + + journal_iters_move_gap(c, keys->gap, idx); + + move_gap(keys->d, keys->nr, keys->size, keys->gap, idx); + keys->gap = idx; + + keys->nr++; + keys->d[keys->gap++] = n; + + journal_iters_fix(c); + + return 0; +} + +/* + * Can only be used from the recovery thread while we're still RO - can't be + * used once we've got RW, as journal_keys is at that point used by multiple + * threads: + */ +int bch2_journal_key_insert(struct bch_fs *c, enum btree_id id, + unsigned level, struct bkey_i *k) +{ + struct bkey_i *n; + int ret; + + n = kmalloc(bkey_bytes(&k->k), GFP_KERNEL); + if (!n) + return -BCH_ERR_ENOMEM_journal_key_insert; + + bkey_copy(n, k); + ret = bch2_journal_key_insert_take(c, id, level, n); + if (ret) + kfree(n); + return ret; +} + +int bch2_journal_key_delete(struct bch_fs *c, enum btree_id id, + unsigned level, struct bpos pos) +{ + struct bkey_i whiteout; + + bkey_init(&whiteout.k); + whiteout.k.p = pos; + + return bch2_journal_key_insert(c, id, level, &whiteout); +} + +void bch2_journal_key_overwritten(struct bch_fs *c, enum btree_id btree, + unsigned level, struct bpos pos) +{ + struct journal_keys *keys = &c->journal_keys; + size_t idx = bch2_journal_key_search(keys, btree, level, pos); + + if (idx < keys->size && + keys->d[idx].btree_id == btree && + keys->d[idx].level == level && + bpos_eq(keys->d[idx].k->k.p, pos)) + keys->d[idx].overwritten = true; +} + +static void bch2_journal_iter_advance(struct journal_iter *iter) +{ + if (iter->idx < iter->keys->size) { + iter->idx++; + if (iter->idx == iter->keys->gap) + iter->idx += iter->keys->size - iter->keys->nr; + } +} + +static struct bkey_s_c bch2_journal_iter_peek(struct journal_iter *iter) +{ + struct journal_key *k = iter->keys->d + iter->idx; + + while (k < iter->keys->d + iter->keys->size && + k->btree_id == iter->btree_id && + k->level == iter->level) { + if (!k->overwritten) + return bkey_i_to_s_c(k->k); + + bch2_journal_iter_advance(iter); + k = iter->keys->d + iter->idx; + } + + return bkey_s_c_null; +} + +static void bch2_journal_iter_exit(struct journal_iter *iter) +{ + list_del(&iter->list); +} + +static void bch2_journal_iter_init(struct bch_fs *c, + struct journal_iter *iter, + enum btree_id id, unsigned level, + struct bpos pos) +{ + iter->btree_id = id; + iter->level = level; + iter->keys = &c->journal_keys; + iter->idx = bch2_journal_key_search(&c->journal_keys, id, level, pos); +} + +static struct bkey_s_c bch2_journal_iter_peek_btree(struct btree_and_journal_iter *iter) +{ + return bch2_btree_node_iter_peek_unpack(&iter->node_iter, + iter->b, &iter->unpacked); +} + +static void bch2_journal_iter_advance_btree(struct btree_and_journal_iter *iter) +{ + bch2_btree_node_iter_advance(&iter->node_iter, iter->b); +} + +void bch2_btree_and_journal_iter_advance(struct btree_and_journal_iter *iter) +{ + if (bpos_eq(iter->pos, SPOS_MAX)) + iter->at_end = true; + else + iter->pos = bpos_successor(iter->pos); +} + +struct bkey_s_c bch2_btree_and_journal_iter_peek(struct btree_and_journal_iter *iter) +{ + struct bkey_s_c btree_k, journal_k, ret; +again: + if (iter->at_end) + return bkey_s_c_null; + + while ((btree_k = bch2_journal_iter_peek_btree(iter)).k && + bpos_lt(btree_k.k->p, iter->pos)) + bch2_journal_iter_advance_btree(iter); + + while ((journal_k = bch2_journal_iter_peek(&iter->journal)).k && + bpos_lt(journal_k.k->p, iter->pos)) + bch2_journal_iter_advance(&iter->journal); + + ret = journal_k.k && + (!btree_k.k || bpos_le(journal_k.k->p, btree_k.k->p)) + ? journal_k + : btree_k; + + if (ret.k && iter->b && bpos_gt(ret.k->p, iter->b->data->max_key)) + ret = bkey_s_c_null; + + if (ret.k) { + iter->pos = ret.k->p; + if (bkey_deleted(ret.k)) { + bch2_btree_and_journal_iter_advance(iter); + goto again; + } + } else { + iter->pos = SPOS_MAX; + iter->at_end = true; + } + + return ret; +} + +void bch2_btree_and_journal_iter_exit(struct btree_and_journal_iter *iter) +{ + bch2_journal_iter_exit(&iter->journal); +} + +void __bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *iter, + struct bch_fs *c, + struct btree *b, + struct btree_node_iter node_iter, + struct bpos pos) +{ + memset(iter, 0, sizeof(*iter)); + + iter->b = b; + iter->node_iter = node_iter; + bch2_journal_iter_init(c, &iter->journal, b->c.btree_id, b->c.level, pos); + INIT_LIST_HEAD(&iter->journal.list); + iter->pos = b->data->min_key; + iter->at_end = false; +} + +/* + * this version is used by btree_gc before filesystem has gone RW and + * multithreaded, so uses the journal_iters list: + */ +void bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *iter, + struct bch_fs *c, + struct btree *b) +{ + struct btree_node_iter node_iter; + + bch2_btree_node_iter_init_from_start(&node_iter, b); + __bch2_btree_and_journal_iter_init_node_iter(iter, c, b, node_iter, b->data->min_key); + list_add(&iter->journal.list, &c->journal_iters); +} + +/* sort and dedup all keys in the journal: */ + +void bch2_journal_entries_free(struct bch_fs *c) +{ + struct journal_replay **i; + struct genradix_iter iter; + + genradix_for_each(&c->journal_entries, iter, i) + if (*i) + kvpfree(*i, offsetof(struct journal_replay, j) + + vstruct_bytes(&(*i)->j)); + genradix_free(&c->journal_entries); +} + +/* + * When keys compare equal, oldest compares first: + */ +static int journal_sort_key_cmp(const void *_l, const void *_r) +{ + const struct journal_key *l = _l; + const struct journal_key *r = _r; + + return journal_key_cmp(l, r) ?: + cmp_int(l->journal_seq, r->journal_seq) ?: + cmp_int(l->journal_offset, r->journal_offset); +} + +void bch2_journal_keys_put(struct bch_fs *c) +{ + struct journal_keys *keys = &c->journal_keys; + struct journal_key *i; + + BUG_ON(atomic_read(&keys->ref) <= 0); + + if (!atomic_dec_and_test(&keys->ref)) + return; + + move_gap(keys->d, keys->nr, keys->size, keys->gap, keys->nr); + keys->gap = keys->nr; + + for (i = keys->d; i < keys->d + keys->nr; i++) + if (i->allocated) + kfree(i->k); + + kvfree(keys->d); + keys->d = NULL; + keys->nr = keys->gap = keys->size = 0; + + bch2_journal_entries_free(c); +} + +static void __journal_keys_sort(struct journal_keys *keys) +{ + struct journal_key *src, *dst; + + sort(keys->d, keys->nr, sizeof(keys->d[0]), journal_sort_key_cmp, NULL); + + src = dst = keys->d; + while (src < keys->d + keys->nr) { + while (src + 1 < keys->d + keys->nr && + src[0].btree_id == src[1].btree_id && + src[0].level == src[1].level && + bpos_eq(src[0].k->k.p, src[1].k->k.p)) + src++; + + *dst++ = *src++; + } + + keys->nr = dst - keys->d; +} + +int bch2_journal_keys_sort(struct bch_fs *c) +{ + struct genradix_iter iter; + struct journal_replay *i, **_i; + struct jset_entry *entry; + struct bkey_i *k; + struct journal_keys *keys = &c->journal_keys; + size_t nr_keys = 0, nr_read = 0; + + genradix_for_each(&c->journal_entries, iter, _i) { + i = *_i; + + if (!i || i->ignore) + continue; + + for_each_jset_key(k, entry, &i->j) + nr_keys++; + } + + if (!nr_keys) + return 0; + + keys->size = roundup_pow_of_two(nr_keys); + + keys->d = kvmalloc_array(keys->size, sizeof(keys->d[0]), GFP_KERNEL); + if (!keys->d) { + bch_err(c, "Failed to allocate buffer for sorted journal keys (%zu keys); trying slowpath", + nr_keys); + + do { + keys->size >>= 1; + keys->d = kvmalloc_array(keys->size, sizeof(keys->d[0]), GFP_KERNEL); + } while (!keys->d && keys->size > nr_keys / 8); + + if (!keys->d) { + bch_err(c, "Failed to allocate %zu size buffer for sorted journal keys; exiting", + keys->size); + return -BCH_ERR_ENOMEM_journal_keys_sort; + } + } + + genradix_for_each(&c->journal_entries, iter, _i) { + i = *_i; + + if (!i || i->ignore) + continue; + + cond_resched(); + + for_each_jset_key(k, entry, &i->j) { + if (keys->nr == keys->size) { + __journal_keys_sort(keys); + + if (keys->nr > keys->size * 7 / 8) { + bch_err(c, "Too many journal keys for slowpath; have %zu compacted, buf size %zu, processed %zu/%zu", + keys->nr, keys->size, nr_read, nr_keys); + return -BCH_ERR_ENOMEM_journal_keys_sort; + } + } + + keys->d[keys->nr++] = (struct journal_key) { + .btree_id = entry->btree_id, + .level = entry->level, + .k = k, + .journal_seq = le64_to_cpu(i->j.seq), + .journal_offset = k->_data - i->j._data, + }; + + nr_read++; + } + } + + __journal_keys_sort(keys); + keys->gap = keys->nr; + + bch_verbose(c, "Journal keys: %zu read, %zu after sorting and compacting", nr_keys, keys->nr); + return 0; +} diff --git a/fs/bcachefs/btree_journal_iter.h b/fs/bcachefs/btree_journal_iter.h new file mode 100644 index 000000000000..8ca4c100b2e3 --- /dev/null +++ b/fs/bcachefs/btree_journal_iter.h @@ -0,0 +1,65 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_BTREE_JOURNAL_ITER_H +#define _BCACHEFS_BTREE_JOURNAL_ITER_H + +struct journal_iter { + struct list_head list; + enum btree_id btree_id; + unsigned level; + size_t idx; + struct journal_keys *keys; +}; + +/* + * Iterate over keys in the btree, with keys from the journal overlaid on top: + */ + +struct btree_and_journal_iter { + struct btree *b; + struct btree_node_iter node_iter; + struct bkey unpacked; + + struct journal_iter journal; + struct bpos pos; + bool at_end; +}; + +struct bkey_i *bch2_journal_keys_peek_upto(struct bch_fs *, enum btree_id, + unsigned, struct bpos, struct bpos, size_t *); +struct bkey_i *bch2_journal_keys_peek_slot(struct bch_fs *, enum btree_id, + unsigned, struct bpos); + +int bch2_journal_key_insert_take(struct bch_fs *, enum btree_id, + unsigned, struct bkey_i *); +int bch2_journal_key_insert(struct bch_fs *, enum btree_id, + unsigned, struct bkey_i *); +int bch2_journal_key_delete(struct bch_fs *, enum btree_id, + unsigned, struct bpos); +void bch2_journal_key_overwritten(struct bch_fs *, enum btree_id, + unsigned, struct bpos); + +void bch2_btree_and_journal_iter_advance(struct btree_and_journal_iter *); +struct bkey_s_c bch2_btree_and_journal_iter_peek(struct btree_and_journal_iter *); + +void bch2_btree_and_journal_iter_exit(struct btree_and_journal_iter *); +void __bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *, + struct bch_fs *, struct btree *, + struct btree_node_iter, struct bpos); +void bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *, + struct bch_fs *, + struct btree *); + +void bch2_journal_keys_put(struct bch_fs *); + +static inline void bch2_journal_keys_put_initial(struct bch_fs *c) +{ + if (c->journal_keys.initial_ref_held) + bch2_journal_keys_put(c); + c->journal_keys.initial_ref_held = false; +} + +void bch2_journal_entries_free(struct bch_fs *); + +int bch2_journal_keys_sort(struct bch_fs *); + +#endif /* _BCACHEFS_BTREE_JOURNAL_ITER_H */ diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c new file mode 100644 index 000000000000..1b7a5668df7c --- /dev/null +++ b/fs/bcachefs/btree_key_cache.c @@ -0,0 +1,1074 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" +#include "btree_cache.h" +#include "btree_iter.h" +#include "btree_key_cache.h" +#include "btree_locking.h" +#include "btree_update.h" +#include "errcode.h" +#include "error.h" +#include "journal.h" +#include "journal_reclaim.h" +#include "trace.h" + +#include <linux/sched/mm.h> + +static inline bool btree_uses_pcpu_readers(enum btree_id id) +{ + return id == BTREE_ID_subvolumes; +} + +static struct kmem_cache *bch2_key_cache; + +static int bch2_btree_key_cache_cmp_fn(struct rhashtable_compare_arg *arg, + const void *obj) +{ + const struct bkey_cached *ck = obj; + const struct bkey_cached_key *key = arg->key; + + return ck->key.btree_id != key->btree_id || + !bpos_eq(ck->key.pos, key->pos); +} + +static const struct rhashtable_params bch2_btree_key_cache_params = { + .head_offset = offsetof(struct bkey_cached, hash), + .key_offset = offsetof(struct bkey_cached, key), + .key_len = sizeof(struct bkey_cached_key), + .obj_cmpfn = bch2_btree_key_cache_cmp_fn, +}; + +__flatten +inline struct bkey_cached * +bch2_btree_key_cache_find(struct bch_fs *c, enum btree_id btree_id, struct bpos pos) +{ + struct bkey_cached_key key = { + .btree_id = btree_id, + .pos = pos, + }; + + return rhashtable_lookup_fast(&c->btree_key_cache.table, &key, + bch2_btree_key_cache_params); +} + +static bool bkey_cached_lock_for_evict(struct bkey_cached *ck) +{ + if (!six_trylock_intent(&ck->c.lock)) + return false; + + if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) { + six_unlock_intent(&ck->c.lock); + return false; + } + + if (!six_trylock_write(&ck->c.lock)) { + six_unlock_intent(&ck->c.lock); + return false; + } + + return true; +} + +static void bkey_cached_evict(struct btree_key_cache *c, + struct bkey_cached *ck) +{ + BUG_ON(rhashtable_remove_fast(&c->table, &ck->hash, + bch2_btree_key_cache_params)); + memset(&ck->key, ~0, sizeof(ck->key)); + + atomic_long_dec(&c->nr_keys); +} + +static void bkey_cached_free(struct btree_key_cache *bc, + struct bkey_cached *ck) +{ + struct bch_fs *c = container_of(bc, struct bch_fs, btree_key_cache); + + BUG_ON(test_bit(BKEY_CACHED_DIRTY, &ck->flags)); + + ck->btree_trans_barrier_seq = + start_poll_synchronize_srcu(&c->btree_trans_barrier); + + if (ck->c.lock.readers) { + list_move_tail(&ck->list, &bc->freed_pcpu); + bc->nr_freed_pcpu++; + } else { + list_move_tail(&ck->list, &bc->freed_nonpcpu); + bc->nr_freed_nonpcpu++; + } + atomic_long_inc(&bc->nr_freed); + + kfree(ck->k); + ck->k = NULL; + ck->u64s = 0; + + six_unlock_write(&ck->c.lock); + six_unlock_intent(&ck->c.lock); +} + +#ifdef __KERNEL__ +static void __bkey_cached_move_to_freelist_ordered(struct btree_key_cache *bc, + struct bkey_cached *ck) +{ + struct bkey_cached *pos; + + bc->nr_freed_nonpcpu++; + + list_for_each_entry_reverse(pos, &bc->freed_nonpcpu, list) { + if (ULONG_CMP_GE(ck->btree_trans_barrier_seq, + pos->btree_trans_barrier_seq)) { + list_move(&ck->list, &pos->list); + return; + } + } + + list_move(&ck->list, &bc->freed_nonpcpu); +} +#endif + +static void bkey_cached_move_to_freelist(struct btree_key_cache *bc, + struct bkey_cached *ck) +{ + BUG_ON(test_bit(BKEY_CACHED_DIRTY, &ck->flags)); + + if (!ck->c.lock.readers) { +#ifdef __KERNEL__ + struct btree_key_cache_freelist *f; + bool freed = false; + + preempt_disable(); + f = this_cpu_ptr(bc->pcpu_freed); + + if (f->nr < ARRAY_SIZE(f->objs)) { + f->objs[f->nr++] = ck; + freed = true; + } + preempt_enable(); + + if (!freed) { + mutex_lock(&bc->lock); + preempt_disable(); + f = this_cpu_ptr(bc->pcpu_freed); + + while (f->nr > ARRAY_SIZE(f->objs) / 2) { + struct bkey_cached *ck2 = f->objs[--f->nr]; + + __bkey_cached_move_to_freelist_ordered(bc, ck2); + } + preempt_enable(); + + __bkey_cached_move_to_freelist_ordered(bc, ck); + mutex_unlock(&bc->lock); + } +#else + mutex_lock(&bc->lock); + list_move_tail(&ck->list, &bc->freed_nonpcpu); + bc->nr_freed_nonpcpu++; + mutex_unlock(&bc->lock); +#endif + } else { + mutex_lock(&bc->lock); + list_move_tail(&ck->list, &bc->freed_pcpu); + mutex_unlock(&bc->lock); + } +} + +static void bkey_cached_free_fast(struct btree_key_cache *bc, + struct bkey_cached *ck) +{ + struct bch_fs *c = container_of(bc, struct bch_fs, btree_key_cache); + + ck->btree_trans_barrier_seq = + start_poll_synchronize_srcu(&c->btree_trans_barrier); + + list_del_init(&ck->list); + atomic_long_inc(&bc->nr_freed); + + kfree(ck->k); + ck->k = NULL; + ck->u64s = 0; + + bkey_cached_move_to_freelist(bc, ck); + + six_unlock_write(&ck->c.lock); + six_unlock_intent(&ck->c.lock); +} + +static struct bkey_cached * +bkey_cached_alloc(struct btree_trans *trans, struct btree_path *path, + bool *was_new) +{ + struct bch_fs *c = trans->c; + struct btree_key_cache *bc = &c->btree_key_cache; + struct bkey_cached *ck = NULL; + bool pcpu_readers = btree_uses_pcpu_readers(path->btree_id); + int ret; + + if (!pcpu_readers) { +#ifdef __KERNEL__ + struct btree_key_cache_freelist *f; + + preempt_disable(); + f = this_cpu_ptr(bc->pcpu_freed); + if (f->nr) + ck = f->objs[--f->nr]; + preempt_enable(); + + if (!ck) { + mutex_lock(&bc->lock); + preempt_disable(); + f = this_cpu_ptr(bc->pcpu_freed); + + while (!list_empty(&bc->freed_nonpcpu) && + f->nr < ARRAY_SIZE(f->objs) / 2) { + ck = list_last_entry(&bc->freed_nonpcpu, struct bkey_cached, list); + list_del_init(&ck->list); + bc->nr_freed_nonpcpu--; + f->objs[f->nr++] = ck; + } + + ck = f->nr ? f->objs[--f->nr] : NULL; + preempt_enable(); + mutex_unlock(&bc->lock); + } +#else + mutex_lock(&bc->lock); + if (!list_empty(&bc->freed_nonpcpu)) { + ck = list_last_entry(&bc->freed_nonpcpu, struct bkey_cached, list); + list_del_init(&ck->list); + bc->nr_freed_nonpcpu--; + } + mutex_unlock(&bc->lock); +#endif + } else { + mutex_lock(&bc->lock); + if (!list_empty(&bc->freed_pcpu)) { + ck = list_last_entry(&bc->freed_pcpu, struct bkey_cached, list); + list_del_init(&ck->list); + } + mutex_unlock(&bc->lock); + } + + if (ck) { + ret = btree_node_lock_nopath(trans, &ck->c, SIX_LOCK_intent, _THIS_IP_); + if (unlikely(ret)) { + bkey_cached_move_to_freelist(bc, ck); + return ERR_PTR(ret); + } + + path->l[0].b = (void *) ck; + path->l[0].lock_seq = six_lock_seq(&ck->c.lock); + mark_btree_node_locked(trans, path, 0, BTREE_NODE_INTENT_LOCKED); + + ret = bch2_btree_node_lock_write(trans, path, &ck->c); + if (unlikely(ret)) { + btree_node_unlock(trans, path, 0); + bkey_cached_move_to_freelist(bc, ck); + return ERR_PTR(ret); + } + + return ck; + } + + ck = allocate_dropping_locks(trans, ret, + kmem_cache_zalloc(bch2_key_cache, _gfp)); + if (ret) { + kmem_cache_free(bch2_key_cache, ck); + return ERR_PTR(ret); + } + + if (!ck) + return NULL; + + INIT_LIST_HEAD(&ck->list); + bch2_btree_lock_init(&ck->c, pcpu_readers ? SIX_LOCK_INIT_PCPU : 0); + + ck->c.cached = true; + BUG_ON(!six_trylock_intent(&ck->c.lock)); + BUG_ON(!six_trylock_write(&ck->c.lock)); + *was_new = true; + return ck; +} + +static struct bkey_cached * +bkey_cached_reuse(struct btree_key_cache *c) +{ + struct bucket_table *tbl; + struct rhash_head *pos; + struct bkey_cached *ck; + unsigned i; + + mutex_lock(&c->lock); + rcu_read_lock(); + tbl = rht_dereference_rcu(c->table.tbl, &c->table); + for (i = 0; i < tbl->size; i++) + rht_for_each_entry_rcu(ck, pos, tbl, i, hash) { + if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags) && + bkey_cached_lock_for_evict(ck)) { + bkey_cached_evict(c, ck); + goto out; + } + } + ck = NULL; +out: + rcu_read_unlock(); + mutex_unlock(&c->lock); + return ck; +} + +static struct bkey_cached * +btree_key_cache_create(struct btree_trans *trans, struct btree_path *path) +{ + struct bch_fs *c = trans->c; + struct btree_key_cache *bc = &c->btree_key_cache; + struct bkey_cached *ck; + bool was_new = false; + + ck = bkey_cached_alloc(trans, path, &was_new); + if (IS_ERR(ck)) + return ck; + + if (unlikely(!ck)) { + ck = bkey_cached_reuse(bc); + if (unlikely(!ck)) { + bch_err(c, "error allocating memory for key cache item, btree %s", + bch2_btree_id_str(path->btree_id)); + return ERR_PTR(-BCH_ERR_ENOMEM_btree_key_cache_create); + } + + mark_btree_node_locked(trans, path, 0, BTREE_NODE_INTENT_LOCKED); + } + + ck->c.level = 0; + ck->c.btree_id = path->btree_id; + ck->key.btree_id = path->btree_id; + ck->key.pos = path->pos; + ck->valid = false; + ck->flags = 1U << BKEY_CACHED_ACCESSED; + + if (unlikely(rhashtable_lookup_insert_fast(&bc->table, + &ck->hash, + bch2_btree_key_cache_params))) { + /* We raced with another fill: */ + + if (likely(was_new)) { + six_unlock_write(&ck->c.lock); + six_unlock_intent(&ck->c.lock); + kfree(ck); + } else { + bkey_cached_free_fast(bc, ck); + } + + mark_btree_node_locked(trans, path, 0, BTREE_NODE_UNLOCKED); + return NULL; + } + + atomic_long_inc(&bc->nr_keys); + + six_unlock_write(&ck->c.lock); + + return ck; +} + +static int btree_key_cache_fill(struct btree_trans *trans, + struct btree_path *ck_path, + struct bkey_cached *ck) +{ + struct btree_iter iter; + struct bkey_s_c k; + unsigned new_u64s = 0; + struct bkey_i *new_k = NULL; + int ret; + + k = bch2_bkey_get_iter(trans, &iter, ck->key.btree_id, ck->key.pos, + BTREE_ITER_KEY_CACHE_FILL| + BTREE_ITER_CACHED_NOFILL); + ret = bkey_err(k); + if (ret) + goto err; + + if (!bch2_btree_node_relock(trans, ck_path, 0)) { + trace_and_count(trans->c, trans_restart_relock_key_cache_fill, trans, _THIS_IP_, ck_path); + ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_key_cache_fill); + goto err; + } + + /* + * bch2_varint_decode can read past the end of the buffer by at + * most 7 bytes (it won't be used): + */ + new_u64s = k.k->u64s + 1; + + /* + * Allocate some extra space so that the transaction commit path is less + * likely to have to reallocate, since that requires a transaction + * restart: + */ + new_u64s = min(256U, (new_u64s * 3) / 2); + + if (new_u64s > ck->u64s) { + new_u64s = roundup_pow_of_two(new_u64s); + new_k = kmalloc(new_u64s * sizeof(u64), GFP_NOWAIT|__GFP_NOWARN); + if (!new_k) { + bch2_trans_unlock(trans); + + new_k = kmalloc(new_u64s * sizeof(u64), GFP_KERNEL); + if (!new_k) { + bch_err(trans->c, "error allocating memory for key cache key, btree %s u64s %u", + bch2_btree_id_str(ck->key.btree_id), new_u64s); + ret = -BCH_ERR_ENOMEM_btree_key_cache_fill; + goto err; + } + + if (!bch2_btree_node_relock(trans, ck_path, 0)) { + kfree(new_k); + trace_and_count(trans->c, trans_restart_relock_key_cache_fill, trans, _THIS_IP_, ck_path); + ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_key_cache_fill); + goto err; + } + + ret = bch2_trans_relock(trans); + if (ret) { + kfree(new_k); + goto err; + } + } + } + + ret = bch2_btree_node_lock_write(trans, ck_path, &ck_path->l[0].b->c); + if (ret) { + kfree(new_k); + goto err; + } + + if (new_k) { + kfree(ck->k); + ck->u64s = new_u64s; + ck->k = new_k; + } + + bkey_reassemble(ck->k, k); + ck->valid = true; + bch2_btree_node_unlock_write(trans, ck_path, ck_path->l[0].b); + + /* We're not likely to need this iterator again: */ + set_btree_iter_dontneed(&iter); +err: + bch2_trans_iter_exit(trans, &iter); + return ret; +} + +static noinline int +bch2_btree_path_traverse_cached_slowpath(struct btree_trans *trans, struct btree_path *path, + unsigned flags) +{ + struct bch_fs *c = trans->c; + struct bkey_cached *ck; + int ret = 0; + + BUG_ON(path->level); + + path->l[1].b = NULL; + + if (bch2_btree_node_relock_notrace(trans, path, 0)) { + ck = (void *) path->l[0].b; + goto fill; + } +retry: + ck = bch2_btree_key_cache_find(c, path->btree_id, path->pos); + if (!ck) { + ck = btree_key_cache_create(trans, path); + ret = PTR_ERR_OR_ZERO(ck); + if (ret) + goto err; + if (!ck) + goto retry; + + mark_btree_node_locked(trans, path, 0, BTREE_NODE_INTENT_LOCKED); + path->locks_want = 1; + } else { + enum six_lock_type lock_want = __btree_lock_want(path, 0); + + ret = btree_node_lock(trans, path, (void *) ck, 0, + lock_want, _THIS_IP_); + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + goto err; + + BUG_ON(ret); + + if (ck->key.btree_id != path->btree_id || + !bpos_eq(ck->key.pos, path->pos)) { + six_unlock_type(&ck->c.lock, lock_want); + goto retry; + } + + mark_btree_node_locked(trans, path, 0, + (enum btree_node_locked_type) lock_want); + } + + path->l[0].lock_seq = six_lock_seq(&ck->c.lock); + path->l[0].b = (void *) ck; +fill: + path->uptodate = BTREE_ITER_UPTODATE; + + if (!ck->valid && !(flags & BTREE_ITER_CACHED_NOFILL)) { + /* + * Using the underscore version because we haven't set + * path->uptodate yet: + */ + if (!path->locks_want && + !__bch2_btree_path_upgrade(trans, path, 1, NULL)) { + trace_and_count(trans->c, trans_restart_key_cache_upgrade, trans, _THIS_IP_); + ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_key_cache_upgrade); + goto err; + } + + ret = btree_key_cache_fill(trans, path, ck); + if (ret) + goto err; + + ret = bch2_btree_path_relock(trans, path, _THIS_IP_); + if (ret) + goto err; + + path->uptodate = BTREE_ITER_UPTODATE; + } + + if (!test_bit(BKEY_CACHED_ACCESSED, &ck->flags)) + set_bit(BKEY_CACHED_ACCESSED, &ck->flags); + + BUG_ON(btree_node_locked_type(path, 0) != btree_lock_want(path, 0)); + BUG_ON(path->uptodate); + + return ret; +err: + path->uptodate = BTREE_ITER_NEED_TRAVERSE; + if (!bch2_err_matches(ret, BCH_ERR_transaction_restart)) { + btree_node_unlock(trans, path, 0); + path->l[0].b = ERR_PTR(ret); + } + return ret; +} + +int bch2_btree_path_traverse_cached(struct btree_trans *trans, struct btree_path *path, + unsigned flags) +{ + struct bch_fs *c = trans->c; + struct bkey_cached *ck; + int ret = 0; + + EBUG_ON(path->level); + + path->l[1].b = NULL; + + if (bch2_btree_node_relock_notrace(trans, path, 0)) { + ck = (void *) path->l[0].b; + goto fill; + } +retry: + ck = bch2_btree_key_cache_find(c, path->btree_id, path->pos); + if (!ck) { + return bch2_btree_path_traverse_cached_slowpath(trans, path, flags); + } else { + enum six_lock_type lock_want = __btree_lock_want(path, 0); + + ret = btree_node_lock(trans, path, (void *) ck, 0, + lock_want, _THIS_IP_); + EBUG_ON(ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart)); + + if (ret) + return ret; + + if (ck->key.btree_id != path->btree_id || + !bpos_eq(ck->key.pos, path->pos)) { + six_unlock_type(&ck->c.lock, lock_want); + goto retry; + } + + mark_btree_node_locked(trans, path, 0, + (enum btree_node_locked_type) lock_want); + } + + path->l[0].lock_seq = six_lock_seq(&ck->c.lock); + path->l[0].b = (void *) ck; +fill: + if (!ck->valid) + return bch2_btree_path_traverse_cached_slowpath(trans, path, flags); + + if (!test_bit(BKEY_CACHED_ACCESSED, &ck->flags)) + set_bit(BKEY_CACHED_ACCESSED, &ck->flags); + + path->uptodate = BTREE_ITER_UPTODATE; + EBUG_ON(!ck->valid); + EBUG_ON(btree_node_locked_type(path, 0) != btree_lock_want(path, 0)); + + return ret; +} + +static int btree_key_cache_flush_pos(struct btree_trans *trans, + struct bkey_cached_key key, + u64 journal_seq, + unsigned commit_flags, + bool evict) +{ + struct bch_fs *c = trans->c; + struct journal *j = &c->journal; + struct btree_iter c_iter, b_iter; + struct bkey_cached *ck = NULL; + int ret; + + bch2_trans_iter_init(trans, &b_iter, key.btree_id, key.pos, + BTREE_ITER_SLOTS| + BTREE_ITER_INTENT| + BTREE_ITER_ALL_SNAPSHOTS); + bch2_trans_iter_init(trans, &c_iter, key.btree_id, key.pos, + BTREE_ITER_CACHED| + BTREE_ITER_INTENT); + b_iter.flags &= ~BTREE_ITER_WITH_KEY_CACHE; + + ret = bch2_btree_iter_traverse(&c_iter); + if (ret) + goto out; + + ck = (void *) c_iter.path->l[0].b; + if (!ck) + goto out; + + if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags)) { + if (evict) + goto evict; + goto out; + } + + BUG_ON(!ck->valid); + + if (journal_seq && ck->journal.seq != journal_seq) + goto out; + + /* + * Since journal reclaim depends on us making progress here, and the + * allocator/copygc depend on journal reclaim making progress, we need + * to be using alloc reserves: + */ + ret = bch2_btree_iter_traverse(&b_iter) ?: + bch2_trans_update(trans, &b_iter, ck->k, + BTREE_UPDATE_KEY_CACHE_RECLAIM| + BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE| + BTREE_TRIGGER_NORUN) ?: + bch2_trans_commit(trans, NULL, NULL, + BTREE_INSERT_NOCHECK_RW| + BTREE_INSERT_NOFAIL| + (ck->journal.seq == journal_last_seq(j) + ? BCH_WATERMARK_reclaim + : 0)| + commit_flags); + + bch2_fs_fatal_err_on(ret && + !bch2_err_matches(ret, BCH_ERR_transaction_restart) && + !bch2_err_matches(ret, BCH_ERR_journal_reclaim_would_deadlock) && + !bch2_journal_error(j), c, + "error flushing key cache: %s", bch2_err_str(ret)); + if (ret) + goto out; + + bch2_journal_pin_drop(j, &ck->journal); + + BUG_ON(!btree_node_locked(c_iter.path, 0)); + + if (!evict) { + if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) { + clear_bit(BKEY_CACHED_DIRTY, &ck->flags); + atomic_long_dec(&c->btree_key_cache.nr_dirty); + } + } else { + struct btree_path *path2; +evict: + trans_for_each_path(trans, path2) + if (path2 != c_iter.path) + __bch2_btree_path_unlock(trans, path2); + + bch2_btree_node_lock_write_nofail(trans, c_iter.path, &ck->c); + + if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) { + clear_bit(BKEY_CACHED_DIRTY, &ck->flags); + atomic_long_dec(&c->btree_key_cache.nr_dirty); + } + + mark_btree_node_locked_noreset(c_iter.path, 0, BTREE_NODE_UNLOCKED); + bkey_cached_evict(&c->btree_key_cache, ck); + bkey_cached_free_fast(&c->btree_key_cache, ck); + } +out: + bch2_trans_iter_exit(trans, &b_iter); + bch2_trans_iter_exit(trans, &c_iter); + return ret; +} + +int bch2_btree_key_cache_journal_flush(struct journal *j, + struct journal_entry_pin *pin, u64 seq) +{ + struct bch_fs *c = container_of(j, struct bch_fs, journal); + struct bkey_cached *ck = + container_of(pin, struct bkey_cached, journal); + struct bkey_cached_key key; + struct btree_trans *trans = bch2_trans_get(c); + int srcu_idx = srcu_read_lock(&c->btree_trans_barrier); + int ret = 0; + + btree_node_lock_nopath_nofail(trans, &ck->c, SIX_LOCK_read); + key = ck->key; + + if (ck->journal.seq != seq || + !test_bit(BKEY_CACHED_DIRTY, &ck->flags)) { + six_unlock_read(&ck->c.lock); + goto unlock; + } + + if (ck->seq != seq) { + bch2_journal_pin_update(&c->journal, ck->seq, &ck->journal, + bch2_btree_key_cache_journal_flush); + six_unlock_read(&ck->c.lock); + goto unlock; + } + six_unlock_read(&ck->c.lock); + + ret = commit_do(trans, NULL, NULL, 0, + btree_key_cache_flush_pos(trans, key, seq, + BTREE_INSERT_JOURNAL_RECLAIM, false)); +unlock: + srcu_read_unlock(&c->btree_trans_barrier, srcu_idx); + + bch2_trans_put(trans); + return ret; +} + +/* + * Flush and evict a key from the key cache: + */ +int bch2_btree_key_cache_flush(struct btree_trans *trans, + enum btree_id id, struct bpos pos) +{ + struct bch_fs *c = trans->c; + struct bkey_cached_key key = { id, pos }; + + /* Fastpath - assume it won't be found: */ + if (!bch2_btree_key_cache_find(c, id, pos)) + return 0; + + return btree_key_cache_flush_pos(trans, key, 0, 0, true); +} + +bool bch2_btree_insert_key_cached(struct btree_trans *trans, + unsigned flags, + struct btree_insert_entry *insert_entry) +{ + struct bch_fs *c = trans->c; + struct bkey_cached *ck = (void *) insert_entry->path->l[0].b; + struct bkey_i *insert = insert_entry->k; + bool kick_reclaim = false; + + BUG_ON(insert->k.u64s > ck->u64s); + + bkey_copy(ck->k, insert); + ck->valid = true; + + if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags)) { + EBUG_ON(test_bit(BCH_FS_CLEAN_SHUTDOWN, &c->flags)); + set_bit(BKEY_CACHED_DIRTY, &ck->flags); + atomic_long_inc(&c->btree_key_cache.nr_dirty); + + if (bch2_nr_btree_keys_need_flush(c)) + kick_reclaim = true; + } + + /* + * To minimize lock contention, we only add the journal pin here and + * defer pin updates to the flush callback via ->seq. Be careful not to + * update ->seq on nojournal commits because we don't want to update the + * pin to a seq that doesn't include journal updates on disk. Otherwise + * we risk losing the update after a crash. + * + * The only exception is if the pin is not active in the first place. We + * have to add the pin because journal reclaim drives key cache + * flushing. The flush callback will not proceed unless ->seq matches + * the latest pin, so make sure it starts with a consistent value. + */ + if (!(insert_entry->flags & BTREE_UPDATE_NOJOURNAL) || + !journal_pin_active(&ck->journal)) { + ck->seq = trans->journal_res.seq; + } + bch2_journal_pin_add(&c->journal, trans->journal_res.seq, + &ck->journal, bch2_btree_key_cache_journal_flush); + + if (kick_reclaim) + journal_reclaim_kick(&c->journal); + return true; +} + +void bch2_btree_key_cache_drop(struct btree_trans *trans, + struct btree_path *path) +{ + struct bch_fs *c = trans->c; + struct bkey_cached *ck = (void *) path->l[0].b; + + BUG_ON(!ck->valid); + + /* + * We just did an update to the btree, bypassing the key cache: the key + * cache key is now stale and must be dropped, even if dirty: + */ + if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) { + clear_bit(BKEY_CACHED_DIRTY, &ck->flags); + atomic_long_dec(&c->btree_key_cache.nr_dirty); + bch2_journal_pin_drop(&c->journal, &ck->journal); + } + + ck->valid = false; +} + +static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink, + struct shrink_control *sc) +{ + struct bch_fs *c = shrink->private_data; + struct btree_key_cache *bc = &c->btree_key_cache; + struct bucket_table *tbl; + struct bkey_cached *ck, *t; + size_t scanned = 0, freed = 0, nr = sc->nr_to_scan; + unsigned start, flags; + int srcu_idx; + + mutex_lock(&bc->lock); + srcu_idx = srcu_read_lock(&c->btree_trans_barrier); + flags = memalloc_nofs_save(); + + /* + * Newest freed entries are at the end of the list - once we hit one + * that's too new to be freed, we can bail out: + */ + scanned += bc->nr_freed_nonpcpu; + + list_for_each_entry_safe(ck, t, &bc->freed_nonpcpu, list) { + if (!poll_state_synchronize_srcu(&c->btree_trans_barrier, + ck->btree_trans_barrier_seq)) + break; + + list_del(&ck->list); + six_lock_exit(&ck->c.lock); + kmem_cache_free(bch2_key_cache, ck); + atomic_long_dec(&bc->nr_freed); + freed++; + bc->nr_freed_nonpcpu--; + } + + if (scanned >= nr) + goto out; + + scanned += bc->nr_freed_pcpu; + + list_for_each_entry_safe(ck, t, &bc->freed_pcpu, list) { + if (!poll_state_synchronize_srcu(&c->btree_trans_barrier, + ck->btree_trans_barrier_seq)) + break; + + list_del(&ck->list); + six_lock_exit(&ck->c.lock); + kmem_cache_free(bch2_key_cache, ck); + atomic_long_dec(&bc->nr_freed); + freed++; + bc->nr_freed_pcpu--; + } + + if (scanned >= nr) + goto out; + + rcu_read_lock(); + tbl = rht_dereference_rcu(bc->table.tbl, &bc->table); + if (bc->shrink_iter >= tbl->size) + bc->shrink_iter = 0; + start = bc->shrink_iter; + + do { + struct rhash_head *pos, *next; + + pos = rht_ptr_rcu(rht_bucket(tbl, bc->shrink_iter)); + + while (!rht_is_a_nulls(pos)) { + next = rht_dereference_bucket_rcu(pos->next, tbl, bc->shrink_iter); + ck = container_of(pos, struct bkey_cached, hash); + + if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) + goto next; + + if (test_bit(BKEY_CACHED_ACCESSED, &ck->flags)) + clear_bit(BKEY_CACHED_ACCESSED, &ck->flags); + else if (bkey_cached_lock_for_evict(ck)) { + bkey_cached_evict(bc, ck); + bkey_cached_free(bc, ck); + } + + scanned++; + if (scanned >= nr) + break; +next: + pos = next; + } + + bc->shrink_iter++; + if (bc->shrink_iter >= tbl->size) + bc->shrink_iter = 0; + } while (scanned < nr && bc->shrink_iter != start); + + rcu_read_unlock(); +out: + memalloc_nofs_restore(flags); + srcu_read_unlock(&c->btree_trans_barrier, srcu_idx); + mutex_unlock(&bc->lock); + + return freed; +} + +static unsigned long bch2_btree_key_cache_count(struct shrinker *shrink, + struct shrink_control *sc) +{ + struct bch_fs *c = shrink->private_data; + struct btree_key_cache *bc = &c->btree_key_cache; + long nr = atomic_long_read(&bc->nr_keys) - + atomic_long_read(&bc->nr_dirty); + + return max(0L, nr); +} + +void bch2_fs_btree_key_cache_exit(struct btree_key_cache *bc) +{ + struct bch_fs *c = container_of(bc, struct bch_fs, btree_key_cache); + struct bucket_table *tbl; + struct bkey_cached *ck, *n; + struct rhash_head *pos; + LIST_HEAD(items); + unsigned i; +#ifdef __KERNEL__ + int cpu; +#endif + + shrinker_free(bc->shrink); + + mutex_lock(&bc->lock); + + /* + * The loop is needed to guard against racing with rehash: + */ + while (atomic_long_read(&bc->nr_keys)) { + rcu_read_lock(); + tbl = rht_dereference_rcu(bc->table.tbl, &bc->table); + if (tbl) + for (i = 0; i < tbl->size; i++) + rht_for_each_entry_rcu(ck, pos, tbl, i, hash) { + bkey_cached_evict(bc, ck); + list_add(&ck->list, &items); + } + rcu_read_unlock(); + } + +#ifdef __KERNEL__ + for_each_possible_cpu(cpu) { + struct btree_key_cache_freelist *f = + per_cpu_ptr(bc->pcpu_freed, cpu); + + for (i = 0; i < f->nr; i++) { + ck = f->objs[i]; + list_add(&ck->list, &items); + } + } +#endif + + BUG_ON(list_count_nodes(&bc->freed_pcpu) != bc->nr_freed_pcpu); + BUG_ON(list_count_nodes(&bc->freed_nonpcpu) != bc->nr_freed_nonpcpu); + + list_splice(&bc->freed_pcpu, &items); + list_splice(&bc->freed_nonpcpu, &items); + + mutex_unlock(&bc->lock); + + list_for_each_entry_safe(ck, n, &items, list) { + cond_resched(); + + list_del(&ck->list); + kfree(ck->k); + six_lock_exit(&ck->c.lock); + kmem_cache_free(bch2_key_cache, ck); + } + + if (atomic_long_read(&bc->nr_dirty) && + !bch2_journal_error(&c->journal) && + test_bit(BCH_FS_WAS_RW, &c->flags)) + panic("btree key cache shutdown error: nr_dirty nonzero (%li)\n", + atomic_long_read(&bc->nr_dirty)); + + if (atomic_long_read(&bc->nr_keys)) + panic("btree key cache shutdown error: nr_keys nonzero (%li)\n", + atomic_long_read(&bc->nr_keys)); + + if (bc->table_init_done) + rhashtable_destroy(&bc->table); + + free_percpu(bc->pcpu_freed); +} + +void bch2_fs_btree_key_cache_init_early(struct btree_key_cache *c) +{ + mutex_init(&c->lock); + INIT_LIST_HEAD(&c->freed_pcpu); + INIT_LIST_HEAD(&c->freed_nonpcpu); +} + +int bch2_fs_btree_key_cache_init(struct btree_key_cache *bc) +{ + struct bch_fs *c = container_of(bc, struct bch_fs, btree_key_cache); + struct shrinker *shrink; + +#ifdef __KERNEL__ + bc->pcpu_freed = alloc_percpu(struct btree_key_cache_freelist); + if (!bc->pcpu_freed) + return -BCH_ERR_ENOMEM_fs_btree_cache_init; +#endif + + if (rhashtable_init(&bc->table, &bch2_btree_key_cache_params)) + return -BCH_ERR_ENOMEM_fs_btree_cache_init; + + bc->table_init_done = true; + + shrink = shrinker_alloc(0, "%s-btree_key_cache", c->name); + if (!shrink) + return -BCH_ERR_ENOMEM_fs_btree_cache_init; + bc->shrink = shrink; + shrink->seeks = 0; + shrink->count_objects = bch2_btree_key_cache_count; + shrink->scan_objects = bch2_btree_key_cache_scan; + shrink->private_data = c; + shrinker_register(shrink); + return 0; +} + +void bch2_btree_key_cache_to_text(struct printbuf *out, struct btree_key_cache *c) +{ + prt_printf(out, "nr_freed:\t%lu", atomic_long_read(&c->nr_freed)); + prt_newline(out); + prt_printf(out, "nr_keys:\t%lu", atomic_long_read(&c->nr_keys)); + prt_newline(out); + prt_printf(out, "nr_dirty:\t%lu", atomic_long_read(&c->nr_dirty)); + prt_newline(out); +} + +void bch2_btree_key_cache_exit(void) +{ + kmem_cache_destroy(bch2_key_cache); +} + +int __init bch2_btree_key_cache_init(void) +{ + bch2_key_cache = KMEM_CACHE(bkey_cached, SLAB_RECLAIM_ACCOUNT); + if (!bch2_key_cache) + return -ENOMEM; + + return 0; +} diff --git a/fs/bcachefs/btree_key_cache.h b/fs/bcachefs/btree_key_cache.h new file mode 100644 index 000000000000..be3acde2caa0 --- /dev/null +++ b/fs/bcachefs/btree_key_cache.h @@ -0,0 +1,48 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_BTREE_KEY_CACHE_H +#define _BCACHEFS_BTREE_KEY_CACHE_H + +static inline size_t bch2_nr_btree_keys_need_flush(struct bch_fs *c) +{ + size_t nr_dirty = atomic_long_read(&c->btree_key_cache.nr_dirty); + size_t nr_keys = atomic_long_read(&c->btree_key_cache.nr_keys); + size_t max_dirty = 1024 + nr_keys / 2; + + return max_t(ssize_t, 0, nr_dirty - max_dirty); +} + +static inline bool bch2_btree_key_cache_must_wait(struct bch_fs *c) +{ + size_t nr_dirty = atomic_long_read(&c->btree_key_cache.nr_dirty); + size_t nr_keys = atomic_long_read(&c->btree_key_cache.nr_keys); + size_t max_dirty = 4096 + (nr_keys * 3) / 4; + + return nr_dirty > max_dirty; +} + +int bch2_btree_key_cache_journal_flush(struct journal *, + struct journal_entry_pin *, u64); + +struct bkey_cached * +bch2_btree_key_cache_find(struct bch_fs *, enum btree_id, struct bpos); + +int bch2_btree_path_traverse_cached(struct btree_trans *, struct btree_path *, + unsigned); + +bool bch2_btree_insert_key_cached(struct btree_trans *, unsigned, + struct btree_insert_entry *); +int bch2_btree_key_cache_flush(struct btree_trans *, + enum btree_id, struct bpos); +void bch2_btree_key_cache_drop(struct btree_trans *, + struct btree_path *); + +void bch2_fs_btree_key_cache_exit(struct btree_key_cache *); +void bch2_fs_btree_key_cache_init_early(struct btree_key_cache *); +int bch2_fs_btree_key_cache_init(struct btree_key_cache *); + +void bch2_btree_key_cache_to_text(struct printbuf *, struct btree_key_cache *); + +void bch2_btree_key_cache_exit(void); +int __init bch2_btree_key_cache_init(void); + +#endif /* _BCACHEFS_BTREE_KEY_CACHE_H */ diff --git a/fs/bcachefs/btree_key_cache_types.h b/fs/bcachefs/btree_key_cache_types.h new file mode 100644 index 000000000000..290e4e57df5b --- /dev/null +++ b/fs/bcachefs/btree_key_cache_types.h @@ -0,0 +1,34 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_BTREE_KEY_CACHE_TYPES_H +#define _BCACHEFS_BTREE_KEY_CACHE_TYPES_H + +struct btree_key_cache_freelist { + struct bkey_cached *objs[16]; + unsigned nr; +}; + +struct btree_key_cache { + struct mutex lock; + struct rhashtable table; + bool table_init_done; + + struct list_head freed_pcpu; + size_t nr_freed_pcpu; + struct list_head freed_nonpcpu; + size_t nr_freed_nonpcpu; + + struct shrinker *shrink; + unsigned shrink_iter; + struct btree_key_cache_freelist __percpu *pcpu_freed; + + atomic_long_t nr_freed; + atomic_long_t nr_keys; + atomic_long_t nr_dirty; +}; + +struct bkey_cached_key { + u32 btree_id; + struct bpos pos; +} __packed __aligned(4); + +#endif /* _BCACHEFS_BTREE_KEY_CACHE_TYPES_H */ diff --git a/fs/bcachefs/btree_locking.c b/fs/bcachefs/btree_locking.c new file mode 100644 index 000000000000..3d48834d091f --- /dev/null +++ b/fs/bcachefs/btree_locking.c @@ -0,0 +1,817 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" +#include "btree_locking.h" +#include "btree_types.h" + +static struct lock_class_key bch2_btree_node_lock_key; + +void bch2_btree_lock_init(struct btree_bkey_cached_common *b, + enum six_lock_init_flags flags) +{ + __six_lock_init(&b->lock, "b->c.lock", &bch2_btree_node_lock_key, flags); + lockdep_set_novalidate_class(&b->lock); +} + +#ifdef CONFIG_LOCKDEP +void bch2_assert_btree_nodes_not_locked(void) +{ +#if 0 + //Re-enable when lock_class_is_held() is merged: + BUG_ON(lock_class_is_held(&bch2_btree_node_lock_key)); +#endif +} +#endif + +/* Btree node locking: */ + +struct six_lock_count bch2_btree_node_lock_counts(struct btree_trans *trans, + struct btree_path *skip, + struct btree_bkey_cached_common *b, + unsigned level) +{ + struct btree_path *path; + struct six_lock_count ret; + + memset(&ret, 0, sizeof(ret)); + + if (IS_ERR_OR_NULL(b)) + return ret; + + trans_for_each_path(trans, path) + if (path != skip && &path->l[level].b->c == b) { + int t = btree_node_locked_type(path, level); + + if (t != BTREE_NODE_UNLOCKED) + ret.n[t]++; + } + + return ret; +} + +/* unlock */ + +void bch2_btree_node_unlock_write(struct btree_trans *trans, + struct btree_path *path, struct btree *b) +{ + bch2_btree_node_unlock_write_inlined(trans, path, b); +} + +/* lock */ + +/* + * @trans wants to lock @b with type @type + */ +struct trans_waiting_for_lock { + struct btree_trans *trans; + struct btree_bkey_cached_common *node_want; + enum six_lock_type lock_want; + + /* for iterating over held locks :*/ + u8 path_idx; + u8 level; + u64 lock_start_time; +}; + +struct lock_graph { + struct trans_waiting_for_lock g[8]; + unsigned nr; +}; + +static noinline void print_cycle(struct printbuf *out, struct lock_graph *g) +{ + struct trans_waiting_for_lock *i; + + prt_printf(out, "Found lock cycle (%u entries):", g->nr); + prt_newline(out); + + for (i = g->g; i < g->g + g->nr; i++) + bch2_btree_trans_to_text(out, i->trans); +} + +static noinline void print_chain(struct printbuf *out, struct lock_graph *g) +{ + struct trans_waiting_for_lock *i; + + for (i = g->g; i != g->g + g->nr; i++) { + if (i != g->g) + prt_str(out, "<- "); + prt_printf(out, "%u ", i->trans->locking_wait.task->pid); + } + prt_newline(out); +} + +static void lock_graph_up(struct lock_graph *g) +{ + closure_put(&g->g[--g->nr].trans->ref); +} + +static noinline void lock_graph_pop_all(struct lock_graph *g) +{ + while (g->nr) + lock_graph_up(g); +} + +static void __lock_graph_down(struct lock_graph *g, struct btree_trans *trans) +{ + g->g[g->nr++] = (struct trans_waiting_for_lock) { + .trans = trans, + .node_want = trans->locking, + .lock_want = trans->locking_wait.lock_want, + }; +} + +static void lock_graph_down(struct lock_graph *g, struct btree_trans *trans) +{ + closure_get(&trans->ref); + __lock_graph_down(g, trans); +} + +static bool lock_graph_remove_non_waiters(struct lock_graph *g) +{ + struct trans_waiting_for_lock *i; + + for (i = g->g + 1; i < g->g + g->nr; i++) + if (i->trans->locking != i->node_want || + i->trans->locking_wait.start_time != i[-1].lock_start_time) { + while (g->g + g->nr > i) + lock_graph_up(g); + return true; + } + + return false; +} + +static int abort_lock(struct lock_graph *g, struct trans_waiting_for_lock *i) +{ + if (i == g->g) { + trace_and_count(i->trans->c, trans_restart_would_deadlock, i->trans, _RET_IP_); + return btree_trans_restart(i->trans, BCH_ERR_transaction_restart_would_deadlock); + } else { + i->trans->lock_must_abort = true; + wake_up_process(i->trans->locking_wait.task); + return 0; + } +} + +static int btree_trans_abort_preference(struct btree_trans *trans) +{ + if (trans->lock_may_not_fail) + return 0; + if (trans->locking_wait.lock_want == SIX_LOCK_write) + return 1; + if (!trans->in_traverse_all) + return 2; + return 3; +} + +static noinline int break_cycle(struct lock_graph *g, struct printbuf *cycle) +{ + struct trans_waiting_for_lock *i, *abort = NULL; + unsigned best = 0, pref; + int ret; + + if (lock_graph_remove_non_waiters(g)) + return 0; + + /* Only checking, for debugfs: */ + if (cycle) { + print_cycle(cycle, g); + ret = -1; + goto out; + } + + for (i = g->g; i < g->g + g->nr; i++) { + pref = btree_trans_abort_preference(i->trans); + if (pref > best) { + abort = i; + best = pref; + } + } + + if (unlikely(!best)) { + struct printbuf buf = PRINTBUF; + + prt_printf(&buf, bch2_fmt(g->g->trans->c, "cycle of nofail locks")); + + for (i = g->g; i < g->g + g->nr; i++) { + struct btree_trans *trans = i->trans; + + bch2_btree_trans_to_text(&buf, trans); + + prt_printf(&buf, "backtrace:"); + prt_newline(&buf); + printbuf_indent_add(&buf, 2); + bch2_prt_task_backtrace(&buf, trans->locking_wait.task); + printbuf_indent_sub(&buf, 2); + prt_newline(&buf); + } + + bch2_print_string_as_lines(KERN_ERR, buf.buf); + printbuf_exit(&buf); + BUG(); + } + + ret = abort_lock(g, abort); +out: + if (ret) + while (g->nr) + lock_graph_up(g); + return ret; +} + +static int lock_graph_descend(struct lock_graph *g, struct btree_trans *trans, + struct printbuf *cycle) +{ + struct btree_trans *orig_trans = g->g->trans; + struct trans_waiting_for_lock *i; + + for (i = g->g; i < g->g + g->nr; i++) + if (i->trans == trans) { + closure_put(&trans->ref); + return break_cycle(g, cycle); + } + + if (g->nr == ARRAY_SIZE(g->g)) { + closure_put(&trans->ref); + + if (orig_trans->lock_may_not_fail) + return 0; + + while (g->nr) + lock_graph_up(g); + + if (cycle) + return 0; + + trace_and_count(trans->c, trans_restart_would_deadlock_recursion_limit, trans, _RET_IP_); + return btree_trans_restart(orig_trans, BCH_ERR_transaction_restart_deadlock_recursion_limit); + } + + __lock_graph_down(g, trans); + return 0; +} + +static bool lock_type_conflicts(enum six_lock_type t1, enum six_lock_type t2) +{ + return t1 + t2 > 1; +} + +int bch2_check_for_deadlock(struct btree_trans *trans, struct printbuf *cycle) +{ + struct lock_graph g; + struct trans_waiting_for_lock *top; + struct btree_bkey_cached_common *b; + struct btree_path *path; + unsigned path_idx; + int ret; + + if (trans->lock_must_abort) { + if (cycle) + return -1; + + trace_and_count(trans->c, trans_restart_would_deadlock, trans, _RET_IP_); + return btree_trans_restart(trans, BCH_ERR_transaction_restart_would_deadlock); + } + + g.nr = 0; + lock_graph_down(&g, trans); +next: + if (!g.nr) + return 0; + + top = &g.g[g.nr - 1]; + + trans_for_each_path_safe_from(top->trans, path, path_idx, top->path_idx) { + if (!path->nodes_locked) + continue; + + if (path_idx != top->path_idx) { + top->path_idx = path_idx; + top->level = 0; + top->lock_start_time = 0; + } + + for (; + top->level < BTREE_MAX_DEPTH; + top->level++, top->lock_start_time = 0) { + int lock_held = btree_node_locked_type(path, top->level); + + if (lock_held == BTREE_NODE_UNLOCKED) + continue; + + b = &READ_ONCE(path->l[top->level].b)->c; + + if (IS_ERR_OR_NULL(b)) { + /* + * If we get here, it means we raced with the + * other thread updating its btree_path + * structures - which means it can't be blocked + * waiting on a lock: + */ + if (!lock_graph_remove_non_waiters(&g)) { + /* + * If lock_graph_remove_non_waiters() + * didn't do anything, it must be + * because we're being called by debugfs + * checking for lock cycles, which + * invokes us on btree_transactions that + * aren't actually waiting on anything. + * Just bail out: + */ + lock_graph_pop_all(&g); + } + + goto next; + } + + if (list_empty_careful(&b->lock.wait_list)) + continue; + + raw_spin_lock(&b->lock.wait_lock); + list_for_each_entry(trans, &b->lock.wait_list, locking_wait.list) { + BUG_ON(b != trans->locking); + + if (top->lock_start_time && + time_after_eq64(top->lock_start_time, trans->locking_wait.start_time)) + continue; + + top->lock_start_time = trans->locking_wait.start_time; + + /* Don't check for self deadlock: */ + if (trans == top->trans || + !lock_type_conflicts(lock_held, trans->locking_wait.lock_want)) + continue; + + closure_get(&trans->ref); + raw_spin_unlock(&b->lock.wait_lock); + + ret = lock_graph_descend(&g, trans, cycle); + if (ret) + return ret; + goto next; + + } + raw_spin_unlock(&b->lock.wait_lock); + } + } + + if (g.nr > 1 && cycle) + print_chain(cycle, &g); + lock_graph_up(&g); + goto next; +} + +int bch2_six_check_for_deadlock(struct six_lock *lock, void *p) +{ + struct btree_trans *trans = p; + + return bch2_check_for_deadlock(trans, NULL); +} + +int __bch2_btree_node_lock_write(struct btree_trans *trans, struct btree_path *path, + struct btree_bkey_cached_common *b, + bool lock_may_not_fail) +{ + int readers = bch2_btree_node_lock_counts(trans, NULL, b, b->level).n[SIX_LOCK_read]; + int ret; + + /* + * Must drop our read locks before calling six_lock_write() - + * six_unlock() won't do wakeups until the reader count + * goes to 0, and it's safe because we have the node intent + * locked: + */ + six_lock_readers_add(&b->lock, -readers); + ret = __btree_node_lock_nopath(trans, b, SIX_LOCK_write, + lock_may_not_fail, _RET_IP_); + six_lock_readers_add(&b->lock, readers); + + if (ret) + mark_btree_node_locked_noreset(path, b->level, BTREE_NODE_INTENT_LOCKED); + + return ret; +} + +void bch2_btree_node_lock_write_nofail(struct btree_trans *trans, + struct btree_path *path, + struct btree_bkey_cached_common *b) +{ + struct btree_path *linked; + unsigned i; + int ret; + + /* + * XXX BIG FAT NOTICE + * + * Drop all read locks before taking a write lock: + * + * This is a hack, because bch2_btree_node_lock_write_nofail() is a + * hack - but by dropping read locks first, this should never fail, and + * we only use this in code paths where whatever read locks we've + * already taken are no longer needed: + */ + + trans_for_each_path(trans, linked) { + if (!linked->nodes_locked) + continue; + + for (i = 0; i < BTREE_MAX_DEPTH; i++) + if (btree_node_read_locked(linked, i)) { + btree_node_unlock(trans, linked, i); + btree_path_set_dirty(linked, BTREE_ITER_NEED_RELOCK); + } + } + + ret = __btree_node_lock_write(trans, path, b, true); + BUG_ON(ret); +} + +/* relock */ + +static inline bool btree_path_get_locks(struct btree_trans *trans, + struct btree_path *path, + bool upgrade, + struct get_locks_fail *f) +{ + unsigned l = path->level; + int fail_idx = -1; + + do { + if (!btree_path_node(path, l)) + break; + + if (!(upgrade + ? bch2_btree_node_upgrade(trans, path, l) + : bch2_btree_node_relock(trans, path, l))) { + fail_idx = l; + + if (f) { + f->l = l; + f->b = path->l[l].b; + } + } + + l++; + } while (l < path->locks_want); + + /* + * When we fail to get a lock, we have to ensure that any child nodes + * can't be relocked so bch2_btree_path_traverse has to walk back up to + * the node that we failed to relock: + */ + if (fail_idx >= 0) { + __bch2_btree_path_unlock(trans, path); + btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE); + + do { + path->l[fail_idx].b = upgrade + ? ERR_PTR(-BCH_ERR_no_btree_node_upgrade) + : ERR_PTR(-BCH_ERR_no_btree_node_relock); + --fail_idx; + } while (fail_idx >= 0); + } + + if (path->uptodate == BTREE_ITER_NEED_RELOCK) + path->uptodate = BTREE_ITER_UPTODATE; + + bch2_trans_verify_locks(trans); + + return path->uptodate < BTREE_ITER_NEED_RELOCK; +} + +bool __bch2_btree_node_relock(struct btree_trans *trans, + struct btree_path *path, unsigned level, + bool trace) +{ + struct btree *b = btree_path_node(path, level); + int want = __btree_lock_want(path, level); + + if (race_fault()) + goto fail; + + if (six_relock_type(&b->c.lock, want, path->l[level].lock_seq) || + (btree_node_lock_seq_matches(path, b, level) && + btree_node_lock_increment(trans, &b->c, level, want))) { + mark_btree_node_locked(trans, path, level, want); + return true; + } +fail: + if (trace && !trans->notrace_relock_fail) + trace_and_count(trans->c, btree_path_relock_fail, trans, _RET_IP_, path, level); + return false; +} + +/* upgrade */ + +bool bch2_btree_node_upgrade(struct btree_trans *trans, + struct btree_path *path, unsigned level) +{ + struct btree *b = path->l[level].b; + struct six_lock_count count = bch2_btree_node_lock_counts(trans, path, &b->c, level); + + if (!is_btree_node(path, level)) + return false; + + switch (btree_lock_want(path, level)) { + case BTREE_NODE_UNLOCKED: + BUG_ON(btree_node_locked(path, level)); + return true; + case BTREE_NODE_READ_LOCKED: + BUG_ON(btree_node_intent_locked(path, level)); + return bch2_btree_node_relock(trans, path, level); + case BTREE_NODE_INTENT_LOCKED: + break; + case BTREE_NODE_WRITE_LOCKED: + BUG(); + } + + if (btree_node_intent_locked(path, level)) + return true; + + if (race_fault()) + return false; + + if (btree_node_locked(path, level)) { + bool ret; + + six_lock_readers_add(&b->c.lock, -count.n[SIX_LOCK_read]); + ret = six_lock_tryupgrade(&b->c.lock); + six_lock_readers_add(&b->c.lock, count.n[SIX_LOCK_read]); + + if (ret) + goto success; + } else { + if (six_relock_type(&b->c.lock, SIX_LOCK_intent, path->l[level].lock_seq)) + goto success; + } + + /* + * Do we already have an intent lock via another path? If so, just bump + * lock count: + */ + if (btree_node_lock_seq_matches(path, b, level) && + btree_node_lock_increment(trans, &b->c, level, BTREE_NODE_INTENT_LOCKED)) { + btree_node_unlock(trans, path, level); + goto success; + } + + trace_and_count(trans->c, btree_path_upgrade_fail, trans, _RET_IP_, path, level); + return false; +success: + mark_btree_node_locked_noreset(path, level, BTREE_NODE_INTENT_LOCKED); + return true; +} + +/* Btree path locking: */ + +/* + * Only for btree_cache.c - only relocks intent locks + */ +int bch2_btree_path_relock_intent(struct btree_trans *trans, + struct btree_path *path) +{ + unsigned l; + + for (l = path->level; + l < path->locks_want && btree_path_node(path, l); + l++) { + if (!bch2_btree_node_relock(trans, path, l)) { + __bch2_btree_path_unlock(trans, path); + btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE); + trace_and_count(trans->c, trans_restart_relock_path_intent, trans, _RET_IP_, path); + return btree_trans_restart(trans, BCH_ERR_transaction_restart_relock_path_intent); + } + } + + return 0; +} + +__flatten +bool bch2_btree_path_relock_norestart(struct btree_trans *trans, + struct btree_path *path, unsigned long trace_ip) +{ + struct get_locks_fail f; + + return btree_path_get_locks(trans, path, false, &f); +} + +int __bch2_btree_path_relock(struct btree_trans *trans, + struct btree_path *path, unsigned long trace_ip) +{ + if (!bch2_btree_path_relock_norestart(trans, path, trace_ip)) { + trace_and_count(trans->c, trans_restart_relock_path, trans, trace_ip, path); + return btree_trans_restart(trans, BCH_ERR_transaction_restart_relock_path); + } + + return 0; +} + +bool bch2_btree_path_upgrade_noupgrade_sibs(struct btree_trans *trans, + struct btree_path *path, + unsigned new_locks_want, + struct get_locks_fail *f) +{ + EBUG_ON(path->locks_want >= new_locks_want); + + path->locks_want = new_locks_want; + + return btree_path_get_locks(trans, path, true, f); +} + +bool __bch2_btree_path_upgrade(struct btree_trans *trans, + struct btree_path *path, + unsigned new_locks_want, + struct get_locks_fail *f) +{ + struct btree_path *linked; + + if (bch2_btree_path_upgrade_noupgrade_sibs(trans, path, new_locks_want, f)) + return true; + + /* + * XXX: this is ugly - we'd prefer to not be mucking with other + * iterators in the btree_trans here. + * + * On failure to upgrade the iterator, setting iter->locks_want and + * calling get_locks() is sufficient to make bch2_btree_path_traverse() + * get the locks we want on transaction restart. + * + * But if this iterator was a clone, on transaction restart what we did + * to this iterator isn't going to be preserved. + * + * Possibly we could add an iterator field for the parent iterator when + * an iterator is a copy - for now, we'll just upgrade any other + * iterators with the same btree id. + * + * The code below used to be needed to ensure ancestor nodes get locked + * before interior nodes - now that's handled by + * bch2_btree_path_traverse_all(). + */ + if (!path->cached && !trans->in_traverse_all) + trans_for_each_path(trans, linked) + if (linked != path && + linked->cached == path->cached && + linked->btree_id == path->btree_id && + linked->locks_want < new_locks_want) { + linked->locks_want = new_locks_want; + btree_path_get_locks(trans, linked, true, NULL); + } + + return false; +} + +void __bch2_btree_path_downgrade(struct btree_trans *trans, + struct btree_path *path, + unsigned new_locks_want) +{ + unsigned l; + + if (trans->restarted) + return; + + EBUG_ON(path->locks_want < new_locks_want); + + path->locks_want = new_locks_want; + + while (path->nodes_locked && + (l = btree_path_highest_level_locked(path)) >= path->locks_want) { + if (l > path->level) { + btree_node_unlock(trans, path, l); + } else { + if (btree_node_intent_locked(path, l)) { + six_lock_downgrade(&path->l[l].b->c.lock); + mark_btree_node_locked_noreset(path, l, BTREE_NODE_READ_LOCKED); + } + break; + } + } + + bch2_btree_path_verify_locks(path); + + path->downgrade_seq++; + trace_path_downgrade(trans, _RET_IP_, path); +} + +/* Btree transaction locking: */ + +void bch2_trans_downgrade(struct btree_trans *trans) +{ + struct btree_path *path; + + if (trans->restarted) + return; + + trans_for_each_path(trans, path) + bch2_btree_path_downgrade(trans, path); +} + +int bch2_trans_relock(struct btree_trans *trans) +{ + struct btree_path *path; + + if (unlikely(trans->restarted)) + return -((int) trans->restarted); + + trans_for_each_path(trans, path) + if (path->should_be_locked && + !bch2_btree_path_relock_norestart(trans, path, _RET_IP_)) { + trace_and_count(trans->c, trans_restart_relock, trans, _RET_IP_, path); + return btree_trans_restart(trans, BCH_ERR_transaction_restart_relock); + } + return 0; +} + +int bch2_trans_relock_notrace(struct btree_trans *trans) +{ + struct btree_path *path; + + if (unlikely(trans->restarted)) + return -((int) trans->restarted); + + trans_for_each_path(trans, path) + if (path->should_be_locked && + !bch2_btree_path_relock_norestart(trans, path, _RET_IP_)) { + return btree_trans_restart(trans, BCH_ERR_transaction_restart_relock); + } + return 0; +} + +void bch2_trans_unlock_noassert(struct btree_trans *trans) +{ + struct btree_path *path; + + trans_for_each_path(trans, path) + __bch2_btree_path_unlock(trans, path); +} + +void bch2_trans_unlock(struct btree_trans *trans) +{ + struct btree_path *path; + + trans_for_each_path(trans, path) + __bch2_btree_path_unlock(trans, path); +} + +void bch2_trans_unlock_long(struct btree_trans *trans) +{ + bch2_trans_unlock(trans); + bch2_trans_srcu_unlock(trans); +} + +bool bch2_trans_locked(struct btree_trans *trans) +{ + struct btree_path *path; + + trans_for_each_path(trans, path) + if (path->nodes_locked) + return true; + return false; +} + +int __bch2_trans_mutex_lock(struct btree_trans *trans, + struct mutex *lock) +{ + int ret = drop_locks_do(trans, (mutex_lock(lock), 0)); + + if (ret) + mutex_unlock(lock); + return ret; +} + +/* Debug */ + +#ifdef CONFIG_BCACHEFS_DEBUG + +void bch2_btree_path_verify_locks(struct btree_path *path) +{ + unsigned l; + + if (!path->nodes_locked) { + BUG_ON(path->uptodate == BTREE_ITER_UPTODATE && + btree_path_node(path, path->level)); + return; + } + + for (l = 0; l < BTREE_MAX_DEPTH; l++) { + int want = btree_lock_want(path, l); + int have = btree_node_locked_type(path, l); + + BUG_ON(!is_btree_node(path, l) && have != BTREE_NODE_UNLOCKED); + + BUG_ON(is_btree_node(path, l) && + (want == BTREE_NODE_UNLOCKED || + have != BTREE_NODE_WRITE_LOCKED) && + want != have); + } +} + +void bch2_trans_verify_locks(struct btree_trans *trans) +{ + struct btree_path *path; + + trans_for_each_path(trans, path) + bch2_btree_path_verify_locks(path); +} + +#endif diff --git a/fs/bcachefs/btree_locking.h b/fs/bcachefs/btree_locking.h new file mode 100644 index 000000000000..11b0a2c8cd69 --- /dev/null +++ b/fs/bcachefs/btree_locking.h @@ -0,0 +1,433 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_BTREE_LOCKING_H +#define _BCACHEFS_BTREE_LOCKING_H + +/* + * Only for internal btree use: + * + * The btree iterator tracks what locks it wants to take, and what locks it + * currently has - here we have wrappers for locking/unlocking btree nodes and + * updating the iterator state + */ + +#include "btree_iter.h" +#include "six.h" + +void bch2_btree_lock_init(struct btree_bkey_cached_common *, enum six_lock_init_flags); + +#ifdef CONFIG_LOCKDEP +void bch2_assert_btree_nodes_not_locked(void); +#else +static inline void bch2_assert_btree_nodes_not_locked(void) {} +#endif + +void bch2_trans_unlock_noassert(struct btree_trans *); + +static inline bool is_btree_node(struct btree_path *path, unsigned l) +{ + return l < BTREE_MAX_DEPTH && !IS_ERR_OR_NULL(path->l[l].b); +} + +static inline struct btree_transaction_stats *btree_trans_stats(struct btree_trans *trans) +{ + return trans->fn_idx < ARRAY_SIZE(trans->c->btree_transaction_stats) + ? &trans->c->btree_transaction_stats[trans->fn_idx] + : NULL; +} + +/* matches six lock types */ +enum btree_node_locked_type { + BTREE_NODE_UNLOCKED = -1, + BTREE_NODE_READ_LOCKED = SIX_LOCK_read, + BTREE_NODE_INTENT_LOCKED = SIX_LOCK_intent, + BTREE_NODE_WRITE_LOCKED = SIX_LOCK_write, +}; + +static inline int btree_node_locked_type(struct btree_path *path, + unsigned level) +{ + return BTREE_NODE_UNLOCKED + ((path->nodes_locked >> (level << 1)) & 3); +} + +static inline bool btree_node_write_locked(struct btree_path *path, unsigned l) +{ + return btree_node_locked_type(path, l) == BTREE_NODE_WRITE_LOCKED; +} + +static inline bool btree_node_intent_locked(struct btree_path *path, unsigned l) +{ + return btree_node_locked_type(path, l) == BTREE_NODE_INTENT_LOCKED; +} + +static inline bool btree_node_read_locked(struct btree_path *path, unsigned l) +{ + return btree_node_locked_type(path, l) == BTREE_NODE_READ_LOCKED; +} + +static inline bool btree_node_locked(struct btree_path *path, unsigned level) +{ + return btree_node_locked_type(path, level) != BTREE_NODE_UNLOCKED; +} + +static inline void mark_btree_node_locked_noreset(struct btree_path *path, + unsigned level, + enum btree_node_locked_type type) +{ + /* relying on this to avoid a branch */ + BUILD_BUG_ON(SIX_LOCK_read != 0); + BUILD_BUG_ON(SIX_LOCK_intent != 1); + + path->nodes_locked &= ~(3U << (level << 1)); + path->nodes_locked |= (type + 1) << (level << 1); +} + +static inline void mark_btree_node_unlocked(struct btree_path *path, + unsigned level) +{ + EBUG_ON(btree_node_write_locked(path, level)); + mark_btree_node_locked_noreset(path, level, BTREE_NODE_UNLOCKED); +} + +static inline void mark_btree_node_locked(struct btree_trans *trans, + struct btree_path *path, + unsigned level, + enum btree_node_locked_type type) +{ + mark_btree_node_locked_noreset(path, level, (enum btree_node_locked_type) type); +#ifdef CONFIG_BCACHEFS_LOCK_TIME_STATS + path->l[level].lock_taken_time = local_clock(); +#endif +} + +static inline enum six_lock_type __btree_lock_want(struct btree_path *path, int level) +{ + return level < path->locks_want + ? SIX_LOCK_intent + : SIX_LOCK_read; +} + +static inline enum btree_node_locked_type +btree_lock_want(struct btree_path *path, int level) +{ + if (level < path->level) + return BTREE_NODE_UNLOCKED; + if (level < path->locks_want) + return BTREE_NODE_INTENT_LOCKED; + if (level == path->level) + return BTREE_NODE_READ_LOCKED; + return BTREE_NODE_UNLOCKED; +} + +static void btree_trans_lock_hold_time_update(struct btree_trans *trans, + struct btree_path *path, unsigned level) +{ +#ifdef CONFIG_BCACHEFS_LOCK_TIME_STATS + struct btree_transaction_stats *s = btree_trans_stats(trans); + + if (s) + __bch2_time_stats_update(&s->lock_hold_times, + path->l[level].lock_taken_time, + local_clock()); +#endif +} + +/* unlock: */ + +static inline void btree_node_unlock(struct btree_trans *trans, + struct btree_path *path, unsigned level) +{ + int lock_type = btree_node_locked_type(path, level); + + EBUG_ON(level >= BTREE_MAX_DEPTH); + + if (lock_type != BTREE_NODE_UNLOCKED) { + six_unlock_type(&path->l[level].b->c.lock, lock_type); + btree_trans_lock_hold_time_update(trans, path, level); + } + mark_btree_node_unlocked(path, level); +} + +static inline int btree_path_lowest_level_locked(struct btree_path *path) +{ + return __ffs(path->nodes_locked) >> 1; +} + +static inline int btree_path_highest_level_locked(struct btree_path *path) +{ + return __fls(path->nodes_locked) >> 1; +} + +static inline void __bch2_btree_path_unlock(struct btree_trans *trans, + struct btree_path *path) +{ + btree_path_set_dirty(path, BTREE_ITER_NEED_RELOCK); + + while (path->nodes_locked) + btree_node_unlock(trans, path, btree_path_lowest_level_locked(path)); +} + +/* + * Updates the saved lock sequence number, so that bch2_btree_node_relock() will + * succeed: + */ +static inline void +bch2_btree_node_unlock_write_inlined(struct btree_trans *trans, struct btree_path *path, + struct btree *b) +{ + struct btree_path *linked; + + EBUG_ON(path->l[b->c.level].b != b); + EBUG_ON(path->l[b->c.level].lock_seq != six_lock_seq(&b->c.lock)); + EBUG_ON(btree_node_locked_type(path, b->c.level) != SIX_LOCK_write); + + mark_btree_node_locked_noreset(path, b->c.level, BTREE_NODE_INTENT_LOCKED); + + trans_for_each_path_with_node(trans, b, linked) + linked->l[b->c.level].lock_seq++; + + six_unlock_write(&b->c.lock); +} + +void bch2_btree_node_unlock_write(struct btree_trans *, + struct btree_path *, struct btree *); + +int bch2_six_check_for_deadlock(struct six_lock *lock, void *p); + +/* lock: */ + +static inline int __btree_node_lock_nopath(struct btree_trans *trans, + struct btree_bkey_cached_common *b, + enum six_lock_type type, + bool lock_may_not_fail, + unsigned long ip) +{ + int ret; + + trans->lock_may_not_fail = lock_may_not_fail; + trans->lock_must_abort = false; + trans->locking = b; + + ret = six_lock_ip_waiter(&b->lock, type, &trans->locking_wait, + bch2_six_check_for_deadlock, trans, ip); + WRITE_ONCE(trans->locking, NULL); + WRITE_ONCE(trans->locking_wait.start_time, 0); + return ret; +} + +static inline int __must_check +btree_node_lock_nopath(struct btree_trans *trans, + struct btree_bkey_cached_common *b, + enum six_lock_type type, + unsigned long ip) +{ + return __btree_node_lock_nopath(trans, b, type, false, ip); +} + +static inline void btree_node_lock_nopath_nofail(struct btree_trans *trans, + struct btree_bkey_cached_common *b, + enum six_lock_type type) +{ + int ret = __btree_node_lock_nopath(trans, b, type, true, _THIS_IP_); + + BUG_ON(ret); +} + +/* + * Lock a btree node if we already have it locked on one of our linked + * iterators: + */ +static inline bool btree_node_lock_increment(struct btree_trans *trans, + struct btree_bkey_cached_common *b, + unsigned level, + enum btree_node_locked_type want) +{ + struct btree_path *path; + + trans_for_each_path(trans, path) + if (&path->l[level].b->c == b && + btree_node_locked_type(path, level) >= want) { + six_lock_increment(&b->lock, (enum six_lock_type) want); + return true; + } + + return false; +} + +static inline int btree_node_lock(struct btree_trans *trans, + struct btree_path *path, + struct btree_bkey_cached_common *b, + unsigned level, + enum six_lock_type type, + unsigned long ip) +{ + int ret = 0; + + EBUG_ON(level >= BTREE_MAX_DEPTH); + EBUG_ON(!(trans->paths_allocated & (1ULL << path->idx))); + + if (likely(six_trylock_type(&b->lock, type)) || + btree_node_lock_increment(trans, b, level, (enum btree_node_locked_type) type) || + !(ret = btree_node_lock_nopath(trans, b, type, btree_path_ip_allocated(path)))) { +#ifdef CONFIG_BCACHEFS_LOCK_TIME_STATS + path->l[b->level].lock_taken_time = local_clock(); +#endif + } + + return ret; +} + +int __bch2_btree_node_lock_write(struct btree_trans *, struct btree_path *, + struct btree_bkey_cached_common *b, bool); + +static inline int __btree_node_lock_write(struct btree_trans *trans, + struct btree_path *path, + struct btree_bkey_cached_common *b, + bool lock_may_not_fail) +{ + EBUG_ON(&path->l[b->level].b->c != b); + EBUG_ON(path->l[b->level].lock_seq != six_lock_seq(&b->lock)); + EBUG_ON(!btree_node_intent_locked(path, b->level)); + + /* + * six locks are unfair, and read locks block while a thread wants a + * write lock: thus, we need to tell the cycle detector we have a write + * lock _before_ taking the lock: + */ + mark_btree_node_locked_noreset(path, b->level, BTREE_NODE_WRITE_LOCKED); + + return likely(six_trylock_write(&b->lock)) + ? 0 + : __bch2_btree_node_lock_write(trans, path, b, lock_may_not_fail); +} + +static inline int __must_check +bch2_btree_node_lock_write(struct btree_trans *trans, + struct btree_path *path, + struct btree_bkey_cached_common *b) +{ + return __btree_node_lock_write(trans, path, b, false); +} + +void bch2_btree_node_lock_write_nofail(struct btree_trans *, + struct btree_path *, + struct btree_bkey_cached_common *); + +/* relock: */ + +bool bch2_btree_path_relock_norestart(struct btree_trans *, + struct btree_path *, unsigned long); +int __bch2_btree_path_relock(struct btree_trans *, + struct btree_path *, unsigned long); + +static inline int bch2_btree_path_relock(struct btree_trans *trans, + struct btree_path *path, unsigned long trace_ip) +{ + return btree_node_locked(path, path->level) + ? 0 + : __bch2_btree_path_relock(trans, path, trace_ip); +} + +bool __bch2_btree_node_relock(struct btree_trans *, struct btree_path *, unsigned, bool trace); + +static inline bool bch2_btree_node_relock(struct btree_trans *trans, + struct btree_path *path, unsigned level) +{ + EBUG_ON(btree_node_locked(path, level) && + !btree_node_write_locked(path, level) && + btree_node_locked_type(path, level) != __btree_lock_want(path, level)); + + return likely(btree_node_locked(path, level)) || + (!IS_ERR_OR_NULL(path->l[level].b) && + __bch2_btree_node_relock(trans, path, level, true)); +} + +static inline bool bch2_btree_node_relock_notrace(struct btree_trans *trans, + struct btree_path *path, unsigned level) +{ + EBUG_ON(btree_node_locked(path, level) && + !btree_node_write_locked(path, level) && + btree_node_locked_type(path, level) != __btree_lock_want(path, level)); + + return likely(btree_node_locked(path, level)) || + (!IS_ERR_OR_NULL(path->l[level].b) && + __bch2_btree_node_relock(trans, path, level, false)); +} + +/* upgrade */ + + +struct get_locks_fail { + unsigned l; + struct btree *b; +}; + +bool bch2_btree_path_upgrade_noupgrade_sibs(struct btree_trans *, + struct btree_path *, unsigned, + struct get_locks_fail *); + +bool __bch2_btree_path_upgrade(struct btree_trans *, + struct btree_path *, unsigned, + struct get_locks_fail *); + +static inline int bch2_btree_path_upgrade(struct btree_trans *trans, + struct btree_path *path, + unsigned new_locks_want) +{ + struct get_locks_fail f; + unsigned old_locks_want = path->locks_want; + + new_locks_want = min(new_locks_want, BTREE_MAX_DEPTH); + + if (path->locks_want < new_locks_want + ? __bch2_btree_path_upgrade(trans, path, new_locks_want, &f) + : path->uptodate == BTREE_ITER_UPTODATE) + return 0; + + trace_and_count(trans->c, trans_restart_upgrade, trans, _THIS_IP_, path, + old_locks_want, new_locks_want, &f); + return btree_trans_restart(trans, BCH_ERR_transaction_restart_upgrade); +} + +/* misc: */ + +static inline void btree_path_set_should_be_locked(struct btree_path *path) +{ + EBUG_ON(!btree_node_locked(path, path->level)); + EBUG_ON(path->uptodate); + + path->should_be_locked = true; +} + +static inline void __btree_path_set_level_up(struct btree_trans *trans, + struct btree_path *path, + unsigned l) +{ + btree_node_unlock(trans, path, l); + path->l[l].b = ERR_PTR(-BCH_ERR_no_btree_node_up); +} + +static inline void btree_path_set_level_up(struct btree_trans *trans, + struct btree_path *path) +{ + __btree_path_set_level_up(trans, path, path->level++); + btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE); +} + +/* debug */ + +struct six_lock_count bch2_btree_node_lock_counts(struct btree_trans *, + struct btree_path *, + struct btree_bkey_cached_common *b, + unsigned); + +int bch2_check_for_deadlock(struct btree_trans *, struct printbuf *); + +#ifdef CONFIG_BCACHEFS_DEBUG +void bch2_btree_path_verify_locks(struct btree_path *); +void bch2_trans_verify_locks(struct btree_trans *); +#else +static inline void bch2_btree_path_verify_locks(struct btree_path *path) {} +static inline void bch2_trans_verify_locks(struct btree_trans *trans) {} +#endif + +#endif /* _BCACHEFS_BTREE_LOCKING_H */ diff --git a/fs/bcachefs/btree_trans_commit.c b/fs/bcachefs/btree_trans_commit.c new file mode 100644 index 000000000000..12907beda98c --- /dev/null +++ b/fs/bcachefs/btree_trans_commit.c @@ -0,0 +1,1162 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" +#include "btree_gc.h" +#include "btree_io.h" +#include "btree_iter.h" +#include "btree_journal_iter.h" +#include "btree_key_cache.h" +#include "btree_update_interior.h" +#include "btree_write_buffer.h" +#include "buckets.h" +#include "errcode.h" +#include "error.h" +#include "journal.h" +#include "journal_reclaim.h" +#include "replicas.h" +#include "snapshot.h" + +#include <linux/prefetch.h> + +static void verify_update_old_key(struct btree_trans *trans, struct btree_insert_entry *i) +{ +#ifdef CONFIG_BCACHEFS_DEBUG + struct bch_fs *c = trans->c; + struct bkey u; + struct bkey_s_c k = bch2_btree_path_peek_slot_exact(i->path, &u); + + if (unlikely(trans->journal_replay_not_finished)) { + struct bkey_i *j_k = + bch2_journal_keys_peek_slot(c, i->btree_id, i->level, i->k->k.p); + + if (j_k) + k = bkey_i_to_s_c(j_k); + } + + u = *k.k; + u.needs_whiteout = i->old_k.needs_whiteout; + + BUG_ON(memcmp(&i->old_k, &u, sizeof(struct bkey))); + BUG_ON(i->old_v != k.v); +#endif +} + +static inline struct btree_path_level *insert_l(struct btree_insert_entry *i) +{ + return i->path->l + i->level; +} + +static inline bool same_leaf_as_prev(struct btree_trans *trans, + struct btree_insert_entry *i) +{ + return i != trans->updates && + insert_l(&i[0])->b == insert_l(&i[-1])->b; +} + +static inline bool same_leaf_as_next(struct btree_trans *trans, + struct btree_insert_entry *i) +{ + return i + 1 < trans->updates + trans->nr_updates && + insert_l(&i[0])->b == insert_l(&i[1])->b; +} + +inline void bch2_btree_node_prep_for_write(struct btree_trans *trans, + struct btree_path *path, + struct btree *b) +{ + struct bch_fs *c = trans->c; + + if (unlikely(btree_node_just_written(b)) && + bch2_btree_post_write_cleanup(c, b)) + bch2_trans_node_reinit_iter(trans, b); + + /* + * If the last bset has been written, or if it's gotten too big - start + * a new bset to insert into: + */ + if (want_new_bset(c, b)) + bch2_btree_init_next(trans, b); +} + +static noinline int trans_lock_write_fail(struct btree_trans *trans, struct btree_insert_entry *i) +{ + while (--i >= trans->updates) { + if (same_leaf_as_prev(trans, i)) + continue; + + bch2_btree_node_unlock_write(trans, i->path, insert_l(i)->b); + } + + trace_and_count(trans->c, trans_restart_would_deadlock_write, trans); + return btree_trans_restart(trans, BCH_ERR_transaction_restart_would_deadlock_write); +} + +static inline int bch2_trans_lock_write(struct btree_trans *trans) +{ + struct btree_insert_entry *i; + + EBUG_ON(trans->write_locked); + + trans_for_each_update(trans, i) { + if (same_leaf_as_prev(trans, i)) + continue; + + if (bch2_btree_node_lock_write(trans, i->path, &insert_l(i)->b->c)) + return trans_lock_write_fail(trans, i); + + if (!i->cached) + bch2_btree_node_prep_for_write(trans, i->path, insert_l(i)->b); + } + + trans->write_locked = true; + return 0; +} + +static inline void bch2_trans_unlock_write(struct btree_trans *trans) +{ + if (likely(trans->write_locked)) { + struct btree_insert_entry *i; + + trans_for_each_update(trans, i) + if (!same_leaf_as_prev(trans, i)) + bch2_btree_node_unlock_write_inlined(trans, i->path, + insert_l(i)->b); + trans->write_locked = false; + } +} + +/* Inserting into a given leaf node (last stage of insert): */ + +/* Handle overwrites and do insert, for non extents: */ +bool bch2_btree_bset_insert_key(struct btree_trans *trans, + struct btree_path *path, + struct btree *b, + struct btree_node_iter *node_iter, + struct bkey_i *insert) +{ + struct bkey_packed *k; + unsigned clobber_u64s = 0, new_u64s = 0; + + EBUG_ON(btree_node_just_written(b)); + EBUG_ON(bset_written(b, btree_bset_last(b))); + EBUG_ON(bkey_deleted(&insert->k) && bkey_val_u64s(&insert->k)); + EBUG_ON(bpos_lt(insert->k.p, b->data->min_key)); + EBUG_ON(bpos_gt(insert->k.p, b->data->max_key)); + EBUG_ON(insert->k.u64s > + bch_btree_keys_u64s_remaining(trans->c, b)); + EBUG_ON(!b->c.level && !bpos_eq(insert->k.p, path->pos)); + + k = bch2_btree_node_iter_peek_all(node_iter, b); + if (k && bkey_cmp_left_packed(b, k, &insert->k.p)) + k = NULL; + + /* @k is the key being overwritten/deleted, if any: */ + EBUG_ON(k && bkey_deleted(k)); + + /* Deleting, but not found? nothing to do: */ + if (bkey_deleted(&insert->k) && !k) + return false; + + if (bkey_deleted(&insert->k)) { + /* Deleting: */ + btree_account_key_drop(b, k); + k->type = KEY_TYPE_deleted; + + if (k->needs_whiteout) + push_whiteout(trans->c, b, insert->k.p); + k->needs_whiteout = false; + + if (k >= btree_bset_last(b)->start) { + clobber_u64s = k->u64s; + bch2_bset_delete(b, k, clobber_u64s); + goto fix_iter; + } else { + bch2_btree_path_fix_key_modified(trans, b, k); + } + + return true; + } + + if (k) { + /* Overwriting: */ + btree_account_key_drop(b, k); + k->type = KEY_TYPE_deleted; + + insert->k.needs_whiteout = k->needs_whiteout; + k->needs_whiteout = false; + + if (k >= btree_bset_last(b)->start) { + clobber_u64s = k->u64s; + goto overwrite; + } else { + bch2_btree_path_fix_key_modified(trans, b, k); + } + } + + k = bch2_btree_node_iter_bset_pos(node_iter, b, bset_tree_last(b)); +overwrite: + bch2_bset_insert(b, node_iter, k, insert, clobber_u64s); + new_u64s = k->u64s; +fix_iter: + if (clobber_u64s != new_u64s) + bch2_btree_node_iter_fix(trans, path, b, node_iter, k, + clobber_u64s, new_u64s); + return true; +} + +static int __btree_node_flush(struct journal *j, struct journal_entry_pin *pin, + unsigned i, u64 seq) +{ + struct bch_fs *c = container_of(j, struct bch_fs, journal); + struct btree_write *w = container_of(pin, struct btree_write, journal); + struct btree *b = container_of(w, struct btree, writes[i]); + struct btree_trans *trans = bch2_trans_get(c); + unsigned long old, new, v; + unsigned idx = w - b->writes; + + btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_read); + v = READ_ONCE(b->flags); + + do { + old = new = v; + + if (!(old & (1 << BTREE_NODE_dirty)) || + !!(old & (1 << BTREE_NODE_write_idx)) != idx || + w->journal.seq != seq) + break; + + new &= ~BTREE_WRITE_TYPE_MASK; + new |= BTREE_WRITE_journal_reclaim; + new |= 1 << BTREE_NODE_need_write; + } while ((v = cmpxchg(&b->flags, old, new)) != old); + + btree_node_write_if_need(c, b, SIX_LOCK_read); + six_unlock_read(&b->c.lock); + + bch2_trans_put(trans); + return 0; +} + +int bch2_btree_node_flush0(struct journal *j, struct journal_entry_pin *pin, u64 seq) +{ + return __btree_node_flush(j, pin, 0, seq); +} + +int bch2_btree_node_flush1(struct journal *j, struct journal_entry_pin *pin, u64 seq) +{ + return __btree_node_flush(j, pin, 1, seq); +} + +inline void bch2_btree_add_journal_pin(struct bch_fs *c, + struct btree *b, u64 seq) +{ + struct btree_write *w = btree_current_write(b); + + bch2_journal_pin_add(&c->journal, seq, &w->journal, + btree_node_write_idx(b) == 0 + ? bch2_btree_node_flush0 + : bch2_btree_node_flush1); +} + +/** + * bch2_btree_insert_key_leaf() - insert a key one key into a leaf node + * @trans: btree transaction object + * @path: path pointing to @insert's pos + * @insert: key to insert + * @journal_seq: sequence number of journal reservation + */ +inline void bch2_btree_insert_key_leaf(struct btree_trans *trans, + struct btree_path *path, + struct bkey_i *insert, + u64 journal_seq) +{ + struct bch_fs *c = trans->c; + struct btree *b = path_l(path)->b; + struct bset_tree *t = bset_tree_last(b); + struct bset *i = bset(b, t); + int old_u64s = bset_u64s(t); + int old_live_u64s = b->nr.live_u64s; + int live_u64s_added, u64s_added; + + if (unlikely(!bch2_btree_bset_insert_key(trans, path, b, + &path_l(path)->iter, insert))) + return; + + i->journal_seq = cpu_to_le64(max(journal_seq, le64_to_cpu(i->journal_seq))); + + bch2_btree_add_journal_pin(c, b, journal_seq); + + if (unlikely(!btree_node_dirty(b))) { + EBUG_ON(test_bit(BCH_FS_CLEAN_SHUTDOWN, &c->flags)); + set_btree_node_dirty_acct(c, b); + } + + live_u64s_added = (int) b->nr.live_u64s - old_live_u64s; + u64s_added = (int) bset_u64s(t) - old_u64s; + + if (b->sib_u64s[0] != U16_MAX && live_u64s_added < 0) + b->sib_u64s[0] = max(0, (int) b->sib_u64s[0] + live_u64s_added); + if (b->sib_u64s[1] != U16_MAX && live_u64s_added < 0) + b->sib_u64s[1] = max(0, (int) b->sib_u64s[1] + live_u64s_added); + + if (u64s_added > live_u64s_added && + bch2_maybe_compact_whiteouts(c, b)) + bch2_trans_node_reinit_iter(trans, b); +} + +/* Cached btree updates: */ + +/* Normal update interface: */ + +static inline void btree_insert_entry_checks(struct btree_trans *trans, + struct btree_insert_entry *i) +{ + BUG_ON(!bpos_eq(i->k->k.p, i->path->pos)); + BUG_ON(i->cached != i->path->cached); + BUG_ON(i->level != i->path->level); + BUG_ON(i->btree_id != i->path->btree_id); + EBUG_ON(!i->level && + btree_type_has_snapshots(i->btree_id) && + !(i->flags & BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) && + test_bit(JOURNAL_REPLAY_DONE, &trans->c->journal.flags) && + i->k->k.p.snapshot && + bch2_snapshot_is_internal_node(trans->c, i->k->k.p.snapshot)); +} + +static __always_inline int bch2_trans_journal_res_get(struct btree_trans *trans, + unsigned flags) +{ + return bch2_journal_res_get(&trans->c->journal, &trans->journal_res, + trans->journal_u64s, flags); +} + +#define JSET_ENTRY_LOG_U64s 4 + +static noinline void journal_transaction_name(struct btree_trans *trans) +{ + struct bch_fs *c = trans->c; + struct journal *j = &c->journal; + struct jset_entry *entry = + bch2_journal_add_entry(j, &trans->journal_res, + BCH_JSET_ENTRY_log, 0, 0, + JSET_ENTRY_LOG_U64s); + struct jset_entry_log *l = + container_of(entry, struct jset_entry_log, entry); + + strncpy(l->d, trans->fn, JSET_ENTRY_LOG_U64s * sizeof(u64)); +} + +static inline int btree_key_can_insert(struct btree_trans *trans, + struct btree *b, unsigned u64s) +{ + struct bch_fs *c = trans->c; + + if (!bch2_btree_node_insert_fits(c, b, u64s)) + return -BCH_ERR_btree_insert_btree_node_full; + + return 0; +} + +noinline static int +btree_key_can_insert_cached_slowpath(struct btree_trans *trans, unsigned flags, + struct btree_path *path, unsigned new_u64s) +{ + struct bch_fs *c = trans->c; + struct btree_insert_entry *i; + struct bkey_cached *ck = (void *) path->l[0].b; + struct bkey_i *new_k; + int ret; + + bch2_trans_unlock_write(trans); + bch2_trans_unlock(trans); + + new_k = kmalloc(new_u64s * sizeof(u64), GFP_KERNEL); + if (!new_k) { + bch_err(c, "error allocating memory for key cache key, btree %s u64s %u", + bch2_btree_id_str(path->btree_id), new_u64s); + return -BCH_ERR_ENOMEM_btree_key_cache_insert; + } + + ret = bch2_trans_relock(trans) ?: + bch2_trans_lock_write(trans); + if (unlikely(ret)) { + kfree(new_k); + return ret; + } + + memcpy(new_k, ck->k, ck->u64s * sizeof(u64)); + + trans_for_each_update(trans, i) + if (i->old_v == &ck->k->v) + i->old_v = &new_k->v; + + kfree(ck->k); + ck->u64s = new_u64s; + ck->k = new_k; + return 0; +} + +static int btree_key_can_insert_cached(struct btree_trans *trans, unsigned flags, + struct btree_path *path, unsigned u64s) +{ + struct bch_fs *c = trans->c; + struct bkey_cached *ck = (void *) path->l[0].b; + struct btree_insert_entry *i; + unsigned new_u64s; + struct bkey_i *new_k; + + EBUG_ON(path->level); + + if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags) && + bch2_btree_key_cache_must_wait(c) && + !(flags & BTREE_INSERT_JOURNAL_RECLAIM)) + return -BCH_ERR_btree_insert_need_journal_reclaim; + + /* + * bch2_varint_decode can read past the end of the buffer by at most 7 + * bytes (it won't be used): + */ + u64s += 1; + + if (u64s <= ck->u64s) + return 0; + + new_u64s = roundup_pow_of_two(u64s); + new_k = krealloc(ck->k, new_u64s * sizeof(u64), GFP_NOWAIT); + if (unlikely(!new_k)) + return btree_key_can_insert_cached_slowpath(trans, flags, path, new_u64s); + + trans_for_each_update(trans, i) + if (i->old_v == &ck->k->v) + i->old_v = &new_k->v; + + ck->u64s = new_u64s; + ck->k = new_k; + return 0; +} + +/* Triggers: */ + +static int run_one_mem_trigger(struct btree_trans *trans, + struct btree_insert_entry *i, + unsigned flags) +{ + struct bkey_s_c old = { &i->old_k, i->old_v }; + struct bkey_i *new = i->k; + const struct bkey_ops *old_ops = bch2_bkey_type_ops(old.k->type); + const struct bkey_ops *new_ops = bch2_bkey_type_ops(i->k->k.type); + int ret; + + verify_update_old_key(trans, i); + + if (unlikely(flags & BTREE_TRIGGER_NORUN)) + return 0; + + if (!btree_node_type_needs_gc(__btree_node_type(i->level, i->btree_id))) + return 0; + + if (old_ops->atomic_trigger == new_ops->atomic_trigger) { + ret = bch2_mark_key(trans, i->btree_id, i->level, + old, bkey_i_to_s_c(new), + BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE|flags); + } else { + struct bkey _deleted = KEY(0, 0, 0); + struct bkey_s_c deleted = (struct bkey_s_c) { &_deleted, NULL }; + + _deleted.p = i->path->pos; + + ret = bch2_mark_key(trans, i->btree_id, i->level, + deleted, bkey_i_to_s_c(new), + BTREE_TRIGGER_INSERT|flags) ?: + bch2_mark_key(trans, i->btree_id, i->level, + old, deleted, + BTREE_TRIGGER_OVERWRITE|flags); + } + + return ret; +} + +static int run_one_trans_trigger(struct btree_trans *trans, struct btree_insert_entry *i, + bool overwrite) +{ + /* + * Transactional triggers create new btree_insert_entries, so we can't + * pass them a pointer to a btree_insert_entry, that memory is going to + * move: + */ + struct bkey old_k = i->old_k; + struct bkey_s_c old = { &old_k, i->old_v }; + const struct bkey_ops *old_ops = bch2_bkey_type_ops(old.k->type); + const struct bkey_ops *new_ops = bch2_bkey_type_ops(i->k->k.type); + + verify_update_old_key(trans, i); + + if ((i->flags & BTREE_TRIGGER_NORUN) || + !(BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS & (1U << i->bkey_type))) + return 0; + + if (!i->insert_trigger_run && + !i->overwrite_trigger_run && + old_ops->trans_trigger == new_ops->trans_trigger) { + i->overwrite_trigger_run = true; + i->insert_trigger_run = true; + return bch2_trans_mark_key(trans, i->btree_id, i->level, old, i->k, + BTREE_TRIGGER_INSERT| + BTREE_TRIGGER_OVERWRITE| + i->flags) ?: 1; + } else if (overwrite && !i->overwrite_trigger_run) { + i->overwrite_trigger_run = true; + return bch2_trans_mark_old(trans, i->btree_id, i->level, old, i->flags) ?: 1; + } else if (!overwrite && !i->insert_trigger_run) { + i->insert_trigger_run = true; + return bch2_trans_mark_new(trans, i->btree_id, i->level, i->k, i->flags) ?: 1; + } else { + return 0; + } +} + +static int run_btree_triggers(struct btree_trans *trans, enum btree_id btree_id, + struct btree_insert_entry *btree_id_start) +{ + struct btree_insert_entry *i; + bool trans_trigger_run; + int ret, overwrite; + + for (overwrite = 1; overwrite >= 0; --overwrite) { + + /* + * Running triggers will append more updates to the list of updates as + * we're walking it: + */ + do { + trans_trigger_run = false; + + for (i = btree_id_start; + i < trans->updates + trans->nr_updates && i->btree_id <= btree_id; + i++) { + if (i->btree_id != btree_id) + continue; + + ret = run_one_trans_trigger(trans, i, overwrite); + if (ret < 0) + return ret; + if (ret) + trans_trigger_run = true; + } + } while (trans_trigger_run); + } + + return 0; +} + +static int bch2_trans_commit_run_triggers(struct btree_trans *trans) +{ + struct btree_insert_entry *i = NULL, *btree_id_start = trans->updates; + unsigned btree_id = 0; + int ret = 0; + + /* + * + * For a given btree, this algorithm runs insert triggers before + * overwrite triggers: this is so that when extents are being moved + * (e.g. by FALLOCATE_FL_INSERT_RANGE), we don't drop references before + * they are re-added. + */ + for (btree_id = 0; btree_id < BTREE_ID_NR; btree_id++) { + if (btree_id == BTREE_ID_alloc) + continue; + + while (btree_id_start < trans->updates + trans->nr_updates && + btree_id_start->btree_id < btree_id) + btree_id_start++; + + ret = run_btree_triggers(trans, btree_id, btree_id_start); + if (ret) + return ret; + } + + trans_for_each_update(trans, i) { + if (i->btree_id > BTREE_ID_alloc) + break; + if (i->btree_id == BTREE_ID_alloc) { + ret = run_btree_triggers(trans, BTREE_ID_alloc, i); + if (ret) + return ret; + break; + } + } + +#ifdef CONFIG_BCACHEFS_DEBUG + trans_for_each_update(trans, i) + BUG_ON(!(i->flags & BTREE_TRIGGER_NORUN) && + (BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS & (1U << i->bkey_type)) && + (!i->insert_trigger_run || !i->overwrite_trigger_run)); +#endif + return 0; +} + +static noinline int bch2_trans_commit_run_gc_triggers(struct btree_trans *trans) +{ + struct bch_fs *c = trans->c; + struct btree_insert_entry *i; + int ret = 0; + + trans_for_each_update(trans, i) { + /* + * XXX: synchronization of cached update triggers with gc + * XXX: synchronization of interior node updates with gc + */ + BUG_ON(i->cached || i->level); + + if (gc_visited(c, gc_pos_btree_node(insert_l(i)->b))) { + ret = run_one_mem_trigger(trans, i, i->flags|BTREE_TRIGGER_GC); + if (ret) + break; + } + } + + return ret; +} + +static inline int +bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags, + struct btree_insert_entry **stopped_at, + unsigned long trace_ip) +{ + struct bch_fs *c = trans->c; + struct btree_insert_entry *i; + struct btree_write_buffered_key *wb; + struct btree_trans_commit_hook *h; + unsigned u64s = 0; + int ret; + + if (race_fault()) { + trace_and_count(c, trans_restart_fault_inject, trans, trace_ip); + return btree_trans_restart_nounlock(trans, BCH_ERR_transaction_restart_fault_inject); + } + + /* + * Check if the insert will fit in the leaf node with the write lock + * held, otherwise another thread could write the node changing the + * amount of space available: + */ + + prefetch(&trans->c->journal.flags); + + trans_for_each_update(trans, i) { + /* Multiple inserts might go to same leaf: */ + if (!same_leaf_as_prev(trans, i)) + u64s = 0; + + u64s += i->k->k.u64s; + ret = !i->cached + ? btree_key_can_insert(trans, insert_l(i)->b, u64s) + : btree_key_can_insert_cached(trans, flags, i->path, u64s); + if (ret) { + *stopped_at = i; + return ret; + } + } + + if (trans->nr_wb_updates && + trans->nr_wb_updates + c->btree_write_buffer.state.nr > c->btree_write_buffer.size) + return -BCH_ERR_btree_insert_need_flush_buffer; + + /* + * Don't get journal reservation until after we know insert will + * succeed: + */ + if (likely(!(flags & BTREE_INSERT_JOURNAL_REPLAY))) { + ret = bch2_trans_journal_res_get(trans, + (flags & BCH_WATERMARK_MASK)| + JOURNAL_RES_GET_NONBLOCK); + if (ret) + return ret; + + if (unlikely(trans->journal_transaction_names)) + journal_transaction_name(trans); + } else { + trans->journal_res.seq = c->journal.replay_journal_seq; + } + + /* + * Not allowed to fail after we've gotten our journal reservation - we + * have to use it: + */ + + if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG) && + !(flags & BTREE_INSERT_JOURNAL_REPLAY)) { + if (bch2_journal_seq_verify) + trans_for_each_update(trans, i) + i->k->k.version.lo = trans->journal_res.seq; + else if (bch2_inject_invalid_keys) + trans_for_each_update(trans, i) + i->k->k.version = MAX_VERSION; + } + + if (trans->fs_usage_deltas && + bch2_trans_fs_usage_apply(trans, trans->fs_usage_deltas)) + return -BCH_ERR_btree_insert_need_mark_replicas; + + if (trans->nr_wb_updates) { + EBUG_ON(flags & BTREE_INSERT_JOURNAL_REPLAY); + + ret = bch2_btree_insert_keys_write_buffer(trans); + if (ret) + goto revert_fs_usage; + } + + h = trans->hooks; + while (h) { + ret = h->fn(trans, h); + if (ret) + goto revert_fs_usage; + h = h->next; + } + + trans_for_each_update(trans, i) + if (BTREE_NODE_TYPE_HAS_MEM_TRIGGERS & (1U << i->bkey_type)) { + ret = run_one_mem_trigger(trans, i, i->flags); + if (ret) + goto fatal_err; + } + + if (unlikely(c->gc_pos.phase)) { + ret = bch2_trans_commit_run_gc_triggers(trans); + if (ret) + goto fatal_err; + } + + if (unlikely(trans->extra_journal_entries.nr)) { + memcpy_u64s_small(journal_res_entry(&c->journal, &trans->journal_res), + trans->extra_journal_entries.data, + trans->extra_journal_entries.nr); + + trans->journal_res.offset += trans->extra_journal_entries.nr; + trans->journal_res.u64s -= trans->extra_journal_entries.nr; + } + + if (likely(!(flags & BTREE_INSERT_JOURNAL_REPLAY))) { + struct journal *j = &c->journal; + struct jset_entry *entry; + + trans_for_each_update(trans, i) { + if (i->key_cache_already_flushed) + continue; + + if (i->flags & BTREE_UPDATE_NOJOURNAL) + continue; + + verify_update_old_key(trans, i); + + if (trans->journal_transaction_names) { + entry = bch2_journal_add_entry(j, &trans->journal_res, + BCH_JSET_ENTRY_overwrite, + i->btree_id, i->level, + i->old_k.u64s); + bkey_reassemble((struct bkey_i *) entry->start, + (struct bkey_s_c) { &i->old_k, i->old_v }); + } + + entry = bch2_journal_add_entry(j, &trans->journal_res, + BCH_JSET_ENTRY_btree_keys, + i->btree_id, i->level, + i->k->k.u64s); + bkey_copy((struct bkey_i *) entry->start, i->k); + } + + trans_for_each_wb_update(trans, wb) { + entry = bch2_journal_add_entry(j, &trans->journal_res, + BCH_JSET_ENTRY_btree_keys, + wb->btree, 0, + wb->k.k.u64s); + bkey_copy((struct bkey_i *) entry->start, &wb->k); + } + + if (trans->journal_seq) + *trans->journal_seq = trans->journal_res.seq; + } + + trans_for_each_update(trans, i) { + i->k->k.needs_whiteout = false; + + if (!i->cached) { + u64 seq = trans->journal_res.seq; + + if (i->flags & BTREE_UPDATE_PREJOURNAL) + seq = i->seq; + + bch2_btree_insert_key_leaf(trans, i->path, i->k, seq); + } else if (!i->key_cache_already_flushed) + bch2_btree_insert_key_cached(trans, flags, i); + else { + bch2_btree_key_cache_drop(trans, i->path); + btree_path_set_dirty(i->path, BTREE_ITER_NEED_TRAVERSE); + } + } + + return 0; +fatal_err: + bch2_fatal_error(c); +revert_fs_usage: + if (trans->fs_usage_deltas) + bch2_trans_fs_usage_revert(trans, trans->fs_usage_deltas); + return ret; +} + +static noinline void bch2_drop_overwrites_from_journal(struct btree_trans *trans) +{ + struct btree_insert_entry *i; + struct btree_write_buffered_key *wb; + + trans_for_each_update(trans, i) + bch2_journal_key_overwritten(trans->c, i->btree_id, i->level, i->k->k.p); + + trans_for_each_wb_update(trans, wb) + bch2_journal_key_overwritten(trans->c, wb->btree, 0, wb->k.k.p); +} + +static noinline int bch2_trans_commit_bkey_invalid(struct btree_trans *trans, + enum bkey_invalid_flags flags, + struct btree_insert_entry *i, + struct printbuf *err) +{ + struct bch_fs *c = trans->c; + + printbuf_reset(err); + prt_printf(err, "invalid bkey on insert from %s -> %ps", + trans->fn, (void *) i->ip_allocated); + prt_newline(err); + printbuf_indent_add(err, 2); + + bch2_bkey_val_to_text(err, c, bkey_i_to_s_c(i->k)); + prt_newline(err); + + bch2_bkey_invalid(c, bkey_i_to_s_c(i->k), i->bkey_type, flags, err); + bch2_print_string_as_lines(KERN_ERR, err->buf); + + bch2_inconsistent_error(c); + bch2_dump_trans_updates(trans); + + return -EINVAL; +} + +/* + * Get journal reservation, take write locks, and attempt to do btree update(s): + */ +static inline int do_bch2_trans_commit(struct btree_trans *trans, unsigned flags, + struct btree_insert_entry **stopped_at, + unsigned long trace_ip) +{ + struct bch_fs *c = trans->c; + struct btree_insert_entry *i; + int ret = 0, u64s_delta = 0; + + trans_for_each_update(trans, i) { + if (i->cached) + continue; + + u64s_delta += !bkey_deleted(&i->k->k) ? i->k->k.u64s : 0; + u64s_delta -= i->old_btree_u64s; + + if (!same_leaf_as_next(trans, i)) { + if (u64s_delta <= 0) { + ret = bch2_foreground_maybe_merge(trans, i->path, + i->level, flags); + if (unlikely(ret)) + return ret; + } + + u64s_delta = 0; + } + } + + ret = bch2_trans_lock_write(trans); + if (unlikely(ret)) + return ret; + + ret = bch2_trans_commit_write_locked(trans, flags, stopped_at, trace_ip); + + if (!ret && unlikely(trans->journal_replay_not_finished)) + bch2_drop_overwrites_from_journal(trans); + + bch2_trans_unlock_write(trans); + + if (!ret && trans->journal_pin) + bch2_journal_pin_add(&c->journal, trans->journal_res.seq, + trans->journal_pin, NULL); + + /* + * Drop journal reservation after dropping write locks, since dropping + * the journal reservation may kick off a journal write: + */ + bch2_journal_res_put(&c->journal, &trans->journal_res); + + return ret; +} + +static int journal_reclaim_wait_done(struct bch_fs *c) +{ + int ret = bch2_journal_error(&c->journal) ?: + !bch2_btree_key_cache_must_wait(c); + + if (!ret) + journal_reclaim_kick(&c->journal); + return ret; +} + +static noinline +int bch2_trans_commit_error(struct btree_trans *trans, unsigned flags, + struct btree_insert_entry *i, + int ret, unsigned long trace_ip) +{ + struct bch_fs *c = trans->c; + + switch (ret) { + case -BCH_ERR_btree_insert_btree_node_full: + ret = bch2_btree_split_leaf(trans, i->path, flags); + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + trace_and_count(c, trans_restart_btree_node_split, trans, trace_ip, i->path); + break; + case -BCH_ERR_btree_insert_need_mark_replicas: + ret = drop_locks_do(trans, + bch2_replicas_delta_list_mark(c, trans->fs_usage_deltas)); + break; + case -BCH_ERR_journal_res_get_blocked: + /* + * XXX: this should probably be a separate BTREE_INSERT_NONBLOCK + * flag + */ + if ((flags & BTREE_INSERT_JOURNAL_RECLAIM) && + (flags & BCH_WATERMARK_MASK) != BCH_WATERMARK_reclaim) { + ret = -BCH_ERR_journal_reclaim_would_deadlock; + break; + } + + ret = drop_locks_do(trans, + bch2_trans_journal_res_get(trans, + (flags & BCH_WATERMARK_MASK)| + JOURNAL_RES_GET_CHECK)); + break; + case -BCH_ERR_btree_insert_need_journal_reclaim: + bch2_trans_unlock(trans); + + trace_and_count(c, trans_blocked_journal_reclaim, trans, trace_ip); + + wait_event_freezable(c->journal.reclaim_wait, + (ret = journal_reclaim_wait_done(c))); + if (ret < 0) + break; + + ret = bch2_trans_relock(trans); + break; + case -BCH_ERR_btree_insert_need_flush_buffer: { + struct btree_write_buffer *wb = &c->btree_write_buffer; + + ret = 0; + + if (wb->state.nr > wb->size * 3 / 4) { + bch2_trans_unlock(trans); + mutex_lock(&wb->flush_lock); + + if (wb->state.nr > wb->size * 3 / 4) { + bch2_trans_begin(trans); + ret = __bch2_btree_write_buffer_flush(trans, + flags|BTREE_INSERT_NOCHECK_RW, true); + if (!ret) { + trace_and_count(c, trans_restart_write_buffer_flush, trans, _THIS_IP_); + ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_write_buffer_flush); + } + } else { + mutex_unlock(&wb->flush_lock); + ret = bch2_trans_relock(trans); + } + } + break; + } + default: + BUG_ON(ret >= 0); + break; + } + + BUG_ON(bch2_err_matches(ret, BCH_ERR_transaction_restart) != !!trans->restarted); + + bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOSPC) && + !(flags & BTREE_INSERT_NOWAIT) && + (flags & BTREE_INSERT_NOFAIL), c, + "%s: incorrectly got %s\n", __func__, bch2_err_str(ret)); + + return ret; +} + +static noinline int +bch2_trans_commit_get_rw_cold(struct btree_trans *trans, unsigned flags) +{ + struct bch_fs *c = trans->c; + int ret; + + if (likely(!(flags & BTREE_INSERT_LAZY_RW)) || + test_bit(BCH_FS_STARTED, &c->flags)) + return -BCH_ERR_erofs_trans_commit; + + ret = drop_locks_do(trans, bch2_fs_read_write_early(c)); + if (ret) + return ret; + + bch2_write_ref_get(c, BCH_WRITE_REF_trans); + return 0; +} + +/* + * This is for updates done in the early part of fsck - btree_gc - before we've + * gone RW. we only add the new key to the list of keys for journal replay to + * do. + */ +static noinline int +do_bch2_trans_commit_to_journal_replay(struct btree_trans *trans) +{ + struct bch_fs *c = trans->c; + struct btree_insert_entry *i; + int ret = 0; + + trans_for_each_update(trans, i) { + ret = bch2_journal_key_insert(c, i->btree_id, i->level, i->k); + if (ret) + break; + } + + return ret; +} + +int __bch2_trans_commit(struct btree_trans *trans, unsigned flags) +{ + struct bch_fs *c = trans->c; + struct btree_insert_entry *i = NULL; + struct btree_write_buffered_key *wb; + int ret = 0; + + if (!trans->nr_updates && + !trans->nr_wb_updates && + !trans->extra_journal_entries.nr) + goto out_reset; + + if (flags & BTREE_INSERT_GC_LOCK_HELD) + lockdep_assert_held(&c->gc_lock); + + ret = bch2_trans_commit_run_triggers(trans); + if (ret) + goto out_reset; + + trans_for_each_update(trans, i) { + struct printbuf buf = PRINTBUF; + enum bkey_invalid_flags invalid_flags = 0; + + if (!(flags & BTREE_INSERT_JOURNAL_REPLAY)) + invalid_flags |= BKEY_INVALID_WRITE|BKEY_INVALID_COMMIT; + + if (unlikely(bch2_bkey_invalid(c, bkey_i_to_s_c(i->k), + i->bkey_type, invalid_flags, &buf))) + ret = bch2_trans_commit_bkey_invalid(trans, invalid_flags, i, &buf); + btree_insert_entry_checks(trans, i); + printbuf_exit(&buf); + + if (ret) + return ret; + } + + if (unlikely(!test_bit(BCH_FS_MAY_GO_RW, &c->flags))) { + ret = do_bch2_trans_commit_to_journal_replay(trans); + goto out_reset; + } + + if (!(flags & BTREE_INSERT_NOCHECK_RW) && + unlikely(!bch2_write_ref_tryget(c, BCH_WRITE_REF_trans))) { + ret = bch2_trans_commit_get_rw_cold(trans, flags); + if (ret) + goto out_reset; + } + + if (c->btree_write_buffer.state.nr > c->btree_write_buffer.size / 2 && + mutex_trylock(&c->btree_write_buffer.flush_lock)) { + bch2_trans_begin(trans); + bch2_trans_unlock(trans); + + ret = __bch2_btree_write_buffer_flush(trans, + flags|BTREE_INSERT_NOCHECK_RW, true); + if (!ret) { + trace_and_count(c, trans_restart_write_buffer_flush, trans, _THIS_IP_); + ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_write_buffer_flush); + } + goto out; + } + + EBUG_ON(test_bit(BCH_FS_CLEAN_SHUTDOWN, &c->flags)); + + trans->journal_u64s = trans->extra_journal_entries.nr; + trans->journal_transaction_names = READ_ONCE(c->opts.journal_transaction_names); + if (trans->journal_transaction_names) + trans->journal_u64s += jset_u64s(JSET_ENTRY_LOG_U64s); + + trans_for_each_update(trans, i) { + EBUG_ON(!i->path->should_be_locked); + + ret = bch2_btree_path_upgrade(trans, i->path, i->level + 1); + if (unlikely(ret)) + goto out; + + EBUG_ON(!btree_node_intent_locked(i->path, i->level)); + + if (i->key_cache_already_flushed) + continue; + + if (i->flags & BTREE_UPDATE_NOJOURNAL) + continue; + + /* we're going to journal the key being updated: */ + trans->journal_u64s += jset_u64s(i->k->k.u64s); + + /* and we're also going to log the overwrite: */ + if (trans->journal_transaction_names) + trans->journal_u64s += jset_u64s(i->old_k.u64s); + } + + trans_for_each_wb_update(trans, wb) + trans->journal_u64s += jset_u64s(wb->k.k.u64s); + + if (trans->extra_journal_res) { + ret = bch2_disk_reservation_add(c, trans->disk_res, + trans->extra_journal_res, + (flags & BTREE_INSERT_NOFAIL) + ? BCH_DISK_RESERVATION_NOFAIL : 0); + if (ret) + goto err; + } +retry: + bch2_trans_verify_not_in_restart(trans); + memset(&trans->journal_res, 0, sizeof(trans->journal_res)); + + ret = do_bch2_trans_commit(trans, flags, &i, _RET_IP_); + + /* make sure we didn't drop or screw up locks: */ + bch2_trans_verify_locks(trans); + + if (ret) + goto err; + + trace_and_count(c, transaction_commit, trans, _RET_IP_); +out: + if (likely(!(flags & BTREE_INSERT_NOCHECK_RW))) + bch2_write_ref_put(c, BCH_WRITE_REF_trans); +out_reset: + if (!ret) + bch2_trans_downgrade(trans); + bch2_trans_reset_updates(trans); + + return ret; +err: + ret = bch2_trans_commit_error(trans, flags, i, ret, _RET_IP_); + if (ret) + goto out; + + goto retry; +} diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h new file mode 100644 index 000000000000..60453ba86c4b --- /dev/null +++ b/fs/bcachefs/btree_types.h @@ -0,0 +1,725 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_BTREE_TYPES_H +#define _BCACHEFS_BTREE_TYPES_H + +#include <linux/list.h> +#include <linux/rhashtable.h> + +#include "btree_key_cache_types.h" +#include "buckets_types.h" +#include "darray.h" +#include "errcode.h" +#include "journal_types.h" +#include "replicas_types.h" +#include "six.h" + +struct open_bucket; +struct btree_update; +struct btree_trans; + +#define MAX_BSETS 3U + +struct btree_nr_keys { + + /* + * Amount of live metadata (i.e. size of node after a compaction) in + * units of u64s + */ + u16 live_u64s; + u16 bset_u64s[MAX_BSETS]; + + /* live keys only: */ + u16 packed_keys; + u16 unpacked_keys; +}; + +struct bset_tree { + /* + * We construct a binary tree in an array as if the array + * started at 1, so that things line up on the same cachelines + * better: see comments in bset.c at cacheline_to_bkey() for + * details + */ + + /* size of the binary tree and prev array */ + u16 size; + + /* function of size - precalculated for to_inorder() */ + u16 extra; + + u16 data_offset; + u16 aux_data_offset; + u16 end_offset; +}; + +struct btree_write { + struct journal_entry_pin journal; +}; + +struct btree_alloc { + struct open_buckets ob; + __BKEY_PADDED(k, BKEY_BTREE_PTR_VAL_U64s_MAX); +}; + +struct btree_bkey_cached_common { + struct six_lock lock; + u8 level; + u8 btree_id; + bool cached; +}; + +struct btree { + struct btree_bkey_cached_common c; + + struct rhash_head hash; + u64 hash_val; + + unsigned long flags; + u16 written; + u8 nsets; + u8 nr_key_bits; + u16 version_ondisk; + + struct bkey_format format; + + struct btree_node *data; + void *aux_data; + + /* + * Sets of sorted keys - the real btree node - plus a binary search tree + * + * set[0] is special; set[0]->tree, set[0]->prev and set[0]->data point + * to the memory we have allocated for this btree node. Additionally, + * set[0]->data points to the entire btree node as it exists on disk. + */ + struct bset_tree set[MAX_BSETS]; + + struct btree_nr_keys nr; + u16 sib_u64s[2]; + u16 whiteout_u64s; + u8 byte_order; + u8 unpack_fn_len; + + struct btree_write writes[2]; + + /* Key/pointer for this btree node */ + __BKEY_PADDED(key, BKEY_BTREE_PTR_VAL_U64s_MAX); + + /* + * XXX: add a delete sequence number, so when bch2_btree_node_relock() + * fails because the lock sequence number has changed - i.e. the + * contents were modified - we can still relock the node if it's still + * the one we want, without redoing the traversal + */ + + /* + * For asynchronous splits/interior node updates: + * When we do a split, we allocate new child nodes and update the parent + * node to point to them: we update the parent in memory immediately, + * but then we must wait until the children have been written out before + * the update to the parent can be written - this is a list of the + * btree_updates that are blocking this node from being + * written: + */ + struct list_head write_blocked; + + /* + * Also for asynchronous splits/interior node updates: + * If a btree node isn't reachable yet, we don't want to kick off + * another write - because that write also won't yet be reachable and + * marking it as completed before it's reachable would be incorrect: + */ + unsigned long will_make_reachable; + + struct open_buckets ob; + + /* lru list */ + struct list_head list; +}; + +struct btree_cache { + struct rhashtable table; + bool table_init_done; + /* + * We never free a struct btree, except on shutdown - we just put it on + * the btree_cache_freed list and reuse it later. This simplifies the + * code, and it doesn't cost us much memory as the memory usage is + * dominated by buffers that hold the actual btree node data and those + * can be freed - and the number of struct btrees allocated is + * effectively bounded. + * + * btree_cache_freeable effectively is a small cache - we use it because + * high order page allocations can be rather expensive, and it's quite + * common to delete and allocate btree nodes in quick succession. It + * should never grow past ~2-3 nodes in practice. + */ + struct mutex lock; + struct list_head live; + struct list_head freeable; + struct list_head freed_pcpu; + struct list_head freed_nonpcpu; + + /* Number of elements in live + freeable lists */ + unsigned used; + unsigned reserve; + atomic_t dirty; + struct shrinker *shrink; + + /* + * If we need to allocate memory for a new btree node and that + * allocation fails, we can cannibalize another node in the btree cache + * to satisfy the allocation - lock to guarantee only one thread does + * this at a time: + */ + struct task_struct *alloc_lock; + struct closure_waitlist alloc_wait; +}; + +struct btree_node_iter { + struct btree_node_iter_set { + u16 k, end; + } data[MAX_BSETS]; +}; + +/* + * Iterate over all possible positions, synthesizing deleted keys for holes: + */ +static const __maybe_unused u16 BTREE_ITER_SLOTS = 1 << 0; +static const __maybe_unused u16 BTREE_ITER_ALL_LEVELS = 1 << 1; +/* + * Indicates that intent locks should be taken on leaf nodes, because we expect + * to be doing updates: + */ +static const __maybe_unused u16 BTREE_ITER_INTENT = 1 << 2; +/* + * Causes the btree iterator code to prefetch additional btree nodes from disk: + */ +static const __maybe_unused u16 BTREE_ITER_PREFETCH = 1 << 3; +/* + * Used in bch2_btree_iter_traverse(), to indicate whether we're searching for + * @pos or the first key strictly greater than @pos + */ +static const __maybe_unused u16 BTREE_ITER_IS_EXTENTS = 1 << 4; +static const __maybe_unused u16 BTREE_ITER_NOT_EXTENTS = 1 << 5; +static const __maybe_unused u16 BTREE_ITER_CACHED = 1 << 6; +static const __maybe_unused u16 BTREE_ITER_WITH_KEY_CACHE = 1 << 7; +static const __maybe_unused u16 BTREE_ITER_WITH_UPDATES = 1 << 8; +static const __maybe_unused u16 BTREE_ITER_WITH_JOURNAL = 1 << 9; +static const __maybe_unused u16 __BTREE_ITER_ALL_SNAPSHOTS = 1 << 10; +static const __maybe_unused u16 BTREE_ITER_ALL_SNAPSHOTS = 1 << 11; +static const __maybe_unused u16 BTREE_ITER_FILTER_SNAPSHOTS = 1 << 12; +static const __maybe_unused u16 BTREE_ITER_NOPRESERVE = 1 << 13; +static const __maybe_unused u16 BTREE_ITER_CACHED_NOFILL = 1 << 14; +static const __maybe_unused u16 BTREE_ITER_KEY_CACHE_FILL = 1 << 15; +#define __BTREE_ITER_FLAGS_END 16 + +enum btree_path_uptodate { + BTREE_ITER_UPTODATE = 0, + BTREE_ITER_NEED_RELOCK = 1, + BTREE_ITER_NEED_TRAVERSE = 2, +}; + +#if defined(CONFIG_BCACHEFS_LOCK_TIME_STATS) || defined(CONFIG_BCACHEFS_DEBUG) +#define TRACK_PATH_ALLOCATED +#endif + +struct btree_path { + u8 idx; + u8 sorted_idx; + u8 ref; + u8 intent_ref; + u32 alloc_seq; + u32 downgrade_seq; + + /* btree_iter_copy starts here: */ + struct bpos pos; + + enum btree_id btree_id:5; + bool cached:1; + bool preserve:1; + enum btree_path_uptodate uptodate:2; + /* + * When true, failing to relock this path will cause the transaction to + * restart: + */ + bool should_be_locked:1; + unsigned level:3, + locks_want:3; + u8 nodes_locked; + + struct btree_path_level { + struct btree *b; + struct btree_node_iter iter; + u32 lock_seq; +#ifdef CONFIG_BCACHEFS_LOCK_TIME_STATS + u64 lock_taken_time; +#endif + } l[BTREE_MAX_DEPTH]; +#ifdef TRACK_PATH_ALLOCATED + unsigned long ip_allocated; +#endif +}; + +static inline struct btree_path_level *path_l(struct btree_path *path) +{ + return path->l + path->level; +} + +static inline unsigned long btree_path_ip_allocated(struct btree_path *path) +{ +#ifdef TRACK_PATH_ALLOCATED + return path->ip_allocated; +#else + return _THIS_IP_; +#endif +} + +/* + * @pos - iterator's current position + * @level - current btree depth + * @locks_want - btree level below which we start taking intent locks + * @nodes_locked - bitmask indicating which nodes in @nodes are locked + * @nodes_intent_locked - bitmask indicating which locks are intent locks + */ +struct btree_iter { + struct btree_trans *trans; + struct btree_path *path; + struct btree_path *update_path; + struct btree_path *key_cache_path; + + enum btree_id btree_id:8; + unsigned min_depth:3; + unsigned advanced:1; + + /* btree_iter_copy starts here: */ + u16 flags; + + /* When we're filtering by snapshot, the snapshot ID we're looking for: */ + unsigned snapshot; + + struct bpos pos; + /* + * Current unpacked key - so that bch2_btree_iter_next()/ + * bch2_btree_iter_next_slot() can correctly advance pos. + */ + struct bkey k; + + /* BTREE_ITER_WITH_JOURNAL: */ + size_t journal_idx; + struct bpos journal_pos; +#ifdef TRACK_PATH_ALLOCATED + unsigned long ip_allocated; +#endif +}; + +#define BKEY_CACHED_ACCESSED 0 +#define BKEY_CACHED_DIRTY 1 + +struct bkey_cached { + struct btree_bkey_cached_common c; + + unsigned long flags; + u16 u64s; + bool valid; + u32 btree_trans_barrier_seq; + struct bkey_cached_key key; + + struct rhash_head hash; + struct list_head list; + + struct journal_entry_pin journal; + u64 seq; + + struct bkey_i *k; +}; + +static inline struct bpos btree_node_pos(struct btree_bkey_cached_common *b) +{ + return !b->cached + ? container_of(b, struct btree, c)->key.k.p + : container_of(b, struct bkey_cached, c)->key.pos; +} + +struct btree_insert_entry { + unsigned flags; + u8 bkey_type; + enum btree_id btree_id:8; + u8 level:4; + bool cached:1; + bool insert_trigger_run:1; + bool overwrite_trigger_run:1; + bool key_cache_already_flushed:1; + /* + * @old_k may be a key from the journal; @old_btree_u64s always refers + * to the size of the key being overwritten in the btree: + */ + u8 old_btree_u64s; + struct bkey_i *k; + struct btree_path *path; + u64 seq; + /* key being overwritten: */ + struct bkey old_k; + const struct bch_val *old_v; + unsigned long ip_allocated; +}; + +#define BTREE_ITER_MAX 64 + +struct btree_trans_commit_hook; +typedef int (btree_trans_commit_hook_fn)(struct btree_trans *, struct btree_trans_commit_hook *); + +struct btree_trans_commit_hook { + btree_trans_commit_hook_fn *fn; + struct btree_trans_commit_hook *next; +}; + +#define BTREE_TRANS_MEM_MAX (1U << 16) + +#define BTREE_TRANS_MAX_LOCK_HOLD_TIME_NS 10000 + +struct btree_trans { + struct bch_fs *c; + const char *fn; + struct closure ref; + struct list_head list; + u64 last_begin_time; + + u8 lock_may_not_fail; + u8 lock_must_abort; + struct btree_bkey_cached_common *locking; + struct six_lock_waiter locking_wait; + + int srcu_idx; + + u8 fn_idx; + u8 nr_sorted; + u8 nr_updates; + u8 nr_wb_updates; + u8 wb_updates_size; + bool srcu_held:1; + bool used_mempool:1; + bool in_traverse_all:1; + bool paths_sorted:1; + bool memory_allocation_failure:1; + bool journal_transaction_names:1; + bool journal_replay_not_finished:1; + bool notrace_relock_fail:1; + bool write_locked:1; + enum bch_errcode restarted:16; + u32 restart_count; + unsigned long last_begin_ip; + unsigned long last_restarted_ip; + unsigned long srcu_lock_time; + + /* + * For when bch2_trans_update notices we'll be splitting a compressed + * extent: + */ + unsigned extra_journal_res; + unsigned nr_max_paths; + + u64 paths_allocated; + + unsigned mem_top; + unsigned mem_max; + unsigned mem_bytes; + void *mem; + + u8 sorted[BTREE_ITER_MAX + 8]; + struct btree_path paths[BTREE_ITER_MAX]; + struct btree_insert_entry updates[BTREE_ITER_MAX]; + struct btree_write_buffered_key *wb_updates; + + /* update path: */ + struct btree_trans_commit_hook *hooks; + darray_u64 extra_journal_entries; + struct journal_entry_pin *journal_pin; + + struct journal_res journal_res; + u64 *journal_seq; + struct disk_reservation *disk_res; + unsigned journal_u64s; + struct replicas_delta_list *fs_usage_deltas; +}; + +#define BCH_BTREE_WRITE_TYPES() \ + x(initial, 0) \ + x(init_next_bset, 1) \ + x(cache_reclaim, 2) \ + x(journal_reclaim, 3) \ + x(interior, 4) + +enum btree_write_type { +#define x(t, n) BTREE_WRITE_##t, + BCH_BTREE_WRITE_TYPES() +#undef x + BTREE_WRITE_TYPE_NR, +}; + +#define BTREE_WRITE_TYPE_MASK (roundup_pow_of_two(BTREE_WRITE_TYPE_NR) - 1) +#define BTREE_WRITE_TYPE_BITS ilog2(roundup_pow_of_two(BTREE_WRITE_TYPE_NR)) + +#define BTREE_FLAGS() \ + x(read_in_flight) \ + x(read_error) \ + x(dirty) \ + x(need_write) \ + x(write_blocked) \ + x(will_make_reachable) \ + x(noevict) \ + x(write_idx) \ + x(accessed) \ + x(write_in_flight) \ + x(write_in_flight_inner) \ + x(just_written) \ + x(dying) \ + x(fake) \ + x(need_rewrite) \ + x(never_write) + +enum btree_flags { + /* First bits for btree node write type */ + BTREE_NODE_FLAGS_START = BTREE_WRITE_TYPE_BITS - 1, +#define x(flag) BTREE_NODE_##flag, + BTREE_FLAGS() +#undef x +}; + +#define x(flag) \ +static inline bool btree_node_ ## flag(struct btree *b) \ +{ return test_bit(BTREE_NODE_ ## flag, &b->flags); } \ + \ +static inline void set_btree_node_ ## flag(struct btree *b) \ +{ set_bit(BTREE_NODE_ ## flag, &b->flags); } \ + \ +static inline void clear_btree_node_ ## flag(struct btree *b) \ +{ clear_bit(BTREE_NODE_ ## flag, &b->flags); } + +BTREE_FLAGS() +#undef x + +static inline struct btree_write *btree_current_write(struct btree *b) +{ + return b->writes + btree_node_write_idx(b); +} + +static inline struct btree_write *btree_prev_write(struct btree *b) +{ + return b->writes + (btree_node_write_idx(b) ^ 1); +} + +static inline struct bset_tree *bset_tree_last(struct btree *b) +{ + EBUG_ON(!b->nsets); + return b->set + b->nsets - 1; +} + +static inline void * +__btree_node_offset_to_ptr(const struct btree *b, u16 offset) +{ + return (void *) ((u64 *) b->data + 1 + offset); +} + +static inline u16 +__btree_node_ptr_to_offset(const struct btree *b, const void *p) +{ + u16 ret = (u64 *) p - 1 - (u64 *) b->data; + + EBUG_ON(__btree_node_offset_to_ptr(b, ret) != p); + return ret; +} + +static inline struct bset *bset(const struct btree *b, + const struct bset_tree *t) +{ + return __btree_node_offset_to_ptr(b, t->data_offset); +} + +static inline void set_btree_bset_end(struct btree *b, struct bset_tree *t) +{ + t->end_offset = + __btree_node_ptr_to_offset(b, vstruct_last(bset(b, t))); +} + +static inline void set_btree_bset(struct btree *b, struct bset_tree *t, + const struct bset *i) +{ + t->data_offset = __btree_node_ptr_to_offset(b, i); + set_btree_bset_end(b, t); +} + +static inline struct bset *btree_bset_first(struct btree *b) +{ + return bset(b, b->set); +} + +static inline struct bset *btree_bset_last(struct btree *b) +{ + return bset(b, bset_tree_last(b)); +} + +static inline u16 +__btree_node_key_to_offset(const struct btree *b, const struct bkey_packed *k) +{ + return __btree_node_ptr_to_offset(b, k); +} + +static inline struct bkey_packed * +__btree_node_offset_to_key(const struct btree *b, u16 k) +{ + return __btree_node_offset_to_ptr(b, k); +} + +static inline unsigned btree_bkey_first_offset(const struct bset_tree *t) +{ + return t->data_offset + offsetof(struct bset, _data) / sizeof(u64); +} + +#define btree_bkey_first(_b, _t) \ +({ \ + EBUG_ON(bset(_b, _t)->start != \ + __btree_node_offset_to_key(_b, btree_bkey_first_offset(_t)));\ + \ + bset(_b, _t)->start; \ +}) + +#define btree_bkey_last(_b, _t) \ +({ \ + EBUG_ON(__btree_node_offset_to_key(_b, (_t)->end_offset) != \ + vstruct_last(bset(_b, _t))); \ + \ + __btree_node_offset_to_key(_b, (_t)->end_offset); \ +}) + +static inline unsigned bset_u64s(struct bset_tree *t) +{ + return t->end_offset - t->data_offset - + sizeof(struct bset) / sizeof(u64); +} + +static inline unsigned bset_dead_u64s(struct btree *b, struct bset_tree *t) +{ + return bset_u64s(t) - b->nr.bset_u64s[t - b->set]; +} + +static inline unsigned bset_byte_offset(struct btree *b, void *i) +{ + return i - (void *) b->data; +} + +enum btree_node_type { + BKEY_TYPE_btree, +#define x(kwd, val, ...) BKEY_TYPE_##kwd = val + 1, + BCH_BTREE_IDS() +#undef x + BKEY_TYPE_NR +}; + +/* Type of a key in btree @id at level @level: */ +static inline enum btree_node_type __btree_node_type(unsigned level, enum btree_id id) +{ + return level ? BKEY_TYPE_btree : (unsigned) id + 1; +} + +/* Type of keys @b contains: */ +static inline enum btree_node_type btree_node_type(struct btree *b) +{ + return __btree_node_type(b->c.level, b->c.btree_id); +} + +const char *bch2_btree_node_type_str(enum btree_node_type); + +#define BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS \ + (BIT_ULL(BKEY_TYPE_extents)| \ + BIT_ULL(BKEY_TYPE_alloc)| \ + BIT_ULL(BKEY_TYPE_inodes)| \ + BIT_ULL(BKEY_TYPE_stripes)| \ + BIT_ULL(BKEY_TYPE_reflink)| \ + BIT_ULL(BKEY_TYPE_btree)) + +#define BTREE_NODE_TYPE_HAS_MEM_TRIGGERS \ + (BIT_ULL(BKEY_TYPE_alloc)| \ + BIT_ULL(BKEY_TYPE_inodes)| \ + BIT_ULL(BKEY_TYPE_stripes)| \ + BIT_ULL(BKEY_TYPE_snapshots)) + +#define BTREE_NODE_TYPE_HAS_TRIGGERS \ + (BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS| \ + BTREE_NODE_TYPE_HAS_MEM_TRIGGERS) + +static inline bool btree_node_type_needs_gc(enum btree_node_type type) +{ + return BTREE_NODE_TYPE_HAS_TRIGGERS & BIT_ULL(type); +} + +static inline bool btree_node_type_is_extents(enum btree_node_type type) +{ + const unsigned mask = 0 +#define x(name, nr, flags, ...) |((!!((flags) & BTREE_ID_EXTENTS)) << (nr + 1)) + BCH_BTREE_IDS() +#undef x + ; + + return (1U << type) & mask; +} + +static inline bool btree_id_is_extents(enum btree_id btree) +{ + return btree_node_type_is_extents(__btree_node_type(0, btree)); +} + +static inline bool btree_type_has_snapshots(enum btree_id id) +{ + const unsigned mask = 0 +#define x(name, nr, flags, ...) |((!!((flags) & BTREE_ID_SNAPSHOTS)) << nr) + BCH_BTREE_IDS() +#undef x + ; + + return (1U << id) & mask; +} + +static inline bool btree_type_has_snapshot_field(enum btree_id id) +{ + const unsigned mask = 0 +#define x(name, nr, flags, ...) |((!!((flags) & (BTREE_ID_SNAPSHOT_FIELD|BTREE_ID_SNAPSHOTS))) << nr) + BCH_BTREE_IDS() +#undef x + ; + + return (1U << id) & mask; +} + +static inline bool btree_type_has_ptrs(enum btree_id id) +{ + const unsigned mask = 0 +#define x(name, nr, flags, ...) |((!!((flags) & BTREE_ID_DATA)) << nr) + BCH_BTREE_IDS() +#undef x + ; + + return (1U << id) & mask; +} + +struct btree_root { + struct btree *b; + + /* On disk root - see async splits: */ + __BKEY_PADDED(key, BKEY_BTREE_PTR_VAL_U64s_MAX); + u8 level; + u8 alive; + s8 error; +}; + +enum btree_gc_coalesce_fail_reason { + BTREE_GC_COALESCE_FAIL_RESERVE_GET, + BTREE_GC_COALESCE_FAIL_KEYLIST_REALLOC, + BTREE_GC_COALESCE_FAIL_FORMAT_FITS, +}; + +enum btree_node_sibling { + btree_prev_sib, + btree_next_sib, +}; + +#endif /* _BCACHEFS_BTREE_TYPES_H */ diff --git a/fs/bcachefs/btree_update.c b/fs/bcachefs/btree_update.c new file mode 100644 index 000000000000..25fdca00bf7b --- /dev/null +++ b/fs/bcachefs/btree_update.c @@ -0,0 +1,949 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" +#include "btree_update.h" +#include "btree_iter.h" +#include "btree_journal_iter.h" +#include "btree_locking.h" +#include "buckets.h" +#include "debug.h" +#include "errcode.h" +#include "error.h" +#include "extents.h" +#include "keylist.h" +#include "snapshot.h" +#include "trace.h" + +static inline int btree_insert_entry_cmp(const struct btree_insert_entry *l, + const struct btree_insert_entry *r) +{ + return cmp_int(l->btree_id, r->btree_id) ?: + cmp_int(l->cached, r->cached) ?: + -cmp_int(l->level, r->level) ?: + bpos_cmp(l->k->k.p, r->k->k.p); +} + +static int __must_check +bch2_trans_update_by_path(struct btree_trans *, struct btree_path *, + struct bkey_i *, enum btree_update_flags, + unsigned long ip); + +static noinline int extent_front_merge(struct btree_trans *trans, + struct btree_iter *iter, + struct bkey_s_c k, + struct bkey_i **insert, + enum btree_update_flags flags) +{ + struct bch_fs *c = trans->c; + struct bkey_i *update; + int ret; + + update = bch2_bkey_make_mut_noupdate(trans, k); + ret = PTR_ERR_OR_ZERO(update); + if (ret) + return ret; + + if (!bch2_bkey_merge(c, bkey_i_to_s(update), bkey_i_to_s_c(*insert))) + return 0; + + ret = bch2_key_has_snapshot_overwrites(trans, iter->btree_id, k.k->p) ?: + bch2_key_has_snapshot_overwrites(trans, iter->btree_id, (*insert)->k.p); + if (ret < 0) + return ret; + if (ret) + return 0; + + ret = bch2_btree_delete_at(trans, iter, flags); + if (ret) + return ret; + + *insert = update; + return 0; +} + +static noinline int extent_back_merge(struct btree_trans *trans, + struct btree_iter *iter, + struct bkey_i *insert, + struct bkey_s_c k) +{ + struct bch_fs *c = trans->c; + int ret; + + ret = bch2_key_has_snapshot_overwrites(trans, iter->btree_id, insert->k.p) ?: + bch2_key_has_snapshot_overwrites(trans, iter->btree_id, k.k->p); + if (ret < 0) + return ret; + if (ret) + return 0; + + bch2_bkey_merge(c, bkey_i_to_s(insert), k); + return 0; +} + +/* + * When deleting, check if we need to emit a whiteout (because we're overwriting + * something in an ancestor snapshot) + */ +static int need_whiteout_for_snapshot(struct btree_trans *trans, + enum btree_id btree_id, struct bpos pos) +{ + struct btree_iter iter; + struct bkey_s_c k; + u32 snapshot = pos.snapshot; + int ret; + + if (!bch2_snapshot_parent(trans->c, pos.snapshot)) + return 0; + + pos.snapshot++; + + for_each_btree_key_norestart(trans, iter, btree_id, pos, + BTREE_ITER_ALL_SNAPSHOTS| + BTREE_ITER_NOPRESERVE, k, ret) { + if (!bkey_eq(k.k->p, pos)) + break; + + if (bch2_snapshot_is_ancestor(trans->c, snapshot, + k.k->p.snapshot)) { + ret = !bkey_whiteout(k.k); + break; + } + } + bch2_trans_iter_exit(trans, &iter); + + return ret; +} + +int __bch2_insert_snapshot_whiteouts(struct btree_trans *trans, + enum btree_id id, + struct bpos old_pos, + struct bpos new_pos) +{ + struct bch_fs *c = trans->c; + struct btree_iter old_iter, new_iter = { NULL }; + struct bkey_s_c old_k, new_k; + snapshot_id_list s; + struct bkey_i *update; + int ret = 0; + + if (!bch2_snapshot_has_children(c, old_pos.snapshot)) + return 0; + + darray_init(&s); + + bch2_trans_iter_init(trans, &old_iter, id, old_pos, + BTREE_ITER_NOT_EXTENTS| + BTREE_ITER_ALL_SNAPSHOTS); + while ((old_k = bch2_btree_iter_prev(&old_iter)).k && + !(ret = bkey_err(old_k)) && + bkey_eq(old_pos, old_k.k->p)) { + struct bpos whiteout_pos = + SPOS(new_pos.inode, new_pos.offset, old_k.k->p.snapshot);; + + if (!bch2_snapshot_is_ancestor(c, old_k.k->p.snapshot, old_pos.snapshot) || + snapshot_list_has_ancestor(c, &s, old_k.k->p.snapshot)) + continue; + + new_k = bch2_bkey_get_iter(trans, &new_iter, id, whiteout_pos, + BTREE_ITER_NOT_EXTENTS| + BTREE_ITER_INTENT); + ret = bkey_err(new_k); + if (ret) + break; + + if (new_k.k->type == KEY_TYPE_deleted) { + update = bch2_trans_kmalloc(trans, sizeof(struct bkey_i)); + ret = PTR_ERR_OR_ZERO(update); + if (ret) + break; + + bkey_init(&update->k); + update->k.p = whiteout_pos; + update->k.type = KEY_TYPE_whiteout; + + ret = bch2_trans_update(trans, &new_iter, update, + BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); + } + bch2_trans_iter_exit(trans, &new_iter); + + ret = snapshot_list_add(c, &s, old_k.k->p.snapshot); + if (ret) + break; + } + bch2_trans_iter_exit(trans, &new_iter); + bch2_trans_iter_exit(trans, &old_iter); + darray_exit(&s); + + return ret; +} + +int bch2_trans_update_extent_overwrite(struct btree_trans *trans, + struct btree_iter *iter, + enum btree_update_flags flags, + struct bkey_s_c old, + struct bkey_s_c new) +{ + enum btree_id btree_id = iter->btree_id; + struct bkey_i *update; + struct bpos new_start = bkey_start_pos(new.k); + bool front_split = bkey_lt(bkey_start_pos(old.k), new_start); + bool back_split = bkey_gt(old.k->p, new.k->p); + int ret = 0, compressed_sectors; + + /* + * If we're going to be splitting a compressed extent, note it + * so that __bch2_trans_commit() can increase our disk + * reservation: + */ + if (((front_split && back_split) || + ((front_split || back_split) && old.k->p.snapshot != new.k->p.snapshot)) && + (compressed_sectors = bch2_bkey_sectors_compressed(old))) + trans->extra_journal_res += compressed_sectors; + + if (front_split) { + update = bch2_bkey_make_mut_noupdate(trans, old); + if ((ret = PTR_ERR_OR_ZERO(update))) + return ret; + + bch2_cut_back(new_start, update); + + ret = bch2_insert_snapshot_whiteouts(trans, btree_id, + old.k->p, update->k.p) ?: + bch2_btree_insert_nonextent(trans, btree_id, update, + BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE|flags); + if (ret) + return ret; + } + + /* If we're overwriting in a different snapshot - middle split: */ + if (old.k->p.snapshot != new.k->p.snapshot && + (front_split || back_split)) { + update = bch2_bkey_make_mut_noupdate(trans, old); + if ((ret = PTR_ERR_OR_ZERO(update))) + return ret; + + bch2_cut_front(new_start, update); + bch2_cut_back(new.k->p, update); + + ret = bch2_insert_snapshot_whiteouts(trans, btree_id, + old.k->p, update->k.p) ?: + bch2_btree_insert_nonextent(trans, btree_id, update, + BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE|flags); + if (ret) + return ret; + } + + if (bkey_le(old.k->p, new.k->p)) { + update = bch2_trans_kmalloc(trans, sizeof(*update)); + if ((ret = PTR_ERR_OR_ZERO(update))) + return ret; + + bkey_init(&update->k); + update->k.p = old.k->p; + update->k.p.snapshot = new.k->p.snapshot; + + if (new.k->p.snapshot != old.k->p.snapshot) { + update->k.type = KEY_TYPE_whiteout; + } else if (btree_type_has_snapshots(btree_id)) { + ret = need_whiteout_for_snapshot(trans, btree_id, update->k.p); + if (ret < 0) + return ret; + if (ret) + update->k.type = KEY_TYPE_whiteout; + } + + ret = bch2_btree_insert_nonextent(trans, btree_id, update, + BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE|flags); + if (ret) + return ret; + } + + if (back_split) { + update = bch2_bkey_make_mut_noupdate(trans, old); + if ((ret = PTR_ERR_OR_ZERO(update))) + return ret; + + bch2_cut_front(new.k->p, update); + + ret = bch2_trans_update_by_path(trans, iter->path, update, + BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE| + flags, _RET_IP_); + if (ret) + return ret; + } + + return 0; +} + +static int bch2_trans_update_extent(struct btree_trans *trans, + struct btree_iter *orig_iter, + struct bkey_i *insert, + enum btree_update_flags flags) +{ + struct btree_iter iter; + struct bkey_s_c k; + enum btree_id btree_id = orig_iter->btree_id; + int ret = 0; + + bch2_trans_iter_init(trans, &iter, btree_id, bkey_start_pos(&insert->k), + BTREE_ITER_INTENT| + BTREE_ITER_WITH_UPDATES| + BTREE_ITER_NOT_EXTENTS); + k = bch2_btree_iter_peek_upto(&iter, POS(insert->k.p.inode, U64_MAX)); + if ((ret = bkey_err(k))) + goto err; + if (!k.k) + goto out; + + if (bkey_eq(k.k->p, bkey_start_pos(&insert->k))) { + if (bch2_bkey_maybe_mergable(k.k, &insert->k)) { + ret = extent_front_merge(trans, &iter, k, &insert, flags); + if (ret) + goto err; + } + + goto next; + } + + while (bkey_gt(insert->k.p, bkey_start_pos(k.k))) { + bool done = bkey_lt(insert->k.p, k.k->p); + + ret = bch2_trans_update_extent_overwrite(trans, &iter, flags, k, bkey_i_to_s_c(insert)); + if (ret) + goto err; + + if (done) + goto out; +next: + bch2_btree_iter_advance(&iter); + k = bch2_btree_iter_peek_upto(&iter, POS(insert->k.p.inode, U64_MAX)); + if ((ret = bkey_err(k))) + goto err; + if (!k.k) + goto out; + } + + if (bch2_bkey_maybe_mergable(&insert->k, k.k)) { + ret = extent_back_merge(trans, &iter, insert, k); + if (ret) + goto err; + } +out: + if (!bkey_deleted(&insert->k)) + ret = bch2_btree_insert_nonextent(trans, btree_id, insert, flags); +err: + bch2_trans_iter_exit(trans, &iter); + + return ret; +} + +static noinline int flush_new_cached_update(struct btree_trans *trans, + struct btree_path *path, + struct btree_insert_entry *i, + enum btree_update_flags flags, + unsigned long ip) +{ + struct btree_path *btree_path; + struct bkey k; + int ret; + + btree_path = bch2_path_get(trans, path->btree_id, path->pos, 1, 0, + BTREE_ITER_INTENT, _THIS_IP_); + ret = bch2_btree_path_traverse(trans, btree_path, 0); + if (ret) + goto out; + + /* + * The old key in the insert entry might actually refer to an existing + * key in the btree that has been deleted from cache and not yet + * flushed. Check for this and skip the flush so we don't run triggers + * against a stale key. + */ + bch2_btree_path_peek_slot_exact(btree_path, &k); + if (!bkey_deleted(&k)) + goto out; + + i->key_cache_already_flushed = true; + i->flags |= BTREE_TRIGGER_NORUN; + + btree_path_set_should_be_locked(btree_path); + ret = bch2_trans_update_by_path(trans, btree_path, i->k, flags, ip); +out: + bch2_path_put(trans, btree_path, true); + return ret; +} + +static int __must_check +bch2_trans_update_by_path(struct btree_trans *trans, struct btree_path *path, + struct bkey_i *k, enum btree_update_flags flags, + unsigned long ip) +{ + struct bch_fs *c = trans->c; + struct btree_insert_entry *i, n; + u64 seq = 0; + int cmp; + + EBUG_ON(!path->should_be_locked); + EBUG_ON(trans->nr_updates >= BTREE_ITER_MAX); + EBUG_ON(!bpos_eq(k->k.p, path->pos)); + + /* + * The transaction journal res hasn't been allocated at this point. + * That occurs at commit time. Reuse the seq field to pass in the seq + * of a prejournaled key. + */ + if (flags & BTREE_UPDATE_PREJOURNAL) + seq = trans->journal_res.seq; + + n = (struct btree_insert_entry) { + .flags = flags, + .bkey_type = __btree_node_type(path->level, path->btree_id), + .btree_id = path->btree_id, + .level = path->level, + .cached = path->cached, + .path = path, + .k = k, + .seq = seq, + .ip_allocated = ip, + }; + +#ifdef CONFIG_BCACHEFS_DEBUG + trans_for_each_update(trans, i) + BUG_ON(i != trans->updates && + btree_insert_entry_cmp(i - 1, i) >= 0); +#endif + + /* + * Pending updates are kept sorted: first, find position of new update, + * then delete/trim any updates the new update overwrites: + */ + trans_for_each_update(trans, i) { + cmp = btree_insert_entry_cmp(&n, i); + if (cmp <= 0) + break; + } + + if (!cmp && i < trans->updates + trans->nr_updates) { + EBUG_ON(i->insert_trigger_run || i->overwrite_trigger_run); + + bch2_path_put(trans, i->path, true); + i->flags = n.flags; + i->cached = n.cached; + i->k = n.k; + i->path = n.path; + i->seq = n.seq; + i->ip_allocated = n.ip_allocated; + } else { + array_insert_item(trans->updates, trans->nr_updates, + i - trans->updates, n); + + i->old_v = bch2_btree_path_peek_slot_exact(path, &i->old_k).v; + i->old_btree_u64s = !bkey_deleted(&i->old_k) ? i->old_k.u64s : 0; + + if (unlikely(trans->journal_replay_not_finished)) { + struct bkey_i *j_k = + bch2_journal_keys_peek_slot(c, n.btree_id, n.level, k->k.p); + + if (j_k) { + i->old_k = j_k->k; + i->old_v = &j_k->v; + } + } + } + + __btree_path_get(i->path, true); + + /* + * If a key is present in the key cache, it must also exist in the + * btree - this is necessary for cache coherency. When iterating over + * a btree that's cached in the key cache, the btree iter code checks + * the key cache - but the key has to exist in the btree for that to + * work: + */ + if (path->cached && bkey_deleted(&i->old_k)) + return flush_new_cached_update(trans, path, i, flags, ip); + + return 0; +} + +static noinline int bch2_trans_update_get_key_cache(struct btree_trans *trans, + struct btree_iter *iter, + struct btree_path *path) +{ + if (!iter->key_cache_path || + !iter->key_cache_path->should_be_locked || + !bpos_eq(iter->key_cache_path->pos, iter->pos)) { + struct bkey_cached *ck; + int ret; + + if (!iter->key_cache_path) + iter->key_cache_path = + bch2_path_get(trans, path->btree_id, path->pos, 1, 0, + BTREE_ITER_INTENT| + BTREE_ITER_CACHED, _THIS_IP_); + + iter->key_cache_path = + bch2_btree_path_set_pos(trans, iter->key_cache_path, path->pos, + iter->flags & BTREE_ITER_INTENT, + _THIS_IP_); + + ret = bch2_btree_path_traverse(trans, iter->key_cache_path, + BTREE_ITER_CACHED); + if (unlikely(ret)) + return ret; + + ck = (void *) iter->key_cache_path->l[0].b; + + if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) { + trace_and_count(trans->c, trans_restart_key_cache_raced, trans, _RET_IP_); + return btree_trans_restart(trans, BCH_ERR_transaction_restart_key_cache_raced); + } + + btree_path_set_should_be_locked(iter->key_cache_path); + } + + return 0; +} + +int __must_check bch2_trans_update(struct btree_trans *trans, struct btree_iter *iter, + struct bkey_i *k, enum btree_update_flags flags) +{ + struct btree_path *path = iter->update_path ?: iter->path; + int ret; + + if (iter->flags & BTREE_ITER_IS_EXTENTS) + return bch2_trans_update_extent(trans, iter, k, flags); + + if (bkey_deleted(&k->k) && + !(flags & BTREE_UPDATE_KEY_CACHE_RECLAIM) && + (iter->flags & BTREE_ITER_FILTER_SNAPSHOTS)) { + ret = need_whiteout_for_snapshot(trans, iter->btree_id, k->k.p); + if (unlikely(ret < 0)) + return ret; + + if (ret) + k->k.type = KEY_TYPE_whiteout; + } + + /* + * Ensure that updates to cached btrees go to the key cache: + */ + if (!(flags & BTREE_UPDATE_KEY_CACHE_RECLAIM) && + !path->cached && + !path->level && + btree_id_cached(trans->c, path->btree_id)) { + ret = bch2_trans_update_get_key_cache(trans, iter, path); + if (ret) + return ret; + + path = iter->key_cache_path; + } + + return bch2_trans_update_by_path(trans, path, k, flags, _RET_IP_); +} + +/* + * Add a transaction update for a key that has already been journaled. + */ +int __must_check bch2_trans_update_seq(struct btree_trans *trans, u64 seq, + struct btree_iter *iter, struct bkey_i *k, + enum btree_update_flags flags) +{ + trans->journal_res.seq = seq; + return bch2_trans_update(trans, iter, k, flags|BTREE_UPDATE_NOJOURNAL| + BTREE_UPDATE_PREJOURNAL); +} + +static noinline int bch2_btree_insert_clone_trans(struct btree_trans *trans, + enum btree_id btree, + struct bkey_i *k) +{ + struct bkey_i *n = bch2_trans_kmalloc(trans, bkey_bytes(&k->k)); + int ret = PTR_ERR_OR_ZERO(n); + if (ret) + return ret; + + bkey_copy(n, k); + return bch2_btree_insert_trans(trans, btree, n, 0); +} + +int __must_check bch2_trans_update_buffered(struct btree_trans *trans, + enum btree_id btree, + struct bkey_i *k) +{ + struct btree_write_buffered_key *i; + int ret; + + EBUG_ON(trans->nr_wb_updates > trans->wb_updates_size); + EBUG_ON(k->k.u64s > BTREE_WRITE_BUFERED_U64s_MAX); + + if (unlikely(trans->journal_replay_not_finished)) + return bch2_btree_insert_clone_trans(trans, btree, k); + + trans_for_each_wb_update(trans, i) { + if (i->btree == btree && bpos_eq(i->k.k.p, k->k.p)) { + bkey_copy(&i->k, k); + return 0; + } + } + + if (!trans->wb_updates || + trans->nr_wb_updates == trans->wb_updates_size) { + struct btree_write_buffered_key *u; + + if (trans->nr_wb_updates == trans->wb_updates_size) { + struct btree_transaction_stats *s = btree_trans_stats(trans); + + BUG_ON(trans->wb_updates_size > U8_MAX / 2); + trans->wb_updates_size = max(1, trans->wb_updates_size * 2); + if (s) + s->wb_updates_size = trans->wb_updates_size; + } + + u = bch2_trans_kmalloc_nomemzero(trans, + trans->wb_updates_size * + sizeof(struct btree_write_buffered_key)); + ret = PTR_ERR_OR_ZERO(u); + if (ret) + return ret; + + if (trans->nr_wb_updates) + memcpy(u, trans->wb_updates, trans->nr_wb_updates * + sizeof(struct btree_write_buffered_key)); + trans->wb_updates = u; + } + + trans->wb_updates[trans->nr_wb_updates] = (struct btree_write_buffered_key) { + .btree = btree, + }; + + bkey_copy(&trans->wb_updates[trans->nr_wb_updates].k, k); + trans->nr_wb_updates++; + + return 0; +} + +int bch2_bkey_get_empty_slot(struct btree_trans *trans, struct btree_iter *iter, + enum btree_id btree, struct bpos end) +{ + struct bkey_s_c k; + int ret = 0; + + bch2_trans_iter_init(trans, iter, btree, POS_MAX, BTREE_ITER_INTENT); + k = bch2_btree_iter_prev(iter); + ret = bkey_err(k); + if (ret) + goto err; + + bch2_btree_iter_advance(iter); + k = bch2_btree_iter_peek_slot(iter); + ret = bkey_err(k); + if (ret) + goto err; + + BUG_ON(k.k->type != KEY_TYPE_deleted); + + if (bkey_gt(k.k->p, end)) { + ret = -BCH_ERR_ENOSPC_btree_slot; + goto err; + } + + return 0; +err: + bch2_trans_iter_exit(trans, iter); + return ret; +} + +void bch2_trans_commit_hook(struct btree_trans *trans, + struct btree_trans_commit_hook *h) +{ + h->next = trans->hooks; + trans->hooks = h; +} + +int bch2_btree_insert_nonextent(struct btree_trans *trans, + enum btree_id btree, struct bkey_i *k, + enum btree_update_flags flags) +{ + struct btree_iter iter; + int ret; + + bch2_trans_iter_init(trans, &iter, btree, k->k.p, + BTREE_ITER_CACHED| + BTREE_ITER_NOT_EXTENTS| + BTREE_ITER_INTENT); + ret = bch2_btree_iter_traverse(&iter) ?: + bch2_trans_update(trans, &iter, k, flags); + bch2_trans_iter_exit(trans, &iter); + return ret; +} + +int bch2_btree_insert_trans(struct btree_trans *trans, enum btree_id id, + struct bkey_i *k, enum btree_update_flags flags) +{ + struct btree_iter iter; + int ret; + + bch2_trans_iter_init(trans, &iter, id, bkey_start_pos(&k->k), + BTREE_ITER_CACHED| + BTREE_ITER_INTENT); + ret = bch2_btree_iter_traverse(&iter) ?: + bch2_trans_update(trans, &iter, k, flags); + bch2_trans_iter_exit(trans, &iter); + return ret; +} + +/** + * bch2_btree_insert - insert keys into the extent btree + * @c: pointer to struct bch_fs + * @id: btree to insert into + * @k: key to insert + * @disk_res: must be non-NULL whenever inserting or potentially + * splitting data extents + * @flags: transaction commit flags + * + * Returns: 0 on success, error code on failure + */ +int bch2_btree_insert(struct bch_fs *c, enum btree_id id, struct bkey_i *k, + struct disk_reservation *disk_res, int flags) +{ + return bch2_trans_do(c, disk_res, NULL, flags, + bch2_btree_insert_trans(trans, id, k, 0)); +} + +int bch2_btree_delete_extent_at(struct btree_trans *trans, struct btree_iter *iter, + unsigned len, unsigned update_flags) +{ + struct bkey_i *k; + + k = bch2_trans_kmalloc(trans, sizeof(*k)); + if (IS_ERR(k)) + return PTR_ERR(k); + + bkey_init(&k->k); + k->k.p = iter->pos; + bch2_key_resize(&k->k, len); + return bch2_trans_update(trans, iter, k, update_flags); +} + +int bch2_btree_delete_at(struct btree_trans *trans, + struct btree_iter *iter, unsigned update_flags) +{ + return bch2_btree_delete_extent_at(trans, iter, 0, update_flags); +} + +int bch2_btree_delete_at_buffered(struct btree_trans *trans, + enum btree_id btree, struct bpos pos) +{ + struct bkey_i *k; + + k = bch2_trans_kmalloc(trans, sizeof(*k)); + if (IS_ERR(k)) + return PTR_ERR(k); + + bkey_init(&k->k); + k->k.p = pos; + return bch2_trans_update_buffered(trans, btree, k); +} + +int bch2_btree_delete(struct btree_trans *trans, + enum btree_id btree, struct bpos pos, + unsigned update_flags) +{ + struct btree_iter iter; + int ret; + + bch2_trans_iter_init(trans, &iter, btree, pos, + BTREE_ITER_CACHED| + BTREE_ITER_INTENT); + ret = bch2_btree_iter_traverse(&iter) ?: + bch2_btree_delete_at(trans, &iter, update_flags); + bch2_trans_iter_exit(trans, &iter); + + return ret; +} + +int bch2_btree_delete_range_trans(struct btree_trans *trans, enum btree_id id, + struct bpos start, struct bpos end, + unsigned update_flags, + u64 *journal_seq) +{ + u32 restart_count = trans->restart_count; + struct btree_iter iter; + struct bkey_s_c k; + int ret = 0; + + bch2_trans_iter_init(trans, &iter, id, start, BTREE_ITER_INTENT); + while ((k = bch2_btree_iter_peek_upto(&iter, end)).k) { + struct disk_reservation disk_res = + bch2_disk_reservation_init(trans->c, 0); + struct bkey_i delete; + + ret = bkey_err(k); + if (ret) + goto err; + + bkey_init(&delete.k); + + /* + * This could probably be more efficient for extents: + */ + + /* + * For extents, iter.pos won't necessarily be the same as + * bkey_start_pos(k.k) (for non extents they always will be the + * same). It's important that we delete starting from iter.pos + * because the range we want to delete could start in the middle + * of k. + * + * (bch2_btree_iter_peek() does guarantee that iter.pos >= + * bkey_start_pos(k.k)). + */ + delete.k.p = iter.pos; + + if (iter.flags & BTREE_ITER_IS_EXTENTS) + bch2_key_resize(&delete.k, + bpos_min(end, k.k->p).offset - + iter.pos.offset); + + ret = bch2_trans_update(trans, &iter, &delete, update_flags) ?: + bch2_trans_commit(trans, &disk_res, journal_seq, + BTREE_INSERT_NOFAIL); + bch2_disk_reservation_put(trans->c, &disk_res); +err: + /* + * the bch2_trans_begin() call is in a weird place because we + * need to call it after every transaction commit, to avoid path + * overflow, but don't want to call it if the delete operation + * is a no-op and we have no work to do: + */ + bch2_trans_begin(trans); + + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + ret = 0; + if (ret) + break; + } + bch2_trans_iter_exit(trans, &iter); + + return ret ?: trans_was_restarted(trans, restart_count); +} + +/* + * bch_btree_delete_range - delete everything within a given range + * + * Range is a half open interval - [start, end) + */ +int bch2_btree_delete_range(struct bch_fs *c, enum btree_id id, + struct bpos start, struct bpos end, + unsigned update_flags, + u64 *journal_seq) +{ + int ret = bch2_trans_run(c, + bch2_btree_delete_range_trans(trans, id, start, end, + update_flags, journal_seq)); + if (ret == -BCH_ERR_transaction_restart_nested) + ret = 0; + return ret; +} + +int bch2_btree_bit_mod(struct btree_trans *trans, enum btree_id btree, + struct bpos pos, bool set) +{ + struct bkey_i *k; + int ret = 0; + + k = bch2_trans_kmalloc_nomemzero(trans, sizeof(*k)); + ret = PTR_ERR_OR_ZERO(k); + if (unlikely(ret)) + return ret; + + bkey_init(&k->k); + k->k.type = set ? KEY_TYPE_set : KEY_TYPE_deleted; + k->k.p = pos; + + return bch2_trans_update_buffered(trans, btree, k); +} + +__printf(2, 0) +static int __bch2_trans_log_msg(darray_u64 *entries, const char *fmt, va_list args) +{ + struct printbuf buf = PRINTBUF; + struct jset_entry_log *l; + unsigned u64s; + int ret; + + prt_vprintf(&buf, fmt, args); + ret = buf.allocation_failure ? -BCH_ERR_ENOMEM_trans_log_msg : 0; + if (ret) + goto err; + + u64s = DIV_ROUND_UP(buf.pos, sizeof(u64)); + + ret = darray_make_room(entries, jset_u64s(u64s)); + if (ret) + goto err; + + l = (void *) &darray_top(*entries); + l->entry.u64s = cpu_to_le16(u64s); + l->entry.btree_id = 0; + l->entry.level = 1; + l->entry.type = BCH_JSET_ENTRY_log; + l->entry.pad[0] = 0; + l->entry.pad[1] = 0; + l->entry.pad[2] = 0; + memcpy(l->d, buf.buf, buf.pos); + while (buf.pos & 7) + l->d[buf.pos++] = '\0'; + + entries->nr += jset_u64s(u64s); +err: + printbuf_exit(&buf); + return ret; +} + +__printf(3, 0) +static int +__bch2_fs_log_msg(struct bch_fs *c, unsigned commit_flags, const char *fmt, + va_list args) +{ + int ret; + + if (!test_bit(JOURNAL_STARTED, &c->journal.flags)) { + ret = __bch2_trans_log_msg(&c->journal.early_journal_entries, fmt, args); + } else { + ret = bch2_trans_do(c, NULL, NULL, + BTREE_INSERT_LAZY_RW|commit_flags, + __bch2_trans_log_msg(&trans->extra_journal_entries, fmt, args)); + } + + return ret; +} + +__printf(2, 3) +int bch2_fs_log_msg(struct bch_fs *c, const char *fmt, ...) +{ + va_list args; + int ret; + + va_start(args, fmt); + ret = __bch2_fs_log_msg(c, 0, fmt, args); + va_end(args); + return ret; +} + +/* + * Use for logging messages during recovery to enable reserved space and avoid + * blocking. + */ +__printf(2, 3) +int bch2_journal_log_msg(struct bch_fs *c, const char *fmt, ...) +{ + va_list args; + int ret; + + va_start(args, fmt); + ret = __bch2_fs_log_msg(c, BCH_WATERMARK_reclaim, fmt, args); + va_end(args); + return ret; +} diff --git a/fs/bcachefs/btree_update.h b/fs/bcachefs/btree_update.h new file mode 100644 index 000000000000..9816d2286540 --- /dev/null +++ b/fs/bcachefs/btree_update.h @@ -0,0 +1,340 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_BTREE_UPDATE_H +#define _BCACHEFS_BTREE_UPDATE_H + +#include "btree_iter.h" +#include "journal.h" + +struct bch_fs; +struct btree; + +void bch2_btree_node_prep_for_write(struct btree_trans *, + struct btree_path *, struct btree *); +bool bch2_btree_bset_insert_key(struct btree_trans *, struct btree_path *, + struct btree *, struct btree_node_iter *, + struct bkey_i *); + +int bch2_btree_node_flush0(struct journal *, struct journal_entry_pin *, u64); +int bch2_btree_node_flush1(struct journal *, struct journal_entry_pin *, u64); +void bch2_btree_add_journal_pin(struct bch_fs *, struct btree *, u64); + +void bch2_btree_insert_key_leaf(struct btree_trans *, struct btree_path *, + struct bkey_i *, u64); + +enum btree_insert_flags { + /* First bits for bch_watermark: */ + __BTREE_INSERT_NOFAIL = BCH_WATERMARK_BITS, + __BTREE_INSERT_NOCHECK_RW, + __BTREE_INSERT_LAZY_RW, + __BTREE_INSERT_JOURNAL_REPLAY, + __BTREE_INSERT_JOURNAL_RECLAIM, + __BTREE_INSERT_NOWAIT, + __BTREE_INSERT_GC_LOCK_HELD, + __BCH_HASH_SET_MUST_CREATE, + __BCH_HASH_SET_MUST_REPLACE, +}; + +/* Don't check for -ENOSPC: */ +#define BTREE_INSERT_NOFAIL BIT(__BTREE_INSERT_NOFAIL) + +#define BTREE_INSERT_NOCHECK_RW BIT(__BTREE_INSERT_NOCHECK_RW) +#define BTREE_INSERT_LAZY_RW BIT(__BTREE_INSERT_LAZY_RW) + +/* Insert is for journal replay - don't get journal reservations: */ +#define BTREE_INSERT_JOURNAL_REPLAY BIT(__BTREE_INSERT_JOURNAL_REPLAY) + +/* Insert is being called from journal reclaim path: */ +#define BTREE_INSERT_JOURNAL_RECLAIM BIT(__BTREE_INSERT_JOURNAL_RECLAIM) + +/* Don't block on allocation failure (for new btree nodes: */ +#define BTREE_INSERT_NOWAIT BIT(__BTREE_INSERT_NOWAIT) +#define BTREE_INSERT_GC_LOCK_HELD BIT(__BTREE_INSERT_GC_LOCK_HELD) + +#define BCH_HASH_SET_MUST_CREATE BIT(__BCH_HASH_SET_MUST_CREATE) +#define BCH_HASH_SET_MUST_REPLACE BIT(__BCH_HASH_SET_MUST_REPLACE) + +int bch2_btree_delete_extent_at(struct btree_trans *, struct btree_iter *, + unsigned, unsigned); +int bch2_btree_delete_at(struct btree_trans *, struct btree_iter *, unsigned); +int bch2_btree_delete_at_buffered(struct btree_trans *, enum btree_id, struct bpos); +int bch2_btree_delete(struct btree_trans *, enum btree_id, struct bpos, unsigned); + +int bch2_btree_insert_nonextent(struct btree_trans *, enum btree_id, + struct bkey_i *, enum btree_update_flags); + +int bch2_btree_insert_trans(struct btree_trans *, enum btree_id, struct bkey_i *, + enum btree_update_flags); +int bch2_btree_insert(struct bch_fs *, enum btree_id, struct bkey_i *, + struct disk_reservation *, int flags); + +int bch2_btree_delete_range_trans(struct btree_trans *, enum btree_id, + struct bpos, struct bpos, unsigned, u64 *); +int bch2_btree_delete_range(struct bch_fs *, enum btree_id, + struct bpos, struct bpos, unsigned, u64 *); + +int bch2_btree_bit_mod(struct btree_trans *, enum btree_id, struct bpos, bool); + +int __bch2_insert_snapshot_whiteouts(struct btree_trans *, enum btree_id, + struct bpos, struct bpos); + +/* + * For use when splitting extents in existing snapshots: + * + * If @old_pos is an interior snapshot node, iterate over descendent snapshot + * nodes: for every descendent snapshot in whiche @old_pos is overwritten and + * not visible, emit a whiteout at @new_pos. + */ +static inline int bch2_insert_snapshot_whiteouts(struct btree_trans *trans, + enum btree_id btree, + struct bpos old_pos, + struct bpos new_pos) +{ + if (!btree_type_has_snapshots(btree) || + bkey_eq(old_pos, new_pos)) + return 0; + + return __bch2_insert_snapshot_whiteouts(trans, btree, old_pos, new_pos); +} + +int bch2_trans_update_extent_overwrite(struct btree_trans *, struct btree_iter *, + enum btree_update_flags, + struct bkey_s_c, struct bkey_s_c); + +int bch2_bkey_get_empty_slot(struct btree_trans *, struct btree_iter *, + enum btree_id, struct bpos); + +int __must_check bch2_trans_update(struct btree_trans *, struct btree_iter *, + struct bkey_i *, enum btree_update_flags); +int __must_check bch2_trans_update_seq(struct btree_trans *, u64, struct btree_iter *, + struct bkey_i *, enum btree_update_flags); +int __must_check bch2_trans_update_buffered(struct btree_trans *, + enum btree_id, struct bkey_i *); + +void bch2_trans_commit_hook(struct btree_trans *, + struct btree_trans_commit_hook *); +int __bch2_trans_commit(struct btree_trans *, unsigned); + +__printf(2, 3) int bch2_fs_log_msg(struct bch_fs *, const char *, ...); +__printf(2, 3) int bch2_journal_log_msg(struct bch_fs *, const char *, ...); + +/** + * bch2_trans_commit - insert keys at given iterator positions + * + * This is main entry point for btree updates. + * + * Return values: + * -EROFS: filesystem read only + * -EIO: journal or btree node IO error + */ +static inline int bch2_trans_commit(struct btree_trans *trans, + struct disk_reservation *disk_res, + u64 *journal_seq, + unsigned flags) +{ + trans->disk_res = disk_res; + trans->journal_seq = journal_seq; + + return __bch2_trans_commit(trans, flags); +} + +#define commit_do(_trans, _disk_res, _journal_seq, _flags, _do) \ + lockrestart_do(_trans, _do ?: bch2_trans_commit(_trans, (_disk_res),\ + (_journal_seq), (_flags))) + +#define nested_commit_do(_trans, _disk_res, _journal_seq, _flags, _do) \ + nested_lockrestart_do(_trans, _do ?: bch2_trans_commit(_trans, (_disk_res),\ + (_journal_seq), (_flags))) + +#define bch2_trans_run(_c, _do) \ +({ \ + struct btree_trans *trans = bch2_trans_get(_c); \ + int _ret = (_do); \ + bch2_trans_put(trans); \ + _ret; \ +}) + +#define bch2_trans_do(_c, _disk_res, _journal_seq, _flags, _do) \ + bch2_trans_run(_c, commit_do(trans, _disk_res, _journal_seq, _flags, _do)) + +#define trans_for_each_update(_trans, _i) \ + for ((_i) = (_trans)->updates; \ + (_i) < (_trans)->updates + (_trans)->nr_updates; \ + (_i)++) + +#define trans_for_each_wb_update(_trans, _i) \ + for ((_i) = (_trans)->wb_updates; \ + (_i) < (_trans)->wb_updates + (_trans)->nr_wb_updates; \ + (_i)++) + +static inline void bch2_trans_reset_updates(struct btree_trans *trans) +{ + struct btree_insert_entry *i; + + trans_for_each_update(trans, i) + bch2_path_put(trans, i->path, true); + + trans->extra_journal_res = 0; + trans->nr_updates = 0; + trans->nr_wb_updates = 0; + trans->wb_updates = NULL; + trans->hooks = NULL; + trans->extra_journal_entries.nr = 0; + + if (trans->fs_usage_deltas) { + trans->fs_usage_deltas->used = 0; + memset((void *) trans->fs_usage_deltas + + offsetof(struct replicas_delta_list, memset_start), 0, + (void *) &trans->fs_usage_deltas->memset_end - + (void *) &trans->fs_usage_deltas->memset_start); + } +} + +static inline struct bkey_i *__bch2_bkey_make_mut_noupdate(struct btree_trans *trans, struct bkey_s_c k, + unsigned type, unsigned min_bytes) +{ + unsigned bytes = max_t(unsigned, min_bytes, bkey_bytes(k.k)); + struct bkey_i *mut; + + if (type && k.k->type != type) + return ERR_PTR(-ENOENT); + + mut = bch2_trans_kmalloc_nomemzero(trans, bytes); + if (!IS_ERR(mut)) { + bkey_reassemble(mut, k); + + if (unlikely(bytes > bkey_bytes(k.k))) { + memset((void *) mut + bkey_bytes(k.k), 0, + bytes - bkey_bytes(k.k)); + mut->k.u64s = DIV_ROUND_UP(bytes, sizeof(u64)); + } + } + return mut; +} + +static inline struct bkey_i *bch2_bkey_make_mut_noupdate(struct btree_trans *trans, struct bkey_s_c k) +{ + return __bch2_bkey_make_mut_noupdate(trans, k, 0, 0); +} + +#define bch2_bkey_make_mut_noupdate_typed(_trans, _k, _type) \ + bkey_i_to_##_type(__bch2_bkey_make_mut_noupdate(_trans, _k, \ + KEY_TYPE_##_type, sizeof(struct bkey_i_##_type))) + +static inline struct bkey_i *__bch2_bkey_make_mut(struct btree_trans *trans, struct btree_iter *iter, + struct bkey_s_c *k, unsigned flags, + unsigned type, unsigned min_bytes) +{ + struct bkey_i *mut = __bch2_bkey_make_mut_noupdate(trans, *k, type, min_bytes); + int ret; + + if (IS_ERR(mut)) + return mut; + + ret = bch2_trans_update(trans, iter, mut, flags); + if (ret) + return ERR_PTR(ret); + + *k = bkey_i_to_s_c(mut); + return mut; +} + +static inline struct bkey_i *bch2_bkey_make_mut(struct btree_trans *trans, struct btree_iter *iter, + struct bkey_s_c *k, unsigned flags) +{ + return __bch2_bkey_make_mut(trans, iter, k, flags, 0, 0); +} + +#define bch2_bkey_make_mut_typed(_trans, _iter, _k, _flags, _type) \ + bkey_i_to_##_type(__bch2_bkey_make_mut(_trans, _iter, _k, _flags,\ + KEY_TYPE_##_type, sizeof(struct bkey_i_##_type))) + +static inline struct bkey_i *__bch2_bkey_get_mut_noupdate(struct btree_trans *trans, + struct btree_iter *iter, + unsigned btree_id, struct bpos pos, + unsigned flags, unsigned type, unsigned min_bytes) +{ + struct bkey_s_c k = __bch2_bkey_get_iter(trans, iter, + btree_id, pos, flags|BTREE_ITER_INTENT, type); + struct bkey_i *ret = IS_ERR(k.k) + ? ERR_CAST(k.k) + : __bch2_bkey_make_mut_noupdate(trans, k, 0, min_bytes); + if (IS_ERR(ret)) + bch2_trans_iter_exit(trans, iter); + return ret; +} + +static inline struct bkey_i *bch2_bkey_get_mut_noupdate(struct btree_trans *trans, + struct btree_iter *iter, + unsigned btree_id, struct bpos pos, + unsigned flags) +{ + return __bch2_bkey_get_mut_noupdate(trans, iter, btree_id, pos, flags, 0, 0); +} + +static inline struct bkey_i *__bch2_bkey_get_mut(struct btree_trans *trans, + struct btree_iter *iter, + unsigned btree_id, struct bpos pos, + unsigned flags, unsigned type, unsigned min_bytes) +{ + struct bkey_i *mut = __bch2_bkey_get_mut_noupdate(trans, iter, + btree_id, pos, flags|BTREE_ITER_INTENT, type, min_bytes); + int ret; + + if (IS_ERR(mut)) + return mut; + + ret = bch2_trans_update(trans, iter, mut, flags); + if (ret) { + bch2_trans_iter_exit(trans, iter); + return ERR_PTR(ret); + } + + return mut; +} + +static inline struct bkey_i *bch2_bkey_get_mut_minsize(struct btree_trans *trans, + struct btree_iter *iter, + unsigned btree_id, struct bpos pos, + unsigned flags, unsigned min_bytes) +{ + return __bch2_bkey_get_mut(trans, iter, btree_id, pos, flags, 0, min_bytes); +} + +static inline struct bkey_i *bch2_bkey_get_mut(struct btree_trans *trans, + struct btree_iter *iter, + unsigned btree_id, struct bpos pos, + unsigned flags) +{ + return __bch2_bkey_get_mut(trans, iter, btree_id, pos, flags, 0, 0); +} + +#define bch2_bkey_get_mut_typed(_trans, _iter, _btree_id, _pos, _flags, _type)\ + bkey_i_to_##_type(__bch2_bkey_get_mut(_trans, _iter, \ + _btree_id, _pos, _flags, \ + KEY_TYPE_##_type, sizeof(struct bkey_i_##_type))) + +static inline struct bkey_i *__bch2_bkey_alloc(struct btree_trans *trans, struct btree_iter *iter, + unsigned flags, unsigned type, unsigned val_size) +{ + struct bkey_i *k = bch2_trans_kmalloc(trans, sizeof(*k) + val_size); + int ret; + + if (IS_ERR(k)) + return k; + + bkey_init(&k->k); + k->k.p = iter->pos; + k->k.type = type; + set_bkey_val_bytes(&k->k, val_size); + + ret = bch2_trans_update(trans, iter, k, flags); + if (unlikely(ret)) + return ERR_PTR(ret); + return k; +} + +#define bch2_bkey_alloc(_trans, _iter, _flags, _type) \ + bkey_i_to_##_type(__bch2_bkey_alloc(_trans, _iter, _flags, \ + KEY_TYPE_##_type, sizeof(struct bch_##_type))) + +#endif /* _BCACHEFS_BTREE_UPDATE_H */ diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c new file mode 100644 index 000000000000..239fcc3c7c99 --- /dev/null +++ b/fs/bcachefs/btree_update_interior.c @@ -0,0 +1,2476 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" +#include "alloc_foreground.h" +#include "bkey_methods.h" +#include "btree_cache.h" +#include "btree_gc.h" +#include "btree_journal_iter.h" +#include "btree_update.h" +#include "btree_update_interior.h" +#include "btree_io.h" +#include "btree_iter.h" +#include "btree_locking.h" +#include "buckets.h" +#include "clock.h" +#include "error.h" +#include "extents.h" +#include "journal.h" +#include "journal_reclaim.h" +#include "keylist.h" +#include "replicas.h" +#include "super-io.h" +#include "trace.h" + +#include <linux/random.h> + +static int bch2_btree_insert_node(struct btree_update *, struct btree_trans *, + struct btree_path *, struct btree *, + struct keylist *, unsigned); +static void bch2_btree_update_add_new_node(struct btree_update *, struct btree *); + +static struct btree_path *get_unlocked_mut_path(struct btree_trans *trans, + enum btree_id btree_id, + unsigned level, + struct bpos pos) +{ + struct btree_path *path; + + path = bch2_path_get(trans, btree_id, pos, level + 1, level, + BTREE_ITER_NOPRESERVE| + BTREE_ITER_INTENT, _RET_IP_); + path = bch2_btree_path_make_mut(trans, path, true, _RET_IP_); + bch2_btree_path_downgrade(trans, path); + __bch2_btree_path_unlock(trans, path); + return path; +} + +/* Debug code: */ + +/* + * Verify that child nodes correctly span parent node's range: + */ +static void btree_node_interior_verify(struct bch_fs *c, struct btree *b) +{ +#ifdef CONFIG_BCACHEFS_DEBUG + struct bpos next_node = b->data->min_key; + struct btree_node_iter iter; + struct bkey_s_c k; + struct bkey_s_c_btree_ptr_v2 bp; + struct bkey unpacked; + struct printbuf buf1 = PRINTBUF, buf2 = PRINTBUF; + + BUG_ON(!b->c.level); + + if (!test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags)) + return; + + bch2_btree_node_iter_init_from_start(&iter, b); + + while (1) { + k = bch2_btree_node_iter_peek_unpack(&iter, b, &unpacked); + if (k.k->type != KEY_TYPE_btree_ptr_v2) + break; + bp = bkey_s_c_to_btree_ptr_v2(k); + + if (!bpos_eq(next_node, bp.v->min_key)) { + bch2_dump_btree_node(c, b); + bch2_bpos_to_text(&buf1, next_node); + bch2_bpos_to_text(&buf2, bp.v->min_key); + panic("expected next min_key %s got %s\n", buf1.buf, buf2.buf); + } + + bch2_btree_node_iter_advance(&iter, b); + + if (bch2_btree_node_iter_end(&iter)) { + if (!bpos_eq(k.k->p, b->key.k.p)) { + bch2_dump_btree_node(c, b); + bch2_bpos_to_text(&buf1, b->key.k.p); + bch2_bpos_to_text(&buf2, k.k->p); + panic("expected end %s got %s\n", buf1.buf, buf2.buf); + } + break; + } + + next_node = bpos_successor(k.k->p); + } +#endif +} + +/* Calculate ideal packed bkey format for new btree nodes: */ + +static void __bch2_btree_calc_format(struct bkey_format_state *s, struct btree *b) +{ + struct bkey_packed *k; + struct bset_tree *t; + struct bkey uk; + + for_each_bset(b, t) + bset_tree_for_each_key(b, t, k) + if (!bkey_deleted(k)) { + uk = bkey_unpack_key(b, k); + bch2_bkey_format_add_key(s, &uk); + } +} + +static struct bkey_format bch2_btree_calc_format(struct btree *b) +{ + struct bkey_format_state s; + + bch2_bkey_format_init(&s); + bch2_bkey_format_add_pos(&s, b->data->min_key); + bch2_bkey_format_add_pos(&s, b->data->max_key); + __bch2_btree_calc_format(&s, b); + + return bch2_bkey_format_done(&s); +} + +static size_t btree_node_u64s_with_format(struct btree_nr_keys nr, + struct bkey_format *old_f, + struct bkey_format *new_f) +{ + /* stupid integer promotion rules */ + ssize_t delta = + (((int) new_f->key_u64s - old_f->key_u64s) * + (int) nr.packed_keys) + + (((int) new_f->key_u64s - BKEY_U64s) * + (int) nr.unpacked_keys); + + BUG_ON(delta + nr.live_u64s < 0); + + return nr.live_u64s + delta; +} + +/** + * bch2_btree_node_format_fits - check if we could rewrite node with a new format + * + * @c: filesystem handle + * @b: btree node to rewrite + * @nr: number of keys for new node (i.e. b->nr) + * @new_f: bkey format to translate keys to + * + * Returns: true if all re-packed keys will be able to fit in a new node. + * + * Assumes all keys will successfully pack with the new format. + */ +static bool bch2_btree_node_format_fits(struct bch_fs *c, struct btree *b, + struct btree_nr_keys nr, + struct bkey_format *new_f) +{ + size_t u64s = btree_node_u64s_with_format(nr, &b->format, new_f); + + return __vstruct_bytes(struct btree_node, u64s) < btree_bytes(c); +} + +/* Btree node freeing/allocation: */ + +static void __btree_node_free(struct bch_fs *c, struct btree *b) +{ + trace_and_count(c, btree_node_free, c, b); + + BUG_ON(btree_node_write_blocked(b)); + BUG_ON(btree_node_dirty(b)); + BUG_ON(btree_node_need_write(b)); + BUG_ON(b == btree_node_root(c, b)); + BUG_ON(b->ob.nr); + BUG_ON(!list_empty(&b->write_blocked)); + BUG_ON(b->will_make_reachable); + + clear_btree_node_noevict(b); + + mutex_lock(&c->btree_cache.lock); + list_move(&b->list, &c->btree_cache.freeable); + mutex_unlock(&c->btree_cache.lock); +} + +static void bch2_btree_node_free_inmem(struct btree_trans *trans, + struct btree_path *path, + struct btree *b) +{ + struct bch_fs *c = trans->c; + unsigned level = b->c.level; + + bch2_btree_node_lock_write_nofail(trans, path, &b->c); + bch2_btree_node_hash_remove(&c->btree_cache, b); + __btree_node_free(c, b); + six_unlock_write(&b->c.lock); + mark_btree_node_locked_noreset(path, level, BTREE_NODE_INTENT_LOCKED); + + trans_for_each_path(trans, path) + if (path->l[level].b == b) { + btree_node_unlock(trans, path, level); + path->l[level].b = ERR_PTR(-BCH_ERR_no_btree_node_init); + } +} + +static void bch2_btree_node_free_never_used(struct btree_update *as, + struct btree_trans *trans, + struct btree *b) +{ + struct bch_fs *c = as->c; + struct prealloc_nodes *p = &as->prealloc_nodes[b->c.lock.readers != NULL]; + struct btree_path *path; + unsigned level = b->c.level; + + BUG_ON(!list_empty(&b->write_blocked)); + BUG_ON(b->will_make_reachable != (1UL|(unsigned long) as)); + + b->will_make_reachable = 0; + closure_put(&as->cl); + + clear_btree_node_will_make_reachable(b); + clear_btree_node_accessed(b); + clear_btree_node_dirty_acct(c, b); + clear_btree_node_need_write(b); + + mutex_lock(&c->btree_cache.lock); + list_del_init(&b->list); + bch2_btree_node_hash_remove(&c->btree_cache, b); + mutex_unlock(&c->btree_cache.lock); + + BUG_ON(p->nr >= ARRAY_SIZE(p->b)); + p->b[p->nr++] = b; + + six_unlock_intent(&b->c.lock); + + trans_for_each_path(trans, path) + if (path->l[level].b == b) { + btree_node_unlock(trans, path, level); + path->l[level].b = ERR_PTR(-BCH_ERR_no_btree_node_init); + } +} + +static struct btree *__bch2_btree_node_alloc(struct btree_trans *trans, + struct disk_reservation *res, + struct closure *cl, + bool interior_node, + unsigned flags) +{ + struct bch_fs *c = trans->c; + struct write_point *wp; + struct btree *b; + BKEY_PADDED_ONSTACK(k, BKEY_BTREE_PTR_VAL_U64s_MAX) tmp; + struct open_buckets obs = { .nr = 0 }; + struct bch_devs_list devs_have = (struct bch_devs_list) { 0 }; + enum bch_watermark watermark = flags & BCH_WATERMARK_MASK; + unsigned nr_reserve = watermark > BCH_WATERMARK_reclaim + ? BTREE_NODE_RESERVE + : 0; + int ret; + + mutex_lock(&c->btree_reserve_cache_lock); + if (c->btree_reserve_cache_nr > nr_reserve) { + struct btree_alloc *a = + &c->btree_reserve_cache[--c->btree_reserve_cache_nr]; + + obs = a->ob; + bkey_copy(&tmp.k, &a->k); + mutex_unlock(&c->btree_reserve_cache_lock); + goto mem_alloc; + } + mutex_unlock(&c->btree_reserve_cache_lock); + +retry: + ret = bch2_alloc_sectors_start_trans(trans, + c->opts.metadata_target ?: + c->opts.foreground_target, + 0, + writepoint_ptr(&c->btree_write_point), + &devs_have, + res->nr_replicas, + c->opts.metadata_replicas_required, + watermark, 0, cl, &wp); + if (unlikely(ret)) + return ERR_PTR(ret); + + if (wp->sectors_free < btree_sectors(c)) { + struct open_bucket *ob; + unsigned i; + + open_bucket_for_each(c, &wp->ptrs, ob, i) + if (ob->sectors_free < btree_sectors(c)) + ob->sectors_free = 0; + + bch2_alloc_sectors_done(c, wp); + goto retry; + } + + bkey_btree_ptr_v2_init(&tmp.k); + bch2_alloc_sectors_append_ptrs(c, wp, &tmp.k, btree_sectors(c), false); + + bch2_open_bucket_get(c, wp, &obs); + bch2_alloc_sectors_done(c, wp); +mem_alloc: + b = bch2_btree_node_mem_alloc(trans, interior_node); + six_unlock_write(&b->c.lock); + six_unlock_intent(&b->c.lock); + + /* we hold cannibalize_lock: */ + BUG_ON(IS_ERR(b)); + BUG_ON(b->ob.nr); + + bkey_copy(&b->key, &tmp.k); + b->ob = obs; + + return b; +} + +static struct btree *bch2_btree_node_alloc(struct btree_update *as, + struct btree_trans *trans, + unsigned level) +{ + struct bch_fs *c = as->c; + struct btree *b; + struct prealloc_nodes *p = &as->prealloc_nodes[!!level]; + int ret; + + BUG_ON(level >= BTREE_MAX_DEPTH); + BUG_ON(!p->nr); + + b = p->b[--p->nr]; + + btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_intent); + btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_write); + + set_btree_node_accessed(b); + set_btree_node_dirty_acct(c, b); + set_btree_node_need_write(b); + + bch2_bset_init_first(b, &b->data->keys); + b->c.level = level; + b->c.btree_id = as->btree_id; + b->version_ondisk = c->sb.version; + + memset(&b->nr, 0, sizeof(b->nr)); + b->data->magic = cpu_to_le64(bset_magic(c)); + memset(&b->data->_ptr, 0, sizeof(b->data->_ptr)); + b->data->flags = 0; + SET_BTREE_NODE_ID(b->data, as->btree_id); + SET_BTREE_NODE_LEVEL(b->data, level); + + if (b->key.k.type == KEY_TYPE_btree_ptr_v2) { + struct bkey_i_btree_ptr_v2 *bp = bkey_i_to_btree_ptr_v2(&b->key); + + bp->v.mem_ptr = 0; + bp->v.seq = b->data->keys.seq; + bp->v.sectors_written = 0; + } + + SET_BTREE_NODE_NEW_EXTENT_OVERWRITE(b->data, true); + + bch2_btree_build_aux_trees(b); + + ret = bch2_btree_node_hash_insert(&c->btree_cache, b, level, as->btree_id); + BUG_ON(ret); + + trace_and_count(c, btree_node_alloc, c, b); + bch2_increment_clock(c, btree_sectors(c), WRITE); + return b; +} + +static void btree_set_min(struct btree *b, struct bpos pos) +{ + if (b->key.k.type == KEY_TYPE_btree_ptr_v2) + bkey_i_to_btree_ptr_v2(&b->key)->v.min_key = pos; + b->data->min_key = pos; +} + +static void btree_set_max(struct btree *b, struct bpos pos) +{ + b->key.k.p = pos; + b->data->max_key = pos; +} + +static struct btree *bch2_btree_node_alloc_replacement(struct btree_update *as, + struct btree_trans *trans, + struct btree *b) +{ + struct btree *n = bch2_btree_node_alloc(as, trans, b->c.level); + struct bkey_format format = bch2_btree_calc_format(b); + + /* + * The keys might expand with the new format - if they wouldn't fit in + * the btree node anymore, use the old format for now: + */ + if (!bch2_btree_node_format_fits(as->c, b, b->nr, &format)) + format = b->format; + + SET_BTREE_NODE_SEQ(n->data, BTREE_NODE_SEQ(b->data) + 1); + + btree_set_min(n, b->data->min_key); + btree_set_max(n, b->data->max_key); + + n->data->format = format; + btree_node_set_format(n, format); + + bch2_btree_sort_into(as->c, n, b); + + btree_node_reset_sib_u64s(n); + return n; +} + +static struct btree *__btree_root_alloc(struct btree_update *as, + struct btree_trans *trans, unsigned level) +{ + struct btree *b = bch2_btree_node_alloc(as, trans, level); + + btree_set_min(b, POS_MIN); + btree_set_max(b, SPOS_MAX); + b->data->format = bch2_btree_calc_format(b); + + btree_node_set_format(b, b->data->format); + bch2_btree_build_aux_trees(b); + + return b; +} + +static void bch2_btree_reserve_put(struct btree_update *as, struct btree_trans *trans) +{ + struct bch_fs *c = as->c; + struct prealloc_nodes *p; + + for (p = as->prealloc_nodes; + p < as->prealloc_nodes + ARRAY_SIZE(as->prealloc_nodes); + p++) { + while (p->nr) { + struct btree *b = p->b[--p->nr]; + + mutex_lock(&c->btree_reserve_cache_lock); + + if (c->btree_reserve_cache_nr < + ARRAY_SIZE(c->btree_reserve_cache)) { + struct btree_alloc *a = + &c->btree_reserve_cache[c->btree_reserve_cache_nr++]; + + a->ob = b->ob; + b->ob.nr = 0; + bkey_copy(&a->k, &b->key); + } else { + bch2_open_buckets_put(c, &b->ob); + } + + mutex_unlock(&c->btree_reserve_cache_lock); + + btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_intent); + btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_write); + __btree_node_free(c, b); + six_unlock_write(&b->c.lock); + six_unlock_intent(&b->c.lock); + } + } +} + +static int bch2_btree_reserve_get(struct btree_trans *trans, + struct btree_update *as, + unsigned nr_nodes[2], + unsigned flags, + struct closure *cl) +{ + struct bch_fs *c = as->c; + struct btree *b; + unsigned interior; + int ret = 0; + + BUG_ON(nr_nodes[0] + nr_nodes[1] > BTREE_RESERVE_MAX); + + /* + * Protects reaping from the btree node cache and using the btree node + * open bucket reserve: + * + * BTREE_INSERT_NOWAIT only applies to btree node allocation, not + * blocking on this lock: + */ + ret = bch2_btree_cache_cannibalize_lock(c, cl); + if (ret) + return ret; + + for (interior = 0; interior < 2; interior++) { + struct prealloc_nodes *p = as->prealloc_nodes + interior; + + while (p->nr < nr_nodes[interior]) { + b = __bch2_btree_node_alloc(trans, &as->disk_res, + flags & BTREE_INSERT_NOWAIT ? NULL : cl, + interior, flags); + if (IS_ERR(b)) { + ret = PTR_ERR(b); + goto err; + } + + p->b[p->nr++] = b; + } + } +err: + bch2_btree_cache_cannibalize_unlock(c); + return ret; +} + +/* Asynchronous interior node update machinery */ + +static void bch2_btree_update_free(struct btree_update *as, struct btree_trans *trans) +{ + struct bch_fs *c = as->c; + + if (as->took_gc_lock) + up_read(&c->gc_lock); + as->took_gc_lock = false; + + bch2_journal_pin_drop(&c->journal, &as->journal); + bch2_journal_pin_flush(&c->journal, &as->journal); + bch2_disk_reservation_put(c, &as->disk_res); + bch2_btree_reserve_put(as, trans); + + bch2_time_stats_update(&c->times[BCH_TIME_btree_interior_update_total], + as->start_time); + + mutex_lock(&c->btree_interior_update_lock); + list_del(&as->unwritten_list); + list_del(&as->list); + + closure_debug_destroy(&as->cl); + mempool_free(as, &c->btree_interior_update_pool); + + /* + * Have to do the wakeup with btree_interior_update_lock still held, + * since being on btree_interior_update_list is our ref on @c: + */ + closure_wake_up(&c->btree_interior_update_wait); + + mutex_unlock(&c->btree_interior_update_lock); +} + +static void btree_update_add_key(struct btree_update *as, + struct keylist *keys, struct btree *b) +{ + struct bkey_i *k = &b->key; + + BUG_ON(bch2_keylist_u64s(keys) + k->k.u64s > + ARRAY_SIZE(as->_old_keys)); + + bkey_copy(keys->top, k); + bkey_i_to_btree_ptr_v2(keys->top)->v.mem_ptr = b->c.level + 1; + + bch2_keylist_push(keys); +} + +/* + * The transactional part of an interior btree node update, where we journal the + * update we did to the interior node and update alloc info: + */ +static int btree_update_nodes_written_trans(struct btree_trans *trans, + struct btree_update *as) +{ + struct bkey_i *k; + int ret; + + ret = darray_make_room(&trans->extra_journal_entries, as->journal_u64s); + if (ret) + return ret; + + memcpy(&darray_top(trans->extra_journal_entries), + as->journal_entries, + as->journal_u64s * sizeof(u64)); + trans->extra_journal_entries.nr += as->journal_u64s; + + trans->journal_pin = &as->journal; + + for_each_keylist_key(&as->old_keys, k) { + unsigned level = bkey_i_to_btree_ptr_v2(k)->v.mem_ptr; + + ret = bch2_trans_mark_old(trans, as->btree_id, level, bkey_i_to_s_c(k), 0); + if (ret) + return ret; + } + + for_each_keylist_key(&as->new_keys, k) { + unsigned level = bkey_i_to_btree_ptr_v2(k)->v.mem_ptr; + + ret = bch2_trans_mark_new(trans, as->btree_id, level, k, 0); + if (ret) + return ret; + } + + return 0; +} + +static void btree_update_nodes_written(struct btree_update *as) +{ + struct bch_fs *c = as->c; + struct btree *b; + struct btree_trans *trans = bch2_trans_get(c); + u64 journal_seq = 0; + unsigned i; + int ret; + + /* + * If we're already in an error state, it might be because a btree node + * was never written, and we might be trying to free that same btree + * node here, but it won't have been marked as allocated and we'll see + * spurious disk usage inconsistencies in the transactional part below + * if we don't skip it: + */ + ret = bch2_journal_error(&c->journal); + if (ret) + goto err; + + /* + * Wait for any in flight writes to finish before we free the old nodes + * on disk: + */ + for (i = 0; i < as->nr_old_nodes; i++) { + __le64 seq; + + b = as->old_nodes[i]; + + btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_read); + seq = b->data ? b->data->keys.seq : 0; + six_unlock_read(&b->c.lock); + + if (seq == as->old_nodes_seq[i]) + wait_on_bit_io(&b->flags, BTREE_NODE_write_in_flight_inner, + TASK_UNINTERRUPTIBLE); + } + + /* + * We did an update to a parent node where the pointers we added pointed + * to child nodes that weren't written yet: now, the child nodes have + * been written so we can write out the update to the interior node. + */ + + /* + * We can't call into journal reclaim here: we'd block on the journal + * reclaim lock, but we may need to release the open buckets we have + * pinned in order for other btree updates to make forward progress, and + * journal reclaim does btree updates when flushing bkey_cached entries, + * which may require allocations as well. + */ + ret = commit_do(trans, &as->disk_res, &journal_seq, + BCH_WATERMARK_reclaim| + BTREE_INSERT_NOFAIL| + BTREE_INSERT_NOCHECK_RW| + BTREE_INSERT_JOURNAL_RECLAIM, + btree_update_nodes_written_trans(trans, as)); + bch2_trans_unlock(trans); + + bch2_fs_fatal_err_on(ret && !bch2_journal_error(&c->journal), c, + "%s(): error %s", __func__, bch2_err_str(ret)); +err: + if (as->b) { + struct btree_path *path; + + b = as->b; + path = get_unlocked_mut_path(trans, as->btree_id, b->c.level, b->key.k.p); + /* + * @b is the node we did the final insert into: + * + * On failure to get a journal reservation, we still have to + * unblock the write and allow most of the write path to happen + * so that shutdown works, but the i->journal_seq mechanism + * won't work to prevent the btree write from being visible (we + * didn't get a journal sequence number) - instead + * __bch2_btree_node_write() doesn't do the actual write if + * we're in journal error state: + */ + + /* + * Ensure transaction is unlocked before using + * btree_node_lock_nopath() (the use of which is always suspect, + * we need to work on removing this in the future) + * + * It should be, but get_unlocked_mut_path() -> bch2_path_get() + * calls bch2_path_upgrade(), before we call path_make_mut(), so + * we may rarely end up with a locked path besides the one we + * have here: + */ + bch2_trans_unlock(trans); + btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_intent); + mark_btree_node_locked(trans, path, b->c.level, BTREE_NODE_INTENT_LOCKED); + path->l[b->c.level].lock_seq = six_lock_seq(&b->c.lock); + path->l[b->c.level].b = b; + + bch2_btree_node_lock_write_nofail(trans, path, &b->c); + + mutex_lock(&c->btree_interior_update_lock); + + list_del(&as->write_blocked_list); + if (list_empty(&b->write_blocked)) + clear_btree_node_write_blocked(b); + + /* + * Node might have been freed, recheck under + * btree_interior_update_lock: + */ + if (as->b == b) { + BUG_ON(!b->c.level); + BUG_ON(!btree_node_dirty(b)); + + if (!ret) { + struct bset *last = btree_bset_last(b); + + last->journal_seq = cpu_to_le64( + max(journal_seq, + le64_to_cpu(last->journal_seq))); + + bch2_btree_add_journal_pin(c, b, journal_seq); + } else { + /* + * If we didn't get a journal sequence number we + * can't write this btree node, because recovery + * won't know to ignore this write: + */ + set_btree_node_never_write(b); + } + } + + mutex_unlock(&c->btree_interior_update_lock); + + mark_btree_node_locked_noreset(path, b->c.level, BTREE_NODE_INTENT_LOCKED); + six_unlock_write(&b->c.lock); + + btree_node_write_if_need(c, b, SIX_LOCK_intent); + btree_node_unlock(trans, path, b->c.level); + bch2_path_put(trans, path, true); + } + + bch2_journal_pin_drop(&c->journal, &as->journal); + + mutex_lock(&c->btree_interior_update_lock); + for (i = 0; i < as->nr_new_nodes; i++) { + b = as->new_nodes[i]; + + BUG_ON(b->will_make_reachable != (unsigned long) as); + b->will_make_reachable = 0; + clear_btree_node_will_make_reachable(b); + } + mutex_unlock(&c->btree_interior_update_lock); + + for (i = 0; i < as->nr_new_nodes; i++) { + b = as->new_nodes[i]; + + btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_read); + btree_node_write_if_need(c, b, SIX_LOCK_read); + six_unlock_read(&b->c.lock); + } + + for (i = 0; i < as->nr_open_buckets; i++) + bch2_open_bucket_put(c, c->open_buckets + as->open_buckets[i]); + + bch2_btree_update_free(as, trans); + bch2_trans_put(trans); +} + +static void btree_interior_update_work(struct work_struct *work) +{ + struct bch_fs *c = + container_of(work, struct bch_fs, btree_interior_update_work); + struct btree_update *as; + + while (1) { + mutex_lock(&c->btree_interior_update_lock); + as = list_first_entry_or_null(&c->btree_interior_updates_unwritten, + struct btree_update, unwritten_list); + if (as && !as->nodes_written) + as = NULL; + mutex_unlock(&c->btree_interior_update_lock); + + if (!as) + break; + + btree_update_nodes_written(as); + } +} + +static CLOSURE_CALLBACK(btree_update_set_nodes_written) +{ + closure_type(as, struct btree_update, cl); + struct bch_fs *c = as->c; + + mutex_lock(&c->btree_interior_update_lock); + as->nodes_written = true; + mutex_unlock(&c->btree_interior_update_lock); + + queue_work(c->btree_interior_update_worker, &c->btree_interior_update_work); +} + +/* + * We're updating @b with pointers to nodes that haven't finished writing yet: + * block @b from being written until @as completes + */ +static void btree_update_updated_node(struct btree_update *as, struct btree *b) +{ + struct bch_fs *c = as->c; + + mutex_lock(&c->btree_interior_update_lock); + list_add_tail(&as->unwritten_list, &c->btree_interior_updates_unwritten); + + BUG_ON(as->mode != BTREE_INTERIOR_NO_UPDATE); + BUG_ON(!btree_node_dirty(b)); + BUG_ON(!b->c.level); + + as->mode = BTREE_INTERIOR_UPDATING_NODE; + as->b = b; + + set_btree_node_write_blocked(b); + list_add(&as->write_blocked_list, &b->write_blocked); + + mutex_unlock(&c->btree_interior_update_lock); +} + +static void btree_update_reparent(struct btree_update *as, + struct btree_update *child) +{ + struct bch_fs *c = as->c; + + lockdep_assert_held(&c->btree_interior_update_lock); + + child->b = NULL; + child->mode = BTREE_INTERIOR_UPDATING_AS; + + bch2_journal_pin_copy(&c->journal, &as->journal, &child->journal, NULL); +} + +static void btree_update_updated_root(struct btree_update *as, struct btree *b) +{ + struct bkey_i *insert = &b->key; + struct bch_fs *c = as->c; + + BUG_ON(as->mode != BTREE_INTERIOR_NO_UPDATE); + + BUG_ON(as->journal_u64s + jset_u64s(insert->k.u64s) > + ARRAY_SIZE(as->journal_entries)); + + as->journal_u64s += + journal_entry_set((void *) &as->journal_entries[as->journal_u64s], + BCH_JSET_ENTRY_btree_root, + b->c.btree_id, b->c.level, + insert, insert->k.u64s); + + mutex_lock(&c->btree_interior_update_lock); + list_add_tail(&as->unwritten_list, &c->btree_interior_updates_unwritten); + + as->mode = BTREE_INTERIOR_UPDATING_ROOT; + mutex_unlock(&c->btree_interior_update_lock); +} + +/* + * bch2_btree_update_add_new_node: + * + * This causes @as to wait on @b to be written, before it gets to + * bch2_btree_update_nodes_written + * + * Additionally, it sets b->will_make_reachable to prevent any additional writes + * to @b from happening besides the first until @b is reachable on disk + * + * And it adds @b to the list of @as's new nodes, so that we can update sector + * counts in bch2_btree_update_nodes_written: + */ +static void bch2_btree_update_add_new_node(struct btree_update *as, struct btree *b) +{ + struct bch_fs *c = as->c; + + closure_get(&as->cl); + + mutex_lock(&c->btree_interior_update_lock); + BUG_ON(as->nr_new_nodes >= ARRAY_SIZE(as->new_nodes)); + BUG_ON(b->will_make_reachable); + + as->new_nodes[as->nr_new_nodes++] = b; + b->will_make_reachable = 1UL|(unsigned long) as; + set_btree_node_will_make_reachable(b); + + mutex_unlock(&c->btree_interior_update_lock); + + btree_update_add_key(as, &as->new_keys, b); + + if (b->key.k.type == KEY_TYPE_btree_ptr_v2) { + unsigned bytes = vstruct_end(&b->data->keys) - (void *) b->data; + unsigned sectors = round_up(bytes, block_bytes(c)) >> 9; + + bkey_i_to_btree_ptr_v2(&b->key)->v.sectors_written = + cpu_to_le16(sectors); + } +} + +/* + * returns true if @b was a new node + */ +static void btree_update_drop_new_node(struct bch_fs *c, struct btree *b) +{ + struct btree_update *as; + unsigned long v; + unsigned i; + + mutex_lock(&c->btree_interior_update_lock); + /* + * When b->will_make_reachable != 0, it owns a ref on as->cl that's + * dropped when it gets written by bch2_btree_complete_write - the + * xchg() is for synchronization with bch2_btree_complete_write: + */ + v = xchg(&b->will_make_reachable, 0); + clear_btree_node_will_make_reachable(b); + as = (struct btree_update *) (v & ~1UL); + + if (!as) { + mutex_unlock(&c->btree_interior_update_lock); + return; + } + + for (i = 0; i < as->nr_new_nodes; i++) + if (as->new_nodes[i] == b) + goto found; + + BUG(); +found: + array_remove_item(as->new_nodes, as->nr_new_nodes, i); + mutex_unlock(&c->btree_interior_update_lock); + + if (v & 1) + closure_put(&as->cl); +} + +static void bch2_btree_update_get_open_buckets(struct btree_update *as, struct btree *b) +{ + while (b->ob.nr) + as->open_buckets[as->nr_open_buckets++] = + b->ob.v[--b->ob.nr]; +} + +/* + * @b is being split/rewritten: it may have pointers to not-yet-written btree + * nodes and thus outstanding btree_updates - redirect @b's + * btree_updates to point to this btree_update: + */ +static void bch2_btree_interior_update_will_free_node(struct btree_update *as, + struct btree *b) +{ + struct bch_fs *c = as->c; + struct btree_update *p, *n; + struct btree_write *w; + + set_btree_node_dying(b); + + if (btree_node_fake(b)) + return; + + mutex_lock(&c->btree_interior_update_lock); + + /* + * Does this node have any btree_update operations preventing + * it from being written? + * + * If so, redirect them to point to this btree_update: we can + * write out our new nodes, but we won't make them visible until those + * operations complete + */ + list_for_each_entry_safe(p, n, &b->write_blocked, write_blocked_list) { + list_del_init(&p->write_blocked_list); + btree_update_reparent(as, p); + + /* + * for flush_held_btree_writes() waiting on updates to flush or + * nodes to be writeable: + */ + closure_wake_up(&c->btree_interior_update_wait); + } + + clear_btree_node_dirty_acct(c, b); + clear_btree_node_need_write(b); + clear_btree_node_write_blocked(b); + + /* + * Does this node have unwritten data that has a pin on the journal? + * + * If so, transfer that pin to the btree_update operation - + * note that if we're freeing multiple nodes, we only need to keep the + * oldest pin of any of the nodes we're freeing. We'll release the pin + * when the new nodes are persistent and reachable on disk: + */ + w = btree_current_write(b); + bch2_journal_pin_copy(&c->journal, &as->journal, &w->journal, NULL); + bch2_journal_pin_drop(&c->journal, &w->journal); + + w = btree_prev_write(b); + bch2_journal_pin_copy(&c->journal, &as->journal, &w->journal, NULL); + bch2_journal_pin_drop(&c->journal, &w->journal); + + mutex_unlock(&c->btree_interior_update_lock); + + /* + * Is this a node that isn't reachable on disk yet? + * + * Nodes that aren't reachable yet have writes blocked until they're + * reachable - now that we've cancelled any pending writes and moved + * things waiting on that write to wait on this update, we can drop this + * node from the list of nodes that the other update is making + * reachable, prior to freeing it: + */ + btree_update_drop_new_node(c, b); + + btree_update_add_key(as, &as->old_keys, b); + + as->old_nodes[as->nr_old_nodes] = b; + as->old_nodes_seq[as->nr_old_nodes] = b->data->keys.seq; + as->nr_old_nodes++; +} + +static void bch2_btree_update_done(struct btree_update *as, struct btree_trans *trans) +{ + struct bch_fs *c = as->c; + u64 start_time = as->start_time; + + BUG_ON(as->mode == BTREE_INTERIOR_NO_UPDATE); + + if (as->took_gc_lock) + up_read(&as->c->gc_lock); + as->took_gc_lock = false; + + bch2_btree_reserve_put(as, trans); + + continue_at(&as->cl, btree_update_set_nodes_written, + as->c->btree_interior_update_worker); + + bch2_time_stats_update(&c->times[BCH_TIME_btree_interior_update_foreground], + start_time); +} + +static struct btree_update * +bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path, + unsigned level, bool split, unsigned flags) +{ + struct bch_fs *c = trans->c; + struct btree_update *as; + u64 start_time = local_clock(); + int disk_res_flags = (flags & BTREE_INSERT_NOFAIL) + ? BCH_DISK_RESERVATION_NOFAIL : 0; + unsigned nr_nodes[2] = { 0, 0 }; + unsigned update_level = level; + enum bch_watermark watermark = flags & BCH_WATERMARK_MASK; + int ret = 0; + u32 restart_count = trans->restart_count; + + BUG_ON(!path->should_be_locked); + + if (watermark == BCH_WATERMARK_copygc) + watermark = BCH_WATERMARK_btree_copygc; + if (watermark < BCH_WATERMARK_btree) + watermark = BCH_WATERMARK_btree; + + flags &= ~BCH_WATERMARK_MASK; + flags |= watermark; + + if (!(flags & BTREE_INSERT_JOURNAL_RECLAIM) && + watermark < c->journal.watermark) { + struct journal_res res = { 0 }; + + ret = drop_locks_do(trans, + bch2_journal_res_get(&c->journal, &res, 1, + watermark|JOURNAL_RES_GET_CHECK)); + if (ret) + return ERR_PTR(ret); + } + + while (1) { + nr_nodes[!!update_level] += 1 + split; + update_level++; + + ret = bch2_btree_path_upgrade(trans, path, update_level + 1); + if (ret) + return ERR_PTR(ret); + + if (!btree_path_node(path, update_level)) { + /* Allocating new root? */ + nr_nodes[1] += split; + update_level = BTREE_MAX_DEPTH; + break; + } + + /* + * Always check for space for two keys, even if we won't have to + * split at prior level - it might have been a merge instead: + */ + if (bch2_btree_node_insert_fits(c, path->l[update_level].b, + BKEY_BTREE_PTR_U64s_MAX * 2)) + break; + + split = path->l[update_level].b->nr.live_u64s > BTREE_SPLIT_THRESHOLD(c); + } + + if (flags & BTREE_INSERT_GC_LOCK_HELD) + lockdep_assert_held(&c->gc_lock); + else if (!down_read_trylock(&c->gc_lock)) { + ret = drop_locks_do(trans, (down_read(&c->gc_lock), 0)); + if (ret) { + up_read(&c->gc_lock); + return ERR_PTR(ret); + } + } + + as = mempool_alloc(&c->btree_interior_update_pool, GFP_NOFS); + memset(as, 0, sizeof(*as)); + closure_init(&as->cl, NULL); + as->c = c; + as->start_time = start_time; + as->mode = BTREE_INTERIOR_NO_UPDATE; + as->took_gc_lock = !(flags & BTREE_INSERT_GC_LOCK_HELD); + as->btree_id = path->btree_id; + as->update_level = update_level; + INIT_LIST_HEAD(&as->list); + INIT_LIST_HEAD(&as->unwritten_list); + INIT_LIST_HEAD(&as->write_blocked_list); + bch2_keylist_init(&as->old_keys, as->_old_keys); + bch2_keylist_init(&as->new_keys, as->_new_keys); + bch2_keylist_init(&as->parent_keys, as->inline_keys); + + mutex_lock(&c->btree_interior_update_lock); + list_add_tail(&as->list, &c->btree_interior_update_list); + mutex_unlock(&c->btree_interior_update_lock); + + /* + * We don't want to allocate if we're in an error state, that can cause + * deadlock on emergency shutdown due to open buckets getting stuck in + * the btree_reserve_cache after allocator shutdown has cleared it out. + * This check needs to come after adding us to the btree_interior_update + * list but before calling bch2_btree_reserve_get, to synchronize with + * __bch2_fs_read_only(). + */ + ret = bch2_journal_error(&c->journal); + if (ret) + goto err; + + ret = bch2_disk_reservation_get(c, &as->disk_res, + (nr_nodes[0] + nr_nodes[1]) * btree_sectors(c), + c->opts.metadata_replicas, + disk_res_flags); + if (ret) + goto err; + + ret = bch2_btree_reserve_get(trans, as, nr_nodes, flags, NULL); + if (bch2_err_matches(ret, ENOSPC) || + bch2_err_matches(ret, ENOMEM)) { + struct closure cl; + + /* + * XXX: this should probably be a separate BTREE_INSERT_NONBLOCK + * flag + */ + if (bch2_err_matches(ret, ENOSPC) && + (flags & BTREE_INSERT_JOURNAL_RECLAIM) && + watermark != BCH_WATERMARK_reclaim) { + ret = -BCH_ERR_journal_reclaim_would_deadlock; + goto err; + } + + closure_init_stack(&cl); + + do { + ret = bch2_btree_reserve_get(trans, as, nr_nodes, flags, &cl); + + bch2_trans_unlock(trans); + closure_sync(&cl); + } while (bch2_err_matches(ret, BCH_ERR_operation_blocked)); + } + + if (ret) { + trace_and_count(c, btree_reserve_get_fail, trans->fn, + _RET_IP_, nr_nodes[0] + nr_nodes[1], ret); + goto err; + } + + ret = bch2_trans_relock(trans); + if (ret) + goto err; + + bch2_trans_verify_not_restarted(trans, restart_count); + return as; +err: + bch2_btree_update_free(as, trans); + return ERR_PTR(ret); +} + +/* Btree root updates: */ + +static void bch2_btree_set_root_inmem(struct bch_fs *c, struct btree *b) +{ + /* Root nodes cannot be reaped */ + mutex_lock(&c->btree_cache.lock); + list_del_init(&b->list); + mutex_unlock(&c->btree_cache.lock); + + mutex_lock(&c->btree_root_lock); + BUG_ON(btree_node_root(c, b) && + (b->c.level < btree_node_root(c, b)->c.level || + !btree_node_dying(btree_node_root(c, b)))); + + bch2_btree_id_root(c, b->c.btree_id)->b = b; + mutex_unlock(&c->btree_root_lock); + + bch2_recalc_btree_reserve(c); +} + +static void bch2_btree_set_root(struct btree_update *as, + struct btree_trans *trans, + struct btree_path *path, + struct btree *b) +{ + struct bch_fs *c = as->c; + struct btree *old; + + trace_and_count(c, btree_node_set_root, c, b); + + old = btree_node_root(c, b); + + /* + * Ensure no one is using the old root while we switch to the + * new root: + */ + bch2_btree_node_lock_write_nofail(trans, path, &old->c); + + bch2_btree_set_root_inmem(c, b); + + btree_update_updated_root(as, b); + + /* + * Unlock old root after new root is visible: + * + * The new root isn't persistent, but that's ok: we still have + * an intent lock on the new root, and any updates that would + * depend on the new root would have to update the new root. + */ + bch2_btree_node_unlock_write(trans, path, old); +} + +/* Interior node updates: */ + +static void bch2_insert_fixup_btree_ptr(struct btree_update *as, + struct btree_trans *trans, + struct btree_path *path, + struct btree *b, + struct btree_node_iter *node_iter, + struct bkey_i *insert) +{ + struct bch_fs *c = as->c; + struct bkey_packed *k; + struct printbuf buf = PRINTBUF; + unsigned long old, new, v; + + BUG_ON(insert->k.type == KEY_TYPE_btree_ptr_v2 && + !btree_ptr_sectors_written(insert)); + + if (unlikely(!test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags))) + bch2_journal_key_overwritten(c, b->c.btree_id, b->c.level, insert->k.p); + + if (bch2_bkey_invalid(c, bkey_i_to_s_c(insert), + btree_node_type(b), WRITE, &buf) ?: + bch2_bkey_in_btree_node(c, b, bkey_i_to_s_c(insert), &buf)) { + printbuf_reset(&buf); + prt_printf(&buf, "inserting invalid bkey\n "); + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(insert)); + prt_printf(&buf, "\n "); + bch2_bkey_invalid(c, bkey_i_to_s_c(insert), + btree_node_type(b), WRITE, &buf); + bch2_bkey_in_btree_node(c, b, bkey_i_to_s_c(insert), &buf); + + bch2_fs_inconsistent(c, "%s", buf.buf); + dump_stack(); + } + + BUG_ON(as->journal_u64s + jset_u64s(insert->k.u64s) > + ARRAY_SIZE(as->journal_entries)); + + as->journal_u64s += + journal_entry_set((void *) &as->journal_entries[as->journal_u64s], + BCH_JSET_ENTRY_btree_keys, + b->c.btree_id, b->c.level, + insert, insert->k.u64s); + + while ((k = bch2_btree_node_iter_peek_all(node_iter, b)) && + bkey_iter_pos_cmp(b, k, &insert->k.p) < 0) + bch2_btree_node_iter_advance(node_iter, b); + + bch2_btree_bset_insert_key(trans, path, b, node_iter, insert); + set_btree_node_dirty_acct(c, b); + + v = READ_ONCE(b->flags); + do { + old = new = v; + + new &= ~BTREE_WRITE_TYPE_MASK; + new |= BTREE_WRITE_interior; + new |= 1 << BTREE_NODE_need_write; + } while ((v = cmpxchg(&b->flags, old, new)) != old); + + printbuf_exit(&buf); +} + +static void +__bch2_btree_insert_keys_interior(struct btree_update *as, + struct btree_trans *trans, + struct btree_path *path, + struct btree *b, + struct btree_node_iter node_iter, + struct keylist *keys) +{ + struct bkey_i *insert = bch2_keylist_front(keys); + struct bkey_packed *k; + + BUG_ON(btree_node_type(b) != BKEY_TYPE_btree); + + while ((k = bch2_btree_node_iter_prev_all(&node_iter, b)) && + (bkey_cmp_left_packed(b, k, &insert->k.p) >= 0)) + ; + + while (!bch2_keylist_empty(keys)) { + insert = bch2_keylist_front(keys); + + if (bpos_gt(insert->k.p, b->key.k.p)) + break; + + bch2_insert_fixup_btree_ptr(as, trans, path, b, &node_iter, insert); + bch2_keylist_pop_front(keys); + } +} + +/* + * Move keys from n1 (original replacement node, now lower node) to n2 (higher + * node) + */ +static void __btree_split_node(struct btree_update *as, + struct btree_trans *trans, + struct btree *b, + struct btree *n[2]) +{ + struct bkey_packed *k; + struct bpos n1_pos = POS_MIN; + struct btree_node_iter iter; + struct bset *bsets[2]; + struct bkey_format_state format[2]; + struct bkey_packed *out[2]; + struct bkey uk; + unsigned u64s, n1_u64s = (b->nr.live_u64s * 3) / 5; + struct { unsigned nr_keys, val_u64s; } nr_keys[2]; + int i; + + memset(&nr_keys, 0, sizeof(nr_keys)); + + for (i = 0; i < 2; i++) { + BUG_ON(n[i]->nsets != 1); + + bsets[i] = btree_bset_first(n[i]); + out[i] = bsets[i]->start; + + SET_BTREE_NODE_SEQ(n[i]->data, BTREE_NODE_SEQ(b->data) + 1); + bch2_bkey_format_init(&format[i]); + } + + u64s = 0; + for_each_btree_node_key(b, k, &iter) { + if (bkey_deleted(k)) + continue; + + i = u64s >= n1_u64s; + u64s += k->u64s; + uk = bkey_unpack_key(b, k); + if (!i) + n1_pos = uk.p; + bch2_bkey_format_add_key(&format[i], &uk); + + nr_keys[i].nr_keys++; + nr_keys[i].val_u64s += bkeyp_val_u64s(&b->format, k); + } + + btree_set_min(n[0], b->data->min_key); + btree_set_max(n[0], n1_pos); + btree_set_min(n[1], bpos_successor(n1_pos)); + btree_set_max(n[1], b->data->max_key); + + for (i = 0; i < 2; i++) { + bch2_bkey_format_add_pos(&format[i], n[i]->data->min_key); + bch2_bkey_format_add_pos(&format[i], n[i]->data->max_key); + + n[i]->data->format = bch2_bkey_format_done(&format[i]); + + unsigned u64s = nr_keys[i].nr_keys * n[i]->data->format.key_u64s + + nr_keys[i].val_u64s; + if (__vstruct_bytes(struct btree_node, u64s) > btree_bytes(as->c)) + n[i]->data->format = b->format; + + btree_node_set_format(n[i], n[i]->data->format); + } + + u64s = 0; + for_each_btree_node_key(b, k, &iter) { + if (bkey_deleted(k)) + continue; + + i = u64s >= n1_u64s; + u64s += k->u64s; + + if (bch2_bkey_transform(&n[i]->format, out[i], bkey_packed(k) + ? &b->format: &bch2_bkey_format_current, k)) + out[i]->format = KEY_FORMAT_LOCAL_BTREE; + else + bch2_bkey_unpack(b, (void *) out[i], k); + + out[i]->needs_whiteout = false; + + btree_keys_account_key_add(&n[i]->nr, 0, out[i]); + out[i] = bkey_p_next(out[i]); + } + + for (i = 0; i < 2; i++) { + bsets[i]->u64s = cpu_to_le16((u64 *) out[i] - bsets[i]->_data); + + BUG_ON(!bsets[i]->u64s); + + set_btree_bset_end(n[i], n[i]->set); + + btree_node_reset_sib_u64s(n[i]); + + bch2_verify_btree_nr_keys(n[i]); + + if (b->c.level) + btree_node_interior_verify(as->c, n[i]); + } +} + +/* + * For updates to interior nodes, we've got to do the insert before we split + * because the stuff we're inserting has to be inserted atomically. Post split, + * the keys might have to go in different nodes and the split would no longer be + * atomic. + * + * Worse, if the insert is from btree node coalescing, if we do the insert after + * we do the split (and pick the pivot) - the pivot we pick might be between + * nodes that were coalesced, and thus in the middle of a child node post + * coalescing: + */ +static void btree_split_insert_keys(struct btree_update *as, + struct btree_trans *trans, + struct btree_path *path, + struct btree *b, + struct keylist *keys) +{ + if (!bch2_keylist_empty(keys) && + bpos_le(bch2_keylist_front(keys)->k.p, b->data->max_key)) { + struct btree_node_iter node_iter; + + bch2_btree_node_iter_init(&node_iter, b, &bch2_keylist_front(keys)->k.p); + + __bch2_btree_insert_keys_interior(as, trans, path, b, node_iter, keys); + + btree_node_interior_verify(as->c, b); + } +} + +static int btree_split(struct btree_update *as, struct btree_trans *trans, + struct btree_path *path, struct btree *b, + struct keylist *keys, unsigned flags) +{ + struct bch_fs *c = as->c; + struct btree *parent = btree_node_parent(path, b); + struct btree *n1, *n2 = NULL, *n3 = NULL; + struct btree_path *path1 = NULL, *path2 = NULL; + u64 start_time = local_clock(); + int ret = 0; + + BUG_ON(!parent && (b != btree_node_root(c, b))); + BUG_ON(parent && !btree_node_intent_locked(path, b->c.level + 1)); + + bch2_btree_interior_update_will_free_node(as, b); + + if (b->nr.live_u64s > BTREE_SPLIT_THRESHOLD(c)) { + struct btree *n[2]; + + trace_and_count(c, btree_node_split, c, b); + + n[0] = n1 = bch2_btree_node_alloc(as, trans, b->c.level); + n[1] = n2 = bch2_btree_node_alloc(as, trans, b->c.level); + + __btree_split_node(as, trans, b, n); + + if (keys) { + btree_split_insert_keys(as, trans, path, n1, keys); + btree_split_insert_keys(as, trans, path, n2, keys); + BUG_ON(!bch2_keylist_empty(keys)); + } + + bch2_btree_build_aux_trees(n2); + bch2_btree_build_aux_trees(n1); + + bch2_btree_update_add_new_node(as, n1); + bch2_btree_update_add_new_node(as, n2); + six_unlock_write(&n2->c.lock); + six_unlock_write(&n1->c.lock); + + path1 = get_unlocked_mut_path(trans, path->btree_id, n1->c.level, n1->key.k.p); + six_lock_increment(&n1->c.lock, SIX_LOCK_intent); + mark_btree_node_locked(trans, path1, n1->c.level, BTREE_NODE_INTENT_LOCKED); + bch2_btree_path_level_init(trans, path1, n1); + + path2 = get_unlocked_mut_path(trans, path->btree_id, n2->c.level, n2->key.k.p); + six_lock_increment(&n2->c.lock, SIX_LOCK_intent); + mark_btree_node_locked(trans, path2, n2->c.level, BTREE_NODE_INTENT_LOCKED); + bch2_btree_path_level_init(trans, path2, n2); + + /* + * Note that on recursive parent_keys == keys, so we + * can't start adding new keys to parent_keys before emptying it + * out (which we did with btree_split_insert_keys() above) + */ + bch2_keylist_add(&as->parent_keys, &n1->key); + bch2_keylist_add(&as->parent_keys, &n2->key); + + if (!parent) { + /* Depth increases, make a new root */ + n3 = __btree_root_alloc(as, trans, b->c.level + 1); + + bch2_btree_update_add_new_node(as, n3); + six_unlock_write(&n3->c.lock); + + path2->locks_want++; + BUG_ON(btree_node_locked(path2, n3->c.level)); + six_lock_increment(&n3->c.lock, SIX_LOCK_intent); + mark_btree_node_locked(trans, path2, n3->c.level, BTREE_NODE_INTENT_LOCKED); + bch2_btree_path_level_init(trans, path2, n3); + + n3->sib_u64s[0] = U16_MAX; + n3->sib_u64s[1] = U16_MAX; + + btree_split_insert_keys(as, trans, path, n3, &as->parent_keys); + } + } else { + trace_and_count(c, btree_node_compact, c, b); + + n1 = bch2_btree_node_alloc_replacement(as, trans, b); + + if (keys) { + btree_split_insert_keys(as, trans, path, n1, keys); + BUG_ON(!bch2_keylist_empty(keys)); + } + + bch2_btree_build_aux_trees(n1); + bch2_btree_update_add_new_node(as, n1); + six_unlock_write(&n1->c.lock); + + path1 = get_unlocked_mut_path(trans, path->btree_id, n1->c.level, n1->key.k.p); + six_lock_increment(&n1->c.lock, SIX_LOCK_intent); + mark_btree_node_locked(trans, path1, n1->c.level, BTREE_NODE_INTENT_LOCKED); + bch2_btree_path_level_init(trans, path1, n1); + + if (parent) + bch2_keylist_add(&as->parent_keys, &n1->key); + } + + /* New nodes all written, now make them visible: */ + + if (parent) { + /* Split a non root node */ + ret = bch2_btree_insert_node(as, trans, path, parent, &as->parent_keys, flags); + if (ret) + goto err; + } else if (n3) { + bch2_btree_set_root(as, trans, path, n3); + } else { + /* Root filled up but didn't need to be split */ + bch2_btree_set_root(as, trans, path, n1); + } + + if (n3) { + bch2_btree_update_get_open_buckets(as, n3); + bch2_btree_node_write(c, n3, SIX_LOCK_intent, 0); + } + if (n2) { + bch2_btree_update_get_open_buckets(as, n2); + bch2_btree_node_write(c, n2, SIX_LOCK_intent, 0); + } + bch2_btree_update_get_open_buckets(as, n1); + bch2_btree_node_write(c, n1, SIX_LOCK_intent, 0); + + /* + * The old node must be freed (in memory) _before_ unlocking the new + * nodes - else another thread could re-acquire a read lock on the old + * node after another thread has locked and updated the new node, thus + * seeing stale data: + */ + bch2_btree_node_free_inmem(trans, path, b); + + if (n3) + bch2_trans_node_add(trans, n3); + if (n2) + bch2_trans_node_add(trans, n2); + bch2_trans_node_add(trans, n1); + + if (n3) + six_unlock_intent(&n3->c.lock); + if (n2) + six_unlock_intent(&n2->c.lock); + six_unlock_intent(&n1->c.lock); +out: + if (path2) { + __bch2_btree_path_unlock(trans, path2); + bch2_path_put(trans, path2, true); + } + if (path1) { + __bch2_btree_path_unlock(trans, path1); + bch2_path_put(trans, path1, true); + } + + bch2_trans_verify_locks(trans); + + bch2_time_stats_update(&c->times[n2 + ? BCH_TIME_btree_node_split + : BCH_TIME_btree_node_compact], + start_time); + return ret; +err: + if (n3) + bch2_btree_node_free_never_used(as, trans, n3); + if (n2) + bch2_btree_node_free_never_used(as, trans, n2); + bch2_btree_node_free_never_used(as, trans, n1); + goto out; +} + +static void +bch2_btree_insert_keys_interior(struct btree_update *as, + struct btree_trans *trans, + struct btree_path *path, + struct btree *b, + struct keylist *keys) +{ + struct btree_path *linked; + + __bch2_btree_insert_keys_interior(as, trans, path, b, + path->l[b->c.level].iter, keys); + + btree_update_updated_node(as, b); + + trans_for_each_path_with_node(trans, b, linked) + bch2_btree_node_iter_peek(&linked->l[b->c.level].iter, b); + + bch2_trans_verify_paths(trans); +} + +/** + * bch2_btree_insert_node - insert bkeys into a given btree node + * + * @as: btree_update object + * @trans: btree_trans object + * @path: path that points to current node + * @b: node to insert keys into + * @keys: list of keys to insert + * @flags: transaction commit flags + * + * Returns: 0 on success, typically transaction restart error on failure + * + * Inserts as many keys as it can into a given btree node, splitting it if full. + * If a split occurred, this function will return early. This can only happen + * for leaf nodes -- inserts into interior nodes have to be atomic. + */ +static int bch2_btree_insert_node(struct btree_update *as, struct btree_trans *trans, + struct btree_path *path, struct btree *b, + struct keylist *keys, unsigned flags) +{ + struct bch_fs *c = as->c; + int old_u64s = le16_to_cpu(btree_bset_last(b)->u64s); + int old_live_u64s = b->nr.live_u64s; + int live_u64s_added, u64s_added; + int ret; + + lockdep_assert_held(&c->gc_lock); + BUG_ON(!btree_node_intent_locked(path, b->c.level)); + BUG_ON(!b->c.level); + BUG_ON(!as || as->b); + bch2_verify_keylist_sorted(keys); + + ret = bch2_btree_node_lock_write(trans, path, &b->c); + if (ret) + return ret; + + bch2_btree_node_prep_for_write(trans, path, b); + + if (!bch2_btree_node_insert_fits(c, b, bch2_keylist_u64s(keys))) { + bch2_btree_node_unlock_write(trans, path, b); + goto split; + } + + btree_node_interior_verify(c, b); + + bch2_btree_insert_keys_interior(as, trans, path, b, keys); + + live_u64s_added = (int) b->nr.live_u64s - old_live_u64s; + u64s_added = (int) le16_to_cpu(btree_bset_last(b)->u64s) - old_u64s; + + if (b->sib_u64s[0] != U16_MAX && live_u64s_added < 0) + b->sib_u64s[0] = max(0, (int) b->sib_u64s[0] + live_u64s_added); + if (b->sib_u64s[1] != U16_MAX && live_u64s_added < 0) + b->sib_u64s[1] = max(0, (int) b->sib_u64s[1] + live_u64s_added); + + if (u64s_added > live_u64s_added && + bch2_maybe_compact_whiteouts(c, b)) + bch2_trans_node_reinit_iter(trans, b); + + bch2_btree_node_unlock_write(trans, path, b); + + btree_node_interior_verify(c, b); + return 0; +split: + /* + * We could attempt to avoid the transaction restart, by calling + * bch2_btree_path_upgrade() and allocating more nodes: + */ + if (b->c.level >= as->update_level) { + trace_and_count(c, trans_restart_split_race, trans, _THIS_IP_, b); + return btree_trans_restart(trans, BCH_ERR_transaction_restart_split_race); + } + + return btree_split(as, trans, path, b, keys, flags); +} + +int bch2_btree_split_leaf(struct btree_trans *trans, + struct btree_path *path, + unsigned flags) +{ + struct btree *b = path_l(path)->b; + struct btree_update *as; + unsigned l; + int ret = 0; + + as = bch2_btree_update_start(trans, path, path->level, + true, flags); + if (IS_ERR(as)) + return PTR_ERR(as); + + ret = btree_split(as, trans, path, b, NULL, flags); + if (ret) { + bch2_btree_update_free(as, trans); + return ret; + } + + bch2_btree_update_done(as, trans); + + for (l = path->level + 1; btree_node_intent_locked(path, l) && !ret; l++) + ret = bch2_foreground_maybe_merge(trans, path, l, flags); + + return ret; +} + +int __bch2_foreground_maybe_merge(struct btree_trans *trans, + struct btree_path *path, + unsigned level, + unsigned flags, + enum btree_node_sibling sib) +{ + struct bch_fs *c = trans->c; + struct btree_path *sib_path = NULL, *new_path = NULL; + struct btree_update *as; + struct bkey_format_state new_s; + struct bkey_format new_f; + struct bkey_i delete; + struct btree *b, *m, *n, *prev, *next, *parent; + struct bpos sib_pos; + size_t sib_u64s; + u64 start_time = local_clock(); + int ret = 0; + + BUG_ON(!path->should_be_locked); + BUG_ON(!btree_node_locked(path, level)); + + b = path->l[level].b; + + if ((sib == btree_prev_sib && bpos_eq(b->data->min_key, POS_MIN)) || + (sib == btree_next_sib && bpos_eq(b->data->max_key, SPOS_MAX))) { + b->sib_u64s[sib] = U16_MAX; + return 0; + } + + sib_pos = sib == btree_prev_sib + ? bpos_predecessor(b->data->min_key) + : bpos_successor(b->data->max_key); + + sib_path = bch2_path_get(trans, path->btree_id, sib_pos, + U8_MAX, level, BTREE_ITER_INTENT, _THIS_IP_); + ret = bch2_btree_path_traverse(trans, sib_path, false); + if (ret) + goto err; + + btree_path_set_should_be_locked(sib_path); + + m = sib_path->l[level].b; + + if (btree_node_parent(path, b) != + btree_node_parent(sib_path, m)) { + b->sib_u64s[sib] = U16_MAX; + goto out; + } + + if (sib == btree_prev_sib) { + prev = m; + next = b; + } else { + prev = b; + next = m; + } + + if (!bpos_eq(bpos_successor(prev->data->max_key), next->data->min_key)) { + struct printbuf buf1 = PRINTBUF, buf2 = PRINTBUF; + + bch2_bpos_to_text(&buf1, prev->data->max_key); + bch2_bpos_to_text(&buf2, next->data->min_key); + bch_err(c, + "%s(): btree topology error:\n" + " prev ends at %s\n" + " next starts at %s", + __func__, buf1.buf, buf2.buf); + printbuf_exit(&buf1); + printbuf_exit(&buf2); + bch2_topology_error(c); + ret = -EIO; + goto err; + } + + bch2_bkey_format_init(&new_s); + bch2_bkey_format_add_pos(&new_s, prev->data->min_key); + __bch2_btree_calc_format(&new_s, prev); + __bch2_btree_calc_format(&new_s, next); + bch2_bkey_format_add_pos(&new_s, next->data->max_key); + new_f = bch2_bkey_format_done(&new_s); + + sib_u64s = btree_node_u64s_with_format(b->nr, &b->format, &new_f) + + btree_node_u64s_with_format(m->nr, &m->format, &new_f); + + if (sib_u64s > BTREE_FOREGROUND_MERGE_HYSTERESIS(c)) { + sib_u64s -= BTREE_FOREGROUND_MERGE_HYSTERESIS(c); + sib_u64s /= 2; + sib_u64s += BTREE_FOREGROUND_MERGE_HYSTERESIS(c); + } + + sib_u64s = min(sib_u64s, btree_max_u64s(c)); + sib_u64s = min(sib_u64s, (size_t) U16_MAX - 1); + b->sib_u64s[sib] = sib_u64s; + + if (b->sib_u64s[sib] > c->btree_foreground_merge_threshold) + goto out; + + parent = btree_node_parent(path, b); + as = bch2_btree_update_start(trans, path, level, false, + BTREE_INSERT_NOFAIL|flags); + ret = PTR_ERR_OR_ZERO(as); + if (ret) + goto err; + + trace_and_count(c, btree_node_merge, c, b); + + bch2_btree_interior_update_will_free_node(as, b); + bch2_btree_interior_update_will_free_node(as, m); + + n = bch2_btree_node_alloc(as, trans, b->c.level); + + SET_BTREE_NODE_SEQ(n->data, + max(BTREE_NODE_SEQ(b->data), + BTREE_NODE_SEQ(m->data)) + 1); + + btree_set_min(n, prev->data->min_key); + btree_set_max(n, next->data->max_key); + + n->data->format = new_f; + btree_node_set_format(n, new_f); + + bch2_btree_sort_into(c, n, prev); + bch2_btree_sort_into(c, n, next); + + bch2_btree_build_aux_trees(n); + bch2_btree_update_add_new_node(as, n); + six_unlock_write(&n->c.lock); + + new_path = get_unlocked_mut_path(trans, path->btree_id, n->c.level, n->key.k.p); + six_lock_increment(&n->c.lock, SIX_LOCK_intent); + mark_btree_node_locked(trans, new_path, n->c.level, BTREE_NODE_INTENT_LOCKED); + bch2_btree_path_level_init(trans, new_path, n); + + bkey_init(&delete.k); + delete.k.p = prev->key.k.p; + bch2_keylist_add(&as->parent_keys, &delete); + bch2_keylist_add(&as->parent_keys, &n->key); + + bch2_trans_verify_paths(trans); + + ret = bch2_btree_insert_node(as, trans, path, parent, &as->parent_keys, flags); + if (ret) + goto err_free_update; + + bch2_trans_verify_paths(trans); + + bch2_btree_update_get_open_buckets(as, n); + bch2_btree_node_write(c, n, SIX_LOCK_intent, 0); + + bch2_btree_node_free_inmem(trans, path, b); + bch2_btree_node_free_inmem(trans, sib_path, m); + + bch2_trans_node_add(trans, n); + + bch2_trans_verify_paths(trans); + + six_unlock_intent(&n->c.lock); + + bch2_btree_update_done(as, trans); + + bch2_time_stats_update(&c->times[BCH_TIME_btree_node_merge], start_time); +out: +err: + if (new_path) + bch2_path_put(trans, new_path, true); + bch2_path_put(trans, sib_path, true); + bch2_trans_verify_locks(trans); + return ret; +err_free_update: + bch2_btree_node_free_never_used(as, trans, n); + bch2_btree_update_free(as, trans); + goto out; +} + +int bch2_btree_node_rewrite(struct btree_trans *trans, + struct btree_iter *iter, + struct btree *b, + unsigned flags) +{ + struct bch_fs *c = trans->c; + struct btree_path *new_path = NULL; + struct btree *n, *parent; + struct btree_update *as; + int ret; + + flags |= BTREE_INSERT_NOFAIL; + + parent = btree_node_parent(iter->path, b); + as = bch2_btree_update_start(trans, iter->path, b->c.level, + false, flags); + ret = PTR_ERR_OR_ZERO(as); + if (ret) + goto out; + + bch2_btree_interior_update_will_free_node(as, b); + + n = bch2_btree_node_alloc_replacement(as, trans, b); + + bch2_btree_build_aux_trees(n); + bch2_btree_update_add_new_node(as, n); + six_unlock_write(&n->c.lock); + + new_path = get_unlocked_mut_path(trans, iter->btree_id, n->c.level, n->key.k.p); + six_lock_increment(&n->c.lock, SIX_LOCK_intent); + mark_btree_node_locked(trans, new_path, n->c.level, BTREE_NODE_INTENT_LOCKED); + bch2_btree_path_level_init(trans, new_path, n); + + trace_and_count(c, btree_node_rewrite, c, b); + + if (parent) { + bch2_keylist_add(&as->parent_keys, &n->key); + ret = bch2_btree_insert_node(as, trans, iter->path, parent, + &as->parent_keys, flags); + if (ret) + goto err; + } else { + bch2_btree_set_root(as, trans, iter->path, n); + } + + bch2_btree_update_get_open_buckets(as, n); + bch2_btree_node_write(c, n, SIX_LOCK_intent, 0); + + bch2_btree_node_free_inmem(trans, iter->path, b); + + bch2_trans_node_add(trans, n); + six_unlock_intent(&n->c.lock); + + bch2_btree_update_done(as, trans); +out: + if (new_path) + bch2_path_put(trans, new_path, true); + bch2_trans_downgrade(trans); + return ret; +err: + bch2_btree_node_free_never_used(as, trans, n); + bch2_btree_update_free(as, trans); + goto out; +} + +struct async_btree_rewrite { + struct bch_fs *c; + struct work_struct work; + struct list_head list; + enum btree_id btree_id; + unsigned level; + struct bpos pos; + __le64 seq; +}; + +static int async_btree_node_rewrite_trans(struct btree_trans *trans, + struct async_btree_rewrite *a) +{ + struct bch_fs *c = trans->c; + struct btree_iter iter; + struct btree *b; + int ret; + + bch2_trans_node_iter_init(trans, &iter, a->btree_id, a->pos, + BTREE_MAX_DEPTH, a->level, 0); + b = bch2_btree_iter_peek_node(&iter); + ret = PTR_ERR_OR_ZERO(b); + if (ret) + goto out; + + if (!b || b->data->keys.seq != a->seq) { + struct printbuf buf = PRINTBUF; + + if (b) + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key)); + else + prt_str(&buf, "(null"); + bch_info(c, "%s: node to rewrite not found:, searching for seq %llu, got\n%s", + __func__, a->seq, buf.buf); + printbuf_exit(&buf); + goto out; + } + + ret = bch2_btree_node_rewrite(trans, &iter, b, 0); +out: + bch2_trans_iter_exit(trans, &iter); + + return ret; +} + +static void async_btree_node_rewrite_work(struct work_struct *work) +{ + struct async_btree_rewrite *a = + container_of(work, struct async_btree_rewrite, work); + struct bch_fs *c = a->c; + int ret; + + ret = bch2_trans_do(c, NULL, NULL, 0, + async_btree_node_rewrite_trans(trans, a)); + if (ret) + bch_err_fn(c, ret); + bch2_write_ref_put(c, BCH_WRITE_REF_node_rewrite); + kfree(a); +} + +void bch2_btree_node_rewrite_async(struct bch_fs *c, struct btree *b) +{ + struct async_btree_rewrite *a; + int ret; + + a = kmalloc(sizeof(*a), GFP_NOFS); + if (!a) { + bch_err(c, "%s: error allocating memory", __func__); + return; + } + + a->c = c; + a->btree_id = b->c.btree_id; + a->level = b->c.level; + a->pos = b->key.k.p; + a->seq = b->data->keys.seq; + INIT_WORK(&a->work, async_btree_node_rewrite_work); + + if (unlikely(!test_bit(BCH_FS_MAY_GO_RW, &c->flags))) { + mutex_lock(&c->pending_node_rewrites_lock); + list_add(&a->list, &c->pending_node_rewrites); + mutex_unlock(&c->pending_node_rewrites_lock); + return; + } + + if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_node_rewrite)) { + if (test_bit(BCH_FS_STARTED, &c->flags)) { + bch_err(c, "%s: error getting c->writes ref", __func__); + kfree(a); + return; + } + + ret = bch2_fs_read_write_early(c); + if (ret) { + bch_err_msg(c, ret, "going read-write"); + kfree(a); + return; + } + + bch2_write_ref_get(c, BCH_WRITE_REF_node_rewrite); + } + + queue_work(c->btree_interior_update_worker, &a->work); +} + +void bch2_do_pending_node_rewrites(struct bch_fs *c) +{ + struct async_btree_rewrite *a, *n; + + mutex_lock(&c->pending_node_rewrites_lock); + list_for_each_entry_safe(a, n, &c->pending_node_rewrites, list) { + list_del(&a->list); + + bch2_write_ref_get(c, BCH_WRITE_REF_node_rewrite); + queue_work(c->btree_interior_update_worker, &a->work); + } + mutex_unlock(&c->pending_node_rewrites_lock); +} + +void bch2_free_pending_node_rewrites(struct bch_fs *c) +{ + struct async_btree_rewrite *a, *n; + + mutex_lock(&c->pending_node_rewrites_lock); + list_for_each_entry_safe(a, n, &c->pending_node_rewrites, list) { + list_del(&a->list); + + kfree(a); + } + mutex_unlock(&c->pending_node_rewrites_lock); +} + +static int __bch2_btree_node_update_key(struct btree_trans *trans, + struct btree_iter *iter, + struct btree *b, struct btree *new_hash, + struct bkey_i *new_key, + unsigned commit_flags, + bool skip_triggers) +{ + struct bch_fs *c = trans->c; + struct btree_iter iter2 = { NULL }; + struct btree *parent; + int ret; + + if (!skip_triggers) { + ret = bch2_trans_mark_old(trans, b->c.btree_id, b->c.level + 1, + bkey_i_to_s_c(&b->key), 0); + if (ret) + return ret; + + ret = bch2_trans_mark_new(trans, b->c.btree_id, b->c.level + 1, + new_key, 0); + if (ret) + return ret; + } + + if (new_hash) { + bkey_copy(&new_hash->key, new_key); + ret = bch2_btree_node_hash_insert(&c->btree_cache, + new_hash, b->c.level, b->c.btree_id); + BUG_ON(ret); + } + + parent = btree_node_parent(iter->path, b); + if (parent) { + bch2_trans_copy_iter(&iter2, iter); + + iter2.path = bch2_btree_path_make_mut(trans, iter2.path, + iter2.flags & BTREE_ITER_INTENT, + _THIS_IP_); + + BUG_ON(iter2.path->level != b->c.level); + BUG_ON(!bpos_eq(iter2.path->pos, new_key->k.p)); + + btree_path_set_level_up(trans, iter2.path); + + trans->paths_sorted = false; + + ret = bch2_btree_iter_traverse(&iter2) ?: + bch2_trans_update(trans, &iter2, new_key, BTREE_TRIGGER_NORUN); + if (ret) + goto err; + } else { + BUG_ON(btree_node_root(c, b) != b); + + ret = darray_make_room(&trans->extra_journal_entries, + jset_u64s(new_key->k.u64s)); + if (ret) + return ret; + + journal_entry_set((void *) &darray_top(trans->extra_journal_entries), + BCH_JSET_ENTRY_btree_root, + b->c.btree_id, b->c.level, + new_key, new_key->k.u64s); + trans->extra_journal_entries.nr += jset_u64s(new_key->k.u64s); + } + + ret = bch2_trans_commit(trans, NULL, NULL, commit_flags); + if (ret) + goto err; + + bch2_btree_node_lock_write_nofail(trans, iter->path, &b->c); + + if (new_hash) { + mutex_lock(&c->btree_cache.lock); + bch2_btree_node_hash_remove(&c->btree_cache, new_hash); + bch2_btree_node_hash_remove(&c->btree_cache, b); + + bkey_copy(&b->key, new_key); + ret = __bch2_btree_node_hash_insert(&c->btree_cache, b); + BUG_ON(ret); + mutex_unlock(&c->btree_cache.lock); + } else { + bkey_copy(&b->key, new_key); + } + + bch2_btree_node_unlock_write(trans, iter->path, b); +out: + bch2_trans_iter_exit(trans, &iter2); + return ret; +err: + if (new_hash) { + mutex_lock(&c->btree_cache.lock); + bch2_btree_node_hash_remove(&c->btree_cache, b); + mutex_unlock(&c->btree_cache.lock); + } + goto out; +} + +int bch2_btree_node_update_key(struct btree_trans *trans, struct btree_iter *iter, + struct btree *b, struct bkey_i *new_key, + unsigned commit_flags, bool skip_triggers) +{ + struct bch_fs *c = trans->c; + struct btree *new_hash = NULL; + struct btree_path *path = iter->path; + struct closure cl; + int ret = 0; + + ret = bch2_btree_path_upgrade(trans, path, b->c.level + 1); + if (ret) + return ret; + + closure_init_stack(&cl); + + /* + * check btree_ptr_hash_val() after @b is locked by + * btree_iter_traverse(): + */ + if (btree_ptr_hash_val(new_key) != b->hash_val) { + ret = bch2_btree_cache_cannibalize_lock(c, &cl); + if (ret) { + ret = drop_locks_do(trans, (closure_sync(&cl), 0)); + if (ret) + return ret; + } + + new_hash = bch2_btree_node_mem_alloc(trans, false); + } + + path->intent_ref++; + ret = __bch2_btree_node_update_key(trans, iter, b, new_hash, new_key, + commit_flags, skip_triggers); + --path->intent_ref; + + if (new_hash) { + mutex_lock(&c->btree_cache.lock); + list_move(&new_hash->list, &c->btree_cache.freeable); + mutex_unlock(&c->btree_cache.lock); + + six_unlock_write(&new_hash->c.lock); + six_unlock_intent(&new_hash->c.lock); + } + closure_sync(&cl); + bch2_btree_cache_cannibalize_unlock(c); + return ret; +} + +int bch2_btree_node_update_key_get_iter(struct btree_trans *trans, + struct btree *b, struct bkey_i *new_key, + unsigned commit_flags, bool skip_triggers) +{ + struct btree_iter iter; + int ret; + + bch2_trans_node_iter_init(trans, &iter, b->c.btree_id, b->key.k.p, + BTREE_MAX_DEPTH, b->c.level, + BTREE_ITER_INTENT); + ret = bch2_btree_iter_traverse(&iter); + if (ret) + goto out; + + /* has node been freed? */ + if (iter.path->l[b->c.level].b != b) { + /* node has been freed: */ + BUG_ON(!btree_node_dying(b)); + goto out; + } + + BUG_ON(!btree_node_hashed(b)); + + struct bch_extent_ptr *ptr; + bch2_bkey_drop_ptrs(bkey_i_to_s(new_key), ptr, + !bch2_bkey_has_device(bkey_i_to_s(&b->key), ptr->dev)); + + ret = bch2_btree_node_update_key(trans, &iter, b, new_key, + commit_flags, skip_triggers); +out: + bch2_trans_iter_exit(trans, &iter); + return ret; +} + +/* Init code: */ + +/* + * Only for filesystem bringup, when first reading the btree roots or allocating + * btree roots when initializing a new filesystem: + */ +void bch2_btree_set_root_for_read(struct bch_fs *c, struct btree *b) +{ + BUG_ON(btree_node_root(c, b)); + + bch2_btree_set_root_inmem(c, b); +} + +static int __bch2_btree_root_alloc(struct btree_trans *trans, enum btree_id id) +{ + struct bch_fs *c = trans->c; + struct closure cl; + struct btree *b; + int ret; + + closure_init_stack(&cl); + + do { + ret = bch2_btree_cache_cannibalize_lock(c, &cl); + closure_sync(&cl); + } while (ret); + + b = bch2_btree_node_mem_alloc(trans, false); + bch2_btree_cache_cannibalize_unlock(c); + + set_btree_node_fake(b); + set_btree_node_need_rewrite(b); + b->c.level = 0; + b->c.btree_id = id; + + bkey_btree_ptr_init(&b->key); + b->key.k.p = SPOS_MAX; + *((u64 *) bkey_i_to_btree_ptr(&b->key)->v.start) = U64_MAX - id; + + bch2_bset_init_first(b, &b->data->keys); + bch2_btree_build_aux_trees(b); + + b->data->flags = 0; + btree_set_min(b, POS_MIN); + btree_set_max(b, SPOS_MAX); + b->data->format = bch2_btree_calc_format(b); + btree_node_set_format(b, b->data->format); + + ret = bch2_btree_node_hash_insert(&c->btree_cache, b, + b->c.level, b->c.btree_id); + BUG_ON(ret); + + bch2_btree_set_root_inmem(c, b); + + six_unlock_write(&b->c.lock); + six_unlock_intent(&b->c.lock); + return 0; +} + +void bch2_btree_root_alloc(struct bch_fs *c, enum btree_id id) +{ + bch2_trans_run(c, __bch2_btree_root_alloc(trans, id)); +} + +void bch2_btree_updates_to_text(struct printbuf *out, struct bch_fs *c) +{ + struct btree_update *as; + + mutex_lock(&c->btree_interior_update_lock); + list_for_each_entry(as, &c->btree_interior_update_list, list) + prt_printf(out, "%p m %u w %u r %u j %llu\n", + as, + as->mode, + as->nodes_written, + closure_nr_remaining(&as->cl), + as->journal.seq); + mutex_unlock(&c->btree_interior_update_lock); +} + +static bool bch2_btree_interior_updates_pending(struct bch_fs *c) +{ + bool ret; + + mutex_lock(&c->btree_interior_update_lock); + ret = !list_empty(&c->btree_interior_update_list); + mutex_unlock(&c->btree_interior_update_lock); + + return ret; +} + +bool bch2_btree_interior_updates_flush(struct bch_fs *c) +{ + bool ret = bch2_btree_interior_updates_pending(c); + + if (ret) + closure_wait_event(&c->btree_interior_update_wait, + !bch2_btree_interior_updates_pending(c)); + return ret; +} + +void bch2_journal_entry_to_btree_root(struct bch_fs *c, struct jset_entry *entry) +{ + struct btree_root *r = bch2_btree_id_root(c, entry->btree_id); + + mutex_lock(&c->btree_root_lock); + + r->level = entry->level; + r->alive = true; + bkey_copy(&r->key, (struct bkey_i *) entry->start); + + mutex_unlock(&c->btree_root_lock); +} + +struct jset_entry * +bch2_btree_roots_to_journal_entries(struct bch_fs *c, + struct jset_entry *end, + unsigned long skip) +{ + unsigned i; + + mutex_lock(&c->btree_root_lock); + + for (i = 0; i < btree_id_nr_alive(c); i++) { + struct btree_root *r = bch2_btree_id_root(c, i); + + if (r->alive && !test_bit(i, &skip)) { + journal_entry_set(end, BCH_JSET_ENTRY_btree_root, + i, r->level, &r->key, r->key.k.u64s); + end = vstruct_next(end); + } + } + + mutex_unlock(&c->btree_root_lock); + + return end; +} + +void bch2_fs_btree_interior_update_exit(struct bch_fs *c) +{ + if (c->btree_interior_update_worker) + destroy_workqueue(c->btree_interior_update_worker); + mempool_exit(&c->btree_interior_update_pool); +} + +void bch2_fs_btree_interior_update_init_early(struct bch_fs *c) +{ + mutex_init(&c->btree_reserve_cache_lock); + INIT_LIST_HEAD(&c->btree_interior_update_list); + INIT_LIST_HEAD(&c->btree_interior_updates_unwritten); + mutex_init(&c->btree_interior_update_lock); + INIT_WORK(&c->btree_interior_update_work, btree_interior_update_work); + + INIT_LIST_HEAD(&c->pending_node_rewrites); + mutex_init(&c->pending_node_rewrites_lock); +} + +int bch2_fs_btree_interior_update_init(struct bch_fs *c) +{ + c->btree_interior_update_worker = + alloc_workqueue("btree_update", WQ_UNBOUND|WQ_MEM_RECLAIM, 1); + if (!c->btree_interior_update_worker) + return -BCH_ERR_ENOMEM_btree_interior_update_worker_init; + + if (mempool_init_kmalloc_pool(&c->btree_interior_update_pool, 1, + sizeof(struct btree_update))) + return -BCH_ERR_ENOMEM_btree_interior_update_pool_init; + + return 0; +} diff --git a/fs/bcachefs/btree_update_interior.h b/fs/bcachefs/btree_update_interior.h new file mode 100644 index 000000000000..a6668992a272 --- /dev/null +++ b/fs/bcachefs/btree_update_interior.h @@ -0,0 +1,332 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_BTREE_UPDATE_INTERIOR_H +#define _BCACHEFS_BTREE_UPDATE_INTERIOR_H + +#include "btree_cache.h" +#include "btree_locking.h" +#include "btree_update.h" + +#define BTREE_UPDATE_NODES_MAX ((BTREE_MAX_DEPTH - 2) * 2 + GC_MERGE_NODES) + +#define BTREE_UPDATE_JOURNAL_RES (BTREE_UPDATE_NODES_MAX * (BKEY_BTREE_PTR_U64s_MAX + 1)) + +/* + * Tracks an in progress split/rewrite of a btree node and the update to the + * parent node: + * + * When we split/rewrite a node, we do all the updates in memory without + * waiting for any writes to complete - we allocate the new node(s) and update + * the parent node, possibly recursively up to the root. + * + * The end result is that we have one or more new nodes being written - + * possibly several, if there were multiple splits - and then a write (updating + * an interior node) which will make all these new nodes visible. + * + * Additionally, as we split/rewrite nodes we free the old nodes - but the old + * nodes can't be freed (their space on disk can't be reclaimed) until the + * update to the interior node that makes the new node visible completes - + * until then, the old nodes are still reachable on disk. + * + */ +struct btree_update { + struct closure cl; + struct bch_fs *c; + u64 start_time; + + struct list_head list; + struct list_head unwritten_list; + + /* What kind of update are we doing? */ + enum { + BTREE_INTERIOR_NO_UPDATE, + BTREE_INTERIOR_UPDATING_NODE, + BTREE_INTERIOR_UPDATING_ROOT, + BTREE_INTERIOR_UPDATING_AS, + } mode; + + unsigned nodes_written:1; + unsigned took_gc_lock:1; + + enum btree_id btree_id; + unsigned update_level; + + struct disk_reservation disk_res; + + /* + * BTREE_INTERIOR_UPDATING_NODE: + * The update that made the new nodes visible was a regular update to an + * existing interior node - @b. We can't write out the update to @b + * until the new nodes we created are finished writing, so we block @b + * from writing by putting this btree_interior update on the + * @b->write_blocked list with @write_blocked_list: + */ + struct btree *b; + struct list_head write_blocked_list; + + /* + * We may be freeing nodes that were dirty, and thus had journal entries + * pinned: we need to transfer the oldest of those pins to the + * btree_update operation, and release it when the new node(s) + * are all persistent and reachable: + */ + struct journal_entry_pin journal; + + /* Preallocated nodes we reserve when we start the update: */ + struct prealloc_nodes { + struct btree *b[BTREE_UPDATE_NODES_MAX]; + unsigned nr; + } prealloc_nodes[2]; + + /* Nodes being freed: */ + struct keylist old_keys; + u64 _old_keys[BTREE_UPDATE_NODES_MAX * + BKEY_BTREE_PTR_U64s_MAX]; + + /* Nodes being added: */ + struct keylist new_keys; + u64 _new_keys[BTREE_UPDATE_NODES_MAX * + BKEY_BTREE_PTR_U64s_MAX]; + + /* New nodes, that will be made reachable by this update: */ + struct btree *new_nodes[BTREE_UPDATE_NODES_MAX]; + unsigned nr_new_nodes; + + struct btree *old_nodes[BTREE_UPDATE_NODES_MAX]; + __le64 old_nodes_seq[BTREE_UPDATE_NODES_MAX]; + unsigned nr_old_nodes; + + open_bucket_idx_t open_buckets[BTREE_UPDATE_NODES_MAX * + BCH_REPLICAS_MAX]; + open_bucket_idx_t nr_open_buckets; + + unsigned journal_u64s; + u64 journal_entries[BTREE_UPDATE_JOURNAL_RES]; + + /* Only here to reduce stack usage on recursive splits: */ + struct keylist parent_keys; + /* + * Enough room for btree_split's keys without realloc - btree node + * pointers never have crc/compression info, so we only need to acount + * for the pointers for three keys + */ + u64 inline_keys[BKEY_BTREE_PTR_U64s_MAX * 3]; +}; + +struct btree *__bch2_btree_node_alloc_replacement(struct btree_update *, + struct btree_trans *, + struct btree *, + struct bkey_format); + +int bch2_btree_split_leaf(struct btree_trans *, struct btree_path *, unsigned); + +int __bch2_foreground_maybe_merge(struct btree_trans *, struct btree_path *, + unsigned, unsigned, enum btree_node_sibling); + +static inline int bch2_foreground_maybe_merge_sibling(struct btree_trans *trans, + struct btree_path *path, + unsigned level, unsigned flags, + enum btree_node_sibling sib) +{ + struct btree *b; + + EBUG_ON(!btree_node_locked(path, level)); + + b = path->l[level].b; + if (b->sib_u64s[sib] > trans->c->btree_foreground_merge_threshold) + return 0; + + return __bch2_foreground_maybe_merge(trans, path, level, flags, sib); +} + +static inline int bch2_foreground_maybe_merge(struct btree_trans *trans, + struct btree_path *path, + unsigned level, + unsigned flags) +{ + return bch2_foreground_maybe_merge_sibling(trans, path, level, flags, + btree_prev_sib) ?: + bch2_foreground_maybe_merge_sibling(trans, path, level, flags, + btree_next_sib); +} + +int bch2_btree_node_rewrite(struct btree_trans *, struct btree_iter *, + struct btree *, unsigned); +void bch2_btree_node_rewrite_async(struct bch_fs *, struct btree *); +int bch2_btree_node_update_key(struct btree_trans *, struct btree_iter *, + struct btree *, struct bkey_i *, + unsigned, bool); +int bch2_btree_node_update_key_get_iter(struct btree_trans *, struct btree *, + struct bkey_i *, unsigned, bool); + +void bch2_btree_set_root_for_read(struct bch_fs *, struct btree *); +void bch2_btree_root_alloc(struct bch_fs *, enum btree_id); + +static inline unsigned btree_update_reserve_required(struct bch_fs *c, + struct btree *b) +{ + unsigned depth = btree_node_root(c, b)->c.level + 1; + + /* + * Number of nodes we might have to allocate in a worst case btree + * split operation - we split all the way up to the root, then allocate + * a new root, unless we're already at max depth: + */ + if (depth < BTREE_MAX_DEPTH) + return (depth - b->c.level) * 2 + 1; + else + return (depth - b->c.level) * 2 - 1; +} + +static inline void btree_node_reset_sib_u64s(struct btree *b) +{ + b->sib_u64s[0] = b->nr.live_u64s; + b->sib_u64s[1] = b->nr.live_u64s; +} + +static inline void *btree_data_end(struct bch_fs *c, struct btree *b) +{ + return (void *) b->data + btree_bytes(c); +} + +static inline struct bkey_packed *unwritten_whiteouts_start(struct bch_fs *c, + struct btree *b) +{ + return (void *) ((u64 *) btree_data_end(c, b) - b->whiteout_u64s); +} + +static inline struct bkey_packed *unwritten_whiteouts_end(struct bch_fs *c, + struct btree *b) +{ + return btree_data_end(c, b); +} + +static inline void *write_block(struct btree *b) +{ + return (void *) b->data + (b->written << 9); +} + +static inline bool __btree_addr_written(struct btree *b, void *p) +{ + return p < write_block(b); +} + +static inline bool bset_written(struct btree *b, struct bset *i) +{ + return __btree_addr_written(b, i); +} + +static inline bool bkey_written(struct btree *b, struct bkey_packed *k) +{ + return __btree_addr_written(b, k); +} + +static inline ssize_t __bch_btree_u64s_remaining(struct bch_fs *c, + struct btree *b, + void *end) +{ + ssize_t used = bset_byte_offset(b, end) / sizeof(u64) + + b->whiteout_u64s; + ssize_t total = c->opts.btree_node_size >> 3; + + /* Always leave one extra u64 for bch2_varint_decode: */ + used++; + + return total - used; +} + +static inline size_t bch_btree_keys_u64s_remaining(struct bch_fs *c, + struct btree *b) +{ + ssize_t remaining = __bch_btree_u64s_remaining(c, b, + btree_bkey_last(b, bset_tree_last(b))); + + BUG_ON(remaining < 0); + + if (bset_written(b, btree_bset_last(b))) + return 0; + + return remaining; +} + +#define BTREE_WRITE_SET_U64s_BITS 9 + +static inline unsigned btree_write_set_buffer(struct btree *b) +{ + /* + * Could buffer up larger amounts of keys for btrees with larger keys, + * pending benchmarking: + */ + return 8 << BTREE_WRITE_SET_U64s_BITS; +} + +static inline struct btree_node_entry *want_new_bset(struct bch_fs *c, + struct btree *b) +{ + struct bset_tree *t = bset_tree_last(b); + struct btree_node_entry *bne = max(write_block(b), + (void *) btree_bkey_last(b, bset_tree_last(b))); + ssize_t remaining_space = + __bch_btree_u64s_remaining(c, b, bne->keys.start); + + if (unlikely(bset_written(b, bset(b, t)))) { + if (remaining_space > (ssize_t) (block_bytes(c) >> 3)) + return bne; + } else { + if (unlikely(bset_u64s(t) * sizeof(u64) > btree_write_set_buffer(b)) && + remaining_space > (ssize_t) (btree_write_set_buffer(b) >> 3)) + return bne; + } + + return NULL; +} + +static inline void push_whiteout(struct bch_fs *c, struct btree *b, + struct bpos pos) +{ + struct bkey_packed k; + + BUG_ON(bch_btree_keys_u64s_remaining(c, b) < BKEY_U64s); + EBUG_ON(btree_node_just_written(b)); + + if (!bkey_pack_pos(&k, pos, b)) { + struct bkey *u = (void *) &k; + + bkey_init(u); + u->p = pos; + } + + k.needs_whiteout = true; + + b->whiteout_u64s += k.u64s; + bkey_p_copy(unwritten_whiteouts_start(c, b), &k); +} + +/* + * write lock must be held on @b (else the dirty bset that we were going to + * insert into could be written out from under us) + */ +static inline bool bch2_btree_node_insert_fits(struct bch_fs *c, + struct btree *b, unsigned u64s) +{ + if (unlikely(btree_node_need_rewrite(b))) + return false; + + return u64s <= bch_btree_keys_u64s_remaining(c, b); +} + +void bch2_btree_updates_to_text(struct printbuf *, struct bch_fs *); + +bool bch2_btree_interior_updates_flush(struct bch_fs *); + +void bch2_journal_entry_to_btree_root(struct bch_fs *, struct jset_entry *); +struct jset_entry *bch2_btree_roots_to_journal_entries(struct bch_fs *, + struct jset_entry *, unsigned long); + +void bch2_do_pending_node_rewrites(struct bch_fs *); +void bch2_free_pending_node_rewrites(struct bch_fs *); + +void bch2_fs_btree_interior_update_exit(struct bch_fs *); +void bch2_fs_btree_interior_update_init_early(struct bch_fs *); +int bch2_fs_btree_interior_update_init(struct bch_fs *); + +#endif /* _BCACHEFS_BTREE_UPDATE_INTERIOR_H */ diff --git a/fs/bcachefs/btree_write_buffer.c b/fs/bcachefs/btree_write_buffer.c new file mode 100644 index 000000000000..4e6241db518b --- /dev/null +++ b/fs/bcachefs/btree_write_buffer.c @@ -0,0 +1,375 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" +#include "btree_locking.h" +#include "btree_update.h" +#include "btree_update_interior.h" +#include "btree_write_buffer.h" +#include "error.h" +#include "journal.h" +#include "journal_reclaim.h" + +#include <linux/sort.h> + +static int btree_write_buffered_key_cmp(const void *_l, const void *_r) +{ + const struct btree_write_buffered_key *l = _l; + const struct btree_write_buffered_key *r = _r; + + return cmp_int(l->btree, r->btree) ?: + bpos_cmp(l->k.k.p, r->k.k.p) ?: + cmp_int(l->journal_seq, r->journal_seq) ?: + cmp_int(l->journal_offset, r->journal_offset); +} + +static int btree_write_buffered_journal_cmp(const void *_l, const void *_r) +{ + const struct btree_write_buffered_key *l = _l; + const struct btree_write_buffered_key *r = _r; + + return cmp_int(l->journal_seq, r->journal_seq); +} + +static int bch2_btree_write_buffer_flush_one(struct btree_trans *trans, + struct btree_iter *iter, + struct btree_write_buffered_key *wb, + unsigned commit_flags, + bool *write_locked, + size_t *fast) +{ + struct bch_fs *c = trans->c; + struct btree_path *path; + int ret; + + ret = bch2_btree_iter_traverse(iter); + if (ret) + return ret; + + path = iter->path; + + if (!*write_locked) { + ret = bch2_btree_node_lock_write(trans, path, &path->l[0].b->c); + if (ret) + return ret; + + bch2_btree_node_prep_for_write(trans, path, path->l[0].b); + *write_locked = true; + } + + if (!bch2_btree_node_insert_fits(c, path->l[0].b, wb->k.k.u64s)) { + bch2_btree_node_unlock_write(trans, path, path->l[0].b); + *write_locked = false; + goto trans_commit; + } + + bch2_btree_insert_key_leaf(trans, path, &wb->k, wb->journal_seq); + (*fast)++; + + if (path->ref > 1) { + /* + * We can't clone a path that has write locks: if the path is + * shared, unlock before set_pos(), traverse(): + */ + bch2_btree_node_unlock_write(trans, path, path->l[0].b); + *write_locked = false; + } + return 0; +trans_commit: + return bch2_trans_update_seq(trans, wb->journal_seq, iter, &wb->k, + BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?: + bch2_trans_commit(trans, NULL, NULL, + commit_flags| + BTREE_INSERT_NOCHECK_RW| + BTREE_INSERT_NOFAIL| + BTREE_INSERT_JOURNAL_RECLAIM); +} + +static union btree_write_buffer_state btree_write_buffer_switch(struct btree_write_buffer *wb) +{ + union btree_write_buffer_state old, new; + u64 v = READ_ONCE(wb->state.v); + + do { + old.v = new.v = v; + + new.nr = 0; + new.idx++; + } while ((v = atomic64_cmpxchg_acquire(&wb->state.counter, old.v, new.v)) != old.v); + + while (old.idx == 0 ? wb->state.ref0 : wb->state.ref1) + cpu_relax(); + + smp_mb(); + + return old; +} + +/* + * Update a btree with a write buffered key using the journal seq of the + * original write buffer insert. + * + * It is not safe to rejournal the key once it has been inserted into the write + * buffer because that may break recovery ordering. For example, the key may + * have already been modified in the active write buffer in a seq that comes + * before the current transaction. If we were to journal this key again and + * crash, recovery would process updates in the wrong order. + */ +static int +btree_write_buffered_insert(struct btree_trans *trans, + struct btree_write_buffered_key *wb) +{ + struct btree_iter iter; + int ret; + + bch2_trans_iter_init(trans, &iter, wb->btree, bkey_start_pos(&wb->k.k), + BTREE_ITER_CACHED|BTREE_ITER_INTENT); + + ret = bch2_btree_iter_traverse(&iter) ?: + bch2_trans_update_seq(trans, wb->journal_seq, &iter, &wb->k, + BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); + bch2_trans_iter_exit(trans, &iter); + return ret; +} + +int __bch2_btree_write_buffer_flush(struct btree_trans *trans, unsigned commit_flags, + bool locked) +{ + struct bch_fs *c = trans->c; + struct journal *j = &c->journal; + struct btree_write_buffer *wb = &c->btree_write_buffer; + struct journal_entry_pin pin; + struct btree_write_buffered_key *i, *keys; + struct btree_iter iter = { NULL }; + size_t nr = 0, skipped = 0, fast = 0, slowpath = 0; + bool write_locked = false; + union btree_write_buffer_state s; + int ret = 0; + + memset(&pin, 0, sizeof(pin)); + + if (!locked && !mutex_trylock(&wb->flush_lock)) + return 0; + + bch2_journal_pin_copy(j, &pin, &wb->journal_pin, NULL); + bch2_journal_pin_drop(j, &wb->journal_pin); + + s = btree_write_buffer_switch(wb); + keys = wb->keys[s.idx]; + nr = s.nr; + + if (race_fault()) + goto slowpath; + + /* + * We first sort so that we can detect and skip redundant updates, and + * then we attempt to flush in sorted btree order, as this is most + * efficient. + * + * However, since we're not flushing in the order they appear in the + * journal we won't be able to drop our journal pin until everything is + * flushed - which means this could deadlock the journal if we weren't + * passing BTREE_INSERT_JOURNAL_RECLAIM. This causes the update to fail + * if it would block taking a journal reservation. + * + * If that happens, simply skip the key so we can optimistically insert + * as many keys as possible in the fast path. + */ + sort(keys, nr, sizeof(keys[0]), + btree_write_buffered_key_cmp, NULL); + + for (i = keys; i < keys + nr; i++) { + if (i + 1 < keys + nr && + i[0].btree == i[1].btree && + bpos_eq(i[0].k.k.p, i[1].k.k.p)) { + skipped++; + i->journal_seq = 0; + continue; + } + + if (write_locked && + (iter.path->btree_id != i->btree || + bpos_gt(i->k.k.p, iter.path->l[0].b->key.k.p))) { + bch2_btree_node_unlock_write(trans, iter.path, iter.path->l[0].b); + write_locked = false; + } + + if (!iter.path || iter.path->btree_id != i->btree) { + bch2_trans_iter_exit(trans, &iter); + bch2_trans_iter_init(trans, &iter, i->btree, i->k.k.p, + BTREE_ITER_INTENT|BTREE_ITER_ALL_SNAPSHOTS); + } + + bch2_btree_iter_set_pos(&iter, i->k.k.p); + iter.path->preserve = false; + + do { + ret = bch2_btree_write_buffer_flush_one(trans, &iter, i, + commit_flags, &write_locked, &fast); + if (!write_locked) + bch2_trans_begin(trans); + } while (bch2_err_matches(ret, BCH_ERR_transaction_restart)); + + if (ret == -BCH_ERR_journal_reclaim_would_deadlock) { + slowpath++; + continue; + } + if (ret) + break; + + i->journal_seq = 0; + } + + if (write_locked) + bch2_btree_node_unlock_write(trans, iter.path, iter.path->l[0].b); + bch2_trans_iter_exit(trans, &iter); + + trace_write_buffer_flush(trans, nr, skipped, fast, wb->size); + + if (slowpath) + goto slowpath; + + bch2_fs_fatal_err_on(ret, c, "%s: insert error %s", __func__, bch2_err_str(ret)); +out: + bch2_journal_pin_drop(j, &pin); + mutex_unlock(&wb->flush_lock); + return ret; +slowpath: + trace_write_buffer_flush_slowpath(trans, i - keys, nr); + + /* + * Now sort the rest by journal seq and bump the journal pin as we go. + * The slowpath zapped the seq of keys that were successfully flushed so + * we can skip those here. + */ + sort(keys, nr, sizeof(keys[0]), + btree_write_buffered_journal_cmp, + NULL); + + commit_flags &= ~BCH_WATERMARK_MASK; + commit_flags |= BCH_WATERMARK_reclaim; + + for (i = keys; i < keys + nr; i++) { + if (!i->journal_seq) + continue; + + if (i->journal_seq > pin.seq) { + struct journal_entry_pin pin2; + + memset(&pin2, 0, sizeof(pin2)); + + bch2_journal_pin_add(j, i->journal_seq, &pin2, NULL); + bch2_journal_pin_drop(j, &pin); + bch2_journal_pin_copy(j, &pin, &pin2, NULL); + bch2_journal_pin_drop(j, &pin2); + } + + ret = commit_do(trans, NULL, NULL, + commit_flags| + BTREE_INSERT_NOFAIL| + BTREE_INSERT_JOURNAL_RECLAIM, + btree_write_buffered_insert(trans, i)); + if (bch2_fs_fatal_err_on(ret, c, "%s: insert error %s", __func__, bch2_err_str(ret))) + break; + } + + goto out; +} + +int bch2_btree_write_buffer_flush_sync(struct btree_trans *trans) +{ + bch2_trans_unlock(trans); + mutex_lock(&trans->c->btree_write_buffer.flush_lock); + return __bch2_btree_write_buffer_flush(trans, 0, true); +} + +int bch2_btree_write_buffer_flush(struct btree_trans *trans) +{ + return __bch2_btree_write_buffer_flush(trans, 0, false); +} + +static int bch2_btree_write_buffer_journal_flush(struct journal *j, + struct journal_entry_pin *_pin, u64 seq) +{ + struct bch_fs *c = container_of(j, struct bch_fs, journal); + struct btree_write_buffer *wb = &c->btree_write_buffer; + + mutex_lock(&wb->flush_lock); + + return bch2_trans_run(c, + __bch2_btree_write_buffer_flush(trans, BTREE_INSERT_NOCHECK_RW, true)); +} + +static inline u64 btree_write_buffer_ref(int idx) +{ + return ((union btree_write_buffer_state) { + .ref0 = idx == 0, + .ref1 = idx == 1, + }).v; +} + +int bch2_btree_insert_keys_write_buffer(struct btree_trans *trans) +{ + struct bch_fs *c = trans->c; + struct btree_write_buffer *wb = &c->btree_write_buffer; + struct btree_write_buffered_key *i; + union btree_write_buffer_state old, new; + int ret = 0; + u64 v; + + trans_for_each_wb_update(trans, i) { + EBUG_ON(i->k.k.u64s > BTREE_WRITE_BUFERED_U64s_MAX); + + i->journal_seq = trans->journal_res.seq; + i->journal_offset = trans->journal_res.offset; + } + + preempt_disable(); + v = READ_ONCE(wb->state.v); + do { + old.v = new.v = v; + + new.v += btree_write_buffer_ref(new.idx); + new.nr += trans->nr_wb_updates; + if (new.nr > wb->size) { + ret = -BCH_ERR_btree_insert_need_flush_buffer; + goto out; + } + } while ((v = atomic64_cmpxchg_acquire(&wb->state.counter, old.v, new.v)) != old.v); + + memcpy(wb->keys[new.idx] + old.nr, + trans->wb_updates, + sizeof(trans->wb_updates[0]) * trans->nr_wb_updates); + + bch2_journal_pin_add(&c->journal, trans->journal_res.seq, &wb->journal_pin, + bch2_btree_write_buffer_journal_flush); + + atomic64_sub_return_release(btree_write_buffer_ref(new.idx), &wb->state.counter); +out: + preempt_enable(); + return ret; +} + +void bch2_fs_btree_write_buffer_exit(struct bch_fs *c) +{ + struct btree_write_buffer *wb = &c->btree_write_buffer; + + BUG_ON(wb->state.nr && !bch2_journal_error(&c->journal)); + + kvfree(wb->keys[1]); + kvfree(wb->keys[0]); +} + +int bch2_fs_btree_write_buffer_init(struct bch_fs *c) +{ + struct btree_write_buffer *wb = &c->btree_write_buffer; + + mutex_init(&wb->flush_lock); + wb->size = c->opts.btree_write_buffer_size; + + wb->keys[0] = kvmalloc_array(wb->size, sizeof(*wb->keys[0]), GFP_KERNEL); + wb->keys[1] = kvmalloc_array(wb->size, sizeof(*wb->keys[1]), GFP_KERNEL); + if (!wb->keys[0] || !wb->keys[1]) + return -BCH_ERR_ENOMEM_fs_btree_write_buffer_init; + + return 0; +} diff --git a/fs/bcachefs/btree_write_buffer.h b/fs/bcachefs/btree_write_buffer.h new file mode 100644 index 000000000000..322df1c8304e --- /dev/null +++ b/fs/bcachefs/btree_write_buffer.h @@ -0,0 +1,14 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_BTREE_WRITE_BUFFER_H +#define _BCACHEFS_BTREE_WRITE_BUFFER_H + +int __bch2_btree_write_buffer_flush(struct btree_trans *, unsigned, bool); +int bch2_btree_write_buffer_flush_sync(struct btree_trans *); +int bch2_btree_write_buffer_flush(struct btree_trans *); + +int bch2_btree_insert_keys_write_buffer(struct btree_trans *); + +void bch2_fs_btree_write_buffer_exit(struct bch_fs *); +int bch2_fs_btree_write_buffer_init(struct bch_fs *); + +#endif /* _BCACHEFS_BTREE_WRITE_BUFFER_H */ diff --git a/fs/bcachefs/btree_write_buffer_types.h b/fs/bcachefs/btree_write_buffer_types.h new file mode 100644 index 000000000000..99993ba77aea --- /dev/null +++ b/fs/bcachefs/btree_write_buffer_types.h @@ -0,0 +1,44 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_BTREE_WRITE_BUFFER_TYPES_H +#define _BCACHEFS_BTREE_WRITE_BUFFER_TYPES_H + +#include "journal_types.h" + +#define BTREE_WRITE_BUFERED_VAL_U64s_MAX 4 +#define BTREE_WRITE_BUFERED_U64s_MAX (BKEY_U64s + BTREE_WRITE_BUFERED_VAL_U64s_MAX) + +struct btree_write_buffered_key { + u64 journal_seq; + unsigned journal_offset; + enum btree_id btree; + __BKEY_PADDED(k, BTREE_WRITE_BUFERED_VAL_U64s_MAX); +}; + +union btree_write_buffer_state { + struct { + atomic64_t counter; + }; + + struct { + u64 v; + }; + + struct { + u64 nr:23; + u64 idx:1; + u64 ref0:20; + u64 ref1:20; + }; +}; + +struct btree_write_buffer { + struct mutex flush_lock; + struct journal_entry_pin journal_pin; + + union btree_write_buffer_state state; + size_t size; + + struct btree_write_buffered_key *keys[2]; +}; + +#endif /* _BCACHEFS_BTREE_WRITE_BUFFER_TYPES_H */ diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c new file mode 100644 index 000000000000..5a91d3189fcf --- /dev/null +++ b/fs/bcachefs/buckets.c @@ -0,0 +1,2170 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Code for manipulating bucket marks for garbage collection. + * + * Copyright 2014 Datera, Inc. + */ + +#include "bcachefs.h" +#include "alloc_background.h" +#include "backpointers.h" +#include "bset.h" +#include "btree_gc.h" +#include "btree_update.h" +#include "buckets.h" +#include "buckets_waiting_for_journal.h" +#include "ec.h" +#include "error.h" +#include "inode.h" +#include "movinggc.h" +#include "recovery.h" +#include "reflink.h" +#include "replicas.h" +#include "subvolume.h" +#include "trace.h" + +#include <linux/preempt.h> + +static inline void fs_usage_data_type_to_base(struct bch_fs_usage *fs_usage, + enum bch_data_type data_type, + s64 sectors) +{ + switch (data_type) { + case BCH_DATA_btree: + fs_usage->btree += sectors; + break; + case BCH_DATA_user: + case BCH_DATA_parity: + fs_usage->data += sectors; + break; + case BCH_DATA_cached: + fs_usage->cached += sectors; + break; + default: + break; + } +} + +void bch2_fs_usage_initialize(struct bch_fs *c) +{ + struct bch_fs_usage *usage; + struct bch_dev *ca; + unsigned i; + + percpu_down_write(&c->mark_lock); + usage = c->usage_base; + + for (i = 0; i < ARRAY_SIZE(c->usage); i++) + bch2_fs_usage_acc_to_base(c, i); + + for (i = 0; i < BCH_REPLICAS_MAX; i++) + usage->reserved += usage->persistent_reserved[i]; + + for (i = 0; i < c->replicas.nr; i++) { + struct bch_replicas_entry *e = + cpu_replicas_entry(&c->replicas, i); + + fs_usage_data_type_to_base(usage, e->data_type, usage->replicas[i]); + } + + for_each_member_device(ca, c, i) { + struct bch_dev_usage dev = bch2_dev_usage_read(ca); + + usage->hidden += (dev.d[BCH_DATA_sb].buckets + + dev.d[BCH_DATA_journal].buckets) * + ca->mi.bucket_size; + } + + percpu_up_write(&c->mark_lock); +} + +static inline struct bch_dev_usage *dev_usage_ptr(struct bch_dev *ca, + unsigned journal_seq, + bool gc) +{ + BUG_ON(!gc && !journal_seq); + + return this_cpu_ptr(gc + ? ca->usage_gc + : ca->usage[journal_seq & JOURNAL_BUF_MASK]); +} + +void bch2_dev_usage_read_fast(struct bch_dev *ca, struct bch_dev_usage *usage) +{ + struct bch_fs *c = ca->fs; + unsigned seq, i, u64s = dev_usage_u64s(); + + do { + seq = read_seqcount_begin(&c->usage_lock); + memcpy(usage, ca->usage_base, u64s * sizeof(u64)); + for (i = 0; i < ARRAY_SIZE(ca->usage); i++) + acc_u64s_percpu((u64 *) usage, (u64 __percpu *) ca->usage[i], u64s); + } while (read_seqcount_retry(&c->usage_lock, seq)); +} + +u64 bch2_fs_usage_read_one(struct bch_fs *c, u64 *v) +{ + ssize_t offset = v - (u64 *) c->usage_base; + unsigned i, seq; + u64 ret; + + BUG_ON(offset < 0 || offset >= fs_usage_u64s(c)); + percpu_rwsem_assert_held(&c->mark_lock); + + do { + seq = read_seqcount_begin(&c->usage_lock); + ret = *v; + + for (i = 0; i < ARRAY_SIZE(c->usage); i++) + ret += percpu_u64_get((u64 __percpu *) c->usage[i] + offset); + } while (read_seqcount_retry(&c->usage_lock, seq)); + + return ret; +} + +struct bch_fs_usage_online *bch2_fs_usage_read(struct bch_fs *c) +{ + struct bch_fs_usage_online *ret; + unsigned nr_replicas = READ_ONCE(c->replicas.nr); + unsigned seq, i; +retry: + ret = kmalloc(__fs_usage_online_u64s(nr_replicas) * sizeof(u64), GFP_KERNEL); + if (unlikely(!ret)) + return NULL; + + percpu_down_read(&c->mark_lock); + + if (nr_replicas != c->replicas.nr) { + nr_replicas = c->replicas.nr; + percpu_up_read(&c->mark_lock); + kfree(ret); + goto retry; + } + + ret->online_reserved = percpu_u64_get(c->online_reserved); + + do { + seq = read_seqcount_begin(&c->usage_lock); + unsafe_memcpy(&ret->u, c->usage_base, + __fs_usage_u64s(nr_replicas) * sizeof(u64), + "embedded variable length struct"); + for (i = 0; i < ARRAY_SIZE(c->usage); i++) + acc_u64s_percpu((u64 *) &ret->u, (u64 __percpu *) c->usage[i], + __fs_usage_u64s(nr_replicas)); + } while (read_seqcount_retry(&c->usage_lock, seq)); + + return ret; +} + +void bch2_fs_usage_acc_to_base(struct bch_fs *c, unsigned idx) +{ + struct bch_dev *ca; + unsigned i, u64s = fs_usage_u64s(c); + + BUG_ON(idx >= ARRAY_SIZE(c->usage)); + + preempt_disable(); + write_seqcount_begin(&c->usage_lock); + + acc_u64s_percpu((u64 *) c->usage_base, + (u64 __percpu *) c->usage[idx], u64s); + percpu_memset(c->usage[idx], 0, u64s * sizeof(u64)); + + rcu_read_lock(); + for_each_member_device_rcu(ca, c, i, NULL) { + u64s = dev_usage_u64s(); + + acc_u64s_percpu((u64 *) ca->usage_base, + (u64 __percpu *) ca->usage[idx], u64s); + percpu_memset(ca->usage[idx], 0, u64s * sizeof(u64)); + } + rcu_read_unlock(); + + write_seqcount_end(&c->usage_lock); + preempt_enable(); +} + +void bch2_fs_usage_to_text(struct printbuf *out, + struct bch_fs *c, + struct bch_fs_usage_online *fs_usage) +{ + unsigned i; + + prt_printf(out, "capacity:\t\t\t%llu\n", c->capacity); + + prt_printf(out, "hidden:\t\t\t\t%llu\n", + fs_usage->u.hidden); + prt_printf(out, "data:\t\t\t\t%llu\n", + fs_usage->u.data); + prt_printf(out, "cached:\t\t\t\t%llu\n", + fs_usage->u.cached); + prt_printf(out, "reserved:\t\t\t%llu\n", + fs_usage->u.reserved); + prt_printf(out, "nr_inodes:\t\t\t%llu\n", + fs_usage->u.nr_inodes); + prt_printf(out, "online reserved:\t\t%llu\n", + fs_usage->online_reserved); + + for (i = 0; + i < ARRAY_SIZE(fs_usage->u.persistent_reserved); + i++) { + prt_printf(out, "%u replicas:\n", i + 1); + prt_printf(out, "\treserved:\t\t%llu\n", + fs_usage->u.persistent_reserved[i]); + } + + for (i = 0; i < c->replicas.nr; i++) { + struct bch_replicas_entry *e = + cpu_replicas_entry(&c->replicas, i); + + prt_printf(out, "\t"); + bch2_replicas_entry_to_text(out, e); + prt_printf(out, ":\t%llu\n", fs_usage->u.replicas[i]); + } +} + +static u64 reserve_factor(u64 r) +{ + return r + (round_up(r, (1 << RESERVE_FACTOR)) >> RESERVE_FACTOR); +} + +u64 bch2_fs_sectors_used(struct bch_fs *c, struct bch_fs_usage_online *fs_usage) +{ + return min(fs_usage->u.hidden + + fs_usage->u.btree + + fs_usage->u.data + + reserve_factor(fs_usage->u.reserved + + fs_usage->online_reserved), + c->capacity); +} + +static struct bch_fs_usage_short +__bch2_fs_usage_read_short(struct bch_fs *c) +{ + struct bch_fs_usage_short ret; + u64 data, reserved; + + ret.capacity = c->capacity - + bch2_fs_usage_read_one(c, &c->usage_base->hidden); + + data = bch2_fs_usage_read_one(c, &c->usage_base->data) + + bch2_fs_usage_read_one(c, &c->usage_base->btree); + reserved = bch2_fs_usage_read_one(c, &c->usage_base->reserved) + + percpu_u64_get(c->online_reserved); + + ret.used = min(ret.capacity, data + reserve_factor(reserved)); + ret.free = ret.capacity - ret.used; + + ret.nr_inodes = bch2_fs_usage_read_one(c, &c->usage_base->nr_inodes); + + return ret; +} + +struct bch_fs_usage_short +bch2_fs_usage_read_short(struct bch_fs *c) +{ + struct bch_fs_usage_short ret; + + percpu_down_read(&c->mark_lock); + ret = __bch2_fs_usage_read_short(c); + percpu_up_read(&c->mark_lock); + + return ret; +} + +void bch2_dev_usage_init(struct bch_dev *ca) +{ + ca->usage_base->d[BCH_DATA_free].buckets = ca->mi.nbuckets - ca->mi.first_bucket; +} + +static inline int bucket_sectors_fragmented(struct bch_dev *ca, + struct bch_alloc_v4 a) +{ + return a.dirty_sectors + ? max(0, (int) ca->mi.bucket_size - (int) a.dirty_sectors) + : 0; +} + +static void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca, + struct bch_alloc_v4 old, + struct bch_alloc_v4 new, + u64 journal_seq, bool gc) +{ + struct bch_fs_usage *fs_usage; + struct bch_dev_usage *u; + + preempt_disable(); + fs_usage = fs_usage_ptr(c, journal_seq, gc); + + if (data_type_is_hidden(old.data_type)) + fs_usage->hidden -= ca->mi.bucket_size; + if (data_type_is_hidden(new.data_type)) + fs_usage->hidden += ca->mi.bucket_size; + + u = dev_usage_ptr(ca, journal_seq, gc); + + u->d[old.data_type].buckets--; + u->d[new.data_type].buckets++; + + u->buckets_ec -= (int) !!old.stripe; + u->buckets_ec += (int) !!new.stripe; + + u->d[old.data_type].sectors -= old.dirty_sectors; + u->d[new.data_type].sectors += new.dirty_sectors; + + u->d[BCH_DATA_cached].sectors += new.cached_sectors; + u->d[BCH_DATA_cached].sectors -= old.cached_sectors; + + u->d[old.data_type].fragmented -= bucket_sectors_fragmented(ca, old); + u->d[new.data_type].fragmented += bucket_sectors_fragmented(ca, new); + + preempt_enable(); +} + +static void bch2_dev_usage_update_m(struct bch_fs *c, struct bch_dev *ca, + struct bucket old, struct bucket new, + u64 journal_seq, bool gc) +{ + struct bch_alloc_v4 old_a = { + .gen = old.gen, + .data_type = old.data_type, + .dirty_sectors = old.dirty_sectors, + .cached_sectors = old.cached_sectors, + .stripe = old.stripe, + }; + struct bch_alloc_v4 new_a = { + .gen = new.gen, + .data_type = new.data_type, + .dirty_sectors = new.dirty_sectors, + .cached_sectors = new.cached_sectors, + .stripe = new.stripe, + }; + + bch2_dev_usage_update(c, ca, old_a, new_a, journal_seq, gc); +} + +static inline int __update_replicas(struct bch_fs *c, + struct bch_fs_usage *fs_usage, + struct bch_replicas_entry *r, + s64 sectors) +{ + int idx = bch2_replicas_entry_idx(c, r); + + if (idx < 0) + return -1; + + fs_usage_data_type_to_base(fs_usage, r->data_type, sectors); + fs_usage->replicas[idx] += sectors; + return 0; +} + +static inline int update_replicas(struct bch_fs *c, struct bkey_s_c k, + struct bch_replicas_entry *r, s64 sectors, + unsigned journal_seq, bool gc) +{ + struct bch_fs_usage *fs_usage; + int idx, ret = 0; + struct printbuf buf = PRINTBUF; + + percpu_down_read(&c->mark_lock); + + idx = bch2_replicas_entry_idx(c, r); + if (idx < 0 && + fsck_err(c, ptr_to_missing_replicas_entry, + "no replicas entry\n while marking %s", + (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { + percpu_up_read(&c->mark_lock); + ret = bch2_mark_replicas(c, r); + percpu_down_read(&c->mark_lock); + + if (ret) + goto err; + idx = bch2_replicas_entry_idx(c, r); + } + if (idx < 0) { + ret = -1; + goto err; + } + + preempt_disable(); + fs_usage = fs_usage_ptr(c, journal_seq, gc); + fs_usage_data_type_to_base(fs_usage, r->data_type, sectors); + fs_usage->replicas[idx] += sectors; + preempt_enable(); +err: +fsck_err: + percpu_up_read(&c->mark_lock); + printbuf_exit(&buf); + return ret; +} + +static inline int update_cached_sectors(struct bch_fs *c, + struct bkey_s_c k, + unsigned dev, s64 sectors, + unsigned journal_seq, bool gc) +{ + struct bch_replicas_padded r; + + bch2_replicas_entry_cached(&r.e, dev); + + return update_replicas(c, k, &r.e, sectors, journal_seq, gc); +} + +static int __replicas_deltas_realloc(struct btree_trans *trans, unsigned more, + gfp_t gfp) +{ + struct replicas_delta_list *d = trans->fs_usage_deltas; + unsigned new_size = d ? (d->size + more) * 2 : 128; + unsigned alloc_size = sizeof(*d) + new_size; + + WARN_ON_ONCE(alloc_size > REPLICAS_DELTA_LIST_MAX); + + if (!d || d->used + more > d->size) { + d = krealloc(d, alloc_size, gfp|__GFP_ZERO); + + if (unlikely(!d)) { + if (alloc_size > REPLICAS_DELTA_LIST_MAX) + return -ENOMEM; + + d = mempool_alloc(&trans->c->replicas_delta_pool, gfp); + if (!d) + return -ENOMEM; + + memset(d, 0, REPLICAS_DELTA_LIST_MAX); + + if (trans->fs_usage_deltas) + memcpy(d, trans->fs_usage_deltas, + trans->fs_usage_deltas->size + sizeof(*d)); + + new_size = REPLICAS_DELTA_LIST_MAX - sizeof(*d); + kfree(trans->fs_usage_deltas); + } + + d->size = new_size; + trans->fs_usage_deltas = d; + } + + return 0; +} + +int bch2_replicas_deltas_realloc(struct btree_trans *trans, unsigned more) +{ + return allocate_dropping_locks_errcode(trans, + __replicas_deltas_realloc(trans, more, _gfp)); +} + +static inline int update_replicas_list(struct btree_trans *trans, + struct bch_replicas_entry *r, + s64 sectors) +{ + struct replicas_delta_list *d; + struct replicas_delta *n; + unsigned b; + int ret; + + if (!sectors) + return 0; + + b = replicas_entry_bytes(r) + 8; + ret = bch2_replicas_deltas_realloc(trans, b); + if (ret) + return ret; + + d = trans->fs_usage_deltas; + n = (void *) d->d + d->used; + n->delta = sectors; + unsafe_memcpy((void *) n + offsetof(struct replicas_delta, r), + r, replicas_entry_bytes(r), + "flexible array member embedded in strcuct with padding"); + bch2_replicas_entry_sort(&n->r); + d->used += b; + return 0; +} + +static inline int update_cached_sectors_list(struct btree_trans *trans, + unsigned dev, s64 sectors) +{ + struct bch_replicas_padded r; + + bch2_replicas_entry_cached(&r.e, dev); + + return update_replicas_list(trans, &r.e, sectors); +} + +int bch2_mark_alloc(struct btree_trans *trans, + enum btree_id btree, unsigned level, + struct bkey_s_c old, struct bkey_s_c new, + unsigned flags) +{ + bool gc = flags & BTREE_TRIGGER_GC; + u64 journal_seq = trans->journal_res.seq; + u64 bucket_journal_seq; + struct bch_fs *c = trans->c; + struct bch_alloc_v4 old_a_convert, new_a_convert; + const struct bch_alloc_v4 *old_a, *new_a; + struct bch_dev *ca; + int ret = 0; + + /* + * alloc btree is read in by bch2_alloc_read, not gc: + */ + if ((flags & BTREE_TRIGGER_GC) && + !(flags & BTREE_TRIGGER_BUCKET_INVALIDATE)) + return 0; + + if (bch2_trans_inconsistent_on(!bch2_dev_bucket_exists(c, new.k->p), trans, + "alloc key for invalid device or bucket")) + return -EIO; + + ca = bch_dev_bkey_exists(c, new.k->p.inode); + + old_a = bch2_alloc_to_v4(old, &old_a_convert); + new_a = bch2_alloc_to_v4(new, &new_a_convert); + + bucket_journal_seq = new_a->journal_seq; + + if ((flags & BTREE_TRIGGER_INSERT) && + data_type_is_empty(old_a->data_type) != + data_type_is_empty(new_a->data_type) && + new.k->type == KEY_TYPE_alloc_v4) { + struct bch_alloc_v4 *v = (struct bch_alloc_v4 *) new.v; + + EBUG_ON(!journal_seq); + + /* + * If the btree updates referring to a bucket weren't flushed + * before the bucket became empty again, then the we don't have + * to wait on a journal flush before we can reuse the bucket: + */ + v->journal_seq = bucket_journal_seq = + data_type_is_empty(new_a->data_type) && + (journal_seq == v->journal_seq || + bch2_journal_noflush_seq(&c->journal, v->journal_seq)) + ? 0 : journal_seq; + } + + if (!data_type_is_empty(old_a->data_type) && + data_type_is_empty(new_a->data_type) && + bucket_journal_seq) { + ret = bch2_set_bucket_needs_journal_commit(&c->buckets_waiting_for_journal, + c->journal.flushed_seq_ondisk, + new.k->p.inode, new.k->p.offset, + bucket_journal_seq); + if (ret) { + bch2_fs_fatal_error(c, + "error setting bucket_needs_journal_commit: %i", ret); + return ret; + } + } + + percpu_down_read(&c->mark_lock); + if (!gc && new_a->gen != old_a->gen) + *bucket_gen(ca, new.k->p.offset) = new_a->gen; + + bch2_dev_usage_update(c, ca, *old_a, *new_a, journal_seq, gc); + + if (gc) { + struct bucket *g = gc_bucket(ca, new.k->p.offset); + + bucket_lock(g); + + g->gen_valid = 1; + g->gen = new_a->gen; + g->data_type = new_a->data_type; + g->stripe = new_a->stripe; + g->stripe_redundancy = new_a->stripe_redundancy; + g->dirty_sectors = new_a->dirty_sectors; + g->cached_sectors = new_a->cached_sectors; + + bucket_unlock(g); + } + percpu_up_read(&c->mark_lock); + + /* + * need to know if we're getting called from the invalidate path or + * not: + */ + + if ((flags & BTREE_TRIGGER_BUCKET_INVALIDATE) && + old_a->cached_sectors) { + ret = update_cached_sectors(c, new, ca->dev_idx, + -((s64) old_a->cached_sectors), + journal_seq, gc); + if (ret) { + bch2_fs_fatal_error(c, "%s(): no replicas entry while updating cached sectors", + __func__); + return ret; + } + } + + if (new_a->data_type == BCH_DATA_free && + (!new_a->journal_seq || new_a->journal_seq < c->journal.flushed_seq_ondisk)) + closure_wake_up(&c->freelist_wait); + + if (new_a->data_type == BCH_DATA_need_discard && + (!bucket_journal_seq || bucket_journal_seq < c->journal.flushed_seq_ondisk)) + bch2_do_discards(c); + + if (old_a->data_type != BCH_DATA_cached && + new_a->data_type == BCH_DATA_cached && + should_invalidate_buckets(ca, bch2_dev_usage_read(ca))) + bch2_do_invalidates(c); + + if (new_a->data_type == BCH_DATA_need_gc_gens) + bch2_do_gc_gens(c); + + return 0; +} + +int bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca, + size_t b, enum bch_data_type data_type, + unsigned sectors, struct gc_pos pos, + unsigned flags) +{ + struct bucket old, new, *g; + int ret = 0; + + BUG_ON(!(flags & BTREE_TRIGGER_GC)); + BUG_ON(data_type != BCH_DATA_sb && + data_type != BCH_DATA_journal); + + /* + * Backup superblock might be past the end of our normal usable space: + */ + if (b >= ca->mi.nbuckets) + return 0; + + percpu_down_read(&c->mark_lock); + g = gc_bucket(ca, b); + + bucket_lock(g); + old = *g; + + if (bch2_fs_inconsistent_on(g->data_type && + g->data_type != data_type, c, + "different types of data in same bucket: %s, %s", + bch2_data_types[g->data_type], + bch2_data_types[data_type])) { + ret = -EIO; + goto err; + } + + if (bch2_fs_inconsistent_on((u64) g->dirty_sectors + sectors > ca->mi.bucket_size, c, + "bucket %u:%zu gen %u data type %s sector count overflow: %u + %u > bucket size", + ca->dev_idx, b, g->gen, + bch2_data_types[g->data_type ?: data_type], + g->dirty_sectors, sectors)) { + ret = -EIO; + goto err; + } + + + g->data_type = data_type; + g->dirty_sectors += sectors; + new = *g; +err: + bucket_unlock(g); + if (!ret) + bch2_dev_usage_update_m(c, ca, old, new, 0, true); + percpu_up_read(&c->mark_lock); + return ret; +} + +static int check_bucket_ref(struct btree_trans *trans, + struct bkey_s_c k, + const struct bch_extent_ptr *ptr, + s64 sectors, enum bch_data_type ptr_data_type, + u8 b_gen, u8 bucket_data_type, + u32 dirty_sectors, u32 cached_sectors) +{ + struct bch_fs *c = trans->c; + struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); + size_t bucket_nr = PTR_BUCKET_NR(ca, ptr); + u32 bucket_sectors = !ptr->cached + ? dirty_sectors + : cached_sectors; + struct printbuf buf = PRINTBUF; + int ret = 0; + + if (bucket_data_type == BCH_DATA_cached) + bucket_data_type = BCH_DATA_user; + + if ((bucket_data_type == BCH_DATA_stripe && ptr_data_type == BCH_DATA_user) || + (bucket_data_type == BCH_DATA_user && ptr_data_type == BCH_DATA_stripe)) + bucket_data_type = ptr_data_type = BCH_DATA_stripe; + + if (gen_after(ptr->gen, b_gen)) { + bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, + BCH_FSCK_ERR_ptr_gen_newer_than_bucket_gen, + "bucket %u:%zu gen %u data type %s: ptr gen %u newer than bucket gen\n" + "while marking %s", + ptr->dev, bucket_nr, b_gen, + bch2_data_types[bucket_data_type ?: ptr_data_type], + ptr->gen, + (bch2_bkey_val_to_text(&buf, c, k), buf.buf)); + ret = -EIO; + goto err; + } + + if (gen_cmp(b_gen, ptr->gen) > BUCKET_GC_GEN_MAX) { + bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, + BCH_FSCK_ERR_ptr_too_stale, + "bucket %u:%zu gen %u data type %s: ptr gen %u too stale\n" + "while marking %s", + ptr->dev, bucket_nr, b_gen, + bch2_data_types[bucket_data_type ?: ptr_data_type], + ptr->gen, + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, k), buf.buf)); + ret = -EIO; + goto err; + } + + if (b_gen != ptr->gen && !ptr->cached) { + bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, + BCH_FSCK_ERR_stale_dirty_ptr, + "bucket %u:%zu gen %u (mem gen %u) data type %s: stale dirty ptr (gen %u)\n" + "while marking %s", + ptr->dev, bucket_nr, b_gen, + *bucket_gen(ca, bucket_nr), + bch2_data_types[bucket_data_type ?: ptr_data_type], + ptr->gen, + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, k), buf.buf)); + ret = -EIO; + goto err; + } + + if (b_gen != ptr->gen) { + ret = 1; + goto out; + } + + if (!data_type_is_empty(bucket_data_type) && + ptr_data_type && + bucket_data_type != ptr_data_type) { + bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, + BCH_FSCK_ERR_ptr_bucket_data_type_mismatch, + "bucket %u:%zu gen %u different types of data in same bucket: %s, %s\n" + "while marking %s", + ptr->dev, bucket_nr, b_gen, + bch2_data_types[bucket_data_type], + bch2_data_types[ptr_data_type], + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, k), buf.buf)); + ret = -EIO; + goto err; + } + + if ((u64) bucket_sectors + sectors > U32_MAX) { + bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, + BCH_FSCK_ERR_bucket_sector_count_overflow, + "bucket %u:%zu gen %u data type %s sector count overflow: %u + %lli > U32_MAX\n" + "while marking %s", + ptr->dev, bucket_nr, b_gen, + bch2_data_types[bucket_data_type ?: ptr_data_type], + bucket_sectors, sectors, + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, k), buf.buf)); + ret = -EIO; + goto err; + } +out: + printbuf_exit(&buf); + return ret; +err: + bch2_dump_trans_updates(trans); + goto out; +} + +static int mark_stripe_bucket(struct btree_trans *trans, + struct bkey_s_c k, + unsigned ptr_idx, + unsigned flags) +{ + struct bch_fs *c = trans->c; + u64 journal_seq = trans->journal_res.seq; + const struct bch_stripe *s = bkey_s_c_to_stripe(k).v; + unsigned nr_data = s->nr_blocks - s->nr_redundant; + bool parity = ptr_idx >= nr_data; + enum bch_data_type data_type = parity ? BCH_DATA_parity : BCH_DATA_stripe; + s64 sectors = parity ? le16_to_cpu(s->sectors) : 0; + const struct bch_extent_ptr *ptr = s->ptrs + ptr_idx; + struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); + struct bucket old, new, *g; + struct printbuf buf = PRINTBUF; + int ret = 0; + + BUG_ON(!(flags & BTREE_TRIGGER_GC)); + + /* * XXX doesn't handle deletion */ + + percpu_down_read(&c->mark_lock); + g = PTR_GC_BUCKET(ca, ptr); + + if (g->dirty_sectors || + (g->stripe && g->stripe != k.k->p.offset)) { + bch2_fs_inconsistent(c, + "bucket %u:%zu gen %u: multiple stripes using same bucket\n%s", + ptr->dev, PTR_BUCKET_NR(ca, ptr), g->gen, + (bch2_bkey_val_to_text(&buf, c, k), buf.buf)); + ret = -EINVAL; + goto err; + } + + bucket_lock(g); + old = *g; + + ret = check_bucket_ref(trans, k, ptr, sectors, data_type, + g->gen, g->data_type, + g->dirty_sectors, g->cached_sectors); + if (ret) + goto err; + + g->data_type = data_type; + g->dirty_sectors += sectors; + + g->stripe = k.k->p.offset; + g->stripe_redundancy = s->nr_redundant; + new = *g; +err: + bucket_unlock(g); + if (!ret) + bch2_dev_usage_update_m(c, ca, old, new, journal_seq, true); + percpu_up_read(&c->mark_lock); + printbuf_exit(&buf); + return ret; +} + +static int __mark_pointer(struct btree_trans *trans, + struct bkey_s_c k, + const struct bch_extent_ptr *ptr, + s64 sectors, enum bch_data_type ptr_data_type, + u8 bucket_gen, u8 *bucket_data_type, + u32 *dirty_sectors, u32 *cached_sectors) +{ + u32 *dst_sectors = !ptr->cached + ? dirty_sectors + : cached_sectors; + int ret = check_bucket_ref(trans, k, ptr, sectors, ptr_data_type, + bucket_gen, *bucket_data_type, + *dirty_sectors, *cached_sectors); + + if (ret) + return ret; + + *dst_sectors += sectors; + + if (!*dirty_sectors && !*cached_sectors) + *bucket_data_type = 0; + else if (*bucket_data_type != BCH_DATA_stripe) + *bucket_data_type = ptr_data_type; + + return 0; +} + +static int bch2_mark_pointer(struct btree_trans *trans, + enum btree_id btree_id, unsigned level, + struct bkey_s_c k, + struct extent_ptr_decoded p, + s64 sectors, + unsigned flags) +{ + u64 journal_seq = trans->journal_res.seq; + struct bch_fs *c = trans->c; + struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev); + struct bucket old, new, *g; + enum bch_data_type data_type = bkey_ptr_data_type(btree_id, level, k, p); + u8 bucket_data_type; + int ret = 0; + + BUG_ON(!(flags & BTREE_TRIGGER_GC)); + + percpu_down_read(&c->mark_lock); + g = PTR_GC_BUCKET(ca, &p.ptr); + bucket_lock(g); + old = *g; + + bucket_data_type = g->data_type; + ret = __mark_pointer(trans, k, &p.ptr, sectors, + data_type, g->gen, + &bucket_data_type, + &g->dirty_sectors, + &g->cached_sectors); + if (!ret) + g->data_type = bucket_data_type; + + new = *g; + bucket_unlock(g); + if (!ret) + bch2_dev_usage_update_m(c, ca, old, new, journal_seq, true); + percpu_up_read(&c->mark_lock); + + return ret; +} + +static int bch2_mark_stripe_ptr(struct btree_trans *trans, + struct bkey_s_c k, + struct bch_extent_stripe_ptr p, + enum bch_data_type data_type, + s64 sectors, + unsigned flags) +{ + struct bch_fs *c = trans->c; + struct bch_replicas_padded r; + struct gc_stripe *m; + + BUG_ON(!(flags & BTREE_TRIGGER_GC)); + + m = genradix_ptr_alloc(&c->gc_stripes, p.idx, GFP_KERNEL); + if (!m) { + bch_err(c, "error allocating memory for gc_stripes, idx %llu", + (u64) p.idx); + return -BCH_ERR_ENOMEM_mark_stripe_ptr; + } + + mutex_lock(&c->ec_stripes_heap_lock); + + if (!m || !m->alive) { + mutex_unlock(&c->ec_stripes_heap_lock); + bch_err_ratelimited(c, "pointer to nonexistent stripe %llu", + (u64) p.idx); + bch2_inconsistent_error(c); + return -EIO; + } + + m->block_sectors[p.block] += sectors; + + r = m->r; + mutex_unlock(&c->ec_stripes_heap_lock); + + r.e.data_type = data_type; + update_replicas(c, k, &r.e, sectors, trans->journal_res.seq, true); + + return 0; +} + +static int __mark_extent(struct btree_trans *trans, + enum btree_id btree_id, unsigned level, + struct bkey_s_c k, unsigned flags) +{ + u64 journal_seq = trans->journal_res.seq; + struct bch_fs *c = trans->c; + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + const union bch_extent_entry *entry; + struct extent_ptr_decoded p; + struct bch_replicas_padded r; + enum bch_data_type data_type = bkey_is_btree_ptr(k.k) + ? BCH_DATA_btree + : BCH_DATA_user; + s64 sectors = bkey_is_btree_ptr(k.k) + ? btree_sectors(c) + : k.k->size; + s64 dirty_sectors = 0; + bool stale; + int ret; + + BUG_ON(!(flags & BTREE_TRIGGER_GC)); + + r.e.data_type = data_type; + r.e.nr_devs = 0; + r.e.nr_required = 1; + + bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { + s64 disk_sectors = ptr_disk_sectors(sectors, p); + + if (flags & BTREE_TRIGGER_OVERWRITE) + disk_sectors = -disk_sectors; + + ret = bch2_mark_pointer(trans, btree_id, level, k, p, disk_sectors, flags); + if (ret < 0) + return ret; + + stale = ret > 0; + + if (p.ptr.cached) { + if (!stale) { + ret = update_cached_sectors(c, k, p.ptr.dev, + disk_sectors, journal_seq, true); + if (ret) { + bch2_fs_fatal_error(c, "%s(): no replicas entry while updating cached sectors", + __func__); + return ret; + } + } + } else if (!p.has_ec) { + dirty_sectors += disk_sectors; + r.e.devs[r.e.nr_devs++] = p.ptr.dev; + } else { + ret = bch2_mark_stripe_ptr(trans, k, p.ec, data_type, + disk_sectors, flags); + if (ret) + return ret; + + /* + * There may be other dirty pointers in this extent, but + * if so they're not required for mounting if we have an + * erasure coded pointer in this extent: + */ + r.e.nr_required = 0; + } + } + + if (r.e.nr_devs) { + ret = update_replicas(c, k, &r.e, dirty_sectors, journal_seq, true); + if (ret) { + struct printbuf buf = PRINTBUF; + + bch2_bkey_val_to_text(&buf, c, k); + bch2_fs_fatal_error(c, "%s(): no replicas entry for %s", __func__, buf.buf); + printbuf_exit(&buf); + return ret; + } + } + + return 0; +} + +int bch2_mark_extent(struct btree_trans *trans, + enum btree_id btree_id, unsigned level, + struct bkey_s_c old, struct bkey_s_c new, + unsigned flags) +{ + return mem_trigger_run_overwrite_then_insert(__mark_extent, trans, btree_id, level, old, new, flags); +} + +int bch2_mark_stripe(struct btree_trans *trans, + enum btree_id btree_id, unsigned level, + struct bkey_s_c old, struct bkey_s_c new, + unsigned flags) +{ + bool gc = flags & BTREE_TRIGGER_GC; + u64 journal_seq = trans->journal_res.seq; + struct bch_fs *c = trans->c; + u64 idx = new.k->p.offset; + const struct bch_stripe *old_s = old.k->type == KEY_TYPE_stripe + ? bkey_s_c_to_stripe(old).v : NULL; + const struct bch_stripe *new_s = new.k->type == KEY_TYPE_stripe + ? bkey_s_c_to_stripe(new).v : NULL; + unsigned i; + int ret; + + BUG_ON(gc && old_s); + + if (!gc) { + struct stripe *m = genradix_ptr(&c->stripes, idx); + + if (!m) { + struct printbuf buf1 = PRINTBUF; + struct printbuf buf2 = PRINTBUF; + + bch2_bkey_val_to_text(&buf1, c, old); + bch2_bkey_val_to_text(&buf2, c, new); + bch_err_ratelimited(c, "error marking nonexistent stripe %llu while marking\n" + "old %s\n" + "new %s", idx, buf1.buf, buf2.buf); + printbuf_exit(&buf2); + printbuf_exit(&buf1); + bch2_inconsistent_error(c); + return -1; + } + + if (!new_s) { + bch2_stripes_heap_del(c, m, idx); + + memset(m, 0, sizeof(*m)); + } else { + m->sectors = le16_to_cpu(new_s->sectors); + m->algorithm = new_s->algorithm; + m->nr_blocks = new_s->nr_blocks; + m->nr_redundant = new_s->nr_redundant; + m->blocks_nonempty = 0; + + for (i = 0; i < new_s->nr_blocks; i++) + m->blocks_nonempty += !!stripe_blockcount_get(new_s, i); + + if (!old_s) + bch2_stripes_heap_insert(c, m, idx); + else + bch2_stripes_heap_update(c, m, idx); + } + } else { + struct gc_stripe *m = + genradix_ptr_alloc(&c->gc_stripes, idx, GFP_KERNEL); + + if (!m) { + bch_err(c, "error allocating memory for gc_stripes, idx %llu", + idx); + return -BCH_ERR_ENOMEM_mark_stripe; + } + /* + * This will be wrong when we bring back runtime gc: we should + * be unmarking the old key and then marking the new key + */ + m->alive = true; + m->sectors = le16_to_cpu(new_s->sectors); + m->nr_blocks = new_s->nr_blocks; + m->nr_redundant = new_s->nr_redundant; + + for (i = 0; i < new_s->nr_blocks; i++) + m->ptrs[i] = new_s->ptrs[i]; + + bch2_bkey_to_replicas(&m->r.e, new); + + /* + * gc recalculates this field from stripe ptr + * references: + */ + memset(m->block_sectors, 0, sizeof(m->block_sectors)); + + for (i = 0; i < new_s->nr_blocks; i++) { + ret = mark_stripe_bucket(trans, new, i, flags); + if (ret) + return ret; + } + + ret = update_replicas(c, new, &m->r.e, + ((s64) m->sectors * m->nr_redundant), + journal_seq, gc); + if (ret) { + struct printbuf buf = PRINTBUF; + + bch2_bkey_val_to_text(&buf, c, new); + bch2_fs_fatal_error(c, "no replicas entry for %s", buf.buf); + printbuf_exit(&buf); + return ret; + } + } + + return 0; +} + +static int __mark_reservation(struct btree_trans *trans, + enum btree_id btree_id, unsigned level, + struct bkey_s_c k, unsigned flags) +{ + struct bch_fs *c = trans->c; + struct bch_fs_usage *fs_usage; + unsigned replicas = bkey_s_c_to_reservation(k).v->nr_replicas; + s64 sectors = (s64) k.k->size; + + BUG_ON(!(flags & BTREE_TRIGGER_GC)); + + if (flags & BTREE_TRIGGER_OVERWRITE) + sectors = -sectors; + sectors *= replicas; + + percpu_down_read(&c->mark_lock); + preempt_disable(); + + fs_usage = fs_usage_ptr(c, trans->journal_res.seq, flags & BTREE_TRIGGER_GC); + replicas = clamp_t(unsigned, replicas, 1, + ARRAY_SIZE(fs_usage->persistent_reserved)); + + fs_usage->reserved += sectors; + fs_usage->persistent_reserved[replicas - 1] += sectors; + + preempt_enable(); + percpu_up_read(&c->mark_lock); + + return 0; +} + +int bch2_mark_reservation(struct btree_trans *trans, + enum btree_id btree_id, unsigned level, + struct bkey_s_c old, struct bkey_s_c new, + unsigned flags) +{ + return mem_trigger_run_overwrite_then_insert(__mark_reservation, trans, btree_id, level, old, new, flags); +} + +static s64 __bch2_mark_reflink_p(struct btree_trans *trans, + struct bkey_s_c_reflink_p p, + u64 start, u64 end, + u64 *idx, unsigned flags, size_t r_idx) +{ + struct bch_fs *c = trans->c; + struct reflink_gc *r; + int add = !(flags & BTREE_TRIGGER_OVERWRITE) ? 1 : -1; + u64 next_idx = end; + s64 ret = 0; + struct printbuf buf = PRINTBUF; + + if (r_idx >= c->reflink_gc_nr) + goto not_found; + + r = genradix_ptr(&c->reflink_gc_table, r_idx); + next_idx = min(next_idx, r->offset - r->size); + if (*idx < next_idx) + goto not_found; + + BUG_ON((s64) r->refcount + add < 0); + + r->refcount += add; + *idx = r->offset; + return 0; +not_found: + if (fsck_err(c, reflink_p_to_missing_reflink_v, + "pointer to missing indirect extent\n" + " %s\n" + " missing range %llu-%llu", + (bch2_bkey_val_to_text(&buf, c, p.s_c), buf.buf), + *idx, next_idx)) { + struct bkey_i_error *new; + + new = bch2_trans_kmalloc(trans, sizeof(*new)); + ret = PTR_ERR_OR_ZERO(new); + if (ret) + goto err; + + bkey_init(&new->k); + new->k.type = KEY_TYPE_error; + new->k.p = bkey_start_pos(p.k); + new->k.p.offset += *idx - start; + bch2_key_resize(&new->k, next_idx - *idx); + ret = bch2_btree_insert_trans(trans, BTREE_ID_extents, &new->k_i, + BTREE_TRIGGER_NORUN); + } + + *idx = next_idx; +err: +fsck_err: + printbuf_exit(&buf); + return ret; +} + +static int __mark_reflink_p(struct btree_trans *trans, + enum btree_id btree_id, unsigned level, + struct bkey_s_c k, unsigned flags) +{ + struct bch_fs *c = trans->c; + struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k); + struct reflink_gc *ref; + size_t l, r, m; + u64 idx = le64_to_cpu(p.v->idx), start = idx; + u64 end = le64_to_cpu(p.v->idx) + p.k->size; + int ret = 0; + + BUG_ON(!(flags & BTREE_TRIGGER_GC)); + + if (c->sb.version_upgrade_complete >= bcachefs_metadata_version_reflink_p_fix) { + idx -= le32_to_cpu(p.v->front_pad); + end += le32_to_cpu(p.v->back_pad); + } + + l = 0; + r = c->reflink_gc_nr; + while (l < r) { + m = l + (r - l) / 2; + + ref = genradix_ptr(&c->reflink_gc_table, m); + if (ref->offset <= idx) + l = m + 1; + else + r = m; + } + + while (idx < end && !ret) + ret = __bch2_mark_reflink_p(trans, p, start, end, + &idx, flags, l++); + + return ret; +} + +int bch2_mark_reflink_p(struct btree_trans *trans, + enum btree_id btree_id, unsigned level, + struct bkey_s_c old, struct bkey_s_c new, + unsigned flags) +{ + return mem_trigger_run_overwrite_then_insert(__mark_reflink_p, trans, btree_id, level, old, new, flags); +} + +void bch2_trans_fs_usage_revert(struct btree_trans *trans, + struct replicas_delta_list *deltas) +{ + struct bch_fs *c = trans->c; + struct bch_fs_usage *dst; + struct replicas_delta *d, *top = (void *) deltas->d + deltas->used; + s64 added = 0; + unsigned i; + + percpu_down_read(&c->mark_lock); + preempt_disable(); + dst = fs_usage_ptr(c, trans->journal_res.seq, false); + + /* revert changes: */ + for (d = deltas->d; d != top; d = replicas_delta_next(d)) { + switch (d->r.data_type) { + case BCH_DATA_btree: + case BCH_DATA_user: + case BCH_DATA_parity: + added += d->delta; + } + BUG_ON(__update_replicas(c, dst, &d->r, -d->delta)); + } + + dst->nr_inodes -= deltas->nr_inodes; + + for (i = 0; i < BCH_REPLICAS_MAX; i++) { + added -= deltas->persistent_reserved[i]; + dst->reserved -= deltas->persistent_reserved[i]; + dst->persistent_reserved[i] -= deltas->persistent_reserved[i]; + } + + if (added > 0) { + trans->disk_res->sectors += added; + this_cpu_add(*c->online_reserved, added); + } + + preempt_enable(); + percpu_up_read(&c->mark_lock); +} + +int bch2_trans_fs_usage_apply(struct btree_trans *trans, + struct replicas_delta_list *deltas) +{ + struct bch_fs *c = trans->c; + static int warned_disk_usage = 0; + bool warn = false; + u64 disk_res_sectors = trans->disk_res ? trans->disk_res->sectors : 0; + struct replicas_delta *d, *d2; + struct replicas_delta *top = (void *) deltas->d + deltas->used; + struct bch_fs_usage *dst; + s64 added = 0, should_not_have_added; + unsigned i; + + percpu_down_read(&c->mark_lock); + preempt_disable(); + dst = fs_usage_ptr(c, trans->journal_res.seq, false); + + for (d = deltas->d; d != top; d = replicas_delta_next(d)) { + switch (d->r.data_type) { + case BCH_DATA_btree: + case BCH_DATA_user: + case BCH_DATA_parity: + added += d->delta; + } + + if (__update_replicas(c, dst, &d->r, d->delta)) + goto need_mark; + } + + dst->nr_inodes += deltas->nr_inodes; + + for (i = 0; i < BCH_REPLICAS_MAX; i++) { + added += deltas->persistent_reserved[i]; + dst->reserved += deltas->persistent_reserved[i]; + dst->persistent_reserved[i] += deltas->persistent_reserved[i]; + } + + /* + * Not allowed to reduce sectors_available except by getting a + * reservation: + */ + should_not_have_added = added - (s64) disk_res_sectors; + if (unlikely(should_not_have_added > 0)) { + u64 old, new, v = atomic64_read(&c->sectors_available); + + do { + old = v; + new = max_t(s64, 0, old - should_not_have_added); + } while ((v = atomic64_cmpxchg(&c->sectors_available, + old, new)) != old); + + added -= should_not_have_added; + warn = true; + } + + if (added > 0) { + trans->disk_res->sectors -= added; + this_cpu_sub(*c->online_reserved, added); + } + + preempt_enable(); + percpu_up_read(&c->mark_lock); + + if (unlikely(warn) && !xchg(&warned_disk_usage, 1)) + bch2_trans_inconsistent(trans, + "disk usage increased %lli more than %llu sectors reserved)", + should_not_have_added, disk_res_sectors); + return 0; +need_mark: + /* revert changes: */ + for (d2 = deltas->d; d2 != d; d2 = replicas_delta_next(d2)) + BUG_ON(__update_replicas(c, dst, &d2->r, -d2->delta)); + + preempt_enable(); + percpu_up_read(&c->mark_lock); + return -1; +} + +/* trans_mark: */ + +static inline int bch2_trans_mark_pointer(struct btree_trans *trans, + enum btree_id btree_id, unsigned level, + struct bkey_s_c k, struct extent_ptr_decoded p, + unsigned flags) +{ + bool insert = !(flags & BTREE_TRIGGER_OVERWRITE); + struct btree_iter iter; + struct bkey_i_alloc_v4 *a; + struct bpos bucket; + struct bch_backpointer bp; + s64 sectors; + int ret; + + bch2_extent_ptr_to_bp(trans->c, btree_id, level, k, p, &bucket, &bp); + sectors = bp.bucket_len; + if (!insert) + sectors = -sectors; + + a = bch2_trans_start_alloc_update(trans, &iter, bucket); + if (IS_ERR(a)) + return PTR_ERR(a); + + ret = __mark_pointer(trans, k, &p.ptr, sectors, bp.data_type, + a->v.gen, &a->v.data_type, + &a->v.dirty_sectors, &a->v.cached_sectors) ?: + bch2_trans_update(trans, &iter, &a->k_i, 0); + bch2_trans_iter_exit(trans, &iter); + + if (ret) + return ret; + + if (!p.ptr.cached) { + ret = bch2_bucket_backpointer_mod(trans, bucket, bp, k, insert); + if (ret) + return ret; + } + + return 0; +} + +static int bch2_trans_mark_stripe_ptr(struct btree_trans *trans, + struct extent_ptr_decoded p, + s64 sectors, enum bch_data_type data_type) +{ + struct btree_iter iter; + struct bkey_i_stripe *s; + struct bch_replicas_padded r; + int ret = 0; + + s = bch2_bkey_get_mut_typed(trans, &iter, + BTREE_ID_stripes, POS(0, p.ec.idx), + BTREE_ITER_WITH_UPDATES, stripe); + ret = PTR_ERR_OR_ZERO(s); + if (unlikely(ret)) { + bch2_trans_inconsistent_on(bch2_err_matches(ret, ENOENT), trans, + "pointer to nonexistent stripe %llu", + (u64) p.ec.idx); + goto err; + } + + if (!bch2_ptr_matches_stripe(&s->v, p)) { + bch2_trans_inconsistent(trans, + "stripe pointer doesn't match stripe %llu", + (u64) p.ec.idx); + ret = -EIO; + goto err; + } + + stripe_blockcount_set(&s->v, p.ec.block, + stripe_blockcount_get(&s->v, p.ec.block) + + sectors); + + bch2_bkey_to_replicas(&r.e, bkey_i_to_s_c(&s->k_i)); + r.e.data_type = data_type; + ret = update_replicas_list(trans, &r.e, sectors); +err: + bch2_trans_iter_exit(trans, &iter); + return ret; +} + +static int __trans_mark_extent(struct btree_trans *trans, + enum btree_id btree_id, unsigned level, + struct bkey_s_c k, unsigned flags) +{ + struct bch_fs *c = trans->c; + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + const union bch_extent_entry *entry; + struct extent_ptr_decoded p; + struct bch_replicas_padded r; + enum bch_data_type data_type = bkey_is_btree_ptr(k.k) + ? BCH_DATA_btree + : BCH_DATA_user; + s64 sectors = bkey_is_btree_ptr(k.k) + ? btree_sectors(c) + : k.k->size; + s64 dirty_sectors = 0; + bool stale; + int ret = 0; + + r.e.data_type = data_type; + r.e.nr_devs = 0; + r.e.nr_required = 1; + + bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { + s64 disk_sectors = ptr_disk_sectors(sectors, p); + + if (flags & BTREE_TRIGGER_OVERWRITE) + disk_sectors = -disk_sectors; + + ret = bch2_trans_mark_pointer(trans, btree_id, level, k, p, flags); + if (ret < 0) + return ret; + + stale = ret > 0; + + if (p.ptr.cached) { + if (!stale) { + ret = update_cached_sectors_list(trans, p.ptr.dev, + disk_sectors); + if (ret) + return ret; + } + } else if (!p.has_ec) { + dirty_sectors += disk_sectors; + r.e.devs[r.e.nr_devs++] = p.ptr.dev; + } else { + ret = bch2_trans_mark_stripe_ptr(trans, p, + disk_sectors, data_type); + if (ret) + return ret; + + r.e.nr_required = 0; + } + } + + if (r.e.nr_devs) + ret = update_replicas_list(trans, &r.e, dirty_sectors); + + return ret; +} + +int bch2_trans_mark_extent(struct btree_trans *trans, + enum btree_id btree_id, unsigned level, + struct bkey_s_c old, struct bkey_i *new, + unsigned flags) +{ + struct bch_fs *c = trans->c; + int mod = (int) bch2_bkey_needs_rebalance(c, bkey_i_to_s_c(new)) - + (int) bch2_bkey_needs_rebalance(c, old); + + if (mod) { + int ret = bch2_btree_bit_mod(trans, BTREE_ID_rebalance_work, new->k.p, mod > 0); + if (ret) + return ret; + } + + return trigger_run_overwrite_then_insert(__trans_mark_extent, trans, btree_id, level, old, new, flags); +} + +static int bch2_trans_mark_stripe_bucket(struct btree_trans *trans, + struct bkey_s_c_stripe s, + unsigned idx, bool deleting) +{ + struct bch_fs *c = trans->c; + const struct bch_extent_ptr *ptr = &s.v->ptrs[idx]; + struct btree_iter iter; + struct bkey_i_alloc_v4 *a; + enum bch_data_type data_type = idx >= s.v->nr_blocks - s.v->nr_redundant + ? BCH_DATA_parity : 0; + s64 sectors = data_type ? le16_to_cpu(s.v->sectors) : 0; + int ret = 0; + + if (deleting) + sectors = -sectors; + + a = bch2_trans_start_alloc_update(trans, &iter, PTR_BUCKET_POS(c, ptr)); + if (IS_ERR(a)) + return PTR_ERR(a); + + ret = check_bucket_ref(trans, s.s_c, ptr, sectors, data_type, + a->v.gen, a->v.data_type, + a->v.dirty_sectors, a->v.cached_sectors); + if (ret) + goto err; + + if (!deleting) { + if (bch2_trans_inconsistent_on(a->v.stripe || + a->v.stripe_redundancy, trans, + "bucket %llu:%llu gen %u data type %s dirty_sectors %u: multiple stripes using same bucket (%u, %llu)", + iter.pos.inode, iter.pos.offset, a->v.gen, + bch2_data_types[a->v.data_type], + a->v.dirty_sectors, + a->v.stripe, s.k->p.offset)) { + ret = -EIO; + goto err; + } + + if (bch2_trans_inconsistent_on(data_type && a->v.dirty_sectors, trans, + "bucket %llu:%llu gen %u data type %s dirty_sectors %u: data already in stripe bucket %llu", + iter.pos.inode, iter.pos.offset, a->v.gen, + bch2_data_types[a->v.data_type], + a->v.dirty_sectors, + s.k->p.offset)) { + ret = -EIO; + goto err; + } + + a->v.stripe = s.k->p.offset; + a->v.stripe_redundancy = s.v->nr_redundant; + a->v.data_type = BCH_DATA_stripe; + } else { + if (bch2_trans_inconsistent_on(a->v.stripe != s.k->p.offset || + a->v.stripe_redundancy != s.v->nr_redundant, trans, + "bucket %llu:%llu gen %u: not marked as stripe when deleting stripe %llu (got %u)", + iter.pos.inode, iter.pos.offset, a->v.gen, + s.k->p.offset, a->v.stripe)) { + ret = -EIO; + goto err; + } + + a->v.stripe = 0; + a->v.stripe_redundancy = 0; + a->v.data_type = alloc_data_type(a->v, BCH_DATA_user); + } + + a->v.dirty_sectors += sectors; + if (data_type) + a->v.data_type = !deleting ? data_type : 0; + + ret = bch2_trans_update(trans, &iter, &a->k_i, 0); + if (ret) + goto err; +err: + bch2_trans_iter_exit(trans, &iter); + return ret; +} + +int bch2_trans_mark_stripe(struct btree_trans *trans, + enum btree_id btree_id, unsigned level, + struct bkey_s_c old, struct bkey_i *new, + unsigned flags) +{ + const struct bch_stripe *old_s = NULL; + struct bch_stripe *new_s = NULL; + struct bch_replicas_padded r; + unsigned i, nr_blocks; + int ret = 0; + + if (old.k->type == KEY_TYPE_stripe) + old_s = bkey_s_c_to_stripe(old).v; + if (new->k.type == KEY_TYPE_stripe) + new_s = &bkey_i_to_stripe(new)->v; + + /* + * If the pointers aren't changing, we don't need to do anything: + */ + if (new_s && old_s && + new_s->nr_blocks == old_s->nr_blocks && + new_s->nr_redundant == old_s->nr_redundant && + !memcmp(old_s->ptrs, new_s->ptrs, + new_s->nr_blocks * sizeof(struct bch_extent_ptr))) + return 0; + + BUG_ON(new_s && old_s && + (new_s->nr_blocks != old_s->nr_blocks || + new_s->nr_redundant != old_s->nr_redundant)); + + nr_blocks = new_s ? new_s->nr_blocks : old_s->nr_blocks; + + if (new_s) { + s64 sectors = le16_to_cpu(new_s->sectors); + + bch2_bkey_to_replicas(&r.e, bkey_i_to_s_c(new)); + ret = update_replicas_list(trans, &r.e, sectors * new_s->nr_redundant); + if (ret) + return ret; + } + + if (old_s) { + s64 sectors = -((s64) le16_to_cpu(old_s->sectors)); + + bch2_bkey_to_replicas(&r.e, old); + ret = update_replicas_list(trans, &r.e, sectors * old_s->nr_redundant); + if (ret) + return ret; + } + + for (i = 0; i < nr_blocks; i++) { + if (new_s && old_s && + !memcmp(&new_s->ptrs[i], + &old_s->ptrs[i], + sizeof(new_s->ptrs[i]))) + continue; + + if (new_s) { + ret = bch2_trans_mark_stripe_bucket(trans, + bkey_i_to_s_c_stripe(new), i, false); + if (ret) + break; + } + + if (old_s) { + ret = bch2_trans_mark_stripe_bucket(trans, + bkey_s_c_to_stripe(old), i, true); + if (ret) + break; + } + } + + return ret; +} + +static int __trans_mark_reservation(struct btree_trans *trans, + enum btree_id btree_id, unsigned level, + struct bkey_s_c k, unsigned flags) +{ + unsigned replicas = bkey_s_c_to_reservation(k).v->nr_replicas; + s64 sectors = (s64) k.k->size; + struct replicas_delta_list *d; + int ret; + + if (flags & BTREE_TRIGGER_OVERWRITE) + sectors = -sectors; + sectors *= replicas; + + ret = bch2_replicas_deltas_realloc(trans, 0); + if (ret) + return ret; + + d = trans->fs_usage_deltas; + replicas = clamp_t(unsigned, replicas, 1, + ARRAY_SIZE(d->persistent_reserved)); + + d->persistent_reserved[replicas - 1] += sectors; + return 0; +} + +int bch2_trans_mark_reservation(struct btree_trans *trans, + enum btree_id btree_id, unsigned level, + struct bkey_s_c old, + struct bkey_i *new, + unsigned flags) +{ + return trigger_run_overwrite_then_insert(__trans_mark_reservation, trans, btree_id, level, old, new, flags); +} + +static int trans_mark_reflink_p_segment(struct btree_trans *trans, + struct bkey_s_c_reflink_p p, + u64 *idx, unsigned flags) +{ + struct bch_fs *c = trans->c; + struct btree_iter iter; + struct bkey_i *k; + __le64 *refcount; + int add = !(flags & BTREE_TRIGGER_OVERWRITE) ? 1 : -1; + struct printbuf buf = PRINTBUF; + int ret; + + k = bch2_bkey_get_mut_noupdate(trans, &iter, + BTREE_ID_reflink, POS(0, *idx), + BTREE_ITER_WITH_UPDATES); + ret = PTR_ERR_OR_ZERO(k); + if (ret) + goto err; + + refcount = bkey_refcount(k); + if (!refcount) { + bch2_bkey_val_to_text(&buf, c, p.s_c); + bch2_trans_inconsistent(trans, + "nonexistent indirect extent at %llu while marking\n %s", + *idx, buf.buf); + ret = -EIO; + goto err; + } + + if (!*refcount && (flags & BTREE_TRIGGER_OVERWRITE)) { + bch2_bkey_val_to_text(&buf, c, p.s_c); + bch2_trans_inconsistent(trans, + "indirect extent refcount underflow at %llu while marking\n %s", + *idx, buf.buf); + ret = -EIO; + goto err; + } + + if (flags & BTREE_TRIGGER_INSERT) { + struct bch_reflink_p *v = (struct bch_reflink_p *) p.v; + u64 pad; + + pad = max_t(s64, le32_to_cpu(v->front_pad), + le64_to_cpu(v->idx) - bkey_start_offset(&k->k)); + BUG_ON(pad > U32_MAX); + v->front_pad = cpu_to_le32(pad); + + pad = max_t(s64, le32_to_cpu(v->back_pad), + k->k.p.offset - p.k->size - le64_to_cpu(v->idx)); + BUG_ON(pad > U32_MAX); + v->back_pad = cpu_to_le32(pad); + } + + le64_add_cpu(refcount, add); + + bch2_btree_iter_set_pos_to_extent_start(&iter); + ret = bch2_trans_update(trans, &iter, k, 0); + if (ret) + goto err; + + *idx = k->k.p.offset; +err: + bch2_trans_iter_exit(trans, &iter); + printbuf_exit(&buf); + return ret; +} + +static int __trans_mark_reflink_p(struct btree_trans *trans, + enum btree_id btree_id, unsigned level, + struct bkey_s_c k, unsigned flags) +{ + struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k); + u64 idx, end_idx; + int ret = 0; + + idx = le64_to_cpu(p.v->idx) - le32_to_cpu(p.v->front_pad); + end_idx = le64_to_cpu(p.v->idx) + p.k->size + + le32_to_cpu(p.v->back_pad); + + while (idx < end_idx && !ret) + ret = trans_mark_reflink_p_segment(trans, p, &idx, flags); + return ret; +} + +int bch2_trans_mark_reflink_p(struct btree_trans *trans, + enum btree_id btree_id, unsigned level, + struct bkey_s_c old, + struct bkey_i *new, + unsigned flags) +{ + if (flags & BTREE_TRIGGER_INSERT) { + struct bch_reflink_p *v = &bkey_i_to_reflink_p(new)->v; + + v->front_pad = v->back_pad = 0; + } + + return trigger_run_overwrite_then_insert(__trans_mark_reflink_p, trans, btree_id, level, old, new, flags); +} + +static int __bch2_trans_mark_metadata_bucket(struct btree_trans *trans, + struct bch_dev *ca, size_t b, + enum bch_data_type type, + unsigned sectors) +{ + struct bch_fs *c = trans->c; + struct btree_iter iter; + struct bkey_i_alloc_v4 *a; + int ret = 0; + + /* + * Backup superblock might be past the end of our normal usable space: + */ + if (b >= ca->mi.nbuckets) + return 0; + + a = bch2_trans_start_alloc_update(trans, &iter, POS(ca->dev_idx, b)); + if (IS_ERR(a)) + return PTR_ERR(a); + + if (a->v.data_type && type && a->v.data_type != type) { + bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, + BCH_FSCK_ERR_bucket_metadata_type_mismatch, + "bucket %llu:%llu gen %u different types of data in same bucket: %s, %s\n" + "while marking %s", + iter.pos.inode, iter.pos.offset, a->v.gen, + bch2_data_types[a->v.data_type], + bch2_data_types[type], + bch2_data_types[type]); + ret = -EIO; + goto err; + } + + if (a->v.data_type != type || + a->v.dirty_sectors != sectors) { + a->v.data_type = type; + a->v.dirty_sectors = sectors; + ret = bch2_trans_update(trans, &iter, &a->k_i, 0); + } +err: + bch2_trans_iter_exit(trans, &iter); + return ret; +} + +int bch2_trans_mark_metadata_bucket(struct btree_trans *trans, + struct bch_dev *ca, size_t b, + enum bch_data_type type, + unsigned sectors) +{ + return commit_do(trans, NULL, NULL, 0, + __bch2_trans_mark_metadata_bucket(trans, ca, b, type, sectors)); +} + +static int bch2_trans_mark_metadata_sectors(struct btree_trans *trans, + struct bch_dev *ca, + u64 start, u64 end, + enum bch_data_type type, + u64 *bucket, unsigned *bucket_sectors) +{ + do { + u64 b = sector_to_bucket(ca, start); + unsigned sectors = + min_t(u64, bucket_to_sector(ca, b + 1), end) - start; + + if (b != *bucket && *bucket_sectors) { + int ret = bch2_trans_mark_metadata_bucket(trans, ca, *bucket, + type, *bucket_sectors); + if (ret) + return ret; + + *bucket_sectors = 0; + } + + *bucket = b; + *bucket_sectors += sectors; + start += sectors; + } while (start < end); + + return 0; +} + +static int __bch2_trans_mark_dev_sb(struct btree_trans *trans, + struct bch_dev *ca) +{ + struct bch_sb_layout *layout = &ca->disk_sb.sb->layout; + u64 bucket = 0; + unsigned i, bucket_sectors = 0; + int ret; + + for (i = 0; i < layout->nr_superblocks; i++) { + u64 offset = le64_to_cpu(layout->sb_offset[i]); + + if (offset == BCH_SB_SECTOR) { + ret = bch2_trans_mark_metadata_sectors(trans, ca, + 0, BCH_SB_SECTOR, + BCH_DATA_sb, &bucket, &bucket_sectors); + if (ret) + return ret; + } + + ret = bch2_trans_mark_metadata_sectors(trans, ca, offset, + offset + (1 << layout->sb_max_size_bits), + BCH_DATA_sb, &bucket, &bucket_sectors); + if (ret) + return ret; + } + + if (bucket_sectors) { + ret = bch2_trans_mark_metadata_bucket(trans, ca, + bucket, BCH_DATA_sb, bucket_sectors); + if (ret) + return ret; + } + + for (i = 0; i < ca->journal.nr; i++) { + ret = bch2_trans_mark_metadata_bucket(trans, ca, + ca->journal.buckets[i], + BCH_DATA_journal, ca->mi.bucket_size); + if (ret) + return ret; + } + + return 0; +} + +int bch2_trans_mark_dev_sb(struct bch_fs *c, struct bch_dev *ca) +{ + int ret = bch2_trans_run(c, __bch2_trans_mark_dev_sb(trans, ca)); + + if (ret) + bch_err_fn(c, ret); + return ret; +} + +int bch2_trans_mark_dev_sbs(struct bch_fs *c) +{ + struct bch_dev *ca; + unsigned i; + + for_each_online_member(ca, c, i) { + int ret = bch2_trans_mark_dev_sb(c, ca); + if (ret) { + percpu_ref_put(&ca->ref); + return ret; + } + } + + return 0; +} + +/* Disk reservations: */ + +#define SECTORS_CACHE 1024 + +int __bch2_disk_reservation_add(struct bch_fs *c, struct disk_reservation *res, + u64 sectors, int flags) +{ + struct bch_fs_pcpu *pcpu; + u64 old, v, get; + s64 sectors_available; + int ret; + + percpu_down_read(&c->mark_lock); + preempt_disable(); + pcpu = this_cpu_ptr(c->pcpu); + + if (sectors <= pcpu->sectors_available) + goto out; + + v = atomic64_read(&c->sectors_available); + do { + old = v; + get = min((u64) sectors + SECTORS_CACHE, old); + + if (get < sectors) { + preempt_enable(); + goto recalculate; + } + } while ((v = atomic64_cmpxchg(&c->sectors_available, + old, old - get)) != old); + + pcpu->sectors_available += get; + +out: + pcpu->sectors_available -= sectors; + this_cpu_add(*c->online_reserved, sectors); + res->sectors += sectors; + + preempt_enable(); + percpu_up_read(&c->mark_lock); + return 0; + +recalculate: + mutex_lock(&c->sectors_available_lock); + + percpu_u64_set(&c->pcpu->sectors_available, 0); + sectors_available = avail_factor(__bch2_fs_usage_read_short(c).free); + + if (sectors <= sectors_available || + (flags & BCH_DISK_RESERVATION_NOFAIL)) { + atomic64_set(&c->sectors_available, + max_t(s64, 0, sectors_available - sectors)); + this_cpu_add(*c->online_reserved, sectors); + res->sectors += sectors; + ret = 0; + } else { + atomic64_set(&c->sectors_available, sectors_available); + ret = -BCH_ERR_ENOSPC_disk_reservation; + } + + mutex_unlock(&c->sectors_available_lock); + percpu_up_read(&c->mark_lock); + + return ret; +} + +/* Startup/shutdown: */ + +static void bucket_gens_free_rcu(struct rcu_head *rcu) +{ + struct bucket_gens *buckets = + container_of(rcu, struct bucket_gens, rcu); + + kvpfree(buckets, sizeof(*buckets) + buckets->nbuckets); +} + +int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) +{ + struct bucket_gens *bucket_gens = NULL, *old_bucket_gens = NULL; + unsigned long *buckets_nouse = NULL; + bool resize = ca->bucket_gens != NULL; + int ret; + + if (!(bucket_gens = kvpmalloc(sizeof(struct bucket_gens) + nbuckets, + GFP_KERNEL|__GFP_ZERO))) { + ret = -BCH_ERR_ENOMEM_bucket_gens; + goto err; + } + + if ((c->opts.buckets_nouse && + !(buckets_nouse = kvpmalloc(BITS_TO_LONGS(nbuckets) * + sizeof(unsigned long), + GFP_KERNEL|__GFP_ZERO)))) { + ret = -BCH_ERR_ENOMEM_buckets_nouse; + goto err; + } + + bucket_gens->first_bucket = ca->mi.first_bucket; + bucket_gens->nbuckets = nbuckets; + + if (resize) { + down_write(&c->gc_lock); + down_write(&ca->bucket_lock); + percpu_down_write(&c->mark_lock); + } + + old_bucket_gens = rcu_dereference_protected(ca->bucket_gens, 1); + + if (resize) { + size_t n = min(bucket_gens->nbuckets, old_bucket_gens->nbuckets); + + memcpy(bucket_gens->b, + old_bucket_gens->b, + n); + if (buckets_nouse) + memcpy(buckets_nouse, + ca->buckets_nouse, + BITS_TO_LONGS(n) * sizeof(unsigned long)); + } + + rcu_assign_pointer(ca->bucket_gens, bucket_gens); + bucket_gens = old_bucket_gens; + + swap(ca->buckets_nouse, buckets_nouse); + + nbuckets = ca->mi.nbuckets; + + if (resize) { + percpu_up_write(&c->mark_lock); + up_write(&ca->bucket_lock); + up_write(&c->gc_lock); + } + + ret = 0; +err: + kvpfree(buckets_nouse, + BITS_TO_LONGS(nbuckets) * sizeof(unsigned long)); + if (bucket_gens) + call_rcu(&bucket_gens->rcu, bucket_gens_free_rcu); + + return ret; +} + +void bch2_dev_buckets_free(struct bch_dev *ca) +{ + unsigned i; + + kvpfree(ca->buckets_nouse, + BITS_TO_LONGS(ca->mi.nbuckets) * sizeof(unsigned long)); + kvpfree(rcu_dereference_protected(ca->bucket_gens, 1), + sizeof(struct bucket_gens) + ca->mi.nbuckets); + + for (i = 0; i < ARRAY_SIZE(ca->usage); i++) + free_percpu(ca->usage[i]); + kfree(ca->usage_base); +} + +int bch2_dev_buckets_alloc(struct bch_fs *c, struct bch_dev *ca) +{ + unsigned i; + + ca->usage_base = kzalloc(sizeof(struct bch_dev_usage), GFP_KERNEL); + if (!ca->usage_base) + return -BCH_ERR_ENOMEM_usage_init; + + for (i = 0; i < ARRAY_SIZE(ca->usage); i++) { + ca->usage[i] = alloc_percpu(struct bch_dev_usage); + if (!ca->usage[i]) + return -BCH_ERR_ENOMEM_usage_init; + } + + return bch2_dev_buckets_resize(c, ca, ca->mi.nbuckets); +} diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h new file mode 100644 index 000000000000..21f6cb356921 --- /dev/null +++ b/fs/bcachefs/buckets.h @@ -0,0 +1,458 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Code for manipulating bucket marks for garbage collection. + * + * Copyright 2014 Datera, Inc. + */ + +#ifndef _BUCKETS_H +#define _BUCKETS_H + +#include "buckets_types.h" +#include "extents.h" +#include "sb-members.h" + +static inline size_t sector_to_bucket(const struct bch_dev *ca, sector_t s) +{ + return div_u64(s, ca->mi.bucket_size); +} + +static inline sector_t bucket_to_sector(const struct bch_dev *ca, size_t b) +{ + return ((sector_t) b) * ca->mi.bucket_size; +} + +static inline sector_t bucket_remainder(const struct bch_dev *ca, sector_t s) +{ + u32 remainder; + + div_u64_rem(s, ca->mi.bucket_size, &remainder); + return remainder; +} + +static inline size_t sector_to_bucket_and_offset(const struct bch_dev *ca, sector_t s, + u32 *offset) +{ + return div_u64_rem(s, ca->mi.bucket_size, offset); +} + +#define for_each_bucket(_b, _buckets) \ + for (_b = (_buckets)->b + (_buckets)->first_bucket; \ + _b < (_buckets)->b + (_buckets)->nbuckets; _b++) + +/* + * Ugly hack alert: + * + * We need to cram a spinlock in a single byte, because that's what we have left + * in struct bucket, and we care about the size of these - during fsck, we need + * in memory state for every single bucket on every device. + * + * We used to do + * while (xchg(&b->lock, 1) cpu_relax(); + * but, it turns out not all architectures support xchg on a single byte. + * + * So now we use bit_spin_lock(), with fun games since we can't burn a whole + * ulong for this - we just need to make sure the lock bit always ends up in the + * first byte. + */ + +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ +#define BUCKET_LOCK_BITNR 0 +#else +#define BUCKET_LOCK_BITNR (BITS_PER_LONG - 1) +#endif + +union ulong_byte_assert { + ulong ulong; + u8 byte; +}; + +static inline void bucket_unlock(struct bucket *b) +{ + BUILD_BUG_ON(!((union ulong_byte_assert) { .ulong = 1UL << BUCKET_LOCK_BITNR }).byte); + + clear_bit_unlock(BUCKET_LOCK_BITNR, (void *) &b->lock); + wake_up_bit((void *) &b->lock, BUCKET_LOCK_BITNR); +} + +static inline void bucket_lock(struct bucket *b) +{ + wait_on_bit_lock((void *) &b->lock, BUCKET_LOCK_BITNR, + TASK_UNINTERRUPTIBLE); +} + +static inline struct bucket_array *gc_bucket_array(struct bch_dev *ca) +{ + return rcu_dereference_check(ca->buckets_gc, + !ca->fs || + percpu_rwsem_is_held(&ca->fs->mark_lock) || + lockdep_is_held(&ca->fs->gc_lock) || + lockdep_is_held(&ca->bucket_lock)); +} + +static inline struct bucket *gc_bucket(struct bch_dev *ca, size_t b) +{ + struct bucket_array *buckets = gc_bucket_array(ca); + + BUG_ON(b < buckets->first_bucket || b >= buckets->nbuckets); + return buckets->b + b; +} + +static inline struct bucket_gens *bucket_gens(struct bch_dev *ca) +{ + return rcu_dereference_check(ca->bucket_gens, + !ca->fs || + percpu_rwsem_is_held(&ca->fs->mark_lock) || + lockdep_is_held(&ca->fs->gc_lock) || + lockdep_is_held(&ca->bucket_lock)); +} + +static inline u8 *bucket_gen(struct bch_dev *ca, size_t b) +{ + struct bucket_gens *gens = bucket_gens(ca); + + BUG_ON(b < gens->first_bucket || b >= gens->nbuckets); + return gens->b + b; +} + +static inline size_t PTR_BUCKET_NR(const struct bch_dev *ca, + const struct bch_extent_ptr *ptr) +{ + return sector_to_bucket(ca, ptr->offset); +} + +static inline struct bpos PTR_BUCKET_POS(const struct bch_fs *c, + const struct bch_extent_ptr *ptr) +{ + struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); + + return POS(ptr->dev, PTR_BUCKET_NR(ca, ptr)); +} + +static inline struct bpos PTR_BUCKET_POS_OFFSET(const struct bch_fs *c, + const struct bch_extent_ptr *ptr, + u32 *bucket_offset) +{ + struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); + + return POS(ptr->dev, sector_to_bucket_and_offset(ca, ptr->offset, bucket_offset)); +} + +static inline struct bucket *PTR_GC_BUCKET(struct bch_dev *ca, + const struct bch_extent_ptr *ptr) +{ + return gc_bucket(ca, PTR_BUCKET_NR(ca, ptr)); +} + +static inline enum bch_data_type ptr_data_type(const struct bkey *k, + const struct bch_extent_ptr *ptr) +{ + if (bkey_is_btree_ptr(k)) + return BCH_DATA_btree; + + return ptr->cached ? BCH_DATA_cached : BCH_DATA_user; +} + +static inline s64 ptr_disk_sectors(s64 sectors, struct extent_ptr_decoded p) +{ + EBUG_ON(sectors < 0); + + return crc_is_compressed(p.crc) + ? DIV_ROUND_UP_ULL(sectors * p.crc.compressed_size, + p.crc.uncompressed_size) + : sectors; +} + +static inline int gen_cmp(u8 a, u8 b) +{ + return (s8) (a - b); +} + +static inline int gen_after(u8 a, u8 b) +{ + int r = gen_cmp(a, b); + + return r > 0 ? r : 0; +} + +/** + * ptr_stale() - check if a pointer points into a bucket that has been + * invalidated. + */ +static inline u8 ptr_stale(struct bch_dev *ca, + const struct bch_extent_ptr *ptr) +{ + u8 ret; + + rcu_read_lock(); + ret = gen_after(*bucket_gen(ca, PTR_BUCKET_NR(ca, ptr)), ptr->gen); + rcu_read_unlock(); + + return ret; +} + +/* Device usage: */ + +void bch2_dev_usage_read_fast(struct bch_dev *, struct bch_dev_usage *); +static inline struct bch_dev_usage bch2_dev_usage_read(struct bch_dev *ca) +{ + struct bch_dev_usage ret; + + bch2_dev_usage_read_fast(ca, &ret); + return ret; +} + +void bch2_dev_usage_init(struct bch_dev *); + +static inline u64 bch2_dev_buckets_reserved(struct bch_dev *ca, enum bch_watermark watermark) +{ + s64 reserved = 0; + + switch (watermark) { + case BCH_WATERMARK_NR: + BUG(); + case BCH_WATERMARK_stripe: + reserved += ca->mi.nbuckets >> 6; + fallthrough; + case BCH_WATERMARK_normal: + reserved += ca->mi.nbuckets >> 6; + fallthrough; + case BCH_WATERMARK_copygc: + reserved += ca->nr_btree_reserve; + fallthrough; + case BCH_WATERMARK_btree: + reserved += ca->nr_btree_reserve; + fallthrough; + case BCH_WATERMARK_btree_copygc: + case BCH_WATERMARK_reclaim: + break; + } + + return reserved; +} + +static inline u64 dev_buckets_free(struct bch_dev *ca, + struct bch_dev_usage usage, + enum bch_watermark watermark) +{ + return max_t(s64, 0, + usage.d[BCH_DATA_free].buckets - + ca->nr_open_buckets - + bch2_dev_buckets_reserved(ca, watermark)); +} + +static inline u64 __dev_buckets_available(struct bch_dev *ca, + struct bch_dev_usage usage, + enum bch_watermark watermark) +{ + return max_t(s64, 0, + usage.d[BCH_DATA_free].buckets + + usage.d[BCH_DATA_cached].buckets + + usage.d[BCH_DATA_need_gc_gens].buckets + + usage.d[BCH_DATA_need_discard].buckets + - ca->nr_open_buckets + - bch2_dev_buckets_reserved(ca, watermark)); +} + +static inline u64 dev_buckets_available(struct bch_dev *ca, + enum bch_watermark watermark) +{ + return __dev_buckets_available(ca, bch2_dev_usage_read(ca), watermark); +} + +/* Filesystem usage: */ + +static inline unsigned __fs_usage_u64s(unsigned nr_replicas) +{ + return sizeof(struct bch_fs_usage) / sizeof(u64) + nr_replicas; +} + +static inline unsigned fs_usage_u64s(struct bch_fs *c) +{ + return __fs_usage_u64s(READ_ONCE(c->replicas.nr)); +} + +static inline unsigned __fs_usage_online_u64s(unsigned nr_replicas) +{ + return sizeof(struct bch_fs_usage_online) / sizeof(u64) + nr_replicas; +} + +static inline unsigned fs_usage_online_u64s(struct bch_fs *c) +{ + return __fs_usage_online_u64s(READ_ONCE(c->replicas.nr)); +} + +static inline unsigned dev_usage_u64s(void) +{ + return sizeof(struct bch_dev_usage) / sizeof(u64); +} + +u64 bch2_fs_usage_read_one(struct bch_fs *, u64 *); + +struct bch_fs_usage_online *bch2_fs_usage_read(struct bch_fs *); + +void bch2_fs_usage_acc_to_base(struct bch_fs *, unsigned); + +void bch2_fs_usage_to_text(struct printbuf *, + struct bch_fs *, struct bch_fs_usage_online *); + +u64 bch2_fs_sectors_used(struct bch_fs *, struct bch_fs_usage_online *); + +struct bch_fs_usage_short +bch2_fs_usage_read_short(struct bch_fs *); + +/* key/bucket marking: */ + +static inline struct bch_fs_usage *fs_usage_ptr(struct bch_fs *c, + unsigned journal_seq, + bool gc) +{ + percpu_rwsem_assert_held(&c->mark_lock); + BUG_ON(!gc && !journal_seq); + + return this_cpu_ptr(gc + ? c->usage_gc + : c->usage[journal_seq & JOURNAL_BUF_MASK]); +} + +int bch2_replicas_deltas_realloc(struct btree_trans *, unsigned); + +void bch2_fs_usage_initialize(struct bch_fs *); + +int bch2_mark_metadata_bucket(struct bch_fs *, struct bch_dev *, + size_t, enum bch_data_type, unsigned, + struct gc_pos, unsigned); + +int bch2_mark_alloc(struct btree_trans *, enum btree_id, unsigned, + struct bkey_s_c, struct bkey_s_c, unsigned); +int bch2_mark_extent(struct btree_trans *, enum btree_id, unsigned, + struct bkey_s_c, struct bkey_s_c, unsigned); +int bch2_mark_stripe(struct btree_trans *, enum btree_id, unsigned, + struct bkey_s_c, struct bkey_s_c, unsigned); +int bch2_mark_reservation(struct btree_trans *, enum btree_id, unsigned, + struct bkey_s_c, struct bkey_s_c, unsigned); +int bch2_mark_reflink_p(struct btree_trans *, enum btree_id, unsigned, + struct bkey_s_c, struct bkey_s_c, unsigned); + +int bch2_trans_mark_extent(struct btree_trans *, enum btree_id, unsigned, struct bkey_s_c, struct bkey_i *, unsigned); +int bch2_trans_mark_stripe(struct btree_trans *, enum btree_id, unsigned, struct bkey_s_c, struct bkey_i *, unsigned); +int bch2_trans_mark_reservation(struct btree_trans *, enum btree_id, unsigned, struct bkey_s_c, struct bkey_i *, unsigned); +int bch2_trans_mark_reflink_p(struct btree_trans *, enum btree_id, unsigned, struct bkey_s_c, struct bkey_i *, unsigned); + +#define mem_trigger_run_overwrite_then_insert(_fn, _trans, _btree_id, _level, _old, _new, _flags)\ +({ \ + int ret = 0; \ + \ + if (_old.k->type) \ + ret = _fn(_trans, _btree_id, _level, _old, _flags & ~BTREE_TRIGGER_INSERT); \ + if (!ret && _new.k->type) \ + ret = _fn(_trans, _btree_id, _level, _new, _flags & ~BTREE_TRIGGER_OVERWRITE); \ + ret; \ +}) + +#define trigger_run_overwrite_then_insert(_fn, _trans, _btree_id, _level, _old, _new, _flags) \ + mem_trigger_run_overwrite_then_insert(_fn, _trans, _btree_id, _level, _old, bkey_i_to_s_c(_new), _flags) + +void bch2_trans_fs_usage_revert(struct btree_trans *, struct replicas_delta_list *); +int bch2_trans_fs_usage_apply(struct btree_trans *, struct replicas_delta_list *); + +int bch2_trans_mark_metadata_bucket(struct btree_trans *, struct bch_dev *, + size_t, enum bch_data_type, unsigned); +int bch2_trans_mark_dev_sb(struct bch_fs *, struct bch_dev *); +int bch2_trans_mark_dev_sbs(struct bch_fs *); + +static inline bool is_superblock_bucket(struct bch_dev *ca, u64 b) +{ + struct bch_sb_layout *layout = &ca->disk_sb.sb->layout; + u64 b_offset = bucket_to_sector(ca, b); + u64 b_end = bucket_to_sector(ca, b + 1); + unsigned i; + + if (!b) + return true; + + for (i = 0; i < layout->nr_superblocks; i++) { + u64 offset = le64_to_cpu(layout->sb_offset[i]); + u64 end = offset + (1 << layout->sb_max_size_bits); + + if (!(offset >= b_end || end <= b_offset)) + return true; + } + + return false; +} + +/* disk reservations: */ + +static inline void bch2_disk_reservation_put(struct bch_fs *c, + struct disk_reservation *res) +{ + if (res->sectors) { + this_cpu_sub(*c->online_reserved, res->sectors); + res->sectors = 0; + } +} + +#define BCH_DISK_RESERVATION_NOFAIL (1 << 0) + +int __bch2_disk_reservation_add(struct bch_fs *, + struct disk_reservation *, + u64, int); + +static inline int bch2_disk_reservation_add(struct bch_fs *c, struct disk_reservation *res, + u64 sectors, int flags) +{ +#ifdef __KERNEL__ + u64 old, new; + + do { + old = this_cpu_read(c->pcpu->sectors_available); + if (sectors > old) + return __bch2_disk_reservation_add(c, res, sectors, flags); + + new = old - sectors; + } while (this_cpu_cmpxchg(c->pcpu->sectors_available, old, new) != old); + + this_cpu_add(*c->online_reserved, sectors); + res->sectors += sectors; + return 0; +#else + return __bch2_disk_reservation_add(c, res, sectors, flags); +#endif +} + +static inline struct disk_reservation +bch2_disk_reservation_init(struct bch_fs *c, unsigned nr_replicas) +{ + return (struct disk_reservation) { + .sectors = 0, +#if 0 + /* not used yet: */ + .gen = c->capacity_gen, +#endif + .nr_replicas = nr_replicas, + }; +} + +static inline int bch2_disk_reservation_get(struct bch_fs *c, + struct disk_reservation *res, + u64 sectors, unsigned nr_replicas, + int flags) +{ + *res = bch2_disk_reservation_init(c, nr_replicas); + + return bch2_disk_reservation_add(c, res, sectors * nr_replicas, flags); +} + +#define RESERVE_FACTOR 6 + +static inline u64 avail_factor(u64 r) +{ + return div_u64(r << RESERVE_FACTOR, (1 << RESERVE_FACTOR) + 1); +} + +int bch2_dev_buckets_resize(struct bch_fs *, struct bch_dev *, u64); +void bch2_dev_buckets_free(struct bch_dev *); +int bch2_dev_buckets_alloc(struct bch_fs *, struct bch_dev *); + +#endif /* _BUCKETS_H */ diff --git a/fs/bcachefs/buckets_types.h b/fs/bcachefs/buckets_types.h new file mode 100644 index 000000000000..2a9dab9006ef --- /dev/null +++ b/fs/bcachefs/buckets_types.h @@ -0,0 +1,92 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BUCKETS_TYPES_H +#define _BUCKETS_TYPES_H + +#include "bcachefs_format.h" +#include "util.h" + +#define BUCKET_JOURNAL_SEQ_BITS 16 + +struct bucket { + u8 lock; + u8 gen_valid:1; + u8 data_type:7; + u8 gen; + u8 stripe_redundancy; + u32 stripe; + u32 dirty_sectors; + u32 cached_sectors; +}; + +struct bucket_array { + struct rcu_head rcu; + u16 first_bucket; + size_t nbuckets; + struct bucket b[]; +}; + +struct bucket_gens { + struct rcu_head rcu; + u16 first_bucket; + size_t nbuckets; + u8 b[]; +}; + +struct bch_dev_usage { + u64 buckets_ec; + + struct { + u64 buckets; + u64 sectors; /* _compressed_ sectors: */ + /* + * XXX + * Why do we have this? Isn't it just buckets * bucket_size - + * sectors? + */ + u64 fragmented; + } d[BCH_DATA_NR]; +}; + +struct bch_fs_usage { + /* all fields are in units of 512 byte sectors: */ + u64 hidden; + u64 btree; + u64 data; + u64 cached; + u64 reserved; + u64 nr_inodes; + + /* XXX: add stats for compression ratio */ +#if 0 + u64 uncompressed; + u64 compressed; +#endif + + /* broken out: */ + + u64 persistent_reserved[BCH_REPLICAS_MAX]; + u64 replicas[]; +}; + +struct bch_fs_usage_online { + u64 online_reserved; + struct bch_fs_usage u; +}; + +struct bch_fs_usage_short { + u64 capacity; + u64 used; + u64 free; + u64 nr_inodes; +}; + +/* + * A reservation for space on disk: + */ +struct disk_reservation { + u64 sectors; + u32 gen; + unsigned nr_replicas; +}; + +#endif /* _BUCKETS_TYPES_H */ diff --git a/fs/bcachefs/buckets_waiting_for_journal.c b/fs/bcachefs/buckets_waiting_for_journal.c new file mode 100644 index 000000000000..ec1b636ef78d --- /dev/null +++ b/fs/bcachefs/buckets_waiting_for_journal.c @@ -0,0 +1,166 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" +#include "buckets_waiting_for_journal.h" +#include <linux/hash.h> +#include <linux/random.h> + +static inline struct bucket_hashed * +bucket_hash(struct buckets_waiting_for_journal_table *t, + unsigned hash_seed_idx, u64 dev_bucket) +{ + return t->d + hash_64(dev_bucket ^ t->hash_seeds[hash_seed_idx], t->bits); +} + +static void bucket_table_init(struct buckets_waiting_for_journal_table *t, size_t bits) +{ + unsigned i; + + t->bits = bits; + for (i = 0; i < ARRAY_SIZE(t->hash_seeds); i++) + get_random_bytes(&t->hash_seeds[i], sizeof(t->hash_seeds[i])); + memset(t->d, 0, sizeof(t->d[0]) << t->bits); +} + +bool bch2_bucket_needs_journal_commit(struct buckets_waiting_for_journal *b, + u64 flushed_seq, + unsigned dev, u64 bucket) +{ + struct buckets_waiting_for_journal_table *t; + u64 dev_bucket = (u64) dev << 56 | bucket; + bool ret = false; + unsigned i; + + mutex_lock(&b->lock); + t = b->t; + + for (i = 0; i < ARRAY_SIZE(t->hash_seeds); i++) { + struct bucket_hashed *h = bucket_hash(t, i, dev_bucket); + + if (h->dev_bucket == dev_bucket) { + ret = h->journal_seq > flushed_seq; + break; + } + } + + mutex_unlock(&b->lock); + + return ret; +} + +static bool bucket_table_insert(struct buckets_waiting_for_journal_table *t, + struct bucket_hashed *new, + u64 flushed_seq) +{ + struct bucket_hashed *last_evicted = NULL; + unsigned tries, i; + + for (tries = 0; tries < 10; tries++) { + struct bucket_hashed *old, *victim = NULL; + + for (i = 0; i < ARRAY_SIZE(t->hash_seeds); i++) { + old = bucket_hash(t, i, new->dev_bucket); + + if (old->dev_bucket == new->dev_bucket || + old->journal_seq <= flushed_seq) { + *old = *new; + return true; + } + + if (last_evicted != old) + victim = old; + } + + /* hashed to same slot 3 times: */ + if (!victim) + break; + + /* Failed to find an empty slot: */ + swap(*new, *victim); + last_evicted = victim; + } + + return false; +} + +int bch2_set_bucket_needs_journal_commit(struct buckets_waiting_for_journal *b, + u64 flushed_seq, + unsigned dev, u64 bucket, + u64 journal_seq) +{ + struct buckets_waiting_for_journal_table *t, *n; + struct bucket_hashed tmp, new = { + .dev_bucket = (u64) dev << 56 | bucket, + .journal_seq = journal_seq, + }; + size_t i, size, new_bits, nr_elements = 1, nr_rehashes = 0; + int ret = 0; + + mutex_lock(&b->lock); + + if (likely(bucket_table_insert(b->t, &new, flushed_seq))) + goto out; + + t = b->t; + size = 1UL << t->bits; + for (i = 0; i < size; i++) + nr_elements += t->d[i].journal_seq > flushed_seq; + + new_bits = t->bits + (nr_elements * 3 > size); + + n = kvmalloc(sizeof(*n) + (sizeof(n->d[0]) << new_bits), GFP_KERNEL); + if (!n) { + ret = -BCH_ERR_ENOMEM_buckets_waiting_for_journal_set; + goto out; + } + +retry_rehash: + nr_rehashes++; + bucket_table_init(n, new_bits); + + tmp = new; + BUG_ON(!bucket_table_insert(n, &tmp, flushed_seq)); + + for (i = 0; i < 1UL << t->bits; i++) { + if (t->d[i].journal_seq <= flushed_seq) + continue; + + tmp = t->d[i]; + if (!bucket_table_insert(n, &tmp, flushed_seq)) + goto retry_rehash; + } + + b->t = n; + kvfree(t); + + pr_debug("took %zu rehashes, table at %zu/%lu elements", + nr_rehashes, nr_elements, 1UL << b->t->bits); +out: + mutex_unlock(&b->lock); + + return ret; +} + +void bch2_fs_buckets_waiting_for_journal_exit(struct bch_fs *c) +{ + struct buckets_waiting_for_journal *b = &c->buckets_waiting_for_journal; + + kvfree(b->t); +} + +#define INITIAL_TABLE_BITS 3 + +int bch2_fs_buckets_waiting_for_journal_init(struct bch_fs *c) +{ + struct buckets_waiting_for_journal *b = &c->buckets_waiting_for_journal; + + mutex_init(&b->lock); + + b->t = kvmalloc(sizeof(*b->t) + + (sizeof(b->t->d[0]) << INITIAL_TABLE_BITS), GFP_KERNEL); + if (!b->t) + return -BCH_ERR_ENOMEM_buckets_waiting_for_journal_init; + + bucket_table_init(b->t, INITIAL_TABLE_BITS); + return 0; +} diff --git a/fs/bcachefs/buckets_waiting_for_journal.h b/fs/bcachefs/buckets_waiting_for_journal.h new file mode 100644 index 000000000000..d2ae19cbe18c --- /dev/null +++ b/fs/bcachefs/buckets_waiting_for_journal.h @@ -0,0 +1,15 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BUCKETS_WAITING_FOR_JOURNAL_H +#define _BUCKETS_WAITING_FOR_JOURNAL_H + +#include "buckets_waiting_for_journal_types.h" + +bool bch2_bucket_needs_journal_commit(struct buckets_waiting_for_journal *, + u64, unsigned, u64); +int bch2_set_bucket_needs_journal_commit(struct buckets_waiting_for_journal *, + u64, unsigned, u64, u64); + +void bch2_fs_buckets_waiting_for_journal_exit(struct bch_fs *); +int bch2_fs_buckets_waiting_for_journal_init(struct bch_fs *); + +#endif /* _BUCKETS_WAITING_FOR_JOURNAL_H */ diff --git a/fs/bcachefs/buckets_waiting_for_journal_types.h b/fs/bcachefs/buckets_waiting_for_journal_types.h new file mode 100644 index 000000000000..e593db061d81 --- /dev/null +++ b/fs/bcachefs/buckets_waiting_for_journal_types.h @@ -0,0 +1,23 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BUCKETS_WAITING_FOR_JOURNAL_TYPES_H +#define _BUCKETS_WAITING_FOR_JOURNAL_TYPES_H + +#include <linux/siphash.h> + +struct bucket_hashed { + u64 dev_bucket; + u64 journal_seq; +}; + +struct buckets_waiting_for_journal_table { + unsigned bits; + u64 hash_seeds[3]; + struct bucket_hashed d[]; +}; + +struct buckets_waiting_for_journal { + struct mutex lock; + struct buckets_waiting_for_journal_table *t; +}; + +#endif /* _BUCKETS_WAITING_FOR_JOURNAL_TYPES_H */ diff --git a/fs/bcachefs/chardev.c b/fs/bcachefs/chardev.c new file mode 100644 index 000000000000..4bb88aefed12 --- /dev/null +++ b/fs/bcachefs/chardev.c @@ -0,0 +1,784 @@ +// SPDX-License-Identifier: GPL-2.0 +#ifndef NO_BCACHEFS_CHARDEV + +#include "bcachefs.h" +#include "bcachefs_ioctl.h" +#include "buckets.h" +#include "chardev.h" +#include "journal.h" +#include "move.h" +#include "replicas.h" +#include "super.h" +#include "super-io.h" + +#include <linux/anon_inodes.h> +#include <linux/cdev.h> +#include <linux/device.h> +#include <linux/file.h> +#include <linux/fs.h> +#include <linux/ioctl.h> +#include <linux/kthread.h> +#include <linux/major.h> +#include <linux/sched/task.h> +#include <linux/slab.h> +#include <linux/uaccess.h> + +/* returns with ref on ca->ref */ +static struct bch_dev *bch2_device_lookup(struct bch_fs *c, u64 dev, + unsigned flags) +{ + struct bch_dev *ca; + + if (flags & BCH_BY_INDEX) { + if (dev >= c->sb.nr_devices) + return ERR_PTR(-EINVAL); + + rcu_read_lock(); + ca = rcu_dereference(c->devs[dev]); + if (ca) + percpu_ref_get(&ca->ref); + rcu_read_unlock(); + + if (!ca) + return ERR_PTR(-EINVAL); + } else { + char *path; + + path = strndup_user((const char __user *) + (unsigned long) dev, PATH_MAX); + if (IS_ERR(path)) + return ERR_CAST(path); + + ca = bch2_dev_lookup(c, path); + kfree(path); + } + + return ca; +} + +#if 0 +static long bch2_ioctl_assemble(struct bch_ioctl_assemble __user *user_arg) +{ + struct bch_ioctl_assemble arg; + struct bch_fs *c; + u64 *user_devs = NULL; + char **devs = NULL; + unsigned i; + int ret = -EFAULT; + + if (copy_from_user(&arg, user_arg, sizeof(arg))) + return -EFAULT; + + if (arg.flags || arg.pad) + return -EINVAL; + + user_devs = kmalloc_array(arg.nr_devs, sizeof(u64), GFP_KERNEL); + if (!user_devs) + return -ENOMEM; + + devs = kcalloc(arg.nr_devs, sizeof(char *), GFP_KERNEL); + + if (copy_from_user(user_devs, user_arg->devs, + sizeof(u64) * arg.nr_devs)) + goto err; + + for (i = 0; i < arg.nr_devs; i++) { + devs[i] = strndup_user((const char __user *)(unsigned long) + user_devs[i], + PATH_MAX); + ret= PTR_ERR_OR_ZERO(devs[i]); + if (ret) + goto err; + } + + c = bch2_fs_open(devs, arg.nr_devs, bch2_opts_empty()); + ret = PTR_ERR_OR_ZERO(c); + if (!ret) + closure_put(&c->cl); +err: + if (devs) + for (i = 0; i < arg.nr_devs; i++) + kfree(devs[i]); + kfree(devs); + return ret; +} + +static long bch2_ioctl_incremental(struct bch_ioctl_incremental __user *user_arg) +{ + struct bch_ioctl_incremental arg; + const char *err; + char *path; + + if (copy_from_user(&arg, user_arg, sizeof(arg))) + return -EFAULT; + + if (arg.flags || arg.pad) + return -EINVAL; + + path = strndup_user((const char __user *)(unsigned long) arg.dev, PATH_MAX); + ret = PTR_ERR_OR_ZERO(path); + if (ret) + return ret; + + err = bch2_fs_open_incremental(path); + kfree(path); + + if (err) { + pr_err("Could not register bcachefs devices: %s", err); + return -EINVAL; + } + + return 0; +} +#endif + +static long bch2_global_ioctl(unsigned cmd, void __user *arg) +{ + switch (cmd) { +#if 0 + case BCH_IOCTL_ASSEMBLE: + return bch2_ioctl_assemble(arg); + case BCH_IOCTL_INCREMENTAL: + return bch2_ioctl_incremental(arg); +#endif + default: + return -ENOTTY; + } +} + +static long bch2_ioctl_query_uuid(struct bch_fs *c, + struct bch_ioctl_query_uuid __user *user_arg) +{ + if (copy_to_user(&user_arg->uuid, &c->sb.user_uuid, + sizeof(c->sb.user_uuid))) + return -EFAULT; + return 0; +} + +#if 0 +static long bch2_ioctl_start(struct bch_fs *c, struct bch_ioctl_start arg) +{ + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (arg.flags || arg.pad) + return -EINVAL; + + return bch2_fs_start(c); +} + +static long bch2_ioctl_stop(struct bch_fs *c) +{ + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + bch2_fs_stop(c); + return 0; +} +#endif + +static long bch2_ioctl_disk_add(struct bch_fs *c, struct bch_ioctl_disk arg) +{ + char *path; + int ret; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (arg.flags || arg.pad) + return -EINVAL; + + path = strndup_user((const char __user *)(unsigned long) arg.dev, PATH_MAX); + ret = PTR_ERR_OR_ZERO(path); + if (ret) + return ret; + + ret = bch2_dev_add(c, path); + kfree(path); + + return ret; +} + +static long bch2_ioctl_disk_remove(struct bch_fs *c, struct bch_ioctl_disk arg) +{ + struct bch_dev *ca; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if ((arg.flags & ~(BCH_FORCE_IF_DATA_LOST| + BCH_FORCE_IF_METADATA_LOST| + BCH_FORCE_IF_DEGRADED| + BCH_BY_INDEX)) || + arg.pad) + return -EINVAL; + + ca = bch2_device_lookup(c, arg.dev, arg.flags); + if (IS_ERR(ca)) + return PTR_ERR(ca); + + return bch2_dev_remove(c, ca, arg.flags); +} + +static long bch2_ioctl_disk_online(struct bch_fs *c, struct bch_ioctl_disk arg) +{ + char *path; + int ret; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (arg.flags || arg.pad) + return -EINVAL; + + path = strndup_user((const char __user *)(unsigned long) arg.dev, PATH_MAX); + ret = PTR_ERR_OR_ZERO(path); + if (ret) + return ret; + + ret = bch2_dev_online(c, path); + kfree(path); + return ret; +} + +static long bch2_ioctl_disk_offline(struct bch_fs *c, struct bch_ioctl_disk arg) +{ + struct bch_dev *ca; + int ret; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if ((arg.flags & ~(BCH_FORCE_IF_DATA_LOST| + BCH_FORCE_IF_METADATA_LOST| + BCH_FORCE_IF_DEGRADED| + BCH_BY_INDEX)) || + arg.pad) + return -EINVAL; + + ca = bch2_device_lookup(c, arg.dev, arg.flags); + if (IS_ERR(ca)) + return PTR_ERR(ca); + + ret = bch2_dev_offline(c, ca, arg.flags); + percpu_ref_put(&ca->ref); + return ret; +} + +static long bch2_ioctl_disk_set_state(struct bch_fs *c, + struct bch_ioctl_disk_set_state arg) +{ + struct bch_dev *ca; + int ret; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if ((arg.flags & ~(BCH_FORCE_IF_DATA_LOST| + BCH_FORCE_IF_METADATA_LOST| + BCH_FORCE_IF_DEGRADED| + BCH_BY_INDEX)) || + arg.pad[0] || arg.pad[1] || arg.pad[2] || + arg.new_state >= BCH_MEMBER_STATE_NR) + return -EINVAL; + + ca = bch2_device_lookup(c, arg.dev, arg.flags); + if (IS_ERR(ca)) + return PTR_ERR(ca); + + ret = bch2_dev_set_state(c, ca, arg.new_state, arg.flags); + if (ret) + bch_err(c, "Error setting device state: %s", bch2_err_str(ret)); + + percpu_ref_put(&ca->ref); + return ret; +} + +struct bch_data_ctx { + struct bch_fs *c; + struct bch_ioctl_data arg; + struct bch_move_stats stats; + + int ret; + + struct task_struct *thread; +}; + +static int bch2_data_thread(void *arg) +{ + struct bch_data_ctx *ctx = arg; + + ctx->ret = bch2_data_job(ctx->c, &ctx->stats, ctx->arg); + + ctx->stats.data_type = U8_MAX; + return 0; +} + +static int bch2_data_job_release(struct inode *inode, struct file *file) +{ + struct bch_data_ctx *ctx = file->private_data; + + kthread_stop(ctx->thread); + put_task_struct(ctx->thread); + kfree(ctx); + return 0; +} + +static ssize_t bch2_data_job_read(struct file *file, char __user *buf, + size_t len, loff_t *ppos) +{ + struct bch_data_ctx *ctx = file->private_data; + struct bch_fs *c = ctx->c; + struct bch_ioctl_data_event e = { + .type = BCH_DATA_EVENT_PROGRESS, + .p.data_type = ctx->stats.data_type, + .p.btree_id = ctx->stats.pos.btree, + .p.pos = ctx->stats.pos.pos, + .p.sectors_done = atomic64_read(&ctx->stats.sectors_seen), + .p.sectors_total = bch2_fs_usage_read_short(c).used, + }; + + if (len < sizeof(e)) + return -EINVAL; + + if (copy_to_user(buf, &e, sizeof(e))) + return -EFAULT; + + return sizeof(e); +} + +static const struct file_operations bcachefs_data_ops = { + .release = bch2_data_job_release, + .read = bch2_data_job_read, + .llseek = no_llseek, +}; + +static long bch2_ioctl_data(struct bch_fs *c, + struct bch_ioctl_data arg) +{ + struct bch_data_ctx *ctx = NULL; + struct file *file = NULL; + unsigned flags = O_RDONLY|O_CLOEXEC|O_NONBLOCK; + int ret, fd = -1; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (arg.op >= BCH_DATA_OP_NR || arg.flags) + return -EINVAL; + + ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); + if (!ctx) + return -ENOMEM; + + ctx->c = c; + ctx->arg = arg; + + ctx->thread = kthread_create(bch2_data_thread, ctx, + "bch-data/%s", c->name); + if (IS_ERR(ctx->thread)) { + ret = PTR_ERR(ctx->thread); + goto err; + } + + ret = get_unused_fd_flags(flags); + if (ret < 0) + goto err; + fd = ret; + + file = anon_inode_getfile("[bcachefs]", &bcachefs_data_ops, ctx, flags); + if (IS_ERR(file)) { + ret = PTR_ERR(file); + goto err; + } + + fd_install(fd, file); + + get_task_struct(ctx->thread); + wake_up_process(ctx->thread); + + return fd; +err: + if (fd >= 0) + put_unused_fd(fd); + if (!IS_ERR_OR_NULL(ctx->thread)) + kthread_stop(ctx->thread); + kfree(ctx); + return ret; +} + +static long bch2_ioctl_fs_usage(struct bch_fs *c, + struct bch_ioctl_fs_usage __user *user_arg) +{ + struct bch_ioctl_fs_usage *arg = NULL; + struct bch_replicas_usage *dst_e, *dst_end; + struct bch_fs_usage_online *src; + u32 replica_entries_bytes; + unsigned i; + int ret = 0; + + if (!test_bit(BCH_FS_STARTED, &c->flags)) + return -EINVAL; + + if (get_user(replica_entries_bytes, &user_arg->replica_entries_bytes)) + return -EFAULT; + + arg = kzalloc(size_add(sizeof(*arg), replica_entries_bytes), GFP_KERNEL); + if (!arg) + return -ENOMEM; + + src = bch2_fs_usage_read(c); + if (!src) { + ret = -ENOMEM; + goto err; + } + + arg->capacity = c->capacity; + arg->used = bch2_fs_sectors_used(c, src); + arg->online_reserved = src->online_reserved; + + for (i = 0; i < BCH_REPLICAS_MAX; i++) + arg->persistent_reserved[i] = src->u.persistent_reserved[i]; + + dst_e = arg->replicas; + dst_end = (void *) arg->replicas + replica_entries_bytes; + + for (i = 0; i < c->replicas.nr; i++) { + struct bch_replicas_entry *src_e = + cpu_replicas_entry(&c->replicas, i); + + /* check that we have enough space for one replicas entry */ + if (dst_e + 1 > dst_end) { + ret = -ERANGE; + break; + } + + dst_e->sectors = src->u.replicas[i]; + dst_e->r = *src_e; + + /* recheck after setting nr_devs: */ + if (replicas_usage_next(dst_e) > dst_end) { + ret = -ERANGE; + break; + } + + memcpy(dst_e->r.devs, src_e->devs, src_e->nr_devs); + + dst_e = replicas_usage_next(dst_e); + } + + arg->replica_entries_bytes = (void *) dst_e - (void *) arg->replicas; + + percpu_up_read(&c->mark_lock); + kfree(src); + + if (ret) + goto err; + if (copy_to_user(user_arg, arg, + sizeof(*arg) + arg->replica_entries_bytes)) + ret = -EFAULT; +err: + kfree(arg); + return ret; +} + +static long bch2_ioctl_dev_usage(struct bch_fs *c, + struct bch_ioctl_dev_usage __user *user_arg) +{ + struct bch_ioctl_dev_usage arg; + struct bch_dev_usage src; + struct bch_dev *ca; + unsigned i; + + if (!test_bit(BCH_FS_STARTED, &c->flags)) + return -EINVAL; + + if (copy_from_user(&arg, user_arg, sizeof(arg))) + return -EFAULT; + + if ((arg.flags & ~BCH_BY_INDEX) || + arg.pad[0] || + arg.pad[1] || + arg.pad[2]) + return -EINVAL; + + ca = bch2_device_lookup(c, arg.dev, arg.flags); + if (IS_ERR(ca)) + return PTR_ERR(ca); + + src = bch2_dev_usage_read(ca); + + arg.state = ca->mi.state; + arg.bucket_size = ca->mi.bucket_size; + arg.nr_buckets = ca->mi.nbuckets - ca->mi.first_bucket; + arg.buckets_ec = src.buckets_ec; + + for (i = 0; i < BCH_DATA_NR; i++) { + arg.d[i].buckets = src.d[i].buckets; + arg.d[i].sectors = src.d[i].sectors; + arg.d[i].fragmented = src.d[i].fragmented; + } + + percpu_ref_put(&ca->ref); + + if (copy_to_user(user_arg, &arg, sizeof(arg))) + return -EFAULT; + + return 0; +} + +static long bch2_ioctl_read_super(struct bch_fs *c, + struct bch_ioctl_read_super arg) +{ + struct bch_dev *ca = NULL; + struct bch_sb *sb; + int ret = 0; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if ((arg.flags & ~(BCH_BY_INDEX|BCH_READ_DEV)) || + arg.pad) + return -EINVAL; + + mutex_lock(&c->sb_lock); + + if (arg.flags & BCH_READ_DEV) { + ca = bch2_device_lookup(c, arg.dev, arg.flags); + + if (IS_ERR(ca)) { + ret = PTR_ERR(ca); + goto err; + } + + sb = ca->disk_sb.sb; + } else { + sb = c->disk_sb.sb; + } + + if (vstruct_bytes(sb) > arg.size) { + ret = -ERANGE; + goto err; + } + + if (copy_to_user((void __user *)(unsigned long)arg.sb, sb, + vstruct_bytes(sb))) + ret = -EFAULT; +err: + if (!IS_ERR_OR_NULL(ca)) + percpu_ref_put(&ca->ref); + mutex_unlock(&c->sb_lock); + return ret; +} + +static long bch2_ioctl_disk_get_idx(struct bch_fs *c, + struct bch_ioctl_disk_get_idx arg) +{ + dev_t dev = huge_decode_dev(arg.dev); + struct bch_dev *ca; + unsigned i; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (!dev) + return -EINVAL; + + for_each_online_member(ca, c, i) + if (ca->dev == dev) { + percpu_ref_put(&ca->io_ref); + return i; + } + + return -BCH_ERR_ENOENT_dev_idx_not_found; +} + +static long bch2_ioctl_disk_resize(struct bch_fs *c, + struct bch_ioctl_disk_resize arg) +{ + struct bch_dev *ca; + int ret; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if ((arg.flags & ~BCH_BY_INDEX) || + arg.pad) + return -EINVAL; + + ca = bch2_device_lookup(c, arg.dev, arg.flags); + if (IS_ERR(ca)) + return PTR_ERR(ca); + + ret = bch2_dev_resize(c, ca, arg.nbuckets); + + percpu_ref_put(&ca->ref); + return ret; +} + +static long bch2_ioctl_disk_resize_journal(struct bch_fs *c, + struct bch_ioctl_disk_resize_journal arg) +{ + struct bch_dev *ca; + int ret; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if ((arg.flags & ~BCH_BY_INDEX) || + arg.pad) + return -EINVAL; + + if (arg.nbuckets > U32_MAX) + return -EINVAL; + + ca = bch2_device_lookup(c, arg.dev, arg.flags); + if (IS_ERR(ca)) + return PTR_ERR(ca); + + ret = bch2_set_nr_journal_buckets(c, ca, arg.nbuckets); + + percpu_ref_put(&ca->ref); + return ret; +} + +#define BCH_IOCTL(_name, _argtype) \ +do { \ + _argtype i; \ + \ + if (copy_from_user(&i, arg, sizeof(i))) \ + return -EFAULT; \ + ret = bch2_ioctl_##_name(c, i); \ + goto out; \ +} while (0) + +long bch2_fs_ioctl(struct bch_fs *c, unsigned cmd, void __user *arg) +{ + long ret; + + switch (cmd) { + case BCH_IOCTL_QUERY_UUID: + return bch2_ioctl_query_uuid(c, arg); + case BCH_IOCTL_FS_USAGE: + return bch2_ioctl_fs_usage(c, arg); + case BCH_IOCTL_DEV_USAGE: + return bch2_ioctl_dev_usage(c, arg); +#if 0 + case BCH_IOCTL_START: + BCH_IOCTL(start, struct bch_ioctl_start); + case BCH_IOCTL_STOP: + return bch2_ioctl_stop(c); +#endif + case BCH_IOCTL_READ_SUPER: + BCH_IOCTL(read_super, struct bch_ioctl_read_super); + case BCH_IOCTL_DISK_GET_IDX: + BCH_IOCTL(disk_get_idx, struct bch_ioctl_disk_get_idx); + } + + if (!test_bit(BCH_FS_STARTED, &c->flags)) + return -EINVAL; + + switch (cmd) { + case BCH_IOCTL_DISK_ADD: + BCH_IOCTL(disk_add, struct bch_ioctl_disk); + case BCH_IOCTL_DISK_REMOVE: + BCH_IOCTL(disk_remove, struct bch_ioctl_disk); + case BCH_IOCTL_DISK_ONLINE: + BCH_IOCTL(disk_online, struct bch_ioctl_disk); + case BCH_IOCTL_DISK_OFFLINE: + BCH_IOCTL(disk_offline, struct bch_ioctl_disk); + case BCH_IOCTL_DISK_SET_STATE: + BCH_IOCTL(disk_set_state, struct bch_ioctl_disk_set_state); + case BCH_IOCTL_DATA: + BCH_IOCTL(data, struct bch_ioctl_data); + case BCH_IOCTL_DISK_RESIZE: + BCH_IOCTL(disk_resize, struct bch_ioctl_disk_resize); + case BCH_IOCTL_DISK_RESIZE_JOURNAL: + BCH_IOCTL(disk_resize_journal, struct bch_ioctl_disk_resize_journal); + + default: + return -ENOTTY; + } +out: + if (ret < 0) + ret = bch2_err_class(ret); + return ret; +} + +static DEFINE_IDR(bch_chardev_minor); + +static long bch2_chardev_ioctl(struct file *filp, unsigned cmd, unsigned long v) +{ + unsigned minor = iminor(file_inode(filp)); + struct bch_fs *c = minor < U8_MAX ? idr_find(&bch_chardev_minor, minor) : NULL; + void __user *arg = (void __user *) v; + + return c + ? bch2_fs_ioctl(c, cmd, arg) + : bch2_global_ioctl(cmd, arg); +} + +static const struct file_operations bch_chardev_fops = { + .owner = THIS_MODULE, + .unlocked_ioctl = bch2_chardev_ioctl, + .open = nonseekable_open, +}; + +static int bch_chardev_major; +static struct class *bch_chardev_class; +static struct device *bch_chardev; + +void bch2_fs_chardev_exit(struct bch_fs *c) +{ + if (!IS_ERR_OR_NULL(c->chardev)) + device_unregister(c->chardev); + if (c->minor >= 0) + idr_remove(&bch_chardev_minor, c->minor); +} + +int bch2_fs_chardev_init(struct bch_fs *c) +{ + c->minor = idr_alloc(&bch_chardev_minor, c, 0, 0, GFP_KERNEL); + if (c->minor < 0) + return c->minor; + + c->chardev = device_create(bch_chardev_class, NULL, + MKDEV(bch_chardev_major, c->minor), c, + "bcachefs%u-ctl", c->minor); + if (IS_ERR(c->chardev)) + return PTR_ERR(c->chardev); + + return 0; +} + +void bch2_chardev_exit(void) +{ + if (!IS_ERR_OR_NULL(bch_chardev_class)) + device_destroy(bch_chardev_class, + MKDEV(bch_chardev_major, U8_MAX)); + if (!IS_ERR_OR_NULL(bch_chardev_class)) + class_destroy(bch_chardev_class); + if (bch_chardev_major > 0) + unregister_chrdev(bch_chardev_major, "bcachefs"); +} + +int __init bch2_chardev_init(void) +{ + bch_chardev_major = register_chrdev(0, "bcachefs-ctl", &bch_chardev_fops); + if (bch_chardev_major < 0) + return bch_chardev_major; + + bch_chardev_class = class_create("bcachefs"); + if (IS_ERR(bch_chardev_class)) + return PTR_ERR(bch_chardev_class); + + bch_chardev = device_create(bch_chardev_class, NULL, + MKDEV(bch_chardev_major, U8_MAX), + NULL, "bcachefs-ctl"); + if (IS_ERR(bch_chardev)) + return PTR_ERR(bch_chardev); + + return 0; +} + +#endif /* NO_BCACHEFS_CHARDEV */ diff --git a/fs/bcachefs/chardev.h b/fs/bcachefs/chardev.h new file mode 100644 index 000000000000..0f563ca53c36 --- /dev/null +++ b/fs/bcachefs/chardev.h @@ -0,0 +1,31 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_CHARDEV_H +#define _BCACHEFS_CHARDEV_H + +#ifndef NO_BCACHEFS_FS + +long bch2_fs_ioctl(struct bch_fs *, unsigned, void __user *); + +void bch2_fs_chardev_exit(struct bch_fs *); +int bch2_fs_chardev_init(struct bch_fs *); + +void bch2_chardev_exit(void); +int __init bch2_chardev_init(void); + +#else + +static inline long bch2_fs_ioctl(struct bch_fs *c, + unsigned cmd, void __user * arg) +{ + return -ENOTTY; +} + +static inline void bch2_fs_chardev_exit(struct bch_fs *c) {} +static inline int bch2_fs_chardev_init(struct bch_fs *c) { return 0; } + +static inline void bch2_chardev_exit(void) {} +static inline int __init bch2_chardev_init(void) { return 0; } + +#endif /* NO_BCACHEFS_FS */ + +#endif /* _BCACHEFS_CHARDEV_H */ diff --git a/fs/bcachefs/checksum.c b/fs/bcachefs/checksum.c new file mode 100644 index 000000000000..3c761ad6b1c8 --- /dev/null +++ b/fs/bcachefs/checksum.c @@ -0,0 +1,804 @@ +// SPDX-License-Identifier: GPL-2.0 +#include "bcachefs.h" +#include "checksum.h" +#include "errcode.h" +#include "super.h" +#include "super-io.h" + +#include <linux/crc32c.h> +#include <linux/crypto.h> +#include <linux/xxhash.h> +#include <linux/key.h> +#include <linux/random.h> +#include <linux/scatterlist.h> +#include <crypto/algapi.h> +#include <crypto/chacha.h> +#include <crypto/hash.h> +#include <crypto/poly1305.h> +#include <crypto/skcipher.h> +#include <keys/user-type.h> + +/* + * bch2_checksum state is an abstraction of the checksum state calculated over different pages. + * it features page merging without having the checksum algorithm lose its state. + * for native checksum aglorithms (like crc), a default seed value will do. + * for hash-like algorithms, a state needs to be stored + */ + +struct bch2_checksum_state { + union { + u64 seed; + struct xxh64_state h64state; + }; + unsigned int type; +}; + +static void bch2_checksum_init(struct bch2_checksum_state *state) +{ + switch (state->type) { + case BCH_CSUM_none: + case BCH_CSUM_crc32c: + case BCH_CSUM_crc64: + state->seed = 0; + break; + case BCH_CSUM_crc32c_nonzero: + state->seed = U32_MAX; + break; + case BCH_CSUM_crc64_nonzero: + state->seed = U64_MAX; + break; + case BCH_CSUM_xxhash: + xxh64_reset(&state->h64state, 0); + break; + default: + BUG(); + } +} + +static u64 bch2_checksum_final(const struct bch2_checksum_state *state) +{ + switch (state->type) { + case BCH_CSUM_none: + case BCH_CSUM_crc32c: + case BCH_CSUM_crc64: + return state->seed; + case BCH_CSUM_crc32c_nonzero: + return state->seed ^ U32_MAX; + case BCH_CSUM_crc64_nonzero: + return state->seed ^ U64_MAX; + case BCH_CSUM_xxhash: + return xxh64_digest(&state->h64state); + default: + BUG(); + } +} + +static void bch2_checksum_update(struct bch2_checksum_state *state, const void *data, size_t len) +{ + switch (state->type) { + case BCH_CSUM_none: + return; + case BCH_CSUM_crc32c_nonzero: + case BCH_CSUM_crc32c: + state->seed = crc32c(state->seed, data, len); + break; + case BCH_CSUM_crc64_nonzero: + case BCH_CSUM_crc64: + state->seed = crc64_be(state->seed, data, len); + break; + case BCH_CSUM_xxhash: + xxh64_update(&state->h64state, data, len); + break; + default: + BUG(); + } +} + +static inline int do_encrypt_sg(struct crypto_sync_skcipher *tfm, + struct nonce nonce, + struct scatterlist *sg, size_t len) +{ + SYNC_SKCIPHER_REQUEST_ON_STACK(req, tfm); + int ret; + + skcipher_request_set_sync_tfm(req, tfm); + skcipher_request_set_crypt(req, sg, sg, len, nonce.d); + + ret = crypto_skcipher_encrypt(req); + if (ret) + pr_err("got error %i from crypto_skcipher_encrypt()", ret); + + return ret; +} + +static inline int do_encrypt(struct crypto_sync_skcipher *tfm, + struct nonce nonce, + void *buf, size_t len) +{ + if (!is_vmalloc_addr(buf)) { + struct scatterlist sg; + + sg_init_table(&sg, 1); + sg_set_page(&sg, + is_vmalloc_addr(buf) + ? vmalloc_to_page(buf) + : virt_to_page(buf), + len, offset_in_page(buf)); + return do_encrypt_sg(tfm, nonce, &sg, len); + } else { + unsigned pages = buf_pages(buf, len); + struct scatterlist *sg; + size_t orig_len = len; + int ret, i; + + sg = kmalloc_array(pages, sizeof(*sg), GFP_KERNEL); + if (!sg) + return -BCH_ERR_ENOMEM_do_encrypt; + + sg_init_table(sg, pages); + + for (i = 0; i < pages; i++) { + unsigned offset = offset_in_page(buf); + unsigned pg_len = min_t(size_t, len, PAGE_SIZE - offset); + + sg_set_page(sg + i, vmalloc_to_page(buf), pg_len, offset); + buf += pg_len; + len -= pg_len; + } + + ret = do_encrypt_sg(tfm, nonce, sg, orig_len); + kfree(sg); + return ret; + } +} + +int bch2_chacha_encrypt_key(struct bch_key *key, struct nonce nonce, + void *buf, size_t len) +{ + struct crypto_sync_skcipher *chacha20 = + crypto_alloc_sync_skcipher("chacha20", 0, 0); + int ret; + + ret = PTR_ERR_OR_ZERO(chacha20); + if (ret) { + pr_err("error requesting chacha20 cipher: %s", bch2_err_str(ret)); + return ret; + } + + ret = crypto_skcipher_setkey(&chacha20->base, + (void *) key, sizeof(*key)); + if (ret) { + pr_err("error from crypto_skcipher_setkey(): %s", bch2_err_str(ret)); + goto err; + } + + ret = do_encrypt(chacha20, nonce, buf, len); +err: + crypto_free_sync_skcipher(chacha20); + return ret; +} + +static int gen_poly_key(struct bch_fs *c, struct shash_desc *desc, + struct nonce nonce) +{ + u8 key[POLY1305_KEY_SIZE]; + int ret; + + nonce.d[3] ^= BCH_NONCE_POLY; + + memset(key, 0, sizeof(key)); + ret = do_encrypt(c->chacha20, nonce, key, sizeof(key)); + if (ret) + return ret; + + desc->tfm = c->poly1305; + crypto_shash_init(desc); + crypto_shash_update(desc, key, sizeof(key)); + return 0; +} + +struct bch_csum bch2_checksum(struct bch_fs *c, unsigned type, + struct nonce nonce, const void *data, size_t len) +{ + switch (type) { + case BCH_CSUM_none: + case BCH_CSUM_crc32c_nonzero: + case BCH_CSUM_crc64_nonzero: + case BCH_CSUM_crc32c: + case BCH_CSUM_xxhash: + case BCH_CSUM_crc64: { + struct bch2_checksum_state state; + + state.type = type; + + bch2_checksum_init(&state); + bch2_checksum_update(&state, data, len); + + return (struct bch_csum) { .lo = cpu_to_le64(bch2_checksum_final(&state)) }; + } + + case BCH_CSUM_chacha20_poly1305_80: + case BCH_CSUM_chacha20_poly1305_128: { + SHASH_DESC_ON_STACK(desc, c->poly1305); + u8 digest[POLY1305_DIGEST_SIZE]; + struct bch_csum ret = { 0 }; + + gen_poly_key(c, desc, nonce); + + crypto_shash_update(desc, data, len); + crypto_shash_final(desc, digest); + + memcpy(&ret, digest, bch_crc_bytes[type]); + return ret; + } + default: + BUG(); + } +} + +int bch2_encrypt(struct bch_fs *c, unsigned type, + struct nonce nonce, void *data, size_t len) +{ + if (!bch2_csum_type_is_encryption(type)) + return 0; + + return do_encrypt(c->chacha20, nonce, data, len); +} + +static struct bch_csum __bch2_checksum_bio(struct bch_fs *c, unsigned type, + struct nonce nonce, struct bio *bio, + struct bvec_iter *iter) +{ + struct bio_vec bv; + + switch (type) { + case BCH_CSUM_none: + return (struct bch_csum) { 0 }; + case BCH_CSUM_crc32c_nonzero: + case BCH_CSUM_crc64_nonzero: + case BCH_CSUM_crc32c: + case BCH_CSUM_xxhash: + case BCH_CSUM_crc64: { + struct bch2_checksum_state state; + + state.type = type; + bch2_checksum_init(&state); + +#ifdef CONFIG_HIGHMEM + __bio_for_each_segment(bv, bio, *iter, *iter) { + void *p = kmap_local_page(bv.bv_page) + bv.bv_offset; + + bch2_checksum_update(&state, p, bv.bv_len); + kunmap_local(p); + } +#else + __bio_for_each_bvec(bv, bio, *iter, *iter) + bch2_checksum_update(&state, page_address(bv.bv_page) + bv.bv_offset, + bv.bv_len); +#endif + return (struct bch_csum) { .lo = cpu_to_le64(bch2_checksum_final(&state)) }; + } + + case BCH_CSUM_chacha20_poly1305_80: + case BCH_CSUM_chacha20_poly1305_128: { + SHASH_DESC_ON_STACK(desc, c->poly1305); + u8 digest[POLY1305_DIGEST_SIZE]; + struct bch_csum ret = { 0 }; + + gen_poly_key(c, desc, nonce); + +#ifdef CONFIG_HIGHMEM + __bio_for_each_segment(bv, bio, *iter, *iter) { + void *p = kmap_local_page(bv.bv_page) + bv.bv_offset; + + crypto_shash_update(desc, p, bv.bv_len); + kunmap_local(p); + } +#else + __bio_for_each_bvec(bv, bio, *iter, *iter) + crypto_shash_update(desc, + page_address(bv.bv_page) + bv.bv_offset, + bv.bv_len); +#endif + crypto_shash_final(desc, digest); + + memcpy(&ret, digest, bch_crc_bytes[type]); + return ret; + } + default: + BUG(); + } +} + +struct bch_csum bch2_checksum_bio(struct bch_fs *c, unsigned type, + struct nonce nonce, struct bio *bio) +{ + struct bvec_iter iter = bio->bi_iter; + + return __bch2_checksum_bio(c, type, nonce, bio, &iter); +} + +int __bch2_encrypt_bio(struct bch_fs *c, unsigned type, + struct nonce nonce, struct bio *bio) +{ + struct bio_vec bv; + struct bvec_iter iter; + struct scatterlist sgl[16], *sg = sgl; + size_t bytes = 0; + int ret = 0; + + if (!bch2_csum_type_is_encryption(type)) + return 0; + + sg_init_table(sgl, ARRAY_SIZE(sgl)); + + bio_for_each_segment(bv, bio, iter) { + if (sg == sgl + ARRAY_SIZE(sgl)) { + sg_mark_end(sg - 1); + + ret = do_encrypt_sg(c->chacha20, nonce, sgl, bytes); + if (ret) + return ret; + + nonce = nonce_add(nonce, bytes); + bytes = 0; + + sg_init_table(sgl, ARRAY_SIZE(sgl)); + sg = sgl; + } + + sg_set_page(sg++, bv.bv_page, bv.bv_len, bv.bv_offset); + bytes += bv.bv_len; + } + + sg_mark_end(sg - 1); + return do_encrypt_sg(c->chacha20, nonce, sgl, bytes); +} + +struct bch_csum bch2_checksum_merge(unsigned type, struct bch_csum a, + struct bch_csum b, size_t b_len) +{ + struct bch2_checksum_state state; + + state.type = type; + bch2_checksum_init(&state); + state.seed = le64_to_cpu(a.lo); + + BUG_ON(!bch2_checksum_mergeable(type)); + + while (b_len) { + unsigned page_len = min_t(unsigned, b_len, PAGE_SIZE); + + bch2_checksum_update(&state, + page_address(ZERO_PAGE(0)), page_len); + b_len -= page_len; + } + a.lo = cpu_to_le64(bch2_checksum_final(&state)); + a.lo ^= b.lo; + a.hi ^= b.hi; + return a; +} + +int bch2_rechecksum_bio(struct bch_fs *c, struct bio *bio, + struct bversion version, + struct bch_extent_crc_unpacked crc_old, + struct bch_extent_crc_unpacked *crc_a, + struct bch_extent_crc_unpacked *crc_b, + unsigned len_a, unsigned len_b, + unsigned new_csum_type) +{ + struct bvec_iter iter = bio->bi_iter; + struct nonce nonce = extent_nonce(version, crc_old); + struct bch_csum merged = { 0 }; + struct crc_split { + struct bch_extent_crc_unpacked *crc; + unsigned len; + unsigned csum_type; + struct bch_csum csum; + } splits[3] = { + { crc_a, len_a, new_csum_type, { 0 }}, + { crc_b, len_b, new_csum_type, { 0 } }, + { NULL, bio_sectors(bio) - len_a - len_b, new_csum_type, { 0 } }, + }, *i; + bool mergeable = crc_old.csum_type == new_csum_type && + bch2_checksum_mergeable(new_csum_type); + unsigned crc_nonce = crc_old.nonce; + + BUG_ON(len_a + len_b > bio_sectors(bio)); + BUG_ON(crc_old.uncompressed_size != bio_sectors(bio)); + BUG_ON(crc_is_compressed(crc_old)); + BUG_ON(bch2_csum_type_is_encryption(crc_old.csum_type) != + bch2_csum_type_is_encryption(new_csum_type)); + + for (i = splits; i < splits + ARRAY_SIZE(splits); i++) { + iter.bi_size = i->len << 9; + if (mergeable || i->crc) + i->csum = __bch2_checksum_bio(c, i->csum_type, + nonce, bio, &iter); + else + bio_advance_iter(bio, &iter, i->len << 9); + nonce = nonce_add(nonce, i->len << 9); + } + + if (mergeable) + for (i = splits; i < splits + ARRAY_SIZE(splits); i++) + merged = bch2_checksum_merge(new_csum_type, merged, + i->csum, i->len << 9); + else + merged = bch2_checksum_bio(c, crc_old.csum_type, + extent_nonce(version, crc_old), bio); + + if (bch2_crc_cmp(merged, crc_old.csum) && !c->opts.no_data_io) { + bch_err(c, "checksum error in %s() (memory corruption or bug?)\n" + "expected %0llx:%0llx got %0llx:%0llx (old type %s new type %s)", + __func__, + crc_old.csum.hi, + crc_old.csum.lo, + merged.hi, + merged.lo, + bch2_csum_types[crc_old.csum_type], + bch2_csum_types[new_csum_type]); + return -EIO; + } + + for (i = splits; i < splits + ARRAY_SIZE(splits); i++) { + if (i->crc) + *i->crc = (struct bch_extent_crc_unpacked) { + .csum_type = i->csum_type, + .compression_type = crc_old.compression_type, + .compressed_size = i->len, + .uncompressed_size = i->len, + .offset = 0, + .live_size = i->len, + .nonce = crc_nonce, + .csum = i->csum, + }; + + if (bch2_csum_type_is_encryption(new_csum_type)) + crc_nonce += i->len; + } + + return 0; +} + +/* BCH_SB_FIELD_crypt: */ + +static int bch2_sb_crypt_validate(struct bch_sb *sb, + struct bch_sb_field *f, + struct printbuf *err) +{ + struct bch_sb_field_crypt *crypt = field_to_type(f, crypt); + + if (vstruct_bytes(&crypt->field) < sizeof(*crypt)) { + prt_printf(err, "wrong size (got %zu should be %zu)", + vstruct_bytes(&crypt->field), sizeof(*crypt)); + return -BCH_ERR_invalid_sb_crypt; + } + + if (BCH_CRYPT_KDF_TYPE(crypt)) { + prt_printf(err, "bad kdf type %llu", BCH_CRYPT_KDF_TYPE(crypt)); + return -BCH_ERR_invalid_sb_crypt; + } + + return 0; +} + +static void bch2_sb_crypt_to_text(struct printbuf *out, struct bch_sb *sb, + struct bch_sb_field *f) +{ + struct bch_sb_field_crypt *crypt = field_to_type(f, crypt); + + prt_printf(out, "KFD: %llu", BCH_CRYPT_KDF_TYPE(crypt)); + prt_newline(out); + prt_printf(out, "scrypt n: %llu", BCH_KDF_SCRYPT_N(crypt)); + prt_newline(out); + prt_printf(out, "scrypt r: %llu", BCH_KDF_SCRYPT_R(crypt)); + prt_newline(out); + prt_printf(out, "scrypt p: %llu", BCH_KDF_SCRYPT_P(crypt)); + prt_newline(out); +} + +const struct bch_sb_field_ops bch_sb_field_ops_crypt = { + .validate = bch2_sb_crypt_validate, + .to_text = bch2_sb_crypt_to_text, +}; + +#ifdef __KERNEL__ +static int __bch2_request_key(char *key_description, struct bch_key *key) +{ + struct key *keyring_key; + const struct user_key_payload *ukp; + int ret; + + keyring_key = request_key(&key_type_user, key_description, NULL); + if (IS_ERR(keyring_key)) + return PTR_ERR(keyring_key); + + down_read(&keyring_key->sem); + ukp = dereference_key_locked(keyring_key); + if (ukp->datalen == sizeof(*key)) { + memcpy(key, ukp->data, ukp->datalen); + ret = 0; + } else { + ret = -EINVAL; + } + up_read(&keyring_key->sem); + key_put(keyring_key); + + return ret; +} +#else +#include <keyutils.h> + +static int __bch2_request_key(char *key_description, struct bch_key *key) +{ + key_serial_t key_id; + + key_id = request_key("user", key_description, NULL, + KEY_SPEC_SESSION_KEYRING); + if (key_id >= 0) + goto got_key; + + key_id = request_key("user", key_description, NULL, + KEY_SPEC_USER_KEYRING); + if (key_id >= 0) + goto got_key; + + key_id = request_key("user", key_description, NULL, + KEY_SPEC_USER_SESSION_KEYRING); + if (key_id >= 0) + goto got_key; + + return -errno; +got_key: + + if (keyctl_read(key_id, (void *) key, sizeof(*key)) != sizeof(*key)) + return -1; + + return 0; +} + +#include "../crypto.h" +#endif + +int bch2_request_key(struct bch_sb *sb, struct bch_key *key) +{ + struct printbuf key_description = PRINTBUF; + int ret; + + prt_printf(&key_description, "bcachefs:"); + pr_uuid(&key_description, sb->user_uuid.b); + + ret = __bch2_request_key(key_description.buf, key); + printbuf_exit(&key_description); + +#ifndef __KERNEL__ + if (ret) { + char *passphrase = read_passphrase("Enter passphrase: "); + struct bch_encrypted_key sb_key; + + bch2_passphrase_check(sb, passphrase, + key, &sb_key); + ret = 0; + } +#endif + + /* stash with memfd, pass memfd fd to mount */ + + return ret; +} + +#ifndef __KERNEL__ +int bch2_revoke_key(struct bch_sb *sb) +{ + key_serial_t key_id; + struct printbuf key_description = PRINTBUF; + + prt_printf(&key_description, "bcachefs:"); + pr_uuid(&key_description, sb->user_uuid.b); + + key_id = request_key("user", key_description.buf, NULL, KEY_SPEC_USER_KEYRING); + printbuf_exit(&key_description); + if (key_id < 0) + return errno; + + keyctl_revoke(key_id); + + return 0; +} +#endif + +int bch2_decrypt_sb_key(struct bch_fs *c, + struct bch_sb_field_crypt *crypt, + struct bch_key *key) +{ + struct bch_encrypted_key sb_key = crypt->key; + struct bch_key user_key; + int ret = 0; + + /* is key encrypted? */ + if (!bch2_key_is_encrypted(&sb_key)) + goto out; + + ret = bch2_request_key(c->disk_sb.sb, &user_key); + if (ret) { + bch_err(c, "error requesting encryption key: %s", bch2_err_str(ret)); + goto err; + } + + /* decrypt real key: */ + ret = bch2_chacha_encrypt_key(&user_key, bch2_sb_key_nonce(c), + &sb_key, sizeof(sb_key)); + if (ret) + goto err; + + if (bch2_key_is_encrypted(&sb_key)) { + bch_err(c, "incorrect encryption key"); + ret = -EINVAL; + goto err; + } +out: + *key = sb_key.key; +err: + memzero_explicit(&sb_key, sizeof(sb_key)); + memzero_explicit(&user_key, sizeof(user_key)); + return ret; +} + +static int bch2_alloc_ciphers(struct bch_fs *c) +{ + int ret; + + if (!c->chacha20) + c->chacha20 = crypto_alloc_sync_skcipher("chacha20", 0, 0); + ret = PTR_ERR_OR_ZERO(c->chacha20); + + if (ret) { + bch_err(c, "error requesting chacha20 module: %s", bch2_err_str(ret)); + return ret; + } + + if (!c->poly1305) + c->poly1305 = crypto_alloc_shash("poly1305", 0, 0); + ret = PTR_ERR_OR_ZERO(c->poly1305); + + if (ret) { + bch_err(c, "error requesting poly1305 module: %s", bch2_err_str(ret)); + return ret; + } + + return 0; +} + +int bch2_disable_encryption(struct bch_fs *c) +{ + struct bch_sb_field_crypt *crypt; + struct bch_key key; + int ret = -EINVAL; + + mutex_lock(&c->sb_lock); + + crypt = bch2_sb_field_get(c->disk_sb.sb, crypt); + if (!crypt) + goto out; + + /* is key encrypted? */ + ret = 0; + if (bch2_key_is_encrypted(&crypt->key)) + goto out; + + ret = bch2_decrypt_sb_key(c, crypt, &key); + if (ret) + goto out; + + crypt->key.magic = cpu_to_le64(BCH_KEY_MAGIC); + crypt->key.key = key; + + SET_BCH_SB_ENCRYPTION_TYPE(c->disk_sb.sb, 0); + bch2_write_super(c); +out: + mutex_unlock(&c->sb_lock); + + return ret; +} + +int bch2_enable_encryption(struct bch_fs *c, bool keyed) +{ + struct bch_encrypted_key key; + struct bch_key user_key; + struct bch_sb_field_crypt *crypt; + int ret = -EINVAL; + + mutex_lock(&c->sb_lock); + + /* Do we already have an encryption key? */ + if (bch2_sb_field_get(c->disk_sb.sb, crypt)) + goto err; + + ret = bch2_alloc_ciphers(c); + if (ret) + goto err; + + key.magic = cpu_to_le64(BCH_KEY_MAGIC); + get_random_bytes(&key.key, sizeof(key.key)); + + if (keyed) { + ret = bch2_request_key(c->disk_sb.sb, &user_key); + if (ret) { + bch_err(c, "error requesting encryption key: %s", bch2_err_str(ret)); + goto err; + } + + ret = bch2_chacha_encrypt_key(&user_key, bch2_sb_key_nonce(c), + &key, sizeof(key)); + if (ret) + goto err; + } + + ret = crypto_skcipher_setkey(&c->chacha20->base, + (void *) &key.key, sizeof(key.key)); + if (ret) + goto err; + + crypt = bch2_sb_field_resize(&c->disk_sb, crypt, + sizeof(*crypt) / sizeof(u64)); + if (!crypt) { + ret = -BCH_ERR_ENOSPC_sb_crypt; + goto err; + } + + crypt->key = key; + + /* write superblock */ + SET_BCH_SB_ENCRYPTION_TYPE(c->disk_sb.sb, 1); + bch2_write_super(c); +err: + mutex_unlock(&c->sb_lock); + memzero_explicit(&user_key, sizeof(user_key)); + memzero_explicit(&key, sizeof(key)); + return ret; +} + +void bch2_fs_encryption_exit(struct bch_fs *c) +{ + if (!IS_ERR_OR_NULL(c->poly1305)) + crypto_free_shash(c->poly1305); + if (!IS_ERR_OR_NULL(c->chacha20)) + crypto_free_sync_skcipher(c->chacha20); + if (!IS_ERR_OR_NULL(c->sha256)) + crypto_free_shash(c->sha256); +} + +int bch2_fs_encryption_init(struct bch_fs *c) +{ + struct bch_sb_field_crypt *crypt; + struct bch_key key; + int ret = 0; + + c->sha256 = crypto_alloc_shash("sha256", 0, 0); + ret = PTR_ERR_OR_ZERO(c->sha256); + if (ret) { + bch_err(c, "error requesting sha256 module: %s", bch2_err_str(ret)); + goto out; + } + + crypt = bch2_sb_field_get(c->disk_sb.sb, crypt); + if (!crypt) + goto out; + + ret = bch2_alloc_ciphers(c); + if (ret) + goto out; + + ret = bch2_decrypt_sb_key(c, crypt, &key); + if (ret) + goto out; + + ret = crypto_skcipher_setkey(&c->chacha20->base, + (void *) &key.key, sizeof(key.key)); + if (ret) + goto out; +out: + memzero_explicit(&key, sizeof(key)); + return ret; +} diff --git a/fs/bcachefs/checksum.h b/fs/bcachefs/checksum.h new file mode 100644 index 000000000000..13998388c545 --- /dev/null +++ b/fs/bcachefs/checksum.h @@ -0,0 +1,213 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_CHECKSUM_H +#define _BCACHEFS_CHECKSUM_H + +#include "bcachefs.h" +#include "extents_types.h" +#include "super-io.h" + +#include <linux/crc64.h> +#include <crypto/chacha.h> + +static inline bool bch2_checksum_mergeable(unsigned type) +{ + + switch (type) { + case BCH_CSUM_none: + case BCH_CSUM_crc32c: + case BCH_CSUM_crc64: + return true; + default: + return false; + } +} + +struct bch_csum bch2_checksum_merge(unsigned, struct bch_csum, + struct bch_csum, size_t); + +#define BCH_NONCE_EXTENT cpu_to_le32(1 << 28) +#define BCH_NONCE_BTREE cpu_to_le32(2 << 28) +#define BCH_NONCE_JOURNAL cpu_to_le32(3 << 28) +#define BCH_NONCE_PRIO cpu_to_le32(4 << 28) +#define BCH_NONCE_POLY cpu_to_le32(1 << 31) + +struct bch_csum bch2_checksum(struct bch_fs *, unsigned, struct nonce, + const void *, size_t); + +/* + * This is used for various on disk data structures - bch_sb, prio_set, bset, + * jset: The checksum is _always_ the first field of these structs + */ +#define csum_vstruct(_c, _type, _nonce, _i) \ +({ \ + const void *_start = ((const void *) (_i)) + sizeof((_i)->csum);\ + \ + bch2_checksum(_c, _type, _nonce, _start, vstruct_end(_i) - _start);\ +}) + +int bch2_chacha_encrypt_key(struct bch_key *, struct nonce, void *, size_t); +int bch2_request_key(struct bch_sb *, struct bch_key *); +#ifndef __KERNEL__ +int bch2_revoke_key(struct bch_sb *); +#endif + +int bch2_encrypt(struct bch_fs *, unsigned, struct nonce, + void *data, size_t); + +struct bch_csum bch2_checksum_bio(struct bch_fs *, unsigned, + struct nonce, struct bio *); + +int bch2_rechecksum_bio(struct bch_fs *, struct bio *, struct bversion, + struct bch_extent_crc_unpacked, + struct bch_extent_crc_unpacked *, + struct bch_extent_crc_unpacked *, + unsigned, unsigned, unsigned); + +int __bch2_encrypt_bio(struct bch_fs *, unsigned, + struct nonce, struct bio *); + +static inline int bch2_encrypt_bio(struct bch_fs *c, unsigned type, + struct nonce nonce, struct bio *bio) +{ + return bch2_csum_type_is_encryption(type) + ? __bch2_encrypt_bio(c, type, nonce, bio) + : 0; +} + +extern const struct bch_sb_field_ops bch_sb_field_ops_crypt; + +int bch2_decrypt_sb_key(struct bch_fs *, struct bch_sb_field_crypt *, + struct bch_key *); + +int bch2_disable_encryption(struct bch_fs *); +int bch2_enable_encryption(struct bch_fs *, bool); + +void bch2_fs_encryption_exit(struct bch_fs *); +int bch2_fs_encryption_init(struct bch_fs *); + +static inline enum bch_csum_type bch2_csum_opt_to_type(enum bch_csum_opts type, + bool data) +{ + switch (type) { + case BCH_CSUM_OPT_none: + return BCH_CSUM_none; + case BCH_CSUM_OPT_crc32c: + return data ? BCH_CSUM_crc32c : BCH_CSUM_crc32c_nonzero; + case BCH_CSUM_OPT_crc64: + return data ? BCH_CSUM_crc64 : BCH_CSUM_crc64_nonzero; + case BCH_CSUM_OPT_xxhash: + return BCH_CSUM_xxhash; + default: + BUG(); + } +} + +static inline enum bch_csum_type bch2_data_checksum_type(struct bch_fs *c, + struct bch_io_opts opts) +{ + if (opts.nocow) + return 0; + + if (c->sb.encryption_type) + return c->opts.wide_macs + ? BCH_CSUM_chacha20_poly1305_128 + : BCH_CSUM_chacha20_poly1305_80; + + return bch2_csum_opt_to_type(opts.data_checksum, true); +} + +static inline enum bch_csum_type bch2_meta_checksum_type(struct bch_fs *c) +{ + if (c->sb.encryption_type) + return BCH_CSUM_chacha20_poly1305_128; + + return bch2_csum_opt_to_type(c->opts.metadata_checksum, false); +} + +static inline bool bch2_checksum_type_valid(const struct bch_fs *c, + unsigned type) +{ + if (type >= BCH_CSUM_NR) + return false; + + if (bch2_csum_type_is_encryption(type) && !c->chacha20) + return false; + + return true; +} + +/* returns true if not equal */ +static inline bool bch2_crc_cmp(struct bch_csum l, struct bch_csum r) +{ + /* + * XXX: need some way of preventing the compiler from optimizing this + * into a form that isn't constant time.. + */ + return ((l.lo ^ r.lo) | (l.hi ^ r.hi)) != 0; +} + +/* for skipping ahead and encrypting/decrypting at an offset: */ +static inline struct nonce nonce_add(struct nonce nonce, unsigned offset) +{ + EBUG_ON(offset & (CHACHA_BLOCK_SIZE - 1)); + + le32_add_cpu(&nonce.d[0], offset / CHACHA_BLOCK_SIZE); + return nonce; +} + +static inline struct nonce null_nonce(void) +{ + struct nonce ret; + + memset(&ret, 0, sizeof(ret)); + return ret; +} + +static inline struct nonce extent_nonce(struct bversion version, + struct bch_extent_crc_unpacked crc) +{ + unsigned compression_type = crc_is_compressed(crc) + ? crc.compression_type + : 0; + unsigned size = compression_type ? crc.uncompressed_size : 0; + struct nonce nonce = (struct nonce) {{ + [0] = cpu_to_le32(size << 22), + [1] = cpu_to_le32(version.lo), + [2] = cpu_to_le32(version.lo >> 32), + [3] = cpu_to_le32(version.hi| + (compression_type << 24))^BCH_NONCE_EXTENT, + }}; + + return nonce_add(nonce, crc.nonce << 9); +} + +static inline bool bch2_key_is_encrypted(struct bch_encrypted_key *key) +{ + return le64_to_cpu(key->magic) != BCH_KEY_MAGIC; +} + +static inline struct nonce __bch2_sb_key_nonce(struct bch_sb *sb) +{ + __le64 magic = __bch2_sb_magic(sb); + + return (struct nonce) {{ + [0] = 0, + [1] = 0, + [2] = ((__le32 *) &magic)[0], + [3] = ((__le32 *) &magic)[1], + }}; +} + +static inline struct nonce bch2_sb_key_nonce(struct bch_fs *c) +{ + __le64 magic = bch2_sb_magic(c); + + return (struct nonce) {{ + [0] = 0, + [1] = 0, + [2] = ((__le32 *) &magic)[0], + [3] = ((__le32 *) &magic)[1], + }}; +} + +#endif /* _BCACHEFS_CHECKSUM_H */ diff --git a/fs/bcachefs/clock.c b/fs/bcachefs/clock.c new file mode 100644 index 000000000000..f41889093a2c --- /dev/null +++ b/fs/bcachefs/clock.c @@ -0,0 +1,193 @@ +// SPDX-License-Identifier: GPL-2.0 +#include "bcachefs.h" +#include "clock.h" + +#include <linux/freezer.h> +#include <linux/kthread.h> +#include <linux/preempt.h> + +static inline long io_timer_cmp(io_timer_heap *h, + struct io_timer *l, + struct io_timer *r) +{ + return l->expire - r->expire; +} + +void bch2_io_timer_add(struct io_clock *clock, struct io_timer *timer) +{ + size_t i; + + spin_lock(&clock->timer_lock); + + if (time_after_eq((unsigned long) atomic64_read(&clock->now), + timer->expire)) { + spin_unlock(&clock->timer_lock); + timer->fn(timer); + return; + } + + for (i = 0; i < clock->timers.used; i++) + if (clock->timers.data[i] == timer) + goto out; + + BUG_ON(!heap_add(&clock->timers, timer, io_timer_cmp, NULL)); +out: + spin_unlock(&clock->timer_lock); +} + +void bch2_io_timer_del(struct io_clock *clock, struct io_timer *timer) +{ + size_t i; + + spin_lock(&clock->timer_lock); + + for (i = 0; i < clock->timers.used; i++) + if (clock->timers.data[i] == timer) { + heap_del(&clock->timers, i, io_timer_cmp, NULL); + break; + } + + spin_unlock(&clock->timer_lock); +} + +struct io_clock_wait { + struct io_timer io_timer; + struct timer_list cpu_timer; + struct task_struct *task; + int expired; +}; + +static void io_clock_wait_fn(struct io_timer *timer) +{ + struct io_clock_wait *wait = container_of(timer, + struct io_clock_wait, io_timer); + + wait->expired = 1; + wake_up_process(wait->task); +} + +static void io_clock_cpu_timeout(struct timer_list *timer) +{ + struct io_clock_wait *wait = container_of(timer, + struct io_clock_wait, cpu_timer); + + wait->expired = 1; + wake_up_process(wait->task); +} + +void bch2_io_clock_schedule_timeout(struct io_clock *clock, unsigned long until) +{ + struct io_clock_wait wait; + + /* XXX: calculate sleep time rigorously */ + wait.io_timer.expire = until; + wait.io_timer.fn = io_clock_wait_fn; + wait.task = current; + wait.expired = 0; + bch2_io_timer_add(clock, &wait.io_timer); + + schedule(); + + bch2_io_timer_del(clock, &wait.io_timer); +} + +void bch2_kthread_io_clock_wait(struct io_clock *clock, + unsigned long io_until, + unsigned long cpu_timeout) +{ + bool kthread = (current->flags & PF_KTHREAD) != 0; + struct io_clock_wait wait; + + wait.io_timer.expire = io_until; + wait.io_timer.fn = io_clock_wait_fn; + wait.task = current; + wait.expired = 0; + bch2_io_timer_add(clock, &wait.io_timer); + + timer_setup_on_stack(&wait.cpu_timer, io_clock_cpu_timeout, 0); + + if (cpu_timeout != MAX_SCHEDULE_TIMEOUT) + mod_timer(&wait.cpu_timer, cpu_timeout + jiffies); + + while (1) { + set_current_state(TASK_INTERRUPTIBLE); + if (kthread && kthread_should_stop()) + break; + + if (wait.expired) + break; + + schedule(); + try_to_freeze(); + } + + __set_current_state(TASK_RUNNING); + del_timer_sync(&wait.cpu_timer); + destroy_timer_on_stack(&wait.cpu_timer); + bch2_io_timer_del(clock, &wait.io_timer); +} + +static struct io_timer *get_expired_timer(struct io_clock *clock, + unsigned long now) +{ + struct io_timer *ret = NULL; + + spin_lock(&clock->timer_lock); + + if (clock->timers.used && + time_after_eq(now, clock->timers.data[0]->expire)) + heap_pop(&clock->timers, ret, io_timer_cmp, NULL); + + spin_unlock(&clock->timer_lock); + + return ret; +} + +void __bch2_increment_clock(struct io_clock *clock, unsigned sectors) +{ + struct io_timer *timer; + unsigned long now = atomic64_add_return(sectors, &clock->now); + + while ((timer = get_expired_timer(clock, now))) + timer->fn(timer); +} + +void bch2_io_timers_to_text(struct printbuf *out, struct io_clock *clock) +{ + unsigned long now; + unsigned i; + + out->atomic++; + spin_lock(&clock->timer_lock); + now = atomic64_read(&clock->now); + + for (i = 0; i < clock->timers.used; i++) + prt_printf(out, "%ps:\t%li\n", + clock->timers.data[i]->fn, + clock->timers.data[i]->expire - now); + spin_unlock(&clock->timer_lock); + --out->atomic; +} + +void bch2_io_clock_exit(struct io_clock *clock) +{ + free_heap(&clock->timers); + free_percpu(clock->pcpu_buf); +} + +int bch2_io_clock_init(struct io_clock *clock) +{ + atomic64_set(&clock->now, 0); + spin_lock_init(&clock->timer_lock); + + clock->max_slop = IO_CLOCK_PCPU_SECTORS * num_possible_cpus(); + + clock->pcpu_buf = alloc_percpu(*clock->pcpu_buf); + if (!clock->pcpu_buf) + return -BCH_ERR_ENOMEM_io_clock_init; + + if (!init_heap(&clock->timers, NR_IO_TIMERS, GFP_KERNEL)) + return -BCH_ERR_ENOMEM_io_clock_init; + + return 0; +} diff --git a/fs/bcachefs/clock.h b/fs/bcachefs/clock.h new file mode 100644 index 000000000000..70a0f7436c84 --- /dev/null +++ b/fs/bcachefs/clock.h @@ -0,0 +1,38 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_CLOCK_H +#define _BCACHEFS_CLOCK_H + +void bch2_io_timer_add(struct io_clock *, struct io_timer *); +void bch2_io_timer_del(struct io_clock *, struct io_timer *); +void bch2_kthread_io_clock_wait(struct io_clock *, unsigned long, + unsigned long); + +void __bch2_increment_clock(struct io_clock *, unsigned); + +static inline void bch2_increment_clock(struct bch_fs *c, unsigned sectors, + int rw) +{ + struct io_clock *clock = &c->io_clock[rw]; + + if (unlikely(this_cpu_add_return(*clock->pcpu_buf, sectors) >= + IO_CLOCK_PCPU_SECTORS)) + __bch2_increment_clock(clock, this_cpu_xchg(*clock->pcpu_buf, 0)); +} + +void bch2_io_clock_schedule_timeout(struct io_clock *, unsigned long); + +#define bch2_kthread_wait_event_ioclock_timeout(condition, clock, timeout)\ +({ \ + long __ret = timeout; \ + might_sleep(); \ + if (!___wait_cond_timeout(condition)) \ + __ret = __wait_event_timeout(wq, condition, timeout); \ + __ret; \ +}) + +void bch2_io_timers_to_text(struct printbuf *, struct io_clock *); + +void bch2_io_clock_exit(struct io_clock *); +int bch2_io_clock_init(struct io_clock *); + +#endif /* _BCACHEFS_CLOCK_H */ diff --git a/fs/bcachefs/clock_types.h b/fs/bcachefs/clock_types.h new file mode 100644 index 000000000000..5fae0012d808 --- /dev/null +++ b/fs/bcachefs/clock_types.h @@ -0,0 +1,37 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_CLOCK_TYPES_H +#define _BCACHEFS_CLOCK_TYPES_H + +#include "util.h" + +#define NR_IO_TIMERS (BCH_SB_MEMBERS_MAX * 3) + +/* + * Clocks/timers in units of sectors of IO: + * + * Note - they use percpu batching, so they're only approximate. + */ + +struct io_timer; +typedef void (*io_timer_fn)(struct io_timer *); + +struct io_timer { + io_timer_fn fn; + unsigned long expire; +}; + +/* Amount to buffer up on a percpu counter */ +#define IO_CLOCK_PCPU_SECTORS 128 + +typedef HEAP(struct io_timer *) io_timer_heap; + +struct io_clock { + atomic64_t now; + u16 __percpu *pcpu_buf; + unsigned max_slop; + + spinlock_t timer_lock; + io_timer_heap timers; +}; + +#endif /* _BCACHEFS_CLOCK_TYPES_H */ diff --git a/fs/bcachefs/compress.c b/fs/bcachefs/compress.c new file mode 100644 index 000000000000..51af8ea230ed --- /dev/null +++ b/fs/bcachefs/compress.c @@ -0,0 +1,732 @@ +// SPDX-License-Identifier: GPL-2.0 +#include "bcachefs.h" +#include "checksum.h" +#include "compress.h" +#include "extents.h" +#include "super-io.h" + +#include <linux/lz4.h> +#include <linux/zlib.h> +#include <linux/zstd.h> + +/* Bounce buffer: */ +struct bbuf { + void *b; + enum { + BB_NONE, + BB_VMAP, + BB_KMALLOC, + BB_MEMPOOL, + } type; + int rw; +}; + +static struct bbuf __bounce_alloc(struct bch_fs *c, unsigned size, int rw) +{ + void *b; + + BUG_ON(size > c->opts.encoded_extent_max); + + b = kmalloc(size, GFP_NOFS|__GFP_NOWARN); + if (b) + return (struct bbuf) { .b = b, .type = BB_KMALLOC, .rw = rw }; + + b = mempool_alloc(&c->compression_bounce[rw], GFP_NOFS); + if (b) + return (struct bbuf) { .b = b, .type = BB_MEMPOOL, .rw = rw }; + + BUG(); +} + +static bool bio_phys_contig(struct bio *bio, struct bvec_iter start) +{ + struct bio_vec bv; + struct bvec_iter iter; + void *expected_start = NULL; + + __bio_for_each_bvec(bv, bio, iter, start) { + if (expected_start && + expected_start != page_address(bv.bv_page) + bv.bv_offset) + return false; + + expected_start = page_address(bv.bv_page) + + bv.bv_offset + bv.bv_len; + } + + return true; +} + +static struct bbuf __bio_map_or_bounce(struct bch_fs *c, struct bio *bio, + struct bvec_iter start, int rw) +{ + struct bbuf ret; + struct bio_vec bv; + struct bvec_iter iter; + unsigned nr_pages = 0; + struct page *stack_pages[16]; + struct page **pages = NULL; + void *data; + + BUG_ON(start.bi_size > c->opts.encoded_extent_max); + + if (!PageHighMem(bio_iter_page(bio, start)) && + bio_phys_contig(bio, start)) + return (struct bbuf) { + .b = page_address(bio_iter_page(bio, start)) + + bio_iter_offset(bio, start), + .type = BB_NONE, .rw = rw + }; + + /* check if we can map the pages contiguously: */ + __bio_for_each_segment(bv, bio, iter, start) { + if (iter.bi_size != start.bi_size && + bv.bv_offset) + goto bounce; + + if (bv.bv_len < iter.bi_size && + bv.bv_offset + bv.bv_len < PAGE_SIZE) + goto bounce; + + nr_pages++; + } + + BUG_ON(DIV_ROUND_UP(start.bi_size, PAGE_SIZE) > nr_pages); + + pages = nr_pages > ARRAY_SIZE(stack_pages) + ? kmalloc_array(nr_pages, sizeof(struct page *), GFP_NOFS) + : stack_pages; + if (!pages) + goto bounce; + + nr_pages = 0; + __bio_for_each_segment(bv, bio, iter, start) + pages[nr_pages++] = bv.bv_page; + + data = vmap(pages, nr_pages, VM_MAP, PAGE_KERNEL); + if (pages != stack_pages) + kfree(pages); + + if (data) + return (struct bbuf) { + .b = data + bio_iter_offset(bio, start), + .type = BB_VMAP, .rw = rw + }; +bounce: + ret = __bounce_alloc(c, start.bi_size, rw); + + if (rw == READ) + memcpy_from_bio(ret.b, bio, start); + + return ret; +} + +static struct bbuf bio_map_or_bounce(struct bch_fs *c, struct bio *bio, int rw) +{ + return __bio_map_or_bounce(c, bio, bio->bi_iter, rw); +} + +static void bio_unmap_or_unbounce(struct bch_fs *c, struct bbuf buf) +{ + switch (buf.type) { + case BB_NONE: + break; + case BB_VMAP: + vunmap((void *) ((unsigned long) buf.b & PAGE_MASK)); + break; + case BB_KMALLOC: + kfree(buf.b); + break; + case BB_MEMPOOL: + mempool_free(buf.b, &c->compression_bounce[buf.rw]); + break; + } +} + +static inline void zlib_set_workspace(z_stream *strm, void *workspace) +{ +#ifdef __KERNEL__ + strm->workspace = workspace; +#endif +} + +static int __bio_uncompress(struct bch_fs *c, struct bio *src, + void *dst_data, struct bch_extent_crc_unpacked crc) +{ + struct bbuf src_data = { NULL }; + size_t src_len = src->bi_iter.bi_size; + size_t dst_len = crc.uncompressed_size << 9; + void *workspace; + int ret; + + src_data = bio_map_or_bounce(c, src, READ); + + switch (crc.compression_type) { + case BCH_COMPRESSION_TYPE_lz4_old: + case BCH_COMPRESSION_TYPE_lz4: + ret = LZ4_decompress_safe_partial(src_data.b, dst_data, + src_len, dst_len, dst_len); + if (ret != dst_len) + goto err; + break; + case BCH_COMPRESSION_TYPE_gzip: { + z_stream strm = { + .next_in = src_data.b, + .avail_in = src_len, + .next_out = dst_data, + .avail_out = dst_len, + }; + + workspace = mempool_alloc(&c->decompress_workspace, GFP_NOFS); + + zlib_set_workspace(&strm, workspace); + zlib_inflateInit2(&strm, -MAX_WBITS); + ret = zlib_inflate(&strm, Z_FINISH); + + mempool_free(workspace, &c->decompress_workspace); + + if (ret != Z_STREAM_END) + goto err; + break; + } + case BCH_COMPRESSION_TYPE_zstd: { + ZSTD_DCtx *ctx; + size_t real_src_len = le32_to_cpup(src_data.b); + + if (real_src_len > src_len - 4) + goto err; + + workspace = mempool_alloc(&c->decompress_workspace, GFP_NOFS); + ctx = zstd_init_dctx(workspace, zstd_dctx_workspace_bound()); + + ret = zstd_decompress_dctx(ctx, + dst_data, dst_len, + src_data.b + 4, real_src_len); + + mempool_free(workspace, &c->decompress_workspace); + + if (ret != dst_len) + goto err; + break; + } + default: + BUG(); + } + ret = 0; +out: + bio_unmap_or_unbounce(c, src_data); + return ret; +err: + ret = -EIO; + goto out; +} + +int bch2_bio_uncompress_inplace(struct bch_fs *c, struct bio *bio, + struct bch_extent_crc_unpacked *crc) +{ + struct bbuf data = { NULL }; + size_t dst_len = crc->uncompressed_size << 9; + + /* bio must own its pages: */ + BUG_ON(!bio->bi_vcnt); + BUG_ON(DIV_ROUND_UP(crc->live_size, PAGE_SECTORS) > bio->bi_max_vecs); + + if (crc->uncompressed_size << 9 > c->opts.encoded_extent_max || + crc->compressed_size << 9 > c->opts.encoded_extent_max) { + bch_err(c, "error rewriting existing data: extent too big"); + return -EIO; + } + + data = __bounce_alloc(c, dst_len, WRITE); + + if (__bio_uncompress(c, bio, data.b, *crc)) { + if (!c->opts.no_data_io) + bch_err(c, "error rewriting existing data: decompression error"); + bio_unmap_or_unbounce(c, data); + return -EIO; + } + + /* + * XXX: don't have a good way to assert that the bio was allocated with + * enough space, we depend on bch2_move_extent doing the right thing + */ + bio->bi_iter.bi_size = crc->live_size << 9; + + memcpy_to_bio(bio, bio->bi_iter, data.b + (crc->offset << 9)); + + crc->csum_type = 0; + crc->compression_type = 0; + crc->compressed_size = crc->live_size; + crc->uncompressed_size = crc->live_size; + crc->offset = 0; + crc->csum = (struct bch_csum) { 0, 0 }; + + bio_unmap_or_unbounce(c, data); + return 0; +} + +int bch2_bio_uncompress(struct bch_fs *c, struct bio *src, + struct bio *dst, struct bvec_iter dst_iter, + struct bch_extent_crc_unpacked crc) +{ + struct bbuf dst_data = { NULL }; + size_t dst_len = crc.uncompressed_size << 9; + int ret; + + if (crc.uncompressed_size << 9 > c->opts.encoded_extent_max || + crc.compressed_size << 9 > c->opts.encoded_extent_max) + return -EIO; + + dst_data = dst_len == dst_iter.bi_size + ? __bio_map_or_bounce(c, dst, dst_iter, WRITE) + : __bounce_alloc(c, dst_len, WRITE); + + ret = __bio_uncompress(c, src, dst_data.b, crc); + if (ret) + goto err; + + if (dst_data.type != BB_NONE && + dst_data.type != BB_VMAP) + memcpy_to_bio(dst, dst_iter, dst_data.b + (crc.offset << 9)); +err: + bio_unmap_or_unbounce(c, dst_data); + return ret; +} + +static int attempt_compress(struct bch_fs *c, + void *workspace, + void *dst, size_t dst_len, + void *src, size_t src_len, + struct bch_compression_opt compression) +{ + enum bch_compression_type compression_type = + __bch2_compression_opt_to_type[compression.type]; + + switch (compression_type) { + case BCH_COMPRESSION_TYPE_lz4: + if (compression.level < LZ4HC_MIN_CLEVEL) { + int len = src_len; + int ret = LZ4_compress_destSize( + src, dst, + &len, dst_len, + workspace); + if (len < src_len) + return -len; + + return ret; + } else { + int ret = LZ4_compress_HC( + src, dst, + src_len, dst_len, + compression.level, + workspace); + + return ret ?: -1; + } + case BCH_COMPRESSION_TYPE_gzip: { + z_stream strm = { + .next_in = src, + .avail_in = src_len, + .next_out = dst, + .avail_out = dst_len, + }; + + zlib_set_workspace(&strm, workspace); + zlib_deflateInit2(&strm, + compression.level + ? clamp_t(unsigned, compression.level, + Z_BEST_SPEED, Z_BEST_COMPRESSION) + : Z_DEFAULT_COMPRESSION, + Z_DEFLATED, -MAX_WBITS, DEF_MEM_LEVEL, + Z_DEFAULT_STRATEGY); + + if (zlib_deflate(&strm, Z_FINISH) != Z_STREAM_END) + return 0; + + if (zlib_deflateEnd(&strm) != Z_OK) + return 0; + + return strm.total_out; + } + case BCH_COMPRESSION_TYPE_zstd: { + /* + * rescale: + * zstd max compression level is 22, our max level is 15 + */ + unsigned level = min((compression.level * 3) / 2, zstd_max_clevel()); + ZSTD_parameters params = zstd_get_params(level, c->opts.encoded_extent_max); + ZSTD_CCtx *ctx = zstd_init_cctx(workspace, c->zstd_workspace_size); + + /* + * ZSTD requires that when we decompress we pass in the exact + * compressed size - rounding it up to the nearest sector + * doesn't work, so we use the first 4 bytes of the buffer for + * that. + * + * Additionally, the ZSTD code seems to have a bug where it will + * write just past the end of the buffer - so subtract a fudge + * factor (7 bytes) from the dst buffer size to account for + * that. + */ + size_t len = zstd_compress_cctx(ctx, + dst + 4, dst_len - 4 - 7, + src, src_len, + ¶ms); + if (zstd_is_error(len)) + return 0; + + *((__le32 *) dst) = cpu_to_le32(len); + return len + 4; + } + default: + BUG(); + } +} + +static unsigned __bio_compress(struct bch_fs *c, + struct bio *dst, size_t *dst_len, + struct bio *src, size_t *src_len, + struct bch_compression_opt compression) +{ + struct bbuf src_data = { NULL }, dst_data = { NULL }; + void *workspace; + enum bch_compression_type compression_type = + __bch2_compression_opt_to_type[compression.type]; + unsigned pad; + int ret = 0; + + BUG_ON(compression_type >= BCH_COMPRESSION_TYPE_NR); + BUG_ON(!mempool_initialized(&c->compress_workspace[compression_type])); + + /* If it's only one block, don't bother trying to compress: */ + if (src->bi_iter.bi_size <= c->opts.block_size) + return BCH_COMPRESSION_TYPE_incompressible; + + dst_data = bio_map_or_bounce(c, dst, WRITE); + src_data = bio_map_or_bounce(c, src, READ); + + workspace = mempool_alloc(&c->compress_workspace[compression_type], GFP_NOFS); + + *src_len = src->bi_iter.bi_size; + *dst_len = dst->bi_iter.bi_size; + + /* + * XXX: this algorithm sucks when the compression code doesn't tell us + * how much would fit, like LZ4 does: + */ + while (1) { + if (*src_len <= block_bytes(c)) { + ret = -1; + break; + } + + ret = attempt_compress(c, workspace, + dst_data.b, *dst_len, + src_data.b, *src_len, + compression); + if (ret > 0) { + *dst_len = ret; + ret = 0; + break; + } + + /* Didn't fit: should we retry with a smaller amount? */ + if (*src_len <= *dst_len) { + ret = -1; + break; + } + + /* + * If ret is negative, it's a hint as to how much data would fit + */ + BUG_ON(-ret >= *src_len); + + if (ret < 0) + *src_len = -ret; + else + *src_len -= (*src_len - *dst_len) / 2; + *src_len = round_down(*src_len, block_bytes(c)); + } + + mempool_free(workspace, &c->compress_workspace[compression_type]); + + if (ret) + goto err; + + /* Didn't get smaller: */ + if (round_up(*dst_len, block_bytes(c)) >= *src_len) + goto err; + + pad = round_up(*dst_len, block_bytes(c)) - *dst_len; + + memset(dst_data.b + *dst_len, 0, pad); + *dst_len += pad; + + if (dst_data.type != BB_NONE && + dst_data.type != BB_VMAP) + memcpy_to_bio(dst, dst->bi_iter, dst_data.b); + + BUG_ON(!*dst_len || *dst_len > dst->bi_iter.bi_size); + BUG_ON(!*src_len || *src_len > src->bi_iter.bi_size); + BUG_ON(*dst_len & (block_bytes(c) - 1)); + BUG_ON(*src_len & (block_bytes(c) - 1)); + ret = compression_type; +out: + bio_unmap_or_unbounce(c, src_data); + bio_unmap_or_unbounce(c, dst_data); + return ret; +err: + ret = BCH_COMPRESSION_TYPE_incompressible; + goto out; +} + +unsigned bch2_bio_compress(struct bch_fs *c, + struct bio *dst, size_t *dst_len, + struct bio *src, size_t *src_len, + unsigned compression_opt) +{ + unsigned orig_dst = dst->bi_iter.bi_size; + unsigned orig_src = src->bi_iter.bi_size; + unsigned compression_type; + + /* Don't consume more than BCH_ENCODED_EXTENT_MAX from @src: */ + src->bi_iter.bi_size = min_t(unsigned, src->bi_iter.bi_size, + c->opts.encoded_extent_max); + /* Don't generate a bigger output than input: */ + dst->bi_iter.bi_size = min(dst->bi_iter.bi_size, src->bi_iter.bi_size); + + compression_type = + __bio_compress(c, dst, dst_len, src, src_len, + bch2_compression_decode(compression_opt)); + + dst->bi_iter.bi_size = orig_dst; + src->bi_iter.bi_size = orig_src; + return compression_type; +} + +static int __bch2_fs_compress_init(struct bch_fs *, u64); + +#define BCH_FEATURE_none 0 + +static const unsigned bch2_compression_opt_to_feature[] = { +#define x(t, n) [BCH_COMPRESSION_OPT_##t] = BCH_FEATURE_##t, + BCH_COMPRESSION_OPTS() +#undef x +}; + +#undef BCH_FEATURE_none + +static int __bch2_check_set_has_compressed_data(struct bch_fs *c, u64 f) +{ + int ret = 0; + + if ((c->sb.features & f) == f) + return 0; + + mutex_lock(&c->sb_lock); + + if ((c->sb.features & f) == f) { + mutex_unlock(&c->sb_lock); + return 0; + } + + ret = __bch2_fs_compress_init(c, c->sb.features|f); + if (ret) { + mutex_unlock(&c->sb_lock); + return ret; + } + + c->disk_sb.sb->features[0] |= cpu_to_le64(f); + bch2_write_super(c); + mutex_unlock(&c->sb_lock); + + return 0; +} + +int bch2_check_set_has_compressed_data(struct bch_fs *c, + unsigned compression_opt) +{ + unsigned compression_type = bch2_compression_decode(compression_opt).type; + + BUG_ON(compression_type >= ARRAY_SIZE(bch2_compression_opt_to_feature)); + + return compression_type + ? __bch2_check_set_has_compressed_data(c, + 1ULL << bch2_compression_opt_to_feature[compression_type]) + : 0; +} + +void bch2_fs_compress_exit(struct bch_fs *c) +{ + unsigned i; + + mempool_exit(&c->decompress_workspace); + for (i = 0; i < ARRAY_SIZE(c->compress_workspace); i++) + mempool_exit(&c->compress_workspace[i]); + mempool_exit(&c->compression_bounce[WRITE]); + mempool_exit(&c->compression_bounce[READ]); +} + +static int __bch2_fs_compress_init(struct bch_fs *c, u64 features) +{ + size_t decompress_workspace_size = 0; + ZSTD_parameters params = zstd_get_params(zstd_max_clevel(), + c->opts.encoded_extent_max); + + /* + * ZSTD is lying: if we allocate the size of the workspace it says it + * requires, it returns memory allocation errors + */ + c->zstd_workspace_size = zstd_cctx_workspace_bound(¶ms.cParams); + + struct { + unsigned feature; + enum bch_compression_type type; + size_t compress_workspace; + size_t decompress_workspace; + } compression_types[] = { + { BCH_FEATURE_lz4, BCH_COMPRESSION_TYPE_lz4, + max_t(size_t, LZ4_MEM_COMPRESS, LZ4HC_MEM_COMPRESS), + 0 }, + { BCH_FEATURE_gzip, BCH_COMPRESSION_TYPE_gzip, + zlib_deflate_workspacesize(MAX_WBITS, DEF_MEM_LEVEL), + zlib_inflate_workspacesize(), }, + { BCH_FEATURE_zstd, BCH_COMPRESSION_TYPE_zstd, + c->zstd_workspace_size, + zstd_dctx_workspace_bound() }, + }, *i; + bool have_compressed = false; + + for (i = compression_types; + i < compression_types + ARRAY_SIZE(compression_types); + i++) + have_compressed |= (features & (1 << i->feature)) != 0; + + if (!have_compressed) + return 0; + + if (!mempool_initialized(&c->compression_bounce[READ]) && + mempool_init_kvpmalloc_pool(&c->compression_bounce[READ], + 1, c->opts.encoded_extent_max)) + return -BCH_ERR_ENOMEM_compression_bounce_read_init; + + if (!mempool_initialized(&c->compression_bounce[WRITE]) && + mempool_init_kvpmalloc_pool(&c->compression_bounce[WRITE], + 1, c->opts.encoded_extent_max)) + return -BCH_ERR_ENOMEM_compression_bounce_write_init; + + for (i = compression_types; + i < compression_types + ARRAY_SIZE(compression_types); + i++) { + decompress_workspace_size = + max(decompress_workspace_size, i->decompress_workspace); + + if (!(features & (1 << i->feature))) + continue; + + if (mempool_initialized(&c->compress_workspace[i->type])) + continue; + + if (mempool_init_kvpmalloc_pool( + &c->compress_workspace[i->type], + 1, i->compress_workspace)) + return -BCH_ERR_ENOMEM_compression_workspace_init; + } + + if (!mempool_initialized(&c->decompress_workspace) && + mempool_init_kvpmalloc_pool(&c->decompress_workspace, + 1, decompress_workspace_size)) + return -BCH_ERR_ENOMEM_decompression_workspace_init; + + return 0; +} + +static u64 compression_opt_to_feature(unsigned v) +{ + unsigned type = bch2_compression_decode(v).type; + + return BIT_ULL(bch2_compression_opt_to_feature[type]); +} + +int bch2_fs_compress_init(struct bch_fs *c) +{ + u64 f = c->sb.features; + + f |= compression_opt_to_feature(c->opts.compression); + f |= compression_opt_to_feature(c->opts.background_compression); + + return __bch2_fs_compress_init(c, f); +} + +int bch2_opt_compression_parse(struct bch_fs *c, const char *_val, u64 *res, + struct printbuf *err) +{ + char *val = kstrdup(_val, GFP_KERNEL); + char *p = val, *type_str, *level_str; + struct bch_compression_opt opt = { 0 }; + int ret; + + if (!val) + return -ENOMEM; + + type_str = strsep(&p, ":"); + level_str = p; + + ret = match_string(bch2_compression_opts, -1, type_str); + if (ret < 0 && err) + prt_str(err, "invalid compression type"); + if (ret < 0) + goto err; + + opt.type = ret; + + if (level_str) { + unsigned level; + + ret = kstrtouint(level_str, 10, &level); + if (!ret && !opt.type && level) + ret = -EINVAL; + if (!ret && level > 15) + ret = -EINVAL; + if (ret < 0 && err) + prt_str(err, "invalid compression level"); + if (ret < 0) + goto err; + + opt.level = level; + } + + *res = bch2_compression_encode(opt); +err: + kfree(val); + return ret; +} + +void bch2_compression_opt_to_text(struct printbuf *out, u64 v) +{ + struct bch_compression_opt opt = bch2_compression_decode(v); + + if (opt.type < BCH_COMPRESSION_OPT_NR) + prt_str(out, bch2_compression_opts[opt.type]); + else + prt_printf(out, "(unknown compression opt %u)", opt.type); + if (opt.level) + prt_printf(out, ":%u", opt.level); +} + +void bch2_opt_compression_to_text(struct printbuf *out, + struct bch_fs *c, + struct bch_sb *sb, + u64 v) +{ + return bch2_compression_opt_to_text(out, v); +} + +int bch2_opt_compression_validate(u64 v, struct printbuf *err) +{ + if (!bch2_compression_opt_valid(v)) { + prt_printf(err, "invalid compression opt %llu", v); + return -BCH_ERR_invalid_sb_opt_compression; + } + + return 0; +} diff --git a/fs/bcachefs/compress.h b/fs/bcachefs/compress.h new file mode 100644 index 000000000000..607fd5e232c9 --- /dev/null +++ b/fs/bcachefs/compress.h @@ -0,0 +1,73 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_COMPRESS_H +#define _BCACHEFS_COMPRESS_H + +#include "extents_types.h" + +static const unsigned __bch2_compression_opt_to_type[] = { +#define x(t, n) [BCH_COMPRESSION_OPT_##t] = BCH_COMPRESSION_TYPE_##t, + BCH_COMPRESSION_OPTS() +#undef x +}; + +struct bch_compression_opt { + u8 type:4, + level:4; +}; + +static inline struct bch_compression_opt __bch2_compression_decode(unsigned v) +{ + return (struct bch_compression_opt) { + .type = v & 15, + .level = v >> 4, + }; +} + +static inline bool bch2_compression_opt_valid(unsigned v) +{ + struct bch_compression_opt opt = __bch2_compression_decode(v); + + return opt.type < ARRAY_SIZE(__bch2_compression_opt_to_type) && !(!opt.type && opt.level); +} + +static inline struct bch_compression_opt bch2_compression_decode(unsigned v) +{ + return bch2_compression_opt_valid(v) + ? __bch2_compression_decode(v) + : (struct bch_compression_opt) { 0 }; +} + +static inline unsigned bch2_compression_encode(struct bch_compression_opt opt) +{ + return opt.type|(opt.level << 4); +} + +static inline enum bch_compression_type bch2_compression_opt_to_type(unsigned v) +{ + return __bch2_compression_opt_to_type[bch2_compression_decode(v).type]; +} + +int bch2_bio_uncompress_inplace(struct bch_fs *, struct bio *, + struct bch_extent_crc_unpacked *); +int bch2_bio_uncompress(struct bch_fs *, struct bio *, struct bio *, + struct bvec_iter, struct bch_extent_crc_unpacked); +unsigned bch2_bio_compress(struct bch_fs *, struct bio *, size_t *, + struct bio *, size_t *, unsigned); + +int bch2_check_set_has_compressed_data(struct bch_fs *, unsigned); +void bch2_fs_compress_exit(struct bch_fs *); +int bch2_fs_compress_init(struct bch_fs *); + +void bch2_compression_opt_to_text(struct printbuf *, u64); + +int bch2_opt_compression_parse(struct bch_fs *, const char *, u64 *, struct printbuf *); +void bch2_opt_compression_to_text(struct printbuf *, struct bch_fs *, struct bch_sb *, u64); +int bch2_opt_compression_validate(u64, struct printbuf *); + +#define bch2_opt_compression (struct bch_opt_fn) { \ + .parse = bch2_opt_compression_parse, \ + .to_text = bch2_opt_compression_to_text, \ + .validate = bch2_opt_compression_validate, \ +} + +#endif /* _BCACHEFS_COMPRESS_H */ diff --git a/fs/bcachefs/counters.c b/fs/bcachefs/counters.c new file mode 100644 index 000000000000..02a996e06a64 --- /dev/null +++ b/fs/bcachefs/counters.c @@ -0,0 +1,107 @@ +// SPDX-License-Identifier: GPL-2.0 +#include "bcachefs.h" +#include "super-io.h" +#include "counters.h" + +/* BCH_SB_FIELD_counters */ + +static const char * const bch2_counter_names[] = { +#define x(t, n, ...) (#t), + BCH_PERSISTENT_COUNTERS() +#undef x + NULL +}; + +static size_t bch2_sb_counter_nr_entries(struct bch_sb_field_counters *ctrs) +{ + if (!ctrs) + return 0; + + return (__le64 *) vstruct_end(&ctrs->field) - &ctrs->d[0]; +}; + +static int bch2_sb_counters_validate(struct bch_sb *sb, + struct bch_sb_field *f, + struct printbuf *err) +{ + return 0; +}; + +static void bch2_sb_counters_to_text(struct printbuf *out, struct bch_sb *sb, + struct bch_sb_field *f) +{ + struct bch_sb_field_counters *ctrs = field_to_type(f, counters); + unsigned int i; + unsigned int nr = bch2_sb_counter_nr_entries(ctrs); + + for (i = 0; i < nr; i++) { + if (i < BCH_COUNTER_NR) + prt_printf(out, "%s ", bch2_counter_names[i]); + else + prt_printf(out, "(unknown)"); + + prt_tab(out); + prt_printf(out, "%llu", le64_to_cpu(ctrs->d[i])); + prt_newline(out); + } +}; + +int bch2_sb_counters_to_cpu(struct bch_fs *c) +{ + struct bch_sb_field_counters *ctrs = bch2_sb_field_get(c->disk_sb.sb, counters); + unsigned int i; + unsigned int nr = bch2_sb_counter_nr_entries(ctrs); + u64 val = 0; + + for (i = 0; i < BCH_COUNTER_NR; i++) + c->counters_on_mount[i] = 0; + + for (i = 0; i < min_t(unsigned int, nr, BCH_COUNTER_NR); i++) { + val = le64_to_cpu(ctrs->d[i]); + percpu_u64_set(&c->counters[i], val); + c->counters_on_mount[i] = val; + } + return 0; +}; + +int bch2_sb_counters_from_cpu(struct bch_fs *c) +{ + struct bch_sb_field_counters *ctrs = bch2_sb_field_get(c->disk_sb.sb, counters); + struct bch_sb_field_counters *ret; + unsigned int i; + unsigned int nr = bch2_sb_counter_nr_entries(ctrs); + + if (nr < BCH_COUNTER_NR) { + ret = bch2_sb_field_resize(&c->disk_sb, counters, + sizeof(*ctrs) / sizeof(u64) + BCH_COUNTER_NR); + + if (ret) { + ctrs = ret; + nr = bch2_sb_counter_nr_entries(ctrs); + } + } + + + for (i = 0; i < min_t(unsigned int, nr, BCH_COUNTER_NR); i++) + ctrs->d[i] = cpu_to_le64(percpu_u64_get(&c->counters[i])); + return 0; +} + +void bch2_fs_counters_exit(struct bch_fs *c) +{ + free_percpu(c->counters); +} + +int bch2_fs_counters_init(struct bch_fs *c) +{ + c->counters = __alloc_percpu(sizeof(u64) * BCH_COUNTER_NR, sizeof(u64)); + if (!c->counters) + return -BCH_ERR_ENOMEM_fs_counters_init; + + return bch2_sb_counters_to_cpu(c); +} + +const struct bch_sb_field_ops bch_sb_field_ops_counters = { + .validate = bch2_sb_counters_validate, + .to_text = bch2_sb_counters_to_text, +}; diff --git a/fs/bcachefs/counters.h b/fs/bcachefs/counters.h new file mode 100644 index 000000000000..4778aa19bf34 --- /dev/null +++ b/fs/bcachefs/counters.h @@ -0,0 +1,17 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_COUNTERS_H +#define _BCACHEFS_COUNTERS_H + +#include "bcachefs.h" +#include "super-io.h" + + +int bch2_sb_counters_to_cpu(struct bch_fs *); +int bch2_sb_counters_from_cpu(struct bch_fs *); + +void bch2_fs_counters_exit(struct bch_fs *); +int bch2_fs_counters_init(struct bch_fs *); + +extern const struct bch_sb_field_ops bch_sb_field_ops_counters; + +#endif // _BCACHEFS_COUNTERS_H diff --git a/fs/bcachefs/darray.h b/fs/bcachefs/darray.h new file mode 100644 index 000000000000..87b4b2d1ec76 --- /dev/null +++ b/fs/bcachefs/darray.h @@ -0,0 +1,93 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_DARRAY_H +#define _BCACHEFS_DARRAY_H + +/* + * Dynamic arrays: + * + * Inspired by CCAN's darray + */ + +#include "util.h" +#include <linux/slab.h> + +#define DARRAY(type) \ +struct { \ + size_t nr, size; \ + type *data; \ +} + +typedef DARRAY(void) darray_void; + +static inline int __darray_make_room(darray_void *d, size_t t_size, size_t more, gfp_t gfp) +{ + if (d->nr + more > d->size) { + size_t new_size = roundup_pow_of_two(d->nr + more); + void *data = krealloc_array(d->data, new_size, t_size, gfp); + + if (!data) + return -ENOMEM; + + d->data = data; + d->size = new_size; + } + + return 0; +} + +#define darray_make_room_gfp(_d, _more, _gfp) \ + __darray_make_room((darray_void *) (_d), sizeof((_d)->data[0]), (_more), _gfp) + +#define darray_make_room(_d, _more) \ + darray_make_room_gfp(_d, _more, GFP_KERNEL) + +#define darray_top(_d) ((_d).data[(_d).nr]) + +#define darray_push_gfp(_d, _item, _gfp) \ +({ \ + int _ret = darray_make_room_gfp((_d), 1, _gfp); \ + \ + if (!_ret) \ + (_d)->data[(_d)->nr++] = (_item); \ + _ret; \ +}) + +#define darray_push(_d, _item) darray_push_gfp(_d, _item, GFP_KERNEL) + +#define darray_pop(_d) ((_d)->data[--(_d)->nr]) + +#define darray_first(_d) ((_d).data[0]) +#define darray_last(_d) ((_d).data[(_d).nr - 1]) + +#define darray_insert_item(_d, pos, _item) \ +({ \ + size_t _pos = (pos); \ + int _ret = darray_make_room((_d), 1); \ + \ + if (!_ret) \ + array_insert_item((_d)->data, (_d)->nr, _pos, (_item)); \ + _ret; \ +}) + +#define darray_remove_item(_d, _pos) \ + array_remove_item((_d)->data, (_d)->nr, (_pos) - (_d)->data) + +#define darray_for_each(_d, _i) \ + for (_i = (_d).data; _i < (_d).data + (_d).nr; _i++) + +#define darray_for_each_reverse(_d, _i) \ + for (_i = (_d).data + (_d).nr - 1; _i >= (_d).data; --_i) + +#define darray_init(_d) \ +do { \ + (_d)->data = NULL; \ + (_d)->nr = (_d)->size = 0; \ +} while (0) + +#define darray_exit(_d) \ +do { \ + kfree((_d)->data); \ + darray_init(_d); \ +} while (0) + +#endif /* _BCACHEFS_DARRAY_H */ diff --git a/fs/bcachefs/data_update.c b/fs/bcachefs/data_update.c new file mode 100644 index 000000000000..b05457d284a6 --- /dev/null +++ b/fs/bcachefs/data_update.c @@ -0,0 +1,652 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" +#include "alloc_foreground.h" +#include "bkey_buf.h" +#include "btree_update.h" +#include "buckets.h" +#include "data_update.h" +#include "ec.h" +#include "error.h" +#include "extents.h" +#include "io_write.h" +#include "keylist.h" +#include "move.h" +#include "nocow_locking.h" +#include "rebalance.h" +#include "subvolume.h" +#include "trace.h" + +static void trace_move_extent_finish2(struct bch_fs *c, struct bkey_s_c k) +{ + if (trace_move_extent_finish_enabled()) { + struct printbuf buf = PRINTBUF; + + bch2_bkey_val_to_text(&buf, c, k); + trace_move_extent_finish(c, buf.buf); + printbuf_exit(&buf); + } +} + +static void trace_move_extent_fail2(struct data_update *m, + struct bkey_s_c new, + struct bkey_s_c wrote, + struct bkey_i *insert, + const char *msg) +{ + struct bch_fs *c = m->op.c; + struct bkey_s_c old = bkey_i_to_s_c(m->k.k); + const union bch_extent_entry *entry; + struct bch_extent_ptr *ptr; + struct extent_ptr_decoded p; + struct printbuf buf = PRINTBUF; + unsigned i, rewrites_found = 0; + + if (!trace_move_extent_fail_enabled()) + return; + + prt_str(&buf, msg); + + if (insert) { + i = 0; + bkey_for_each_ptr_decode(old.k, bch2_bkey_ptrs_c(old), p, entry) { + if (((1U << i) & m->data_opts.rewrite_ptrs) && + (ptr = bch2_extent_has_ptr(old, p, bkey_i_to_s(insert))) && + !ptr->cached) + rewrites_found |= 1U << i; + i++; + } + } + + prt_printf(&buf, "\nrewrite ptrs: %u%u%u%u", + (m->data_opts.rewrite_ptrs & (1 << 0)) != 0, + (m->data_opts.rewrite_ptrs & (1 << 1)) != 0, + (m->data_opts.rewrite_ptrs & (1 << 2)) != 0, + (m->data_opts.rewrite_ptrs & (1 << 3)) != 0); + + prt_printf(&buf, "\nrewrites found: %u%u%u%u", + (rewrites_found & (1 << 0)) != 0, + (rewrites_found & (1 << 1)) != 0, + (rewrites_found & (1 << 2)) != 0, + (rewrites_found & (1 << 3)) != 0); + + prt_str(&buf, "\nold: "); + bch2_bkey_val_to_text(&buf, c, old); + + prt_str(&buf, "\nnew: "); + bch2_bkey_val_to_text(&buf, c, new); + + prt_str(&buf, "\nwrote: "); + bch2_bkey_val_to_text(&buf, c, wrote); + + if (insert) { + prt_str(&buf, "\ninsert: "); + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(insert)); + } + + trace_move_extent_fail(c, buf.buf); + printbuf_exit(&buf); +} + +static int __bch2_data_update_index_update(struct btree_trans *trans, + struct bch_write_op *op) +{ + struct bch_fs *c = op->c; + struct btree_iter iter; + struct data_update *m = + container_of(op, struct data_update, op); + struct keylist *keys = &op->insert_keys; + struct bkey_buf _new, _insert; + int ret = 0; + + bch2_bkey_buf_init(&_new); + bch2_bkey_buf_init(&_insert); + bch2_bkey_buf_realloc(&_insert, c, U8_MAX); + + bch2_trans_iter_init(trans, &iter, m->btree_id, + bkey_start_pos(&bch2_keylist_front(keys)->k), + BTREE_ITER_SLOTS|BTREE_ITER_INTENT); + + while (1) { + struct bkey_s_c k; + struct bkey_s_c old = bkey_i_to_s_c(m->k.k); + struct bkey_i *insert = NULL; + struct bkey_i_extent *new; + const union bch_extent_entry *entry_c; + union bch_extent_entry *entry; + struct extent_ptr_decoded p; + struct bch_extent_ptr *ptr; + const struct bch_extent_ptr *ptr_c; + struct bpos next_pos; + bool should_check_enospc; + s64 i_sectors_delta = 0, disk_sectors_delta = 0; + unsigned rewrites_found = 0, durability, i; + + bch2_trans_begin(trans); + + k = bch2_btree_iter_peek_slot(&iter); + ret = bkey_err(k); + if (ret) + goto err; + + new = bkey_i_to_extent(bch2_keylist_front(keys)); + + if (!bch2_extents_match(k, old)) { + trace_move_extent_fail2(m, k, bkey_i_to_s_c(&new->k_i), + NULL, "no match:"); + goto nowork; + } + + bkey_reassemble(_insert.k, k); + insert = _insert.k; + + bch2_bkey_buf_copy(&_new, c, bch2_keylist_front(keys)); + new = bkey_i_to_extent(_new.k); + bch2_cut_front(iter.pos, &new->k_i); + + bch2_cut_front(iter.pos, insert); + bch2_cut_back(new->k.p, insert); + bch2_cut_back(insert->k.p, &new->k_i); + + /* + * @old: extent that we read from + * @insert: key that we're going to update, initialized from + * extent currently in btree - same as @old unless we raced with + * other updates + * @new: extent with new pointers that we'll be adding to @insert + * + * Fist, drop rewrite_ptrs from @new: + */ + i = 0; + bkey_for_each_ptr_decode(old.k, bch2_bkey_ptrs_c(old), p, entry_c) { + if (((1U << i) & m->data_opts.rewrite_ptrs) && + (ptr = bch2_extent_has_ptr(old, p, bkey_i_to_s(insert))) && + !ptr->cached) { + bch2_extent_ptr_set_cached(bkey_i_to_s(insert), ptr); + rewrites_found |= 1U << i; + } + i++; + } + + if (m->data_opts.rewrite_ptrs && + !rewrites_found && + bch2_bkey_durability(c, k) >= m->op.opts.data_replicas) { + trace_move_extent_fail2(m, k, bkey_i_to_s_c(&new->k_i), insert, "no rewrites found:"); + goto nowork; + } + + /* + * A replica that we just wrote might conflict with a replica + * that we want to keep, due to racing with another move: + */ +restart_drop_conflicting_replicas: + extent_for_each_ptr(extent_i_to_s(new), ptr) + if ((ptr_c = bch2_bkey_has_device_c(bkey_i_to_s_c(insert), ptr->dev)) && + !ptr_c->cached) { + bch2_bkey_drop_ptr_noerror(bkey_i_to_s(&new->k_i), ptr); + goto restart_drop_conflicting_replicas; + } + + if (!bkey_val_u64s(&new->k)) { + trace_move_extent_fail2(m, k, bkey_i_to_s_c(&new->k_i), insert, "new replicas conflicted:"); + goto nowork; + } + + /* Now, drop pointers that conflict with what we just wrote: */ + extent_for_each_ptr_decode(extent_i_to_s(new), p, entry) + if ((ptr = bch2_bkey_has_device(bkey_i_to_s(insert), p.ptr.dev))) + bch2_bkey_drop_ptr_noerror(bkey_i_to_s(insert), ptr); + + durability = bch2_bkey_durability(c, bkey_i_to_s_c(insert)) + + bch2_bkey_durability(c, bkey_i_to_s_c(&new->k_i)); + + /* Now, drop excess replicas: */ +restart_drop_extra_replicas: + bkey_for_each_ptr_decode(old.k, bch2_bkey_ptrs(bkey_i_to_s(insert)), p, entry) { + unsigned ptr_durability = bch2_extent_ptr_durability(c, &p); + + if (!p.ptr.cached && + durability - ptr_durability >= m->op.opts.data_replicas) { + durability -= ptr_durability; + + bch2_extent_ptr_set_cached(bkey_i_to_s(insert), &entry->ptr); + goto restart_drop_extra_replicas; + } + } + + /* Finally, add the pointers we just wrote: */ + extent_for_each_ptr_decode(extent_i_to_s(new), p, entry) + bch2_extent_ptr_decoded_append(insert, &p); + + bch2_bkey_narrow_crcs(insert, (struct bch_extent_crc_unpacked) { 0 }); + bch2_extent_normalize(c, bkey_i_to_s(insert)); + + ret = bch2_sum_sector_overwrites(trans, &iter, insert, + &should_check_enospc, + &i_sectors_delta, + &disk_sectors_delta); + if (ret) + goto err; + + if (disk_sectors_delta > (s64) op->res.sectors) { + ret = bch2_disk_reservation_add(c, &op->res, + disk_sectors_delta - op->res.sectors, + !should_check_enospc + ? BCH_DISK_RESERVATION_NOFAIL : 0); + if (ret) + goto out; + } + + next_pos = insert->k.p; + + /* + * Check for nonce offset inconsistency: + * This is debug code - we've been seeing this bug rarely, and + * it's been hard to reproduce, so this should give us some more + * information when it does occur: + */ + struct printbuf err = PRINTBUF; + int invalid = bch2_bkey_invalid(c, bkey_i_to_s_c(insert), __btree_node_type(0, m->btree_id), 0, &err); + printbuf_exit(&err); + + if (invalid) { + struct printbuf buf = PRINTBUF; + + prt_str(&buf, "about to insert invalid key in data update path"); + prt_str(&buf, "\nold: "); + bch2_bkey_val_to_text(&buf, c, old); + prt_str(&buf, "\nk: "); + bch2_bkey_val_to_text(&buf, c, k); + prt_str(&buf, "\nnew: "); + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(insert)); + + bch2_print_string_as_lines(KERN_ERR, buf.buf); + printbuf_exit(&buf); + + bch2_fatal_error(c); + goto out; + } + + ret = bch2_insert_snapshot_whiteouts(trans, m->btree_id, + k.k->p, bkey_start_pos(&insert->k)) ?: + bch2_insert_snapshot_whiteouts(trans, m->btree_id, + k.k->p, insert->k.p) ?: + bch2_bkey_set_needs_rebalance(c, insert, + op->opts.background_target, + op->opts.background_compression) ?: + bch2_trans_update(trans, &iter, insert, + BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?: + bch2_trans_commit(trans, &op->res, + NULL, + BTREE_INSERT_NOCHECK_RW| + BTREE_INSERT_NOFAIL| + m->data_opts.btree_insert_flags); + if (!ret) { + bch2_btree_iter_set_pos(&iter, next_pos); + + this_cpu_add(c->counters[BCH_COUNTER_move_extent_finish], new->k.size); + trace_move_extent_finish2(c, bkey_i_to_s_c(&new->k_i)); + } +err: + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + ret = 0; + if (ret) + break; +next: + while (bkey_ge(iter.pos, bch2_keylist_front(keys)->k.p)) { + bch2_keylist_pop_front(keys); + if (bch2_keylist_empty(keys)) + goto out; + } + continue; +nowork: + if (m->stats && m->stats) { + BUG_ON(k.k->p.offset <= iter.pos.offset); + atomic64_inc(&m->stats->keys_raced); + atomic64_add(k.k->p.offset - iter.pos.offset, + &m->stats->sectors_raced); + } + + this_cpu_inc(c->counters[BCH_COUNTER_move_extent_fail]); + + bch2_btree_iter_advance(&iter); + goto next; + } +out: + bch2_trans_iter_exit(trans, &iter); + bch2_bkey_buf_exit(&_insert, c); + bch2_bkey_buf_exit(&_new, c); + BUG_ON(bch2_err_matches(ret, BCH_ERR_transaction_restart)); + return ret; +} + +int bch2_data_update_index_update(struct bch_write_op *op) +{ + return bch2_trans_run(op->c, __bch2_data_update_index_update(trans, op)); +} + +void bch2_data_update_read_done(struct data_update *m, + struct bch_extent_crc_unpacked crc) +{ + /* write bio must own pages: */ + BUG_ON(!m->op.wbio.bio.bi_vcnt); + + m->op.crc = crc; + m->op.wbio.bio.bi_iter.bi_size = crc.compressed_size << 9; + + closure_call(&m->op.cl, bch2_write, NULL, NULL); +} + +void bch2_data_update_exit(struct data_update *update) +{ + struct bch_fs *c = update->op.c; + struct bkey_ptrs_c ptrs = + bch2_bkey_ptrs_c(bkey_i_to_s_c(update->k.k)); + const struct bch_extent_ptr *ptr; + + bkey_for_each_ptr(ptrs, ptr) { + if (c->opts.nocow_enabled) + bch2_bucket_nocow_unlock(&c->nocow_locks, + PTR_BUCKET_POS(c, ptr), 0); + percpu_ref_put(&bch_dev_bkey_exists(c, ptr->dev)->ref); + } + + bch2_bkey_buf_exit(&update->k, c); + bch2_disk_reservation_put(c, &update->op.res); + bch2_bio_free_pages_pool(c, &update->op.wbio.bio); +} + +static void bch2_update_unwritten_extent(struct btree_trans *trans, + struct data_update *update) +{ + struct bch_fs *c = update->op.c; + struct bio *bio = &update->op.wbio.bio; + struct bkey_i_extent *e; + struct write_point *wp; + struct bch_extent_ptr *ptr; + struct closure cl; + struct btree_iter iter; + struct bkey_s_c k; + int ret; + + closure_init_stack(&cl); + bch2_keylist_init(&update->op.insert_keys, update->op.inline_keys); + + while (bio_sectors(bio)) { + unsigned sectors = bio_sectors(bio); + + bch2_trans_iter_init(trans, &iter, update->btree_id, update->op.pos, + BTREE_ITER_SLOTS); + ret = lockrestart_do(trans, ({ + k = bch2_btree_iter_peek_slot(&iter); + bkey_err(k); + })); + bch2_trans_iter_exit(trans, &iter); + + if (ret || !bch2_extents_match(k, bkey_i_to_s_c(update->k.k))) + break; + + e = bkey_extent_init(update->op.insert_keys.top); + e->k.p = update->op.pos; + + ret = bch2_alloc_sectors_start_trans(trans, + update->op.target, + false, + update->op.write_point, + &update->op.devs_have, + update->op.nr_replicas, + update->op.nr_replicas, + update->op.watermark, + 0, &cl, &wp); + if (bch2_err_matches(ret, BCH_ERR_operation_blocked)) { + bch2_trans_unlock(trans); + closure_sync(&cl); + continue; + } + + if (ret) + return; + + sectors = min(sectors, wp->sectors_free); + + bch2_key_resize(&e->k, sectors); + + bch2_open_bucket_get(c, wp, &update->op.open_buckets); + bch2_alloc_sectors_append_ptrs(c, wp, &e->k_i, sectors, false); + bch2_alloc_sectors_done(c, wp); + + bio_advance(bio, sectors << 9); + update->op.pos.offset += sectors; + + extent_for_each_ptr(extent_i_to_s(e), ptr) + ptr->unwritten = true; + bch2_keylist_push(&update->op.insert_keys); + + ret = __bch2_data_update_index_update(trans, &update->op); + + bch2_open_buckets_put(c, &update->op.open_buckets); + + if (ret) + break; + } + + if (closure_nr_remaining(&cl) != 1) { + bch2_trans_unlock(trans); + closure_sync(&cl); + } +} + +int bch2_extent_drop_ptrs(struct btree_trans *trans, + struct btree_iter *iter, + struct bkey_s_c k, + struct data_update_opts data_opts) +{ + struct bch_fs *c = trans->c; + struct bkey_i *n; + int ret; + + n = bch2_bkey_make_mut_noupdate(trans, k); + ret = PTR_ERR_OR_ZERO(n); + if (ret) + return ret; + + while (data_opts.kill_ptrs) { + unsigned i = 0, drop = __fls(data_opts.kill_ptrs); + struct bch_extent_ptr *ptr; + + bch2_bkey_drop_ptrs(bkey_i_to_s(n), ptr, i++ == drop); + data_opts.kill_ptrs ^= 1U << drop; + } + + /* + * If the new extent no longer has any pointers, bch2_extent_normalize() + * will do the appropriate thing with it (turning it into a + * KEY_TYPE_error key, or just a discard if it was a cached extent) + */ + bch2_extent_normalize(c, bkey_i_to_s(n)); + + /* + * Since we're not inserting through an extent iterator + * (BTREE_ITER_ALL_SNAPSHOTS iterators aren't extent iterators), + * we aren't using the extent overwrite path to delete, we're + * just using the normal key deletion path: + */ + if (bkey_deleted(&n->k) && !(iter->flags & BTREE_ITER_IS_EXTENTS)) + n->k.size = 0; + + return bch2_trans_relock(trans) ?: + bch2_trans_update(trans, iter, n, BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?: + bch2_trans_commit(trans, NULL, NULL, BTREE_INSERT_NOFAIL); +} + +int bch2_data_update_init(struct btree_trans *trans, + struct btree_iter *iter, + struct moving_context *ctxt, + struct data_update *m, + struct write_point_specifier wp, + struct bch_io_opts io_opts, + struct data_update_opts data_opts, + enum btree_id btree_id, + struct bkey_s_c k) +{ + struct bch_fs *c = trans->c; + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + const union bch_extent_entry *entry; + struct extent_ptr_decoded p; + const struct bch_extent_ptr *ptr; + unsigned i, reserve_sectors = k.k->size * data_opts.extra_replicas; + unsigned ptrs_locked = 0; + int ret = 0; + + bch2_bkey_buf_init(&m->k); + bch2_bkey_buf_reassemble(&m->k, c, k); + m->btree_id = btree_id; + m->data_opts = data_opts; + m->ctxt = ctxt; + m->stats = ctxt ? ctxt->stats : NULL; + + bch2_write_op_init(&m->op, c, io_opts); + m->op.pos = bkey_start_pos(k.k); + m->op.version = k.k->version; + m->op.target = data_opts.target; + m->op.write_point = wp; + m->op.nr_replicas = 0; + m->op.flags |= BCH_WRITE_PAGES_STABLE| + BCH_WRITE_PAGES_OWNED| + BCH_WRITE_DATA_ENCODED| + BCH_WRITE_MOVE| + m->data_opts.write_flags; + m->op.compression_opt = io_opts.background_compression ?: io_opts.compression; + m->op.watermark = m->data_opts.btree_insert_flags & BCH_WATERMARK_MASK; + + bkey_for_each_ptr(ptrs, ptr) + percpu_ref_get(&bch_dev_bkey_exists(c, ptr->dev)->ref); + + unsigned durability_have = 0, durability_removing = 0; + + i = 0; + bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { + bool locked; + + if (((1U << i) & m->data_opts.rewrite_ptrs)) { + BUG_ON(p.ptr.cached); + + if (crc_is_compressed(p.crc)) + reserve_sectors += k.k->size; + + m->op.nr_replicas += bch2_extent_ptr_desired_durability(c, &p); + durability_removing += bch2_extent_ptr_desired_durability(c, &p); + } else if (!p.ptr.cached && + !((1U << i) & m->data_opts.kill_ptrs)) { + bch2_dev_list_add_dev(&m->op.devs_have, p.ptr.dev); + durability_have += bch2_extent_ptr_durability(c, &p); + } + + /* + * op->csum_type is normally initialized from the fs/file's + * current options - but if an extent is encrypted, we require + * that it stays encrypted: + */ + if (bch2_csum_type_is_encryption(p.crc.csum_type)) { + m->op.nonce = p.crc.nonce + p.crc.offset; + m->op.csum_type = p.crc.csum_type; + } + + if (p.crc.compression_type == BCH_COMPRESSION_TYPE_incompressible) + m->op.incompressible = true; + + if (c->opts.nocow_enabled) { + if (ctxt) { + move_ctxt_wait_event(ctxt, + (locked = bch2_bucket_nocow_trylock(&c->nocow_locks, + PTR_BUCKET_POS(c, &p.ptr), 0)) || + (!atomic_read(&ctxt->read_sectors) && + !atomic_read(&ctxt->write_sectors))); + + if (!locked) + bch2_bucket_nocow_lock(&c->nocow_locks, + PTR_BUCKET_POS(c, &p.ptr), 0); + } else { + if (!bch2_bucket_nocow_trylock(&c->nocow_locks, + PTR_BUCKET_POS(c, &p.ptr), 0)) { + ret = -BCH_ERR_nocow_lock_blocked; + goto err; + } + } + ptrs_locked |= (1U << i); + } + + i++; + } + + /* + * If current extent durability is less than io_opts.data_replicas, + * we're not trying to rereplicate the extent up to data_replicas here - + * unless extra_replicas was specified + * + * Increasing replication is an explicit operation triggered by + * rereplicate, currently, so that users don't get an unexpected -ENOSPC + */ + if (durability_have >= io_opts.data_replicas) { + m->data_opts.kill_ptrs |= m->data_opts.rewrite_ptrs; + m->data_opts.rewrite_ptrs = 0; + /* if iter == NULL, it's just a promote */ + if (iter) + ret = bch2_extent_drop_ptrs(trans, iter, k, m->data_opts); + goto done; + } + + m->op.nr_replicas = min(durability_removing, io_opts.data_replicas - durability_have) + + m->data_opts.extra_replicas; + m->op.nr_replicas_required = m->op.nr_replicas; + + BUG_ON(!m->op.nr_replicas); + + if (reserve_sectors) { + ret = bch2_disk_reservation_add(c, &m->op.res, reserve_sectors, + m->data_opts.extra_replicas + ? 0 + : BCH_DISK_RESERVATION_NOFAIL); + if (ret) + goto err; + } + + if (bkey_extent_is_unwritten(k)) { + bch2_update_unwritten_extent(trans, m); + goto done; + } + + return 0; +err: + i = 0; + bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { + if ((1U << i) & ptrs_locked) + bch2_bucket_nocow_unlock(&c->nocow_locks, + PTR_BUCKET_POS(c, &p.ptr), 0); + percpu_ref_put(&bch_dev_bkey_exists(c, p.ptr.dev)->ref); + i++; + } + + bch2_bkey_buf_exit(&m->k, c); + bch2_bio_free_pages_pool(c, &m->op.wbio.bio); + return ret; +done: + bch2_data_update_exit(m); + return ret ?: -BCH_ERR_data_update_done; +} + +void bch2_data_update_opts_normalize(struct bkey_s_c k, struct data_update_opts *opts) +{ + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + const struct bch_extent_ptr *ptr; + unsigned i = 0; + + bkey_for_each_ptr(ptrs, ptr) { + if ((opts->rewrite_ptrs & (1U << i)) && ptr->cached) { + opts->kill_ptrs |= 1U << i; + opts->rewrite_ptrs ^= 1U << i; + } + + i++; + } +} diff --git a/fs/bcachefs/data_update.h b/fs/bcachefs/data_update.h new file mode 100644 index 000000000000..991095bbd469 --- /dev/null +++ b/fs/bcachefs/data_update.h @@ -0,0 +1,49 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef _BCACHEFS_DATA_UPDATE_H +#define _BCACHEFS_DATA_UPDATE_H + +#include "bkey_buf.h" +#include "io_write_types.h" + +struct moving_context; + +struct data_update_opts { + unsigned rewrite_ptrs; + unsigned kill_ptrs; + u16 target; + u8 extra_replicas; + unsigned btree_insert_flags; + unsigned write_flags; +}; + +struct data_update { + /* extent being updated: */ + enum btree_id btree_id; + struct bkey_buf k; + struct data_update_opts data_opts; + struct moving_context *ctxt; + struct bch_move_stats *stats; + struct bch_write_op op; +}; + +int bch2_data_update_index_update(struct bch_write_op *); + +void bch2_data_update_read_done(struct data_update *, + struct bch_extent_crc_unpacked); + +int bch2_extent_drop_ptrs(struct btree_trans *, + struct btree_iter *, + struct bkey_s_c, + struct data_update_opts); + +void bch2_data_update_exit(struct data_update *); +int bch2_data_update_init(struct btree_trans *, struct btree_iter *, + struct moving_context *, + struct data_update *, + struct write_point_specifier, + struct bch_io_opts, struct data_update_opts, + enum btree_id, struct bkey_s_c); +void bch2_data_update_opts_normalize(struct bkey_s_c, struct data_update_opts *); + +#endif /* _BCACHEFS_DATA_UPDATE_H */ diff --git a/fs/bcachefs/debug.c b/fs/bcachefs/debug.c new file mode 100644 index 000000000000..57c5128db173 --- /dev/null +++ b/fs/bcachefs/debug.c @@ -0,0 +1,954 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Assorted bcachefs debug code + * + * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com> + * Copyright 2012 Google, Inc. + */ + +#include "bcachefs.h" +#include "bkey_methods.h" +#include "btree_cache.h" +#include "btree_io.h" +#include "btree_iter.h" +#include "btree_locking.h" +#include "btree_update.h" +#include "buckets.h" +#include "debug.h" +#include "error.h" +#include "extents.h" +#include "fsck.h" +#include "inode.h" +#include "super.h" + +#include <linux/console.h> +#include <linux/debugfs.h> +#include <linux/module.h> +#include <linux/random.h> +#include <linux/seq_file.h> + +static struct dentry *bch_debug; + +static bool bch2_btree_verify_replica(struct bch_fs *c, struct btree *b, + struct extent_ptr_decoded pick) +{ + struct btree *v = c->verify_data; + struct btree_node *n_ondisk = c->verify_ondisk; + struct btree_node *n_sorted = c->verify_data->data; + struct bset *sorted, *inmemory = &b->data->keys; + struct bch_dev *ca = bch_dev_bkey_exists(c, pick.ptr.dev); + struct bio *bio; + bool failed = false, saw_error = false; + + if (!bch2_dev_get_ioref(ca, READ)) + return false; + + bio = bio_alloc_bioset(ca->disk_sb.bdev, + buf_pages(n_sorted, btree_bytes(c)), + REQ_OP_READ|REQ_META, + GFP_NOFS, + &c->btree_bio); + bio->bi_iter.bi_sector = pick.ptr.offset; + bch2_bio_map(bio, n_sorted, btree_bytes(c)); + + submit_bio_wait(bio); + + bio_put(bio); + percpu_ref_put(&ca->io_ref); + + memcpy(n_ondisk, n_sorted, btree_bytes(c)); + + v->written = 0; + if (bch2_btree_node_read_done(c, ca, v, false, &saw_error) || saw_error) + return false; + + n_sorted = c->verify_data->data; + sorted = &n_sorted->keys; + + if (inmemory->u64s != sorted->u64s || + memcmp(inmemory->start, + sorted->start, + vstruct_end(inmemory) - (void *) inmemory->start)) { + unsigned offset = 0, sectors; + struct bset *i; + unsigned j; + + console_lock(); + + printk(KERN_ERR "*** in memory:\n"); + bch2_dump_bset(c, b, inmemory, 0); + + printk(KERN_ERR "*** read back in:\n"); + bch2_dump_bset(c, v, sorted, 0); + + while (offset < v->written) { + if (!offset) { + i = &n_ondisk->keys; + sectors = vstruct_blocks(n_ondisk, c->block_bits) << + c->block_bits; + } else { + struct btree_node_entry *bne = + (void *) n_ondisk + (offset << 9); + i = &bne->keys; + + sectors = vstruct_blocks(bne, c->block_bits) << + c->block_bits; + } + + printk(KERN_ERR "*** on disk block %u:\n", offset); + bch2_dump_bset(c, b, i, offset); + + offset += sectors; + } + + for (j = 0; j < le16_to_cpu(inmemory->u64s); j++) + if (inmemory->_data[j] != sorted->_data[j]) + break; + + console_unlock(); + bch_err(c, "verify failed at key %u", j); + + failed = true; + } + + if (v->written != b->written) { + bch_err(c, "written wrong: expected %u, got %u", + b->written, v->written); + failed = true; + } + + return failed; +} + +void __bch2_btree_verify(struct bch_fs *c, struct btree *b) +{ + struct bkey_ptrs_c ptrs; + struct extent_ptr_decoded p; + const union bch_extent_entry *entry; + struct btree *v; + struct bset *inmemory = &b->data->keys; + struct bkey_packed *k; + bool failed = false; + + if (c->opts.nochanges) + return; + + bch2_btree_node_io_lock(b); + mutex_lock(&c->verify_lock); + + if (!c->verify_ondisk) { + c->verify_ondisk = kvpmalloc(btree_bytes(c), GFP_KERNEL); + if (!c->verify_ondisk) + goto out; + } + + if (!c->verify_data) { + c->verify_data = __bch2_btree_node_mem_alloc(c); + if (!c->verify_data) + goto out; + + list_del_init(&c->verify_data->list); + } + + BUG_ON(b->nsets != 1); + + for (k = inmemory->start; k != vstruct_last(inmemory); k = bkey_p_next(k)) + if (k->type == KEY_TYPE_btree_ptr_v2) + ((struct bch_btree_ptr_v2 *) bkeyp_val(&b->format, k))->mem_ptr = 0; + + v = c->verify_data; + bkey_copy(&v->key, &b->key); + v->c.level = b->c.level; + v->c.btree_id = b->c.btree_id; + bch2_btree_keys_init(v); + + ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(&b->key)); + bkey_for_each_ptr_decode(&b->key.k, ptrs, p, entry) + failed |= bch2_btree_verify_replica(c, b, p); + + if (failed) { + struct printbuf buf = PRINTBUF; + + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key)); + bch2_fs_fatal_error(c, "btree node verify failed for : %s\n", buf.buf); + printbuf_exit(&buf); + } +out: + mutex_unlock(&c->verify_lock); + bch2_btree_node_io_unlock(b); +} + +void bch2_btree_node_ondisk_to_text(struct printbuf *out, struct bch_fs *c, + const struct btree *b) +{ + struct btree_node *n_ondisk = NULL; + struct extent_ptr_decoded pick; + struct bch_dev *ca; + struct bio *bio = NULL; + unsigned offset = 0; + int ret; + + if (bch2_bkey_pick_read_device(c, bkey_i_to_s_c(&b->key), NULL, &pick) <= 0) { + prt_printf(out, "error getting device to read from: invalid device\n"); + return; + } + + ca = bch_dev_bkey_exists(c, pick.ptr.dev); + if (!bch2_dev_get_ioref(ca, READ)) { + prt_printf(out, "error getting device to read from: not online\n"); + return; + } + + n_ondisk = kvpmalloc(btree_bytes(c), GFP_KERNEL); + if (!n_ondisk) { + prt_printf(out, "memory allocation failure\n"); + goto out; + } + + bio = bio_alloc_bioset(ca->disk_sb.bdev, + buf_pages(n_ondisk, btree_bytes(c)), + REQ_OP_READ|REQ_META, + GFP_NOFS, + &c->btree_bio); + bio->bi_iter.bi_sector = pick.ptr.offset; + bch2_bio_map(bio, n_ondisk, btree_bytes(c)); + + ret = submit_bio_wait(bio); + if (ret) { + prt_printf(out, "IO error reading btree node: %s\n", bch2_err_str(ret)); + goto out; + } + + while (offset < btree_sectors(c)) { + struct bset *i; + struct nonce nonce; + struct bch_csum csum; + struct bkey_packed *k; + unsigned sectors; + + if (!offset) { + i = &n_ondisk->keys; + + if (!bch2_checksum_type_valid(c, BSET_CSUM_TYPE(i))) { + prt_printf(out, "unknown checksum type at offset %u: %llu\n", + offset, BSET_CSUM_TYPE(i)); + goto out; + } + + nonce = btree_nonce(i, offset << 9); + csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, n_ondisk); + + if (bch2_crc_cmp(csum, n_ondisk->csum)) { + prt_printf(out, "invalid checksum\n"); + goto out; + } + + bset_encrypt(c, i, offset << 9); + + sectors = vstruct_sectors(n_ondisk, c->block_bits); + } else { + struct btree_node_entry *bne = (void *) n_ondisk + (offset << 9); + + i = &bne->keys; + + if (i->seq != n_ondisk->keys.seq) + break; + + if (!bch2_checksum_type_valid(c, BSET_CSUM_TYPE(i))) { + prt_printf(out, "unknown checksum type at offset %u: %llu\n", + offset, BSET_CSUM_TYPE(i)); + goto out; + } + + nonce = btree_nonce(i, offset << 9); + csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, bne); + + if (bch2_crc_cmp(csum, bne->csum)) { + prt_printf(out, "invalid checksum"); + goto out; + } + + bset_encrypt(c, i, offset << 9); + + sectors = vstruct_sectors(bne, c->block_bits); + } + + prt_printf(out, " offset %u version %u, journal seq %llu\n", + offset, + le16_to_cpu(i->version), + le64_to_cpu(i->journal_seq)); + offset += sectors; + + printbuf_indent_add(out, 4); + + for (k = i->start; k != vstruct_last(i); k = bkey_p_next(k)) { + struct bkey u; + + bch2_bkey_val_to_text(out, c, bkey_disassemble(b, k, &u)); + prt_newline(out); + } + + printbuf_indent_sub(out, 4); + } +out: + if (bio) + bio_put(bio); + kvpfree(n_ondisk, btree_bytes(c)); + percpu_ref_put(&ca->io_ref); +} + +#ifdef CONFIG_DEBUG_FS + +/* XXX: bch_fs refcounting */ + +struct dump_iter { + struct bch_fs *c; + enum btree_id id; + struct bpos from; + struct bpos prev_node; + u64 iter; + + struct printbuf buf; + + char __user *ubuf; /* destination user buffer */ + size_t size; /* size of requested read */ + ssize_t ret; /* bytes read so far */ +}; + +static ssize_t flush_buf(struct dump_iter *i) +{ + if (i->buf.pos) { + size_t bytes = min_t(size_t, i->buf.pos, i->size); + int copied = bytes - copy_to_user(i->ubuf, i->buf.buf, bytes); + + i->ret += copied; + i->ubuf += copied; + i->size -= copied; + i->buf.pos -= copied; + memmove(i->buf.buf, i->buf.buf + copied, i->buf.pos); + + if (copied != bytes) + return -EFAULT; + } + + return i->size ? 0 : i->ret; +} + +static int bch2_dump_open(struct inode *inode, struct file *file) +{ + struct btree_debug *bd = inode->i_private; + struct dump_iter *i; + + i = kzalloc(sizeof(struct dump_iter), GFP_KERNEL); + if (!i) + return -ENOMEM; + + file->private_data = i; + i->from = POS_MIN; + i->iter = 0; + i->c = container_of(bd, struct bch_fs, btree_debug[bd->id]); + i->id = bd->id; + i->buf = PRINTBUF; + + return 0; +} + +static int bch2_dump_release(struct inode *inode, struct file *file) +{ + struct dump_iter *i = file->private_data; + + printbuf_exit(&i->buf); + kfree(i); + return 0; +} + +static ssize_t bch2_read_btree(struct file *file, char __user *buf, + size_t size, loff_t *ppos) +{ + struct dump_iter *i = file->private_data; + struct btree_trans *trans; + struct btree_iter iter; + struct bkey_s_c k; + ssize_t ret; + + i->ubuf = buf; + i->size = size; + i->ret = 0; + + ret = flush_buf(i); + if (ret) + return ret; + + trans = bch2_trans_get(i->c); + ret = for_each_btree_key2(trans, iter, i->id, i->from, + BTREE_ITER_PREFETCH| + BTREE_ITER_ALL_SNAPSHOTS, k, ({ + bch2_bkey_val_to_text(&i->buf, i->c, k); + prt_newline(&i->buf); + drop_locks_do(trans, flush_buf(i)); + })); + i->from = iter.pos; + + bch2_trans_put(trans); + + if (!ret) + ret = flush_buf(i); + + return ret ?: i->ret; +} + +static const struct file_operations btree_debug_ops = { + .owner = THIS_MODULE, + .open = bch2_dump_open, + .release = bch2_dump_release, + .read = bch2_read_btree, +}; + +static ssize_t bch2_read_btree_formats(struct file *file, char __user *buf, + size_t size, loff_t *ppos) +{ + struct dump_iter *i = file->private_data; + struct btree_trans *trans; + struct btree_iter iter; + struct btree *b; + ssize_t ret; + + i->ubuf = buf; + i->size = size; + i->ret = 0; + + ret = flush_buf(i); + if (ret) + return ret; + + if (bpos_eq(SPOS_MAX, i->from)) + return i->ret; + + trans = bch2_trans_get(i->c); +retry: + bch2_trans_begin(trans); + + for_each_btree_node(trans, iter, i->id, i->from, 0, b, ret) { + bch2_btree_node_to_text(&i->buf, i->c, b); + i->from = !bpos_eq(SPOS_MAX, b->key.k.p) + ? bpos_successor(b->key.k.p) + : b->key.k.p; + + ret = drop_locks_do(trans, flush_buf(i)); + if (ret) + break; + } + bch2_trans_iter_exit(trans, &iter); + + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + goto retry; + + bch2_trans_put(trans); + + if (!ret) + ret = flush_buf(i); + + return ret ?: i->ret; +} + +static const struct file_operations btree_format_debug_ops = { + .owner = THIS_MODULE, + .open = bch2_dump_open, + .release = bch2_dump_release, + .read = bch2_read_btree_formats, +}; + +static ssize_t bch2_read_bfloat_failed(struct file *file, char __user *buf, + size_t size, loff_t *ppos) +{ + struct dump_iter *i = file->private_data; + struct btree_trans *trans; + struct btree_iter iter; + struct bkey_s_c k; + ssize_t ret; + + i->ubuf = buf; + i->size = size; + i->ret = 0; + + ret = flush_buf(i); + if (ret) + return ret; + + trans = bch2_trans_get(i->c); + + ret = for_each_btree_key2(trans, iter, i->id, i->from, + BTREE_ITER_PREFETCH| + BTREE_ITER_ALL_SNAPSHOTS, k, ({ + struct btree_path_level *l = &iter.path->l[0]; + struct bkey_packed *_k = + bch2_btree_node_iter_peek(&l->iter, l->b); + + if (bpos_gt(l->b->key.k.p, i->prev_node)) { + bch2_btree_node_to_text(&i->buf, i->c, l->b); + i->prev_node = l->b->key.k.p; + } + + bch2_bfloat_to_text(&i->buf, l->b, _k); + drop_locks_do(trans, flush_buf(i)); + })); + i->from = iter.pos; + + bch2_trans_put(trans); + + if (!ret) + ret = flush_buf(i); + + return ret ?: i->ret; +} + +static const struct file_operations bfloat_failed_debug_ops = { + .owner = THIS_MODULE, + .open = bch2_dump_open, + .release = bch2_dump_release, + .read = bch2_read_bfloat_failed, +}; + +static void bch2_cached_btree_node_to_text(struct printbuf *out, struct bch_fs *c, + struct btree *b) +{ + if (!out->nr_tabstops) + printbuf_tabstop_push(out, 32); + + prt_printf(out, "%px btree=%s l=%u ", + b, + bch2_btree_id_str(b->c.btree_id), + b->c.level); + prt_newline(out); + + printbuf_indent_add(out, 2); + + bch2_bkey_val_to_text(out, c, bkey_i_to_s_c(&b->key)); + prt_newline(out); + + prt_printf(out, "flags: "); + prt_tab(out); + prt_bitflags(out, bch2_btree_node_flags, b->flags); + prt_newline(out); + + prt_printf(out, "pcpu read locks: "); + prt_tab(out); + prt_printf(out, "%u", b->c.lock.readers != NULL); + prt_newline(out); + + prt_printf(out, "written:"); + prt_tab(out); + prt_printf(out, "%u", b->written); + prt_newline(out); + + prt_printf(out, "writes blocked:"); + prt_tab(out); + prt_printf(out, "%u", !list_empty_careful(&b->write_blocked)); + prt_newline(out); + + prt_printf(out, "will make reachable:"); + prt_tab(out); + prt_printf(out, "%lx", b->will_make_reachable); + prt_newline(out); + + prt_printf(out, "journal pin %px:", &b->writes[0].journal); + prt_tab(out); + prt_printf(out, "%llu", b->writes[0].journal.seq); + prt_newline(out); + + prt_printf(out, "journal pin %px:", &b->writes[1].journal); + prt_tab(out); + prt_printf(out, "%llu", b->writes[1].journal.seq); + prt_newline(out); + + printbuf_indent_sub(out, 2); +} + +static ssize_t bch2_cached_btree_nodes_read(struct file *file, char __user *buf, + size_t size, loff_t *ppos) +{ + struct dump_iter *i = file->private_data; + struct bch_fs *c = i->c; + bool done = false; + ssize_t ret = 0; + + i->ubuf = buf; + i->size = size; + i->ret = 0; + + do { + struct bucket_table *tbl; + struct rhash_head *pos; + struct btree *b; + + ret = flush_buf(i); + if (ret) + return ret; + + rcu_read_lock(); + i->buf.atomic++; + tbl = rht_dereference_rcu(c->btree_cache.table.tbl, + &c->btree_cache.table); + if (i->iter < tbl->size) { + rht_for_each_entry_rcu(b, pos, tbl, i->iter, hash) + bch2_cached_btree_node_to_text(&i->buf, c, b); + i->iter++; + } else { + done = true; + } + --i->buf.atomic; + rcu_read_unlock(); + } while (!done); + + if (i->buf.allocation_failure) + ret = -ENOMEM; + + if (!ret) + ret = flush_buf(i); + + return ret ?: i->ret; +} + +static const struct file_operations cached_btree_nodes_ops = { + .owner = THIS_MODULE, + .open = bch2_dump_open, + .release = bch2_dump_release, + .read = bch2_cached_btree_nodes_read, +}; + +#ifdef CONFIG_BCACHEFS_DEBUG_TRANSACTIONS +static ssize_t bch2_btree_transactions_read(struct file *file, char __user *buf, + size_t size, loff_t *ppos) +{ + struct dump_iter *i = file->private_data; + struct bch_fs *c = i->c; + struct btree_trans *trans; + ssize_t ret = 0; + u32 seq; + + i->ubuf = buf; + i->size = size; + i->ret = 0; +restart: + seqmutex_lock(&c->btree_trans_lock); + list_for_each_entry(trans, &c->btree_trans_list, list) { + if (trans->locking_wait.task->pid <= i->iter) + continue; + + closure_get(&trans->ref); + seq = seqmutex_seq(&c->btree_trans_lock); + seqmutex_unlock(&c->btree_trans_lock); + + ret = flush_buf(i); + if (ret) { + closure_put(&trans->ref); + goto unlocked; + } + + bch2_btree_trans_to_text(&i->buf, trans); + + prt_printf(&i->buf, "backtrace:"); + prt_newline(&i->buf); + printbuf_indent_add(&i->buf, 2); + bch2_prt_task_backtrace(&i->buf, trans->locking_wait.task); + printbuf_indent_sub(&i->buf, 2); + prt_newline(&i->buf); + + i->iter = trans->locking_wait.task->pid; + + closure_put(&trans->ref); + + if (!seqmutex_relock(&c->btree_trans_lock, seq)) + goto restart; + } + seqmutex_unlock(&c->btree_trans_lock); +unlocked: + if (i->buf.allocation_failure) + ret = -ENOMEM; + + if (!ret) + ret = flush_buf(i); + + return ret ?: i->ret; +} + +static const struct file_operations btree_transactions_ops = { + .owner = THIS_MODULE, + .open = bch2_dump_open, + .release = bch2_dump_release, + .read = bch2_btree_transactions_read, +}; +#endif /* CONFIG_BCACHEFS_DEBUG_TRANSACTIONS */ + +static ssize_t bch2_journal_pins_read(struct file *file, char __user *buf, + size_t size, loff_t *ppos) +{ + struct dump_iter *i = file->private_data; + struct bch_fs *c = i->c; + bool done = false; + int err; + + i->ubuf = buf; + i->size = size; + i->ret = 0; + + do { + err = flush_buf(i); + if (err) + return err; + + if (!i->size) + break; + + done = bch2_journal_seq_pins_to_text(&i->buf, &c->journal, &i->iter); + i->iter++; + } while (!done); + + if (i->buf.allocation_failure) + return -ENOMEM; + + return i->ret; +} + +static const struct file_operations journal_pins_ops = { + .owner = THIS_MODULE, + .open = bch2_dump_open, + .release = bch2_dump_release, + .read = bch2_journal_pins_read, +}; + +static int lock_held_stats_open(struct inode *inode, struct file *file) +{ + struct bch_fs *c = inode->i_private; + struct dump_iter *i; + + i = kzalloc(sizeof(struct dump_iter), GFP_KERNEL); + + if (!i) + return -ENOMEM; + + i->iter = 0; + i->c = c; + i->buf = PRINTBUF; + file->private_data = i; + + return 0; +} + +static int lock_held_stats_release(struct inode *inode, struct file *file) +{ + struct dump_iter *i = file->private_data; + + printbuf_exit(&i->buf); + kfree(i); + + return 0; +} + +static ssize_t lock_held_stats_read(struct file *file, char __user *buf, + size_t size, loff_t *ppos) +{ + struct dump_iter *i = file->private_data; + struct bch_fs *c = i->c; + int err; + + i->ubuf = buf; + i->size = size; + i->ret = 0; + + while (1) { + struct btree_transaction_stats *s = &c->btree_transaction_stats[i->iter]; + + err = flush_buf(i); + if (err) + return err; + + if (!i->size) + break; + + if (i->iter == ARRAY_SIZE(bch2_btree_transaction_fns) || + !bch2_btree_transaction_fns[i->iter]) + break; + + prt_printf(&i->buf, "%s: ", bch2_btree_transaction_fns[i->iter]); + prt_newline(&i->buf); + printbuf_indent_add(&i->buf, 2); + + mutex_lock(&s->lock); + + prt_printf(&i->buf, "Max mem used: %u", s->max_mem); + prt_newline(&i->buf); + + if (IS_ENABLED(CONFIG_BCACHEFS_LOCK_TIME_STATS)) { + prt_printf(&i->buf, "Lock hold times:"); + prt_newline(&i->buf); + + printbuf_indent_add(&i->buf, 2); + bch2_time_stats_to_text(&i->buf, &s->lock_hold_times); + printbuf_indent_sub(&i->buf, 2); + } + + if (s->max_paths_text) { + prt_printf(&i->buf, "Maximum allocated btree paths (%u):", s->nr_max_paths); + prt_newline(&i->buf); + + printbuf_indent_add(&i->buf, 2); + prt_str_indented(&i->buf, s->max_paths_text); + printbuf_indent_sub(&i->buf, 2); + } + + mutex_unlock(&s->lock); + + printbuf_indent_sub(&i->buf, 2); + prt_newline(&i->buf); + i->iter++; + } + + if (i->buf.allocation_failure) + return -ENOMEM; + + return i->ret; +} + +static const struct file_operations lock_held_stats_op = { + .owner = THIS_MODULE, + .open = lock_held_stats_open, + .release = lock_held_stats_release, + .read = lock_held_stats_read, +}; + +static ssize_t bch2_btree_deadlock_read(struct file *file, char __user *buf, + size_t size, loff_t *ppos) +{ + struct dump_iter *i = file->private_data; + struct bch_fs *c = i->c; + struct btree_trans *trans; + ssize_t ret = 0; + u32 seq; + + i->ubuf = buf; + i->size = size; + i->ret = 0; + + if (i->iter) + goto out; +restart: + seqmutex_lock(&c->btree_trans_lock); + list_for_each_entry(trans, &c->btree_trans_list, list) { + if (trans->locking_wait.task->pid <= i->iter) + continue; + + closure_get(&trans->ref); + seq = seqmutex_seq(&c->btree_trans_lock); + seqmutex_unlock(&c->btree_trans_lock); + + ret = flush_buf(i); + if (ret) { + closure_put(&trans->ref); + goto out; + } + + bch2_check_for_deadlock(trans, &i->buf); + + i->iter = trans->locking_wait.task->pid; + + closure_put(&trans->ref); + + if (!seqmutex_relock(&c->btree_trans_lock, seq)) + goto restart; + } + seqmutex_unlock(&c->btree_trans_lock); +out: + if (i->buf.allocation_failure) + ret = -ENOMEM; + + if (!ret) + ret = flush_buf(i); + + return ret ?: i->ret; +} + +static const struct file_operations btree_deadlock_ops = { + .owner = THIS_MODULE, + .open = bch2_dump_open, + .release = bch2_dump_release, + .read = bch2_btree_deadlock_read, +}; + +void bch2_fs_debug_exit(struct bch_fs *c) +{ + if (!IS_ERR_OR_NULL(c->fs_debug_dir)) + debugfs_remove_recursive(c->fs_debug_dir); +} + +void bch2_fs_debug_init(struct bch_fs *c) +{ + struct btree_debug *bd; + char name[100]; + + if (IS_ERR_OR_NULL(bch_debug)) + return; + + snprintf(name, sizeof(name), "%pU", c->sb.user_uuid.b); + c->fs_debug_dir = debugfs_create_dir(name, bch_debug); + if (IS_ERR_OR_NULL(c->fs_debug_dir)) + return; + + debugfs_create_file("cached_btree_nodes", 0400, c->fs_debug_dir, + c->btree_debug, &cached_btree_nodes_ops); + +#ifdef CONFIG_BCACHEFS_DEBUG_TRANSACTIONS + debugfs_create_file("btree_transactions", 0400, c->fs_debug_dir, + c->btree_debug, &btree_transactions_ops); +#endif + + debugfs_create_file("journal_pins", 0400, c->fs_debug_dir, + c->btree_debug, &journal_pins_ops); + + debugfs_create_file("btree_transaction_stats", 0400, c->fs_debug_dir, + c, &lock_held_stats_op); + + debugfs_create_file("btree_deadlock", 0400, c->fs_debug_dir, + c->btree_debug, &btree_deadlock_ops); + + c->btree_debug_dir = debugfs_create_dir("btrees", c->fs_debug_dir); + if (IS_ERR_OR_NULL(c->btree_debug_dir)) + return; + + for (bd = c->btree_debug; + bd < c->btree_debug + ARRAY_SIZE(c->btree_debug); + bd++) { + bd->id = bd - c->btree_debug; + debugfs_create_file(bch2_btree_id_str(bd->id), + 0400, c->btree_debug_dir, bd, + &btree_debug_ops); + + snprintf(name, sizeof(name), "%s-formats", + bch2_btree_id_str(bd->id)); + + debugfs_create_file(name, 0400, c->btree_debug_dir, bd, + &btree_format_debug_ops); + + snprintf(name, sizeof(name), "%s-bfloat-failed", + bch2_btree_id_str(bd->id)); + + debugfs_create_file(name, 0400, c->btree_debug_dir, bd, + &bfloat_failed_debug_ops); + } +} + +#endif + +void bch2_debug_exit(void) +{ + if (!IS_ERR_OR_NULL(bch_debug)) + debugfs_remove_recursive(bch_debug); +} + +int __init bch2_debug_init(void) +{ + int ret = 0; + + bch_debug = debugfs_create_dir("bcachefs", NULL); + return ret; +} diff --git a/fs/bcachefs/debug.h b/fs/bcachefs/debug.h new file mode 100644 index 000000000000..2c37143b5fd1 --- /dev/null +++ b/fs/bcachefs/debug.h @@ -0,0 +1,32 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_DEBUG_H +#define _BCACHEFS_DEBUG_H + +#include "bcachefs.h" + +struct bio; +struct btree; +struct bch_fs; + +void __bch2_btree_verify(struct bch_fs *, struct btree *); +void bch2_btree_node_ondisk_to_text(struct printbuf *, struct bch_fs *, + const struct btree *); + +static inline void bch2_btree_verify(struct bch_fs *c, struct btree *b) +{ + if (bch2_verify_btree_ondisk) + __bch2_btree_verify(c, b); +} + +#ifdef CONFIG_DEBUG_FS +void bch2_fs_debug_exit(struct bch_fs *); +void bch2_fs_debug_init(struct bch_fs *); +#else +static inline void bch2_fs_debug_exit(struct bch_fs *c) {} +static inline void bch2_fs_debug_init(struct bch_fs *c) {} +#endif + +void bch2_debug_exit(void); +int bch2_debug_init(void); + +#endif /* _BCACHEFS_DEBUG_H */ diff --git a/fs/bcachefs/dirent.c b/fs/bcachefs/dirent.c new file mode 100644 index 000000000000..2bfff0da7000 --- /dev/null +++ b/fs/bcachefs/dirent.c @@ -0,0 +1,580 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" +#include "bkey_buf.h" +#include "bkey_methods.h" +#include "btree_update.h" +#include "extents.h" +#include "dirent.h" +#include "fs.h" +#include "keylist.h" +#include "str_hash.h" +#include "subvolume.h" + +#include <linux/dcache.h> + +static unsigned bch2_dirent_name_bytes(struct bkey_s_c_dirent d) +{ + unsigned bkey_u64s = bkey_val_u64s(d.k); + unsigned bkey_bytes = bkey_u64s * sizeof(u64); + u64 last_u64 = ((u64*)d.v)[bkey_u64s - 1]; +#if CPU_BIG_ENDIAN + unsigned trailing_nuls = last_u64 ? __builtin_ctzll(last_u64) / 8 : 64 / 8; +#else + unsigned trailing_nuls = last_u64 ? __builtin_clzll(last_u64) / 8 : 64 / 8; +#endif + + return bkey_bytes - + offsetof(struct bch_dirent, d_name) - + trailing_nuls; +} + +struct qstr bch2_dirent_get_name(struct bkey_s_c_dirent d) +{ + return (struct qstr) QSTR_INIT(d.v->d_name, bch2_dirent_name_bytes(d)); +} + +static u64 bch2_dirent_hash(const struct bch_hash_info *info, + const struct qstr *name) +{ + struct bch_str_hash_ctx ctx; + + bch2_str_hash_init(&ctx, info); + bch2_str_hash_update(&ctx, info, name->name, name->len); + + /* [0,2) reserved for dots */ + return max_t(u64, bch2_str_hash_end(&ctx, info), 2); +} + +static u64 dirent_hash_key(const struct bch_hash_info *info, const void *key) +{ + return bch2_dirent_hash(info, key); +} + +static u64 dirent_hash_bkey(const struct bch_hash_info *info, struct bkey_s_c k) +{ + struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k); + struct qstr name = bch2_dirent_get_name(d); + + return bch2_dirent_hash(info, &name); +} + +static bool dirent_cmp_key(struct bkey_s_c _l, const void *_r) +{ + struct bkey_s_c_dirent l = bkey_s_c_to_dirent(_l); + const struct qstr l_name = bch2_dirent_get_name(l); + const struct qstr *r_name = _r; + + return l_name.len - r_name->len ?: memcmp(l_name.name, r_name->name, l_name.len); +} + +static bool dirent_cmp_bkey(struct bkey_s_c _l, struct bkey_s_c _r) +{ + struct bkey_s_c_dirent l = bkey_s_c_to_dirent(_l); + struct bkey_s_c_dirent r = bkey_s_c_to_dirent(_r); + const struct qstr l_name = bch2_dirent_get_name(l); + const struct qstr r_name = bch2_dirent_get_name(r); + + return l_name.len - r_name.len ?: memcmp(l_name.name, r_name.name, l_name.len); +} + +static bool dirent_is_visible(subvol_inum inum, struct bkey_s_c k) +{ + struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k); + + if (d.v->d_type == DT_SUBVOL) + return le32_to_cpu(d.v->d_parent_subvol) == inum.subvol; + return true; +} + +const struct bch_hash_desc bch2_dirent_hash_desc = { + .btree_id = BTREE_ID_dirents, + .key_type = KEY_TYPE_dirent, + .hash_key = dirent_hash_key, + .hash_bkey = dirent_hash_bkey, + .cmp_key = dirent_cmp_key, + .cmp_bkey = dirent_cmp_bkey, + .is_visible = dirent_is_visible, +}; + +int bch2_dirent_invalid(struct bch_fs *c, struct bkey_s_c k, + enum bkey_invalid_flags flags, + struct printbuf *err) +{ + struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k); + struct qstr d_name = bch2_dirent_get_name(d); + int ret = 0; + + bkey_fsck_err_on(!d_name.len, c, err, + dirent_empty_name, + "empty name"); + + bkey_fsck_err_on(bkey_val_u64s(k.k) > dirent_val_u64s(d_name.len), c, err, + dirent_val_too_big, + "value too big (%zu > %u)", + bkey_val_u64s(k.k), dirent_val_u64s(d_name.len)); + + /* + * Check new keys don't exceed the max length + * (older keys may be larger.) + */ + bkey_fsck_err_on((flags & BKEY_INVALID_COMMIT) && d_name.len > BCH_NAME_MAX, c, err, + dirent_name_too_long, + "dirent name too big (%u > %u)", + d_name.len, BCH_NAME_MAX); + + bkey_fsck_err_on(d_name.len != strnlen(d_name.name, d_name.len), c, err, + dirent_name_embedded_nul, + "dirent has stray data after name's NUL"); + + bkey_fsck_err_on((d_name.len == 1 && !memcmp(d_name.name, ".", 1)) || + (d_name.len == 2 && !memcmp(d_name.name, "..", 2)), c, err, + dirent_name_dot_or_dotdot, + "invalid name"); + + bkey_fsck_err_on(memchr(d_name.name, '/', d_name.len), c, err, + dirent_name_has_slash, + "name with /"); + + bkey_fsck_err_on(d.v->d_type != DT_SUBVOL && + le64_to_cpu(d.v->d_inum) == d.k->p.inode, c, err, + dirent_to_itself, + "dirent points to own directory"); +fsck_err: + return ret; +} + +void bch2_dirent_to_text(struct printbuf *out, struct bch_fs *c, + struct bkey_s_c k) +{ + struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k); + struct qstr d_name = bch2_dirent_get_name(d); + + prt_printf(out, "%.*s -> %llu type %s", + d_name.len, + d_name.name, + d.v->d_type != DT_SUBVOL + ? le64_to_cpu(d.v->d_inum) + : le32_to_cpu(d.v->d_child_subvol), + bch2_d_type_str(d.v->d_type)); +} + +static struct bkey_i_dirent *dirent_create_key(struct btree_trans *trans, + subvol_inum dir, u8 type, + const struct qstr *name, u64 dst) +{ + struct bkey_i_dirent *dirent; + unsigned u64s = BKEY_U64s + dirent_val_u64s(name->len); + + if (name->len > BCH_NAME_MAX) + return ERR_PTR(-ENAMETOOLONG); + + BUG_ON(u64s > U8_MAX); + + dirent = bch2_trans_kmalloc(trans, u64s * sizeof(u64)); + if (IS_ERR(dirent)) + return dirent; + + bkey_dirent_init(&dirent->k_i); + dirent->k.u64s = u64s; + + if (type != DT_SUBVOL) { + dirent->v.d_inum = cpu_to_le64(dst); + } else { + dirent->v.d_parent_subvol = cpu_to_le32(dir.subvol); + dirent->v.d_child_subvol = cpu_to_le32(dst); + } + + dirent->v.d_type = type; + + memcpy(dirent->v.d_name, name->name, name->len); + memset(dirent->v.d_name + name->len, 0, + bkey_val_bytes(&dirent->k) - + offsetof(struct bch_dirent, d_name) - + name->len); + + EBUG_ON(bch2_dirent_name_bytes(dirent_i_to_s_c(dirent)) != name->len); + + return dirent; +} + +int bch2_dirent_create(struct btree_trans *trans, subvol_inum dir, + const struct bch_hash_info *hash_info, + u8 type, const struct qstr *name, u64 dst_inum, + u64 *dir_offset, int flags) +{ + struct bkey_i_dirent *dirent; + int ret; + + dirent = dirent_create_key(trans, dir, type, name, dst_inum); + ret = PTR_ERR_OR_ZERO(dirent); + if (ret) + return ret; + + ret = bch2_hash_set(trans, bch2_dirent_hash_desc, hash_info, + dir, &dirent->k_i, flags); + *dir_offset = dirent->k.p.offset; + + return ret; +} + +static void dirent_copy_target(struct bkey_i_dirent *dst, + struct bkey_s_c_dirent src) +{ + dst->v.d_inum = src.v->d_inum; + dst->v.d_type = src.v->d_type; +} + +int bch2_dirent_read_target(struct btree_trans *trans, subvol_inum dir, + struct bkey_s_c_dirent d, subvol_inum *target) +{ + struct bch_subvolume s; + int ret = 0; + + if (d.v->d_type == DT_SUBVOL && + le32_to_cpu(d.v->d_parent_subvol) != dir.subvol) + return 1; + + if (likely(d.v->d_type != DT_SUBVOL)) { + target->subvol = dir.subvol; + target->inum = le64_to_cpu(d.v->d_inum); + } else { + target->subvol = le32_to_cpu(d.v->d_child_subvol); + + ret = bch2_subvolume_get(trans, target->subvol, true, BTREE_ITER_CACHED, &s); + + target->inum = le64_to_cpu(s.inode); + } + + return ret; +} + +int bch2_dirent_rename(struct btree_trans *trans, + subvol_inum src_dir, struct bch_hash_info *src_hash, + subvol_inum dst_dir, struct bch_hash_info *dst_hash, + const struct qstr *src_name, subvol_inum *src_inum, u64 *src_offset, + const struct qstr *dst_name, subvol_inum *dst_inum, u64 *dst_offset, + enum bch_rename_mode mode) +{ + struct btree_iter src_iter = { NULL }; + struct btree_iter dst_iter = { NULL }; + struct bkey_s_c old_src, old_dst = bkey_s_c_null; + struct bkey_i_dirent *new_src = NULL, *new_dst = NULL; + struct bpos dst_pos = + POS(dst_dir.inum, bch2_dirent_hash(dst_hash, dst_name)); + unsigned src_type = 0, dst_type = 0, src_update_flags = 0; + int ret = 0; + + if (src_dir.subvol != dst_dir.subvol) + return -EXDEV; + + memset(src_inum, 0, sizeof(*src_inum)); + memset(dst_inum, 0, sizeof(*dst_inum)); + + /* Lookup src: */ + ret = bch2_hash_lookup(trans, &src_iter, bch2_dirent_hash_desc, + src_hash, src_dir, src_name, + BTREE_ITER_INTENT); + if (ret) + goto out; + + old_src = bch2_btree_iter_peek_slot(&src_iter); + ret = bkey_err(old_src); + if (ret) + goto out; + + ret = bch2_dirent_read_target(trans, src_dir, + bkey_s_c_to_dirent(old_src), src_inum); + if (ret) + goto out; + + src_type = bkey_s_c_to_dirent(old_src).v->d_type; + + if (src_type == DT_SUBVOL && mode == BCH_RENAME_EXCHANGE) + return -EOPNOTSUPP; + + + /* Lookup dst: */ + if (mode == BCH_RENAME) { + /* + * Note that we're _not_ checking if the target already exists - + * we're relying on the VFS to do that check for us for + * correctness: + */ + ret = bch2_hash_hole(trans, &dst_iter, bch2_dirent_hash_desc, + dst_hash, dst_dir, dst_name); + if (ret) + goto out; + } else { + ret = bch2_hash_lookup(trans, &dst_iter, bch2_dirent_hash_desc, + dst_hash, dst_dir, dst_name, + BTREE_ITER_INTENT); + if (ret) + goto out; + + old_dst = bch2_btree_iter_peek_slot(&dst_iter); + ret = bkey_err(old_dst); + if (ret) + goto out; + + ret = bch2_dirent_read_target(trans, dst_dir, + bkey_s_c_to_dirent(old_dst), dst_inum); + if (ret) + goto out; + + dst_type = bkey_s_c_to_dirent(old_dst).v->d_type; + + if (dst_type == DT_SUBVOL) + return -EOPNOTSUPP; + } + + if (mode != BCH_RENAME_EXCHANGE) + *src_offset = dst_iter.pos.offset; + + /* Create new dst key: */ + new_dst = dirent_create_key(trans, dst_dir, 0, dst_name, 0); + ret = PTR_ERR_OR_ZERO(new_dst); + if (ret) + goto out; + + dirent_copy_target(new_dst, bkey_s_c_to_dirent(old_src)); + new_dst->k.p = dst_iter.pos; + + /* Create new src key: */ + if (mode == BCH_RENAME_EXCHANGE) { + new_src = dirent_create_key(trans, src_dir, 0, src_name, 0); + ret = PTR_ERR_OR_ZERO(new_src); + if (ret) + goto out; + + dirent_copy_target(new_src, bkey_s_c_to_dirent(old_dst)); + new_src->k.p = src_iter.pos; + } else { + new_src = bch2_trans_kmalloc(trans, sizeof(struct bkey_i)); + ret = PTR_ERR_OR_ZERO(new_src); + if (ret) + goto out; + + bkey_init(&new_src->k); + new_src->k.p = src_iter.pos; + + if (bkey_le(dst_pos, src_iter.pos) && + bkey_lt(src_iter.pos, dst_iter.pos)) { + /* + * We have a hash collision for the new dst key, + * and new_src - the key we're deleting - is between + * new_dst's hashed slot and the slot we're going to be + * inserting it into - oops. This will break the hash + * table if we don't deal with it: + */ + if (mode == BCH_RENAME) { + /* + * If we're not overwriting, we can just insert + * new_dst at the src position: + */ + new_src = new_dst; + new_src->k.p = src_iter.pos; + goto out_set_src; + } else { + /* If we're overwriting, we can't insert new_dst + * at a different slot because it has to + * overwrite old_dst - just make sure to use a + * whiteout when deleting src: + */ + new_src->k.type = KEY_TYPE_hash_whiteout; + } + } else { + /* Check if we need a whiteout to delete src: */ + ret = bch2_hash_needs_whiteout(trans, bch2_dirent_hash_desc, + src_hash, &src_iter); + if (ret < 0) + goto out; + + if (ret) + new_src->k.type = KEY_TYPE_hash_whiteout; + } + } + + ret = bch2_trans_update(trans, &dst_iter, &new_dst->k_i, 0); + if (ret) + goto out; +out_set_src: + + /* + * If we're deleting a subvolume, we need to really delete the dirent, + * not just emit a whiteout in the current snapshot: + */ + if (src_type == DT_SUBVOL) { + bch2_btree_iter_set_snapshot(&src_iter, old_src.k->p.snapshot); + ret = bch2_btree_iter_traverse(&src_iter); + if (ret) + goto out; + + new_src->k.p = src_iter.pos; + src_update_flags |= BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE; + } + + ret = bch2_trans_update(trans, &src_iter, &new_src->k_i, src_update_flags); + if (ret) + goto out; + + if (mode == BCH_RENAME_EXCHANGE) + *src_offset = new_src->k.p.offset; + *dst_offset = new_dst->k.p.offset; +out: + bch2_trans_iter_exit(trans, &src_iter); + bch2_trans_iter_exit(trans, &dst_iter); + return ret; +} + +int __bch2_dirent_lookup_trans(struct btree_trans *trans, + struct btree_iter *iter, + subvol_inum dir, + const struct bch_hash_info *hash_info, + const struct qstr *name, subvol_inum *inum, + unsigned flags) +{ + struct bkey_s_c k; + struct bkey_s_c_dirent d; + u32 snapshot; + int ret; + + ret = bch2_subvolume_get_snapshot(trans, dir.subvol, &snapshot); + if (ret) + return ret; + + ret = bch2_hash_lookup(trans, iter, bch2_dirent_hash_desc, + hash_info, dir, name, flags); + if (ret) + return ret; + + k = bch2_btree_iter_peek_slot(iter); + ret = bkey_err(k); + if (ret) + goto err; + + d = bkey_s_c_to_dirent(k); + + ret = bch2_dirent_read_target(trans, dir, d, inum); + if (ret > 0) + ret = -ENOENT; +err: + if (ret) + bch2_trans_iter_exit(trans, iter); + + return ret; +} + +u64 bch2_dirent_lookup(struct bch_fs *c, subvol_inum dir, + const struct bch_hash_info *hash_info, + const struct qstr *name, subvol_inum *inum) +{ + struct btree_trans *trans = bch2_trans_get(c); + struct btree_iter iter; + int ret; +retry: + bch2_trans_begin(trans); + + ret = __bch2_dirent_lookup_trans(trans, &iter, dir, hash_info, + name, inum, 0); + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + goto retry; + if (!ret) + bch2_trans_iter_exit(trans, &iter); + bch2_trans_put(trans); + return ret; +} + +int bch2_empty_dir_snapshot(struct btree_trans *trans, u64 dir, u32 snapshot) +{ + struct btree_iter iter; + struct bkey_s_c k; + int ret; + + for_each_btree_key_upto_norestart(trans, iter, BTREE_ID_dirents, + SPOS(dir, 0, snapshot), + POS(dir, U64_MAX), 0, k, ret) + if (k.k->type == KEY_TYPE_dirent) { + ret = -ENOTEMPTY; + break; + } + bch2_trans_iter_exit(trans, &iter); + + return ret; +} + +int bch2_empty_dir_trans(struct btree_trans *trans, subvol_inum dir) +{ + u32 snapshot; + + return bch2_subvolume_get_snapshot(trans, dir.subvol, &snapshot) ?: + bch2_empty_dir_snapshot(trans, dir.inum, snapshot); +} + +int bch2_readdir(struct bch_fs *c, subvol_inum inum, struct dir_context *ctx) +{ + struct btree_trans *trans = bch2_trans_get(c); + struct btree_iter iter; + struct bkey_s_c k; + struct bkey_s_c_dirent dirent; + subvol_inum target; + u32 snapshot; + struct bkey_buf sk; + struct qstr name; + int ret; + + bch2_bkey_buf_init(&sk); +retry: + bch2_trans_begin(trans); + + ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot); + if (ret) + goto err; + + for_each_btree_key_upto_norestart(trans, iter, BTREE_ID_dirents, + SPOS(inum.inum, ctx->pos, snapshot), + POS(inum.inum, U64_MAX), 0, k, ret) { + if (k.k->type != KEY_TYPE_dirent) + continue; + + dirent = bkey_s_c_to_dirent(k); + + ret = bch2_dirent_read_target(trans, inum, dirent, &target); + if (ret < 0) + break; + if (ret) + continue; + + /* dir_emit() can fault and block: */ + bch2_bkey_buf_reassemble(&sk, c, k); + dirent = bkey_i_to_s_c_dirent(sk.k); + bch2_trans_unlock(trans); + + name = bch2_dirent_get_name(dirent); + + ctx->pos = dirent.k->p.offset; + if (!dir_emit(ctx, name.name, + name.len, + target.inum, + vfs_d_type(dirent.v->d_type))) + break; + ctx->pos = dirent.k->p.offset + 1; + + /* + * read_target looks up subvolumes, we can overflow paths if the + * directory has many subvolumes in it + */ + ret = btree_trans_too_many_iters(trans); + if (ret) + break; + } + bch2_trans_iter_exit(trans, &iter); +err: + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + goto retry; + + bch2_trans_put(trans); + bch2_bkey_buf_exit(&sk, c); + + return ret; +} diff --git a/fs/bcachefs/dirent.h b/fs/bcachefs/dirent.h new file mode 100644 index 000000000000..1e3431990abd --- /dev/null +++ b/fs/bcachefs/dirent.h @@ -0,0 +1,71 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_DIRENT_H +#define _BCACHEFS_DIRENT_H + +#include "str_hash.h" + +enum bkey_invalid_flags; +extern const struct bch_hash_desc bch2_dirent_hash_desc; + +int bch2_dirent_invalid(struct bch_fs *, struct bkey_s_c, + enum bkey_invalid_flags, struct printbuf *); +void bch2_dirent_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); + +#define bch2_bkey_ops_dirent ((struct bkey_ops) { \ + .key_invalid = bch2_dirent_invalid, \ + .val_to_text = bch2_dirent_to_text, \ + .min_val_size = 16, \ +}) + +struct qstr; +struct file; +struct dir_context; +struct bch_fs; +struct bch_hash_info; +struct bch_inode_info; + +struct qstr bch2_dirent_get_name(struct bkey_s_c_dirent d); + +static inline unsigned dirent_val_u64s(unsigned len) +{ + return DIV_ROUND_UP(offsetof(struct bch_dirent, d_name) + len, + sizeof(u64)); +} + +int bch2_dirent_read_target(struct btree_trans *, subvol_inum, + struct bkey_s_c_dirent, subvol_inum *); + +int bch2_dirent_create(struct btree_trans *, subvol_inum, + const struct bch_hash_info *, u8, + const struct qstr *, u64, u64 *, int); + +static inline unsigned vfs_d_type(unsigned type) +{ + return type == DT_SUBVOL ? DT_DIR : type; +} + +enum bch_rename_mode { + BCH_RENAME, + BCH_RENAME_OVERWRITE, + BCH_RENAME_EXCHANGE, +}; + +int bch2_dirent_rename(struct btree_trans *, + subvol_inum, struct bch_hash_info *, + subvol_inum, struct bch_hash_info *, + const struct qstr *, subvol_inum *, u64 *, + const struct qstr *, subvol_inum *, u64 *, + enum bch_rename_mode); + +int __bch2_dirent_lookup_trans(struct btree_trans *, struct btree_iter *, + subvol_inum, const struct bch_hash_info *, + const struct qstr *, subvol_inum *, unsigned); +u64 bch2_dirent_lookup(struct bch_fs *, subvol_inum, + const struct bch_hash_info *, + const struct qstr *, subvol_inum *); + +int bch2_empty_dir_snapshot(struct btree_trans *, u64, u32); +int bch2_empty_dir_trans(struct btree_trans *, subvol_inum); +int bch2_readdir(struct bch_fs *, subvol_inum, struct dir_context *); + +#endif /* _BCACHEFS_DIRENT_H */ diff --git a/fs/bcachefs/disk_groups.c b/fs/bcachefs/disk_groups.c new file mode 100644 index 000000000000..4d0cb0ccff32 --- /dev/null +++ b/fs/bcachefs/disk_groups.c @@ -0,0 +1,622 @@ +// SPDX-License-Identifier: GPL-2.0 +#include "bcachefs.h" +#include "disk_groups.h" +#include "sb-members.h" +#include "super-io.h" + +#include <linux/sort.h> + +static int group_cmp(const void *_l, const void *_r) +{ + const struct bch_disk_group *l = _l; + const struct bch_disk_group *r = _r; + + return ((BCH_GROUP_DELETED(l) > BCH_GROUP_DELETED(r)) - + (BCH_GROUP_DELETED(l) < BCH_GROUP_DELETED(r))) ?: + ((BCH_GROUP_PARENT(l) > BCH_GROUP_PARENT(r)) - + (BCH_GROUP_PARENT(l) < BCH_GROUP_PARENT(r))) ?: + strncmp(l->label, r->label, sizeof(l->label)); +} + +static int bch2_sb_disk_groups_validate(struct bch_sb *sb, + struct bch_sb_field *f, + struct printbuf *err) +{ + struct bch_sb_field_disk_groups *groups = + field_to_type(f, disk_groups); + struct bch_disk_group *g, *sorted = NULL; + unsigned nr_groups = disk_groups_nr(groups); + unsigned i, len; + int ret = 0; + + for (i = 0; i < sb->nr_devices; i++) { + struct bch_member m = bch2_sb_member_get(sb, i); + unsigned group_id; + + if (!BCH_MEMBER_GROUP(&m)) + continue; + + group_id = BCH_MEMBER_GROUP(&m) - 1; + + if (group_id >= nr_groups) { + prt_printf(err, "disk %u has invalid label %u (have %u)", + i, group_id, nr_groups); + return -BCH_ERR_invalid_sb_disk_groups; + } + + if (BCH_GROUP_DELETED(&groups->entries[group_id])) { + prt_printf(err, "disk %u has deleted label %u", i, group_id); + return -BCH_ERR_invalid_sb_disk_groups; + } + } + + if (!nr_groups) + return 0; + + for (i = 0; i < nr_groups; i++) { + g = groups->entries + i; + + if (BCH_GROUP_DELETED(g)) + continue; + + len = strnlen(g->label, sizeof(g->label)); + if (!len) { + prt_printf(err, "label %u empty", i); + return -BCH_ERR_invalid_sb_disk_groups; + } + } + + sorted = kmalloc_array(nr_groups, sizeof(*sorted), GFP_KERNEL); + if (!sorted) + return -BCH_ERR_ENOMEM_disk_groups_validate; + + memcpy(sorted, groups->entries, nr_groups * sizeof(*sorted)); + sort(sorted, nr_groups, sizeof(*sorted), group_cmp, NULL); + + for (g = sorted; g + 1 < sorted + nr_groups; g++) + if (!BCH_GROUP_DELETED(g) && + !group_cmp(&g[0], &g[1])) { + prt_printf(err, "duplicate label %llu.%.*s", + BCH_GROUP_PARENT(g), + (int) sizeof(g->label), g->label); + ret = -BCH_ERR_invalid_sb_disk_groups; + goto err; + } +err: + kfree(sorted); + return ret; +} + +void bch2_disk_groups_to_text(struct printbuf *out, struct bch_fs *c) +{ + struct bch_disk_groups_cpu *g; + struct bch_dev *ca; + int i; + unsigned iter; + + out->atomic++; + rcu_read_lock(); + + g = rcu_dereference(c->disk_groups); + if (!g) + goto out; + + for (i = 0; i < g->nr; i++) { + if (i) + prt_printf(out, " "); + + if (g->entries[i].deleted) { + prt_printf(out, "[deleted]"); + continue; + } + + prt_printf(out, "[parent %d devs", g->entries[i].parent); + for_each_member_device_rcu(ca, c, iter, &g->entries[i].devs) + prt_printf(out, " %s", ca->name); + prt_printf(out, "]"); + } + +out: + rcu_read_unlock(); + out->atomic--; +} + +static void bch2_sb_disk_groups_to_text(struct printbuf *out, + struct bch_sb *sb, + struct bch_sb_field *f) +{ + struct bch_sb_field_disk_groups *groups = + field_to_type(f, disk_groups); + struct bch_disk_group *g; + unsigned nr_groups = disk_groups_nr(groups); + + for (g = groups->entries; + g < groups->entries + nr_groups; + g++) { + if (g != groups->entries) + prt_printf(out, " "); + + if (BCH_GROUP_DELETED(g)) + prt_printf(out, "[deleted]"); + else + prt_printf(out, "[parent %llu name %s]", + BCH_GROUP_PARENT(g), g->label); + } +} + +const struct bch_sb_field_ops bch_sb_field_ops_disk_groups = { + .validate = bch2_sb_disk_groups_validate, + .to_text = bch2_sb_disk_groups_to_text +}; + +int bch2_sb_disk_groups_to_cpu(struct bch_fs *c) +{ + struct bch_sb_field_disk_groups *groups; + struct bch_disk_groups_cpu *cpu_g, *old_g; + unsigned i, g, nr_groups; + + lockdep_assert_held(&c->sb_lock); + + groups = bch2_sb_field_get(c->disk_sb.sb, disk_groups); + nr_groups = disk_groups_nr(groups); + + if (!groups) + return 0; + + cpu_g = kzalloc(struct_size(cpu_g, entries, nr_groups), GFP_KERNEL); + if (!cpu_g) + return -BCH_ERR_ENOMEM_disk_groups_to_cpu; + + cpu_g->nr = nr_groups; + + for (i = 0; i < nr_groups; i++) { + struct bch_disk_group *src = &groups->entries[i]; + struct bch_disk_group_cpu *dst = &cpu_g->entries[i]; + + dst->deleted = BCH_GROUP_DELETED(src); + dst->parent = BCH_GROUP_PARENT(src); + memcpy(dst->label, src->label, sizeof(dst->label)); + } + + for (i = 0; i < c->disk_sb.sb->nr_devices; i++) { + struct bch_member m = bch2_sb_member_get(c->disk_sb.sb, i); + struct bch_disk_group_cpu *dst; + + if (!bch2_member_exists(&m)) + continue; + + g = BCH_MEMBER_GROUP(&m); + while (g) { + dst = &cpu_g->entries[g - 1]; + __set_bit(i, dst->devs.d); + g = dst->parent; + } + } + + old_g = rcu_dereference_protected(c->disk_groups, + lockdep_is_held(&c->sb_lock)); + rcu_assign_pointer(c->disk_groups, cpu_g); + if (old_g) + kfree_rcu(old_g, rcu); + + return 0; +} + +const struct bch_devs_mask *bch2_target_to_mask(struct bch_fs *c, unsigned target) +{ + struct target t = target_decode(target); + struct bch_devs_mask *devs; + + rcu_read_lock(); + + switch (t.type) { + case TARGET_NULL: + devs = NULL; + break; + case TARGET_DEV: { + struct bch_dev *ca = t.dev < c->sb.nr_devices + ? rcu_dereference(c->devs[t.dev]) + : NULL; + devs = ca ? &ca->self : NULL; + break; + } + case TARGET_GROUP: { + struct bch_disk_groups_cpu *g = rcu_dereference(c->disk_groups); + + devs = g && t.group < g->nr && !g->entries[t.group].deleted + ? &g->entries[t.group].devs + : NULL; + break; + } + default: + BUG(); + } + + rcu_read_unlock(); + + return devs; +} + +bool bch2_dev_in_target(struct bch_fs *c, unsigned dev, unsigned target) +{ + struct target t = target_decode(target); + + switch (t.type) { + case TARGET_NULL: + return false; + case TARGET_DEV: + return dev == t.dev; + case TARGET_GROUP: { + struct bch_disk_groups_cpu *g; + const struct bch_devs_mask *m; + bool ret; + + rcu_read_lock(); + g = rcu_dereference(c->disk_groups); + m = g && t.group < g->nr && !g->entries[t.group].deleted + ? &g->entries[t.group].devs + : NULL; + + ret = m ? test_bit(dev, m->d) : false; + rcu_read_unlock(); + + return ret; + } + default: + BUG(); + } +} + +static int __bch2_disk_group_find(struct bch_sb_field_disk_groups *groups, + unsigned parent, + const char *name, unsigned namelen) +{ + unsigned i, nr_groups = disk_groups_nr(groups); + + if (!namelen || namelen > BCH_SB_LABEL_SIZE) + return -EINVAL; + + for (i = 0; i < nr_groups; i++) { + struct bch_disk_group *g = groups->entries + i; + + if (BCH_GROUP_DELETED(g)) + continue; + + if (!BCH_GROUP_DELETED(g) && + BCH_GROUP_PARENT(g) == parent && + strnlen(g->label, sizeof(g->label)) == namelen && + !memcmp(name, g->label, namelen)) + return i; + } + + return -1; +} + +static int __bch2_disk_group_add(struct bch_sb_handle *sb, unsigned parent, + const char *name, unsigned namelen) +{ + struct bch_sb_field_disk_groups *groups = + bch2_sb_field_get(sb->sb, disk_groups); + unsigned i, nr_groups = disk_groups_nr(groups); + struct bch_disk_group *g; + + if (!namelen || namelen > BCH_SB_LABEL_SIZE) + return -EINVAL; + + for (i = 0; + i < nr_groups && !BCH_GROUP_DELETED(&groups->entries[i]); + i++) + ; + + if (i == nr_groups) { + unsigned u64s = + (sizeof(struct bch_sb_field_disk_groups) + + sizeof(struct bch_disk_group) * (nr_groups + 1)) / + sizeof(u64); + + groups = bch2_sb_field_resize(sb, disk_groups, u64s); + if (!groups) + return -BCH_ERR_ENOSPC_disk_label_add; + + nr_groups = disk_groups_nr(groups); + } + + BUG_ON(i >= nr_groups); + + g = &groups->entries[i]; + + memcpy(g->label, name, namelen); + if (namelen < sizeof(g->label)) + g->label[namelen] = '\0'; + SET_BCH_GROUP_DELETED(g, 0); + SET_BCH_GROUP_PARENT(g, parent); + SET_BCH_GROUP_DATA_ALLOWED(g, ~0); + + return i; +} + +int bch2_disk_path_find(struct bch_sb_handle *sb, const char *name) +{ + struct bch_sb_field_disk_groups *groups = + bch2_sb_field_get(sb->sb, disk_groups); + int v = -1; + + do { + const char *next = strchrnul(name, '.'); + unsigned len = next - name; + + if (*next == '.') + next++; + + v = __bch2_disk_group_find(groups, v + 1, name, len); + name = next; + } while (*name && v >= 0); + + return v; +} + +int bch2_disk_path_find_or_create(struct bch_sb_handle *sb, const char *name) +{ + struct bch_sb_field_disk_groups *groups; + unsigned parent = 0; + int v = -1; + + do { + const char *next = strchrnul(name, '.'); + unsigned len = next - name; + + if (*next == '.') + next++; + + groups = bch2_sb_field_get(sb->sb, disk_groups); + + v = __bch2_disk_group_find(groups, parent, name, len); + if (v < 0) + v = __bch2_disk_group_add(sb, parent, name, len); + if (v < 0) + return v; + + parent = v + 1; + name = next; + } while (*name && v >= 0); + + return v; +} + +void bch2_disk_path_to_text(struct printbuf *out, struct bch_fs *c, unsigned v) +{ + struct bch_disk_groups_cpu *groups; + struct bch_disk_group_cpu *g; + unsigned nr = 0; + u16 path[32]; + + out->atomic++; + rcu_read_lock(); + groups = rcu_dereference(c->disk_groups); + if (!groups) + goto invalid; + + while (1) { + if (nr == ARRAY_SIZE(path)) + goto invalid; + + if (v >= groups->nr) + goto invalid; + + g = groups->entries + v; + + if (g->deleted) + goto invalid; + + path[nr++] = v; + + if (!g->parent) + break; + + v = g->parent - 1; + } + + while (nr) { + v = path[--nr]; + g = groups->entries + v; + + prt_printf(out, "%.*s", (int) sizeof(g->label), g->label); + if (nr) + prt_printf(out, "."); + } +out: + rcu_read_unlock(); + out->atomic--; + return; +invalid: + prt_printf(out, "invalid label %u", v); + goto out; +} + +void bch2_disk_path_to_text_sb(struct printbuf *out, struct bch_sb *sb, unsigned v) +{ + struct bch_sb_field_disk_groups *groups = + bch2_sb_field_get(sb, disk_groups); + struct bch_disk_group *g; + unsigned nr = 0; + u16 path[32]; + + while (1) { + if (nr == ARRAY_SIZE(path)) + goto inval; + + if (v >= disk_groups_nr(groups)) + goto inval; + + g = groups->entries + v; + + if (BCH_GROUP_DELETED(g)) + goto inval; + + path[nr++] = v; + + if (!BCH_GROUP_PARENT(g)) + break; + + v = BCH_GROUP_PARENT(g) - 1; + } + + while (nr) { + v = path[--nr]; + g = groups->entries + v; + + prt_printf(out, "%.*s", (int) sizeof(g->label), g->label); + if (nr) + prt_printf(out, "."); + } + return; +inval: + prt_printf(out, "invalid label %u", v); +} + +int __bch2_dev_group_set(struct bch_fs *c, struct bch_dev *ca, const char *name) +{ + struct bch_member *mi; + int ret, v = -1; + + if (!strlen(name) || !strcmp(name, "none")) + return 0; + + v = bch2_disk_path_find_or_create(&c->disk_sb, name); + if (v < 0) + return v; + + ret = bch2_sb_disk_groups_to_cpu(c); + if (ret) + return ret; + + mi = bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx); + SET_BCH_MEMBER_GROUP(mi, v + 1); + return 0; +} + +int bch2_dev_group_set(struct bch_fs *c, struct bch_dev *ca, const char *name) +{ + int ret; + + mutex_lock(&c->sb_lock); + ret = __bch2_dev_group_set(c, ca, name) ?: + bch2_write_super(c); + mutex_unlock(&c->sb_lock); + + return ret; +} + +int bch2_opt_target_parse(struct bch_fs *c, const char *val, u64 *res, + struct printbuf *err) +{ + struct bch_dev *ca; + int g; + + if (!val) + return -EINVAL; + + if (!c) + return 0; + + if (!strlen(val) || !strcmp(val, "none")) { + *res = 0; + return 0; + } + + /* Is it a device? */ + ca = bch2_dev_lookup(c, val); + if (!IS_ERR(ca)) { + *res = dev_to_target(ca->dev_idx); + percpu_ref_put(&ca->ref); + return 0; + } + + mutex_lock(&c->sb_lock); + g = bch2_disk_path_find(&c->disk_sb, val); + mutex_unlock(&c->sb_lock); + + if (g >= 0) { + *res = group_to_target(g); + return 0; + } + + return -EINVAL; +} + +void bch2_target_to_text(struct printbuf *out, struct bch_fs *c, unsigned v) +{ + struct target t = target_decode(v); + + switch (t.type) { + case TARGET_NULL: + prt_printf(out, "none"); + break; + case TARGET_DEV: { + struct bch_dev *ca; + + out->atomic++; + rcu_read_lock(); + ca = t.dev < c->sb.nr_devices + ? rcu_dereference(c->devs[t.dev]) + : NULL; + + if (ca && percpu_ref_tryget(&ca->io_ref)) { + prt_printf(out, "/dev/%pg", ca->disk_sb.bdev); + percpu_ref_put(&ca->io_ref); + } else if (ca) { + prt_printf(out, "offline device %u", t.dev); + } else { + prt_printf(out, "invalid device %u", t.dev); + } + + rcu_read_unlock(); + out->atomic--; + break; + } + case TARGET_GROUP: + bch2_disk_path_to_text(out, c, t.group); + break; + default: + BUG(); + } +} + +static void bch2_target_to_text_sb(struct printbuf *out, struct bch_sb *sb, unsigned v) +{ + struct target t = target_decode(v); + + switch (t.type) { + case TARGET_NULL: + prt_printf(out, "none"); + break; + case TARGET_DEV: { + struct bch_member m = bch2_sb_member_get(sb, t.dev); + + if (bch2_dev_exists(sb, t.dev)) { + prt_printf(out, "Device "); + pr_uuid(out, m.uuid.b); + prt_printf(out, " (%u)", t.dev); + } else { + prt_printf(out, "Bad device %u", t.dev); + } + break; + } + case TARGET_GROUP: + bch2_disk_path_to_text_sb(out, sb, t.group); + break; + default: + BUG(); + } +} + +void bch2_opt_target_to_text(struct printbuf *out, + struct bch_fs *c, + struct bch_sb *sb, + u64 v) +{ + if (c) + bch2_target_to_text(out, c, v); + else + bch2_target_to_text_sb(out, sb, v); +} diff --git a/fs/bcachefs/disk_groups.h b/fs/bcachefs/disk_groups.h new file mode 100644 index 000000000000..441826fff224 --- /dev/null +++ b/fs/bcachefs/disk_groups.h @@ -0,0 +1,111 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_DISK_GROUPS_H +#define _BCACHEFS_DISK_GROUPS_H + +#include "disk_groups_types.h" + +extern const struct bch_sb_field_ops bch_sb_field_ops_disk_groups; + +static inline unsigned disk_groups_nr(struct bch_sb_field_disk_groups *groups) +{ + return groups + ? (vstruct_end(&groups->field) - + (void *) &groups->entries[0]) / sizeof(struct bch_disk_group) + : 0; +} + +struct target { + enum { + TARGET_NULL, + TARGET_DEV, + TARGET_GROUP, + } type; + union { + unsigned dev; + unsigned group; + }; +}; + +#define TARGET_DEV_START 1 +#define TARGET_GROUP_START (256 + TARGET_DEV_START) + +static inline u16 dev_to_target(unsigned dev) +{ + return TARGET_DEV_START + dev; +} + +static inline u16 group_to_target(unsigned group) +{ + return TARGET_GROUP_START + group; +} + +static inline struct target target_decode(unsigned target) +{ + if (target >= TARGET_GROUP_START) + return (struct target) { + .type = TARGET_GROUP, + .group = target - TARGET_GROUP_START + }; + + if (target >= TARGET_DEV_START) + return (struct target) { + .type = TARGET_DEV, + .group = target - TARGET_DEV_START + }; + + return (struct target) { .type = TARGET_NULL }; +} + +const struct bch_devs_mask *bch2_target_to_mask(struct bch_fs *, unsigned); + +static inline struct bch_devs_mask target_rw_devs(struct bch_fs *c, + enum bch_data_type data_type, + u16 target) +{ + struct bch_devs_mask devs = c->rw_devs[data_type]; + const struct bch_devs_mask *t = bch2_target_to_mask(c, target); + + if (t) + bitmap_and(devs.d, devs.d, t->d, BCH_SB_MEMBERS_MAX); + return devs; +} + +static inline bool bch2_target_accepts_data(struct bch_fs *c, + enum bch_data_type data_type, + u16 target) +{ + struct bch_devs_mask rw_devs = target_rw_devs(c, data_type, target); + return !bitmap_empty(rw_devs.d, BCH_SB_MEMBERS_MAX); +} + +bool bch2_dev_in_target(struct bch_fs *, unsigned, unsigned); + +int bch2_disk_path_find(struct bch_sb_handle *, const char *); + +/* Exported for userspace bcachefs-tools: */ +int bch2_disk_path_find_or_create(struct bch_sb_handle *, const char *); + +void bch2_disk_path_to_text(struct printbuf *, struct bch_fs *, unsigned); +void bch2_disk_path_to_text_sb(struct printbuf *, struct bch_sb *, unsigned); + +void bch2_target_to_text(struct printbuf *out, struct bch_fs *, unsigned); + +int bch2_opt_target_parse(struct bch_fs *, const char *, u64 *, struct printbuf *); +void bch2_opt_target_to_text(struct printbuf *, struct bch_fs *, struct bch_sb *, u64); + +#define bch2_opt_target (struct bch_opt_fn) { \ + .parse = bch2_opt_target_parse, \ + .to_text = bch2_opt_target_to_text, \ +} + +int bch2_sb_disk_groups_to_cpu(struct bch_fs *); + +int __bch2_dev_group_set(struct bch_fs *, struct bch_dev *, const char *); +int bch2_dev_group_set(struct bch_fs *, struct bch_dev *, const char *); + +const char *bch2_sb_validate_disk_groups(struct bch_sb *, + struct bch_sb_field *); + +void bch2_disk_groups_to_text(struct printbuf *, struct bch_fs *); + +#endif /* _BCACHEFS_DISK_GROUPS_H */ diff --git a/fs/bcachefs/disk_groups_types.h b/fs/bcachefs/disk_groups_types.h new file mode 100644 index 000000000000..a54ef085b13d --- /dev/null +++ b/fs/bcachefs/disk_groups_types.h @@ -0,0 +1,18 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_DISK_GROUPS_TYPES_H +#define _BCACHEFS_DISK_GROUPS_TYPES_H + +struct bch_disk_group_cpu { + bool deleted; + u16 parent; + u8 label[BCH_SB_LABEL_SIZE]; + struct bch_devs_mask devs; +}; + +struct bch_disk_groups_cpu { + struct rcu_head rcu; + unsigned nr; + struct bch_disk_group_cpu entries[] __counted_by(nr); +}; + +#endif /* _BCACHEFS_DISK_GROUPS_TYPES_H */ diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c new file mode 100644 index 000000000000..2a77de18c004 --- /dev/null +++ b/fs/bcachefs/ec.c @@ -0,0 +1,1981 @@ +// SPDX-License-Identifier: GPL-2.0 + +/* erasure coding */ + +#include "bcachefs.h" +#include "alloc_foreground.h" +#include "backpointers.h" +#include "bkey_buf.h" +#include "bset.h" +#include "btree_gc.h" +#include "btree_update.h" +#include "btree_write_buffer.h" +#include "buckets.h" +#include "checksum.h" +#include "disk_groups.h" +#include "ec.h" +#include "error.h" +#include "io_read.h" +#include "keylist.h" +#include "recovery.h" +#include "replicas.h" +#include "super-io.h" +#include "util.h" + +#include <linux/sort.h> + +#ifdef __KERNEL__ + +#include <linux/raid/pq.h> +#include <linux/raid/xor.h> + +static void raid5_recov(unsigned disks, unsigned failed_idx, + size_t size, void **data) +{ + unsigned i = 2, nr; + + BUG_ON(failed_idx >= disks); + + swap(data[0], data[failed_idx]); + memcpy(data[0], data[1], size); + + while (i < disks) { + nr = min_t(unsigned, disks - i, MAX_XOR_BLOCKS); + xor_blocks(nr, size, data[0], data + i); + i += nr; + } + + swap(data[0], data[failed_idx]); +} + +static void raid_gen(int nd, int np, size_t size, void **v) +{ + if (np >= 1) + raid5_recov(nd + np, nd, size, v); + if (np >= 2) + raid6_call.gen_syndrome(nd + np, size, v); + BUG_ON(np > 2); +} + +static void raid_rec(int nr, int *ir, int nd, int np, size_t size, void **v) +{ + switch (nr) { + case 0: + break; + case 1: + if (ir[0] < nd + 1) + raid5_recov(nd + 1, ir[0], size, v); + else + raid6_call.gen_syndrome(nd + np, size, v); + break; + case 2: + if (ir[1] < nd) { + /* data+data failure. */ + raid6_2data_recov(nd + np, size, ir[0], ir[1], v); + } else if (ir[0] < nd) { + /* data + p/q failure */ + + if (ir[1] == nd) /* data + p failure */ + raid6_datap_recov(nd + np, size, ir[0], v); + else { /* data + q failure */ + raid5_recov(nd + 1, ir[0], size, v); + raid6_call.gen_syndrome(nd + np, size, v); + } + } else { + raid_gen(nd, np, size, v); + } + break; + default: + BUG(); + } +} + +#else + +#include <raid/raid.h> + +#endif + +struct ec_bio { + struct bch_dev *ca; + struct ec_stripe_buf *buf; + size_t idx; + struct bio bio; +}; + +/* Stripes btree keys: */ + +int bch2_stripe_invalid(struct bch_fs *c, struct bkey_s_c k, + enum bkey_invalid_flags flags, + struct printbuf *err) +{ + const struct bch_stripe *s = bkey_s_c_to_stripe(k).v; + int ret = 0; + + bkey_fsck_err_on(bkey_eq(k.k->p, POS_MIN) || + bpos_gt(k.k->p, POS(0, U32_MAX)), c, err, + stripe_pos_bad, + "stripe at bad pos"); + + bkey_fsck_err_on(bkey_val_u64s(k.k) < stripe_val_u64s(s), c, err, + stripe_val_size_bad, + "incorrect value size (%zu < %u)", + bkey_val_u64s(k.k), stripe_val_u64s(s)); + + ret = bch2_bkey_ptrs_invalid(c, k, flags, err); +fsck_err: + return ret; +} + +void bch2_stripe_to_text(struct printbuf *out, struct bch_fs *c, + struct bkey_s_c k) +{ + const struct bch_stripe *s = bkey_s_c_to_stripe(k).v; + unsigned i, nr_data = s->nr_blocks - s->nr_redundant; + + prt_printf(out, "algo %u sectors %u blocks %u:%u csum %u gran %u", + s->algorithm, + le16_to_cpu(s->sectors), + nr_data, + s->nr_redundant, + s->csum_type, + 1U << s->csum_granularity_bits); + + for (i = 0; i < s->nr_blocks; i++) { + const struct bch_extent_ptr *ptr = s->ptrs + i; + struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); + u32 offset; + u64 b = sector_to_bucket_and_offset(ca, ptr->offset, &offset); + + prt_printf(out, " %u:%llu:%u", ptr->dev, b, offset); + if (i < nr_data) + prt_printf(out, "#%u", stripe_blockcount_get(s, i)); + prt_printf(out, " gen %u", ptr->gen); + if (ptr_stale(ca, ptr)) + prt_printf(out, " stale"); + } +} + +/* returns blocknr in stripe that we matched: */ +static const struct bch_extent_ptr *bkey_matches_stripe(struct bch_stripe *s, + struct bkey_s_c k, unsigned *block) +{ + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + const struct bch_extent_ptr *ptr; + unsigned i, nr_data = s->nr_blocks - s->nr_redundant; + + bkey_for_each_ptr(ptrs, ptr) + for (i = 0; i < nr_data; i++) + if (__bch2_ptr_matches_stripe(&s->ptrs[i], ptr, + le16_to_cpu(s->sectors))) { + *block = i; + return ptr; + } + + return NULL; +} + +static bool extent_has_stripe_ptr(struct bkey_s_c k, u64 idx) +{ + switch (k.k->type) { + case KEY_TYPE_extent: { + struct bkey_s_c_extent e = bkey_s_c_to_extent(k); + const union bch_extent_entry *entry; + + extent_for_each_entry(e, entry) + if (extent_entry_type(entry) == + BCH_EXTENT_ENTRY_stripe_ptr && + entry->stripe_ptr.idx == idx) + return true; + + break; + } + } + + return false; +} + +/* Stripe bufs: */ + +static void ec_stripe_buf_exit(struct ec_stripe_buf *buf) +{ + if (buf->key.k.type == KEY_TYPE_stripe) { + struct bkey_i_stripe *s = bkey_i_to_stripe(&buf->key); + unsigned i; + + for (i = 0; i < s->v.nr_blocks; i++) { + kvpfree(buf->data[i], buf->size << 9); + buf->data[i] = NULL; + } + } +} + +/* XXX: this is a non-mempoolified memory allocation: */ +static int ec_stripe_buf_init(struct ec_stripe_buf *buf, + unsigned offset, unsigned size) +{ + struct bch_stripe *v = &bkey_i_to_stripe(&buf->key)->v; + unsigned csum_granularity = 1U << v->csum_granularity_bits; + unsigned end = offset + size; + unsigned i; + + BUG_ON(end > le16_to_cpu(v->sectors)); + + offset = round_down(offset, csum_granularity); + end = min_t(unsigned, le16_to_cpu(v->sectors), + round_up(end, csum_granularity)); + + buf->offset = offset; + buf->size = end - offset; + + memset(buf->valid, 0xFF, sizeof(buf->valid)); + + for (i = 0; i < v->nr_blocks; i++) { + buf->data[i] = kvpmalloc(buf->size << 9, GFP_KERNEL); + if (!buf->data[i]) + goto err; + } + + return 0; +err: + ec_stripe_buf_exit(buf); + return -BCH_ERR_ENOMEM_stripe_buf; +} + +/* Checksumming: */ + +static struct bch_csum ec_block_checksum(struct ec_stripe_buf *buf, + unsigned block, unsigned offset) +{ + struct bch_stripe *v = &bkey_i_to_stripe(&buf->key)->v; + unsigned csum_granularity = 1 << v->csum_granularity_bits; + unsigned end = buf->offset + buf->size; + unsigned len = min(csum_granularity, end - offset); + + BUG_ON(offset >= end); + BUG_ON(offset < buf->offset); + BUG_ON(offset & (csum_granularity - 1)); + BUG_ON(offset + len != le16_to_cpu(v->sectors) && + (len & (csum_granularity - 1))); + + return bch2_checksum(NULL, v->csum_type, + null_nonce(), + buf->data[block] + ((offset - buf->offset) << 9), + len << 9); +} + +static void ec_generate_checksums(struct ec_stripe_buf *buf) +{ + struct bch_stripe *v = &bkey_i_to_stripe(&buf->key)->v; + unsigned i, j, csums_per_device = stripe_csums_per_device(v); + + if (!v->csum_type) + return; + + BUG_ON(buf->offset); + BUG_ON(buf->size != le16_to_cpu(v->sectors)); + + for (i = 0; i < v->nr_blocks; i++) + for (j = 0; j < csums_per_device; j++) + stripe_csum_set(v, i, j, + ec_block_checksum(buf, i, j << v->csum_granularity_bits)); +} + +static void ec_validate_checksums(struct bch_fs *c, struct ec_stripe_buf *buf) +{ + struct bch_stripe *v = &bkey_i_to_stripe(&buf->key)->v; + unsigned csum_granularity = 1 << v->csum_granularity_bits; + unsigned i; + + if (!v->csum_type) + return; + + for (i = 0; i < v->nr_blocks; i++) { + unsigned offset = buf->offset; + unsigned end = buf->offset + buf->size; + + if (!test_bit(i, buf->valid)) + continue; + + while (offset < end) { + unsigned j = offset >> v->csum_granularity_bits; + unsigned len = min(csum_granularity, end - offset); + struct bch_csum want = stripe_csum_get(v, i, j); + struct bch_csum got = ec_block_checksum(buf, i, offset); + + if (bch2_crc_cmp(want, got)) { + struct printbuf err = PRINTBUF; + struct bch_dev *ca = bch_dev_bkey_exists(c, v->ptrs[i].dev); + + prt_printf(&err, "stripe checksum error: expected %0llx:%0llx got %0llx:%0llx (type %s)\n", + want.hi, want.lo, + got.hi, got.lo, + bch2_csum_types[v->csum_type]); + prt_printf(&err, " for %ps at %u of\n ", (void *) _RET_IP_, i); + bch2_bkey_val_to_text(&err, c, bkey_i_to_s_c(&buf->key)); + bch_err_ratelimited(ca, "%s", err.buf); + printbuf_exit(&err); + + clear_bit(i, buf->valid); + + bch2_io_error(ca, BCH_MEMBER_ERROR_checksum); + break; + } + + offset += len; + } + } +} + +/* Erasure coding: */ + +static void ec_generate_ec(struct ec_stripe_buf *buf) +{ + struct bch_stripe *v = &bkey_i_to_stripe(&buf->key)->v; + unsigned nr_data = v->nr_blocks - v->nr_redundant; + unsigned bytes = le16_to_cpu(v->sectors) << 9; + + raid_gen(nr_data, v->nr_redundant, bytes, buf->data); +} + +static unsigned ec_nr_failed(struct ec_stripe_buf *buf) +{ + struct bch_stripe *v = &bkey_i_to_stripe(&buf->key)->v; + + return v->nr_blocks - bitmap_weight(buf->valid, v->nr_blocks); +} + +static int ec_do_recov(struct bch_fs *c, struct ec_stripe_buf *buf) +{ + struct bch_stripe *v = &bkey_i_to_stripe(&buf->key)->v; + unsigned i, failed[BCH_BKEY_PTRS_MAX], nr_failed = 0; + unsigned nr_data = v->nr_blocks - v->nr_redundant; + unsigned bytes = buf->size << 9; + + if (ec_nr_failed(buf) > v->nr_redundant) { + bch_err_ratelimited(c, + "error doing reconstruct read: unable to read enough blocks"); + return -1; + } + + for (i = 0; i < nr_data; i++) + if (!test_bit(i, buf->valid)) + failed[nr_failed++] = i; + + raid_rec(nr_failed, failed, nr_data, v->nr_redundant, bytes, buf->data); + return 0; +} + +/* IO: */ + +static void ec_block_endio(struct bio *bio) +{ + struct ec_bio *ec_bio = container_of(bio, struct ec_bio, bio); + struct bch_stripe *v = &bkey_i_to_stripe(&ec_bio->buf->key)->v; + struct bch_extent_ptr *ptr = &v->ptrs[ec_bio->idx]; + struct bch_dev *ca = ec_bio->ca; + struct closure *cl = bio->bi_private; + + if (bch2_dev_io_err_on(bio->bi_status, ca, + bio_data_dir(bio) + ? BCH_MEMBER_ERROR_write + : BCH_MEMBER_ERROR_read, + "erasure coding %s error: %s", + bio_data_dir(bio) ? "write" : "read", + bch2_blk_status_to_str(bio->bi_status))) + clear_bit(ec_bio->idx, ec_bio->buf->valid); + + if (ptr_stale(ca, ptr)) { + bch_err_ratelimited(ca->fs, + "error %s stripe: stale pointer after io", + bio_data_dir(bio) == READ ? "reading from" : "writing to"); + clear_bit(ec_bio->idx, ec_bio->buf->valid); + } + + bio_put(&ec_bio->bio); + percpu_ref_put(&ca->io_ref); + closure_put(cl); +} + +static void ec_block_io(struct bch_fs *c, struct ec_stripe_buf *buf, + blk_opf_t opf, unsigned idx, struct closure *cl) +{ + struct bch_stripe *v = &bkey_i_to_stripe(&buf->key)->v; + unsigned offset = 0, bytes = buf->size << 9; + struct bch_extent_ptr *ptr = &v->ptrs[idx]; + struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); + enum bch_data_type data_type = idx < v->nr_blocks - v->nr_redundant + ? BCH_DATA_user + : BCH_DATA_parity; + int rw = op_is_write(opf); + + if (ptr_stale(ca, ptr)) { + bch_err_ratelimited(c, + "error %s stripe: stale pointer", + rw == READ ? "reading from" : "writing to"); + clear_bit(idx, buf->valid); + return; + } + + if (!bch2_dev_get_ioref(ca, rw)) { + clear_bit(idx, buf->valid); + return; + } + + this_cpu_add(ca->io_done->sectors[rw][data_type], buf->size); + + while (offset < bytes) { + unsigned nr_iovecs = min_t(size_t, BIO_MAX_VECS, + DIV_ROUND_UP(bytes, PAGE_SIZE)); + unsigned b = min_t(size_t, bytes - offset, + nr_iovecs << PAGE_SHIFT); + struct ec_bio *ec_bio; + + ec_bio = container_of(bio_alloc_bioset(ca->disk_sb.bdev, + nr_iovecs, + opf, + GFP_KERNEL, + &c->ec_bioset), + struct ec_bio, bio); + + ec_bio->ca = ca; + ec_bio->buf = buf; + ec_bio->idx = idx; + + ec_bio->bio.bi_iter.bi_sector = ptr->offset + buf->offset + (offset >> 9); + ec_bio->bio.bi_end_io = ec_block_endio; + ec_bio->bio.bi_private = cl; + + bch2_bio_map(&ec_bio->bio, buf->data[idx] + offset, b); + + closure_get(cl); + percpu_ref_get(&ca->io_ref); + + submit_bio(&ec_bio->bio); + + offset += b; + } + + percpu_ref_put(&ca->io_ref); +} + +static int get_stripe_key_trans(struct btree_trans *trans, u64 idx, + struct ec_stripe_buf *stripe) +{ + struct btree_iter iter; + struct bkey_s_c k; + int ret; + + k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_stripes, + POS(0, idx), BTREE_ITER_SLOTS); + ret = bkey_err(k); + if (ret) + goto err; + if (k.k->type != KEY_TYPE_stripe) { + ret = -ENOENT; + goto err; + } + bkey_reassemble(&stripe->key, k); +err: + bch2_trans_iter_exit(trans, &iter); + return ret; +} + +/* recovery read path: */ +int bch2_ec_read_extent(struct btree_trans *trans, struct bch_read_bio *rbio) +{ + struct bch_fs *c = trans->c; + struct ec_stripe_buf *buf; + struct closure cl; + struct bch_stripe *v; + unsigned i, offset; + int ret = 0; + + closure_init_stack(&cl); + + BUG_ON(!rbio->pick.has_ec); + + buf = kzalloc(sizeof(*buf), GFP_NOFS); + if (!buf) + return -BCH_ERR_ENOMEM_ec_read_extent; + + ret = lockrestart_do(trans, get_stripe_key_trans(trans, rbio->pick.ec.idx, buf)); + if (ret) { + bch_err_ratelimited(c, + "error doing reconstruct read: error %i looking up stripe", ret); + kfree(buf); + return -EIO; + } + + v = &bkey_i_to_stripe(&buf->key)->v; + + if (!bch2_ptr_matches_stripe(v, rbio->pick)) { + bch_err_ratelimited(c, + "error doing reconstruct read: pointer doesn't match stripe"); + ret = -EIO; + goto err; + } + + offset = rbio->bio.bi_iter.bi_sector - v->ptrs[rbio->pick.ec.block].offset; + if (offset + bio_sectors(&rbio->bio) > le16_to_cpu(v->sectors)) { + bch_err_ratelimited(c, + "error doing reconstruct read: read is bigger than stripe"); + ret = -EIO; + goto err; + } + + ret = ec_stripe_buf_init(buf, offset, bio_sectors(&rbio->bio)); + if (ret) + goto err; + + for (i = 0; i < v->nr_blocks; i++) + ec_block_io(c, buf, REQ_OP_READ, i, &cl); + + closure_sync(&cl); + + if (ec_nr_failed(buf) > v->nr_redundant) { + bch_err_ratelimited(c, + "error doing reconstruct read: unable to read enough blocks"); + ret = -EIO; + goto err; + } + + ec_validate_checksums(c, buf); + + ret = ec_do_recov(c, buf); + if (ret) + goto err; + + memcpy_to_bio(&rbio->bio, rbio->bio.bi_iter, + buf->data[rbio->pick.ec.block] + ((offset - buf->offset) << 9)); +err: + ec_stripe_buf_exit(buf); + kfree(buf); + return ret; +} + +/* stripe bucket accounting: */ + +static int __ec_stripe_mem_alloc(struct bch_fs *c, size_t idx, gfp_t gfp) +{ + ec_stripes_heap n, *h = &c->ec_stripes_heap; + + if (idx >= h->size) { + if (!init_heap(&n, max(1024UL, roundup_pow_of_two(idx + 1)), gfp)) + return -BCH_ERR_ENOMEM_ec_stripe_mem_alloc; + + mutex_lock(&c->ec_stripes_heap_lock); + if (n.size > h->size) { + memcpy(n.data, h->data, h->used * sizeof(h->data[0])); + n.used = h->used; + swap(*h, n); + } + mutex_unlock(&c->ec_stripes_heap_lock); + + free_heap(&n); + } + + if (!genradix_ptr_alloc(&c->stripes, idx, gfp)) + return -BCH_ERR_ENOMEM_ec_stripe_mem_alloc; + + if (c->gc_pos.phase != GC_PHASE_NOT_RUNNING && + !genradix_ptr_alloc(&c->gc_stripes, idx, gfp)) + return -BCH_ERR_ENOMEM_ec_stripe_mem_alloc; + + return 0; +} + +static int ec_stripe_mem_alloc(struct btree_trans *trans, + struct btree_iter *iter) +{ + return allocate_dropping_locks_errcode(trans, + __ec_stripe_mem_alloc(trans->c, iter->pos.offset, _gfp)); +} + +/* + * Hash table of open stripes: + * Stripes that are being created or modified are kept in a hash table, so that + * stripe deletion can skip them. + */ + +static bool __bch2_stripe_is_open(struct bch_fs *c, u64 idx) +{ + unsigned hash = hash_64(idx, ilog2(ARRAY_SIZE(c->ec_stripes_new))); + struct ec_stripe_new *s; + + hlist_for_each_entry(s, &c->ec_stripes_new[hash], hash) + if (s->idx == idx) + return true; + return false; +} + +static bool bch2_stripe_is_open(struct bch_fs *c, u64 idx) +{ + bool ret = false; + + spin_lock(&c->ec_stripes_new_lock); + ret = __bch2_stripe_is_open(c, idx); + spin_unlock(&c->ec_stripes_new_lock); + + return ret; +} + +static bool bch2_try_open_stripe(struct bch_fs *c, + struct ec_stripe_new *s, + u64 idx) +{ + bool ret; + + spin_lock(&c->ec_stripes_new_lock); + ret = !__bch2_stripe_is_open(c, idx); + if (ret) { + unsigned hash = hash_64(idx, ilog2(ARRAY_SIZE(c->ec_stripes_new))); + + s->idx = idx; + hlist_add_head(&s->hash, &c->ec_stripes_new[hash]); + } + spin_unlock(&c->ec_stripes_new_lock); + + return ret; +} + +static void bch2_stripe_close(struct bch_fs *c, struct ec_stripe_new *s) +{ + BUG_ON(!s->idx); + + spin_lock(&c->ec_stripes_new_lock); + hlist_del_init(&s->hash); + spin_unlock(&c->ec_stripes_new_lock); + + s->idx = 0; +} + +/* Heap of all existing stripes, ordered by blocks_nonempty */ + +static u64 stripe_idx_to_delete(struct bch_fs *c) +{ + ec_stripes_heap *h = &c->ec_stripes_heap; + + lockdep_assert_held(&c->ec_stripes_heap_lock); + + if (h->used && + h->data[0].blocks_nonempty == 0 && + !bch2_stripe_is_open(c, h->data[0].idx)) + return h->data[0].idx; + + return 0; +} + +static inline int ec_stripes_heap_cmp(ec_stripes_heap *h, + struct ec_stripe_heap_entry l, + struct ec_stripe_heap_entry r) +{ + return ((l.blocks_nonempty > r.blocks_nonempty) - + (l.blocks_nonempty < r.blocks_nonempty)); +} + +static inline void ec_stripes_heap_set_backpointer(ec_stripes_heap *h, + size_t i) +{ + struct bch_fs *c = container_of(h, struct bch_fs, ec_stripes_heap); + + genradix_ptr(&c->stripes, h->data[i].idx)->heap_idx = i; +} + +static void heap_verify_backpointer(struct bch_fs *c, size_t idx) +{ + ec_stripes_heap *h = &c->ec_stripes_heap; + struct stripe *m = genradix_ptr(&c->stripes, idx); + + BUG_ON(m->heap_idx >= h->used); + BUG_ON(h->data[m->heap_idx].idx != idx); +} + +void bch2_stripes_heap_del(struct bch_fs *c, + struct stripe *m, size_t idx) +{ + mutex_lock(&c->ec_stripes_heap_lock); + heap_verify_backpointer(c, idx); + + heap_del(&c->ec_stripes_heap, m->heap_idx, + ec_stripes_heap_cmp, + ec_stripes_heap_set_backpointer); + mutex_unlock(&c->ec_stripes_heap_lock); +} + +void bch2_stripes_heap_insert(struct bch_fs *c, + struct stripe *m, size_t idx) +{ + mutex_lock(&c->ec_stripes_heap_lock); + BUG_ON(heap_full(&c->ec_stripes_heap)); + + heap_add(&c->ec_stripes_heap, ((struct ec_stripe_heap_entry) { + .idx = idx, + .blocks_nonempty = m->blocks_nonempty, + }), + ec_stripes_heap_cmp, + ec_stripes_heap_set_backpointer); + + heap_verify_backpointer(c, idx); + mutex_unlock(&c->ec_stripes_heap_lock); +} + +void bch2_stripes_heap_update(struct bch_fs *c, + struct stripe *m, size_t idx) +{ + ec_stripes_heap *h = &c->ec_stripes_heap; + bool do_deletes; + size_t i; + + mutex_lock(&c->ec_stripes_heap_lock); + heap_verify_backpointer(c, idx); + + h->data[m->heap_idx].blocks_nonempty = m->blocks_nonempty; + + i = m->heap_idx; + heap_sift_up(h, i, ec_stripes_heap_cmp, + ec_stripes_heap_set_backpointer); + heap_sift_down(h, i, ec_stripes_heap_cmp, + ec_stripes_heap_set_backpointer); + + heap_verify_backpointer(c, idx); + + do_deletes = stripe_idx_to_delete(c) != 0; + mutex_unlock(&c->ec_stripes_heap_lock); + + if (do_deletes) + bch2_do_stripe_deletes(c); +} + +/* stripe deletion */ + +static int ec_stripe_delete(struct btree_trans *trans, u64 idx) +{ + struct bch_fs *c = trans->c; + struct btree_iter iter; + struct bkey_s_c k; + struct bkey_s_c_stripe s; + int ret; + + k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_stripes, POS(0, idx), + BTREE_ITER_INTENT); + ret = bkey_err(k); + if (ret) + goto err; + + if (k.k->type != KEY_TYPE_stripe) { + bch2_fs_inconsistent(c, "attempting to delete nonexistent stripe %llu", idx); + ret = -EINVAL; + goto err; + } + + s = bkey_s_c_to_stripe(k); + for (unsigned i = 0; i < s.v->nr_blocks; i++) + if (stripe_blockcount_get(s.v, i)) { + struct printbuf buf = PRINTBUF; + + bch2_bkey_val_to_text(&buf, c, k); + bch2_fs_inconsistent(c, "attempting to delete nonempty stripe %s", buf.buf); + printbuf_exit(&buf); + ret = -EINVAL; + goto err; + } + + ret = bch2_btree_delete_at(trans, &iter, 0); +err: + bch2_trans_iter_exit(trans, &iter); + return ret; +} + +static void ec_stripe_delete_work(struct work_struct *work) +{ + struct bch_fs *c = + container_of(work, struct bch_fs, ec_stripe_delete_work); + struct btree_trans *trans = bch2_trans_get(c); + int ret; + u64 idx; + + while (1) { + mutex_lock(&c->ec_stripes_heap_lock); + idx = stripe_idx_to_delete(c); + mutex_unlock(&c->ec_stripes_heap_lock); + + if (!idx) + break; + + ret = commit_do(trans, NULL, NULL, BTREE_INSERT_NOFAIL, + ec_stripe_delete(trans, idx)); + if (ret) { + bch_err_fn(c, ret); + break; + } + } + + bch2_trans_put(trans); + + bch2_write_ref_put(c, BCH_WRITE_REF_stripe_delete); +} + +void bch2_do_stripe_deletes(struct bch_fs *c) +{ + if (bch2_write_ref_tryget(c, BCH_WRITE_REF_stripe_delete) && + !queue_work(c->write_ref_wq, &c->ec_stripe_delete_work)) + bch2_write_ref_put(c, BCH_WRITE_REF_stripe_delete); +} + +/* stripe creation: */ + +static int ec_stripe_key_update(struct btree_trans *trans, + struct bkey_i_stripe *new, + bool create) +{ + struct bch_fs *c = trans->c; + struct btree_iter iter; + struct bkey_s_c k; + int ret; + + k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_stripes, + new->k.p, BTREE_ITER_INTENT); + ret = bkey_err(k); + if (ret) + goto err; + + if (k.k->type != (create ? KEY_TYPE_deleted : KEY_TYPE_stripe)) { + bch2_fs_inconsistent(c, "error %s stripe: got existing key type %s", + create ? "creating" : "updating", + bch2_bkey_types[k.k->type]); + ret = -EINVAL; + goto err; + } + + if (k.k->type == KEY_TYPE_stripe) { + const struct bch_stripe *old = bkey_s_c_to_stripe(k).v; + unsigned i; + + if (old->nr_blocks != new->v.nr_blocks) { + bch_err(c, "error updating stripe: nr_blocks does not match"); + ret = -EINVAL; + goto err; + } + + for (i = 0; i < new->v.nr_blocks; i++) { + unsigned v = stripe_blockcount_get(old, i); + + BUG_ON(v && + (old->ptrs[i].dev != new->v.ptrs[i].dev || + old->ptrs[i].gen != new->v.ptrs[i].gen || + old->ptrs[i].offset != new->v.ptrs[i].offset)); + + stripe_blockcount_set(&new->v, i, v); + } + } + + ret = bch2_trans_update(trans, &iter, &new->k_i, 0); +err: + bch2_trans_iter_exit(trans, &iter); + return ret; +} + +static int ec_stripe_update_extent(struct btree_trans *trans, + struct bpos bucket, u8 gen, + struct ec_stripe_buf *s, + struct bpos *bp_pos) +{ + struct bch_stripe *v = &bkey_i_to_stripe(&s->key)->v; + struct bch_fs *c = trans->c; + struct bch_backpointer bp; + struct btree_iter iter; + struct bkey_s_c k; + const struct bch_extent_ptr *ptr_c; + struct bch_extent_ptr *ptr, *ec_ptr = NULL; + struct bch_extent_stripe_ptr stripe_ptr; + struct bkey_i *n; + int ret, dev, block; + + ret = bch2_get_next_backpointer(trans, bucket, gen, + bp_pos, &bp, BTREE_ITER_CACHED); + if (ret) + return ret; + if (bpos_eq(*bp_pos, SPOS_MAX)) + return 0; + + if (bp.level) { + struct printbuf buf = PRINTBUF; + struct btree_iter node_iter; + struct btree *b; + + b = bch2_backpointer_get_node(trans, &node_iter, *bp_pos, bp); + bch2_trans_iter_exit(trans, &node_iter); + + if (!b) + return 0; + + prt_printf(&buf, "found btree node in erasure coded bucket: b=%px\n", b); + bch2_backpointer_to_text(&buf, &bp); + + bch2_fs_inconsistent(c, "%s", buf.buf); + printbuf_exit(&buf); + return -EIO; + } + + k = bch2_backpointer_get_key(trans, &iter, *bp_pos, bp, BTREE_ITER_INTENT); + ret = bkey_err(k); + if (ret) + return ret; + if (!k.k) { + /* + * extent no longer exists - we could flush the btree + * write buffer and retry to verify, but no need: + */ + return 0; + } + + if (extent_has_stripe_ptr(k, s->key.k.p.offset)) + goto out; + + ptr_c = bkey_matches_stripe(v, k, &block); + /* + * It doesn't generally make sense to erasure code cached ptrs: + * XXX: should we be incrementing a counter? + */ + if (!ptr_c || ptr_c->cached) + goto out; + + dev = v->ptrs[block].dev; + + n = bch2_trans_kmalloc(trans, bkey_bytes(k.k) + sizeof(stripe_ptr)); + ret = PTR_ERR_OR_ZERO(n); + if (ret) + goto out; + + bkey_reassemble(n, k); + + bch2_bkey_drop_ptrs(bkey_i_to_s(n), ptr, ptr->dev != dev); + ec_ptr = bch2_bkey_has_device(bkey_i_to_s(n), dev); + BUG_ON(!ec_ptr); + + stripe_ptr = (struct bch_extent_stripe_ptr) { + .type = 1 << BCH_EXTENT_ENTRY_stripe_ptr, + .block = block, + .redundancy = v->nr_redundant, + .idx = s->key.k.p.offset, + }; + + __extent_entry_insert(n, + (union bch_extent_entry *) ec_ptr, + (union bch_extent_entry *) &stripe_ptr); + + ret = bch2_trans_update(trans, &iter, n, 0); +out: + bch2_trans_iter_exit(trans, &iter); + return ret; +} + +static int ec_stripe_update_bucket(struct btree_trans *trans, struct ec_stripe_buf *s, + unsigned block) +{ + struct bch_fs *c = trans->c; + struct bch_stripe *v = &bkey_i_to_stripe(&s->key)->v; + struct bch_extent_ptr bucket = v->ptrs[block]; + struct bpos bucket_pos = PTR_BUCKET_POS(c, &bucket); + struct bpos bp_pos = POS_MIN; + int ret = 0; + + while (1) { + ret = commit_do(trans, NULL, NULL, + BTREE_INSERT_NOCHECK_RW| + BTREE_INSERT_NOFAIL, + ec_stripe_update_extent(trans, bucket_pos, bucket.gen, + s, &bp_pos)); + if (ret) + break; + if (bkey_eq(bp_pos, POS_MAX)) + break; + + bp_pos = bpos_nosnap_successor(bp_pos); + } + + return ret; +} + +static int ec_stripe_update_extents(struct bch_fs *c, struct ec_stripe_buf *s) +{ + struct btree_trans *trans = bch2_trans_get(c); + struct bch_stripe *v = &bkey_i_to_stripe(&s->key)->v; + unsigned i, nr_data = v->nr_blocks - v->nr_redundant; + int ret = 0; + + ret = bch2_btree_write_buffer_flush(trans); + if (ret) + goto err; + + for (i = 0; i < nr_data; i++) { + ret = ec_stripe_update_bucket(trans, s, i); + if (ret) + break; + } +err: + bch2_trans_put(trans); + + return ret; +} + +static void zero_out_rest_of_ec_bucket(struct bch_fs *c, + struct ec_stripe_new *s, + unsigned block, + struct open_bucket *ob) +{ + struct bch_dev *ca = bch_dev_bkey_exists(c, ob->dev); + unsigned offset = ca->mi.bucket_size - ob->sectors_free; + int ret; + + if (!bch2_dev_get_ioref(ca, WRITE)) { + s->err = -BCH_ERR_erofs_no_writes; + return; + } + + memset(s->new_stripe.data[block] + (offset << 9), + 0, + ob->sectors_free << 9); + + ret = blkdev_issue_zeroout(ca->disk_sb.bdev, + ob->bucket * ca->mi.bucket_size + offset, + ob->sectors_free, + GFP_KERNEL, 0); + + percpu_ref_put(&ca->io_ref); + + if (ret) + s->err = ret; +} + +void bch2_ec_stripe_new_free(struct bch_fs *c, struct ec_stripe_new *s) +{ + if (s->idx) + bch2_stripe_close(c, s); + kfree(s); +} + +/* + * data buckets of new stripe all written: create the stripe + */ +static void ec_stripe_create(struct ec_stripe_new *s) +{ + struct bch_fs *c = s->c; + struct open_bucket *ob; + struct bch_stripe *v = &bkey_i_to_stripe(&s->new_stripe.key)->v; + unsigned i, nr_data = v->nr_blocks - v->nr_redundant; + int ret; + + BUG_ON(s->h->s == s); + + closure_sync(&s->iodone); + + if (!s->err) { + for (i = 0; i < nr_data; i++) + if (s->blocks[i]) { + ob = c->open_buckets + s->blocks[i]; + + if (ob->sectors_free) + zero_out_rest_of_ec_bucket(c, s, i, ob); + } + } + + if (s->err) { + if (!bch2_err_matches(s->err, EROFS)) + bch_err(c, "error creating stripe: error writing data buckets"); + goto err; + } + + if (s->have_existing_stripe) { + ec_validate_checksums(c, &s->existing_stripe); + + if (ec_do_recov(c, &s->existing_stripe)) { + bch_err(c, "error creating stripe: error reading existing stripe"); + goto err; + } + + for (i = 0; i < nr_data; i++) + if (stripe_blockcount_get(&bkey_i_to_stripe(&s->existing_stripe.key)->v, i)) + swap(s->new_stripe.data[i], + s->existing_stripe.data[i]); + + ec_stripe_buf_exit(&s->existing_stripe); + } + + BUG_ON(!s->allocated); + BUG_ON(!s->idx); + + ec_generate_ec(&s->new_stripe); + + ec_generate_checksums(&s->new_stripe); + + /* write p/q: */ + for (i = nr_data; i < v->nr_blocks; i++) + ec_block_io(c, &s->new_stripe, REQ_OP_WRITE, i, &s->iodone); + closure_sync(&s->iodone); + + if (ec_nr_failed(&s->new_stripe)) { + bch_err(c, "error creating stripe: error writing redundancy buckets"); + goto err; + } + + ret = bch2_trans_do(c, &s->res, NULL, + BTREE_INSERT_NOCHECK_RW| + BTREE_INSERT_NOFAIL, + ec_stripe_key_update(trans, + bkey_i_to_stripe(&s->new_stripe.key), + !s->have_existing_stripe)); + if (ret) { + bch_err(c, "error creating stripe: error creating stripe key"); + goto err; + } + + ret = ec_stripe_update_extents(c, &s->new_stripe); + if (ret) { + bch_err_msg(c, ret, "creating stripe: error updating pointers"); + goto err; + } +err: + bch2_disk_reservation_put(c, &s->res); + + for (i = 0; i < v->nr_blocks; i++) + if (s->blocks[i]) { + ob = c->open_buckets + s->blocks[i]; + + if (i < nr_data) { + ob->ec = NULL; + __bch2_open_bucket_put(c, ob); + } else { + bch2_open_bucket_put(c, ob); + } + } + + mutex_lock(&c->ec_stripe_new_lock); + list_del(&s->list); + mutex_unlock(&c->ec_stripe_new_lock); + wake_up(&c->ec_stripe_new_wait); + + ec_stripe_buf_exit(&s->existing_stripe); + ec_stripe_buf_exit(&s->new_stripe); + closure_debug_destroy(&s->iodone); + + ec_stripe_new_put(c, s, STRIPE_REF_stripe); +} + +static struct ec_stripe_new *get_pending_stripe(struct bch_fs *c) +{ + struct ec_stripe_new *s; + + mutex_lock(&c->ec_stripe_new_lock); + list_for_each_entry(s, &c->ec_stripe_new_list, list) + if (!atomic_read(&s->ref[STRIPE_REF_io])) + goto out; + s = NULL; +out: + mutex_unlock(&c->ec_stripe_new_lock); + + return s; +} + +static void ec_stripe_create_work(struct work_struct *work) +{ + struct bch_fs *c = container_of(work, + struct bch_fs, ec_stripe_create_work); + struct ec_stripe_new *s; + + while ((s = get_pending_stripe(c))) + ec_stripe_create(s); + + bch2_write_ref_put(c, BCH_WRITE_REF_stripe_create); +} + +void bch2_ec_do_stripe_creates(struct bch_fs *c) +{ + bch2_write_ref_get(c, BCH_WRITE_REF_stripe_create); + + if (!queue_work(system_long_wq, &c->ec_stripe_create_work)) + bch2_write_ref_put(c, BCH_WRITE_REF_stripe_create); +} + +static void ec_stripe_set_pending(struct bch_fs *c, struct ec_stripe_head *h) +{ + struct ec_stripe_new *s = h->s; + + BUG_ON(!s->allocated && !s->err); + + h->s = NULL; + s->pending = true; + + mutex_lock(&c->ec_stripe_new_lock); + list_add(&s->list, &c->ec_stripe_new_list); + mutex_unlock(&c->ec_stripe_new_lock); + + ec_stripe_new_put(c, s, STRIPE_REF_io); +} + +void bch2_ec_bucket_cancel(struct bch_fs *c, struct open_bucket *ob) +{ + struct ec_stripe_new *s = ob->ec; + + s->err = -EIO; +} + +void *bch2_writepoint_ec_buf(struct bch_fs *c, struct write_point *wp) +{ + struct open_bucket *ob = ec_open_bucket(c, &wp->ptrs); + struct bch_dev *ca; + unsigned offset; + + if (!ob) + return NULL; + + BUG_ON(!ob->ec->new_stripe.data[ob->ec_idx]); + + ca = bch_dev_bkey_exists(c, ob->dev); + offset = ca->mi.bucket_size - ob->sectors_free; + + return ob->ec->new_stripe.data[ob->ec_idx] + (offset << 9); +} + +static int unsigned_cmp(const void *_l, const void *_r) +{ + unsigned l = *((const unsigned *) _l); + unsigned r = *((const unsigned *) _r); + + return cmp_int(l, r); +} + +/* pick most common bucket size: */ +static unsigned pick_blocksize(struct bch_fs *c, + struct bch_devs_mask *devs) +{ + struct bch_dev *ca; + unsigned i, nr = 0, sizes[BCH_SB_MEMBERS_MAX]; + struct { + unsigned nr, size; + } cur = { 0, 0 }, best = { 0, 0 }; + + for_each_member_device_rcu(ca, c, i, devs) + sizes[nr++] = ca->mi.bucket_size; + + sort(sizes, nr, sizeof(unsigned), unsigned_cmp, NULL); + + for (i = 0; i < nr; i++) { + if (sizes[i] != cur.size) { + if (cur.nr > best.nr) + best = cur; + + cur.nr = 0; + cur.size = sizes[i]; + } + + cur.nr++; + } + + if (cur.nr > best.nr) + best = cur; + + return best.size; +} + +static bool may_create_new_stripe(struct bch_fs *c) +{ + return false; +} + +static void ec_stripe_key_init(struct bch_fs *c, + struct bkey_i *k, + unsigned nr_data, + unsigned nr_parity, + unsigned stripe_size) +{ + struct bkey_i_stripe *s = bkey_stripe_init(k); + unsigned u64s; + + s->v.sectors = cpu_to_le16(stripe_size); + s->v.algorithm = 0; + s->v.nr_blocks = nr_data + nr_parity; + s->v.nr_redundant = nr_parity; + s->v.csum_granularity_bits = ilog2(c->opts.encoded_extent_max >> 9); + s->v.csum_type = BCH_CSUM_crc32c; + s->v.pad = 0; + + while ((u64s = stripe_val_u64s(&s->v)) > BKEY_VAL_U64s_MAX) { + BUG_ON(1 << s->v.csum_granularity_bits >= + le16_to_cpu(s->v.sectors) || + s->v.csum_granularity_bits == U8_MAX); + s->v.csum_granularity_bits++; + } + + set_bkey_val_u64s(&s->k, u64s); +} + +static int ec_new_stripe_alloc(struct bch_fs *c, struct ec_stripe_head *h) +{ + struct ec_stripe_new *s; + + lockdep_assert_held(&h->lock); + + s = kzalloc(sizeof(*s), GFP_KERNEL); + if (!s) + return -BCH_ERR_ENOMEM_ec_new_stripe_alloc; + + mutex_init(&s->lock); + closure_init(&s->iodone, NULL); + atomic_set(&s->ref[STRIPE_REF_stripe], 1); + atomic_set(&s->ref[STRIPE_REF_io], 1); + s->c = c; + s->h = h; + s->nr_data = min_t(unsigned, h->nr_active_devs, + BCH_BKEY_PTRS_MAX) - h->redundancy; + s->nr_parity = h->redundancy; + + ec_stripe_key_init(c, &s->new_stripe.key, + s->nr_data, s->nr_parity, h->blocksize); + + h->s = s; + return 0; +} + +static struct ec_stripe_head * +ec_new_stripe_head_alloc(struct bch_fs *c, unsigned target, + unsigned algo, unsigned redundancy, + enum bch_watermark watermark) +{ + struct ec_stripe_head *h; + struct bch_dev *ca; + unsigned i; + + h = kzalloc(sizeof(*h), GFP_KERNEL); + if (!h) + return NULL; + + mutex_init(&h->lock); + BUG_ON(!mutex_trylock(&h->lock)); + + h->target = target; + h->algo = algo; + h->redundancy = redundancy; + h->watermark = watermark; + + rcu_read_lock(); + h->devs = target_rw_devs(c, BCH_DATA_user, target); + + for_each_member_device_rcu(ca, c, i, &h->devs) + if (!ca->mi.durability) + __clear_bit(i, h->devs.d); + + h->blocksize = pick_blocksize(c, &h->devs); + + for_each_member_device_rcu(ca, c, i, &h->devs) + if (ca->mi.bucket_size == h->blocksize) + h->nr_active_devs++; + + rcu_read_unlock(); + + /* + * If we only have redundancy + 1 devices, we're better off with just + * replication: + */ + if (h->nr_active_devs < h->redundancy + 2) + bch_err(c, "insufficient devices available to create stripe (have %u, need %u) - mismatched bucket sizes?", + h->nr_active_devs, h->redundancy + 2); + + list_add(&h->list, &c->ec_stripe_head_list); + return h; +} + +void bch2_ec_stripe_head_put(struct bch_fs *c, struct ec_stripe_head *h) +{ + if (h->s && + h->s->allocated && + bitmap_weight(h->s->blocks_allocated, + h->s->nr_data) == h->s->nr_data) + ec_stripe_set_pending(c, h); + + mutex_unlock(&h->lock); +} + +static struct ec_stripe_head * +__bch2_ec_stripe_head_get(struct btree_trans *trans, + unsigned target, + unsigned algo, + unsigned redundancy, + enum bch_watermark watermark) +{ + struct bch_fs *c = trans->c; + struct ec_stripe_head *h; + int ret; + + if (!redundancy) + return NULL; + + ret = bch2_trans_mutex_lock(trans, &c->ec_stripe_head_lock); + if (ret) + return ERR_PTR(ret); + + if (test_bit(BCH_FS_GOING_RO, &c->flags)) { + h = ERR_PTR(-BCH_ERR_erofs_no_writes); + goto found; + } + + list_for_each_entry(h, &c->ec_stripe_head_list, list) + if (h->target == target && + h->algo == algo && + h->redundancy == redundancy && + h->watermark == watermark) { + ret = bch2_trans_mutex_lock(trans, &h->lock); + if (ret) + h = ERR_PTR(ret); + goto found; + } + + h = ec_new_stripe_head_alloc(c, target, algo, redundancy, watermark); +found: + if (!IS_ERR_OR_NULL(h) && + h->nr_active_devs < h->redundancy + 2) { + mutex_unlock(&h->lock); + h = NULL; + } + mutex_unlock(&c->ec_stripe_head_lock); + return h; +} + +static int new_stripe_alloc_buckets(struct btree_trans *trans, struct ec_stripe_head *h, + enum bch_watermark watermark, struct closure *cl) +{ + struct bch_fs *c = trans->c; + struct bch_devs_mask devs = h->devs; + struct open_bucket *ob; + struct open_buckets buckets; + struct bch_stripe *v = &bkey_i_to_stripe(&h->s->new_stripe.key)->v; + unsigned i, j, nr_have_parity = 0, nr_have_data = 0; + bool have_cache = true; + int ret = 0; + + BUG_ON(v->nr_blocks != h->s->nr_data + h->s->nr_parity); + BUG_ON(v->nr_redundant != h->s->nr_parity); + + for_each_set_bit(i, h->s->blocks_gotten, v->nr_blocks) { + __clear_bit(v->ptrs[i].dev, devs.d); + if (i < h->s->nr_data) + nr_have_data++; + else + nr_have_parity++; + } + + BUG_ON(nr_have_data > h->s->nr_data); + BUG_ON(nr_have_parity > h->s->nr_parity); + + buckets.nr = 0; + if (nr_have_parity < h->s->nr_parity) { + ret = bch2_bucket_alloc_set_trans(trans, &buckets, + &h->parity_stripe, + &devs, + h->s->nr_parity, + &nr_have_parity, + &have_cache, 0, + BCH_DATA_parity, + watermark, + cl); + + open_bucket_for_each(c, &buckets, ob, i) { + j = find_next_zero_bit(h->s->blocks_gotten, + h->s->nr_data + h->s->nr_parity, + h->s->nr_data); + BUG_ON(j >= h->s->nr_data + h->s->nr_parity); + + h->s->blocks[j] = buckets.v[i]; + v->ptrs[j] = bch2_ob_ptr(c, ob); + __set_bit(j, h->s->blocks_gotten); + } + + if (ret) + return ret; + } + + buckets.nr = 0; + if (nr_have_data < h->s->nr_data) { + ret = bch2_bucket_alloc_set_trans(trans, &buckets, + &h->block_stripe, + &devs, + h->s->nr_data, + &nr_have_data, + &have_cache, 0, + BCH_DATA_user, + watermark, + cl); + + open_bucket_for_each(c, &buckets, ob, i) { + j = find_next_zero_bit(h->s->blocks_gotten, + h->s->nr_data, 0); + BUG_ON(j >= h->s->nr_data); + + h->s->blocks[j] = buckets.v[i]; + v->ptrs[j] = bch2_ob_ptr(c, ob); + __set_bit(j, h->s->blocks_gotten); + } + + if (ret) + return ret; + } + + return 0; +} + +/* XXX: doesn't obey target: */ +static s64 get_existing_stripe(struct bch_fs *c, + struct ec_stripe_head *head) +{ + ec_stripes_heap *h = &c->ec_stripes_heap; + struct stripe *m; + size_t heap_idx; + u64 stripe_idx; + s64 ret = -1; + + if (may_create_new_stripe(c)) + return -1; + + mutex_lock(&c->ec_stripes_heap_lock); + for (heap_idx = 0; heap_idx < h->used; heap_idx++) { + /* No blocks worth reusing, stripe will just be deleted: */ + if (!h->data[heap_idx].blocks_nonempty) + continue; + + stripe_idx = h->data[heap_idx].idx; + + m = genradix_ptr(&c->stripes, stripe_idx); + + if (m->algorithm == head->algo && + m->nr_redundant == head->redundancy && + m->sectors == head->blocksize && + m->blocks_nonempty < m->nr_blocks - m->nr_redundant && + bch2_try_open_stripe(c, head->s, stripe_idx)) { + ret = stripe_idx; + break; + } + } + mutex_unlock(&c->ec_stripes_heap_lock); + return ret; +} + +static int __bch2_ec_stripe_head_reuse(struct btree_trans *trans, struct ec_stripe_head *h) +{ + struct bch_fs *c = trans->c; + struct bch_stripe *new_v = &bkey_i_to_stripe(&h->s->new_stripe.key)->v; + struct bch_stripe *existing_v; + unsigned i; + s64 idx; + int ret; + + /* + * If we can't allocate a new stripe, and there's no stripes with empty + * blocks for us to reuse, that means we have to wait on copygc: + */ + idx = get_existing_stripe(c, h); + if (idx < 0) + return -BCH_ERR_stripe_alloc_blocked; + + ret = get_stripe_key_trans(trans, idx, &h->s->existing_stripe); + if (ret) { + bch2_stripe_close(c, h->s); + if (!bch2_err_matches(ret, BCH_ERR_transaction_restart)) + bch2_fs_fatal_error(c, "error reading stripe key: %s", bch2_err_str(ret)); + return ret; + } + + existing_v = &bkey_i_to_stripe(&h->s->existing_stripe.key)->v; + + BUG_ON(existing_v->nr_redundant != h->s->nr_parity); + h->s->nr_data = existing_v->nr_blocks - + existing_v->nr_redundant; + + ret = ec_stripe_buf_init(&h->s->existing_stripe, 0, h->blocksize); + if (ret) { + bch2_stripe_close(c, h->s); + return ret; + } + + BUG_ON(h->s->existing_stripe.size != h->blocksize); + BUG_ON(h->s->existing_stripe.size != le16_to_cpu(existing_v->sectors)); + + /* + * Free buckets we initially allocated - they might conflict with + * blocks from the stripe we're reusing: + */ + for_each_set_bit(i, h->s->blocks_gotten, new_v->nr_blocks) { + bch2_open_bucket_put(c, c->open_buckets + h->s->blocks[i]); + h->s->blocks[i] = 0; + } + memset(h->s->blocks_gotten, 0, sizeof(h->s->blocks_gotten)); + memset(h->s->blocks_allocated, 0, sizeof(h->s->blocks_allocated)); + + for (i = 0; i < existing_v->nr_blocks; i++) { + if (stripe_blockcount_get(existing_v, i)) { + __set_bit(i, h->s->blocks_gotten); + __set_bit(i, h->s->blocks_allocated); + } + + ec_block_io(c, &h->s->existing_stripe, READ, i, &h->s->iodone); + } + + bkey_copy(&h->s->new_stripe.key, &h->s->existing_stripe.key); + h->s->have_existing_stripe = true; + + return 0; +} + +static int __bch2_ec_stripe_head_reserve(struct btree_trans *trans, struct ec_stripe_head *h) +{ + struct bch_fs *c = trans->c; + struct btree_iter iter; + struct bkey_s_c k; + struct bpos min_pos = POS(0, 1); + struct bpos start_pos = bpos_max(min_pos, POS(0, c->ec_stripe_hint)); + int ret; + + if (!h->s->res.sectors) { + ret = bch2_disk_reservation_get(c, &h->s->res, + h->blocksize, + h->s->nr_parity, + BCH_DISK_RESERVATION_NOFAIL); + if (ret) + return ret; + } + + for_each_btree_key_norestart(trans, iter, BTREE_ID_stripes, start_pos, + BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) { + if (bkey_gt(k.k->p, POS(0, U32_MAX))) { + if (start_pos.offset) { + start_pos = min_pos; + bch2_btree_iter_set_pos(&iter, start_pos); + continue; + } + + ret = -BCH_ERR_ENOSPC_stripe_create; + break; + } + + if (bkey_deleted(k.k) && + bch2_try_open_stripe(c, h->s, k.k->p.offset)) + break; + } + + c->ec_stripe_hint = iter.pos.offset; + + if (ret) + goto err; + + ret = ec_stripe_mem_alloc(trans, &iter); + if (ret) { + bch2_stripe_close(c, h->s); + goto err; + } + + h->s->new_stripe.key.k.p = iter.pos; +out: + bch2_trans_iter_exit(trans, &iter); + return ret; +err: + bch2_disk_reservation_put(c, &h->s->res); + goto out; +} + +struct ec_stripe_head *bch2_ec_stripe_head_get(struct btree_trans *trans, + unsigned target, + unsigned algo, + unsigned redundancy, + enum bch_watermark watermark, + struct closure *cl) +{ + struct bch_fs *c = trans->c; + struct ec_stripe_head *h; + bool waiting = false; + int ret; + + h = __bch2_ec_stripe_head_get(trans, target, algo, redundancy, watermark); + if (IS_ERR_OR_NULL(h)) + return h; + + if (!h->s) { + ret = ec_new_stripe_alloc(c, h); + if (ret) { + bch_err(c, "failed to allocate new stripe"); + goto err; + } + } + + if (h->s->allocated) + goto allocated; + + if (h->s->have_existing_stripe) + goto alloc_existing; + + /* First, try to allocate a full stripe: */ + ret = new_stripe_alloc_buckets(trans, h, BCH_WATERMARK_stripe, NULL) ?: + __bch2_ec_stripe_head_reserve(trans, h); + if (!ret) + goto allocate_buf; + if (bch2_err_matches(ret, BCH_ERR_transaction_restart) || + bch2_err_matches(ret, ENOMEM)) + goto err; + + /* + * Not enough buckets available for a full stripe: we must reuse an + * existing stripe: + */ + while (1) { + ret = __bch2_ec_stripe_head_reuse(trans, h); + if (!ret) + break; + if (waiting || !cl || ret != -BCH_ERR_stripe_alloc_blocked) + goto err; + + if (watermark == BCH_WATERMARK_copygc) { + ret = new_stripe_alloc_buckets(trans, h, watermark, NULL) ?: + __bch2_ec_stripe_head_reserve(trans, h); + if (ret) + goto err; + goto allocate_buf; + } + + /* XXX freelist_wait? */ + closure_wait(&c->freelist_wait, cl); + waiting = true; + } + + if (waiting) + closure_wake_up(&c->freelist_wait); +alloc_existing: + /* + * Retry allocating buckets, with the watermark for this + * particular write: + */ + ret = new_stripe_alloc_buckets(trans, h, watermark, cl); + if (ret) + goto err; + +allocate_buf: + ret = ec_stripe_buf_init(&h->s->new_stripe, 0, h->blocksize); + if (ret) + goto err; + + h->s->allocated = true; +allocated: + BUG_ON(!h->s->idx); + BUG_ON(!h->s->new_stripe.data[0]); + BUG_ON(trans->restarted); + return h; +err: + bch2_ec_stripe_head_put(c, h); + return ERR_PTR(ret); +} + +static void __bch2_ec_stop(struct bch_fs *c, struct bch_dev *ca) +{ + struct ec_stripe_head *h; + struct open_bucket *ob; + unsigned i; + + mutex_lock(&c->ec_stripe_head_lock); + list_for_each_entry(h, &c->ec_stripe_head_list, list) { + mutex_lock(&h->lock); + if (!h->s) + goto unlock; + + if (!ca) + goto found; + + for (i = 0; i < bkey_i_to_stripe(&h->s->new_stripe.key)->v.nr_blocks; i++) { + if (!h->s->blocks[i]) + continue; + + ob = c->open_buckets + h->s->blocks[i]; + if (ob->dev == ca->dev_idx) + goto found; + } + goto unlock; +found: + h->s->err = -BCH_ERR_erofs_no_writes; + ec_stripe_set_pending(c, h); +unlock: + mutex_unlock(&h->lock); + } + mutex_unlock(&c->ec_stripe_head_lock); +} + +void bch2_ec_stop_dev(struct bch_fs *c, struct bch_dev *ca) +{ + __bch2_ec_stop(c, ca); +} + +void bch2_fs_ec_stop(struct bch_fs *c) +{ + __bch2_ec_stop(c, NULL); +} + +static bool bch2_fs_ec_flush_done(struct bch_fs *c) +{ + bool ret; + + mutex_lock(&c->ec_stripe_new_lock); + ret = list_empty(&c->ec_stripe_new_list); + mutex_unlock(&c->ec_stripe_new_lock); + + return ret; +} + +void bch2_fs_ec_flush(struct bch_fs *c) +{ + wait_event(c->ec_stripe_new_wait, bch2_fs_ec_flush_done(c)); +} + +int bch2_stripes_read(struct bch_fs *c) +{ + struct btree_trans *trans = bch2_trans_get(c); + struct btree_iter iter; + struct bkey_s_c k; + const struct bch_stripe *s; + struct stripe *m; + unsigned i; + int ret; + + for_each_btree_key(trans, iter, BTREE_ID_stripes, POS_MIN, + BTREE_ITER_PREFETCH, k, ret) { + if (k.k->type != KEY_TYPE_stripe) + continue; + + ret = __ec_stripe_mem_alloc(c, k.k->p.offset, GFP_KERNEL); + if (ret) + break; + + s = bkey_s_c_to_stripe(k).v; + + m = genradix_ptr(&c->stripes, k.k->p.offset); + m->sectors = le16_to_cpu(s->sectors); + m->algorithm = s->algorithm; + m->nr_blocks = s->nr_blocks; + m->nr_redundant = s->nr_redundant; + m->blocks_nonempty = 0; + + for (i = 0; i < s->nr_blocks; i++) + m->blocks_nonempty += !!stripe_blockcount_get(s, i); + + bch2_stripes_heap_insert(c, m, k.k->p.offset); + } + bch2_trans_iter_exit(trans, &iter); + + bch2_trans_put(trans); + + if (ret) + bch_err_fn(c, ret); + + return ret; +} + +void bch2_stripes_heap_to_text(struct printbuf *out, struct bch_fs *c) +{ + ec_stripes_heap *h = &c->ec_stripes_heap; + struct stripe *m; + size_t i; + + mutex_lock(&c->ec_stripes_heap_lock); + for (i = 0; i < min_t(size_t, h->used, 50); i++) { + m = genradix_ptr(&c->stripes, h->data[i].idx); + + prt_printf(out, "%zu %u/%u+%u", h->data[i].idx, + h->data[i].blocks_nonempty, + m->nr_blocks - m->nr_redundant, + m->nr_redundant); + if (bch2_stripe_is_open(c, h->data[i].idx)) + prt_str(out, " open"); + prt_newline(out); + } + mutex_unlock(&c->ec_stripes_heap_lock); +} + +void bch2_new_stripes_to_text(struct printbuf *out, struct bch_fs *c) +{ + struct ec_stripe_head *h; + struct ec_stripe_new *s; + + mutex_lock(&c->ec_stripe_head_lock); + list_for_each_entry(h, &c->ec_stripe_head_list, list) { + prt_printf(out, "target %u algo %u redundancy %u %s:\n", + h->target, h->algo, h->redundancy, + bch2_watermarks[h->watermark]); + + if (h->s) + prt_printf(out, "\tidx %llu blocks %u+%u allocated %u\n", + h->s->idx, h->s->nr_data, h->s->nr_parity, + bitmap_weight(h->s->blocks_allocated, + h->s->nr_data)); + } + mutex_unlock(&c->ec_stripe_head_lock); + + prt_printf(out, "in flight:\n"); + + mutex_lock(&c->ec_stripe_new_lock); + list_for_each_entry(s, &c->ec_stripe_new_list, list) { + prt_printf(out, "\tidx %llu blocks %u+%u ref %u %u %s\n", + s->idx, s->nr_data, s->nr_parity, + atomic_read(&s->ref[STRIPE_REF_io]), + atomic_read(&s->ref[STRIPE_REF_stripe]), + bch2_watermarks[s->h->watermark]); + } + mutex_unlock(&c->ec_stripe_new_lock); +} + +void bch2_fs_ec_exit(struct bch_fs *c) +{ + struct ec_stripe_head *h; + unsigned i; + + while (1) { + mutex_lock(&c->ec_stripe_head_lock); + h = list_first_entry_or_null(&c->ec_stripe_head_list, + struct ec_stripe_head, list); + if (h) + list_del(&h->list); + mutex_unlock(&c->ec_stripe_head_lock); + if (!h) + break; + + if (h->s) { + for (i = 0; i < bkey_i_to_stripe(&h->s->new_stripe.key)->v.nr_blocks; i++) + BUG_ON(h->s->blocks[i]); + + kfree(h->s); + } + kfree(h); + } + + BUG_ON(!list_empty(&c->ec_stripe_new_list)); + + free_heap(&c->ec_stripes_heap); + genradix_free(&c->stripes); + bioset_exit(&c->ec_bioset); +} + +void bch2_fs_ec_init_early(struct bch_fs *c) +{ + spin_lock_init(&c->ec_stripes_new_lock); + mutex_init(&c->ec_stripes_heap_lock); + + INIT_LIST_HEAD(&c->ec_stripe_head_list); + mutex_init(&c->ec_stripe_head_lock); + + INIT_LIST_HEAD(&c->ec_stripe_new_list); + mutex_init(&c->ec_stripe_new_lock); + init_waitqueue_head(&c->ec_stripe_new_wait); + + INIT_WORK(&c->ec_stripe_create_work, ec_stripe_create_work); + INIT_WORK(&c->ec_stripe_delete_work, ec_stripe_delete_work); +} + +int bch2_fs_ec_init(struct bch_fs *c) +{ + return bioset_init(&c->ec_bioset, 1, offsetof(struct ec_bio, bio), + BIOSET_NEED_BVECS); +} diff --git a/fs/bcachefs/ec.h b/fs/bcachefs/ec.h new file mode 100644 index 000000000000..7d0237c9819f --- /dev/null +++ b/fs/bcachefs/ec.h @@ -0,0 +1,260 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_EC_H +#define _BCACHEFS_EC_H + +#include "ec_types.h" +#include "buckets_types.h" +#include "extents_types.h" + +enum bkey_invalid_flags; + +int bch2_stripe_invalid(struct bch_fs *, struct bkey_s_c, + enum bkey_invalid_flags, struct printbuf *); +void bch2_stripe_to_text(struct printbuf *, struct bch_fs *, + struct bkey_s_c); + +#define bch2_bkey_ops_stripe ((struct bkey_ops) { \ + .key_invalid = bch2_stripe_invalid, \ + .val_to_text = bch2_stripe_to_text, \ + .swab = bch2_ptr_swab, \ + .trans_trigger = bch2_trans_mark_stripe, \ + .atomic_trigger = bch2_mark_stripe, \ + .min_val_size = 8, \ +}) + +static inline unsigned stripe_csums_per_device(const struct bch_stripe *s) +{ + return DIV_ROUND_UP(le16_to_cpu(s->sectors), + 1 << s->csum_granularity_bits); +} + +static inline unsigned stripe_csum_offset(const struct bch_stripe *s, + unsigned dev, unsigned csum_idx) +{ + unsigned csum_bytes = bch_crc_bytes[s->csum_type]; + + return sizeof(struct bch_stripe) + + sizeof(struct bch_extent_ptr) * s->nr_blocks + + (dev * stripe_csums_per_device(s) + csum_idx) * csum_bytes; +} + +static inline unsigned stripe_blockcount_offset(const struct bch_stripe *s, + unsigned idx) +{ + return stripe_csum_offset(s, s->nr_blocks, 0) + + sizeof(u16) * idx; +} + +static inline unsigned stripe_blockcount_get(const struct bch_stripe *s, + unsigned idx) +{ + return le16_to_cpup((void *) s + stripe_blockcount_offset(s, idx)); +} + +static inline void stripe_blockcount_set(struct bch_stripe *s, + unsigned idx, unsigned v) +{ + __le16 *p = (void *) s + stripe_blockcount_offset(s, idx); + + *p = cpu_to_le16(v); +} + +static inline unsigned stripe_val_u64s(const struct bch_stripe *s) +{ + return DIV_ROUND_UP(stripe_blockcount_offset(s, s->nr_blocks), + sizeof(u64)); +} + +static inline void *stripe_csum(struct bch_stripe *s, + unsigned block, unsigned csum_idx) +{ + EBUG_ON(block >= s->nr_blocks); + EBUG_ON(csum_idx >= stripe_csums_per_device(s)); + + return (void *) s + stripe_csum_offset(s, block, csum_idx); +} + +static inline struct bch_csum stripe_csum_get(struct bch_stripe *s, + unsigned block, unsigned csum_idx) +{ + struct bch_csum csum = { 0 }; + + memcpy(&csum, stripe_csum(s, block, csum_idx), bch_crc_bytes[s->csum_type]); + return csum; +} + +static inline void stripe_csum_set(struct bch_stripe *s, + unsigned block, unsigned csum_idx, + struct bch_csum csum) +{ + memcpy(stripe_csum(s, block, csum_idx), &csum, bch_crc_bytes[s->csum_type]); +} + +static inline bool __bch2_ptr_matches_stripe(const struct bch_extent_ptr *stripe_ptr, + const struct bch_extent_ptr *data_ptr, + unsigned sectors) +{ + return data_ptr->dev == stripe_ptr->dev && + data_ptr->gen == stripe_ptr->gen && + data_ptr->offset >= stripe_ptr->offset && + data_ptr->offset < stripe_ptr->offset + sectors; +} + +static inline bool bch2_ptr_matches_stripe(const struct bch_stripe *s, + struct extent_ptr_decoded p) +{ + unsigned nr_data = s->nr_blocks - s->nr_redundant; + + BUG_ON(!p.has_ec); + + if (p.ec.block >= nr_data) + return false; + + return __bch2_ptr_matches_stripe(&s->ptrs[p.ec.block], &p.ptr, + le16_to_cpu(s->sectors)); +} + +static inline bool bch2_ptr_matches_stripe_m(const struct gc_stripe *m, + struct extent_ptr_decoded p) +{ + unsigned nr_data = m->nr_blocks - m->nr_redundant; + + BUG_ON(!p.has_ec); + + if (p.ec.block >= nr_data) + return false; + + return __bch2_ptr_matches_stripe(&m->ptrs[p.ec.block], &p.ptr, + m->sectors); +} + +struct bch_read_bio; + +struct ec_stripe_buf { + /* might not be buffering the entire stripe: */ + unsigned offset; + unsigned size; + unsigned long valid[BITS_TO_LONGS(BCH_BKEY_PTRS_MAX)]; + + void *data[BCH_BKEY_PTRS_MAX]; + + __BKEY_PADDED(key, 255); +}; + +struct ec_stripe_head; + +enum ec_stripe_ref { + STRIPE_REF_io, + STRIPE_REF_stripe, + STRIPE_REF_NR +}; + +struct ec_stripe_new { + struct bch_fs *c; + struct ec_stripe_head *h; + struct mutex lock; + struct list_head list; + + struct hlist_node hash; + u64 idx; + + struct closure iodone; + + atomic_t ref[STRIPE_REF_NR]; + + int err; + + u8 nr_data; + u8 nr_parity; + bool allocated; + bool pending; + bool have_existing_stripe; + + unsigned long blocks_gotten[BITS_TO_LONGS(BCH_BKEY_PTRS_MAX)]; + unsigned long blocks_allocated[BITS_TO_LONGS(BCH_BKEY_PTRS_MAX)]; + open_bucket_idx_t blocks[BCH_BKEY_PTRS_MAX]; + struct disk_reservation res; + + struct ec_stripe_buf new_stripe; + struct ec_stripe_buf existing_stripe; +}; + +struct ec_stripe_head { + struct list_head list; + struct mutex lock; + + unsigned target; + unsigned algo; + unsigned redundancy; + enum bch_watermark watermark; + + struct bch_devs_mask devs; + unsigned nr_active_devs; + + unsigned blocksize; + + struct dev_stripe_state block_stripe; + struct dev_stripe_state parity_stripe; + + struct ec_stripe_new *s; +}; + +int bch2_ec_read_extent(struct btree_trans *, struct bch_read_bio *); + +void *bch2_writepoint_ec_buf(struct bch_fs *, struct write_point *); + +void bch2_ec_bucket_cancel(struct bch_fs *, struct open_bucket *); + +int bch2_ec_stripe_new_alloc(struct bch_fs *, struct ec_stripe_head *); + +void bch2_ec_stripe_head_put(struct bch_fs *, struct ec_stripe_head *); +struct ec_stripe_head *bch2_ec_stripe_head_get(struct btree_trans *, + unsigned, unsigned, unsigned, + enum bch_watermark, struct closure *); + +void bch2_stripes_heap_update(struct bch_fs *, struct stripe *, size_t); +void bch2_stripes_heap_del(struct bch_fs *, struct stripe *, size_t); +void bch2_stripes_heap_insert(struct bch_fs *, struct stripe *, size_t); + +void bch2_do_stripe_deletes(struct bch_fs *); +void bch2_ec_do_stripe_creates(struct bch_fs *); +void bch2_ec_stripe_new_free(struct bch_fs *, struct ec_stripe_new *); + +static inline void ec_stripe_new_get(struct ec_stripe_new *s, + enum ec_stripe_ref ref) +{ + atomic_inc(&s->ref[ref]); +} + +static inline void ec_stripe_new_put(struct bch_fs *c, struct ec_stripe_new *s, + enum ec_stripe_ref ref) +{ + BUG_ON(atomic_read(&s->ref[ref]) <= 0); + + if (atomic_dec_and_test(&s->ref[ref])) + switch (ref) { + case STRIPE_REF_stripe: + bch2_ec_stripe_new_free(c, s); + break; + case STRIPE_REF_io: + bch2_ec_do_stripe_creates(c); + break; + default: + BUG(); + } +} + +void bch2_ec_stop_dev(struct bch_fs *, struct bch_dev *); +void bch2_fs_ec_stop(struct bch_fs *); +void bch2_fs_ec_flush(struct bch_fs *); + +int bch2_stripes_read(struct bch_fs *); + +void bch2_stripes_heap_to_text(struct printbuf *, struct bch_fs *); +void bch2_new_stripes_to_text(struct printbuf *, struct bch_fs *); + +void bch2_fs_ec_exit(struct bch_fs *); +void bch2_fs_ec_init_early(struct bch_fs *); +int bch2_fs_ec_init(struct bch_fs *); + +#endif /* _BCACHEFS_EC_H */ diff --git a/fs/bcachefs/ec_types.h b/fs/bcachefs/ec_types.h new file mode 100644 index 000000000000..e2b02a82de32 --- /dev/null +++ b/fs/bcachefs/ec_types.h @@ -0,0 +1,41 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_EC_TYPES_H +#define _BCACHEFS_EC_TYPES_H + +#include "bcachefs_format.h" + +struct bch_replicas_padded { + struct bch_replicas_entry e; + u8 pad[BCH_BKEY_PTRS_MAX]; +}; + +struct stripe { + size_t heap_idx; + u16 sectors; + u8 algorithm; + u8 nr_blocks; + u8 nr_redundant; + u8 blocks_nonempty; +}; + +struct gc_stripe { + u16 sectors; + + u8 nr_blocks; + u8 nr_redundant; + + unsigned alive:1; /* does a corresponding key exist in stripes btree? */ + u16 block_sectors[BCH_BKEY_PTRS_MAX]; + struct bch_extent_ptr ptrs[BCH_BKEY_PTRS_MAX]; + + struct bch_replicas_padded r; +}; + +struct ec_stripe_heap_entry { + size_t idx; + unsigned blocks_nonempty; +}; + +typedef HEAP(struct ec_stripe_heap_entry) ec_stripes_heap; + +#endif /* _BCACHEFS_EC_TYPES_H */ diff --git a/fs/bcachefs/errcode.c b/fs/bcachefs/errcode.c new file mode 100644 index 000000000000..d260ff9bbfeb --- /dev/null +++ b/fs/bcachefs/errcode.c @@ -0,0 +1,68 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" +#include "errcode.h" + +#include <linux/errname.h> + +static const char * const bch2_errcode_strs[] = { +#define x(class, err) [BCH_ERR_##err - BCH_ERR_START] = #err, + BCH_ERRCODES() +#undef x + NULL +}; + +static unsigned bch2_errcode_parents[] = { +#define x(class, err) [BCH_ERR_##err - BCH_ERR_START] = class, + BCH_ERRCODES() +#undef x +}; + +const char *bch2_err_str(int err) +{ + const char *errstr; + + err = abs(err); + + BUG_ON(err >= BCH_ERR_MAX); + + if (err >= BCH_ERR_START) + errstr = bch2_errcode_strs[err - BCH_ERR_START]; + else if (err) + errstr = errname(err); + else + errstr = "(No error)"; + return errstr ?: "(Invalid error)"; +} + +bool __bch2_err_matches(int err, int class) +{ + err = abs(err); + class = abs(class); + + BUG_ON(err >= BCH_ERR_MAX); + BUG_ON(class >= BCH_ERR_MAX); + + while (err >= BCH_ERR_START && err != class) + err = bch2_errcode_parents[err - BCH_ERR_START]; + + return err == class; +} + +int __bch2_err_class(int err) +{ + err = -err; + BUG_ON((unsigned) err >= BCH_ERR_MAX); + + while (err >= BCH_ERR_START && bch2_errcode_parents[err - BCH_ERR_START]) + err = bch2_errcode_parents[err - BCH_ERR_START]; + + return -err; +} + +const char *bch2_blk_status_to_str(blk_status_t status) +{ + if (status == BLK_STS_REMOVED) + return "device removed"; + return blk_status_to_str(status); +} diff --git a/fs/bcachefs/errcode.h b/fs/bcachefs/errcode.h new file mode 100644 index 000000000000..ae7910bf2228 --- /dev/null +++ b/fs/bcachefs/errcode.h @@ -0,0 +1,270 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_ERRCODE_H +#define _BCACHEFS_ERRCODE_H + +#define BCH_ERRCODES() \ + x(ERANGE, ERANGE_option_too_small) \ + x(ERANGE, ERANGE_option_too_big) \ + x(ENOMEM, ENOMEM_stripe_buf) \ + x(ENOMEM, ENOMEM_replicas_table) \ + x(ENOMEM, ENOMEM_cpu_replicas) \ + x(ENOMEM, ENOMEM_replicas_gc) \ + x(ENOMEM, ENOMEM_disk_groups_validate) \ + x(ENOMEM, ENOMEM_disk_groups_to_cpu) \ + x(ENOMEM, ENOMEM_mark_snapshot) \ + x(ENOMEM, ENOMEM_mark_stripe) \ + x(ENOMEM, ENOMEM_mark_stripe_ptr) \ + x(ENOMEM, ENOMEM_btree_key_cache_create) \ + x(ENOMEM, ENOMEM_btree_key_cache_fill) \ + x(ENOMEM, ENOMEM_btree_key_cache_insert) \ + x(ENOMEM, ENOMEM_trans_kmalloc) \ + x(ENOMEM, ENOMEM_trans_log_msg) \ + x(ENOMEM, ENOMEM_do_encrypt) \ + x(ENOMEM, ENOMEM_ec_read_extent) \ + x(ENOMEM, ENOMEM_ec_stripe_mem_alloc) \ + x(ENOMEM, ENOMEM_ec_new_stripe_alloc) \ + x(ENOMEM, ENOMEM_fs_btree_cache_init) \ + x(ENOMEM, ENOMEM_fs_btree_key_cache_init) \ + x(ENOMEM, ENOMEM_fs_counters_init) \ + x(ENOMEM, ENOMEM_fs_btree_write_buffer_init) \ + x(ENOMEM, ENOMEM_io_clock_init) \ + x(ENOMEM, ENOMEM_blacklist_table_init) \ + x(ENOMEM, ENOMEM_sb_realloc_injected) \ + x(ENOMEM, ENOMEM_sb_bio_realloc) \ + x(ENOMEM, ENOMEM_sb_buf_realloc) \ + x(ENOMEM, ENOMEM_sb_journal_validate) \ + x(ENOMEM, ENOMEM_sb_journal_v2_validate) \ + x(ENOMEM, ENOMEM_journal_entry_add) \ + x(ENOMEM, ENOMEM_journal_read_buf_realloc) \ + x(ENOMEM, ENOMEM_btree_interior_update_worker_init)\ + x(ENOMEM, ENOMEM_btree_interior_update_pool_init) \ + x(ENOMEM, ENOMEM_bio_read_init) \ + x(ENOMEM, ENOMEM_bio_read_split_init) \ + x(ENOMEM, ENOMEM_bio_write_init) \ + x(ENOMEM, ENOMEM_bio_bounce_pages_init) \ + x(ENOMEM, ENOMEM_writepage_bioset_init) \ + x(ENOMEM, ENOMEM_dio_read_bioset_init) \ + x(ENOMEM, ENOMEM_dio_write_bioset_init) \ + x(ENOMEM, ENOMEM_nocow_flush_bioset_init) \ + x(ENOMEM, ENOMEM_promote_table_init) \ + x(ENOMEM, ENOMEM_compression_bounce_read_init) \ + x(ENOMEM, ENOMEM_compression_bounce_write_init) \ + x(ENOMEM, ENOMEM_compression_workspace_init) \ + x(ENOMEM, ENOMEM_decompression_workspace_init) \ + x(ENOMEM, ENOMEM_bucket_gens) \ + x(ENOMEM, ENOMEM_buckets_nouse) \ + x(ENOMEM, ENOMEM_usage_init) \ + x(ENOMEM, ENOMEM_btree_node_read_all_replicas) \ + x(ENOMEM, ENOMEM_btree_node_reclaim) \ + x(ENOMEM, ENOMEM_btree_node_mem_alloc) \ + x(ENOMEM, ENOMEM_btree_cache_cannibalize_lock) \ + x(ENOMEM, ENOMEM_buckets_waiting_for_journal_init)\ + x(ENOMEM, ENOMEM_buckets_waiting_for_journal_set) \ + x(ENOMEM, ENOMEM_set_nr_journal_buckets) \ + x(ENOMEM, ENOMEM_dev_journal_init) \ + x(ENOMEM, ENOMEM_journal_pin_fifo) \ + x(ENOMEM, ENOMEM_journal_buf) \ + x(ENOMEM, ENOMEM_gc_start) \ + x(ENOMEM, ENOMEM_gc_alloc_start) \ + x(ENOMEM, ENOMEM_gc_reflink_start) \ + x(ENOMEM, ENOMEM_gc_gens) \ + x(ENOMEM, ENOMEM_gc_repair_key) \ + x(ENOMEM, ENOMEM_fsck_extent_ends_at) \ + x(ENOMEM, ENOMEM_fsck_add_nlink) \ + x(ENOMEM, ENOMEM_journal_key_insert) \ + x(ENOMEM, ENOMEM_journal_keys_sort) \ + x(ENOMEM, ENOMEM_journal_replay) \ + x(ENOMEM, ENOMEM_read_superblock_clean) \ + x(ENOMEM, ENOMEM_fs_alloc) \ + x(ENOMEM, ENOMEM_fs_name_alloc) \ + x(ENOMEM, ENOMEM_fs_other_alloc) \ + x(ENOMEM, ENOMEM_dev_alloc) \ + x(ENOSPC, ENOSPC_disk_reservation) \ + x(ENOSPC, ENOSPC_bucket_alloc) \ + x(ENOSPC, ENOSPC_disk_label_add) \ + x(ENOSPC, ENOSPC_stripe_create) \ + x(ENOSPC, ENOSPC_inode_create) \ + x(ENOSPC, ENOSPC_str_hash_create) \ + x(ENOSPC, ENOSPC_snapshot_create) \ + x(ENOSPC, ENOSPC_subvolume_create) \ + x(ENOSPC, ENOSPC_sb) \ + x(ENOSPC, ENOSPC_sb_journal) \ + x(ENOSPC, ENOSPC_sb_journal_seq_blacklist) \ + x(ENOSPC, ENOSPC_sb_quota) \ + x(ENOSPC, ENOSPC_sb_replicas) \ + x(ENOSPC, ENOSPC_sb_members) \ + x(ENOSPC, ENOSPC_sb_members_v2) \ + x(ENOSPC, ENOSPC_sb_crypt) \ + x(ENOSPC, ENOSPC_btree_slot) \ + x(ENOSPC, ENOSPC_snapshot_tree) \ + x(ENOENT, ENOENT_bkey_type_mismatch) \ + x(ENOENT, ENOENT_str_hash_lookup) \ + x(ENOENT, ENOENT_str_hash_set_must_replace) \ + x(ENOENT, ENOENT_inode) \ + x(ENOENT, ENOENT_not_subvol) \ + x(ENOENT, ENOENT_not_directory) \ + x(ENOENT, ENOENT_directory_dead) \ + x(ENOENT, ENOENT_subvolume) \ + x(ENOENT, ENOENT_snapshot_tree) \ + x(ENOENT, ENOENT_dirent_doesnt_match_inode) \ + x(ENOENT, ENOENT_dev_not_found) \ + x(ENOENT, ENOENT_dev_idx_not_found) \ + x(0, open_buckets_empty) \ + x(0, freelist_empty) \ + x(BCH_ERR_freelist_empty, no_buckets_found) \ + x(0, transaction_restart) \ + x(BCH_ERR_transaction_restart, transaction_restart_fault_inject) \ + x(BCH_ERR_transaction_restart, transaction_restart_relock) \ + x(BCH_ERR_transaction_restart, transaction_restart_relock_path) \ + x(BCH_ERR_transaction_restart, transaction_restart_relock_path_intent) \ + x(BCH_ERR_transaction_restart, transaction_restart_relock_after_fill) \ + x(BCH_ERR_transaction_restart, transaction_restart_too_many_iters) \ + x(BCH_ERR_transaction_restart, transaction_restart_lock_node_reused) \ + x(BCH_ERR_transaction_restart, transaction_restart_fill_relock) \ + x(BCH_ERR_transaction_restart, transaction_restart_fill_mem_alloc_fail)\ + x(BCH_ERR_transaction_restart, transaction_restart_mem_realloced) \ + x(BCH_ERR_transaction_restart, transaction_restart_in_traverse_all) \ + x(BCH_ERR_transaction_restart, transaction_restart_would_deadlock) \ + x(BCH_ERR_transaction_restart, transaction_restart_would_deadlock_write)\ + x(BCH_ERR_transaction_restart, transaction_restart_deadlock_recursion_limit)\ + x(BCH_ERR_transaction_restart, transaction_restart_upgrade) \ + x(BCH_ERR_transaction_restart, transaction_restart_key_cache_upgrade) \ + x(BCH_ERR_transaction_restart, transaction_restart_key_cache_fill) \ + x(BCH_ERR_transaction_restart, transaction_restart_key_cache_raced) \ + x(BCH_ERR_transaction_restart, transaction_restart_key_cache_realloced)\ + x(BCH_ERR_transaction_restart, transaction_restart_journal_preres_get) \ + x(BCH_ERR_transaction_restart, transaction_restart_split_race) \ + x(BCH_ERR_transaction_restart, transaction_restart_write_buffer_flush) \ + x(BCH_ERR_transaction_restart, transaction_restart_nested) \ + x(0, no_btree_node) \ + x(BCH_ERR_no_btree_node, no_btree_node_relock) \ + x(BCH_ERR_no_btree_node, no_btree_node_upgrade) \ + x(BCH_ERR_no_btree_node, no_btree_node_drop) \ + x(BCH_ERR_no_btree_node, no_btree_node_lock_root) \ + x(BCH_ERR_no_btree_node, no_btree_node_up) \ + x(BCH_ERR_no_btree_node, no_btree_node_down) \ + x(BCH_ERR_no_btree_node, no_btree_node_init) \ + x(BCH_ERR_no_btree_node, no_btree_node_cached) \ + x(BCH_ERR_no_btree_node, no_btree_node_srcu_reset) \ + x(0, btree_insert_fail) \ + x(BCH_ERR_btree_insert_fail, btree_insert_btree_node_full) \ + x(BCH_ERR_btree_insert_fail, btree_insert_need_mark_replicas) \ + x(BCH_ERR_btree_insert_fail, btree_insert_need_journal_res) \ + x(BCH_ERR_btree_insert_fail, btree_insert_need_journal_reclaim) \ + x(BCH_ERR_btree_insert_fail, btree_insert_need_flush_buffer) \ + x(0, backpointer_to_overwritten_btree_node) \ + x(0, lock_fail_root_changed) \ + x(0, journal_reclaim_would_deadlock) \ + x(EINVAL, fsck) \ + x(BCH_ERR_fsck, fsck_fix) \ + x(BCH_ERR_fsck, fsck_ignore) \ + x(BCH_ERR_fsck, fsck_errors_not_fixed) \ + x(BCH_ERR_fsck, fsck_repair_unimplemented) \ + x(BCH_ERR_fsck, fsck_repair_impossible) \ + x(0, restart_recovery) \ + x(0, data_update_done) \ + x(EINVAL, device_state_not_allowed) \ + x(EINVAL, member_info_missing) \ + x(EINVAL, mismatched_block_size) \ + x(EINVAL, block_size_too_small) \ + x(EINVAL, bucket_size_too_small) \ + x(EINVAL, device_size_too_small) \ + x(EINVAL, device_not_a_member_of_filesystem) \ + x(EINVAL, device_has_been_removed) \ + x(EINVAL, device_already_online) \ + x(EINVAL, insufficient_devices_to_start) \ + x(EINVAL, invalid) \ + x(EINVAL, internal_fsck_err) \ + x(EROFS, erofs_trans_commit) \ + x(EROFS, erofs_no_writes) \ + x(EROFS, erofs_journal_err) \ + x(EROFS, erofs_sb_err) \ + x(EROFS, erofs_unfixed_errors) \ + x(EROFS, erofs_norecovery) \ + x(EROFS, erofs_nochanges) \ + x(EROFS, insufficient_devices) \ + x(0, operation_blocked) \ + x(BCH_ERR_operation_blocked, btree_cache_cannibalize_lock_blocked) \ + x(BCH_ERR_operation_blocked, journal_res_get_blocked) \ + x(BCH_ERR_operation_blocked, journal_preres_get_blocked) \ + x(BCH_ERR_operation_blocked, bucket_alloc_blocked) \ + x(BCH_ERR_operation_blocked, stripe_alloc_blocked) \ + x(BCH_ERR_invalid, invalid_sb) \ + x(BCH_ERR_invalid_sb, invalid_sb_magic) \ + x(BCH_ERR_invalid_sb, invalid_sb_version) \ + x(BCH_ERR_invalid_sb, invalid_sb_features) \ + x(BCH_ERR_invalid_sb, invalid_sb_too_big) \ + x(BCH_ERR_invalid_sb, invalid_sb_csum_type) \ + x(BCH_ERR_invalid_sb, invalid_sb_csum) \ + x(BCH_ERR_invalid_sb, invalid_sb_block_size) \ + x(BCH_ERR_invalid_sb, invalid_sb_uuid) \ + x(BCH_ERR_invalid_sb, invalid_sb_too_many_members) \ + x(BCH_ERR_invalid_sb, invalid_sb_dev_idx) \ + x(BCH_ERR_invalid_sb, invalid_sb_time_precision) \ + x(BCH_ERR_invalid_sb, invalid_sb_field_size) \ + x(BCH_ERR_invalid_sb, invalid_sb_layout) \ + x(BCH_ERR_invalid_sb_layout, invalid_sb_layout_type) \ + x(BCH_ERR_invalid_sb_layout, invalid_sb_layout_nr_superblocks) \ + x(BCH_ERR_invalid_sb_layout, invalid_sb_layout_superblocks_overlap) \ + x(BCH_ERR_invalid_sb, invalid_sb_members_missing) \ + x(BCH_ERR_invalid_sb, invalid_sb_members) \ + x(BCH_ERR_invalid_sb, invalid_sb_disk_groups) \ + x(BCH_ERR_invalid_sb, invalid_sb_replicas) \ + x(BCH_ERR_invalid_sb, invalid_replicas_entry) \ + x(BCH_ERR_invalid_sb, invalid_sb_journal) \ + x(BCH_ERR_invalid_sb, invalid_sb_journal_seq_blacklist) \ + x(BCH_ERR_invalid_sb, invalid_sb_crypt) \ + x(BCH_ERR_invalid_sb, invalid_sb_clean) \ + x(BCH_ERR_invalid_sb, invalid_sb_quota) \ + x(BCH_ERR_invalid_sb, invalid_sb_errors) \ + x(BCH_ERR_invalid_sb, invalid_sb_opt_compression) \ + x(BCH_ERR_invalid, invalid_bkey) \ + x(BCH_ERR_operation_blocked, nocow_lock_blocked) \ + x(EIO, btree_node_read_err) \ + x(BCH_ERR_btree_node_read_err, btree_node_read_err_fixable) \ + x(BCH_ERR_btree_node_read_err, btree_node_read_err_want_retry) \ + x(BCH_ERR_btree_node_read_err, btree_node_read_err_must_retry) \ + x(BCH_ERR_btree_node_read_err, btree_node_read_err_bad_node) \ + x(BCH_ERR_btree_node_read_err, btree_node_read_err_incompatible) \ + x(0, nopromote) \ + x(BCH_ERR_nopromote, nopromote_may_not) \ + x(BCH_ERR_nopromote, nopromote_already_promoted) \ + x(BCH_ERR_nopromote, nopromote_unwritten) \ + x(BCH_ERR_nopromote, nopromote_congested) \ + x(BCH_ERR_nopromote, nopromote_in_flight) \ + x(BCH_ERR_nopromote, nopromote_enomem) + +enum bch_errcode { + BCH_ERR_START = 2048, +#define x(class, err) BCH_ERR_##err, + BCH_ERRCODES() +#undef x + BCH_ERR_MAX +}; + +const char *bch2_err_str(int); +bool __bch2_err_matches(int, int); + +static inline bool _bch2_err_matches(int err, int class) +{ + return err < 0 && __bch2_err_matches(err, class); +} + +#define bch2_err_matches(_err, _class) \ +({ \ + BUILD_BUG_ON(!__builtin_constant_p(_class)); \ + unlikely(_bch2_err_matches(_err, _class)); \ +}) + +int __bch2_err_class(int); + +static inline long bch2_err_class(long err) +{ + return err < 0 ? __bch2_err_class(err) : err; +} + +#define BLK_STS_REMOVED ((__force blk_status_t)128) + +const char *bch2_blk_status_to_str(blk_status_t); + +#endif /* _BCACHFES_ERRCODE_H */ diff --git a/fs/bcachefs/error.c b/fs/bcachefs/error.c new file mode 100644 index 000000000000..7b28d37922fd --- /dev/null +++ b/fs/bcachefs/error.c @@ -0,0 +1,299 @@ +// SPDX-License-Identifier: GPL-2.0 +#include "bcachefs.h" +#include "error.h" +#include "super.h" + +#define FSCK_ERR_RATELIMIT_NR 10 + +bool bch2_inconsistent_error(struct bch_fs *c) +{ + set_bit(BCH_FS_ERROR, &c->flags); + + switch (c->opts.errors) { + case BCH_ON_ERROR_continue: + return false; + case BCH_ON_ERROR_ro: + if (bch2_fs_emergency_read_only(c)) + bch_err(c, "inconsistency detected - emergency read only"); + return true; + case BCH_ON_ERROR_panic: + panic(bch2_fmt(c, "panic after error")); + return true; + default: + BUG(); + } +} + +void bch2_topology_error(struct bch_fs *c) +{ + set_bit(BCH_FS_TOPOLOGY_ERROR, &c->flags); + if (test_bit(BCH_FS_FSCK_DONE, &c->flags)) + bch2_inconsistent_error(c); +} + +void bch2_fatal_error(struct bch_fs *c) +{ + if (bch2_fs_emergency_read_only(c)) + bch_err(c, "fatal error - emergency read only"); +} + +void bch2_io_error_work(struct work_struct *work) +{ + struct bch_dev *ca = container_of(work, struct bch_dev, io_error_work); + struct bch_fs *c = ca->fs; + bool dev; + + down_write(&c->state_lock); + dev = bch2_dev_state_allowed(c, ca, BCH_MEMBER_STATE_ro, + BCH_FORCE_IF_DEGRADED); + if (dev + ? __bch2_dev_set_state(c, ca, BCH_MEMBER_STATE_ro, + BCH_FORCE_IF_DEGRADED) + : bch2_fs_emergency_read_only(c)) + bch_err(ca, + "too many IO errors, setting %s RO", + dev ? "device" : "filesystem"); + up_write(&c->state_lock); +} + +void bch2_io_error(struct bch_dev *ca, enum bch_member_error_type type) +{ + atomic64_inc(&ca->errors[type]); + //queue_work(system_long_wq, &ca->io_error_work); +} + +enum ask_yn { + YN_NO, + YN_YES, + YN_ALLNO, + YN_ALLYES, +}; + +#ifdef __KERNEL__ +#define bch2_fsck_ask_yn() YN_NO +#else + +#include "tools-util.h" + +enum ask_yn bch2_fsck_ask_yn(void) +{ + char *buf = NULL; + size_t buflen = 0; + bool ret; + + while (true) { + fputs(" (y,n, or Y,N for all errors of this type) ", stdout); + fflush(stdout); + + if (getline(&buf, &buflen, stdin) < 0) + die("error reading from standard input"); + + strim(buf); + if (strlen(buf) != 1) + continue; + + switch (buf[0]) { + case 'n': + return YN_NO; + case 'y': + return YN_YES; + case 'N': + return YN_ALLNO; + case 'Y': + return YN_ALLYES; + } + } + + free(buf); + return ret; +} + +#endif + +static struct fsck_err_state *fsck_err_get(struct bch_fs *c, const char *fmt) +{ + struct fsck_err_state *s; + + if (test_bit(BCH_FS_FSCK_DONE, &c->flags)) + return NULL; + + list_for_each_entry(s, &c->fsck_error_msgs, list) + if (s->fmt == fmt) { + /* + * move it to the head of the list: repeated fsck errors + * are common + */ + list_move(&s->list, &c->fsck_error_msgs); + return s; + } + + s = kzalloc(sizeof(*s), GFP_NOFS); + if (!s) { + if (!c->fsck_alloc_msgs_err) + bch_err(c, "kmalloc err, cannot ratelimit fsck errs"); + c->fsck_alloc_msgs_err = true; + return NULL; + } + + INIT_LIST_HEAD(&s->list); + s->fmt = fmt; + list_add(&s->list, &c->fsck_error_msgs); + return s; +} + +int bch2_fsck_err(struct bch_fs *c, + enum bch_fsck_flags flags, + enum bch_sb_error_id err, + const char *fmt, ...) +{ + struct fsck_err_state *s = NULL; + va_list args; + bool print = true, suppressing = false, inconsistent = false; + struct printbuf buf = PRINTBUF, *out = &buf; + int ret = -BCH_ERR_fsck_ignore; + + bch2_sb_error_count(c, err); + + va_start(args, fmt); + prt_vprintf(out, fmt, args); + va_end(args); + + mutex_lock(&c->fsck_error_msgs_lock); + s = fsck_err_get(c, fmt); + if (s) { + /* + * We may be called multiple times for the same error on + * transaction restart - this memoizes instead of asking the user + * multiple times for the same error: + */ + if (s->last_msg && !strcmp(buf.buf, s->last_msg)) { + ret = s->ret; + mutex_unlock(&c->fsck_error_msgs_lock); + printbuf_exit(&buf); + return ret; + } + + kfree(s->last_msg); + s->last_msg = kstrdup(buf.buf, GFP_KERNEL); + + if (c->opts.ratelimit_errors && + !(flags & FSCK_NO_RATELIMIT) && + s->nr >= FSCK_ERR_RATELIMIT_NR) { + if (s->nr == FSCK_ERR_RATELIMIT_NR) + suppressing = true; + else + print = false; + } + + s->nr++; + } + +#ifdef BCACHEFS_LOG_PREFIX + if (!strncmp(fmt, "bcachefs:", 9)) + prt_printf(out, bch2_log_msg(c, "")); +#endif + + if (test_bit(BCH_FS_FSCK_DONE, &c->flags)) { + if (c->opts.errors != BCH_ON_ERROR_continue || + !(flags & (FSCK_CAN_FIX|FSCK_CAN_IGNORE))) { + prt_str(out, ", shutting down"); + inconsistent = true; + ret = -BCH_ERR_fsck_errors_not_fixed; + } else if (flags & FSCK_CAN_FIX) { + prt_str(out, ", fixing"); + ret = -BCH_ERR_fsck_fix; + } else { + prt_str(out, ", continuing"); + ret = -BCH_ERR_fsck_ignore; + } + } else if (c->opts.fix_errors == FSCK_FIX_exit) { + prt_str(out, ", exiting"); + ret = -BCH_ERR_fsck_errors_not_fixed; + } else if (flags & FSCK_CAN_FIX) { + int fix = s && s->fix + ? s->fix + : c->opts.fix_errors; + + if (fix == FSCK_FIX_ask) { + int ask; + + prt_str(out, ": fix?"); + bch2_print_string_as_lines(KERN_ERR, out->buf); + print = false; + + ask = bch2_fsck_ask_yn(); + + if (ask >= YN_ALLNO && s) + s->fix = ask == YN_ALLNO + ? FSCK_FIX_no + : FSCK_FIX_yes; + + ret = ask & 1 + ? -BCH_ERR_fsck_fix + : -BCH_ERR_fsck_ignore; + } else if (fix == FSCK_FIX_yes || + (c->opts.nochanges && + !(flags & FSCK_CAN_IGNORE))) { + prt_str(out, ", fixing"); + ret = -BCH_ERR_fsck_fix; + } else { + prt_str(out, ", not fixing"); + } + } else if (flags & FSCK_NEED_FSCK) { + prt_str(out, " (run fsck to correct)"); + } else { + prt_str(out, " (repair unimplemented)"); + } + + if (ret == -BCH_ERR_fsck_ignore && + (c->opts.fix_errors == FSCK_FIX_exit || + !(flags & FSCK_CAN_IGNORE))) + ret = -BCH_ERR_fsck_errors_not_fixed; + + if (print) + bch2_print_string_as_lines(KERN_ERR, out->buf); + + if (!test_bit(BCH_FS_FSCK_DONE, &c->flags) && + (ret != -BCH_ERR_fsck_fix && + ret != -BCH_ERR_fsck_ignore)) + bch_err(c, "Unable to continue, halting"); + else if (suppressing) + bch_err(c, "Ratelimiting new instances of previous error"); + + if (s) + s->ret = ret; + + mutex_unlock(&c->fsck_error_msgs_lock); + + printbuf_exit(&buf); + + if (inconsistent) + bch2_inconsistent_error(c); + + if (ret == -BCH_ERR_fsck_fix) { + set_bit(BCH_FS_ERRORS_FIXED, &c->flags); + } else { + set_bit(BCH_FS_ERRORS_NOT_FIXED, &c->flags); + set_bit(BCH_FS_ERROR, &c->flags); + } + + return ret; +} + +void bch2_flush_fsck_errs(struct bch_fs *c) +{ + struct fsck_err_state *s, *n; + + mutex_lock(&c->fsck_error_msgs_lock); + + list_for_each_entry_safe(s, n, &c->fsck_error_msgs, list) { + if (s->ratelimited && s->last_msg) + bch_err(c, "Saw %llu errors like:\n %s", s->nr, s->last_msg); + + list_del(&s->list); + kfree(s->last_msg); + kfree(s); + } + + mutex_unlock(&c->fsck_error_msgs_lock); +} diff --git a/fs/bcachefs/error.h b/fs/bcachefs/error.h new file mode 100644 index 000000000000..d167d65986e0 --- /dev/null +++ b/fs/bcachefs/error.h @@ -0,0 +1,242 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_ERROR_H +#define _BCACHEFS_ERROR_H + +#include <linux/list.h> +#include <linux/printk.h> +#include "sb-errors.h" + +struct bch_dev; +struct bch_fs; +struct work_struct; + +/* + * XXX: separate out errors that indicate on disk data is inconsistent, and flag + * superblock as such + */ + +/* Error messages: */ + +/* + * Inconsistency errors: The on disk data is inconsistent. If these occur during + * initial recovery, they don't indicate a bug in the running code - we walk all + * the metadata before modifying anything. If they occur at runtime, they + * indicate either a bug in the running code or (less likely) data is being + * silently corrupted under us. + * + * XXX: audit all inconsistent errors and make sure they're all recoverable, in + * BCH_ON_ERROR_CONTINUE mode + */ + +bool bch2_inconsistent_error(struct bch_fs *); + +void bch2_topology_error(struct bch_fs *); + +#define bch2_fs_inconsistent(c, ...) \ +({ \ + bch_err(c, __VA_ARGS__); \ + bch2_inconsistent_error(c); \ +}) + +#define bch2_fs_inconsistent_on(cond, c, ...) \ +({ \ + bool _ret = unlikely(!!(cond)); \ + \ + if (_ret) \ + bch2_fs_inconsistent(c, __VA_ARGS__); \ + _ret; \ +}) + +/* + * Later we might want to mark only the particular device inconsistent, not the + * entire filesystem: + */ + +#define bch2_dev_inconsistent(ca, ...) \ +do { \ + bch_err(ca, __VA_ARGS__); \ + bch2_inconsistent_error((ca)->fs); \ +} while (0) + +#define bch2_dev_inconsistent_on(cond, ca, ...) \ +({ \ + bool _ret = unlikely(!!(cond)); \ + \ + if (_ret) \ + bch2_dev_inconsistent(ca, __VA_ARGS__); \ + _ret; \ +}) + +/* + * When a transaction update discovers or is causing a fs inconsistency, it's + * helpful to also dump the pending updates: + */ +#define bch2_trans_inconsistent(trans, ...) \ +({ \ + bch_err(trans->c, __VA_ARGS__); \ + bch2_dump_trans_updates(trans); \ + bch2_inconsistent_error(trans->c); \ +}) + +#define bch2_trans_inconsistent_on(cond, trans, ...) \ +({ \ + bool _ret = unlikely(!!(cond)); \ + \ + if (_ret) \ + bch2_trans_inconsistent(trans, __VA_ARGS__); \ + _ret; \ +}) + +/* + * Fsck errors: inconsistency errors we detect at mount time, and should ideally + * be able to repair: + */ + +struct fsck_err_state { + struct list_head list; + const char *fmt; + u64 nr; + bool ratelimited; + int ret; + int fix; + char *last_msg; +}; + +enum bch_fsck_flags { + FSCK_CAN_FIX = 1 << 0, + FSCK_CAN_IGNORE = 1 << 1, + FSCK_NEED_FSCK = 1 << 2, + FSCK_NO_RATELIMIT = 1 << 3, +}; + +#define fsck_err_count(_c, _err) bch2_sb_err_count(_c, BCH_FSCK_ERR_##_err) + +__printf(4, 5) __cold +int bch2_fsck_err(struct bch_fs *, + enum bch_fsck_flags, + enum bch_sb_error_id, + const char *, ...); +void bch2_flush_fsck_errs(struct bch_fs *); + +#define __fsck_err(c, _flags, _err_type, ...) \ +({ \ + int _ret = bch2_fsck_err(c, _flags, BCH_FSCK_ERR_##_err_type, \ + __VA_ARGS__); \ + \ + if (_ret != -BCH_ERR_fsck_fix && \ + _ret != -BCH_ERR_fsck_ignore) { \ + ret = _ret; \ + goto fsck_err; \ + } \ + \ + _ret == -BCH_ERR_fsck_fix; \ +}) + +/* These macros return true if error should be fixed: */ + +/* XXX: mark in superblock that filesystem contains errors, if we ignore: */ + +#define __fsck_err_on(cond, c, _flags, _err_type, ...) \ + (unlikely(cond) ? __fsck_err(c, _flags, _err_type, __VA_ARGS__) : false) + +#define need_fsck_err_on(cond, c, _err_type, ...) \ + __fsck_err_on(cond, c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, _err_type, __VA_ARGS__) + +#define need_fsck_err(c, _err_type, ...) \ + __fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, _err_type, __VA_ARGS__) + +#define mustfix_fsck_err(c, _err_type, ...) \ + __fsck_err(c, FSCK_CAN_FIX, _err_type, __VA_ARGS__) + +#define mustfix_fsck_err_on(cond, c, _err_type, ...) \ + __fsck_err_on(cond, c, FSCK_CAN_FIX, _err_type, __VA_ARGS__) + +#define fsck_err(c, _err_type, ...) \ + __fsck_err(c, FSCK_CAN_FIX|FSCK_CAN_IGNORE, _err_type, __VA_ARGS__) + +#define fsck_err_on(cond, c, _err_type, ...) \ + __fsck_err_on(cond, c, FSCK_CAN_FIX|FSCK_CAN_IGNORE, _err_type, __VA_ARGS__) + +static inline void bch2_bkey_fsck_err(struct bch_fs *c, + struct printbuf *err_msg, + enum bch_sb_error_id err_type, + const char *fmt, ...) +{ + va_list args; + + va_start(args, fmt); + prt_vprintf(err_msg, fmt, args); + va_end(args); + +} + +#define bkey_fsck_err(c, _err_msg, _err_type, ...) \ +do { \ + prt_printf(_err_msg, __VA_ARGS__); \ + bch2_sb_error_count(c, BCH_FSCK_ERR_##_err_type); \ + ret = -BCH_ERR_invalid_bkey; \ + goto fsck_err; \ +} while (0) + +#define bkey_fsck_err_on(cond, ...) \ +do { \ + if (unlikely(cond)) \ + bkey_fsck_err(__VA_ARGS__); \ +} while (0) + +/* + * Fatal errors: these don't indicate a bug, but we can't continue running in RW + * mode - pretty much just due to metadata IO errors: + */ + +void bch2_fatal_error(struct bch_fs *); + +#define bch2_fs_fatal_error(c, ...) \ +do { \ + bch_err(c, __VA_ARGS__); \ + bch2_fatal_error(c); \ +} while (0) + +#define bch2_fs_fatal_err_on(cond, c, ...) \ +({ \ + bool _ret = unlikely(!!(cond)); \ + \ + if (_ret) \ + bch2_fs_fatal_error(c, __VA_ARGS__); \ + _ret; \ +}) + +/* + * IO errors: either recoverable metadata IO (because we have replicas), or data + * IO - we need to log it and print out a message, but we don't (necessarily) + * want to shut down the fs: + */ + +void bch2_io_error_work(struct work_struct *); + +/* Does the error handling without logging a message */ +void bch2_io_error(struct bch_dev *, enum bch_member_error_type); + +#define bch2_dev_io_err_on(cond, ca, _type, ...) \ +({ \ + bool _ret = (cond); \ + \ + if (_ret) { \ + bch_err_dev_ratelimited(ca, __VA_ARGS__); \ + bch2_io_error(ca, _type); \ + } \ + _ret; \ +}) + +#define bch2_dev_inum_io_err_on(cond, ca, _type, ...) \ +({ \ + bool _ret = (cond); \ + \ + if (_ret) { \ + bch_err_inum_offset_ratelimited(ca, __VA_ARGS__); \ + bch2_io_error(ca, _type); \ + } \ + _ret; \ +}) + +#endif /* _BCACHEFS_ERROR_H */ diff --git a/fs/bcachefs/extent_update.c b/fs/bcachefs/extent_update.c new file mode 100644 index 000000000000..21af6fb8cecf --- /dev/null +++ b/fs/bcachefs/extent_update.c @@ -0,0 +1,173 @@ +// SPDX-License-Identifier: GPL-2.0 +#include "bcachefs.h" +#include "btree_update.h" +#include "btree_update_interior.h" +#include "buckets.h" +#include "debug.h" +#include "extents.h" +#include "extent_update.h" + +/* + * This counts the number of iterators to the alloc & ec btrees we'll need + * inserting/removing this extent: + */ +static unsigned bch2_bkey_nr_alloc_ptrs(struct bkey_s_c k) +{ + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + const union bch_extent_entry *entry; + unsigned ret = 0, lru = 0; + + bkey_extent_entry_for_each(ptrs, entry) { + switch (__extent_entry_type(entry)) { + case BCH_EXTENT_ENTRY_ptr: + /* Might also be updating LRU btree */ + if (entry->ptr.cached) + lru++; + + fallthrough; + case BCH_EXTENT_ENTRY_stripe_ptr: + ret++; + } + } + + /* + * Updating keys in the alloc btree may also update keys in the + * freespace or discard btrees: + */ + return lru + ret * 2; +} + +static int count_iters_for_insert(struct btree_trans *trans, + struct bkey_s_c k, + unsigned offset, + struct bpos *end, + unsigned *nr_iters, + unsigned max_iters) +{ + int ret = 0, ret2 = 0; + + if (*nr_iters >= max_iters) { + *end = bpos_min(*end, k.k->p); + ret = 1; + } + + switch (k.k->type) { + case KEY_TYPE_extent: + case KEY_TYPE_reflink_v: + *nr_iters += bch2_bkey_nr_alloc_ptrs(k); + + if (*nr_iters >= max_iters) { + *end = bpos_min(*end, k.k->p); + ret = 1; + } + + break; + case KEY_TYPE_reflink_p: { + struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k); + u64 idx = le64_to_cpu(p.v->idx); + unsigned sectors = bpos_min(*end, p.k->p).offset - + bkey_start_offset(p.k); + struct btree_iter iter; + struct bkey_s_c r_k; + + for_each_btree_key_norestart(trans, iter, + BTREE_ID_reflink, POS(0, idx + offset), + BTREE_ITER_SLOTS, r_k, ret2) { + if (bkey_ge(bkey_start_pos(r_k.k), POS(0, idx + sectors))) + break; + + /* extent_update_to_keys(), for the reflink_v update */ + *nr_iters += 1; + + *nr_iters += 1 + bch2_bkey_nr_alloc_ptrs(r_k); + + if (*nr_iters >= max_iters) { + struct bpos pos = bkey_start_pos(k.k); + pos.offset += min_t(u64, k.k->size, + r_k.k->p.offset - idx); + + *end = bpos_min(*end, pos); + ret = 1; + break; + } + } + bch2_trans_iter_exit(trans, &iter); + + break; + } + } + + return ret2 ?: ret; +} + +#define EXTENT_ITERS_MAX (BTREE_ITER_MAX / 3) + +int bch2_extent_atomic_end(struct btree_trans *trans, + struct btree_iter *iter, + struct bkey_i *insert, + struct bpos *end) +{ + struct btree_iter copy; + struct bkey_s_c k; + unsigned nr_iters = 0; + int ret; + + ret = bch2_btree_iter_traverse(iter); + if (ret) + return ret; + + *end = insert->k.p; + + /* extent_update_to_keys(): */ + nr_iters += 1; + + ret = count_iters_for_insert(trans, bkey_i_to_s_c(insert), 0, end, + &nr_iters, EXTENT_ITERS_MAX / 2); + if (ret < 0) + return ret; + + bch2_trans_copy_iter(©, iter); + + for_each_btree_key_upto_continue_norestart(copy, insert->k.p, 0, k, ret) { + unsigned offset = 0; + + if (bkey_gt(bkey_start_pos(&insert->k), bkey_start_pos(k.k))) + offset = bkey_start_offset(&insert->k) - + bkey_start_offset(k.k); + + /* extent_handle_overwrites(): */ + switch (bch2_extent_overlap(&insert->k, k.k)) { + case BCH_EXTENT_OVERLAP_ALL: + case BCH_EXTENT_OVERLAP_FRONT: + nr_iters += 1; + break; + case BCH_EXTENT_OVERLAP_BACK: + case BCH_EXTENT_OVERLAP_MIDDLE: + nr_iters += 2; + break; + } + + ret = count_iters_for_insert(trans, k, offset, end, + &nr_iters, EXTENT_ITERS_MAX); + if (ret) + break; + } + + bch2_trans_iter_exit(trans, ©); + return ret < 0 ? ret : 0; +} + +int bch2_extent_trim_atomic(struct btree_trans *trans, + struct btree_iter *iter, + struct bkey_i *k) +{ + struct bpos end; + int ret; + + ret = bch2_extent_atomic_end(trans, iter, k, &end); + if (ret) + return ret; + + bch2_cut_back(end, k); + return 0; +} diff --git a/fs/bcachefs/extent_update.h b/fs/bcachefs/extent_update.h new file mode 100644 index 000000000000..6f5cf449361a --- /dev/null +++ b/fs/bcachefs/extent_update.h @@ -0,0 +1,12 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_EXTENT_UPDATE_H +#define _BCACHEFS_EXTENT_UPDATE_H + +#include "bcachefs.h" + +int bch2_extent_atomic_end(struct btree_trans *, struct btree_iter *, + struct bkey_i *, struct bpos *); +int bch2_extent_trim_atomic(struct btree_trans *, struct btree_iter *, + struct bkey_i *); + +#endif /* _BCACHEFS_EXTENT_UPDATE_H */ diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c new file mode 100644 index 000000000000..9d8afcb5979a --- /dev/null +++ b/fs/bcachefs/extents.c @@ -0,0 +1,1511 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2010 Kent Overstreet <kent.overstreet@gmail.com> + * + * Code for managing the extent btree and dynamically updating the writeback + * dirty sector count. + */ + +#include "bcachefs.h" +#include "bkey_methods.h" +#include "btree_gc.h" +#include "btree_io.h" +#include "btree_iter.h" +#include "buckets.h" +#include "checksum.h" +#include "compress.h" +#include "debug.h" +#include "disk_groups.h" +#include "error.h" +#include "extents.h" +#include "inode.h" +#include "journal.h" +#include "replicas.h" +#include "super.h" +#include "super-io.h" +#include "trace.h" +#include "util.h" + +static unsigned bch2_crc_field_size_max[] = { + [BCH_EXTENT_ENTRY_crc32] = CRC32_SIZE_MAX, + [BCH_EXTENT_ENTRY_crc64] = CRC64_SIZE_MAX, + [BCH_EXTENT_ENTRY_crc128] = CRC128_SIZE_MAX, +}; + +static void bch2_extent_crc_pack(union bch_extent_crc *, + struct bch_extent_crc_unpacked, + enum bch_extent_entry_type); + +static struct bch_dev_io_failures *dev_io_failures(struct bch_io_failures *f, + unsigned dev) +{ + struct bch_dev_io_failures *i; + + for (i = f->devs; i < f->devs + f->nr; i++) + if (i->dev == dev) + return i; + + return NULL; +} + +void bch2_mark_io_failure(struct bch_io_failures *failed, + struct extent_ptr_decoded *p) +{ + struct bch_dev_io_failures *f = dev_io_failures(failed, p->ptr.dev); + + if (!f) { + BUG_ON(failed->nr >= ARRAY_SIZE(failed->devs)); + + f = &failed->devs[failed->nr++]; + f->dev = p->ptr.dev; + f->idx = p->idx; + f->nr_failed = 1; + f->nr_retries = 0; + } else if (p->idx != f->idx) { + f->idx = p->idx; + f->nr_failed = 1; + f->nr_retries = 0; + } else { + f->nr_failed++; + } +} + +/* + * returns true if p1 is better than p2: + */ +static inline bool ptr_better(struct bch_fs *c, + const struct extent_ptr_decoded p1, + const struct extent_ptr_decoded p2) +{ + if (likely(!p1.idx && !p2.idx)) { + struct bch_dev *dev1 = bch_dev_bkey_exists(c, p1.ptr.dev); + struct bch_dev *dev2 = bch_dev_bkey_exists(c, p2.ptr.dev); + + u64 l1 = atomic64_read(&dev1->cur_latency[READ]); + u64 l2 = atomic64_read(&dev2->cur_latency[READ]); + + /* Pick at random, biased in favor of the faster device: */ + + return bch2_rand_range(l1 + l2) > l1; + } + + if (bch2_force_reconstruct_read) + return p1.idx > p2.idx; + + return p1.idx < p2.idx; +} + +/* + * This picks a non-stale pointer, preferably from a device other than @avoid. + * Avoid can be NULL, meaning pick any. If there are no non-stale pointers to + * other devices, it will still pick a pointer from avoid. + */ +int bch2_bkey_pick_read_device(struct bch_fs *c, struct bkey_s_c k, + struct bch_io_failures *failed, + struct extent_ptr_decoded *pick) +{ + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + const union bch_extent_entry *entry; + struct extent_ptr_decoded p; + struct bch_dev_io_failures *f; + struct bch_dev *ca; + int ret = 0; + + if (k.k->type == KEY_TYPE_error) + return -EIO; + + bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { + /* + * Unwritten extent: no need to actually read, treat it as a + * hole and return 0s: + */ + if (p.ptr.unwritten) + return 0; + + ca = bch_dev_bkey_exists(c, p.ptr.dev); + + /* + * If there are any dirty pointers it's an error if we can't + * read: + */ + if (!ret && !p.ptr.cached) + ret = -EIO; + + if (p.ptr.cached && ptr_stale(ca, &p.ptr)) + continue; + + f = failed ? dev_io_failures(failed, p.ptr.dev) : NULL; + if (f) + p.idx = f->nr_failed < f->nr_retries + ? f->idx + : f->idx + 1; + + if (!p.idx && + !bch2_dev_is_readable(ca)) + p.idx++; + + if (bch2_force_reconstruct_read && + !p.idx && p.has_ec) + p.idx++; + + if (p.idx >= (unsigned) p.has_ec + 1) + continue; + + if (ret > 0 && !ptr_better(c, p, *pick)) + continue; + + *pick = p; + ret = 1; + } + + return ret; +} + +/* KEY_TYPE_btree_ptr: */ + +int bch2_btree_ptr_invalid(struct bch_fs *c, struct bkey_s_c k, + enum bkey_invalid_flags flags, + struct printbuf *err) +{ + int ret = 0; + + bkey_fsck_err_on(bkey_val_u64s(k.k) > BCH_REPLICAS_MAX, c, err, + btree_ptr_val_too_big, + "value too big (%zu > %u)", bkey_val_u64s(k.k), BCH_REPLICAS_MAX); + + ret = bch2_bkey_ptrs_invalid(c, k, flags, err); +fsck_err: + return ret; +} + +void bch2_btree_ptr_to_text(struct printbuf *out, struct bch_fs *c, + struct bkey_s_c k) +{ + bch2_bkey_ptrs_to_text(out, c, k); +} + +int bch2_btree_ptr_v2_invalid(struct bch_fs *c, struct bkey_s_c k, + enum bkey_invalid_flags flags, + struct printbuf *err) +{ + int ret = 0; + + bkey_fsck_err_on(bkey_val_u64s(k.k) > BKEY_BTREE_PTR_VAL_U64s_MAX, c, err, + btree_ptr_v2_val_too_big, + "value too big (%zu > %zu)", + bkey_val_u64s(k.k), BKEY_BTREE_PTR_VAL_U64s_MAX); + + ret = bch2_bkey_ptrs_invalid(c, k, flags, err); +fsck_err: + return ret; +} + +void bch2_btree_ptr_v2_to_text(struct printbuf *out, struct bch_fs *c, + struct bkey_s_c k) +{ + struct bkey_s_c_btree_ptr_v2 bp = bkey_s_c_to_btree_ptr_v2(k); + + prt_printf(out, "seq %llx written %u min_key %s", + le64_to_cpu(bp.v->seq), + le16_to_cpu(bp.v->sectors_written), + BTREE_PTR_RANGE_UPDATED(bp.v) ? "R " : ""); + + bch2_bpos_to_text(out, bp.v->min_key); + prt_printf(out, " "); + bch2_bkey_ptrs_to_text(out, c, k); +} + +void bch2_btree_ptr_v2_compat(enum btree_id btree_id, unsigned version, + unsigned big_endian, int write, + struct bkey_s k) +{ + struct bkey_s_btree_ptr_v2 bp = bkey_s_to_btree_ptr_v2(k); + + compat_bpos(0, btree_id, version, big_endian, write, &bp.v->min_key); + + if (version < bcachefs_metadata_version_inode_btree_change && + btree_id_is_extents(btree_id) && + !bkey_eq(bp.v->min_key, POS_MIN)) + bp.v->min_key = write + ? bpos_nosnap_predecessor(bp.v->min_key) + : bpos_nosnap_successor(bp.v->min_key); +} + +/* KEY_TYPE_extent: */ + +bool bch2_extent_merge(struct bch_fs *c, struct bkey_s l, struct bkey_s_c r) +{ + struct bkey_ptrs l_ptrs = bch2_bkey_ptrs(l); + struct bkey_ptrs_c r_ptrs = bch2_bkey_ptrs_c(r); + union bch_extent_entry *en_l; + const union bch_extent_entry *en_r; + struct extent_ptr_decoded lp, rp; + bool use_right_ptr; + struct bch_dev *ca; + + en_l = l_ptrs.start; + en_r = r_ptrs.start; + while (en_l < l_ptrs.end && en_r < r_ptrs.end) { + if (extent_entry_type(en_l) != extent_entry_type(en_r)) + return false; + + en_l = extent_entry_next(en_l); + en_r = extent_entry_next(en_r); + } + + if (en_l < l_ptrs.end || en_r < r_ptrs.end) + return false; + + en_l = l_ptrs.start; + en_r = r_ptrs.start; + lp.crc = bch2_extent_crc_unpack(l.k, NULL); + rp.crc = bch2_extent_crc_unpack(r.k, NULL); + + while (__bkey_ptr_next_decode(l.k, l_ptrs.end, lp, en_l) && + __bkey_ptr_next_decode(r.k, r_ptrs.end, rp, en_r)) { + if (lp.ptr.offset + lp.crc.offset + lp.crc.live_size != + rp.ptr.offset + rp.crc.offset || + lp.ptr.dev != rp.ptr.dev || + lp.ptr.gen != rp.ptr.gen || + lp.ptr.unwritten != rp.ptr.unwritten || + lp.has_ec != rp.has_ec) + return false; + + /* Extents may not straddle buckets: */ + ca = bch_dev_bkey_exists(c, lp.ptr.dev); + if (PTR_BUCKET_NR(ca, &lp.ptr) != PTR_BUCKET_NR(ca, &rp.ptr)) + return false; + + if (lp.has_ec != rp.has_ec || + (lp.has_ec && + (lp.ec.block != rp.ec.block || + lp.ec.redundancy != rp.ec.redundancy || + lp.ec.idx != rp.ec.idx))) + return false; + + if (lp.crc.compression_type != rp.crc.compression_type || + lp.crc.nonce != rp.crc.nonce) + return false; + + if (lp.crc.offset + lp.crc.live_size + rp.crc.live_size <= + lp.crc.uncompressed_size) { + /* can use left extent's crc entry */ + } else if (lp.crc.live_size <= rp.crc.offset) { + /* can use right extent's crc entry */ + } else { + /* check if checksums can be merged: */ + if (lp.crc.csum_type != rp.crc.csum_type || + lp.crc.nonce != rp.crc.nonce || + crc_is_compressed(lp.crc) || + !bch2_checksum_mergeable(lp.crc.csum_type)) + return false; + + if (lp.crc.offset + lp.crc.live_size != lp.crc.compressed_size || + rp.crc.offset) + return false; + + if (lp.crc.csum_type && + lp.crc.uncompressed_size + + rp.crc.uncompressed_size > (c->opts.encoded_extent_max >> 9)) + return false; + } + + en_l = extent_entry_next(en_l); + en_r = extent_entry_next(en_r); + } + + en_l = l_ptrs.start; + en_r = r_ptrs.start; + while (en_l < l_ptrs.end && en_r < r_ptrs.end) { + if (extent_entry_is_crc(en_l)) { + struct bch_extent_crc_unpacked crc_l = bch2_extent_crc_unpack(l.k, entry_to_crc(en_l)); + struct bch_extent_crc_unpacked crc_r = bch2_extent_crc_unpack(r.k, entry_to_crc(en_r)); + + if (crc_l.uncompressed_size + crc_r.uncompressed_size > + bch2_crc_field_size_max[extent_entry_type(en_l)]) + return false; + } + + en_l = extent_entry_next(en_l); + en_r = extent_entry_next(en_r); + } + + use_right_ptr = false; + en_l = l_ptrs.start; + en_r = r_ptrs.start; + while (en_l < l_ptrs.end) { + if (extent_entry_type(en_l) == BCH_EXTENT_ENTRY_ptr && + use_right_ptr) + en_l->ptr = en_r->ptr; + + if (extent_entry_is_crc(en_l)) { + struct bch_extent_crc_unpacked crc_l = + bch2_extent_crc_unpack(l.k, entry_to_crc(en_l)); + struct bch_extent_crc_unpacked crc_r = + bch2_extent_crc_unpack(r.k, entry_to_crc(en_r)); + + use_right_ptr = false; + + if (crc_l.offset + crc_l.live_size + crc_r.live_size <= + crc_l.uncompressed_size) { + /* can use left extent's crc entry */ + } else if (crc_l.live_size <= crc_r.offset) { + /* can use right extent's crc entry */ + crc_r.offset -= crc_l.live_size; + bch2_extent_crc_pack(entry_to_crc(en_l), crc_r, + extent_entry_type(en_l)); + use_right_ptr = true; + } else { + crc_l.csum = bch2_checksum_merge(crc_l.csum_type, + crc_l.csum, + crc_r.csum, + crc_r.uncompressed_size << 9); + + crc_l.uncompressed_size += crc_r.uncompressed_size; + crc_l.compressed_size += crc_r.compressed_size; + bch2_extent_crc_pack(entry_to_crc(en_l), crc_l, + extent_entry_type(en_l)); + } + } + + en_l = extent_entry_next(en_l); + en_r = extent_entry_next(en_r); + } + + bch2_key_resize(l.k, l.k->size + r.k->size); + return true; +} + +/* KEY_TYPE_reservation: */ + +int bch2_reservation_invalid(struct bch_fs *c, struct bkey_s_c k, + enum bkey_invalid_flags flags, + struct printbuf *err) +{ + struct bkey_s_c_reservation r = bkey_s_c_to_reservation(k); + int ret = 0; + + bkey_fsck_err_on(!r.v->nr_replicas || r.v->nr_replicas > BCH_REPLICAS_MAX, c, err, + reservation_key_nr_replicas_invalid, + "invalid nr_replicas (%u)", r.v->nr_replicas); +fsck_err: + return ret; +} + +void bch2_reservation_to_text(struct printbuf *out, struct bch_fs *c, + struct bkey_s_c k) +{ + struct bkey_s_c_reservation r = bkey_s_c_to_reservation(k); + + prt_printf(out, "generation %u replicas %u", + le32_to_cpu(r.v->generation), + r.v->nr_replicas); +} + +bool bch2_reservation_merge(struct bch_fs *c, struct bkey_s _l, struct bkey_s_c _r) +{ + struct bkey_s_reservation l = bkey_s_to_reservation(_l); + struct bkey_s_c_reservation r = bkey_s_c_to_reservation(_r); + + if (l.v->generation != r.v->generation || + l.v->nr_replicas != r.v->nr_replicas) + return false; + + bch2_key_resize(l.k, l.k->size + r.k->size); + return true; +} + +/* Extent checksum entries: */ + +/* returns true if not equal */ +static inline bool bch2_crc_unpacked_cmp(struct bch_extent_crc_unpacked l, + struct bch_extent_crc_unpacked r) +{ + return (l.csum_type != r.csum_type || + l.compression_type != r.compression_type || + l.compressed_size != r.compressed_size || + l.uncompressed_size != r.uncompressed_size || + l.offset != r.offset || + l.live_size != r.live_size || + l.nonce != r.nonce || + bch2_crc_cmp(l.csum, r.csum)); +} + +static inline bool can_narrow_crc(struct bch_extent_crc_unpacked u, + struct bch_extent_crc_unpacked n) +{ + return !crc_is_compressed(u) && + u.csum_type && + u.uncompressed_size > u.live_size && + bch2_csum_type_is_encryption(u.csum_type) == + bch2_csum_type_is_encryption(n.csum_type); +} + +bool bch2_can_narrow_extent_crcs(struct bkey_s_c k, + struct bch_extent_crc_unpacked n) +{ + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + struct bch_extent_crc_unpacked crc; + const union bch_extent_entry *i; + + if (!n.csum_type) + return false; + + bkey_for_each_crc(k.k, ptrs, crc, i) + if (can_narrow_crc(crc, n)) + return true; + + return false; +} + +/* + * We're writing another replica for this extent, so while we've got the data in + * memory we'll be computing a new checksum for the currently live data. + * + * If there are other replicas we aren't moving, and they are checksummed but + * not compressed, we can modify them to point to only the data that is + * currently live (so that readers won't have to bounce) while we've got the + * checksum we need: + */ +bool bch2_bkey_narrow_crcs(struct bkey_i *k, struct bch_extent_crc_unpacked n) +{ + struct bkey_ptrs ptrs = bch2_bkey_ptrs(bkey_i_to_s(k)); + struct bch_extent_crc_unpacked u; + struct extent_ptr_decoded p; + union bch_extent_entry *i; + bool ret = false; + + /* Find a checksum entry that covers only live data: */ + if (!n.csum_type) { + bkey_for_each_crc(&k->k, ptrs, u, i) + if (!crc_is_compressed(u) && + u.csum_type && + u.live_size == u.uncompressed_size) { + n = u; + goto found; + } + return false; + } +found: + BUG_ON(crc_is_compressed(n)); + BUG_ON(n.offset); + BUG_ON(n.live_size != k->k.size); + +restart_narrow_pointers: + ptrs = bch2_bkey_ptrs(bkey_i_to_s(k)); + + bkey_for_each_ptr_decode(&k->k, ptrs, p, i) + if (can_narrow_crc(p.crc, n)) { + bch2_bkey_drop_ptr_noerror(bkey_i_to_s(k), &i->ptr); + p.ptr.offset += p.crc.offset; + p.crc = n; + bch2_extent_ptr_decoded_append(k, &p); + ret = true; + goto restart_narrow_pointers; + } + + return ret; +} + +static void bch2_extent_crc_pack(union bch_extent_crc *dst, + struct bch_extent_crc_unpacked src, + enum bch_extent_entry_type type) +{ +#define set_common_fields(_dst, _src) \ + _dst.type = 1 << type; \ + _dst.csum_type = _src.csum_type, \ + _dst.compression_type = _src.compression_type, \ + _dst._compressed_size = _src.compressed_size - 1, \ + _dst._uncompressed_size = _src.uncompressed_size - 1, \ + _dst.offset = _src.offset + + switch (type) { + case BCH_EXTENT_ENTRY_crc32: + set_common_fields(dst->crc32, src); + dst->crc32.csum = (u32 __force) *((__le32 *) &src.csum.lo); + break; + case BCH_EXTENT_ENTRY_crc64: + set_common_fields(dst->crc64, src); + dst->crc64.nonce = src.nonce; + dst->crc64.csum_lo = (u64 __force) src.csum.lo; + dst->crc64.csum_hi = (u64 __force) *((__le16 *) &src.csum.hi); + break; + case BCH_EXTENT_ENTRY_crc128: + set_common_fields(dst->crc128, src); + dst->crc128.nonce = src.nonce; + dst->crc128.csum = src.csum; + break; + default: + BUG(); + } +#undef set_common_fields +} + +void bch2_extent_crc_append(struct bkey_i *k, + struct bch_extent_crc_unpacked new) +{ + struct bkey_ptrs ptrs = bch2_bkey_ptrs(bkey_i_to_s(k)); + union bch_extent_crc *crc = (void *) ptrs.end; + enum bch_extent_entry_type type; + + if (bch_crc_bytes[new.csum_type] <= 4 && + new.uncompressed_size <= CRC32_SIZE_MAX && + new.nonce <= CRC32_NONCE_MAX) + type = BCH_EXTENT_ENTRY_crc32; + else if (bch_crc_bytes[new.csum_type] <= 10 && + new.uncompressed_size <= CRC64_SIZE_MAX && + new.nonce <= CRC64_NONCE_MAX) + type = BCH_EXTENT_ENTRY_crc64; + else if (bch_crc_bytes[new.csum_type] <= 16 && + new.uncompressed_size <= CRC128_SIZE_MAX && + new.nonce <= CRC128_NONCE_MAX) + type = BCH_EXTENT_ENTRY_crc128; + else + BUG(); + + bch2_extent_crc_pack(crc, new, type); + + k->k.u64s += extent_entry_u64s(ptrs.end); + + EBUG_ON(bkey_val_u64s(&k->k) > BKEY_EXTENT_VAL_U64s_MAX); +} + +/* Generic code for keys with pointers: */ + +unsigned bch2_bkey_nr_ptrs(struct bkey_s_c k) +{ + return bch2_bkey_devs(k).nr; +} + +unsigned bch2_bkey_nr_ptrs_allocated(struct bkey_s_c k) +{ + return k.k->type == KEY_TYPE_reservation + ? bkey_s_c_to_reservation(k).v->nr_replicas + : bch2_bkey_dirty_devs(k).nr; +} + +unsigned bch2_bkey_nr_ptrs_fully_allocated(struct bkey_s_c k) +{ + unsigned ret = 0; + + if (k.k->type == KEY_TYPE_reservation) { + ret = bkey_s_c_to_reservation(k).v->nr_replicas; + } else { + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + const union bch_extent_entry *entry; + struct extent_ptr_decoded p; + + bkey_for_each_ptr_decode(k.k, ptrs, p, entry) + ret += !p.ptr.cached && !crc_is_compressed(p.crc); + } + + return ret; +} + +unsigned bch2_bkey_sectors_compressed(struct bkey_s_c k) +{ + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + const union bch_extent_entry *entry; + struct extent_ptr_decoded p; + unsigned ret = 0; + + bkey_for_each_ptr_decode(k.k, ptrs, p, entry) + if (!p.ptr.cached && crc_is_compressed(p.crc)) + ret += p.crc.compressed_size; + + return ret; +} + +bool bch2_bkey_is_incompressible(struct bkey_s_c k) +{ + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + const union bch_extent_entry *entry; + struct bch_extent_crc_unpacked crc; + + bkey_for_each_crc(k.k, ptrs, crc, entry) + if (crc.compression_type == BCH_COMPRESSION_TYPE_incompressible) + return true; + return false; +} + +unsigned bch2_bkey_replicas(struct bch_fs *c, struct bkey_s_c k) +{ + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + const union bch_extent_entry *entry; + struct extent_ptr_decoded p = { 0 }; + unsigned replicas = 0; + + bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { + if (p.ptr.cached) + continue; + + if (p.has_ec) + replicas += p.ec.redundancy; + + replicas++; + + } + + return replicas; +} + +static inline unsigned __extent_ptr_durability(struct bch_dev *ca, struct extent_ptr_decoded *p) +{ + if (p->ptr.cached) + return 0; + + return p->has_ec + ? p->ec.redundancy + 1 + : ca->mi.durability; +} + +unsigned bch2_extent_ptr_desired_durability(struct bch_fs *c, struct extent_ptr_decoded *p) +{ + struct bch_dev *ca = bch_dev_bkey_exists(c, p->ptr.dev); + + return __extent_ptr_durability(ca, p); +} + +unsigned bch2_extent_ptr_durability(struct bch_fs *c, struct extent_ptr_decoded *p) +{ + struct bch_dev *ca = bch_dev_bkey_exists(c, p->ptr.dev); + + if (ca->mi.state == BCH_MEMBER_STATE_failed) + return 0; + + return __extent_ptr_durability(ca, p); +} + +unsigned bch2_bkey_durability(struct bch_fs *c, struct bkey_s_c k) +{ + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + const union bch_extent_entry *entry; + struct extent_ptr_decoded p; + unsigned durability = 0; + + bkey_for_each_ptr_decode(k.k, ptrs, p, entry) + durability += bch2_extent_ptr_durability(c, &p); + + return durability; +} + +static unsigned bch2_bkey_durability_safe(struct bch_fs *c, struct bkey_s_c k) +{ + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + const union bch_extent_entry *entry; + struct extent_ptr_decoded p; + unsigned durability = 0; + + bkey_for_each_ptr_decode(k.k, ptrs, p, entry) + if (p.ptr.dev < c->sb.nr_devices && c->devs[p.ptr.dev]) + durability += bch2_extent_ptr_durability(c, &p); + + return durability; +} + +void bch2_bkey_extent_entry_drop(struct bkey_i *k, union bch_extent_entry *entry) +{ + union bch_extent_entry *end = bkey_val_end(bkey_i_to_s(k)); + union bch_extent_entry *next = extent_entry_next(entry); + + memmove_u64s(entry, next, (u64 *) end - (u64 *) next); + k->k.u64s -= extent_entry_u64s(entry); +} + +void bch2_extent_ptr_decoded_append(struct bkey_i *k, + struct extent_ptr_decoded *p) +{ + struct bkey_ptrs ptrs = bch2_bkey_ptrs(bkey_i_to_s(k)); + struct bch_extent_crc_unpacked crc = + bch2_extent_crc_unpack(&k->k, NULL); + union bch_extent_entry *pos; + + if (!bch2_crc_unpacked_cmp(crc, p->crc)) { + pos = ptrs.start; + goto found; + } + + bkey_for_each_crc(&k->k, ptrs, crc, pos) + if (!bch2_crc_unpacked_cmp(crc, p->crc)) { + pos = extent_entry_next(pos); + goto found; + } + + bch2_extent_crc_append(k, p->crc); + pos = bkey_val_end(bkey_i_to_s(k)); +found: + p->ptr.type = 1 << BCH_EXTENT_ENTRY_ptr; + __extent_entry_insert(k, pos, to_entry(&p->ptr)); + + if (p->has_ec) { + p->ec.type = 1 << BCH_EXTENT_ENTRY_stripe_ptr; + __extent_entry_insert(k, pos, to_entry(&p->ec)); + } +} + +static union bch_extent_entry *extent_entry_prev(struct bkey_ptrs ptrs, + union bch_extent_entry *entry) +{ + union bch_extent_entry *i = ptrs.start; + + if (i == entry) + return NULL; + + while (extent_entry_next(i) != entry) + i = extent_entry_next(i); + return i; +} + +/* + * Returns pointer to the next entry after the one being dropped: + */ +union bch_extent_entry *bch2_bkey_drop_ptr_noerror(struct bkey_s k, + struct bch_extent_ptr *ptr) +{ + struct bkey_ptrs ptrs = bch2_bkey_ptrs(k); + union bch_extent_entry *entry = to_entry(ptr), *next; + union bch_extent_entry *ret = entry; + bool drop_crc = true; + + EBUG_ON(ptr < &ptrs.start->ptr || + ptr >= &ptrs.end->ptr); + EBUG_ON(ptr->type != 1 << BCH_EXTENT_ENTRY_ptr); + + for (next = extent_entry_next(entry); + next != ptrs.end; + next = extent_entry_next(next)) { + if (extent_entry_is_crc(next)) { + break; + } else if (extent_entry_is_ptr(next)) { + drop_crc = false; + break; + } + } + + extent_entry_drop(k, entry); + + while ((entry = extent_entry_prev(ptrs, entry))) { + if (extent_entry_is_ptr(entry)) + break; + + if ((extent_entry_is_crc(entry) && drop_crc) || + extent_entry_is_stripe_ptr(entry)) { + ret = (void *) ret - extent_entry_bytes(entry); + extent_entry_drop(k, entry); + } + } + + return ret; +} + +union bch_extent_entry *bch2_bkey_drop_ptr(struct bkey_s k, + struct bch_extent_ptr *ptr) +{ + bool have_dirty = bch2_bkey_dirty_devs(k.s_c).nr; + union bch_extent_entry *ret = + bch2_bkey_drop_ptr_noerror(k, ptr); + + /* + * If we deleted all the dirty pointers and there's still cached + * pointers, we could set the cached pointers to dirty if they're not + * stale - but to do that correctly we'd need to grab an open_bucket + * reference so that we don't race with bucket reuse: + */ + if (have_dirty && + !bch2_bkey_dirty_devs(k.s_c).nr) { + k.k->type = KEY_TYPE_error; + set_bkey_val_u64s(k.k, 0); + ret = NULL; + } else if (!bch2_bkey_nr_ptrs(k.s_c)) { + k.k->type = KEY_TYPE_deleted; + set_bkey_val_u64s(k.k, 0); + ret = NULL; + } + + return ret; +} + +void bch2_bkey_drop_device(struct bkey_s k, unsigned dev) +{ + struct bch_extent_ptr *ptr; + + bch2_bkey_drop_ptrs(k, ptr, ptr->dev == dev); +} + +void bch2_bkey_drop_device_noerror(struct bkey_s k, unsigned dev) +{ + struct bch_extent_ptr *ptr = bch2_bkey_has_device(k, dev); + + if (ptr) + bch2_bkey_drop_ptr_noerror(k, ptr); +} + +const struct bch_extent_ptr *bch2_bkey_has_device_c(struct bkey_s_c k, unsigned dev) +{ + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + const struct bch_extent_ptr *ptr; + + bkey_for_each_ptr(ptrs, ptr) + if (ptr->dev == dev) + return ptr; + + return NULL; +} + +bool bch2_bkey_has_target(struct bch_fs *c, struct bkey_s_c k, unsigned target) +{ + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + const struct bch_extent_ptr *ptr; + + bkey_for_each_ptr(ptrs, ptr) + if (bch2_dev_in_target(c, ptr->dev, target) && + (!ptr->cached || + !ptr_stale(bch_dev_bkey_exists(c, ptr->dev), ptr))) + return true; + + return false; +} + +bool bch2_bkey_matches_ptr(struct bch_fs *c, struct bkey_s_c k, + struct bch_extent_ptr m, u64 offset) +{ + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + const union bch_extent_entry *entry; + struct extent_ptr_decoded p; + + bkey_for_each_ptr_decode(k.k, ptrs, p, entry) + if (p.ptr.dev == m.dev && + p.ptr.gen == m.gen && + (s64) p.ptr.offset + p.crc.offset - bkey_start_offset(k.k) == + (s64) m.offset - offset) + return true; + + return false; +} + +/* + * Returns true if two extents refer to the same data: + */ +bool bch2_extents_match(struct bkey_s_c k1, struct bkey_s_c k2) +{ + if (k1.k->type != k2.k->type) + return false; + + if (bkey_extent_is_direct_data(k1.k)) { + struct bkey_ptrs_c ptrs1 = bch2_bkey_ptrs_c(k1); + struct bkey_ptrs_c ptrs2 = bch2_bkey_ptrs_c(k2); + const union bch_extent_entry *entry1, *entry2; + struct extent_ptr_decoded p1, p2; + + if (bkey_extent_is_unwritten(k1) != bkey_extent_is_unwritten(k2)) + return false; + + bkey_for_each_ptr_decode(k1.k, ptrs1, p1, entry1) + bkey_for_each_ptr_decode(k2.k, ptrs2, p2, entry2) + if (p1.ptr.dev == p2.ptr.dev && + p1.ptr.gen == p2.ptr.gen && + (s64) p1.ptr.offset + p1.crc.offset - bkey_start_offset(k1.k) == + (s64) p2.ptr.offset + p2.crc.offset - bkey_start_offset(k2.k)) + return true; + + return false; + } else { + /* KEY_TYPE_deleted, etc. */ + return true; + } +} + +struct bch_extent_ptr * +bch2_extent_has_ptr(struct bkey_s_c k1, struct extent_ptr_decoded p1, struct bkey_s k2) +{ + struct bkey_ptrs ptrs2 = bch2_bkey_ptrs(k2); + union bch_extent_entry *entry2; + struct extent_ptr_decoded p2; + + bkey_for_each_ptr_decode(k2.k, ptrs2, p2, entry2) + if (p1.ptr.dev == p2.ptr.dev && + p1.ptr.gen == p2.ptr.gen && + (s64) p1.ptr.offset + p1.crc.offset - bkey_start_offset(k1.k) == + (s64) p2.ptr.offset + p2.crc.offset - bkey_start_offset(k2.k)) + return &entry2->ptr; + + return NULL; +} + +void bch2_extent_ptr_set_cached(struct bkey_s k, struct bch_extent_ptr *ptr) +{ + struct bkey_ptrs ptrs = bch2_bkey_ptrs(k); + union bch_extent_entry *entry; + union bch_extent_entry *ec = NULL; + + bkey_extent_entry_for_each(ptrs, entry) { + if (&entry->ptr == ptr) { + ptr->cached = true; + if (ec) + extent_entry_drop(k, ec); + return; + } + + if (extent_entry_is_stripe_ptr(entry)) + ec = entry; + else if (extent_entry_is_ptr(entry)) + ec = NULL; + } + + BUG(); +} + +/* + * bch_extent_normalize - clean up an extent, dropping stale pointers etc. + * + * Returns true if @k should be dropped entirely + * + * For existing keys, only called when btree nodes are being rewritten, not when + * they're merely being compacted/resorted in memory. + */ +bool bch2_extent_normalize(struct bch_fs *c, struct bkey_s k) +{ + struct bch_extent_ptr *ptr; + + bch2_bkey_drop_ptrs(k, ptr, + ptr->cached && + ptr_stale(bch_dev_bkey_exists(c, ptr->dev), ptr)); + + return bkey_deleted(k.k); +} + +void bch2_bkey_ptrs_to_text(struct printbuf *out, struct bch_fs *c, + struct bkey_s_c k) +{ + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + const union bch_extent_entry *entry; + bool first = true; + + if (c) + prt_printf(out, "durability: %u ", bch2_bkey_durability_safe(c, k)); + + bkey_extent_entry_for_each(ptrs, entry) { + if (!first) + prt_printf(out, " "); + + switch (__extent_entry_type(entry)) { + case BCH_EXTENT_ENTRY_ptr: { + const struct bch_extent_ptr *ptr = entry_to_ptr(entry); + struct bch_dev *ca = c && ptr->dev < c->sb.nr_devices && c->devs[ptr->dev] + ? bch_dev_bkey_exists(c, ptr->dev) + : NULL; + + if (!ca) { + prt_printf(out, "ptr: %u:%llu gen %u%s", ptr->dev, + (u64) ptr->offset, ptr->gen, + ptr->cached ? " cached" : ""); + } else { + u32 offset; + u64 b = sector_to_bucket_and_offset(ca, ptr->offset, &offset); + + prt_printf(out, "ptr: %u:%llu:%u gen %u", + ptr->dev, b, offset, ptr->gen); + if (ptr->cached) + prt_str(out, " cached"); + if (ptr->unwritten) + prt_str(out, " unwritten"); + if (ca && ptr_stale(ca, ptr)) + prt_printf(out, " stale"); + } + break; + } + case BCH_EXTENT_ENTRY_crc32: + case BCH_EXTENT_ENTRY_crc64: + case BCH_EXTENT_ENTRY_crc128: { + struct bch_extent_crc_unpacked crc = + bch2_extent_crc_unpack(k.k, entry_to_crc(entry)); + + prt_printf(out, "crc: c_size %u size %u offset %u nonce %u csum %s compress %s", + crc.compressed_size, + crc.uncompressed_size, + crc.offset, crc.nonce, + bch2_csum_types[crc.csum_type], + bch2_compression_types[crc.compression_type]); + break; + } + case BCH_EXTENT_ENTRY_stripe_ptr: { + const struct bch_extent_stripe_ptr *ec = &entry->stripe_ptr; + + prt_printf(out, "ec: idx %llu block %u", + (u64) ec->idx, ec->block); + break; + } + case BCH_EXTENT_ENTRY_rebalance: { + const struct bch_extent_rebalance *r = &entry->rebalance; + + prt_str(out, "rebalance: target "); + if (c) + bch2_target_to_text(out, c, r->target); + else + prt_printf(out, "%u", r->target); + prt_str(out, " compression "); + bch2_compression_opt_to_text(out, r->compression); + break; + } + default: + prt_printf(out, "(invalid extent entry %.16llx)", *((u64 *) entry)); + return; + } + + first = false; + } +} + +static int extent_ptr_invalid(struct bch_fs *c, + struct bkey_s_c k, + enum bkey_invalid_flags flags, + const struct bch_extent_ptr *ptr, + unsigned size_ondisk, + bool metadata, + struct printbuf *err) +{ + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + const struct bch_extent_ptr *ptr2; + u64 bucket; + u32 bucket_offset; + struct bch_dev *ca; + int ret = 0; + + if (!bch2_dev_exists2(c, ptr->dev)) { + /* + * If we're in the write path this key might have already been + * overwritten, and we could be seeing a device that doesn't + * exist anymore due to racing with device removal: + */ + if (flags & BKEY_INVALID_WRITE) + return 0; + + bkey_fsck_err(c, err, ptr_to_invalid_device, + "pointer to invalid device (%u)", ptr->dev); + } + + ca = bch_dev_bkey_exists(c, ptr->dev); + bkey_for_each_ptr(ptrs, ptr2) + bkey_fsck_err_on(ptr != ptr2 && ptr->dev == ptr2->dev, c, err, + ptr_to_duplicate_device, + "multiple pointers to same device (%u)", ptr->dev); + + bucket = sector_to_bucket_and_offset(ca, ptr->offset, &bucket_offset); + + bkey_fsck_err_on(bucket >= ca->mi.nbuckets, c, err, + ptr_after_last_bucket, + "pointer past last bucket (%llu > %llu)", bucket, ca->mi.nbuckets); + bkey_fsck_err_on(ptr->offset < bucket_to_sector(ca, ca->mi.first_bucket), c, err, + ptr_before_first_bucket, + "pointer before first bucket (%llu < %u)", bucket, ca->mi.first_bucket); + bkey_fsck_err_on(bucket_offset + size_ondisk > ca->mi.bucket_size, c, err, + ptr_spans_multiple_buckets, + "pointer spans multiple buckets (%u + %u > %u)", + bucket_offset, size_ondisk, ca->mi.bucket_size); +fsck_err: + return ret; +} + +int bch2_bkey_ptrs_invalid(struct bch_fs *c, struct bkey_s_c k, + enum bkey_invalid_flags flags, + struct printbuf *err) +{ + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + const union bch_extent_entry *entry; + struct bch_extent_crc_unpacked crc; + unsigned size_ondisk = k.k->size; + unsigned nonce = UINT_MAX; + unsigned nr_ptrs = 0; + bool have_written = false, have_unwritten = false, have_ec = false, crc_since_last_ptr = false; + int ret = 0; + + if (bkey_is_btree_ptr(k.k)) + size_ondisk = btree_sectors(c); + + bkey_extent_entry_for_each(ptrs, entry) { + bkey_fsck_err_on(__extent_entry_type(entry) >= BCH_EXTENT_ENTRY_MAX, c, err, + extent_ptrs_invalid_entry, + "invalid extent entry type (got %u, max %u)", + __extent_entry_type(entry), BCH_EXTENT_ENTRY_MAX); + + bkey_fsck_err_on(bkey_is_btree_ptr(k.k) && + !extent_entry_is_ptr(entry), c, err, + btree_ptr_has_non_ptr, + "has non ptr field"); + + switch (extent_entry_type(entry)) { + case BCH_EXTENT_ENTRY_ptr: + ret = extent_ptr_invalid(c, k, flags, &entry->ptr, + size_ondisk, false, err); + if (ret) + return ret; + + bkey_fsck_err_on(entry->ptr.cached && have_ec, c, err, + ptr_cached_and_erasure_coded, + "cached, erasure coded ptr"); + + if (!entry->ptr.unwritten) + have_written = true; + else + have_unwritten = true; + + have_ec = false; + crc_since_last_ptr = false; + nr_ptrs++; + break; + case BCH_EXTENT_ENTRY_crc32: + case BCH_EXTENT_ENTRY_crc64: + case BCH_EXTENT_ENTRY_crc128: + crc = bch2_extent_crc_unpack(k.k, entry_to_crc(entry)); + + bkey_fsck_err_on(crc.offset + crc.live_size > crc.uncompressed_size, c, err, + ptr_crc_uncompressed_size_too_small, + "checksum offset + key size > uncompressed size"); + bkey_fsck_err_on(!bch2_checksum_type_valid(c, crc.csum_type), c, err, + ptr_crc_csum_type_unknown, + "invalid checksum type"); + bkey_fsck_err_on(crc.compression_type >= BCH_COMPRESSION_TYPE_NR, c, err, + ptr_crc_compression_type_unknown, + "invalid compression type"); + + if (bch2_csum_type_is_encryption(crc.csum_type)) { + if (nonce == UINT_MAX) + nonce = crc.offset + crc.nonce; + else if (nonce != crc.offset + crc.nonce) + bkey_fsck_err(c, err, ptr_crc_nonce_mismatch, + "incorrect nonce"); + } + + bkey_fsck_err_on(crc_since_last_ptr, c, err, + ptr_crc_redundant, + "redundant crc entry"); + crc_since_last_ptr = true; + + bkey_fsck_err_on(crc_is_encoded(crc) && + (crc.uncompressed_size > c->opts.encoded_extent_max >> 9) && + (flags & (BKEY_INVALID_WRITE|BKEY_INVALID_COMMIT)), c, err, + ptr_crc_uncompressed_size_too_big, + "too large encoded extent"); + + size_ondisk = crc.compressed_size; + break; + case BCH_EXTENT_ENTRY_stripe_ptr: + bkey_fsck_err_on(have_ec, c, err, + ptr_stripe_redundant, + "redundant stripe entry"); + have_ec = true; + break; + case BCH_EXTENT_ENTRY_rebalance: { + const struct bch_extent_rebalance *r = &entry->rebalance; + + if (!bch2_compression_opt_valid(r->compression)) { + struct bch_compression_opt opt = __bch2_compression_decode(r->compression); + prt_printf(err, "invalid compression opt %u:%u", + opt.type, opt.level); + return -BCH_ERR_invalid_bkey; + } + break; + } + } + } + + bkey_fsck_err_on(!nr_ptrs, c, err, + extent_ptrs_no_ptrs, + "no ptrs"); + bkey_fsck_err_on(nr_ptrs > BCH_BKEY_PTRS_MAX, c, err, + extent_ptrs_too_many_ptrs, + "too many ptrs: %u > %u", nr_ptrs, BCH_BKEY_PTRS_MAX); + bkey_fsck_err_on(have_written && have_unwritten, c, err, + extent_ptrs_written_and_unwritten, + "extent with unwritten and written ptrs"); + bkey_fsck_err_on(k.k->type != KEY_TYPE_extent && have_unwritten, c, err, + extent_ptrs_unwritten, + "has unwritten ptrs"); + bkey_fsck_err_on(crc_since_last_ptr, c, err, + extent_ptrs_redundant_crc, + "redundant crc entry"); + bkey_fsck_err_on(have_ec, c, err, + extent_ptrs_redundant_stripe, + "redundant stripe entry"); +fsck_err: + return ret; +} + +void bch2_ptr_swab(struct bkey_s k) +{ + struct bkey_ptrs ptrs = bch2_bkey_ptrs(k); + union bch_extent_entry *entry; + u64 *d; + + for (d = (u64 *) ptrs.start; + d != (u64 *) ptrs.end; + d++) + *d = swab64(*d); + + for (entry = ptrs.start; + entry < ptrs.end; + entry = extent_entry_next(entry)) { + switch (extent_entry_type(entry)) { + case BCH_EXTENT_ENTRY_ptr: + break; + case BCH_EXTENT_ENTRY_crc32: + entry->crc32.csum = swab32(entry->crc32.csum); + break; + case BCH_EXTENT_ENTRY_crc64: + entry->crc64.csum_hi = swab16(entry->crc64.csum_hi); + entry->crc64.csum_lo = swab64(entry->crc64.csum_lo); + break; + case BCH_EXTENT_ENTRY_crc128: + entry->crc128.csum.hi = (__force __le64) + swab64((__force u64) entry->crc128.csum.hi); + entry->crc128.csum.lo = (__force __le64) + swab64((__force u64) entry->crc128.csum.lo); + break; + case BCH_EXTENT_ENTRY_stripe_ptr: + break; + case BCH_EXTENT_ENTRY_rebalance: + break; + } + } +} + +const struct bch_extent_rebalance *bch2_bkey_rebalance_opts(struct bkey_s_c k) +{ + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + const union bch_extent_entry *entry; + + bkey_extent_entry_for_each(ptrs, entry) + if (__extent_entry_type(entry) == BCH_EXTENT_ENTRY_rebalance) + return &entry->rebalance; + + return NULL; +} + +unsigned bch2_bkey_ptrs_need_rebalance(struct bch_fs *c, struct bkey_s_c k, + unsigned target, unsigned compression) +{ + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + unsigned rewrite_ptrs = 0; + + if (compression) { + unsigned compression_type = bch2_compression_opt_to_type(compression); + const union bch_extent_entry *entry; + struct extent_ptr_decoded p; + unsigned i = 0; + + bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { + if (p.crc.compression_type == BCH_COMPRESSION_TYPE_incompressible || + p.ptr.unwritten) { + rewrite_ptrs = 0; + goto incompressible; + } + + if (!p.ptr.cached && p.crc.compression_type != compression_type) + rewrite_ptrs |= 1U << i; + i++; + } + } +incompressible: + if (target && bch2_target_accepts_data(c, BCH_DATA_user, target)) { + const struct bch_extent_ptr *ptr; + unsigned i = 0; + + bkey_for_each_ptr(ptrs, ptr) { + if (!ptr->cached && !bch2_dev_in_target(c, ptr->dev, target)) + rewrite_ptrs |= 1U << i; + i++; + } + } + + return rewrite_ptrs; +} + +bool bch2_bkey_needs_rebalance(struct bch_fs *c, struct bkey_s_c k) +{ + const struct bch_extent_rebalance *r = bch2_bkey_rebalance_opts(k); + + /* + * If it's an indirect extent, we don't delete the rebalance entry when + * done so that we know what options were applied - check if it still + * needs work done: + */ + if (r && + k.k->type == KEY_TYPE_reflink_v && + !bch2_bkey_ptrs_need_rebalance(c, k, r->target, r->compression)) + r = NULL; + + return r != NULL; +} + +int bch2_bkey_set_needs_rebalance(struct bch_fs *c, struct bkey_i *_k, + unsigned target, unsigned compression) +{ + struct bkey_s k = bkey_i_to_s(_k); + struct bch_extent_rebalance *r; + bool needs_rebalance; + + if (!bkey_extent_is_direct_data(k.k)) + return 0; + + /* get existing rebalance entry: */ + r = (struct bch_extent_rebalance *) bch2_bkey_rebalance_opts(k.s_c); + if (r) { + if (k.k->type == KEY_TYPE_reflink_v) { + /* + * indirect extents: existing options take precedence, + * so that we don't move extents back and forth if + * they're referenced by different inodes with different + * options: + */ + if (r->target) + target = r->target; + if (r->compression) + compression = r->compression; + } + + r->target = target; + r->compression = compression; + } + + needs_rebalance = bch2_bkey_ptrs_need_rebalance(c, k.s_c, target, compression); + + if (needs_rebalance && !r) { + union bch_extent_entry *new = bkey_val_end(k); + + new->rebalance.type = 1U << BCH_EXTENT_ENTRY_rebalance; + new->rebalance.compression = compression; + new->rebalance.target = target; + new->rebalance.unused = 0; + k.k->u64s += extent_entry_u64s(new); + } else if (!needs_rebalance && r && k.k->type != KEY_TYPE_reflink_v) { + /* + * For indirect extents, don't delete the rebalance entry when + * we're finished so that we know we specifically moved it or + * compressed it to its current location/compression type + */ + extent_entry_drop(k, (union bch_extent_entry *) r); + } + + return 0; +} + +/* Generic extent code: */ + +int bch2_cut_front_s(struct bpos where, struct bkey_s k) +{ + unsigned new_val_u64s = bkey_val_u64s(k.k); + int val_u64s_delta; + u64 sub; + + if (bkey_le(where, bkey_start_pos(k.k))) + return 0; + + EBUG_ON(bkey_gt(where, k.k->p)); + + sub = where.offset - bkey_start_offset(k.k); + + k.k->size -= sub; + + if (!k.k->size) { + k.k->type = KEY_TYPE_deleted; + new_val_u64s = 0; + } + + switch (k.k->type) { + case KEY_TYPE_extent: + case KEY_TYPE_reflink_v: { + struct bkey_ptrs ptrs = bch2_bkey_ptrs(k); + union bch_extent_entry *entry; + bool seen_crc = false; + + bkey_extent_entry_for_each(ptrs, entry) { + switch (extent_entry_type(entry)) { + case BCH_EXTENT_ENTRY_ptr: + if (!seen_crc) + entry->ptr.offset += sub; + break; + case BCH_EXTENT_ENTRY_crc32: + entry->crc32.offset += sub; + break; + case BCH_EXTENT_ENTRY_crc64: + entry->crc64.offset += sub; + break; + case BCH_EXTENT_ENTRY_crc128: + entry->crc128.offset += sub; + break; + case BCH_EXTENT_ENTRY_stripe_ptr: + break; + case BCH_EXTENT_ENTRY_rebalance: + break; + } + + if (extent_entry_is_crc(entry)) + seen_crc = true; + } + + break; + } + case KEY_TYPE_reflink_p: { + struct bkey_s_reflink_p p = bkey_s_to_reflink_p(k); + + le64_add_cpu(&p.v->idx, sub); + break; + } + case KEY_TYPE_inline_data: + case KEY_TYPE_indirect_inline_data: { + void *p = bkey_inline_data_p(k); + unsigned bytes = bkey_inline_data_bytes(k.k); + + sub = min_t(u64, sub << 9, bytes); + + memmove(p, p + sub, bytes - sub); + + new_val_u64s -= sub >> 3; + break; + } + } + + val_u64s_delta = bkey_val_u64s(k.k) - new_val_u64s; + BUG_ON(val_u64s_delta < 0); + + set_bkey_val_u64s(k.k, new_val_u64s); + memset(bkey_val_end(k), 0, val_u64s_delta * sizeof(u64)); + return -val_u64s_delta; +} + +int bch2_cut_back_s(struct bpos where, struct bkey_s k) +{ + unsigned new_val_u64s = bkey_val_u64s(k.k); + int val_u64s_delta; + u64 len = 0; + + if (bkey_ge(where, k.k->p)) + return 0; + + EBUG_ON(bkey_lt(where, bkey_start_pos(k.k))); + + len = where.offset - bkey_start_offset(k.k); + + k.k->p.offset = where.offset; + k.k->size = len; + + if (!len) { + k.k->type = KEY_TYPE_deleted; + new_val_u64s = 0; + } + + switch (k.k->type) { + case KEY_TYPE_inline_data: + case KEY_TYPE_indirect_inline_data: + new_val_u64s = (bkey_inline_data_offset(k.k) + + min(bkey_inline_data_bytes(k.k), k.k->size << 9)) >> 3; + break; + } + + val_u64s_delta = bkey_val_u64s(k.k) - new_val_u64s; + BUG_ON(val_u64s_delta < 0); + + set_bkey_val_u64s(k.k, new_val_u64s); + memset(bkey_val_end(k), 0, val_u64s_delta * sizeof(u64)); + return -val_u64s_delta; +} diff --git a/fs/bcachefs/extents.h b/fs/bcachefs/extents.h new file mode 100644 index 000000000000..a2ce8a3be13c --- /dev/null +++ b/fs/bcachefs/extents.h @@ -0,0 +1,765 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_EXTENTS_H +#define _BCACHEFS_EXTENTS_H + +#include "bcachefs.h" +#include "bkey.h" +#include "extents_types.h" + +struct bch_fs; +struct btree_trans; +enum bkey_invalid_flags; + +/* extent entries: */ + +#define extent_entry_last(_e) \ + ((typeof(&(_e).v->start[0])) bkey_val_end(_e)) + +#define entry_to_ptr(_entry) \ +({ \ + EBUG_ON((_entry) && !extent_entry_is_ptr(_entry)); \ + \ + __builtin_choose_expr( \ + type_is_exact(_entry, const union bch_extent_entry *), \ + (const struct bch_extent_ptr *) (_entry), \ + (struct bch_extent_ptr *) (_entry)); \ +}) + +/* downcast, preserves const */ +#define to_entry(_entry) \ +({ \ + BUILD_BUG_ON(!type_is(_entry, union bch_extent_crc *) && \ + !type_is(_entry, struct bch_extent_ptr *) && \ + !type_is(_entry, struct bch_extent_stripe_ptr *)); \ + \ + __builtin_choose_expr( \ + (type_is_exact(_entry, const union bch_extent_crc *) || \ + type_is_exact(_entry, const struct bch_extent_ptr *) ||\ + type_is_exact(_entry, const struct bch_extent_stripe_ptr *)),\ + (const union bch_extent_entry *) (_entry), \ + (union bch_extent_entry *) (_entry)); \ +}) + +#define extent_entry_next(_entry) \ + ((typeof(_entry)) ((void *) (_entry) + extent_entry_bytes(_entry))) + +static inline unsigned +__extent_entry_type(const union bch_extent_entry *e) +{ + return e->type ? __ffs(e->type) : BCH_EXTENT_ENTRY_MAX; +} + +static inline enum bch_extent_entry_type +extent_entry_type(const union bch_extent_entry *e) +{ + int ret = __ffs(e->type); + + EBUG_ON(ret < 0 || ret >= BCH_EXTENT_ENTRY_MAX); + + return ret; +} + +static inline size_t extent_entry_bytes(const union bch_extent_entry *entry) +{ + switch (extent_entry_type(entry)) { +#define x(f, n) \ + case BCH_EXTENT_ENTRY_##f: \ + return sizeof(struct bch_extent_##f); + BCH_EXTENT_ENTRY_TYPES() +#undef x + default: + BUG(); + } +} + +static inline size_t extent_entry_u64s(const union bch_extent_entry *entry) +{ + return extent_entry_bytes(entry) / sizeof(u64); +} + +static inline void __extent_entry_insert(struct bkey_i *k, + union bch_extent_entry *dst, + union bch_extent_entry *new) +{ + union bch_extent_entry *end = bkey_val_end(bkey_i_to_s(k)); + + memmove_u64s_up_small((u64 *) dst + extent_entry_u64s(new), + dst, (u64 *) end - (u64 *) dst); + k->k.u64s += extent_entry_u64s(new); + memcpy_u64s_small(dst, new, extent_entry_u64s(new)); +} + +static inline void extent_entry_drop(struct bkey_s k, union bch_extent_entry *entry) +{ + union bch_extent_entry *next = extent_entry_next(entry); + + /* stripes have ptrs, but their layout doesn't work with this code */ + BUG_ON(k.k->type == KEY_TYPE_stripe); + + memmove_u64s_down(entry, next, + (u64 *) bkey_val_end(k) - (u64 *) next); + k.k->u64s -= (u64 *) next - (u64 *) entry; +} + +static inline bool extent_entry_is_ptr(const union bch_extent_entry *e) +{ + return extent_entry_type(e) == BCH_EXTENT_ENTRY_ptr; +} + +static inline bool extent_entry_is_stripe_ptr(const union bch_extent_entry *e) +{ + return extent_entry_type(e) == BCH_EXTENT_ENTRY_stripe_ptr; +} + +static inline bool extent_entry_is_crc(const union bch_extent_entry *e) +{ + switch (extent_entry_type(e)) { + case BCH_EXTENT_ENTRY_crc32: + case BCH_EXTENT_ENTRY_crc64: + case BCH_EXTENT_ENTRY_crc128: + return true; + default: + return false; + } +} + +union bch_extent_crc { + u8 type; + struct bch_extent_crc32 crc32; + struct bch_extent_crc64 crc64; + struct bch_extent_crc128 crc128; +}; + +#define __entry_to_crc(_entry) \ + __builtin_choose_expr( \ + type_is_exact(_entry, const union bch_extent_entry *), \ + (const union bch_extent_crc *) (_entry), \ + (union bch_extent_crc *) (_entry)) + +#define entry_to_crc(_entry) \ +({ \ + EBUG_ON((_entry) && !extent_entry_is_crc(_entry)); \ + \ + __entry_to_crc(_entry); \ +}) + +static inline struct bch_extent_crc_unpacked +bch2_extent_crc_unpack(const struct bkey *k, const union bch_extent_crc *crc) +{ +#define common_fields(_crc) \ + .csum_type = _crc.csum_type, \ + .compression_type = _crc.compression_type, \ + .compressed_size = _crc._compressed_size + 1, \ + .uncompressed_size = _crc._uncompressed_size + 1, \ + .offset = _crc.offset, \ + .live_size = k->size + + if (!crc) + return (struct bch_extent_crc_unpacked) { + .compressed_size = k->size, + .uncompressed_size = k->size, + .live_size = k->size, + }; + + switch (extent_entry_type(to_entry(crc))) { + case BCH_EXTENT_ENTRY_crc32: { + struct bch_extent_crc_unpacked ret = (struct bch_extent_crc_unpacked) { + common_fields(crc->crc32), + }; + + *((__le32 *) &ret.csum.lo) = (__le32 __force) crc->crc32.csum; + return ret; + } + case BCH_EXTENT_ENTRY_crc64: { + struct bch_extent_crc_unpacked ret = (struct bch_extent_crc_unpacked) { + common_fields(crc->crc64), + .nonce = crc->crc64.nonce, + .csum.lo = (__force __le64) crc->crc64.csum_lo, + }; + + *((__le16 *) &ret.csum.hi) = (__le16 __force) crc->crc64.csum_hi; + + return ret; + } + case BCH_EXTENT_ENTRY_crc128: { + struct bch_extent_crc_unpacked ret = (struct bch_extent_crc_unpacked) { + common_fields(crc->crc128), + .nonce = crc->crc128.nonce, + .csum = crc->crc128.csum, + }; + + return ret; + } + default: + BUG(); + } +#undef common_fields +} + +static inline bool crc_is_compressed(struct bch_extent_crc_unpacked crc) +{ + return (crc.compression_type != BCH_COMPRESSION_TYPE_none && + crc.compression_type != BCH_COMPRESSION_TYPE_incompressible); +} + +static inline bool crc_is_encoded(struct bch_extent_crc_unpacked crc) +{ + return crc.csum_type != BCH_CSUM_none || crc_is_compressed(crc); +} + +/* bkey_ptrs: generically over any key type that has ptrs */ + +struct bkey_ptrs_c { + const union bch_extent_entry *start; + const union bch_extent_entry *end; +}; + +struct bkey_ptrs { + union bch_extent_entry *start; + union bch_extent_entry *end; +}; + +static inline struct bkey_ptrs_c bch2_bkey_ptrs_c(struct bkey_s_c k) +{ + switch (k.k->type) { + case KEY_TYPE_btree_ptr: { + struct bkey_s_c_btree_ptr e = bkey_s_c_to_btree_ptr(k); + + return (struct bkey_ptrs_c) { + to_entry(&e.v->start[0]), + to_entry(extent_entry_last(e)) + }; + } + case KEY_TYPE_extent: { + struct bkey_s_c_extent e = bkey_s_c_to_extent(k); + + return (struct bkey_ptrs_c) { + e.v->start, + extent_entry_last(e) + }; + } + case KEY_TYPE_stripe: { + struct bkey_s_c_stripe s = bkey_s_c_to_stripe(k); + + return (struct bkey_ptrs_c) { + to_entry(&s.v->ptrs[0]), + to_entry(&s.v->ptrs[s.v->nr_blocks]), + }; + } + case KEY_TYPE_reflink_v: { + struct bkey_s_c_reflink_v r = bkey_s_c_to_reflink_v(k); + + return (struct bkey_ptrs_c) { + r.v->start, + bkey_val_end(r), + }; + } + case KEY_TYPE_btree_ptr_v2: { + struct bkey_s_c_btree_ptr_v2 e = bkey_s_c_to_btree_ptr_v2(k); + + return (struct bkey_ptrs_c) { + to_entry(&e.v->start[0]), + to_entry(extent_entry_last(e)) + }; + } + default: + return (struct bkey_ptrs_c) { NULL, NULL }; + } +} + +static inline struct bkey_ptrs bch2_bkey_ptrs(struct bkey_s k) +{ + struct bkey_ptrs_c p = bch2_bkey_ptrs_c(k.s_c); + + return (struct bkey_ptrs) { + (void *) p.start, + (void *) p.end + }; +} + +#define __bkey_extent_entry_for_each_from(_start, _end, _entry) \ + for ((_entry) = (_start); \ + (_entry) < (_end); \ + (_entry) = extent_entry_next(_entry)) + +#define __bkey_ptr_next(_ptr, _end) \ +({ \ + typeof(_end) _entry; \ + \ + __bkey_extent_entry_for_each_from(to_entry(_ptr), _end, _entry) \ + if (extent_entry_is_ptr(_entry)) \ + break; \ + \ + _entry < (_end) ? entry_to_ptr(_entry) : NULL; \ +}) + +#define bkey_extent_entry_for_each_from(_p, _entry, _start) \ + __bkey_extent_entry_for_each_from(_start, (_p).end, _entry) + +#define bkey_extent_entry_for_each(_p, _entry) \ + bkey_extent_entry_for_each_from(_p, _entry, _p.start) + +#define __bkey_for_each_ptr(_start, _end, _ptr) \ + for ((_ptr) = (_start); \ + ((_ptr) = __bkey_ptr_next(_ptr, _end)); \ + (_ptr)++) + +#define bkey_ptr_next(_p, _ptr) \ + __bkey_ptr_next(_ptr, (_p).end) + +#define bkey_for_each_ptr(_p, _ptr) \ + __bkey_for_each_ptr(&(_p).start->ptr, (_p).end, _ptr) + +#define __bkey_ptr_next_decode(_k, _end, _ptr, _entry) \ +({ \ + __label__ out; \ + \ + (_ptr).idx = 0; \ + (_ptr).has_ec = false; \ + \ + __bkey_extent_entry_for_each_from(_entry, _end, _entry) \ + switch (extent_entry_type(_entry)) { \ + case BCH_EXTENT_ENTRY_ptr: \ + (_ptr).ptr = _entry->ptr; \ + goto out; \ + case BCH_EXTENT_ENTRY_crc32: \ + case BCH_EXTENT_ENTRY_crc64: \ + case BCH_EXTENT_ENTRY_crc128: \ + (_ptr).crc = bch2_extent_crc_unpack(_k, \ + entry_to_crc(_entry)); \ + break; \ + case BCH_EXTENT_ENTRY_stripe_ptr: \ + (_ptr).ec = _entry->stripe_ptr; \ + (_ptr).has_ec = true; \ + break; \ + default: \ + /* nothing */ \ + break; \ + } \ +out: \ + _entry < (_end); \ +}) + +#define __bkey_for_each_ptr_decode(_k, _start, _end, _ptr, _entry) \ + for ((_ptr).crc = bch2_extent_crc_unpack(_k, NULL), \ + (_entry) = _start; \ + __bkey_ptr_next_decode(_k, _end, _ptr, _entry); \ + (_entry) = extent_entry_next(_entry)) + +#define bkey_for_each_ptr_decode(_k, _p, _ptr, _entry) \ + __bkey_for_each_ptr_decode(_k, (_p).start, (_p).end, \ + _ptr, _entry) + +#define bkey_crc_next(_k, _start, _end, _crc, _iter) \ +({ \ + __bkey_extent_entry_for_each_from(_iter, _end, _iter) \ + if (extent_entry_is_crc(_iter)) { \ + (_crc) = bch2_extent_crc_unpack(_k, \ + entry_to_crc(_iter)); \ + break; \ + } \ + \ + (_iter) < (_end); \ +}) + +#define __bkey_for_each_crc(_k, _start, _end, _crc, _iter) \ + for ((_crc) = bch2_extent_crc_unpack(_k, NULL), \ + (_iter) = (_start); \ + bkey_crc_next(_k, _start, _end, _crc, _iter); \ + (_iter) = extent_entry_next(_iter)) + +#define bkey_for_each_crc(_k, _p, _crc, _iter) \ + __bkey_for_each_crc(_k, (_p).start, (_p).end, _crc, _iter) + +/* Iterate over pointers in KEY_TYPE_extent: */ + +#define extent_for_each_entry_from(_e, _entry, _start) \ + __bkey_extent_entry_for_each_from(_start, \ + extent_entry_last(_e), _entry) + +#define extent_for_each_entry(_e, _entry) \ + extent_for_each_entry_from(_e, _entry, (_e).v->start) + +#define extent_ptr_next(_e, _ptr) \ + __bkey_ptr_next(_ptr, extent_entry_last(_e)) + +#define extent_for_each_ptr(_e, _ptr) \ + __bkey_for_each_ptr(&(_e).v->start->ptr, extent_entry_last(_e), _ptr) + +#define extent_for_each_ptr_decode(_e, _ptr, _entry) \ + __bkey_for_each_ptr_decode((_e).k, (_e).v->start, \ + extent_entry_last(_e), _ptr, _entry) + +/* utility code common to all keys with pointers: */ + +void bch2_mark_io_failure(struct bch_io_failures *, + struct extent_ptr_decoded *); +int bch2_bkey_pick_read_device(struct bch_fs *, struct bkey_s_c, + struct bch_io_failures *, + struct extent_ptr_decoded *); + +/* KEY_TYPE_btree_ptr: */ + +int bch2_btree_ptr_invalid(struct bch_fs *, struct bkey_s_c, + enum bkey_invalid_flags, struct printbuf *); +void bch2_btree_ptr_to_text(struct printbuf *, struct bch_fs *, + struct bkey_s_c); + +int bch2_btree_ptr_v2_invalid(struct bch_fs *, struct bkey_s_c, + enum bkey_invalid_flags, struct printbuf *); +void bch2_btree_ptr_v2_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); +void bch2_btree_ptr_v2_compat(enum btree_id, unsigned, unsigned, + int, struct bkey_s); + +#define bch2_bkey_ops_btree_ptr ((struct bkey_ops) { \ + .key_invalid = bch2_btree_ptr_invalid, \ + .val_to_text = bch2_btree_ptr_to_text, \ + .swab = bch2_ptr_swab, \ + .trans_trigger = bch2_trans_mark_extent, \ + .atomic_trigger = bch2_mark_extent, \ +}) + +#define bch2_bkey_ops_btree_ptr_v2 ((struct bkey_ops) { \ + .key_invalid = bch2_btree_ptr_v2_invalid, \ + .val_to_text = bch2_btree_ptr_v2_to_text, \ + .swab = bch2_ptr_swab, \ + .compat = bch2_btree_ptr_v2_compat, \ + .trans_trigger = bch2_trans_mark_extent, \ + .atomic_trigger = bch2_mark_extent, \ + .min_val_size = 40, \ +}) + +/* KEY_TYPE_extent: */ + +bool bch2_extent_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c); + +#define bch2_bkey_ops_extent ((struct bkey_ops) { \ + .key_invalid = bch2_bkey_ptrs_invalid, \ + .val_to_text = bch2_bkey_ptrs_to_text, \ + .swab = bch2_ptr_swab, \ + .key_normalize = bch2_extent_normalize, \ + .key_merge = bch2_extent_merge, \ + .trans_trigger = bch2_trans_mark_extent, \ + .atomic_trigger = bch2_mark_extent, \ +}) + +/* KEY_TYPE_reservation: */ + +int bch2_reservation_invalid(struct bch_fs *, struct bkey_s_c, + enum bkey_invalid_flags, struct printbuf *); +void bch2_reservation_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); +bool bch2_reservation_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c); + +#define bch2_bkey_ops_reservation ((struct bkey_ops) { \ + .key_invalid = bch2_reservation_invalid, \ + .val_to_text = bch2_reservation_to_text, \ + .key_merge = bch2_reservation_merge, \ + .trans_trigger = bch2_trans_mark_reservation, \ + .atomic_trigger = bch2_mark_reservation, \ + .min_val_size = 8, \ +}) + +/* Extent checksum entries: */ + +bool bch2_can_narrow_extent_crcs(struct bkey_s_c, + struct bch_extent_crc_unpacked); +bool bch2_bkey_narrow_crcs(struct bkey_i *, struct bch_extent_crc_unpacked); +void bch2_extent_crc_append(struct bkey_i *, + struct bch_extent_crc_unpacked); + +/* Generic code for keys with pointers: */ + +static inline bool bkey_is_btree_ptr(const struct bkey *k) +{ + switch (k->type) { + case KEY_TYPE_btree_ptr: + case KEY_TYPE_btree_ptr_v2: + return true; + default: + return false; + } +} + +static inline bool bkey_extent_is_direct_data(const struct bkey *k) +{ + switch (k->type) { + case KEY_TYPE_btree_ptr: + case KEY_TYPE_btree_ptr_v2: + case KEY_TYPE_extent: + case KEY_TYPE_reflink_v: + return true; + default: + return false; + } +} + +static inline bool bkey_extent_is_inline_data(const struct bkey *k) +{ + return k->type == KEY_TYPE_inline_data || + k->type == KEY_TYPE_indirect_inline_data; +} + +static inline unsigned bkey_inline_data_offset(const struct bkey *k) +{ + switch (k->type) { + case KEY_TYPE_inline_data: + return sizeof(struct bch_inline_data); + case KEY_TYPE_indirect_inline_data: + return sizeof(struct bch_indirect_inline_data); + default: + BUG(); + } +} + +static inline unsigned bkey_inline_data_bytes(const struct bkey *k) +{ + return bkey_val_bytes(k) - bkey_inline_data_offset(k); +} + +#define bkey_inline_data_p(_k) (((void *) (_k).v) + bkey_inline_data_offset((_k).k)) + +static inline bool bkey_extent_is_data(const struct bkey *k) +{ + return bkey_extent_is_direct_data(k) || + bkey_extent_is_inline_data(k) || + k->type == KEY_TYPE_reflink_p; +} + +/* + * Should extent be counted under inode->i_sectors? + */ +static inline bool bkey_extent_is_allocation(const struct bkey *k) +{ + switch (k->type) { + case KEY_TYPE_extent: + case KEY_TYPE_reservation: + case KEY_TYPE_reflink_p: + case KEY_TYPE_reflink_v: + case KEY_TYPE_inline_data: + case KEY_TYPE_indirect_inline_data: + case KEY_TYPE_error: + return true; + default: + return false; + } +} + +static inline bool bkey_extent_is_unwritten(struct bkey_s_c k) +{ + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + const struct bch_extent_ptr *ptr; + + bkey_for_each_ptr(ptrs, ptr) + if (ptr->unwritten) + return true; + return false; +} + +static inline bool bkey_extent_is_reservation(struct bkey_s_c k) +{ + return k.k->type == KEY_TYPE_reservation || + bkey_extent_is_unwritten(k); +} + +static inline struct bch_devs_list bch2_bkey_devs(struct bkey_s_c k) +{ + struct bch_devs_list ret = (struct bch_devs_list) { 0 }; + struct bkey_ptrs_c p = bch2_bkey_ptrs_c(k); + const struct bch_extent_ptr *ptr; + + bkey_for_each_ptr(p, ptr) + ret.devs[ret.nr++] = ptr->dev; + + return ret; +} + +static inline struct bch_devs_list bch2_bkey_dirty_devs(struct bkey_s_c k) +{ + struct bch_devs_list ret = (struct bch_devs_list) { 0 }; + struct bkey_ptrs_c p = bch2_bkey_ptrs_c(k); + const struct bch_extent_ptr *ptr; + + bkey_for_each_ptr(p, ptr) + if (!ptr->cached) + ret.devs[ret.nr++] = ptr->dev; + + return ret; +} + +static inline struct bch_devs_list bch2_bkey_cached_devs(struct bkey_s_c k) +{ + struct bch_devs_list ret = (struct bch_devs_list) { 0 }; + struct bkey_ptrs_c p = bch2_bkey_ptrs_c(k); + const struct bch_extent_ptr *ptr; + + bkey_for_each_ptr(p, ptr) + if (ptr->cached) + ret.devs[ret.nr++] = ptr->dev; + + return ret; +} + +static inline unsigned bch2_bkey_ptr_data_type(struct bkey_s_c k, const struct bch_extent_ptr *ptr) +{ + switch (k.k->type) { + case KEY_TYPE_btree_ptr: + case KEY_TYPE_btree_ptr_v2: + return BCH_DATA_btree; + case KEY_TYPE_extent: + case KEY_TYPE_reflink_v: + return BCH_DATA_user; + case KEY_TYPE_stripe: { + struct bkey_s_c_stripe s = bkey_s_c_to_stripe(k); + + BUG_ON(ptr < s.v->ptrs || + ptr >= s.v->ptrs + s.v->nr_blocks); + + return ptr >= s.v->ptrs + s.v->nr_blocks - s.v->nr_redundant + ? BCH_DATA_parity + : BCH_DATA_user; + } + default: + BUG(); + } +} + +unsigned bch2_bkey_nr_ptrs(struct bkey_s_c); +unsigned bch2_bkey_nr_ptrs_allocated(struct bkey_s_c); +unsigned bch2_bkey_nr_ptrs_fully_allocated(struct bkey_s_c); +bool bch2_bkey_is_incompressible(struct bkey_s_c); +unsigned bch2_bkey_sectors_compressed(struct bkey_s_c); + +unsigned bch2_bkey_replicas(struct bch_fs *, struct bkey_s_c); +unsigned bch2_extent_ptr_desired_durability(struct bch_fs *, struct extent_ptr_decoded *); +unsigned bch2_extent_ptr_durability(struct bch_fs *, struct extent_ptr_decoded *); +unsigned bch2_bkey_durability(struct bch_fs *, struct bkey_s_c); + +void bch2_bkey_drop_device(struct bkey_s, unsigned); +void bch2_bkey_drop_device_noerror(struct bkey_s, unsigned); + +const struct bch_extent_ptr *bch2_bkey_has_device_c(struct bkey_s_c, unsigned); + +static inline struct bch_extent_ptr *bch2_bkey_has_device(struct bkey_s k, unsigned dev) +{ + return (void *) bch2_bkey_has_device_c(k.s_c, dev); +} + +bool bch2_bkey_has_target(struct bch_fs *, struct bkey_s_c, unsigned); + +void bch2_bkey_extent_entry_drop(struct bkey_i *, union bch_extent_entry *); + +static inline void bch2_bkey_append_ptr(struct bkey_i *k, struct bch_extent_ptr ptr) +{ + struct bch_extent_ptr *dest; + + EBUG_ON(bch2_bkey_has_device(bkey_i_to_s(k), ptr.dev)); + + switch (k->k.type) { + case KEY_TYPE_btree_ptr: + case KEY_TYPE_btree_ptr_v2: + case KEY_TYPE_extent: + EBUG_ON(bkey_val_u64s(&k->k) >= BKEY_EXTENT_VAL_U64s_MAX); + + ptr.type = 1 << BCH_EXTENT_ENTRY_ptr; + dest = (struct bch_extent_ptr *)((void *) &k->v + bkey_val_bytes(&k->k)); + *dest = ptr; + k->k.u64s++; + break; + default: + BUG(); + } +} + +void bch2_extent_ptr_decoded_append(struct bkey_i *, + struct extent_ptr_decoded *); +union bch_extent_entry *bch2_bkey_drop_ptr_noerror(struct bkey_s, + struct bch_extent_ptr *); +union bch_extent_entry *bch2_bkey_drop_ptr(struct bkey_s, + struct bch_extent_ptr *); + +#define bch2_bkey_drop_ptrs(_k, _ptr, _cond) \ +do { \ + struct bkey_ptrs _ptrs = bch2_bkey_ptrs(_k); \ + \ + _ptr = &_ptrs.start->ptr; \ + \ + while ((_ptr = bkey_ptr_next(_ptrs, _ptr))) { \ + if (_cond) { \ + _ptr = (void *) bch2_bkey_drop_ptr(_k, _ptr); \ + _ptrs = bch2_bkey_ptrs(_k); \ + continue; \ + } \ + \ + (_ptr)++; \ + } \ +} while (0) + +bool bch2_bkey_matches_ptr(struct bch_fs *, struct bkey_s_c, + struct bch_extent_ptr, u64); +bool bch2_extents_match(struct bkey_s_c, struct bkey_s_c); +struct bch_extent_ptr * +bch2_extent_has_ptr(struct bkey_s_c, struct extent_ptr_decoded, struct bkey_s); + +void bch2_extent_ptr_set_cached(struct bkey_s, struct bch_extent_ptr *); + +bool bch2_extent_normalize(struct bch_fs *, struct bkey_s); +void bch2_bkey_ptrs_to_text(struct printbuf *, struct bch_fs *, + struct bkey_s_c); +int bch2_bkey_ptrs_invalid(struct bch_fs *, struct bkey_s_c, + enum bkey_invalid_flags, struct printbuf *); + +void bch2_ptr_swab(struct bkey_s); + +const struct bch_extent_rebalance *bch2_bkey_rebalance_opts(struct bkey_s_c); +unsigned bch2_bkey_ptrs_need_rebalance(struct bch_fs *, struct bkey_s_c, + unsigned, unsigned); +bool bch2_bkey_needs_rebalance(struct bch_fs *, struct bkey_s_c); + +int bch2_bkey_set_needs_rebalance(struct bch_fs *, struct bkey_i *, + unsigned, unsigned); + +/* Generic extent code: */ + +enum bch_extent_overlap { + BCH_EXTENT_OVERLAP_ALL = 0, + BCH_EXTENT_OVERLAP_BACK = 1, + BCH_EXTENT_OVERLAP_FRONT = 2, + BCH_EXTENT_OVERLAP_MIDDLE = 3, +}; + +/* Returns how k overlaps with m */ +static inline enum bch_extent_overlap bch2_extent_overlap(const struct bkey *k, + const struct bkey *m) +{ + int cmp1 = bkey_lt(k->p, m->p); + int cmp2 = bkey_gt(bkey_start_pos(k), bkey_start_pos(m)); + + return (cmp1 << 1) + cmp2; +} + +int bch2_cut_front_s(struct bpos, struct bkey_s); +int bch2_cut_back_s(struct bpos, struct bkey_s); + +static inline void bch2_cut_front(struct bpos where, struct bkey_i *k) +{ + bch2_cut_front_s(where, bkey_i_to_s(k)); +} + +static inline void bch2_cut_back(struct bpos where, struct bkey_i *k) +{ + bch2_cut_back_s(where, bkey_i_to_s(k)); +} + +/** + * bch_key_resize - adjust size of @k + * + * bkey_start_offset(k) will be preserved, modifies where the extent ends + */ +static inline void bch2_key_resize(struct bkey *k, unsigned new_size) +{ + k->p.offset -= k->size; + k->p.offset += new_size; + k->size = new_size; +} + +#endif /* _BCACHEFS_EXTENTS_H */ diff --git a/fs/bcachefs/extents_types.h b/fs/bcachefs/extents_types.h new file mode 100644 index 000000000000..43d6c341ecca --- /dev/null +++ b/fs/bcachefs/extents_types.h @@ -0,0 +1,40 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_EXTENTS_TYPES_H +#define _BCACHEFS_EXTENTS_TYPES_H + +#include "bcachefs_format.h" + +struct bch_extent_crc_unpacked { + u32 compressed_size; + u32 uncompressed_size; + u32 live_size; + + u8 csum_type; + u8 compression_type; + + u16 offset; + + u16 nonce; + + struct bch_csum csum; +}; + +struct extent_ptr_decoded { + unsigned idx; + bool has_ec; + struct bch_extent_crc_unpacked crc; + struct bch_extent_ptr ptr; + struct bch_extent_stripe_ptr ec; +}; + +struct bch_io_failures { + u8 nr; + struct bch_dev_io_failures { + u8 dev; + u8 idx; + u8 nr_failed; + u8 nr_retries; + } devs[BCH_REPLICAS_MAX]; +}; + +#endif /* _BCACHEFS_EXTENTS_TYPES_H */ diff --git a/fs/bcachefs/eytzinger.h b/fs/bcachefs/eytzinger.h new file mode 100644 index 000000000000..05429c9631cd --- /dev/null +++ b/fs/bcachefs/eytzinger.h @@ -0,0 +1,281 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _EYTZINGER_H +#define _EYTZINGER_H + +#include <linux/bitops.h> +#include <linux/log2.h> + +#include "util.h" + +/* + * Traversal for trees in eytzinger layout - a full binary tree layed out in an + * array + */ + +/* + * One based indexing version: + * + * With one based indexing each level of the tree starts at a power of two - + * good for cacheline alignment: + */ + +static inline unsigned eytzinger1_child(unsigned i, unsigned child) +{ + EBUG_ON(child > 1); + + return (i << 1) + child; +} + +static inline unsigned eytzinger1_left_child(unsigned i) +{ + return eytzinger1_child(i, 0); +} + +static inline unsigned eytzinger1_right_child(unsigned i) +{ + return eytzinger1_child(i, 1); +} + +static inline unsigned eytzinger1_first(unsigned size) +{ + return rounddown_pow_of_two(size); +} + +static inline unsigned eytzinger1_last(unsigned size) +{ + return rounddown_pow_of_two(size + 1) - 1; +} + +/* + * eytzinger1_next() and eytzinger1_prev() have the nice properties that + * + * eytzinger1_next(0) == eytzinger1_first()) + * eytzinger1_prev(0) == eytzinger1_last()) + * + * eytzinger1_prev(eytzinger1_first()) == 0 + * eytzinger1_next(eytzinger1_last()) == 0 + */ + +static inline unsigned eytzinger1_next(unsigned i, unsigned size) +{ + EBUG_ON(i > size); + + if (eytzinger1_right_child(i) <= size) { + i = eytzinger1_right_child(i); + + i <<= __fls(size + 1) - __fls(i); + i >>= i > size; + } else { + i >>= ffz(i) + 1; + } + + return i; +} + +static inline unsigned eytzinger1_prev(unsigned i, unsigned size) +{ + EBUG_ON(i > size); + + if (eytzinger1_left_child(i) <= size) { + i = eytzinger1_left_child(i) + 1; + + i <<= __fls(size + 1) - __fls(i); + i -= 1; + i >>= i > size; + } else { + i >>= __ffs(i) + 1; + } + + return i; +} + +static inline unsigned eytzinger1_extra(unsigned size) +{ + return (size + 1 - rounddown_pow_of_two(size)) << 1; +} + +static inline unsigned __eytzinger1_to_inorder(unsigned i, unsigned size, + unsigned extra) +{ + unsigned b = __fls(i); + unsigned shift = __fls(size) - b; + int s; + + EBUG_ON(!i || i > size); + + i ^= 1U << b; + i <<= 1; + i |= 1; + i <<= shift; + + /* + * sign bit trick: + * + * if (i > extra) + * i -= (i - extra) >> 1; + */ + s = extra - i; + i += (s >> 1) & (s >> 31); + + return i; +} + +static inline unsigned __inorder_to_eytzinger1(unsigned i, unsigned size, + unsigned extra) +{ + unsigned shift; + int s; + + EBUG_ON(!i || i > size); + + /* + * sign bit trick: + * + * if (i > extra) + * i += i - extra; + */ + s = extra - i; + i -= s & (s >> 31); + + shift = __ffs(i); + + i >>= shift + 1; + i |= 1U << (__fls(size) - shift); + + return i; +} + +static inline unsigned eytzinger1_to_inorder(unsigned i, unsigned size) +{ + return __eytzinger1_to_inorder(i, size, eytzinger1_extra(size)); +} + +static inline unsigned inorder_to_eytzinger1(unsigned i, unsigned size) +{ + return __inorder_to_eytzinger1(i, size, eytzinger1_extra(size)); +} + +#define eytzinger1_for_each(_i, _size) \ + for ((_i) = eytzinger1_first((_size)); \ + (_i) != 0; \ + (_i) = eytzinger1_next((_i), (_size))) + +/* Zero based indexing version: */ + +static inline unsigned eytzinger0_child(unsigned i, unsigned child) +{ + EBUG_ON(child > 1); + + return (i << 1) + 1 + child; +} + +static inline unsigned eytzinger0_left_child(unsigned i) +{ + return eytzinger0_child(i, 0); +} + +static inline unsigned eytzinger0_right_child(unsigned i) +{ + return eytzinger0_child(i, 1); +} + +static inline unsigned eytzinger0_first(unsigned size) +{ + return eytzinger1_first(size) - 1; +} + +static inline unsigned eytzinger0_last(unsigned size) +{ + return eytzinger1_last(size) - 1; +} + +static inline unsigned eytzinger0_next(unsigned i, unsigned size) +{ + return eytzinger1_next(i + 1, size) - 1; +} + +static inline unsigned eytzinger0_prev(unsigned i, unsigned size) +{ + return eytzinger1_prev(i + 1, size) - 1; +} + +static inline unsigned eytzinger0_extra(unsigned size) +{ + return eytzinger1_extra(size); +} + +static inline unsigned __eytzinger0_to_inorder(unsigned i, unsigned size, + unsigned extra) +{ + return __eytzinger1_to_inorder(i + 1, size, extra) - 1; +} + +static inline unsigned __inorder_to_eytzinger0(unsigned i, unsigned size, + unsigned extra) +{ + return __inorder_to_eytzinger1(i + 1, size, extra) - 1; +} + +static inline unsigned eytzinger0_to_inorder(unsigned i, unsigned size) +{ + return __eytzinger0_to_inorder(i, size, eytzinger0_extra(size)); +} + +static inline unsigned inorder_to_eytzinger0(unsigned i, unsigned size) +{ + return __inorder_to_eytzinger0(i, size, eytzinger0_extra(size)); +} + +#define eytzinger0_for_each(_i, _size) \ + for ((_i) = eytzinger0_first((_size)); \ + (_i) != -1; \ + (_i) = eytzinger0_next((_i), (_size))) + +typedef int (*eytzinger_cmp_fn)(const void *l, const void *r, size_t size); + +/* return greatest node <= @search, or -1 if not found */ +static inline ssize_t eytzinger0_find_le(void *base, size_t nr, size_t size, + eytzinger_cmp_fn cmp, const void *search) +{ + unsigned i, n = 0; + + if (!nr) + return -1; + + do { + i = n; + n = eytzinger0_child(i, cmp(search, base + i * size, size) >= 0); + } while (n < nr); + + if (n & 1) { + /* @i was greater than @search, return previous node: */ + + if (i == eytzinger0_first(nr)) + return -1; + + return eytzinger0_prev(i, nr); + } else { + return i; + } +} + +#define eytzinger0_find(base, nr, size, _cmp, search) \ +({ \ + void *_base = (base); \ + void *_search = (search); \ + size_t _nr = (nr); \ + size_t _size = (size); \ + size_t _i = 0; \ + int _res; \ + \ + while (_i < _nr && \ + (_res = _cmp(_search, _base + _i * _size, _size))) \ + _i = eytzinger0_child(_i, _res > 0); \ + _i; \ +}) + +void eytzinger0_sort(void *, size_t, size_t, + int (*cmp_func)(const void *, const void *, size_t), + void (*swap_func)(void *, void *, size_t)); + +#endif /* _EYTZINGER_H */ diff --git a/fs/bcachefs/fifo.h b/fs/bcachefs/fifo.h new file mode 100644 index 000000000000..66b945be10c2 --- /dev/null +++ b/fs/bcachefs/fifo.h @@ -0,0 +1,127 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_FIFO_H +#define _BCACHEFS_FIFO_H + +#include "util.h" + +#define FIFO(type) \ +struct { \ + size_t front, back, size, mask; \ + type *data; \ +} + +#define DECLARE_FIFO(type, name) FIFO(type) name + +#define fifo_buf_size(fifo) \ + ((fifo)->size \ + ? roundup_pow_of_two((fifo)->size) * sizeof((fifo)->data[0]) \ + : 0) + +#define init_fifo(fifo, _size, _gfp) \ +({ \ + (fifo)->front = (fifo)->back = 0; \ + (fifo)->size = (_size); \ + (fifo)->mask = (fifo)->size \ + ? roundup_pow_of_two((fifo)->size) - 1 \ + : 0; \ + (fifo)->data = kvpmalloc(fifo_buf_size(fifo), (_gfp)); \ +}) + +#define free_fifo(fifo) \ +do { \ + kvpfree((fifo)->data, fifo_buf_size(fifo)); \ + (fifo)->data = NULL; \ +} while (0) + +#define fifo_swap(l, r) \ +do { \ + swap((l)->front, (r)->front); \ + swap((l)->back, (r)->back); \ + swap((l)->size, (r)->size); \ + swap((l)->mask, (r)->mask); \ + swap((l)->data, (r)->data); \ +} while (0) + +#define fifo_move(dest, src) \ +do { \ + typeof(*((dest)->data)) _t; \ + while (!fifo_full(dest) && \ + fifo_pop(src, _t)) \ + fifo_push(dest, _t); \ +} while (0) + +#define fifo_used(fifo) (((fifo)->back - (fifo)->front)) +#define fifo_free(fifo) ((fifo)->size - fifo_used(fifo)) + +#define fifo_empty(fifo) ((fifo)->front == (fifo)->back) +#define fifo_full(fifo) (fifo_used(fifo) == (fifo)->size) + +#define fifo_peek_front(fifo) ((fifo)->data[(fifo)->front & (fifo)->mask]) +#define fifo_peek_back(fifo) ((fifo)->data[((fifo)->back - 1) & (fifo)->mask]) + +#define fifo_entry_idx_abs(fifo, p) \ + ((((p) >= &fifo_peek_front(fifo) \ + ? (fifo)->front : (fifo)->back) & ~(fifo)->mask) + \ + (((p) - (fifo)->data))) + +#define fifo_entry_idx(fifo, p) (((p) - &fifo_peek_front(fifo)) & (fifo)->mask) +#define fifo_idx_entry(fifo, i) ((fifo)->data[((fifo)->front + (i)) & (fifo)->mask]) + +#define fifo_push_back_ref(f) \ + (fifo_full((f)) ? NULL : &(f)->data[(f)->back++ & (f)->mask]) + +#define fifo_push_front_ref(f) \ + (fifo_full((f)) ? NULL : &(f)->data[--(f)->front & (f)->mask]) + +#define fifo_push_back(fifo, new) \ +({ \ + typeof((fifo)->data) _r = fifo_push_back_ref(fifo); \ + if (_r) \ + *_r = (new); \ + _r != NULL; \ +}) + +#define fifo_push_front(fifo, new) \ +({ \ + typeof((fifo)->data) _r = fifo_push_front_ref(fifo); \ + if (_r) \ + *_r = (new); \ + _r != NULL; \ +}) + +#define fifo_pop_front(fifo, i) \ +({ \ + bool _r = !fifo_empty((fifo)); \ + if (_r) \ + (i) = (fifo)->data[(fifo)->front++ & (fifo)->mask]; \ + _r; \ +}) + +#define fifo_pop_back(fifo, i) \ +({ \ + bool _r = !fifo_empty((fifo)); \ + if (_r) \ + (i) = (fifo)->data[--(fifo)->back & (fifo)->mask]; \ + _r; \ +}) + +#define fifo_push_ref(fifo) fifo_push_back_ref(fifo) +#define fifo_push(fifo, i) fifo_push_back(fifo, (i)) +#define fifo_pop(fifo, i) fifo_pop_front(fifo, (i)) +#define fifo_peek(fifo) fifo_peek_front(fifo) + +#define fifo_for_each_entry(_entry, _fifo, _iter) \ + for (typecheck(typeof((_fifo)->front), _iter), \ + (_iter) = (_fifo)->front; \ + ((_iter != (_fifo)->back) && \ + (_entry = (_fifo)->data[(_iter) & (_fifo)->mask], true)); \ + (_iter)++) + +#define fifo_for_each_entry_ptr(_ptr, _fifo, _iter) \ + for (typecheck(typeof((_fifo)->front), _iter), \ + (_iter) = (_fifo)->front; \ + ((_iter != (_fifo)->back) && \ + (_ptr = &(_fifo)->data[(_iter) & (_fifo)->mask], true)); \ + (_iter)++) + +#endif /* _BCACHEFS_FIFO_H */ diff --git a/fs/bcachefs/fs-common.c b/fs/bcachefs/fs-common.c new file mode 100644 index 000000000000..4496cf91a4c1 --- /dev/null +++ b/fs/bcachefs/fs-common.c @@ -0,0 +1,501 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" +#include "acl.h" +#include "btree_update.h" +#include "dirent.h" +#include "fs-common.h" +#include "inode.h" +#include "subvolume.h" +#include "xattr.h" + +#include <linux/posix_acl.h> + +static inline int is_subdir_for_nlink(struct bch_inode_unpacked *inode) +{ + return S_ISDIR(inode->bi_mode) && !inode->bi_subvol; +} + +int bch2_create_trans(struct btree_trans *trans, + subvol_inum dir, + struct bch_inode_unpacked *dir_u, + struct bch_inode_unpacked *new_inode, + const struct qstr *name, + uid_t uid, gid_t gid, umode_t mode, dev_t rdev, + struct posix_acl *default_acl, + struct posix_acl *acl, + subvol_inum snapshot_src, + unsigned flags) +{ + struct bch_fs *c = trans->c; + struct btree_iter dir_iter = { NULL }; + struct btree_iter inode_iter = { NULL }; + subvol_inum new_inum = dir; + u64 now = bch2_current_time(c); + u64 cpu = raw_smp_processor_id(); + u64 dir_target; + u32 snapshot; + unsigned dir_type = mode_to_type(mode); + int ret; + + ret = bch2_subvolume_get_snapshot(trans, dir.subvol, &snapshot); + if (ret) + goto err; + + ret = bch2_inode_peek(trans, &dir_iter, dir_u, dir, BTREE_ITER_INTENT); + if (ret) + goto err; + + if (!(flags & BCH_CREATE_SNAPSHOT)) { + /* Normal create path - allocate a new inode: */ + bch2_inode_init_late(new_inode, now, uid, gid, mode, rdev, dir_u); + + if (flags & BCH_CREATE_TMPFILE) + new_inode->bi_flags |= BCH_INODE_unlinked; + + ret = bch2_inode_create(trans, &inode_iter, new_inode, snapshot, cpu); + if (ret) + goto err; + + snapshot_src = (subvol_inum) { 0 }; + } else { + /* + * Creating a snapshot - we're not allocating a new inode, but + * we do have to lookup the root inode of the subvolume we're + * snapshotting and update it (in the new snapshot): + */ + + if (!snapshot_src.inum) { + /* Inode wasn't specified, just snapshot: */ + struct bch_subvolume s; + + ret = bch2_subvolume_get(trans, snapshot_src.subvol, true, + BTREE_ITER_CACHED, &s); + if (ret) + goto err; + + snapshot_src.inum = le64_to_cpu(s.inode); + } + + ret = bch2_inode_peek(trans, &inode_iter, new_inode, snapshot_src, + BTREE_ITER_INTENT); + if (ret) + goto err; + + if (new_inode->bi_subvol != snapshot_src.subvol) { + /* Not a subvolume root: */ + ret = -EINVAL; + goto err; + } + + /* + * If we're not root, we have to own the subvolume being + * snapshotted: + */ + if (uid && new_inode->bi_uid != uid) { + ret = -EPERM; + goto err; + } + + flags |= BCH_CREATE_SUBVOL; + } + + new_inum.inum = new_inode->bi_inum; + dir_target = new_inode->bi_inum; + + if (flags & BCH_CREATE_SUBVOL) { + u32 new_subvol, dir_snapshot; + + ret = bch2_subvolume_create(trans, new_inode->bi_inum, + snapshot_src.subvol, + &new_subvol, &snapshot, + (flags & BCH_CREATE_SNAPSHOT_RO) != 0); + if (ret) + goto err; + + new_inode->bi_parent_subvol = dir.subvol; + new_inode->bi_subvol = new_subvol; + new_inum.subvol = new_subvol; + dir_target = new_subvol; + dir_type = DT_SUBVOL; + + ret = bch2_subvolume_get_snapshot(trans, dir.subvol, &dir_snapshot); + if (ret) + goto err; + + bch2_btree_iter_set_snapshot(&dir_iter, dir_snapshot); + ret = bch2_btree_iter_traverse(&dir_iter); + if (ret) + goto err; + } + + if (!(flags & BCH_CREATE_SNAPSHOT)) { + if (default_acl) { + ret = bch2_set_acl_trans(trans, new_inum, new_inode, + default_acl, ACL_TYPE_DEFAULT); + if (ret) + goto err; + } + + if (acl) { + ret = bch2_set_acl_trans(trans, new_inum, new_inode, + acl, ACL_TYPE_ACCESS); + if (ret) + goto err; + } + } + + if (!(flags & BCH_CREATE_TMPFILE)) { + struct bch_hash_info dir_hash = bch2_hash_info_init(c, dir_u); + u64 dir_offset; + + if (is_subdir_for_nlink(new_inode)) + dir_u->bi_nlink++; + dir_u->bi_mtime = dir_u->bi_ctime = now; + + ret = bch2_inode_write(trans, &dir_iter, dir_u); + if (ret) + goto err; + + ret = bch2_dirent_create(trans, dir, &dir_hash, + dir_type, + name, + dir_target, + &dir_offset, + BCH_HASH_SET_MUST_CREATE); + if (ret) + goto err; + + if (c->sb.version >= bcachefs_metadata_version_inode_backpointers) { + new_inode->bi_dir = dir_u->bi_inum; + new_inode->bi_dir_offset = dir_offset; + } + } + + inode_iter.flags &= ~BTREE_ITER_ALL_SNAPSHOTS; + bch2_btree_iter_set_snapshot(&inode_iter, snapshot); + + ret = bch2_btree_iter_traverse(&inode_iter) ?: + bch2_inode_write(trans, &inode_iter, new_inode); +err: + bch2_trans_iter_exit(trans, &inode_iter); + bch2_trans_iter_exit(trans, &dir_iter); + return ret; +} + +int bch2_link_trans(struct btree_trans *trans, + subvol_inum dir, struct bch_inode_unpacked *dir_u, + subvol_inum inum, struct bch_inode_unpacked *inode_u, + const struct qstr *name) +{ + struct bch_fs *c = trans->c; + struct btree_iter dir_iter = { NULL }; + struct btree_iter inode_iter = { NULL }; + struct bch_hash_info dir_hash; + u64 now = bch2_current_time(c); + u64 dir_offset = 0; + int ret; + + if (dir.subvol != inum.subvol) + return -EXDEV; + + ret = bch2_inode_peek(trans, &inode_iter, inode_u, inum, BTREE_ITER_INTENT); + if (ret) + goto err; + + inode_u->bi_ctime = now; + ret = bch2_inode_nlink_inc(inode_u); + if (ret) + return ret; + + ret = bch2_inode_peek(trans, &dir_iter, dir_u, dir, BTREE_ITER_INTENT); + if (ret) + goto err; + + if (bch2_reinherit_attrs(inode_u, dir_u)) { + ret = -EXDEV; + goto err; + } + + dir_u->bi_mtime = dir_u->bi_ctime = now; + + dir_hash = bch2_hash_info_init(c, dir_u); + + ret = bch2_dirent_create(trans, dir, &dir_hash, + mode_to_type(inode_u->bi_mode), + name, inum.inum, &dir_offset, + BCH_HASH_SET_MUST_CREATE); + if (ret) + goto err; + + if (c->sb.version >= bcachefs_metadata_version_inode_backpointers) { + inode_u->bi_dir = dir.inum; + inode_u->bi_dir_offset = dir_offset; + } + + ret = bch2_inode_write(trans, &dir_iter, dir_u) ?: + bch2_inode_write(trans, &inode_iter, inode_u); +err: + bch2_trans_iter_exit(trans, &dir_iter); + bch2_trans_iter_exit(trans, &inode_iter); + return ret; +} + +int bch2_unlink_trans(struct btree_trans *trans, + subvol_inum dir, + struct bch_inode_unpacked *dir_u, + struct bch_inode_unpacked *inode_u, + const struct qstr *name, + bool deleting_snapshot) +{ + struct bch_fs *c = trans->c; + struct btree_iter dir_iter = { NULL }; + struct btree_iter dirent_iter = { NULL }; + struct btree_iter inode_iter = { NULL }; + struct bch_hash_info dir_hash; + subvol_inum inum; + u64 now = bch2_current_time(c); + struct bkey_s_c k; + int ret; + + ret = bch2_inode_peek(trans, &dir_iter, dir_u, dir, BTREE_ITER_INTENT); + if (ret) + goto err; + + dir_hash = bch2_hash_info_init(c, dir_u); + + ret = __bch2_dirent_lookup_trans(trans, &dirent_iter, dir, &dir_hash, + name, &inum, BTREE_ITER_INTENT); + if (ret) + goto err; + + ret = bch2_inode_peek(trans, &inode_iter, inode_u, inum, + BTREE_ITER_INTENT); + if (ret) + goto err; + + if (!deleting_snapshot && S_ISDIR(inode_u->bi_mode)) { + ret = bch2_empty_dir_trans(trans, inum); + if (ret) + goto err; + } + + if (deleting_snapshot && !inode_u->bi_subvol) { + ret = -BCH_ERR_ENOENT_not_subvol; + goto err; + } + + if (deleting_snapshot || inode_u->bi_subvol) { + ret = bch2_subvolume_unlink(trans, inode_u->bi_subvol); + if (ret) + goto err; + + k = bch2_btree_iter_peek_slot(&dirent_iter); + ret = bkey_err(k); + if (ret) + goto err; + + /* + * If we're deleting a subvolume, we need to really delete the + * dirent, not just emit a whiteout in the current snapshot: + */ + bch2_btree_iter_set_snapshot(&dirent_iter, k.k->p.snapshot); + ret = bch2_btree_iter_traverse(&dirent_iter); + if (ret) + goto err; + } else { + bch2_inode_nlink_dec(trans, inode_u); + } + + if (inode_u->bi_dir == dirent_iter.pos.inode && + inode_u->bi_dir_offset == dirent_iter.pos.offset) { + inode_u->bi_dir = 0; + inode_u->bi_dir_offset = 0; + } + + dir_u->bi_mtime = dir_u->bi_ctime = inode_u->bi_ctime = now; + dir_u->bi_nlink -= is_subdir_for_nlink(inode_u); + + ret = bch2_hash_delete_at(trans, bch2_dirent_hash_desc, + &dir_hash, &dirent_iter, + BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?: + bch2_inode_write(trans, &dir_iter, dir_u) ?: + bch2_inode_write(trans, &inode_iter, inode_u); +err: + bch2_trans_iter_exit(trans, &inode_iter); + bch2_trans_iter_exit(trans, &dirent_iter); + bch2_trans_iter_exit(trans, &dir_iter); + return ret; +} + +bool bch2_reinherit_attrs(struct bch_inode_unpacked *dst_u, + struct bch_inode_unpacked *src_u) +{ + u64 src, dst; + unsigned id; + bool ret = false; + + for (id = 0; id < Inode_opt_nr; id++) { + /* Skip attributes that were explicitly set on this inode */ + if (dst_u->bi_fields_set & (1 << id)) + continue; + + src = bch2_inode_opt_get(src_u, id); + dst = bch2_inode_opt_get(dst_u, id); + + if (src == dst) + continue; + + bch2_inode_opt_set(dst_u, id, src); + ret = true; + } + + return ret; +} + +int bch2_rename_trans(struct btree_trans *trans, + subvol_inum src_dir, struct bch_inode_unpacked *src_dir_u, + subvol_inum dst_dir, struct bch_inode_unpacked *dst_dir_u, + struct bch_inode_unpacked *src_inode_u, + struct bch_inode_unpacked *dst_inode_u, + const struct qstr *src_name, + const struct qstr *dst_name, + enum bch_rename_mode mode) +{ + struct bch_fs *c = trans->c; + struct btree_iter src_dir_iter = { NULL }; + struct btree_iter dst_dir_iter = { NULL }; + struct btree_iter src_inode_iter = { NULL }; + struct btree_iter dst_inode_iter = { NULL }; + struct bch_hash_info src_hash, dst_hash; + subvol_inum src_inum, dst_inum; + u64 src_offset, dst_offset; + u64 now = bch2_current_time(c); + int ret; + + ret = bch2_inode_peek(trans, &src_dir_iter, src_dir_u, src_dir, + BTREE_ITER_INTENT); + if (ret) + goto err; + + src_hash = bch2_hash_info_init(c, src_dir_u); + + if (dst_dir.inum != src_dir.inum || + dst_dir.subvol != src_dir.subvol) { + ret = bch2_inode_peek(trans, &dst_dir_iter, dst_dir_u, dst_dir, + BTREE_ITER_INTENT); + if (ret) + goto err; + + dst_hash = bch2_hash_info_init(c, dst_dir_u); + } else { + dst_dir_u = src_dir_u; + dst_hash = src_hash; + } + + ret = bch2_dirent_rename(trans, + src_dir, &src_hash, + dst_dir, &dst_hash, + src_name, &src_inum, &src_offset, + dst_name, &dst_inum, &dst_offset, + mode); + if (ret) + goto err; + + ret = bch2_inode_peek(trans, &src_inode_iter, src_inode_u, src_inum, + BTREE_ITER_INTENT); + if (ret) + goto err; + + if (dst_inum.inum) { + ret = bch2_inode_peek(trans, &dst_inode_iter, dst_inode_u, dst_inum, + BTREE_ITER_INTENT); + if (ret) + goto err; + } + + if (c->sb.version >= bcachefs_metadata_version_inode_backpointers) { + src_inode_u->bi_dir = dst_dir_u->bi_inum; + src_inode_u->bi_dir_offset = dst_offset; + + if (mode == BCH_RENAME_EXCHANGE) { + dst_inode_u->bi_dir = src_dir_u->bi_inum; + dst_inode_u->bi_dir_offset = src_offset; + } + + if (mode == BCH_RENAME_OVERWRITE && + dst_inode_u->bi_dir == dst_dir_u->bi_inum && + dst_inode_u->bi_dir_offset == src_offset) { + dst_inode_u->bi_dir = 0; + dst_inode_u->bi_dir_offset = 0; + } + } + + if (mode == BCH_RENAME_OVERWRITE) { + if (S_ISDIR(src_inode_u->bi_mode) != + S_ISDIR(dst_inode_u->bi_mode)) { + ret = -ENOTDIR; + goto err; + } + + if (S_ISDIR(dst_inode_u->bi_mode) && + bch2_empty_dir_trans(trans, dst_inum)) { + ret = -ENOTEMPTY; + goto err; + } + } + + if (bch2_reinherit_attrs(src_inode_u, dst_dir_u) && + S_ISDIR(src_inode_u->bi_mode)) { + ret = -EXDEV; + goto err; + } + + if (mode == BCH_RENAME_EXCHANGE && + bch2_reinherit_attrs(dst_inode_u, src_dir_u) && + S_ISDIR(dst_inode_u->bi_mode)) { + ret = -EXDEV; + goto err; + } + + if (is_subdir_for_nlink(src_inode_u)) { + src_dir_u->bi_nlink--; + dst_dir_u->bi_nlink++; + } + + if (dst_inum.inum && is_subdir_for_nlink(dst_inode_u)) { + dst_dir_u->bi_nlink--; + src_dir_u->bi_nlink += mode == BCH_RENAME_EXCHANGE; + } + + if (mode == BCH_RENAME_OVERWRITE) + bch2_inode_nlink_dec(trans, dst_inode_u); + + src_dir_u->bi_mtime = now; + src_dir_u->bi_ctime = now; + + if (src_dir.inum != dst_dir.inum) { + dst_dir_u->bi_mtime = now; + dst_dir_u->bi_ctime = now; + } + + src_inode_u->bi_ctime = now; + + if (dst_inum.inum) + dst_inode_u->bi_ctime = now; + + ret = bch2_inode_write(trans, &src_dir_iter, src_dir_u) ?: + (src_dir.inum != dst_dir.inum + ? bch2_inode_write(trans, &dst_dir_iter, dst_dir_u) + : 0) ?: + bch2_inode_write(trans, &src_inode_iter, src_inode_u) ?: + (dst_inum.inum + ? bch2_inode_write(trans, &dst_inode_iter, dst_inode_u) + : 0); +err: + bch2_trans_iter_exit(trans, &dst_inode_iter); + bch2_trans_iter_exit(trans, &src_inode_iter); + bch2_trans_iter_exit(trans, &dst_dir_iter); + bch2_trans_iter_exit(trans, &src_dir_iter); + return ret; +} diff --git a/fs/bcachefs/fs-common.h b/fs/bcachefs/fs-common.h new file mode 100644 index 000000000000..dde237859514 --- /dev/null +++ b/fs/bcachefs/fs-common.h @@ -0,0 +1,43 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_FS_COMMON_H +#define _BCACHEFS_FS_COMMON_H + +struct posix_acl; + +#define BCH_CREATE_TMPFILE (1U << 0) +#define BCH_CREATE_SUBVOL (1U << 1) +#define BCH_CREATE_SNAPSHOT (1U << 2) +#define BCH_CREATE_SNAPSHOT_RO (1U << 3) + +int bch2_create_trans(struct btree_trans *, subvol_inum, + struct bch_inode_unpacked *, + struct bch_inode_unpacked *, + const struct qstr *, + uid_t, gid_t, umode_t, dev_t, + struct posix_acl *, + struct posix_acl *, + subvol_inum, unsigned); + +int bch2_link_trans(struct btree_trans *, + subvol_inum, struct bch_inode_unpacked *, + subvol_inum, struct bch_inode_unpacked *, + const struct qstr *); + +int bch2_unlink_trans(struct btree_trans *, subvol_inum, + struct bch_inode_unpacked *, + struct bch_inode_unpacked *, + const struct qstr *, bool); + +int bch2_rename_trans(struct btree_trans *, + subvol_inum, struct bch_inode_unpacked *, + subvol_inum, struct bch_inode_unpacked *, + struct bch_inode_unpacked *, + struct bch_inode_unpacked *, + const struct qstr *, + const struct qstr *, + enum bch_rename_mode); + +bool bch2_reinherit_attrs(struct bch_inode_unpacked *, + struct bch_inode_unpacked *); + +#endif /* _BCACHEFS_FS_COMMON_H */ diff --git a/fs/bcachefs/fs-io-buffered.c b/fs/bcachefs/fs-io-buffered.c new file mode 100644 index 000000000000..52f0e7acda3d --- /dev/null +++ b/fs/bcachefs/fs-io-buffered.c @@ -0,0 +1,1106 @@ +// SPDX-License-Identifier: GPL-2.0 +#ifndef NO_BCACHEFS_FS + +#include "bcachefs.h" +#include "alloc_foreground.h" +#include "bkey_buf.h" +#include "fs-io.h" +#include "fs-io-buffered.h" +#include "fs-io-direct.h" +#include "fs-io-pagecache.h" +#include "io_read.h" +#include "io_write.h" + +#include <linux/backing-dev.h> +#include <linux/pagemap.h> +#include <linux/writeback.h> + +static inline bool bio_full(struct bio *bio, unsigned len) +{ + if (bio->bi_vcnt >= bio->bi_max_vecs) + return true; + if (bio->bi_iter.bi_size > UINT_MAX - len) + return true; + return false; +} + +/* readpage(s): */ + +static void bch2_readpages_end_io(struct bio *bio) +{ + struct folio_iter fi; + + bio_for_each_folio_all(fi, bio) { + if (!bio->bi_status) { + folio_mark_uptodate(fi.folio); + } else { + folio_clear_uptodate(fi.folio); + folio_set_error(fi.folio); + } + folio_unlock(fi.folio); + } + + bio_put(bio); +} + +struct readpages_iter { + struct address_space *mapping; + unsigned idx; + folios folios; +}; + +static int readpages_iter_init(struct readpages_iter *iter, + struct readahead_control *ractl) +{ + struct folio **fi; + int ret; + + memset(iter, 0, sizeof(*iter)); + + iter->mapping = ractl->mapping; + + ret = bch2_filemap_get_contig_folios_d(iter->mapping, + ractl->_index << PAGE_SHIFT, + (ractl->_index + ractl->_nr_pages) << PAGE_SHIFT, + 0, mapping_gfp_mask(iter->mapping), + &iter->folios); + if (ret) + return ret; + + darray_for_each(iter->folios, fi) { + ractl->_nr_pages -= 1U << folio_order(*fi); + __bch2_folio_create(*fi, __GFP_NOFAIL|GFP_KERNEL); + folio_put(*fi); + folio_put(*fi); + } + + return 0; +} + +static inline struct folio *readpage_iter_peek(struct readpages_iter *iter) +{ + if (iter->idx >= iter->folios.nr) + return NULL; + return iter->folios.data[iter->idx]; +} + +static inline void readpage_iter_advance(struct readpages_iter *iter) +{ + iter->idx++; +} + +static bool extent_partial_reads_expensive(struct bkey_s_c k) +{ + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + struct bch_extent_crc_unpacked crc; + const union bch_extent_entry *i; + + bkey_for_each_crc(k.k, ptrs, crc, i) + if (crc.csum_type || crc.compression_type) + return true; + return false; +} + +static int readpage_bio_extend(struct btree_trans *trans, + struct readpages_iter *iter, + struct bio *bio, + unsigned sectors_this_extent, + bool get_more) +{ + /* Don't hold btree locks while allocating memory: */ + bch2_trans_unlock(trans); + + while (bio_sectors(bio) < sectors_this_extent && + bio->bi_vcnt < bio->bi_max_vecs) { + struct folio *folio = readpage_iter_peek(iter); + int ret; + + if (folio) { + readpage_iter_advance(iter); + } else { + pgoff_t folio_offset = bio_end_sector(bio) >> PAGE_SECTORS_SHIFT; + + if (!get_more) + break; + + folio = xa_load(&iter->mapping->i_pages, folio_offset); + if (folio && !xa_is_value(folio)) + break; + + folio = filemap_alloc_folio(readahead_gfp_mask(iter->mapping), 0); + if (!folio) + break; + + if (!__bch2_folio_create(folio, GFP_KERNEL)) { + folio_put(folio); + break; + } + + ret = filemap_add_folio(iter->mapping, folio, folio_offset, GFP_KERNEL); + if (ret) { + __bch2_folio_release(folio); + folio_put(folio); + break; + } + + folio_put(folio); + } + + BUG_ON(folio_sector(folio) != bio_end_sector(bio)); + + BUG_ON(!bio_add_folio(bio, folio, folio_size(folio), 0)); + } + + return bch2_trans_relock(trans); +} + +static void bchfs_read(struct btree_trans *trans, + struct bch_read_bio *rbio, + subvol_inum inum, + struct readpages_iter *readpages_iter) +{ + struct bch_fs *c = trans->c; + struct btree_iter iter; + struct bkey_buf sk; + int flags = BCH_READ_RETRY_IF_STALE| + BCH_READ_MAY_PROMOTE; + u32 snapshot; + int ret = 0; + + rbio->c = c; + rbio->start_time = local_clock(); + rbio->subvol = inum.subvol; + + bch2_bkey_buf_init(&sk); +retry: + bch2_trans_begin(trans); + iter = (struct btree_iter) { NULL }; + + ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot); + if (ret) + goto err; + + bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, + SPOS(inum.inum, rbio->bio.bi_iter.bi_sector, snapshot), + BTREE_ITER_SLOTS); + while (1) { + struct bkey_s_c k; + unsigned bytes, sectors, offset_into_extent; + enum btree_id data_btree = BTREE_ID_extents; + + /* + * read_extent -> io_time_reset may cause a transaction restart + * without returning an error, we need to check for that here: + */ + ret = bch2_trans_relock(trans); + if (ret) + break; + + bch2_btree_iter_set_pos(&iter, + POS(inum.inum, rbio->bio.bi_iter.bi_sector)); + + k = bch2_btree_iter_peek_slot(&iter); + ret = bkey_err(k); + if (ret) + break; + + offset_into_extent = iter.pos.offset - + bkey_start_offset(k.k); + sectors = k.k->size - offset_into_extent; + + bch2_bkey_buf_reassemble(&sk, c, k); + + ret = bch2_read_indirect_extent(trans, &data_btree, + &offset_into_extent, &sk); + if (ret) + break; + + k = bkey_i_to_s_c(sk.k); + + sectors = min(sectors, k.k->size - offset_into_extent); + + if (readpages_iter) { + ret = readpage_bio_extend(trans, readpages_iter, &rbio->bio, sectors, + extent_partial_reads_expensive(k)); + if (ret) + break; + } + + bytes = min(sectors, bio_sectors(&rbio->bio)) << 9; + swap(rbio->bio.bi_iter.bi_size, bytes); + + if (rbio->bio.bi_iter.bi_size == bytes) + flags |= BCH_READ_LAST_FRAGMENT; + + bch2_bio_page_state_set(&rbio->bio, k); + + bch2_read_extent(trans, rbio, iter.pos, + data_btree, k, offset_into_extent, flags); + + if (flags & BCH_READ_LAST_FRAGMENT) + break; + + swap(rbio->bio.bi_iter.bi_size, bytes); + bio_advance(&rbio->bio, bytes); + + ret = btree_trans_too_many_iters(trans); + if (ret) + break; + } +err: + bch2_trans_iter_exit(trans, &iter); + + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + goto retry; + + if (ret) { + bch_err_inum_offset_ratelimited(c, + iter.pos.inode, + iter.pos.offset << 9, + "read error %i from btree lookup", ret); + rbio->bio.bi_status = BLK_STS_IOERR; + bio_endio(&rbio->bio); + } + + bch2_bkey_buf_exit(&sk, c); +} + +void bch2_readahead(struct readahead_control *ractl) +{ + struct bch_inode_info *inode = to_bch_ei(ractl->mapping->host); + struct bch_fs *c = inode->v.i_sb->s_fs_info; + struct bch_io_opts opts; + struct btree_trans *trans = bch2_trans_get(c); + struct folio *folio; + struct readpages_iter readpages_iter; + int ret; + + bch2_inode_opts_get(&opts, c, &inode->ei_inode); + + ret = readpages_iter_init(&readpages_iter, ractl); + BUG_ON(ret); + + bch2_pagecache_add_get(inode); + + while ((folio = readpage_iter_peek(&readpages_iter))) { + unsigned n = min_t(unsigned, + readpages_iter.folios.nr - + readpages_iter.idx, + BIO_MAX_VECS); + struct bch_read_bio *rbio = + rbio_init(bio_alloc_bioset(NULL, n, REQ_OP_READ, + GFP_KERNEL, &c->bio_read), + opts); + + readpage_iter_advance(&readpages_iter); + + rbio->bio.bi_iter.bi_sector = folio_sector(folio); + rbio->bio.bi_end_io = bch2_readpages_end_io; + BUG_ON(!bio_add_folio(&rbio->bio, folio, folio_size(folio), 0)); + + bchfs_read(trans, rbio, inode_inum(inode), + &readpages_iter); + bch2_trans_unlock(trans); + } + + bch2_pagecache_add_put(inode); + + bch2_trans_put(trans); + darray_exit(&readpages_iter.folios); +} + +static void __bchfs_readfolio(struct bch_fs *c, struct bch_read_bio *rbio, + subvol_inum inum, struct folio *folio) +{ + bch2_folio_create(folio, __GFP_NOFAIL); + + rbio->bio.bi_opf = REQ_OP_READ|REQ_SYNC; + rbio->bio.bi_iter.bi_sector = folio_sector(folio); + BUG_ON(!bio_add_folio(&rbio->bio, folio, folio_size(folio), 0)); + + bch2_trans_run(c, (bchfs_read(trans, rbio, inum, NULL), 0)); +} + +static void bch2_read_single_folio_end_io(struct bio *bio) +{ + complete(bio->bi_private); +} + +int bch2_read_single_folio(struct folio *folio, struct address_space *mapping) +{ + struct bch_inode_info *inode = to_bch_ei(mapping->host); + struct bch_fs *c = inode->v.i_sb->s_fs_info; + struct bch_read_bio *rbio; + struct bch_io_opts opts; + int ret; + DECLARE_COMPLETION_ONSTACK(done); + + bch2_inode_opts_get(&opts, c, &inode->ei_inode); + + rbio = rbio_init(bio_alloc_bioset(NULL, 1, REQ_OP_READ, GFP_KERNEL, &c->bio_read), + opts); + rbio->bio.bi_private = &done; + rbio->bio.bi_end_io = bch2_read_single_folio_end_io; + + __bchfs_readfolio(c, rbio, inode_inum(inode), folio); + wait_for_completion(&done); + + ret = blk_status_to_errno(rbio->bio.bi_status); + bio_put(&rbio->bio); + + if (ret < 0) + return ret; + + folio_mark_uptodate(folio); + return 0; +} + +int bch2_read_folio(struct file *file, struct folio *folio) +{ + int ret; + + ret = bch2_read_single_folio(folio, folio->mapping); + folio_unlock(folio); + return bch2_err_class(ret); +} + +/* writepages: */ + +struct bch_writepage_io { + struct bch_inode_info *inode; + + /* must be last: */ + struct bch_write_op op; +}; + +struct bch_writepage_state { + struct bch_writepage_io *io; + struct bch_io_opts opts; + struct bch_folio_sector *tmp; + unsigned tmp_sectors; +}; + +static inline struct bch_writepage_state bch_writepage_state_init(struct bch_fs *c, + struct bch_inode_info *inode) +{ + struct bch_writepage_state ret = { 0 }; + + bch2_inode_opts_get(&ret.opts, c, &inode->ei_inode); + return ret; +} + +/* + * Determine when a writepage io is full. We have to limit writepage bios to a + * single page per bvec (i.e. 1MB with 4k pages) because that is the limit to + * what the bounce path in bch2_write_extent() can handle. In theory we could + * loosen this restriction for non-bounce I/O, but we don't have that context + * here. Ideally, we can up this limit and make it configurable in the future + * when the bounce path can be enhanced to accommodate larger source bios. + */ +static inline bool bch_io_full(struct bch_writepage_io *io, unsigned len) +{ + struct bio *bio = &io->op.wbio.bio; + return bio_full(bio, len) || + (bio->bi_iter.bi_size + len > BIO_MAX_VECS * PAGE_SIZE); +} + +static void bch2_writepage_io_done(struct bch_write_op *op) +{ + struct bch_writepage_io *io = + container_of(op, struct bch_writepage_io, op); + struct bch_fs *c = io->op.c; + struct bio *bio = &io->op.wbio.bio; + struct folio_iter fi; + unsigned i; + + if (io->op.error) { + set_bit(EI_INODE_ERROR, &io->inode->ei_flags); + + bio_for_each_folio_all(fi, bio) { + struct bch_folio *s; + + folio_set_error(fi.folio); + mapping_set_error(fi.folio->mapping, -EIO); + + s = __bch2_folio(fi.folio); + spin_lock(&s->lock); + for (i = 0; i < folio_sectors(fi.folio); i++) + s->s[i].nr_replicas = 0; + spin_unlock(&s->lock); + } + } + + if (io->op.flags & BCH_WRITE_WROTE_DATA_INLINE) { + bio_for_each_folio_all(fi, bio) { + struct bch_folio *s; + + s = __bch2_folio(fi.folio); + spin_lock(&s->lock); + for (i = 0; i < folio_sectors(fi.folio); i++) + s->s[i].nr_replicas = 0; + spin_unlock(&s->lock); + } + } + + /* + * racing with fallocate can cause us to add fewer sectors than + * expected - but we shouldn't add more sectors than expected: + */ + WARN_ON_ONCE(io->op.i_sectors_delta > 0); + + /* + * (error (due to going RO) halfway through a page can screw that up + * slightly) + * XXX wtf? + BUG_ON(io->op.op.i_sectors_delta >= PAGE_SECTORS); + */ + + /* + * PageWriteback is effectively our ref on the inode - fixup i_blocks + * before calling end_page_writeback: + */ + bch2_i_sectors_acct(c, io->inode, NULL, io->op.i_sectors_delta); + + bio_for_each_folio_all(fi, bio) { + struct bch_folio *s = __bch2_folio(fi.folio); + + if (atomic_dec_and_test(&s->write_count)) + folio_end_writeback(fi.folio); + } + + bio_put(&io->op.wbio.bio); +} + +static void bch2_writepage_do_io(struct bch_writepage_state *w) +{ + struct bch_writepage_io *io = w->io; + + w->io = NULL; + closure_call(&io->op.cl, bch2_write, NULL, NULL); +} + +/* + * Get a bch_writepage_io and add @page to it - appending to an existing one if + * possible, else allocating a new one: + */ +static void bch2_writepage_io_alloc(struct bch_fs *c, + struct writeback_control *wbc, + struct bch_writepage_state *w, + struct bch_inode_info *inode, + u64 sector, + unsigned nr_replicas) +{ + struct bch_write_op *op; + + w->io = container_of(bio_alloc_bioset(NULL, BIO_MAX_VECS, + REQ_OP_WRITE, + GFP_KERNEL, + &c->writepage_bioset), + struct bch_writepage_io, op.wbio.bio); + + w->io->inode = inode; + op = &w->io->op; + bch2_write_op_init(op, c, w->opts); + op->target = w->opts.foreground_target; + op->nr_replicas = nr_replicas; + op->res.nr_replicas = nr_replicas; + op->write_point = writepoint_hashed(inode->ei_last_dirtied); + op->subvol = inode->ei_subvol; + op->pos = POS(inode->v.i_ino, sector); + op->end_io = bch2_writepage_io_done; + op->devs_need_flush = &inode->ei_devs_need_flush; + op->wbio.bio.bi_iter.bi_sector = sector; + op->wbio.bio.bi_opf = wbc_to_write_flags(wbc); +} + +static int __bch2_writepage(struct folio *folio, + struct writeback_control *wbc, + void *data) +{ + struct bch_inode_info *inode = to_bch_ei(folio->mapping->host); + struct bch_fs *c = inode->v.i_sb->s_fs_info; + struct bch_writepage_state *w = data; + struct bch_folio *s; + unsigned i, offset, f_sectors, nr_replicas_this_write = U32_MAX; + loff_t i_size = i_size_read(&inode->v); + int ret; + + EBUG_ON(!folio_test_uptodate(folio)); + + /* Is the folio fully inside i_size? */ + if (folio_end_pos(folio) <= i_size) + goto do_io; + + /* Is the folio fully outside i_size? (truncate in progress) */ + if (folio_pos(folio) >= i_size) { + folio_unlock(folio); + return 0; + } + + /* + * The folio straddles i_size. It must be zeroed out on each and every + * writepage invocation because it may be mmapped. "A file is mapped + * in multiples of the folio size. For a file that is not a multiple of + * the folio size, the remaining memory is zeroed when mapped, and + * writes to that region are not written out to the file." + */ + folio_zero_segment(folio, + i_size - folio_pos(folio), + folio_size(folio)); +do_io: + f_sectors = folio_sectors(folio); + s = bch2_folio(folio); + + if (f_sectors > w->tmp_sectors) { + kfree(w->tmp); + w->tmp = kcalloc(f_sectors, sizeof(struct bch_folio_sector), __GFP_NOFAIL); + w->tmp_sectors = f_sectors; + } + + /* + * Things get really hairy with errors during writeback: + */ + ret = bch2_get_folio_disk_reservation(c, inode, folio, false); + BUG_ON(ret); + + /* Before unlocking the page, get copy of reservations: */ + spin_lock(&s->lock); + memcpy(w->tmp, s->s, sizeof(struct bch_folio_sector) * f_sectors); + + for (i = 0; i < f_sectors; i++) { + if (s->s[i].state < SECTOR_dirty) + continue; + + nr_replicas_this_write = + min_t(unsigned, nr_replicas_this_write, + s->s[i].nr_replicas + + s->s[i].replicas_reserved); + } + + for (i = 0; i < f_sectors; i++) { + if (s->s[i].state < SECTOR_dirty) + continue; + + s->s[i].nr_replicas = w->opts.compression + ? 0 : nr_replicas_this_write; + + s->s[i].replicas_reserved = 0; + bch2_folio_sector_set(folio, s, i, SECTOR_allocated); + } + spin_unlock(&s->lock); + + BUG_ON(atomic_read(&s->write_count)); + atomic_set(&s->write_count, 1); + + BUG_ON(folio_test_writeback(folio)); + folio_start_writeback(folio); + + folio_unlock(folio); + + offset = 0; + while (1) { + unsigned sectors = 0, dirty_sectors = 0, reserved_sectors = 0; + u64 sector; + + while (offset < f_sectors && + w->tmp[offset].state < SECTOR_dirty) + offset++; + + if (offset == f_sectors) + break; + + while (offset + sectors < f_sectors && + w->tmp[offset + sectors].state >= SECTOR_dirty) { + reserved_sectors += w->tmp[offset + sectors].replicas_reserved; + dirty_sectors += w->tmp[offset + sectors].state == SECTOR_dirty; + sectors++; + } + BUG_ON(!sectors); + + sector = folio_sector(folio) + offset; + + if (w->io && + (w->io->op.res.nr_replicas != nr_replicas_this_write || + bch_io_full(w->io, sectors << 9) || + bio_end_sector(&w->io->op.wbio.bio) != sector)) + bch2_writepage_do_io(w); + + if (!w->io) + bch2_writepage_io_alloc(c, wbc, w, inode, sector, + nr_replicas_this_write); + + atomic_inc(&s->write_count); + + BUG_ON(inode != w->io->inode); + BUG_ON(!bio_add_folio(&w->io->op.wbio.bio, folio, + sectors << 9, offset << 9)); + + /* Check for writing past i_size: */ + WARN_ONCE((bio_end_sector(&w->io->op.wbio.bio) << 9) > + round_up(i_size, block_bytes(c)) && + !test_bit(BCH_FS_EMERGENCY_RO, &c->flags), + "writing past i_size: %llu > %llu (unrounded %llu)\n", + bio_end_sector(&w->io->op.wbio.bio) << 9, + round_up(i_size, block_bytes(c)), + i_size); + + w->io->op.res.sectors += reserved_sectors; + w->io->op.i_sectors_delta -= dirty_sectors; + w->io->op.new_i_size = i_size; + + offset += sectors; + } + + if (atomic_dec_and_test(&s->write_count)) + folio_end_writeback(folio); + + return 0; +} + +int bch2_writepages(struct address_space *mapping, struct writeback_control *wbc) +{ + struct bch_fs *c = mapping->host->i_sb->s_fs_info; + struct bch_writepage_state w = + bch_writepage_state_init(c, to_bch_ei(mapping->host)); + struct blk_plug plug; + int ret; + + blk_start_plug(&plug); + ret = write_cache_pages(mapping, wbc, __bch2_writepage, &w); + if (w.io) + bch2_writepage_do_io(&w); + blk_finish_plug(&plug); + kfree(w.tmp); + return bch2_err_class(ret); +} + +/* buffered writes: */ + +int bch2_write_begin(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, + struct page **pagep, void **fsdata) +{ + struct bch_inode_info *inode = to_bch_ei(mapping->host); + struct bch_fs *c = inode->v.i_sb->s_fs_info; + struct bch2_folio_reservation *res; + struct folio *folio; + unsigned offset; + int ret = -ENOMEM; + + res = kmalloc(sizeof(*res), GFP_KERNEL); + if (!res) + return -ENOMEM; + + bch2_folio_reservation_init(c, inode, res); + *fsdata = res; + + bch2_pagecache_add_get(inode); + + folio = __filemap_get_folio(mapping, pos >> PAGE_SHIFT, + FGP_LOCK|FGP_WRITE|FGP_CREAT|FGP_STABLE, + mapping_gfp_mask(mapping)); + if (IS_ERR_OR_NULL(folio)) + goto err_unlock; + + offset = pos - folio_pos(folio); + len = min_t(size_t, len, folio_end_pos(folio) - pos); + + if (folio_test_uptodate(folio)) + goto out; + + /* If we're writing entire folio, don't need to read it in first: */ + if (!offset && len == folio_size(folio)) + goto out; + + if (!offset && pos + len >= inode->v.i_size) { + folio_zero_segment(folio, len, folio_size(folio)); + flush_dcache_folio(folio); + goto out; + } + + if (folio_pos(folio) >= inode->v.i_size) { + folio_zero_segments(folio, 0, offset, offset + len, folio_size(folio)); + flush_dcache_folio(folio); + goto out; + } +readpage: + ret = bch2_read_single_folio(folio, mapping); + if (ret) + goto err; +out: + ret = bch2_folio_set(c, inode_inum(inode), &folio, 1); + if (ret) + goto err; + + ret = bch2_folio_reservation_get(c, inode, folio, res, offset, len); + if (ret) { + if (!folio_test_uptodate(folio)) { + /* + * If the folio hasn't been read in, we won't know if we + * actually need a reservation - we don't actually need + * to read here, we just need to check if the folio is + * fully backed by uncompressed data: + */ + goto readpage; + } + + goto err; + } + + *pagep = &folio->page; + return 0; +err: + folio_unlock(folio); + folio_put(folio); + *pagep = NULL; +err_unlock: + bch2_pagecache_add_put(inode); + kfree(res); + *fsdata = NULL; + return bch2_err_class(ret); +} + +int bch2_write_end(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *page, void *fsdata) +{ + struct bch_inode_info *inode = to_bch_ei(mapping->host); + struct bch_fs *c = inode->v.i_sb->s_fs_info; + struct bch2_folio_reservation *res = fsdata; + struct folio *folio = page_folio(page); + unsigned offset = pos - folio_pos(folio); + + lockdep_assert_held(&inode->v.i_rwsem); + BUG_ON(offset + copied > folio_size(folio)); + + if (unlikely(copied < len && !folio_test_uptodate(folio))) { + /* + * The folio needs to be read in, but that would destroy + * our partial write - simplest thing is to just force + * userspace to redo the write: + */ + folio_zero_range(folio, 0, folio_size(folio)); + flush_dcache_folio(folio); + copied = 0; + } + + spin_lock(&inode->v.i_lock); + if (pos + copied > inode->v.i_size) + i_size_write(&inode->v, pos + copied); + spin_unlock(&inode->v.i_lock); + + if (copied) { + if (!folio_test_uptodate(folio)) + folio_mark_uptodate(folio); + + bch2_set_folio_dirty(c, inode, folio, res, offset, copied); + + inode->ei_last_dirtied = (unsigned long) current; + } + + folio_unlock(folio); + folio_put(folio); + bch2_pagecache_add_put(inode); + + bch2_folio_reservation_put(c, inode, res); + kfree(res); + + return copied; +} + +static noinline void folios_trunc(folios *fs, struct folio **fi) +{ + while (fs->data + fs->nr > fi) { + struct folio *f = darray_pop(fs); + + folio_unlock(f); + folio_put(f); + } +} + +static int __bch2_buffered_write(struct bch_inode_info *inode, + struct address_space *mapping, + struct iov_iter *iter, + loff_t pos, unsigned len) +{ + struct bch_fs *c = inode->v.i_sb->s_fs_info; + struct bch2_folio_reservation res; + folios fs; + struct folio **fi, *f; + unsigned copied = 0, f_offset, f_copied; + u64 end = pos + len, f_pos, f_len; + loff_t last_folio_pos = inode->v.i_size; + int ret = 0; + + BUG_ON(!len); + + bch2_folio_reservation_init(c, inode, &res); + darray_init(&fs); + + ret = bch2_filemap_get_contig_folios_d(mapping, pos, end, + FGP_LOCK|FGP_WRITE|FGP_STABLE|FGP_CREAT, + mapping_gfp_mask(mapping), + &fs); + if (ret) + goto out; + + BUG_ON(!fs.nr); + + f = darray_first(fs); + if (pos != folio_pos(f) && !folio_test_uptodate(f)) { + ret = bch2_read_single_folio(f, mapping); + if (ret) + goto out; + } + + f = darray_last(fs); + end = min(end, folio_end_pos(f)); + last_folio_pos = folio_pos(f); + if (end != folio_end_pos(f) && !folio_test_uptodate(f)) { + if (end >= inode->v.i_size) { + folio_zero_range(f, 0, folio_size(f)); + } else { + ret = bch2_read_single_folio(f, mapping); + if (ret) + goto out; + } + } + + ret = bch2_folio_set(c, inode_inum(inode), fs.data, fs.nr); + if (ret) + goto out; + + f_pos = pos; + f_offset = pos - folio_pos(darray_first(fs)); + darray_for_each(fs, fi) { + f = *fi; + f_len = min(end, folio_end_pos(f)) - f_pos; + + /* + * XXX: per POSIX and fstests generic/275, on -ENOSPC we're + * supposed to write as much as we have disk space for. + * + * On failure here we should still write out a partial page if + * we aren't completely out of disk space - we don't do that + * yet: + */ + ret = bch2_folio_reservation_get(c, inode, f, &res, f_offset, f_len); + if (unlikely(ret)) { + folios_trunc(&fs, fi); + if (!fs.nr) + goto out; + + end = min(end, folio_end_pos(darray_last(fs))); + break; + } + + f_pos = folio_end_pos(f); + f_offset = 0; + } + + if (mapping_writably_mapped(mapping)) + darray_for_each(fs, fi) + flush_dcache_folio(*fi); + + f_pos = pos; + f_offset = pos - folio_pos(darray_first(fs)); + darray_for_each(fs, fi) { + f = *fi; + f_len = min(end, folio_end_pos(f)) - f_pos; + f_copied = copy_page_from_iter_atomic(&f->page, f_offset, f_len, iter); + if (!f_copied) { + folios_trunc(&fs, fi); + break; + } + + if (!folio_test_uptodate(f) && + f_copied != folio_size(f) && + pos + copied + f_copied < inode->v.i_size) { + iov_iter_revert(iter, f_copied); + folio_zero_range(f, 0, folio_size(f)); + folios_trunc(&fs, fi); + break; + } + + flush_dcache_folio(f); + copied += f_copied; + + if (f_copied != f_len) { + folios_trunc(&fs, fi + 1); + break; + } + + f_pos = folio_end_pos(f); + f_offset = 0; + } + + if (!copied) + goto out; + + end = pos + copied; + + spin_lock(&inode->v.i_lock); + if (end > inode->v.i_size) + i_size_write(&inode->v, end); + spin_unlock(&inode->v.i_lock); + + f_pos = pos; + f_offset = pos - folio_pos(darray_first(fs)); + darray_for_each(fs, fi) { + f = *fi; + f_len = min(end, folio_end_pos(f)) - f_pos; + + if (!folio_test_uptodate(f)) + folio_mark_uptodate(f); + + bch2_set_folio_dirty(c, inode, f, &res, f_offset, f_len); + + f_pos = folio_end_pos(f); + f_offset = 0; + } + + inode->ei_last_dirtied = (unsigned long) current; +out: + darray_for_each(fs, fi) { + folio_unlock(*fi); + folio_put(*fi); + } + + /* + * If the last folio added to the mapping starts beyond current EOF, we + * performed a short write but left around at least one post-EOF folio. + * Clean up the mapping before we return. + */ + if (last_folio_pos >= inode->v.i_size) + truncate_pagecache(&inode->v, inode->v.i_size); + + darray_exit(&fs); + bch2_folio_reservation_put(c, inode, &res); + + return copied ?: ret; +} + +static ssize_t bch2_buffered_write(struct kiocb *iocb, struct iov_iter *iter) +{ + struct file *file = iocb->ki_filp; + struct address_space *mapping = file->f_mapping; + struct bch_inode_info *inode = file_bch_inode(file); + loff_t pos = iocb->ki_pos; + ssize_t written = 0; + int ret = 0; + + bch2_pagecache_add_get(inode); + + do { + unsigned offset = pos & (PAGE_SIZE - 1); + unsigned bytes = iov_iter_count(iter); +again: + /* + * Bring in the user page that we will copy from _first_. + * Otherwise there's a nasty deadlock on copying from the + * same page as we're writing to, without it being marked + * up-to-date. + * + * Not only is this an optimisation, but it is also required + * to check that the address is actually valid, when atomic + * usercopies are used, below. + */ + if (unlikely(fault_in_iov_iter_readable(iter, bytes))) { + bytes = min_t(unsigned long, iov_iter_count(iter), + PAGE_SIZE - offset); + + if (unlikely(fault_in_iov_iter_readable(iter, bytes))) { + ret = -EFAULT; + break; + } + } + + if (unlikely(fatal_signal_pending(current))) { + ret = -EINTR; + break; + } + + ret = __bch2_buffered_write(inode, mapping, iter, pos, bytes); + if (unlikely(ret < 0)) + break; + + cond_resched(); + + if (unlikely(ret == 0)) { + /* + * If we were unable to copy any data at all, we must + * fall back to a single segment length write. + * + * If we didn't fallback here, we could livelock + * because not all segments in the iov can be copied at + * once without a pagefault. + */ + bytes = min_t(unsigned long, PAGE_SIZE - offset, + iov_iter_single_seg_count(iter)); + goto again; + } + pos += ret; + written += ret; + ret = 0; + + balance_dirty_pages_ratelimited(mapping); + } while (iov_iter_count(iter)); + + bch2_pagecache_add_put(inode); + + return written ? written : ret; +} + +ssize_t bch2_write_iter(struct kiocb *iocb, struct iov_iter *from) +{ + struct file *file = iocb->ki_filp; + struct bch_inode_info *inode = file_bch_inode(file); + ssize_t ret; + + if (iocb->ki_flags & IOCB_DIRECT) { + ret = bch2_direct_write(iocb, from); + goto out; + } + + inode_lock(&inode->v); + + ret = generic_write_checks(iocb, from); + if (ret <= 0) + goto unlock; + + ret = file_remove_privs(file); + if (ret) + goto unlock; + + ret = file_update_time(file); + if (ret) + goto unlock; + + ret = bch2_buffered_write(iocb, from); + if (likely(ret > 0)) + iocb->ki_pos += ret; +unlock: + inode_unlock(&inode->v); + + if (ret > 0) + ret = generic_write_sync(iocb, ret); +out: + return bch2_err_class(ret); +} + +void bch2_fs_fs_io_buffered_exit(struct bch_fs *c) +{ + bioset_exit(&c->writepage_bioset); +} + +int bch2_fs_fs_io_buffered_init(struct bch_fs *c) +{ + if (bioset_init(&c->writepage_bioset, + 4, offsetof(struct bch_writepage_io, op.wbio.bio), + BIOSET_NEED_BVECS)) + return -BCH_ERR_ENOMEM_writepage_bioset_init; + + return 0; +} + +#endif /* NO_BCACHEFS_FS */ diff --git a/fs/bcachefs/fs-io-buffered.h b/fs/bcachefs/fs-io-buffered.h new file mode 100644 index 000000000000..a6126ff790e6 --- /dev/null +++ b/fs/bcachefs/fs-io-buffered.h @@ -0,0 +1,27 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_FS_IO_BUFFERED_H +#define _BCACHEFS_FS_IO_BUFFERED_H + +#ifndef NO_BCACHEFS_FS + +int bch2_read_single_folio(struct folio *, struct address_space *); +int bch2_read_folio(struct file *, struct folio *); + +int bch2_writepages(struct address_space *, struct writeback_control *); +void bch2_readahead(struct readahead_control *); + +int bch2_write_begin(struct file *, struct address_space *, loff_t, + unsigned, struct page **, void **); +int bch2_write_end(struct file *, struct address_space *, loff_t, + unsigned, unsigned, struct page *, void *); + +ssize_t bch2_write_iter(struct kiocb *, struct iov_iter *); + +void bch2_fs_fs_io_buffered_exit(struct bch_fs *); +int bch2_fs_fs_io_buffered_init(struct bch_fs *); +#else +static inline void bch2_fs_fs_io_buffered_exit(struct bch_fs *c) {} +static inline int bch2_fs_fs_io_buffered_init(struct bch_fs *c) { return 0; } +#endif + +#endif /* _BCACHEFS_FS_IO_BUFFERED_H */ diff --git a/fs/bcachefs/fs-io-direct.c b/fs/bcachefs/fs-io-direct.c new file mode 100644 index 000000000000..9a479e4de6b3 --- /dev/null +++ b/fs/bcachefs/fs-io-direct.c @@ -0,0 +1,680 @@ +// SPDX-License-Identifier: GPL-2.0 +#ifndef NO_BCACHEFS_FS + +#include "bcachefs.h" +#include "alloc_foreground.h" +#include "fs.h" +#include "fs-io.h" +#include "fs-io-direct.h" +#include "fs-io-pagecache.h" +#include "io_read.h" +#include "io_write.h" + +#include <linux/kthread.h> +#include <linux/pagemap.h> +#include <linux/prefetch.h> +#include <linux/task_io_accounting_ops.h> + +/* O_DIRECT reads */ + +struct dio_read { + struct closure cl; + struct kiocb *req; + long ret; + bool should_dirty; + struct bch_read_bio rbio; +}; + +static void bio_check_or_release(struct bio *bio, bool check_dirty) +{ + if (check_dirty) { + bio_check_pages_dirty(bio); + } else { + bio_release_pages(bio, false); + bio_put(bio); + } +} + +static CLOSURE_CALLBACK(bch2_dio_read_complete) +{ + closure_type(dio, struct dio_read, cl); + + dio->req->ki_complete(dio->req, dio->ret); + bio_check_or_release(&dio->rbio.bio, dio->should_dirty); +} + +static void bch2_direct_IO_read_endio(struct bio *bio) +{ + struct dio_read *dio = bio->bi_private; + + if (bio->bi_status) + dio->ret = blk_status_to_errno(bio->bi_status); + + closure_put(&dio->cl); +} + +static void bch2_direct_IO_read_split_endio(struct bio *bio) +{ + struct dio_read *dio = bio->bi_private; + bool should_dirty = dio->should_dirty; + + bch2_direct_IO_read_endio(bio); + bio_check_or_release(bio, should_dirty); +} + +static int bch2_direct_IO_read(struct kiocb *req, struct iov_iter *iter) +{ + struct file *file = req->ki_filp; + struct bch_inode_info *inode = file_bch_inode(file); + struct bch_fs *c = inode->v.i_sb->s_fs_info; + struct bch_io_opts opts; + struct dio_read *dio; + struct bio *bio; + loff_t offset = req->ki_pos; + bool sync = is_sync_kiocb(req); + size_t shorten; + ssize_t ret; + + bch2_inode_opts_get(&opts, c, &inode->ei_inode); + + if ((offset|iter->count) & (block_bytes(c) - 1)) + return -EINVAL; + + ret = min_t(loff_t, iter->count, + max_t(loff_t, 0, i_size_read(&inode->v) - offset)); + + if (!ret) + return ret; + + shorten = iov_iter_count(iter) - round_up(ret, block_bytes(c)); + iter->count -= shorten; + + bio = bio_alloc_bioset(NULL, + bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS), + REQ_OP_READ, + GFP_KERNEL, + &c->dio_read_bioset); + + bio->bi_end_io = bch2_direct_IO_read_endio; + + dio = container_of(bio, struct dio_read, rbio.bio); + closure_init(&dio->cl, NULL); + + /* + * this is a _really_ horrible hack just to avoid an atomic sub at the + * end: + */ + if (!sync) { + set_closure_fn(&dio->cl, bch2_dio_read_complete, NULL); + atomic_set(&dio->cl.remaining, + CLOSURE_REMAINING_INITIALIZER - + CLOSURE_RUNNING + + CLOSURE_DESTRUCTOR); + } else { + atomic_set(&dio->cl.remaining, + CLOSURE_REMAINING_INITIALIZER + 1); + dio->cl.closure_get_happened = true; + } + + dio->req = req; + dio->ret = ret; + /* + * This is one of the sketchier things I've encountered: we have to skip + * the dirtying of requests that are internal from the kernel (i.e. from + * loopback), because we'll deadlock on page_lock. + */ + dio->should_dirty = iter_is_iovec(iter); + + goto start; + while (iter->count) { + bio = bio_alloc_bioset(NULL, + bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS), + REQ_OP_READ, + GFP_KERNEL, + &c->bio_read); + bio->bi_end_io = bch2_direct_IO_read_split_endio; +start: + bio->bi_opf = REQ_OP_READ|REQ_SYNC; + bio->bi_iter.bi_sector = offset >> 9; + bio->bi_private = dio; + + ret = bio_iov_iter_get_pages(bio, iter); + if (ret < 0) { + /* XXX: fault inject this path */ + bio->bi_status = BLK_STS_RESOURCE; + bio_endio(bio); + break; + } + + offset += bio->bi_iter.bi_size; + + if (dio->should_dirty) + bio_set_pages_dirty(bio); + + if (iter->count) + closure_get(&dio->cl); + + bch2_read(c, rbio_init(bio, opts), inode_inum(inode)); + } + + iter->count += shorten; + + if (sync) { + closure_sync(&dio->cl); + closure_debug_destroy(&dio->cl); + ret = dio->ret; + bio_check_or_release(&dio->rbio.bio, dio->should_dirty); + return ret; + } else { + return -EIOCBQUEUED; + } +} + +ssize_t bch2_read_iter(struct kiocb *iocb, struct iov_iter *iter) +{ + struct file *file = iocb->ki_filp; + struct bch_inode_info *inode = file_bch_inode(file); + struct address_space *mapping = file->f_mapping; + size_t count = iov_iter_count(iter); + ssize_t ret; + + if (!count) + return 0; /* skip atime */ + + if (iocb->ki_flags & IOCB_DIRECT) { + struct blk_plug plug; + + if (unlikely(mapping->nrpages)) { + ret = filemap_write_and_wait_range(mapping, + iocb->ki_pos, + iocb->ki_pos + count - 1); + if (ret < 0) + goto out; + } + + file_accessed(file); + + blk_start_plug(&plug); + ret = bch2_direct_IO_read(iocb, iter); + blk_finish_plug(&plug); + + if (ret >= 0) + iocb->ki_pos += ret; + } else { + bch2_pagecache_add_get(inode); + ret = generic_file_read_iter(iocb, iter); + bch2_pagecache_add_put(inode); + } +out: + return bch2_err_class(ret); +} + +/* O_DIRECT writes */ + +struct dio_write { + struct kiocb *req; + struct address_space *mapping; + struct bch_inode_info *inode; + struct mm_struct *mm; + unsigned loop:1, + extending:1, + sync:1, + flush:1, + free_iov:1; + struct quota_res quota_res; + u64 written; + + struct iov_iter iter; + struct iovec inline_vecs[2]; + + /* must be last: */ + struct bch_write_op op; +}; + +static bool bch2_check_range_allocated(struct bch_fs *c, subvol_inum inum, + u64 offset, u64 size, + unsigned nr_replicas, bool compressed) +{ + struct btree_trans *trans = bch2_trans_get(c); + struct btree_iter iter; + struct bkey_s_c k; + u64 end = offset + size; + u32 snapshot; + bool ret = true; + int err; +retry: + bch2_trans_begin(trans); + + err = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot); + if (err) + goto err; + + for_each_btree_key_norestart(trans, iter, BTREE_ID_extents, + SPOS(inum.inum, offset, snapshot), + BTREE_ITER_SLOTS, k, err) { + if (bkey_ge(bkey_start_pos(k.k), POS(inum.inum, end))) + break; + + if (k.k->p.snapshot != snapshot || + nr_replicas > bch2_bkey_replicas(c, k) || + (!compressed && bch2_bkey_sectors_compressed(k))) { + ret = false; + break; + } + } + + offset = iter.pos.offset; + bch2_trans_iter_exit(trans, &iter); +err: + if (bch2_err_matches(err, BCH_ERR_transaction_restart)) + goto retry; + bch2_trans_put(trans); + + return err ? false : ret; +} + +static noinline bool bch2_dio_write_check_allocated(struct dio_write *dio) +{ + struct bch_fs *c = dio->op.c; + struct bch_inode_info *inode = dio->inode; + struct bio *bio = &dio->op.wbio.bio; + + return bch2_check_range_allocated(c, inode_inum(inode), + dio->op.pos.offset, bio_sectors(bio), + dio->op.opts.data_replicas, + dio->op.opts.compression != 0); +} + +static void bch2_dio_write_loop_async(struct bch_write_op *); +static __always_inline long bch2_dio_write_done(struct dio_write *dio); + +/* + * We're going to return -EIOCBQUEUED, but we haven't finished consuming the + * iov_iter yet, so we need to stash a copy of the iovec: it might be on the + * caller's stack, we're not guaranteed that it will live for the duration of + * the IO: + */ +static noinline int bch2_dio_write_copy_iov(struct dio_write *dio) +{ + struct iovec *iov = dio->inline_vecs; + + /* + * iov_iter has a single embedded iovec - nothing to do: + */ + if (iter_is_ubuf(&dio->iter)) + return 0; + + /* + * We don't currently handle non-iovec iov_iters here - return an error, + * and we'll fall back to doing the IO synchronously: + */ + if (!iter_is_iovec(&dio->iter)) + return -1; + + if (dio->iter.nr_segs > ARRAY_SIZE(dio->inline_vecs)) { + iov = kmalloc_array(dio->iter.nr_segs, sizeof(*iov), + GFP_KERNEL); + if (unlikely(!iov)) + return -ENOMEM; + + dio->free_iov = true; + } + + memcpy(iov, dio->iter.__iov, dio->iter.nr_segs * sizeof(*iov)); + dio->iter.__iov = iov; + return 0; +} + +static CLOSURE_CALLBACK(bch2_dio_write_flush_done) +{ + closure_type(dio, struct dio_write, op.cl); + struct bch_fs *c = dio->op.c; + + closure_debug_destroy(cl); + + dio->op.error = bch2_journal_error(&c->journal); + + bch2_dio_write_done(dio); +} + +static noinline void bch2_dio_write_flush(struct dio_write *dio) +{ + struct bch_fs *c = dio->op.c; + struct bch_inode_unpacked inode; + int ret; + + dio->flush = 0; + + closure_init(&dio->op.cl, NULL); + + if (!dio->op.error) { + ret = bch2_inode_find_by_inum(c, inode_inum(dio->inode), &inode); + if (ret) { + dio->op.error = ret; + } else { + bch2_journal_flush_seq_async(&c->journal, inode.bi_journal_seq, + &dio->op.cl); + bch2_inode_flush_nocow_writes_async(c, dio->inode, &dio->op.cl); + } + } + + if (dio->sync) { + closure_sync(&dio->op.cl); + closure_debug_destroy(&dio->op.cl); + } else { + continue_at(&dio->op.cl, bch2_dio_write_flush_done, NULL); + } +} + +static __always_inline long bch2_dio_write_done(struct dio_write *dio) +{ + struct kiocb *req = dio->req; + struct bch_inode_info *inode = dio->inode; + bool sync = dio->sync; + long ret; + + if (unlikely(dio->flush)) { + bch2_dio_write_flush(dio); + if (!sync) + return -EIOCBQUEUED; + } + + bch2_pagecache_block_put(inode); + + if (dio->free_iov) + kfree(dio->iter.__iov); + + ret = dio->op.error ?: ((long) dio->written << 9); + bio_put(&dio->op.wbio.bio); + + /* inode->i_dio_count is our ref on inode and thus bch_fs */ + inode_dio_end(&inode->v); + + if (ret < 0) + ret = bch2_err_class(ret); + + if (!sync) { + req->ki_complete(req, ret); + ret = -EIOCBQUEUED; + } + return ret; +} + +static __always_inline void bch2_dio_write_end(struct dio_write *dio) +{ + struct bch_fs *c = dio->op.c; + struct kiocb *req = dio->req; + struct bch_inode_info *inode = dio->inode; + struct bio *bio = &dio->op.wbio.bio; + + req->ki_pos += (u64) dio->op.written << 9; + dio->written += dio->op.written; + + if (dio->extending) { + spin_lock(&inode->v.i_lock); + if (req->ki_pos > inode->v.i_size) + i_size_write(&inode->v, req->ki_pos); + spin_unlock(&inode->v.i_lock); + } + + if (dio->op.i_sectors_delta || dio->quota_res.sectors) { + mutex_lock(&inode->ei_quota_lock); + __bch2_i_sectors_acct(c, inode, &dio->quota_res, dio->op.i_sectors_delta); + __bch2_quota_reservation_put(c, inode, &dio->quota_res); + mutex_unlock(&inode->ei_quota_lock); + } + + bio_release_pages(bio, false); + + if (unlikely(dio->op.error)) + set_bit(EI_INODE_ERROR, &inode->ei_flags); +} + +static __always_inline long bch2_dio_write_loop(struct dio_write *dio) +{ + struct bch_fs *c = dio->op.c; + struct kiocb *req = dio->req; + struct address_space *mapping = dio->mapping; + struct bch_inode_info *inode = dio->inode; + struct bch_io_opts opts; + struct bio *bio = &dio->op.wbio.bio; + unsigned unaligned, iter_count; + bool sync = dio->sync, dropped_locks; + long ret; + + bch2_inode_opts_get(&opts, c, &inode->ei_inode); + + while (1) { + iter_count = dio->iter.count; + + EBUG_ON(current->faults_disabled_mapping); + current->faults_disabled_mapping = mapping; + + ret = bio_iov_iter_get_pages(bio, &dio->iter); + + dropped_locks = fdm_dropped_locks(); + + current->faults_disabled_mapping = NULL; + + /* + * If the fault handler returned an error but also signalled + * that it dropped & retook ei_pagecache_lock, we just need to + * re-shoot down the page cache and retry: + */ + if (dropped_locks && ret) + ret = 0; + + if (unlikely(ret < 0)) + goto err; + + if (unlikely(dropped_locks)) { + ret = bch2_write_invalidate_inode_pages_range(mapping, + req->ki_pos, + req->ki_pos + iter_count - 1); + if (unlikely(ret)) + goto err; + + if (!bio->bi_iter.bi_size) + continue; + } + + unaligned = bio->bi_iter.bi_size & (block_bytes(c) - 1); + bio->bi_iter.bi_size -= unaligned; + iov_iter_revert(&dio->iter, unaligned); + + if (!bio->bi_iter.bi_size) { + /* + * bio_iov_iter_get_pages was only able to get < + * blocksize worth of pages: + */ + ret = -EFAULT; + goto err; + } + + bch2_write_op_init(&dio->op, c, opts); + dio->op.end_io = sync + ? NULL + : bch2_dio_write_loop_async; + dio->op.target = dio->op.opts.foreground_target; + dio->op.write_point = writepoint_hashed((unsigned long) current); + dio->op.nr_replicas = dio->op.opts.data_replicas; + dio->op.subvol = inode->ei_subvol; + dio->op.pos = POS(inode->v.i_ino, (u64) req->ki_pos >> 9); + dio->op.devs_need_flush = &inode->ei_devs_need_flush; + + if (sync) + dio->op.flags |= BCH_WRITE_SYNC; + dio->op.flags |= BCH_WRITE_CHECK_ENOSPC; + + ret = bch2_quota_reservation_add(c, inode, &dio->quota_res, + bio_sectors(bio), true); + if (unlikely(ret)) + goto err; + + ret = bch2_disk_reservation_get(c, &dio->op.res, bio_sectors(bio), + dio->op.opts.data_replicas, 0); + if (unlikely(ret) && + !bch2_dio_write_check_allocated(dio)) + goto err; + + task_io_account_write(bio->bi_iter.bi_size); + + if (unlikely(dio->iter.count) && + !dio->sync && + !dio->loop && + bch2_dio_write_copy_iov(dio)) + dio->sync = sync = true; + + dio->loop = true; + closure_call(&dio->op.cl, bch2_write, NULL, NULL); + + if (!sync) + return -EIOCBQUEUED; + + bch2_dio_write_end(dio); + + if (likely(!dio->iter.count) || dio->op.error) + break; + + bio_reset(bio, NULL, REQ_OP_WRITE); + } +out: + return bch2_dio_write_done(dio); +err: + dio->op.error = ret; + + bio_release_pages(bio, false); + + bch2_quota_reservation_put(c, inode, &dio->quota_res); + goto out; +} + +static noinline __cold void bch2_dio_write_continue(struct dio_write *dio) +{ + struct mm_struct *mm = dio->mm; + + bio_reset(&dio->op.wbio.bio, NULL, REQ_OP_WRITE); + + if (mm) + kthread_use_mm(mm); + bch2_dio_write_loop(dio); + if (mm) + kthread_unuse_mm(mm); +} + +static void bch2_dio_write_loop_async(struct bch_write_op *op) +{ + struct dio_write *dio = container_of(op, struct dio_write, op); + + bch2_dio_write_end(dio); + + if (likely(!dio->iter.count) || dio->op.error) + bch2_dio_write_done(dio); + else + bch2_dio_write_continue(dio); +} + +ssize_t bch2_direct_write(struct kiocb *req, struct iov_iter *iter) +{ + struct file *file = req->ki_filp; + struct address_space *mapping = file->f_mapping; + struct bch_inode_info *inode = file_bch_inode(file); + struct bch_fs *c = inode->v.i_sb->s_fs_info; + struct dio_write *dio; + struct bio *bio; + bool locked = true, extending; + ssize_t ret; + + prefetch(&c->opts); + prefetch((void *) &c->opts + 64); + prefetch(&inode->ei_inode); + prefetch((void *) &inode->ei_inode + 64); + + inode_lock(&inode->v); + + ret = generic_write_checks(req, iter); + if (unlikely(ret <= 0)) + goto err; + + ret = file_remove_privs(file); + if (unlikely(ret)) + goto err; + + ret = file_update_time(file); + if (unlikely(ret)) + goto err; + + if (unlikely((req->ki_pos|iter->count) & (block_bytes(c) - 1))) + goto err; + + inode_dio_begin(&inode->v); + bch2_pagecache_block_get(inode); + + extending = req->ki_pos + iter->count > inode->v.i_size; + if (!extending) { + inode_unlock(&inode->v); + locked = false; + } + + bio = bio_alloc_bioset(NULL, + bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS), + REQ_OP_WRITE, + GFP_KERNEL, + &c->dio_write_bioset); + dio = container_of(bio, struct dio_write, op.wbio.bio); + dio->req = req; + dio->mapping = mapping; + dio->inode = inode; + dio->mm = current->mm; + dio->loop = false; + dio->extending = extending; + dio->sync = is_sync_kiocb(req) || extending; + dio->flush = iocb_is_dsync(req) && !c->opts.journal_flush_disabled; + dio->free_iov = false; + dio->quota_res.sectors = 0; + dio->written = 0; + dio->iter = *iter; + dio->op.c = c; + + if (unlikely(mapping->nrpages)) { + ret = bch2_write_invalidate_inode_pages_range(mapping, + req->ki_pos, + req->ki_pos + iter->count - 1); + if (unlikely(ret)) + goto err_put_bio; + } + + ret = bch2_dio_write_loop(dio); +err: + if (locked) + inode_unlock(&inode->v); + return ret; +err_put_bio: + bch2_pagecache_block_put(inode); + bio_put(bio); + inode_dio_end(&inode->v); + goto err; +} + +void bch2_fs_fs_io_direct_exit(struct bch_fs *c) +{ + bioset_exit(&c->dio_write_bioset); + bioset_exit(&c->dio_read_bioset); +} + +int bch2_fs_fs_io_direct_init(struct bch_fs *c) +{ + if (bioset_init(&c->dio_read_bioset, + 4, offsetof(struct dio_read, rbio.bio), + BIOSET_NEED_BVECS)) + return -BCH_ERR_ENOMEM_dio_read_bioset_init; + + if (bioset_init(&c->dio_write_bioset, + 4, offsetof(struct dio_write, op.wbio.bio), + BIOSET_NEED_BVECS)) + return -BCH_ERR_ENOMEM_dio_write_bioset_init; + + return 0; +} + +#endif /* NO_BCACHEFS_FS */ diff --git a/fs/bcachefs/fs-io-direct.h b/fs/bcachefs/fs-io-direct.h new file mode 100644 index 000000000000..814621ec7f81 --- /dev/null +++ b/fs/bcachefs/fs-io-direct.h @@ -0,0 +1,16 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_FS_IO_DIRECT_H +#define _BCACHEFS_FS_IO_DIRECT_H + +#ifndef NO_BCACHEFS_FS +ssize_t bch2_direct_write(struct kiocb *, struct iov_iter *); +ssize_t bch2_read_iter(struct kiocb *, struct iov_iter *); + +void bch2_fs_fs_io_direct_exit(struct bch_fs *); +int bch2_fs_fs_io_direct_init(struct bch_fs *); +#else +static inline void bch2_fs_fs_io_direct_exit(struct bch_fs *c) {} +static inline int bch2_fs_fs_io_direct_init(struct bch_fs *c) { return 0; } +#endif + +#endif /* _BCACHEFS_FS_IO_DIRECT_H */ diff --git a/fs/bcachefs/fs-io-pagecache.c b/fs/bcachefs/fs-io-pagecache.c new file mode 100644 index 000000000000..ff664fd0d8ef --- /dev/null +++ b/fs/bcachefs/fs-io-pagecache.c @@ -0,0 +1,791 @@ +// SPDX-License-Identifier: GPL-2.0 +#ifndef NO_BCACHEFS_FS + +#include "bcachefs.h" +#include "btree_iter.h" +#include "extents.h" +#include "fs-io.h" +#include "fs-io-pagecache.h" +#include "subvolume.h" + +#include <linux/pagevec.h> +#include <linux/writeback.h> + +int bch2_filemap_get_contig_folios_d(struct address_space *mapping, + loff_t start, u64 end, + fgf_t fgp_flags, gfp_t gfp, + folios *fs) +{ + struct folio *f; + u64 pos = start; + int ret = 0; + + while (pos < end) { + if ((u64) pos >= (u64) start + (1ULL << 20)) + fgp_flags &= ~FGP_CREAT; + + ret = darray_make_room_gfp(fs, 1, gfp & GFP_KERNEL); + if (ret) + break; + + f = __filemap_get_folio(mapping, pos >> PAGE_SHIFT, fgp_flags, gfp); + if (IS_ERR_OR_NULL(f)) + break; + + BUG_ON(fs->nr && folio_pos(f) != pos); + + pos = folio_end_pos(f); + darray_push(fs, f); + } + + if (!fs->nr && !ret && (fgp_flags & FGP_CREAT)) + ret = -ENOMEM; + + return fs->nr ? 0 : ret; +} + +/* pagecache_block must be held */ +int bch2_write_invalidate_inode_pages_range(struct address_space *mapping, + loff_t start, loff_t end) +{ + int ret; + + /* + * XXX: the way this is currently implemented, we can spin if a process + * is continually redirtying a specific page + */ + do { + if (!mapping->nrpages) + return 0; + + ret = filemap_write_and_wait_range(mapping, start, end); + if (ret) + break; + + if (!mapping->nrpages) + return 0; + + ret = invalidate_inode_pages2_range(mapping, + start >> PAGE_SHIFT, + end >> PAGE_SHIFT); + } while (ret == -EBUSY); + + return ret; +} + +#if 0 +/* Useful for debug tracing: */ +static const char * const bch2_folio_sector_states[] = { +#define x(n) #n, + BCH_FOLIO_SECTOR_STATE() +#undef x + NULL +}; +#endif + +static inline enum bch_folio_sector_state +folio_sector_dirty(enum bch_folio_sector_state state) +{ + switch (state) { + case SECTOR_unallocated: + return SECTOR_dirty; + case SECTOR_reserved: + return SECTOR_dirty_reserved; + default: + return state; + } +} + +static inline enum bch_folio_sector_state +folio_sector_undirty(enum bch_folio_sector_state state) +{ + switch (state) { + case SECTOR_dirty: + return SECTOR_unallocated; + case SECTOR_dirty_reserved: + return SECTOR_reserved; + default: + return state; + } +} + +static inline enum bch_folio_sector_state +folio_sector_reserve(enum bch_folio_sector_state state) +{ + switch (state) { + case SECTOR_unallocated: + return SECTOR_reserved; + case SECTOR_dirty: + return SECTOR_dirty_reserved; + default: + return state; + } +} + +/* for newly allocated folios: */ +struct bch_folio *__bch2_folio_create(struct folio *folio, gfp_t gfp) +{ + struct bch_folio *s; + + s = kzalloc(sizeof(*s) + + sizeof(struct bch_folio_sector) * + folio_sectors(folio), gfp); + if (!s) + return NULL; + + spin_lock_init(&s->lock); + folio_attach_private(folio, s); + return s; +} + +struct bch_folio *bch2_folio_create(struct folio *folio, gfp_t gfp) +{ + return bch2_folio(folio) ?: __bch2_folio_create(folio, gfp); +} + +static unsigned bkey_to_sector_state(struct bkey_s_c k) +{ + if (bkey_extent_is_reservation(k)) + return SECTOR_reserved; + if (bkey_extent_is_allocation(k.k)) + return SECTOR_allocated; + return SECTOR_unallocated; +} + +static void __bch2_folio_set(struct folio *folio, + unsigned pg_offset, unsigned pg_len, + unsigned nr_ptrs, unsigned state) +{ + struct bch_folio *s = bch2_folio(folio); + unsigned i, sectors = folio_sectors(folio); + + BUG_ON(pg_offset >= sectors); + BUG_ON(pg_offset + pg_len > sectors); + + spin_lock(&s->lock); + + for (i = pg_offset; i < pg_offset + pg_len; i++) { + s->s[i].nr_replicas = nr_ptrs; + bch2_folio_sector_set(folio, s, i, state); + } + + if (i == sectors) + s->uptodate = true; + + spin_unlock(&s->lock); +} + +/* + * Initialize bch_folio state (allocated/unallocated, nr_replicas) from the + * extents btree: + */ +int bch2_folio_set(struct bch_fs *c, subvol_inum inum, + struct folio **fs, unsigned nr_folios) +{ + struct btree_trans *trans; + struct btree_iter iter; + struct bkey_s_c k; + struct bch_folio *s; + u64 offset = folio_sector(fs[0]); + unsigned folio_idx; + u32 snapshot; + bool need_set = false; + int ret; + + for (folio_idx = 0; folio_idx < nr_folios; folio_idx++) { + s = bch2_folio_create(fs[folio_idx], GFP_KERNEL); + if (!s) + return -ENOMEM; + + need_set |= !s->uptodate; + } + + if (!need_set) + return 0; + + folio_idx = 0; + trans = bch2_trans_get(c); +retry: + bch2_trans_begin(trans); + + ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot); + if (ret) + goto err; + + for_each_btree_key_norestart(trans, iter, BTREE_ID_extents, + SPOS(inum.inum, offset, snapshot), + BTREE_ITER_SLOTS, k, ret) { + unsigned nr_ptrs = bch2_bkey_nr_ptrs_fully_allocated(k); + unsigned state = bkey_to_sector_state(k); + + while (folio_idx < nr_folios) { + struct folio *folio = fs[folio_idx]; + u64 folio_start = folio_sector(folio); + u64 folio_end = folio_end_sector(folio); + unsigned folio_offset = max(bkey_start_offset(k.k), folio_start) - + folio_start; + unsigned folio_len = min(k.k->p.offset, folio_end) - + folio_offset - folio_start; + + BUG_ON(k.k->p.offset < folio_start); + BUG_ON(bkey_start_offset(k.k) > folio_end); + + if (!bch2_folio(folio)->uptodate) + __bch2_folio_set(folio, folio_offset, folio_len, nr_ptrs, state); + + if (k.k->p.offset < folio_end) + break; + folio_idx++; + } + + if (folio_idx == nr_folios) + break; + } + + offset = iter.pos.offset; + bch2_trans_iter_exit(trans, &iter); +err: + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + goto retry; + bch2_trans_put(trans); + + return ret; +} + +void bch2_bio_page_state_set(struct bio *bio, struct bkey_s_c k) +{ + struct bvec_iter iter; + struct folio_vec fv; + unsigned nr_ptrs = k.k->type == KEY_TYPE_reflink_v + ? 0 : bch2_bkey_nr_ptrs_fully_allocated(k); + unsigned state = bkey_to_sector_state(k); + + bio_for_each_folio(fv, bio, iter) + __bch2_folio_set(fv.fv_folio, + fv.fv_offset >> 9, + fv.fv_len >> 9, + nr_ptrs, state); +} + +void bch2_mark_pagecache_unallocated(struct bch_inode_info *inode, + u64 start, u64 end) +{ + pgoff_t index = start >> PAGE_SECTORS_SHIFT; + pgoff_t end_index = (end - 1) >> PAGE_SECTORS_SHIFT; + struct folio_batch fbatch; + unsigned i, j; + + if (end <= start) + return; + + folio_batch_init(&fbatch); + + while (filemap_get_folios(inode->v.i_mapping, + &index, end_index, &fbatch)) { + for (i = 0; i < folio_batch_count(&fbatch); i++) { + struct folio *folio = fbatch.folios[i]; + u64 folio_start = folio_sector(folio); + u64 folio_end = folio_end_sector(folio); + unsigned folio_offset = max(start, folio_start) - folio_start; + unsigned folio_len = min(end, folio_end) - folio_offset - folio_start; + struct bch_folio *s; + + BUG_ON(end <= folio_start); + + folio_lock(folio); + s = bch2_folio(folio); + + if (s) { + spin_lock(&s->lock); + for (j = folio_offset; j < folio_offset + folio_len; j++) + s->s[j].nr_replicas = 0; + spin_unlock(&s->lock); + } + + folio_unlock(folio); + } + folio_batch_release(&fbatch); + cond_resched(); + } +} + +void bch2_mark_pagecache_reserved(struct bch_inode_info *inode, + u64 start, u64 end) +{ + struct bch_fs *c = inode->v.i_sb->s_fs_info; + pgoff_t index = start >> PAGE_SECTORS_SHIFT; + pgoff_t end_index = (end - 1) >> PAGE_SECTORS_SHIFT; + struct folio_batch fbatch; + s64 i_sectors_delta = 0; + unsigned i, j; + + if (end <= start) + return; + + folio_batch_init(&fbatch); + + while (filemap_get_folios(inode->v.i_mapping, + &index, end_index, &fbatch)) { + for (i = 0; i < folio_batch_count(&fbatch); i++) { + struct folio *folio = fbatch.folios[i]; + u64 folio_start = folio_sector(folio); + u64 folio_end = folio_end_sector(folio); + unsigned folio_offset = max(start, folio_start) - folio_start; + unsigned folio_len = min(end, folio_end) - folio_offset - folio_start; + struct bch_folio *s; + + BUG_ON(end <= folio_start); + + folio_lock(folio); + s = bch2_folio(folio); + + if (s) { + spin_lock(&s->lock); + for (j = folio_offset; j < folio_offset + folio_len; j++) { + i_sectors_delta -= s->s[j].state == SECTOR_dirty; + bch2_folio_sector_set(folio, s, j, + folio_sector_reserve(s->s[j].state)); + } + spin_unlock(&s->lock); + } + + folio_unlock(folio); + } + folio_batch_release(&fbatch); + cond_resched(); + } + + bch2_i_sectors_acct(c, inode, NULL, i_sectors_delta); +} + +static inline unsigned sectors_to_reserve(struct bch_folio_sector *s, + unsigned nr_replicas) +{ + return max(0, (int) nr_replicas - + s->nr_replicas - + s->replicas_reserved); +} + +int bch2_get_folio_disk_reservation(struct bch_fs *c, + struct bch_inode_info *inode, + struct folio *folio, bool check_enospc) +{ + struct bch_folio *s = bch2_folio_create(folio, 0); + unsigned nr_replicas = inode_nr_replicas(c, inode); + struct disk_reservation disk_res = { 0 }; + unsigned i, sectors = folio_sectors(folio), disk_res_sectors = 0; + int ret; + + if (!s) + return -ENOMEM; + + for (i = 0; i < sectors; i++) + disk_res_sectors += sectors_to_reserve(&s->s[i], nr_replicas); + + if (!disk_res_sectors) + return 0; + + ret = bch2_disk_reservation_get(c, &disk_res, + disk_res_sectors, 1, + !check_enospc + ? BCH_DISK_RESERVATION_NOFAIL + : 0); + if (unlikely(ret)) + return ret; + + for (i = 0; i < sectors; i++) + s->s[i].replicas_reserved += + sectors_to_reserve(&s->s[i], nr_replicas); + + return 0; +} + +void bch2_folio_reservation_put(struct bch_fs *c, + struct bch_inode_info *inode, + struct bch2_folio_reservation *res) +{ + bch2_disk_reservation_put(c, &res->disk); + bch2_quota_reservation_put(c, inode, &res->quota); +} + +int bch2_folio_reservation_get(struct bch_fs *c, + struct bch_inode_info *inode, + struct folio *folio, + struct bch2_folio_reservation *res, + unsigned offset, unsigned len) +{ + struct bch_folio *s = bch2_folio_create(folio, 0); + unsigned i, disk_sectors = 0, quota_sectors = 0; + int ret; + + if (!s) + return -ENOMEM; + + BUG_ON(!s->uptodate); + + for (i = round_down(offset, block_bytes(c)) >> 9; + i < round_up(offset + len, block_bytes(c)) >> 9; + i++) { + disk_sectors += sectors_to_reserve(&s->s[i], + res->disk.nr_replicas); + quota_sectors += s->s[i].state == SECTOR_unallocated; + } + + if (disk_sectors) { + ret = bch2_disk_reservation_add(c, &res->disk, disk_sectors, 0); + if (unlikely(ret)) + return ret; + } + + if (quota_sectors) { + ret = bch2_quota_reservation_add(c, inode, &res->quota, + quota_sectors, true); + if (unlikely(ret)) { + struct disk_reservation tmp = { + .sectors = disk_sectors + }; + + bch2_disk_reservation_put(c, &tmp); + res->disk.sectors -= disk_sectors; + return ret; + } + } + + return 0; +} + +static void bch2_clear_folio_bits(struct folio *folio) +{ + struct bch_inode_info *inode = to_bch_ei(folio->mapping->host); + struct bch_fs *c = inode->v.i_sb->s_fs_info; + struct bch_folio *s = bch2_folio(folio); + struct disk_reservation disk_res = { 0 }; + int i, sectors = folio_sectors(folio), dirty_sectors = 0; + + if (!s) + return; + + EBUG_ON(!folio_test_locked(folio)); + EBUG_ON(folio_test_writeback(folio)); + + for (i = 0; i < sectors; i++) { + disk_res.sectors += s->s[i].replicas_reserved; + s->s[i].replicas_reserved = 0; + + dirty_sectors -= s->s[i].state == SECTOR_dirty; + bch2_folio_sector_set(folio, s, i, folio_sector_undirty(s->s[i].state)); + } + + bch2_disk_reservation_put(c, &disk_res); + + bch2_i_sectors_acct(c, inode, NULL, dirty_sectors); + + bch2_folio_release(folio); +} + +void bch2_set_folio_dirty(struct bch_fs *c, + struct bch_inode_info *inode, + struct folio *folio, + struct bch2_folio_reservation *res, + unsigned offset, unsigned len) +{ + struct bch_folio *s = bch2_folio(folio); + unsigned i, dirty_sectors = 0; + + WARN_ON((u64) folio_pos(folio) + offset + len > + round_up((u64) i_size_read(&inode->v), block_bytes(c))); + + BUG_ON(!s->uptodate); + + spin_lock(&s->lock); + + for (i = round_down(offset, block_bytes(c)) >> 9; + i < round_up(offset + len, block_bytes(c)) >> 9; + i++) { + unsigned sectors = sectors_to_reserve(&s->s[i], + res->disk.nr_replicas); + + /* + * This can happen if we race with the error path in + * bch2_writepage_io_done(): + */ + sectors = min_t(unsigned, sectors, res->disk.sectors); + + s->s[i].replicas_reserved += sectors; + res->disk.sectors -= sectors; + + dirty_sectors += s->s[i].state == SECTOR_unallocated; + + bch2_folio_sector_set(folio, s, i, folio_sector_dirty(s->s[i].state)); + } + + spin_unlock(&s->lock); + + bch2_i_sectors_acct(c, inode, &res->quota, dirty_sectors); + + if (!folio_test_dirty(folio)) + filemap_dirty_folio(inode->v.i_mapping, folio); +} + +vm_fault_t bch2_page_fault(struct vm_fault *vmf) +{ + struct file *file = vmf->vma->vm_file; + struct address_space *mapping = file->f_mapping; + struct address_space *fdm = faults_disabled_mapping(); + struct bch_inode_info *inode = file_bch_inode(file); + vm_fault_t ret; + + if (fdm == mapping) + return VM_FAULT_SIGBUS; + + /* Lock ordering: */ + if (fdm > mapping) { + struct bch_inode_info *fdm_host = to_bch_ei(fdm->host); + + if (bch2_pagecache_add_tryget(inode)) + goto got_lock; + + bch2_pagecache_block_put(fdm_host); + + bch2_pagecache_add_get(inode); + bch2_pagecache_add_put(inode); + + bch2_pagecache_block_get(fdm_host); + + /* Signal that lock has been dropped: */ + set_fdm_dropped_locks(); + return VM_FAULT_SIGBUS; + } + + bch2_pagecache_add_get(inode); +got_lock: + ret = filemap_fault(vmf); + bch2_pagecache_add_put(inode); + + return ret; +} + +vm_fault_t bch2_page_mkwrite(struct vm_fault *vmf) +{ + struct folio *folio = page_folio(vmf->page); + struct file *file = vmf->vma->vm_file; + struct bch_inode_info *inode = file_bch_inode(file); + struct address_space *mapping = file->f_mapping; + struct bch_fs *c = inode->v.i_sb->s_fs_info; + struct bch2_folio_reservation res; + unsigned len; + loff_t isize; + vm_fault_t ret; + + bch2_folio_reservation_init(c, inode, &res); + + sb_start_pagefault(inode->v.i_sb); + file_update_time(file); + + /* + * Not strictly necessary, but helps avoid dio writes livelocking in + * bch2_write_invalidate_inode_pages_range() - can drop this if/when we get + * a bch2_write_invalidate_inode_pages_range() that works without dropping + * page lock before invalidating page + */ + bch2_pagecache_add_get(inode); + + folio_lock(folio); + isize = i_size_read(&inode->v); + + if (folio->mapping != mapping || folio_pos(folio) >= isize) { + folio_unlock(folio); + ret = VM_FAULT_NOPAGE; + goto out; + } + + len = min_t(loff_t, folio_size(folio), isize - folio_pos(folio)); + + if (bch2_folio_set(c, inode_inum(inode), &folio, 1) ?: + bch2_folio_reservation_get(c, inode, folio, &res, 0, len)) { + folio_unlock(folio); + ret = VM_FAULT_SIGBUS; + goto out; + } + + bch2_set_folio_dirty(c, inode, folio, &res, 0, len); + bch2_folio_reservation_put(c, inode, &res); + + folio_wait_stable(folio); + ret = VM_FAULT_LOCKED; +out: + bch2_pagecache_add_put(inode); + sb_end_pagefault(inode->v.i_sb); + + return ret; +} + +void bch2_invalidate_folio(struct folio *folio, size_t offset, size_t length) +{ + if (offset || length < folio_size(folio)) + return; + + bch2_clear_folio_bits(folio); +} + +bool bch2_release_folio(struct folio *folio, gfp_t gfp_mask) +{ + if (folio_test_dirty(folio) || folio_test_writeback(folio)) + return false; + + bch2_clear_folio_bits(folio); + return true; +} + +/* fseek: */ + +static int folio_data_offset(struct folio *folio, loff_t pos, + unsigned min_replicas) +{ + struct bch_folio *s = bch2_folio(folio); + unsigned i, sectors = folio_sectors(folio); + + if (s) + for (i = folio_pos_to_s(folio, pos); i < sectors; i++) + if (s->s[i].state >= SECTOR_dirty && + s->s[i].nr_replicas + s->s[i].replicas_reserved >= min_replicas) + return i << SECTOR_SHIFT; + + return -1; +} + +loff_t bch2_seek_pagecache_data(struct inode *vinode, + loff_t start_offset, + loff_t end_offset, + unsigned min_replicas, + bool nonblock) +{ + struct folio_batch fbatch; + pgoff_t start_index = start_offset >> PAGE_SHIFT; + pgoff_t end_index = end_offset >> PAGE_SHIFT; + pgoff_t index = start_index; + unsigned i; + loff_t ret; + int offset; + + folio_batch_init(&fbatch); + + while (filemap_get_folios(vinode->i_mapping, + &index, end_index, &fbatch)) { + for (i = 0; i < folio_batch_count(&fbatch); i++) { + struct folio *folio = fbatch.folios[i]; + + if (!nonblock) { + folio_lock(folio); + } else if (!folio_trylock(folio)) { + folio_batch_release(&fbatch); + return -EAGAIN; + } + + offset = folio_data_offset(folio, + max(folio_pos(folio), start_offset), + min_replicas); + if (offset >= 0) { + ret = clamp(folio_pos(folio) + offset, + start_offset, end_offset); + folio_unlock(folio); + folio_batch_release(&fbatch); + return ret; + } + folio_unlock(folio); + } + folio_batch_release(&fbatch); + cond_resched(); + } + + return end_offset; +} + +/* + * Search for a hole in a folio. + * + * The filemap layer returns -ENOENT if no folio exists, so reuse the same error + * code to indicate a pagecache hole exists at the returned offset. Otherwise + * return 0 if the folio is filled with data, or an error code. This function + * can return -EAGAIN if nonblock is specified. + */ +static int folio_hole_offset(struct address_space *mapping, loff_t *offset, + unsigned min_replicas, bool nonblock) +{ + struct folio *folio; + struct bch_folio *s; + unsigned i, sectors; + int ret = -ENOENT; + + folio = __filemap_get_folio(mapping, *offset >> PAGE_SHIFT, + FGP_LOCK|(nonblock ? FGP_NOWAIT : 0), 0); + if (IS_ERR(folio)) + return PTR_ERR(folio); + + s = bch2_folio(folio); + if (!s) + goto unlock; + + sectors = folio_sectors(folio); + for (i = folio_pos_to_s(folio, *offset); i < sectors; i++) + if (s->s[i].state < SECTOR_dirty || + s->s[i].nr_replicas + s->s[i].replicas_reserved < min_replicas) { + *offset = max(*offset, + folio_pos(folio) + (i << SECTOR_SHIFT)); + goto unlock; + } + + *offset = folio_end_pos(folio); + ret = 0; +unlock: + folio_unlock(folio); + folio_put(folio); + return ret; +} + +loff_t bch2_seek_pagecache_hole(struct inode *vinode, + loff_t start_offset, + loff_t end_offset, + unsigned min_replicas, + bool nonblock) +{ + struct address_space *mapping = vinode->i_mapping; + loff_t offset = start_offset; + loff_t ret = 0; + + while (!ret && offset < end_offset) + ret = folio_hole_offset(mapping, &offset, min_replicas, nonblock); + + if (ret && ret != -ENOENT) + return ret; + return min(offset, end_offset); +} + +int bch2_clamp_data_hole(struct inode *inode, + u64 *hole_start, + u64 *hole_end, + unsigned min_replicas, + bool nonblock) +{ + loff_t ret; + + ret = bch2_seek_pagecache_hole(inode, + *hole_start << 9, *hole_end << 9, min_replicas, nonblock) >> 9; + if (ret < 0) + return ret; + + *hole_start = ret; + + if (*hole_start == *hole_end) + return 0; + + ret = bch2_seek_pagecache_data(inode, + *hole_start << 9, *hole_end << 9, min_replicas, nonblock) >> 9; + if (ret < 0) + return ret; + + *hole_end = ret; + return 0; +} + +#endif /* NO_BCACHEFS_FS */ diff --git a/fs/bcachefs/fs-io-pagecache.h b/fs/bcachefs/fs-io-pagecache.h new file mode 100644 index 000000000000..27f712ae37a6 --- /dev/null +++ b/fs/bcachefs/fs-io-pagecache.h @@ -0,0 +1,176 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_FS_IO_PAGECACHE_H +#define _BCACHEFS_FS_IO_PAGECACHE_H + +#include <linux/pagemap.h> + +typedef DARRAY(struct folio *) folios; + +int bch2_filemap_get_contig_folios_d(struct address_space *, loff_t, + u64, fgf_t, gfp_t, folios *); +int bch2_write_invalidate_inode_pages_range(struct address_space *, loff_t, loff_t); + +/* + * Use u64 for the end pos and sector helpers because if the folio covers the + * max supported range of the mapping, the start offset of the next folio + * overflows loff_t. This breaks much of the range based processing in the + * buffered write path. + */ +static inline u64 folio_end_pos(struct folio *folio) +{ + return folio_pos(folio) + folio_size(folio); +} + +static inline size_t folio_sectors(struct folio *folio) +{ + return PAGE_SECTORS << folio_order(folio); +} + +static inline loff_t folio_sector(struct folio *folio) +{ + return folio_pos(folio) >> 9; +} + +static inline u64 folio_end_sector(struct folio *folio) +{ + return folio_end_pos(folio) >> 9; +} + +#define BCH_FOLIO_SECTOR_STATE() \ + x(unallocated) \ + x(reserved) \ + x(dirty) \ + x(dirty_reserved) \ + x(allocated) + +enum bch_folio_sector_state { +#define x(n) SECTOR_##n, + BCH_FOLIO_SECTOR_STATE() +#undef x +}; + +struct bch_folio_sector { + /* Uncompressed, fully allocated replicas (or on disk reservation): */ + unsigned nr_replicas:4; + + /* Owns PAGE_SECTORS * replicas_reserved sized in memory reservation: */ + unsigned replicas_reserved:4; + + /* i_sectors: */ + enum bch_folio_sector_state state:8; +}; + +struct bch_folio { + spinlock_t lock; + atomic_t write_count; + /* + * Is the sector state up to date with the btree? + * (Not the data itself) + */ + bool uptodate; + struct bch_folio_sector s[]; +}; + +/* Helper for when we need to add debug instrumentation: */ +static inline void bch2_folio_sector_set(struct folio *folio, + struct bch_folio *s, + unsigned i, unsigned n) +{ + s->s[i].state = n; +} + +/* file offset (to folio offset) to bch_folio_sector index */ +static inline int folio_pos_to_s(struct folio *folio, loff_t pos) +{ + u64 f_offset = pos - folio_pos(folio); + + BUG_ON(pos < folio_pos(folio) || pos >= folio_end_pos(folio)); + return f_offset >> SECTOR_SHIFT; +} + +/* for newly allocated folios: */ +static inline void __bch2_folio_release(struct folio *folio) +{ + kfree(folio_detach_private(folio)); +} + +static inline void bch2_folio_release(struct folio *folio) +{ + EBUG_ON(!folio_test_locked(folio)); + __bch2_folio_release(folio); +} + +static inline struct bch_folio *__bch2_folio(struct folio *folio) +{ + return folio_has_private(folio) + ? (struct bch_folio *) folio_get_private(folio) + : NULL; +} + +static inline struct bch_folio *bch2_folio(struct folio *folio) +{ + EBUG_ON(!folio_test_locked(folio)); + + return __bch2_folio(folio); +} + +struct bch_folio *__bch2_folio_create(struct folio *, gfp_t); +struct bch_folio *bch2_folio_create(struct folio *, gfp_t); + +struct bch2_folio_reservation { + struct disk_reservation disk; + struct quota_res quota; +}; + +static inline unsigned inode_nr_replicas(struct bch_fs *c, struct bch_inode_info *inode) +{ + /* XXX: this should not be open coded */ + return inode->ei_inode.bi_data_replicas + ? inode->ei_inode.bi_data_replicas - 1 + : c->opts.data_replicas; +} + +static inline void bch2_folio_reservation_init(struct bch_fs *c, + struct bch_inode_info *inode, + struct bch2_folio_reservation *res) +{ + memset(res, 0, sizeof(*res)); + + res->disk.nr_replicas = inode_nr_replicas(c, inode); +} + +int bch2_folio_set(struct bch_fs *, subvol_inum, struct folio **, unsigned); +void bch2_bio_page_state_set(struct bio *, struct bkey_s_c); + +void bch2_mark_pagecache_unallocated(struct bch_inode_info *, u64, u64); +void bch2_mark_pagecache_reserved(struct bch_inode_info *, u64, u64); + +int bch2_get_folio_disk_reservation(struct bch_fs *, + struct bch_inode_info *, + struct folio *, bool); + +void bch2_folio_reservation_put(struct bch_fs *, + struct bch_inode_info *, + struct bch2_folio_reservation *); +int bch2_folio_reservation_get(struct bch_fs *, + struct bch_inode_info *, + struct folio *, + struct bch2_folio_reservation *, + unsigned, unsigned); + +void bch2_set_folio_dirty(struct bch_fs *, + struct bch_inode_info *, + struct folio *, + struct bch2_folio_reservation *, + unsigned, unsigned); + +vm_fault_t bch2_page_fault(struct vm_fault *); +vm_fault_t bch2_page_mkwrite(struct vm_fault *); +void bch2_invalidate_folio(struct folio *, size_t, size_t); +bool bch2_release_folio(struct folio *, gfp_t); + +loff_t bch2_seek_pagecache_data(struct inode *, loff_t, loff_t, unsigned, bool); +loff_t bch2_seek_pagecache_hole(struct inode *, loff_t, loff_t, unsigned, bool); +int bch2_clamp_data_hole(struct inode *, u64 *, u64 *, unsigned, bool); + +#endif /* _BCACHEFS_FS_IO_PAGECACHE_H */ diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c new file mode 100644 index 000000000000..b0e8144ec550 --- /dev/null +++ b/fs/bcachefs/fs-io.c @@ -0,0 +1,1072 @@ +// SPDX-License-Identifier: GPL-2.0 +#ifndef NO_BCACHEFS_FS + +#include "bcachefs.h" +#include "alloc_foreground.h" +#include "bkey_buf.h" +#include "btree_update.h" +#include "buckets.h" +#include "clock.h" +#include "error.h" +#include "extents.h" +#include "extent_update.h" +#include "fs.h" +#include "fs-io.h" +#include "fs-io-buffered.h" +#include "fs-io-pagecache.h" +#include "fsck.h" +#include "inode.h" +#include "journal.h" +#include "io_misc.h" +#include "keylist.h" +#include "quota.h" +#include "reflink.h" +#include "trace.h" + +#include <linux/aio.h> +#include <linux/backing-dev.h> +#include <linux/falloc.h> +#include <linux/migrate.h> +#include <linux/mmu_context.h> +#include <linux/pagevec.h> +#include <linux/rmap.h> +#include <linux/sched/signal.h> +#include <linux/task_io_accounting_ops.h> +#include <linux/uio.h> + +#include <trace/events/writeback.h> + +struct nocow_flush { + struct closure *cl; + struct bch_dev *ca; + struct bio bio; +}; + +static void nocow_flush_endio(struct bio *_bio) +{ + + struct nocow_flush *bio = container_of(_bio, struct nocow_flush, bio); + + closure_put(bio->cl); + percpu_ref_put(&bio->ca->io_ref); + bio_put(&bio->bio); +} + +void bch2_inode_flush_nocow_writes_async(struct bch_fs *c, + struct bch_inode_info *inode, + struct closure *cl) +{ + struct nocow_flush *bio; + struct bch_dev *ca; + struct bch_devs_mask devs; + unsigned dev; + + dev = find_first_bit(inode->ei_devs_need_flush.d, BCH_SB_MEMBERS_MAX); + if (dev == BCH_SB_MEMBERS_MAX) + return; + + devs = inode->ei_devs_need_flush; + memset(&inode->ei_devs_need_flush, 0, sizeof(inode->ei_devs_need_flush)); + + for_each_set_bit(dev, devs.d, BCH_SB_MEMBERS_MAX) { + rcu_read_lock(); + ca = rcu_dereference(c->devs[dev]); + if (ca && !percpu_ref_tryget(&ca->io_ref)) + ca = NULL; + rcu_read_unlock(); + + if (!ca) + continue; + + bio = container_of(bio_alloc_bioset(ca->disk_sb.bdev, 0, + REQ_OP_FLUSH, + GFP_KERNEL, + &c->nocow_flush_bioset), + struct nocow_flush, bio); + bio->cl = cl; + bio->ca = ca; + bio->bio.bi_end_io = nocow_flush_endio; + closure_bio_submit(&bio->bio, cl); + } +} + +static int bch2_inode_flush_nocow_writes(struct bch_fs *c, + struct bch_inode_info *inode) +{ + struct closure cl; + + closure_init_stack(&cl); + bch2_inode_flush_nocow_writes_async(c, inode, &cl); + closure_sync(&cl); + + return 0; +} + +/* i_size updates: */ + +struct inode_new_size { + loff_t new_size; + u64 now; + unsigned fields; +}; + +static int inode_set_size(struct btree_trans *trans, + struct bch_inode_info *inode, + struct bch_inode_unpacked *bi, + void *p) +{ + struct inode_new_size *s = p; + + bi->bi_size = s->new_size; + if (s->fields & ATTR_ATIME) + bi->bi_atime = s->now; + if (s->fields & ATTR_MTIME) + bi->bi_mtime = s->now; + if (s->fields & ATTR_CTIME) + bi->bi_ctime = s->now; + + return 0; +} + +int __must_check bch2_write_inode_size(struct bch_fs *c, + struct bch_inode_info *inode, + loff_t new_size, unsigned fields) +{ + struct inode_new_size s = { + .new_size = new_size, + .now = bch2_current_time(c), + .fields = fields, + }; + + return bch2_write_inode(c, inode, inode_set_size, &s, fields); +} + +void __bch2_i_sectors_acct(struct bch_fs *c, struct bch_inode_info *inode, + struct quota_res *quota_res, s64 sectors) +{ + bch2_fs_inconsistent_on((s64) inode->v.i_blocks + sectors < 0, c, + "inode %lu i_blocks underflow: %llu + %lli < 0 (ondisk %lli)", + inode->v.i_ino, (u64) inode->v.i_blocks, sectors, + inode->ei_inode.bi_sectors); + inode->v.i_blocks += sectors; + +#ifdef CONFIG_BCACHEFS_QUOTA + if (quota_res && + !test_bit(EI_INODE_SNAPSHOT, &inode->ei_flags) && + sectors > 0) { + BUG_ON(sectors > quota_res->sectors); + BUG_ON(sectors > inode->ei_quota_reserved); + + quota_res->sectors -= sectors; + inode->ei_quota_reserved -= sectors; + } else { + bch2_quota_acct(c, inode->ei_qid, Q_SPC, sectors, KEY_TYPE_QUOTA_WARN); + } +#endif +} + +/* fsync: */ + +/* + * inode->ei_inode.bi_journal_seq won't be up to date since it's set in an + * insert trigger: look up the btree inode instead + */ +static int bch2_flush_inode(struct bch_fs *c, + struct bch_inode_info *inode) +{ + struct bch_inode_unpacked u; + int ret; + + if (c->opts.journal_flush_disabled) + return 0; + + ret = bch2_inode_find_by_inum(c, inode_inum(inode), &u); + if (ret) + return ret; + + return bch2_journal_flush_seq(&c->journal, u.bi_journal_seq) ?: + bch2_inode_flush_nocow_writes(c, inode); +} + +int bch2_fsync(struct file *file, loff_t start, loff_t end, int datasync) +{ + struct bch_inode_info *inode = file_bch_inode(file); + struct bch_fs *c = inode->v.i_sb->s_fs_info; + int ret, ret2, ret3; + + ret = file_write_and_wait_range(file, start, end); + ret2 = sync_inode_metadata(&inode->v, 1); + ret3 = bch2_flush_inode(c, inode); + + return bch2_err_class(ret ?: ret2 ?: ret3); +} + +/* truncate: */ + +static inline int range_has_data(struct bch_fs *c, u32 subvol, + struct bpos start, + struct bpos end) +{ + struct btree_trans *trans = bch2_trans_get(c); + struct btree_iter iter; + struct bkey_s_c k; + int ret = 0; +retry: + bch2_trans_begin(trans); + + ret = bch2_subvolume_get_snapshot(trans, subvol, &start.snapshot); + if (ret) + goto err; + + for_each_btree_key_upto_norestart(trans, iter, BTREE_ID_extents, start, end, 0, k, ret) + if (bkey_extent_is_data(k.k) && !bkey_extent_is_unwritten(k)) { + ret = 1; + break; + } + start = iter.pos; + bch2_trans_iter_exit(trans, &iter); +err: + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + goto retry; + + bch2_trans_put(trans); + return ret; +} + +static int __bch2_truncate_folio(struct bch_inode_info *inode, + pgoff_t index, loff_t start, loff_t end) +{ + struct bch_fs *c = inode->v.i_sb->s_fs_info; + struct address_space *mapping = inode->v.i_mapping; + struct bch_folio *s; + unsigned start_offset; + unsigned end_offset; + unsigned i; + struct folio *folio; + s64 i_sectors_delta = 0; + int ret = 0; + u64 end_pos; + + folio = filemap_lock_folio(mapping, index); + if (IS_ERR_OR_NULL(folio)) { + /* + * XXX: we're doing two index lookups when we end up reading the + * folio + */ + ret = range_has_data(c, inode->ei_subvol, + POS(inode->v.i_ino, (index << PAGE_SECTORS_SHIFT)), + POS(inode->v.i_ino, (index << PAGE_SECTORS_SHIFT) + PAGE_SECTORS)); + if (ret <= 0) + return ret; + + folio = __filemap_get_folio(mapping, index, + FGP_LOCK|FGP_CREAT, GFP_KERNEL); + if (IS_ERR_OR_NULL(folio)) { + ret = -ENOMEM; + goto out; + } + } + + BUG_ON(start >= folio_end_pos(folio)); + BUG_ON(end <= folio_pos(folio)); + + start_offset = max(start, folio_pos(folio)) - folio_pos(folio); + end_offset = min_t(u64, end, folio_end_pos(folio)) - folio_pos(folio); + + /* Folio boundary? Nothing to do */ + if (start_offset == 0 && + end_offset == folio_size(folio)) { + ret = 0; + goto unlock; + } + + s = bch2_folio_create(folio, 0); + if (!s) { + ret = -ENOMEM; + goto unlock; + } + + if (!folio_test_uptodate(folio)) { + ret = bch2_read_single_folio(folio, mapping); + if (ret) + goto unlock; + } + + ret = bch2_folio_set(c, inode_inum(inode), &folio, 1); + if (ret) + goto unlock; + + for (i = round_up(start_offset, block_bytes(c)) >> 9; + i < round_down(end_offset, block_bytes(c)) >> 9; + i++) { + s->s[i].nr_replicas = 0; + + i_sectors_delta -= s->s[i].state == SECTOR_dirty; + bch2_folio_sector_set(folio, s, i, SECTOR_unallocated); + } + + bch2_i_sectors_acct(c, inode, NULL, i_sectors_delta); + + /* + * Caller needs to know whether this folio will be written out by + * writeback - doing an i_size update if necessary - or whether it will + * be responsible for the i_size update. + * + * Note that we shouldn't ever see a folio beyond EOF, but check and + * warn if so. This has been observed by failure to clean up folios + * after a short write and there's still a chance reclaim will fix + * things up. + */ + WARN_ON_ONCE(folio_pos(folio) >= inode->v.i_size); + end_pos = folio_end_pos(folio); + if (inode->v.i_size > folio_pos(folio)) + end_pos = min_t(u64, inode->v.i_size, end_pos); + ret = s->s[folio_pos_to_s(folio, end_pos - 1)].state >= SECTOR_dirty; + + folio_zero_segment(folio, start_offset, end_offset); + + /* + * Bit of a hack - we don't want truncate to fail due to -ENOSPC. + * + * XXX: because we aren't currently tracking whether the folio has actual + * data in it (vs. just 0s, or only partially written) this wrong. ick. + */ + BUG_ON(bch2_get_folio_disk_reservation(c, inode, folio, false)); + + /* + * This removes any writeable userspace mappings; we need to force + * .page_mkwrite to be called again before any mmapped writes, to + * redirty the full page: + */ + folio_mkclean(folio); + filemap_dirty_folio(mapping, folio); +unlock: + folio_unlock(folio); + folio_put(folio); +out: + return ret; +} + +static int bch2_truncate_folio(struct bch_inode_info *inode, loff_t from) +{ + return __bch2_truncate_folio(inode, from >> PAGE_SHIFT, + from, ANYSINT_MAX(loff_t)); +} + +static int bch2_truncate_folios(struct bch_inode_info *inode, + loff_t start, loff_t end) +{ + int ret = __bch2_truncate_folio(inode, start >> PAGE_SHIFT, + start, end); + + if (ret >= 0 && + start >> PAGE_SHIFT != end >> PAGE_SHIFT) + ret = __bch2_truncate_folio(inode, + (end - 1) >> PAGE_SHIFT, + start, end); + return ret; +} + +static int bch2_extend(struct mnt_idmap *idmap, + struct bch_inode_info *inode, + struct bch_inode_unpacked *inode_u, + struct iattr *iattr) +{ + struct address_space *mapping = inode->v.i_mapping; + int ret; + + /* + * sync appends: + * + * this has to be done _before_ extending i_size: + */ + ret = filemap_write_and_wait_range(mapping, inode_u->bi_size, S64_MAX); + if (ret) + return ret; + + truncate_setsize(&inode->v, iattr->ia_size); + + return bch2_setattr_nonsize(idmap, inode, iattr); +} + +int bchfs_truncate(struct mnt_idmap *idmap, + struct bch_inode_info *inode, struct iattr *iattr) +{ + struct bch_fs *c = inode->v.i_sb->s_fs_info; + struct address_space *mapping = inode->v.i_mapping; + struct bch_inode_unpacked inode_u; + s64 i_sectors_delta = 0; + int ret = 0; + + /* + * If the truncate call with change the size of the file, the + * cmtimes should be updated. If the size will not change, we + * do not need to update the cmtimes. + */ + if (iattr->ia_size != inode->v.i_size) { + if (!(iattr->ia_valid & ATTR_MTIME)) + ktime_get_coarse_real_ts64(&iattr->ia_mtime); + if (!(iattr->ia_valid & ATTR_CTIME)) + ktime_get_coarse_real_ts64(&iattr->ia_ctime); + iattr->ia_valid |= ATTR_MTIME|ATTR_CTIME; + } + + inode_dio_wait(&inode->v); + bch2_pagecache_block_get(inode); + + ret = bch2_inode_find_by_inum(c, inode_inum(inode), &inode_u); + if (ret) + goto err; + + /* + * check this before next assertion; on filesystem error our normal + * invariants are a bit broken (truncate has to truncate the page cache + * before the inode). + */ + ret = bch2_journal_error(&c->journal); + if (ret) + goto err; + + WARN_ONCE(!test_bit(EI_INODE_ERROR, &inode->ei_flags) && + inode->v.i_size < inode_u.bi_size, + "truncate spotted in mem i_size < btree i_size: %llu < %llu\n", + (u64) inode->v.i_size, inode_u.bi_size); + + if (iattr->ia_size > inode->v.i_size) { + ret = bch2_extend(idmap, inode, &inode_u, iattr); + goto err; + } + + iattr->ia_valid &= ~ATTR_SIZE; + + ret = bch2_truncate_folio(inode, iattr->ia_size); + if (unlikely(ret < 0)) + goto err; + + truncate_setsize(&inode->v, iattr->ia_size); + + /* + * When extending, we're going to write the new i_size to disk + * immediately so we need to flush anything above the current on disk + * i_size first: + * + * Also, when extending we need to flush the page that i_size currently + * straddles - if it's mapped to userspace, we need to ensure that + * userspace has to redirty it and call .mkwrite -> set_page_dirty + * again to allocate the part of the page that was extended. + */ + if (iattr->ia_size > inode_u.bi_size) + ret = filemap_write_and_wait_range(mapping, + inode_u.bi_size, + iattr->ia_size - 1); + else if (iattr->ia_size & (PAGE_SIZE - 1)) + ret = filemap_write_and_wait_range(mapping, + round_down(iattr->ia_size, PAGE_SIZE), + iattr->ia_size - 1); + if (ret) + goto err; + + ret = bch2_truncate(c, inode_inum(inode), iattr->ia_size, &i_sectors_delta); + bch2_i_sectors_acct(c, inode, NULL, i_sectors_delta); + + if (unlikely(ret)) { + /* + * If we error here, VFS caches are now inconsistent with btree + */ + set_bit(EI_INODE_ERROR, &inode->ei_flags); + goto err; + } + + bch2_fs_inconsistent_on(!inode->v.i_size && inode->v.i_blocks && + !bch2_journal_error(&c->journal), c, + "inode %lu truncated to 0 but i_blocks %llu (ondisk %lli)", + inode->v.i_ino, (u64) inode->v.i_blocks, + inode->ei_inode.bi_sectors); + + ret = bch2_setattr_nonsize(idmap, inode, iattr); +err: + bch2_pagecache_block_put(inode); + return bch2_err_class(ret); +} + +/* fallocate: */ + +static int inode_update_times_fn(struct btree_trans *trans, + struct bch_inode_info *inode, + struct bch_inode_unpacked *bi, void *p) +{ + struct bch_fs *c = inode->v.i_sb->s_fs_info; + + bi->bi_mtime = bi->bi_ctime = bch2_current_time(c); + return 0; +} + +static long bchfs_fpunch(struct bch_inode_info *inode, loff_t offset, loff_t len) +{ + struct bch_fs *c = inode->v.i_sb->s_fs_info; + u64 end = offset + len; + u64 block_start = round_up(offset, block_bytes(c)); + u64 block_end = round_down(end, block_bytes(c)); + bool truncated_last_page; + int ret = 0; + + ret = bch2_truncate_folios(inode, offset, end); + if (unlikely(ret < 0)) + goto err; + + truncated_last_page = ret; + + truncate_pagecache_range(&inode->v, offset, end - 1); + + if (block_start < block_end) { + s64 i_sectors_delta = 0; + + ret = bch2_fpunch(c, inode_inum(inode), + block_start >> 9, block_end >> 9, + &i_sectors_delta); + bch2_i_sectors_acct(c, inode, NULL, i_sectors_delta); + } + + mutex_lock(&inode->ei_update_lock); + if (end >= inode->v.i_size && !truncated_last_page) { + ret = bch2_write_inode_size(c, inode, inode->v.i_size, + ATTR_MTIME|ATTR_CTIME); + } else { + ret = bch2_write_inode(c, inode, inode_update_times_fn, NULL, + ATTR_MTIME|ATTR_CTIME); + } + mutex_unlock(&inode->ei_update_lock); +err: + return ret; +} + +static long bchfs_fcollapse_finsert(struct bch_inode_info *inode, + loff_t offset, loff_t len, + bool insert) +{ + struct bch_fs *c = inode->v.i_sb->s_fs_info; + struct address_space *mapping = inode->v.i_mapping; + s64 i_sectors_delta = 0; + int ret = 0; + + if ((offset | len) & (block_bytes(c) - 1)) + return -EINVAL; + + if (insert) { + if (offset >= inode->v.i_size) + return -EINVAL; + } else { + if (offset + len >= inode->v.i_size) + return -EINVAL; + } + + ret = bch2_write_invalidate_inode_pages_range(mapping, offset, LLONG_MAX); + if (ret) + return ret; + + if (insert) + i_size_write(&inode->v, inode->v.i_size + len); + + ret = bch2_fcollapse_finsert(c, inode_inum(inode), offset >> 9, len >> 9, + insert, &i_sectors_delta); + if (!ret && !insert) + i_size_write(&inode->v, inode->v.i_size - len); + bch2_i_sectors_acct(c, inode, NULL, i_sectors_delta); + + return ret; +} + +static int __bchfs_fallocate(struct bch_inode_info *inode, int mode, + u64 start_sector, u64 end_sector) +{ + struct bch_fs *c = inode->v.i_sb->s_fs_info; + struct btree_trans *trans = bch2_trans_get(c); + struct btree_iter iter; + struct bpos end_pos = POS(inode->v.i_ino, end_sector); + struct bch_io_opts opts; + int ret = 0; + + bch2_inode_opts_get(&opts, c, &inode->ei_inode); + + bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, + POS(inode->v.i_ino, start_sector), + BTREE_ITER_SLOTS|BTREE_ITER_INTENT); + + while (!ret && bkey_lt(iter.pos, end_pos)) { + s64 i_sectors_delta = 0; + struct quota_res quota_res = { 0 }; + struct bkey_s_c k; + unsigned sectors; + bool is_allocation; + u64 hole_start, hole_end; + u32 snapshot; + + bch2_trans_begin(trans); + + ret = bch2_subvolume_get_snapshot(trans, + inode->ei_subvol, &snapshot); + if (ret) + goto bkey_err; + + bch2_btree_iter_set_snapshot(&iter, snapshot); + + k = bch2_btree_iter_peek_slot(&iter); + if ((ret = bkey_err(k))) + goto bkey_err; + + hole_start = iter.pos.offset; + hole_end = bpos_min(k.k->p, end_pos).offset; + is_allocation = bkey_extent_is_allocation(k.k); + + /* already reserved */ + if (bkey_extent_is_reservation(k) && + bch2_bkey_nr_ptrs_fully_allocated(k) >= opts.data_replicas) { + bch2_btree_iter_advance(&iter); + continue; + } + + if (bkey_extent_is_data(k.k) && + !(mode & FALLOC_FL_ZERO_RANGE)) { + bch2_btree_iter_advance(&iter); + continue; + } + + if (!(mode & FALLOC_FL_ZERO_RANGE)) { + /* + * Lock ordering - can't be holding btree locks while + * blocking on a folio lock: + */ + if (bch2_clamp_data_hole(&inode->v, + &hole_start, + &hole_end, + opts.data_replicas, true)) + ret = drop_locks_do(trans, + (bch2_clamp_data_hole(&inode->v, + &hole_start, + &hole_end, + opts.data_replicas, false), 0)); + bch2_btree_iter_set_pos(&iter, POS(iter.pos.inode, hole_start)); + + if (ret) + goto bkey_err; + + if (hole_start == hole_end) + continue; + } + + sectors = hole_end - hole_start; + + if (!is_allocation) { + ret = bch2_quota_reservation_add(c, inode, + "a_res, sectors, true); + if (unlikely(ret)) + goto bkey_err; + } + + ret = bch2_extent_fallocate(trans, inode_inum(inode), &iter, + sectors, opts, &i_sectors_delta, + writepoint_hashed((unsigned long) current)); + if (ret) + goto bkey_err; + + bch2_i_sectors_acct(c, inode, "a_res, i_sectors_delta); + + drop_locks_do(trans, + (bch2_mark_pagecache_reserved(inode, hole_start, iter.pos.offset), 0)); +bkey_err: + bch2_quota_reservation_put(c, inode, "a_res); + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + ret = 0; + } + + if (bch2_err_matches(ret, ENOSPC) && (mode & FALLOC_FL_ZERO_RANGE)) { + struct quota_res quota_res = { 0 }; + s64 i_sectors_delta = 0; + + bch2_fpunch_at(trans, &iter, inode_inum(inode), + end_sector, &i_sectors_delta); + bch2_i_sectors_acct(c, inode, "a_res, i_sectors_delta); + bch2_quota_reservation_put(c, inode, "a_res); + } + + bch2_trans_iter_exit(trans, &iter); + bch2_trans_put(trans); + return ret; +} + +static long bchfs_fallocate(struct bch_inode_info *inode, int mode, + loff_t offset, loff_t len) +{ + struct bch_fs *c = inode->v.i_sb->s_fs_info; + u64 end = offset + len; + u64 block_start = round_down(offset, block_bytes(c)); + u64 block_end = round_up(end, block_bytes(c)); + bool truncated_last_page = false; + int ret, ret2 = 0; + + if (!(mode & FALLOC_FL_KEEP_SIZE) && end > inode->v.i_size) { + ret = inode_newsize_ok(&inode->v, end); + if (ret) + return ret; + } + + if (mode & FALLOC_FL_ZERO_RANGE) { + ret = bch2_truncate_folios(inode, offset, end); + if (unlikely(ret < 0)) + return ret; + + truncated_last_page = ret; + + truncate_pagecache_range(&inode->v, offset, end - 1); + + block_start = round_up(offset, block_bytes(c)); + block_end = round_down(end, block_bytes(c)); + } + + ret = __bchfs_fallocate(inode, mode, block_start >> 9, block_end >> 9); + + /* + * On -ENOSPC in ZERO_RANGE mode, we still want to do the inode update, + * so that the VFS cache i_size is consistent with the btree i_size: + */ + if (ret && + !(bch2_err_matches(ret, ENOSPC) && (mode & FALLOC_FL_ZERO_RANGE))) + return ret; + + if (mode & FALLOC_FL_KEEP_SIZE && end > inode->v.i_size) + end = inode->v.i_size; + + if (end >= inode->v.i_size && + (((mode & FALLOC_FL_ZERO_RANGE) && !truncated_last_page) || + !(mode & FALLOC_FL_KEEP_SIZE))) { + spin_lock(&inode->v.i_lock); + i_size_write(&inode->v, end); + spin_unlock(&inode->v.i_lock); + + mutex_lock(&inode->ei_update_lock); + ret2 = bch2_write_inode_size(c, inode, end, 0); + mutex_unlock(&inode->ei_update_lock); + } + + return ret ?: ret2; +} + +long bch2_fallocate_dispatch(struct file *file, int mode, + loff_t offset, loff_t len) +{ + struct bch_inode_info *inode = file_bch_inode(file); + struct bch_fs *c = inode->v.i_sb->s_fs_info; + long ret; + + if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_fallocate)) + return -EROFS; + + inode_lock(&inode->v); + inode_dio_wait(&inode->v); + bch2_pagecache_block_get(inode); + + ret = file_modified(file); + if (ret) + goto err; + + if (!(mode & ~(FALLOC_FL_KEEP_SIZE|FALLOC_FL_ZERO_RANGE))) + ret = bchfs_fallocate(inode, mode, offset, len); + else if (mode == (FALLOC_FL_PUNCH_HOLE|FALLOC_FL_KEEP_SIZE)) + ret = bchfs_fpunch(inode, offset, len); + else if (mode == FALLOC_FL_INSERT_RANGE) + ret = bchfs_fcollapse_finsert(inode, offset, len, true); + else if (mode == FALLOC_FL_COLLAPSE_RANGE) + ret = bchfs_fcollapse_finsert(inode, offset, len, false); + else + ret = -EOPNOTSUPP; +err: + bch2_pagecache_block_put(inode); + inode_unlock(&inode->v); + bch2_write_ref_put(c, BCH_WRITE_REF_fallocate); + + return bch2_err_class(ret); +} + +/* + * Take a quota reservation for unallocated blocks in a given file range + * Does not check pagecache + */ +static int quota_reserve_range(struct bch_inode_info *inode, + struct quota_res *res, + u64 start, u64 end) +{ + struct bch_fs *c = inode->v.i_sb->s_fs_info; + struct btree_trans *trans = bch2_trans_get(c); + struct btree_iter iter; + struct bkey_s_c k; + u32 snapshot; + u64 sectors = end - start; + u64 pos = start; + int ret; +retry: + bch2_trans_begin(trans); + + ret = bch2_subvolume_get_snapshot(trans, inode->ei_subvol, &snapshot); + if (ret) + goto err; + + bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, + SPOS(inode->v.i_ino, pos, snapshot), 0); + + while (!(ret = btree_trans_too_many_iters(trans)) && + (k = bch2_btree_iter_peek_upto(&iter, POS(inode->v.i_ino, end - 1))).k && + !(ret = bkey_err(k))) { + if (bkey_extent_is_allocation(k.k)) { + u64 s = min(end, k.k->p.offset) - + max(start, bkey_start_offset(k.k)); + BUG_ON(s > sectors); + sectors -= s; + } + bch2_btree_iter_advance(&iter); + } + pos = iter.pos.offset; + bch2_trans_iter_exit(trans, &iter); +err: + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + goto retry; + + bch2_trans_put(trans); + + return ret ?: bch2_quota_reservation_add(c, inode, res, sectors, true); +} + +loff_t bch2_remap_file_range(struct file *file_src, loff_t pos_src, + struct file *file_dst, loff_t pos_dst, + loff_t len, unsigned remap_flags) +{ + struct bch_inode_info *src = file_bch_inode(file_src); + struct bch_inode_info *dst = file_bch_inode(file_dst); + struct bch_fs *c = src->v.i_sb->s_fs_info; + struct quota_res quota_res = { 0 }; + s64 i_sectors_delta = 0; + u64 aligned_len; + loff_t ret = 0; + + if (remap_flags & ~(REMAP_FILE_DEDUP|REMAP_FILE_ADVISORY)) + return -EINVAL; + + if (remap_flags & REMAP_FILE_DEDUP) + return -EOPNOTSUPP; + + if ((pos_src & (block_bytes(c) - 1)) || + (pos_dst & (block_bytes(c) - 1))) + return -EINVAL; + + if (src == dst && + abs(pos_src - pos_dst) < len) + return -EINVAL; + + bch2_lock_inodes(INODE_LOCK|INODE_PAGECACHE_BLOCK, src, dst); + + inode_dio_wait(&src->v); + inode_dio_wait(&dst->v); + + ret = generic_remap_file_range_prep(file_src, pos_src, + file_dst, pos_dst, + &len, remap_flags); + if (ret < 0 || len == 0) + goto err; + + aligned_len = round_up((u64) len, block_bytes(c)); + + ret = bch2_write_invalidate_inode_pages_range(dst->v.i_mapping, + pos_dst, pos_dst + len - 1); + if (ret) + goto err; + + ret = quota_reserve_range(dst, "a_res, pos_dst >> 9, + (pos_dst + aligned_len) >> 9); + if (ret) + goto err; + + file_update_time(file_dst); + + bch2_mark_pagecache_unallocated(src, pos_src >> 9, + (pos_src + aligned_len) >> 9); + + ret = bch2_remap_range(c, + inode_inum(dst), pos_dst >> 9, + inode_inum(src), pos_src >> 9, + aligned_len >> 9, + pos_dst + len, &i_sectors_delta); + if (ret < 0) + goto err; + + /* + * due to alignment, we might have remapped slightly more than requsted + */ + ret = min((u64) ret << 9, (u64) len); + + bch2_i_sectors_acct(c, dst, "a_res, i_sectors_delta); + + spin_lock(&dst->v.i_lock); + if (pos_dst + ret > dst->v.i_size) + i_size_write(&dst->v, pos_dst + ret); + spin_unlock(&dst->v.i_lock); + + if ((file_dst->f_flags & (__O_SYNC | O_DSYNC)) || + IS_SYNC(file_inode(file_dst))) + ret = bch2_flush_inode(c, dst); +err: + bch2_quota_reservation_put(c, dst, "a_res); + bch2_unlock_inodes(INODE_LOCK|INODE_PAGECACHE_BLOCK, src, dst); + + return bch2_err_class(ret); +} + +/* fseek: */ + +static loff_t bch2_seek_data(struct file *file, u64 offset) +{ + struct bch_inode_info *inode = file_bch_inode(file); + struct bch_fs *c = inode->v.i_sb->s_fs_info; + struct btree_trans *trans; + struct btree_iter iter; + struct bkey_s_c k; + subvol_inum inum = inode_inum(inode); + u64 isize, next_data = MAX_LFS_FILESIZE; + u32 snapshot; + int ret; + + isize = i_size_read(&inode->v); + if (offset >= isize) + return -ENXIO; + + trans = bch2_trans_get(c); +retry: + bch2_trans_begin(trans); + + ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot); + if (ret) + goto err; + + for_each_btree_key_upto_norestart(trans, iter, BTREE_ID_extents, + SPOS(inode->v.i_ino, offset >> 9, snapshot), + POS(inode->v.i_ino, U64_MAX), + 0, k, ret) { + if (bkey_extent_is_data(k.k)) { + next_data = max(offset, bkey_start_offset(k.k) << 9); + break; + } else if (k.k->p.offset >> 9 > isize) + break; + } + bch2_trans_iter_exit(trans, &iter); +err: + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + goto retry; + + bch2_trans_put(trans); + if (ret) + return ret; + + if (next_data > offset) + next_data = bch2_seek_pagecache_data(&inode->v, + offset, next_data, 0, false); + + if (next_data >= isize) + return -ENXIO; + + return vfs_setpos(file, next_data, MAX_LFS_FILESIZE); +} + +static loff_t bch2_seek_hole(struct file *file, u64 offset) +{ + struct bch_inode_info *inode = file_bch_inode(file); + struct bch_fs *c = inode->v.i_sb->s_fs_info; + struct btree_trans *trans; + struct btree_iter iter; + struct bkey_s_c k; + subvol_inum inum = inode_inum(inode); + u64 isize, next_hole = MAX_LFS_FILESIZE; + u32 snapshot; + int ret; + + isize = i_size_read(&inode->v); + if (offset >= isize) + return -ENXIO; + + trans = bch2_trans_get(c); +retry: + bch2_trans_begin(trans); + + ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot); + if (ret) + goto err; + + for_each_btree_key_norestart(trans, iter, BTREE_ID_extents, + SPOS(inode->v.i_ino, offset >> 9, snapshot), + BTREE_ITER_SLOTS, k, ret) { + if (k.k->p.inode != inode->v.i_ino) { + next_hole = bch2_seek_pagecache_hole(&inode->v, + offset, MAX_LFS_FILESIZE, 0, false); + break; + } else if (!bkey_extent_is_data(k.k)) { + next_hole = bch2_seek_pagecache_hole(&inode->v, + max(offset, bkey_start_offset(k.k) << 9), + k.k->p.offset << 9, 0, false); + + if (next_hole < k.k->p.offset << 9) + break; + } else { + offset = max(offset, bkey_start_offset(k.k) << 9); + } + } + bch2_trans_iter_exit(trans, &iter); +err: + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + goto retry; + + bch2_trans_put(trans); + if (ret) + return ret; + + if (next_hole > isize) + next_hole = isize; + + return vfs_setpos(file, next_hole, MAX_LFS_FILESIZE); +} + +loff_t bch2_llseek(struct file *file, loff_t offset, int whence) +{ + loff_t ret; + + switch (whence) { + case SEEK_SET: + case SEEK_CUR: + case SEEK_END: + ret = generic_file_llseek(file, offset, whence); + break; + case SEEK_DATA: + ret = bch2_seek_data(file, offset); + break; + case SEEK_HOLE: + ret = bch2_seek_hole(file, offset); + break; + default: + ret = -EINVAL; + break; + } + + return bch2_err_class(ret); +} + +void bch2_fs_fsio_exit(struct bch_fs *c) +{ + bioset_exit(&c->nocow_flush_bioset); +} + +int bch2_fs_fsio_init(struct bch_fs *c) +{ + if (bioset_init(&c->nocow_flush_bioset, + 1, offsetof(struct nocow_flush, bio), 0)) + return -BCH_ERR_ENOMEM_nocow_flush_bioset_init; + + return 0; +} + +#endif /* NO_BCACHEFS_FS */ diff --git a/fs/bcachefs/fs-io.h b/fs/bcachefs/fs-io.h new file mode 100644 index 000000000000..ca70346e68dc --- /dev/null +++ b/fs/bcachefs/fs-io.h @@ -0,0 +1,184 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_FS_IO_H +#define _BCACHEFS_FS_IO_H + +#ifndef NO_BCACHEFS_FS + +#include "buckets.h" +#include "fs.h" +#include "io_write_types.h" +#include "quota.h" + +#include <linux/uio.h> + +struct folio_vec { + struct folio *fv_folio; + size_t fv_offset; + size_t fv_len; +}; + +static inline struct folio_vec biovec_to_foliovec(struct bio_vec bv) +{ + + struct folio *folio = page_folio(bv.bv_page); + size_t offset = (folio_page_idx(folio, bv.bv_page) << PAGE_SHIFT) + + bv.bv_offset; + size_t len = min_t(size_t, folio_size(folio) - offset, bv.bv_len); + + return (struct folio_vec) { + .fv_folio = folio, + .fv_offset = offset, + .fv_len = len, + }; +} + +static inline struct folio_vec bio_iter_iovec_folio(struct bio *bio, + struct bvec_iter iter) +{ + return biovec_to_foliovec(bio_iter_iovec(bio, iter)); +} + +#define __bio_for_each_folio(bvl, bio, iter, start) \ + for (iter = (start); \ + (iter).bi_size && \ + ((bvl = bio_iter_iovec_folio((bio), (iter))), 1); \ + bio_advance_iter_single((bio), &(iter), (bvl).fv_len)) + +/** + * bio_for_each_folio - iterate over folios within a bio + * + * Like other non-_all versions, this iterates over what bio->bi_iter currently + * points to. This version is for drivers, where the bio may have previously + * been split or cloned. + */ +#define bio_for_each_folio(bvl, bio, iter) \ + __bio_for_each_folio(bvl, bio, iter, (bio)->bi_iter) + +struct quota_res { + u64 sectors; +}; + +#ifdef CONFIG_BCACHEFS_QUOTA + +static inline void __bch2_quota_reservation_put(struct bch_fs *c, + struct bch_inode_info *inode, + struct quota_res *res) +{ + BUG_ON(res->sectors > inode->ei_quota_reserved); + + bch2_quota_acct(c, inode->ei_qid, Q_SPC, + -((s64) res->sectors), KEY_TYPE_QUOTA_PREALLOC); + inode->ei_quota_reserved -= res->sectors; + res->sectors = 0; +} + +static inline void bch2_quota_reservation_put(struct bch_fs *c, + struct bch_inode_info *inode, + struct quota_res *res) +{ + if (res->sectors) { + mutex_lock(&inode->ei_quota_lock); + __bch2_quota_reservation_put(c, inode, res); + mutex_unlock(&inode->ei_quota_lock); + } +} + +static inline int bch2_quota_reservation_add(struct bch_fs *c, + struct bch_inode_info *inode, + struct quota_res *res, + u64 sectors, + bool check_enospc) +{ + int ret; + + if (test_bit(EI_INODE_SNAPSHOT, &inode->ei_flags)) + return 0; + + mutex_lock(&inode->ei_quota_lock); + ret = bch2_quota_acct(c, inode->ei_qid, Q_SPC, sectors, + check_enospc ? KEY_TYPE_QUOTA_PREALLOC : KEY_TYPE_QUOTA_NOCHECK); + if (likely(!ret)) { + inode->ei_quota_reserved += sectors; + res->sectors += sectors; + } + mutex_unlock(&inode->ei_quota_lock); + + return ret; +} + +#else + +static inline void __bch2_quota_reservation_put(struct bch_fs *c, + struct bch_inode_info *inode, + struct quota_res *res) {} + +static inline void bch2_quota_reservation_put(struct bch_fs *c, + struct bch_inode_info *inode, + struct quota_res *res) {} + +static inline int bch2_quota_reservation_add(struct bch_fs *c, + struct bch_inode_info *inode, + struct quota_res *res, + unsigned sectors, + bool check_enospc) +{ + return 0; +} + +#endif + +void __bch2_i_sectors_acct(struct bch_fs *, struct bch_inode_info *, + struct quota_res *, s64); + +static inline void bch2_i_sectors_acct(struct bch_fs *c, struct bch_inode_info *inode, + struct quota_res *quota_res, s64 sectors) +{ + if (sectors) { + mutex_lock(&inode->ei_quota_lock); + __bch2_i_sectors_acct(c, inode, quota_res, sectors); + mutex_unlock(&inode->ei_quota_lock); + } +} + +static inline struct address_space *faults_disabled_mapping(void) +{ + return (void *) (((unsigned long) current->faults_disabled_mapping) & ~1UL); +} + +static inline void set_fdm_dropped_locks(void) +{ + current->faults_disabled_mapping = + (void *) (((unsigned long) current->faults_disabled_mapping)|1); +} + +static inline bool fdm_dropped_locks(void) +{ + return ((unsigned long) current->faults_disabled_mapping) & 1; +} + +void bch2_inode_flush_nocow_writes_async(struct bch_fs *, + struct bch_inode_info *, struct closure *); + +int __must_check bch2_write_inode_size(struct bch_fs *, + struct bch_inode_info *, + loff_t, unsigned); + +int bch2_fsync(struct file *, loff_t, loff_t, int); + +int bchfs_truncate(struct mnt_idmap *, + struct bch_inode_info *, struct iattr *); +long bch2_fallocate_dispatch(struct file *, int, loff_t, loff_t); + +loff_t bch2_remap_file_range(struct file *, loff_t, struct file *, + loff_t, loff_t, unsigned); + +loff_t bch2_llseek(struct file *, loff_t, int); + +void bch2_fs_fsio_exit(struct bch_fs *); +int bch2_fs_fsio_init(struct bch_fs *); +#else +static inline void bch2_fs_fsio_exit(struct bch_fs *c) {} +static inline int bch2_fs_fsio_init(struct bch_fs *c) { return 0; } +#endif + +#endif /* _BCACHEFS_FS_IO_H */ diff --git a/fs/bcachefs/fs-ioctl.c b/fs/bcachefs/fs-ioctl.c new file mode 100644 index 000000000000..a70b7a03057d --- /dev/null +++ b/fs/bcachefs/fs-ioctl.c @@ -0,0 +1,572 @@ +// SPDX-License-Identifier: GPL-2.0 +#ifndef NO_BCACHEFS_FS + +#include "bcachefs.h" +#include "chardev.h" +#include "dirent.h" +#include "fs.h" +#include "fs-common.h" +#include "fs-ioctl.h" +#include "quota.h" + +#include <linux/compat.h> +#include <linux/fsnotify.h> +#include <linux/mount.h> +#include <linux/namei.h> +#include <linux/security.h> +#include <linux/writeback.h> + +#define FS_IOC_GOINGDOWN _IOR('X', 125, __u32) +#define FSOP_GOING_FLAGS_DEFAULT 0x0 /* going down */ +#define FSOP_GOING_FLAGS_LOGFLUSH 0x1 /* flush log but not data */ +#define FSOP_GOING_FLAGS_NOLOGFLUSH 0x2 /* don't flush log nor data */ + +struct flags_set { + unsigned mask; + unsigned flags; + + unsigned projid; + + bool set_projinherit; + bool projinherit; +}; + +static int bch2_inode_flags_set(struct btree_trans *trans, + struct bch_inode_info *inode, + struct bch_inode_unpacked *bi, + void *p) +{ + struct bch_fs *c = inode->v.i_sb->s_fs_info; + /* + * We're relying on btree locking here for exclusion with other ioctl + * calls - use the flags in the btree (@bi), not inode->i_flags: + */ + struct flags_set *s = p; + unsigned newflags = s->flags; + unsigned oldflags = bi->bi_flags & s->mask; + + if (((newflags ^ oldflags) & (BCH_INODE_append|BCH_INODE_immutable)) && + !capable(CAP_LINUX_IMMUTABLE)) + return -EPERM; + + if (!S_ISREG(bi->bi_mode) && + !S_ISDIR(bi->bi_mode) && + (newflags & (BCH_INODE_nodump|BCH_INODE_noatime)) != newflags) + return -EINVAL; + + if (s->set_projinherit) { + bi->bi_fields_set &= ~(1 << Inode_opt_project); + bi->bi_fields_set |= ((int) s->projinherit << Inode_opt_project); + } + + bi->bi_flags &= ~s->mask; + bi->bi_flags |= newflags; + + bi->bi_ctime = timespec_to_bch2_time(c, current_time(&inode->v)); + return 0; +} + +static int bch2_ioc_getflags(struct bch_inode_info *inode, int __user *arg) +{ + unsigned flags = map_flags(bch_flags_to_uflags, inode->ei_inode.bi_flags); + + return put_user(flags, arg); +} + +static int bch2_ioc_setflags(struct bch_fs *c, + struct file *file, + struct bch_inode_info *inode, + void __user *arg) +{ + struct flags_set s = { .mask = map_defined(bch_flags_to_uflags) }; + unsigned uflags; + int ret; + + if (get_user(uflags, (int __user *) arg)) + return -EFAULT; + + s.flags = map_flags_rev(bch_flags_to_uflags, uflags); + if (uflags) + return -EOPNOTSUPP; + + ret = mnt_want_write_file(file); + if (ret) + return ret; + + inode_lock(&inode->v); + if (!inode_owner_or_capable(file_mnt_idmap(file), &inode->v)) { + ret = -EACCES; + goto setflags_out; + } + + mutex_lock(&inode->ei_update_lock); + ret = bch2_write_inode(c, inode, bch2_inode_flags_set, &s, + ATTR_CTIME); + mutex_unlock(&inode->ei_update_lock); + +setflags_out: + inode_unlock(&inode->v); + mnt_drop_write_file(file); + return ret; +} + +static int bch2_ioc_fsgetxattr(struct bch_inode_info *inode, + struct fsxattr __user *arg) +{ + struct fsxattr fa = { 0 }; + + fa.fsx_xflags = map_flags(bch_flags_to_xflags, inode->ei_inode.bi_flags); + + if (inode->ei_inode.bi_fields_set & (1 << Inode_opt_project)) + fa.fsx_xflags |= FS_XFLAG_PROJINHERIT; + + fa.fsx_projid = inode->ei_qid.q[QTYP_PRJ]; + + if (copy_to_user(arg, &fa, sizeof(fa))) + return -EFAULT; + + return 0; +} + +static int fssetxattr_inode_update_fn(struct btree_trans *trans, + struct bch_inode_info *inode, + struct bch_inode_unpacked *bi, + void *p) +{ + struct flags_set *s = p; + + if (s->projid != bi->bi_project) { + bi->bi_fields_set |= 1U << Inode_opt_project; + bi->bi_project = s->projid; + } + + return bch2_inode_flags_set(trans, inode, bi, p); +} + +static int bch2_ioc_fssetxattr(struct bch_fs *c, + struct file *file, + struct bch_inode_info *inode, + struct fsxattr __user *arg) +{ + struct flags_set s = { .mask = map_defined(bch_flags_to_xflags) }; + struct fsxattr fa; + int ret; + + if (copy_from_user(&fa, arg, sizeof(fa))) + return -EFAULT; + + s.set_projinherit = true; + s.projinherit = (fa.fsx_xflags & FS_XFLAG_PROJINHERIT) != 0; + fa.fsx_xflags &= ~FS_XFLAG_PROJINHERIT; + + s.flags = map_flags_rev(bch_flags_to_xflags, fa.fsx_xflags); + if (fa.fsx_xflags) + return -EOPNOTSUPP; + + if (fa.fsx_projid >= U32_MAX) + return -EINVAL; + + /* + * inode fields accessible via the xattr interface are stored with a +1 + * bias, so that 0 means unset: + */ + s.projid = fa.fsx_projid + 1; + + ret = mnt_want_write_file(file); + if (ret) + return ret; + + inode_lock(&inode->v); + if (!inode_owner_or_capable(file_mnt_idmap(file), &inode->v)) { + ret = -EACCES; + goto err; + } + + mutex_lock(&inode->ei_update_lock); + ret = bch2_set_projid(c, inode, fa.fsx_projid); + if (ret) + goto err_unlock; + + ret = bch2_write_inode(c, inode, fssetxattr_inode_update_fn, &s, + ATTR_CTIME); +err_unlock: + mutex_unlock(&inode->ei_update_lock); +err: + inode_unlock(&inode->v); + mnt_drop_write_file(file); + return ret; +} + +static int bch2_reinherit_attrs_fn(struct btree_trans *trans, + struct bch_inode_info *inode, + struct bch_inode_unpacked *bi, + void *p) +{ + struct bch_inode_info *dir = p; + + return !bch2_reinherit_attrs(bi, &dir->ei_inode); +} + +static int bch2_ioc_reinherit_attrs(struct bch_fs *c, + struct file *file, + struct bch_inode_info *src, + const char __user *name) +{ + struct bch_hash_info hash = bch2_hash_info_init(c, &src->ei_inode); + struct bch_inode_info *dst; + struct inode *vinode = NULL; + char *kname = NULL; + struct qstr qstr; + int ret = 0; + subvol_inum inum; + + kname = kmalloc(BCH_NAME_MAX + 1, GFP_KERNEL); + if (!kname) + return -ENOMEM; + + ret = strncpy_from_user(kname, name, BCH_NAME_MAX); + if (unlikely(ret < 0)) + goto err1; + + qstr.len = ret; + qstr.name = kname; + + ret = bch2_dirent_lookup(c, inode_inum(src), &hash, &qstr, &inum); + if (ret) + goto err1; + + vinode = bch2_vfs_inode_get(c, inum); + ret = PTR_ERR_OR_ZERO(vinode); + if (ret) + goto err1; + + dst = to_bch_ei(vinode); + + ret = mnt_want_write_file(file); + if (ret) + goto err2; + + bch2_lock_inodes(INODE_UPDATE_LOCK, src, dst); + + if (inode_attr_changing(src, dst, Inode_opt_project)) { + ret = bch2_fs_quota_transfer(c, dst, + src->ei_qid, + 1 << QTYP_PRJ, + KEY_TYPE_QUOTA_PREALLOC); + if (ret) + goto err3; + } + + ret = bch2_write_inode(c, dst, bch2_reinherit_attrs_fn, src, 0); +err3: + bch2_unlock_inodes(INODE_UPDATE_LOCK, src, dst); + + /* return true if we did work */ + if (ret >= 0) + ret = !ret; + + mnt_drop_write_file(file); +err2: + iput(vinode); +err1: + kfree(kname); + + return ret; +} + +static int bch2_ioc_goingdown(struct bch_fs *c, u32 __user *arg) +{ + u32 flags; + int ret = 0; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (get_user(flags, arg)) + return -EFAULT; + + bch_notice(c, "shutdown by ioctl type %u", flags); + + down_write(&c->vfs_sb->s_umount); + + switch (flags) { + case FSOP_GOING_FLAGS_DEFAULT: + ret = freeze_bdev(c->vfs_sb->s_bdev); + if (ret) + goto err; + + bch2_journal_flush(&c->journal); + c->vfs_sb->s_flags |= SB_RDONLY; + bch2_fs_emergency_read_only(c); + thaw_bdev(c->vfs_sb->s_bdev); + break; + + case FSOP_GOING_FLAGS_LOGFLUSH: + bch2_journal_flush(&c->journal); + fallthrough; + + case FSOP_GOING_FLAGS_NOLOGFLUSH: + c->vfs_sb->s_flags |= SB_RDONLY; + bch2_fs_emergency_read_only(c); + break; + default: + ret = -EINVAL; + break; + } +err: + up_write(&c->vfs_sb->s_umount); + return ret; +} + +static long __bch2_ioctl_subvolume_create(struct bch_fs *c, struct file *filp, + struct bch_ioctl_subvolume arg) +{ + struct inode *dir; + struct bch_inode_info *inode; + struct user_namespace *s_user_ns; + struct dentry *dst_dentry; + struct path src_path, dst_path; + int how = LOOKUP_FOLLOW; + int error; + subvol_inum snapshot_src = { 0 }; + unsigned lookup_flags = 0; + unsigned create_flags = BCH_CREATE_SUBVOL; + + if (arg.flags & ~(BCH_SUBVOL_SNAPSHOT_CREATE| + BCH_SUBVOL_SNAPSHOT_RO)) + return -EINVAL; + + if (!(arg.flags & BCH_SUBVOL_SNAPSHOT_CREATE) && + (arg.src_ptr || + (arg.flags & BCH_SUBVOL_SNAPSHOT_RO))) + return -EINVAL; + + if (arg.flags & BCH_SUBVOL_SNAPSHOT_CREATE) + create_flags |= BCH_CREATE_SNAPSHOT; + + if (arg.flags & BCH_SUBVOL_SNAPSHOT_RO) + create_flags |= BCH_CREATE_SNAPSHOT_RO; + + /* why do we need this lock? */ + down_read(&c->vfs_sb->s_umount); + + if (arg.flags & BCH_SUBVOL_SNAPSHOT_CREATE) + sync_inodes_sb(c->vfs_sb); +retry: + if (arg.src_ptr) { + error = user_path_at(arg.dirfd, + (const char __user *)(unsigned long)arg.src_ptr, + how, &src_path); + if (error) + goto err1; + + if (src_path.dentry->d_sb->s_fs_info != c) { + path_put(&src_path); + error = -EXDEV; + goto err1; + } + + snapshot_src = inode_inum(to_bch_ei(src_path.dentry->d_inode)); + } + + dst_dentry = user_path_create(arg.dirfd, + (const char __user *)(unsigned long)arg.dst_ptr, + &dst_path, lookup_flags); + error = PTR_ERR_OR_ZERO(dst_dentry); + if (error) + goto err2; + + if (dst_dentry->d_sb->s_fs_info != c) { + error = -EXDEV; + goto err3; + } + + if (dst_dentry->d_inode) { + error = -EEXIST; + goto err3; + } + + dir = dst_path.dentry->d_inode; + if (IS_DEADDIR(dir)) { + error = -BCH_ERR_ENOENT_directory_dead; + goto err3; + } + + s_user_ns = dir->i_sb->s_user_ns; + if (!kuid_has_mapping(s_user_ns, current_fsuid()) || + !kgid_has_mapping(s_user_ns, current_fsgid())) { + error = -EOVERFLOW; + goto err3; + } + + error = inode_permission(file_mnt_idmap(filp), + dir, MAY_WRITE | MAY_EXEC); + if (error) + goto err3; + + if (!IS_POSIXACL(dir)) + arg.mode &= ~current_umask(); + + error = security_path_mkdir(&dst_path, dst_dentry, arg.mode); + if (error) + goto err3; + + if ((arg.flags & BCH_SUBVOL_SNAPSHOT_CREATE) && + !arg.src_ptr) + snapshot_src.subvol = inode_inum(to_bch_ei(dir)).subvol; + + inode = __bch2_create(file_mnt_idmap(filp), to_bch_ei(dir), + dst_dentry, arg.mode|S_IFDIR, + 0, snapshot_src, create_flags); + error = PTR_ERR_OR_ZERO(inode); + if (error) + goto err3; + + d_instantiate(dst_dentry, &inode->v); + fsnotify_mkdir(dir, dst_dentry); +err3: + done_path_create(&dst_path, dst_dentry); +err2: + if (arg.src_ptr) + path_put(&src_path); + + if (retry_estale(error, lookup_flags)) { + lookup_flags |= LOOKUP_REVAL; + goto retry; + } +err1: + up_read(&c->vfs_sb->s_umount); + + return error; +} + +static long bch2_ioctl_subvolume_create(struct bch_fs *c, struct file *filp, + struct bch_ioctl_subvolume arg) +{ + down_write(&c->snapshot_create_lock); + long ret = __bch2_ioctl_subvolume_create(c, filp, arg); + up_write(&c->snapshot_create_lock); + + return ret; +} + +static long bch2_ioctl_subvolume_destroy(struct bch_fs *c, struct file *filp, + struct bch_ioctl_subvolume arg) +{ + struct path path; + struct inode *dir; + int ret = 0; + + if (arg.flags) + return -EINVAL; + + ret = user_path_at(arg.dirfd, + (const char __user *)(unsigned long)arg.dst_ptr, + LOOKUP_FOLLOW, &path); + if (ret) + return ret; + + if (path.dentry->d_sb->s_fs_info != c) { + ret = -EXDEV; + goto err; + } + + dir = path.dentry->d_parent->d_inode; + + ret = __bch2_unlink(dir, path.dentry, true); + if (ret) + goto err; + + fsnotify_rmdir(dir, path.dentry); + d_delete(path.dentry); +err: + path_put(&path); + return ret; +} + +long bch2_fs_file_ioctl(struct file *file, unsigned cmd, unsigned long arg) +{ + struct bch_inode_info *inode = file_bch_inode(file); + struct bch_fs *c = inode->v.i_sb->s_fs_info; + long ret; + + switch (cmd) { + case FS_IOC_GETFLAGS: + ret = bch2_ioc_getflags(inode, (int __user *) arg); + break; + + case FS_IOC_SETFLAGS: + ret = bch2_ioc_setflags(c, file, inode, (int __user *) arg); + break; + + case FS_IOC_FSGETXATTR: + ret = bch2_ioc_fsgetxattr(inode, (void __user *) arg); + break; + + case FS_IOC_FSSETXATTR: + ret = bch2_ioc_fssetxattr(c, file, inode, + (void __user *) arg); + break; + + case BCHFS_IOC_REINHERIT_ATTRS: + ret = bch2_ioc_reinherit_attrs(c, file, inode, + (void __user *) arg); + break; + + case FS_IOC_GETVERSION: + ret = -ENOTTY; + break; + + case FS_IOC_SETVERSION: + ret = -ENOTTY; + break; + + case FS_IOC_GOINGDOWN: + ret = bch2_ioc_goingdown(c, (u32 __user *) arg); + break; + + case BCH_IOCTL_SUBVOLUME_CREATE: { + struct bch_ioctl_subvolume i; + + ret = copy_from_user(&i, (void __user *) arg, sizeof(i)) + ? -EFAULT + : bch2_ioctl_subvolume_create(c, file, i); + break; + } + + case BCH_IOCTL_SUBVOLUME_DESTROY: { + struct bch_ioctl_subvolume i; + + ret = copy_from_user(&i, (void __user *) arg, sizeof(i)) + ? -EFAULT + : bch2_ioctl_subvolume_destroy(c, file, i); + break; + } + + default: + ret = bch2_fs_ioctl(c, cmd, (void __user *) arg); + break; + } + + return bch2_err_class(ret); +} + +#ifdef CONFIG_COMPAT +long bch2_compat_fs_ioctl(struct file *file, unsigned cmd, unsigned long arg) +{ + /* These are just misnamed, they actually get/put from/to user an int */ + switch (cmd) { + case FS_IOC_GETFLAGS: + cmd = FS_IOC_GETFLAGS; + break; + case FS_IOC32_SETFLAGS: + cmd = FS_IOC_SETFLAGS; + break; + default: + return -ENOIOCTLCMD; + } + return bch2_fs_file_ioctl(file, cmd, (unsigned long) compat_ptr(arg)); +} +#endif + +#endif /* NO_BCACHEFS_FS */ diff --git a/fs/bcachefs/fs-ioctl.h b/fs/bcachefs/fs-ioctl.h new file mode 100644 index 000000000000..d30f9bb056fd --- /dev/null +++ b/fs/bcachefs/fs-ioctl.h @@ -0,0 +1,81 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_FS_IOCTL_H +#define _BCACHEFS_FS_IOCTL_H + +/* Inode flags: */ + +/* bcachefs inode flags -> vfs inode flags: */ +static const __maybe_unused unsigned bch_flags_to_vfs[] = { + [__BCH_INODE_sync] = S_SYNC, + [__BCH_INODE_immutable] = S_IMMUTABLE, + [__BCH_INODE_append] = S_APPEND, + [__BCH_INODE_noatime] = S_NOATIME, +}; + +/* bcachefs inode flags -> FS_IOC_GETFLAGS: */ +static const __maybe_unused unsigned bch_flags_to_uflags[] = { + [__BCH_INODE_sync] = FS_SYNC_FL, + [__BCH_INODE_immutable] = FS_IMMUTABLE_FL, + [__BCH_INODE_append] = FS_APPEND_FL, + [__BCH_INODE_nodump] = FS_NODUMP_FL, + [__BCH_INODE_noatime] = FS_NOATIME_FL, +}; + +/* bcachefs inode flags -> FS_IOC_FSGETXATTR: */ +static const __maybe_unused unsigned bch_flags_to_xflags[] = { + [__BCH_INODE_sync] = FS_XFLAG_SYNC, + [__BCH_INODE_immutable] = FS_XFLAG_IMMUTABLE, + [__BCH_INODE_append] = FS_XFLAG_APPEND, + [__BCH_INODE_nodump] = FS_XFLAG_NODUMP, + [__BCH_INODE_noatime] = FS_XFLAG_NOATIME, + //[__BCH_INODE_PROJINHERIT] = FS_XFLAG_PROJINHERIT; +}; + +#define set_flags(_map, _in, _out) \ +do { \ + unsigned _i; \ + \ + for (_i = 0; _i < ARRAY_SIZE(_map); _i++) \ + if ((_in) & (1 << _i)) \ + (_out) |= _map[_i]; \ + else \ + (_out) &= ~_map[_i]; \ +} while (0) + +#define map_flags(_map, _in) \ +({ \ + unsigned _out = 0; \ + \ + set_flags(_map, _in, _out); \ + _out; \ +}) + +#define map_flags_rev(_map, _in) \ +({ \ + unsigned _i, _out = 0; \ + \ + for (_i = 0; _i < ARRAY_SIZE(_map); _i++) \ + if ((_in) & _map[_i]) { \ + (_out) |= 1 << _i; \ + (_in) &= ~_map[_i]; \ + } \ + (_out); \ +}) + +#define map_defined(_map) \ +({ \ + unsigned _in = ~0; \ + \ + map_flags_rev(_map, _in); \ +}) + +/* Set VFS inode flags from bcachefs inode: */ +static inline void bch2_inode_flags_to_vfs(struct bch_inode_info *inode) +{ + set_flags(bch_flags_to_vfs, inode->ei_inode.bi_flags, inode->v.i_flags); +} + +long bch2_fs_file_ioctl(struct file *, unsigned, unsigned long); +long bch2_compat_fs_ioctl(struct file *, unsigned, unsigned long); + +#endif /* _BCACHEFS_FS_IOCTL_H */ diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c new file mode 100644 index 000000000000..ba93e32d7708 --- /dev/null +++ b/fs/bcachefs/fs.c @@ -0,0 +1,1982 @@ +// SPDX-License-Identifier: GPL-2.0 +#ifndef NO_BCACHEFS_FS + +#include "bcachefs.h" +#include "acl.h" +#include "bkey_buf.h" +#include "btree_update.h" +#include "buckets.h" +#include "chardev.h" +#include "dirent.h" +#include "errcode.h" +#include "extents.h" +#include "fs.h" +#include "fs-common.h" +#include "fs-io.h" +#include "fs-ioctl.h" +#include "fs-io-buffered.h" +#include "fs-io-direct.h" +#include "fs-io-pagecache.h" +#include "fsck.h" +#include "inode.h" +#include "io_read.h" +#include "journal.h" +#include "keylist.h" +#include "quota.h" +#include "snapshot.h" +#include "super.h" +#include "xattr.h" + +#include <linux/aio.h> +#include <linux/backing-dev.h> +#include <linux/exportfs.h> +#include <linux/fiemap.h> +#include <linux/module.h> +#include <linux/pagemap.h> +#include <linux/posix_acl.h> +#include <linux/random.h> +#include <linux/seq_file.h> +#include <linux/statfs.h> +#include <linux/string.h> +#include <linux/xattr.h> + +static struct kmem_cache *bch2_inode_cache; + +static void bch2_vfs_inode_init(struct btree_trans *, subvol_inum, + struct bch_inode_info *, + struct bch_inode_unpacked *, + struct bch_subvolume *); + +void bch2_inode_update_after_write(struct btree_trans *trans, + struct bch_inode_info *inode, + struct bch_inode_unpacked *bi, + unsigned fields) +{ + struct bch_fs *c = trans->c; + + BUG_ON(bi->bi_inum != inode->v.i_ino); + + bch2_assert_pos_locked(trans, BTREE_ID_inodes, + POS(0, bi->bi_inum), + c->opts.inodes_use_key_cache); + + set_nlink(&inode->v, bch2_inode_nlink_get(bi)); + i_uid_write(&inode->v, bi->bi_uid); + i_gid_write(&inode->v, bi->bi_gid); + inode->v.i_mode = bi->bi_mode; + + if (fields & ATTR_ATIME) + inode_set_atime_to_ts(&inode->v, bch2_time_to_timespec(c, bi->bi_atime)); + if (fields & ATTR_MTIME) + inode_set_mtime_to_ts(&inode->v, bch2_time_to_timespec(c, bi->bi_mtime)); + if (fields & ATTR_CTIME) + inode_set_ctime_to_ts(&inode->v, bch2_time_to_timespec(c, bi->bi_ctime)); + + inode->ei_inode = *bi; + + bch2_inode_flags_to_vfs(inode); +} + +int __must_check bch2_write_inode(struct bch_fs *c, + struct bch_inode_info *inode, + inode_set_fn set, + void *p, unsigned fields) +{ + struct btree_trans *trans = bch2_trans_get(c); + struct btree_iter iter = { NULL }; + struct bch_inode_unpacked inode_u; + int ret; +retry: + bch2_trans_begin(trans); + + ret = bch2_inode_peek(trans, &iter, &inode_u, inode_inum(inode), + BTREE_ITER_INTENT) ?: + (set ? set(trans, inode, &inode_u, p) : 0) ?: + bch2_inode_write(trans, &iter, &inode_u) ?: + bch2_trans_commit(trans, NULL, NULL, BTREE_INSERT_NOFAIL); + + /* + * the btree node lock protects inode->ei_inode, not ei_update_lock; + * this is important for inode updates via bchfs_write_index_update + */ + if (!ret) + bch2_inode_update_after_write(trans, inode, &inode_u, fields); + + bch2_trans_iter_exit(trans, &iter); + + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + goto retry; + + bch2_fs_fatal_err_on(bch2_err_matches(ret, ENOENT), c, + "inode %u:%llu not found when updating", + inode_inum(inode).subvol, + inode_inum(inode).inum); + + bch2_trans_put(trans); + return ret < 0 ? ret : 0; +} + +int bch2_fs_quota_transfer(struct bch_fs *c, + struct bch_inode_info *inode, + struct bch_qid new_qid, + unsigned qtypes, + enum quota_acct_mode mode) +{ + unsigned i; + int ret; + + qtypes &= enabled_qtypes(c); + + for (i = 0; i < QTYP_NR; i++) + if (new_qid.q[i] == inode->ei_qid.q[i]) + qtypes &= ~(1U << i); + + if (!qtypes) + return 0; + + mutex_lock(&inode->ei_quota_lock); + + ret = bch2_quota_transfer(c, qtypes, new_qid, + inode->ei_qid, + inode->v.i_blocks + + inode->ei_quota_reserved, + mode); + if (!ret) + for (i = 0; i < QTYP_NR; i++) + if (qtypes & (1 << i)) + inode->ei_qid.q[i] = new_qid.q[i]; + + mutex_unlock(&inode->ei_quota_lock); + + return ret; +} + +static int bch2_iget5_test(struct inode *vinode, void *p) +{ + struct bch_inode_info *inode = to_bch_ei(vinode); + subvol_inum *inum = p; + + return inode->ei_subvol == inum->subvol && + inode->ei_inode.bi_inum == inum->inum; +} + +static int bch2_iget5_set(struct inode *vinode, void *p) +{ + struct bch_inode_info *inode = to_bch_ei(vinode); + subvol_inum *inum = p; + + inode->v.i_ino = inum->inum; + inode->ei_subvol = inum->subvol; + inode->ei_inode.bi_inum = inum->inum; + return 0; +} + +static unsigned bch2_inode_hash(subvol_inum inum) +{ + return jhash_3words(inum.subvol, inum.inum >> 32, inum.inum, JHASH_INITVAL); +} + +struct inode *bch2_vfs_inode_get(struct bch_fs *c, subvol_inum inum) +{ + struct bch_inode_unpacked inode_u; + struct bch_inode_info *inode; + struct btree_trans *trans; + struct bch_subvolume subvol; + int ret; + + inode = to_bch_ei(iget5_locked(c->vfs_sb, + bch2_inode_hash(inum), + bch2_iget5_test, + bch2_iget5_set, + &inum)); + if (unlikely(!inode)) + return ERR_PTR(-ENOMEM); + if (!(inode->v.i_state & I_NEW)) + return &inode->v; + + trans = bch2_trans_get(c); + ret = lockrestart_do(trans, + bch2_subvolume_get(trans, inum.subvol, true, 0, &subvol) ?: + bch2_inode_find_by_inum_trans(trans, inum, &inode_u)); + + if (!ret) + bch2_vfs_inode_init(trans, inum, inode, &inode_u, &subvol); + bch2_trans_put(trans); + + if (ret) { + iget_failed(&inode->v); + return ERR_PTR(bch2_err_class(ret)); + } + + mutex_lock(&c->vfs_inodes_lock); + list_add(&inode->ei_vfs_inode_list, &c->vfs_inodes_list); + mutex_unlock(&c->vfs_inodes_lock); + + unlock_new_inode(&inode->v); + + return &inode->v; +} + +struct bch_inode_info * +__bch2_create(struct mnt_idmap *idmap, + struct bch_inode_info *dir, struct dentry *dentry, + umode_t mode, dev_t rdev, subvol_inum snapshot_src, + unsigned flags) +{ + struct bch_fs *c = dir->v.i_sb->s_fs_info; + struct btree_trans *trans; + struct bch_inode_unpacked dir_u; + struct bch_inode_info *inode, *old; + struct bch_inode_unpacked inode_u; + struct posix_acl *default_acl = NULL, *acl = NULL; + subvol_inum inum; + struct bch_subvolume subvol; + u64 journal_seq = 0; + int ret; + + /* + * preallocate acls + vfs inode before btree transaction, so that + * nothing can fail after the transaction succeeds: + */ +#ifdef CONFIG_BCACHEFS_POSIX_ACL + ret = posix_acl_create(&dir->v, &mode, &default_acl, &acl); + if (ret) + return ERR_PTR(ret); +#endif + inode = to_bch_ei(new_inode(c->vfs_sb)); + if (unlikely(!inode)) { + inode = ERR_PTR(-ENOMEM); + goto err; + } + + bch2_inode_init_early(c, &inode_u); + + if (!(flags & BCH_CREATE_TMPFILE)) + mutex_lock(&dir->ei_update_lock); + + trans = bch2_trans_get(c); +retry: + bch2_trans_begin(trans); + + ret = bch2_create_trans(trans, + inode_inum(dir), &dir_u, &inode_u, + !(flags & BCH_CREATE_TMPFILE) + ? &dentry->d_name : NULL, + from_kuid(i_user_ns(&dir->v), current_fsuid()), + from_kgid(i_user_ns(&dir->v), current_fsgid()), + mode, rdev, + default_acl, acl, snapshot_src, flags) ?: + bch2_quota_acct(c, bch_qid(&inode_u), Q_INO, 1, + KEY_TYPE_QUOTA_PREALLOC); + if (unlikely(ret)) + goto err_before_quota; + + inum.subvol = inode_u.bi_subvol ?: dir->ei_subvol; + inum.inum = inode_u.bi_inum; + + ret = bch2_subvolume_get(trans, inum.subvol, true, + BTREE_ITER_WITH_UPDATES, &subvol) ?: + bch2_trans_commit(trans, NULL, &journal_seq, 0); + if (unlikely(ret)) { + bch2_quota_acct(c, bch_qid(&inode_u), Q_INO, -1, + KEY_TYPE_QUOTA_WARN); +err_before_quota: + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + goto retry; + goto err_trans; + } + + if (!(flags & BCH_CREATE_TMPFILE)) { + bch2_inode_update_after_write(trans, dir, &dir_u, + ATTR_MTIME|ATTR_CTIME); + mutex_unlock(&dir->ei_update_lock); + } + + bch2_iget5_set(&inode->v, &inum); + bch2_vfs_inode_init(trans, inum, inode, &inode_u, &subvol); + + set_cached_acl(&inode->v, ACL_TYPE_ACCESS, acl); + set_cached_acl(&inode->v, ACL_TYPE_DEFAULT, default_acl); + + /* + * we must insert the new inode into the inode cache before calling + * bch2_trans_exit() and dropping locks, else we could race with another + * thread pulling the inode in and modifying it: + */ + + inode->v.i_state |= I_CREATING; + + old = to_bch_ei(inode_insert5(&inode->v, + bch2_inode_hash(inum), + bch2_iget5_test, + bch2_iget5_set, + &inum)); + BUG_ON(!old); + + if (unlikely(old != inode)) { + /* + * We raced, another process pulled the new inode into cache + * before us: + */ + make_bad_inode(&inode->v); + iput(&inode->v); + + inode = old; + } else { + mutex_lock(&c->vfs_inodes_lock); + list_add(&inode->ei_vfs_inode_list, &c->vfs_inodes_list); + mutex_unlock(&c->vfs_inodes_lock); + /* + * we really don't want insert_inode_locked2() to be setting + * I_NEW... + */ + unlock_new_inode(&inode->v); + } + + bch2_trans_put(trans); +err: + posix_acl_release(default_acl); + posix_acl_release(acl); + return inode; +err_trans: + if (!(flags & BCH_CREATE_TMPFILE)) + mutex_unlock(&dir->ei_update_lock); + + bch2_trans_put(trans); + make_bad_inode(&inode->v); + iput(&inode->v); + inode = ERR_PTR(ret); + goto err; +} + +/* methods */ + +static struct dentry *bch2_lookup(struct inode *vdir, struct dentry *dentry, + unsigned int flags) +{ + struct bch_fs *c = vdir->i_sb->s_fs_info; + struct bch_inode_info *dir = to_bch_ei(vdir); + struct bch_hash_info hash = bch2_hash_info_init(c, &dir->ei_inode); + struct inode *vinode = NULL; + subvol_inum inum = { .subvol = 1 }; + int ret; + + ret = bch2_dirent_lookup(c, inode_inum(dir), &hash, + &dentry->d_name, &inum); + + if (!ret) + vinode = bch2_vfs_inode_get(c, inum); + + return d_splice_alias(vinode, dentry); +} + +static int bch2_mknod(struct mnt_idmap *idmap, + struct inode *vdir, struct dentry *dentry, + umode_t mode, dev_t rdev) +{ + struct bch_inode_info *inode = + __bch2_create(idmap, to_bch_ei(vdir), dentry, mode, rdev, + (subvol_inum) { 0 }, 0); + + if (IS_ERR(inode)) + return bch2_err_class(PTR_ERR(inode)); + + d_instantiate(dentry, &inode->v); + return 0; +} + +static int bch2_create(struct mnt_idmap *idmap, + struct inode *vdir, struct dentry *dentry, + umode_t mode, bool excl) +{ + return bch2_mknod(idmap, vdir, dentry, mode|S_IFREG, 0); +} + +static int __bch2_link(struct bch_fs *c, + struct bch_inode_info *inode, + struct bch_inode_info *dir, + struct dentry *dentry) +{ + struct btree_trans *trans = bch2_trans_get(c); + struct bch_inode_unpacked dir_u, inode_u; + int ret; + + mutex_lock(&inode->ei_update_lock); + + ret = commit_do(trans, NULL, NULL, 0, + bch2_link_trans(trans, + inode_inum(dir), &dir_u, + inode_inum(inode), &inode_u, + &dentry->d_name)); + + if (likely(!ret)) { + bch2_inode_update_after_write(trans, dir, &dir_u, + ATTR_MTIME|ATTR_CTIME); + bch2_inode_update_after_write(trans, inode, &inode_u, ATTR_CTIME); + } + + bch2_trans_put(trans); + mutex_unlock(&inode->ei_update_lock); + return ret; +} + +static int bch2_link(struct dentry *old_dentry, struct inode *vdir, + struct dentry *dentry) +{ + struct bch_fs *c = vdir->i_sb->s_fs_info; + struct bch_inode_info *dir = to_bch_ei(vdir); + struct bch_inode_info *inode = to_bch_ei(old_dentry->d_inode); + int ret; + + lockdep_assert_held(&inode->v.i_rwsem); + + ret = __bch2_link(c, inode, dir, dentry); + if (unlikely(ret)) + return ret; + + ihold(&inode->v); + d_instantiate(dentry, &inode->v); + return 0; +} + +int __bch2_unlink(struct inode *vdir, struct dentry *dentry, + bool deleting_snapshot) +{ + struct bch_fs *c = vdir->i_sb->s_fs_info; + struct bch_inode_info *dir = to_bch_ei(vdir); + struct bch_inode_info *inode = to_bch_ei(dentry->d_inode); + struct bch_inode_unpacked dir_u, inode_u; + struct btree_trans *trans = bch2_trans_get(c); + int ret; + + bch2_lock_inodes(INODE_UPDATE_LOCK, dir, inode); + + ret = commit_do(trans, NULL, NULL, + BTREE_INSERT_NOFAIL, + bch2_unlink_trans(trans, + inode_inum(dir), &dir_u, + &inode_u, &dentry->d_name, + deleting_snapshot)); + if (unlikely(ret)) + goto err; + + bch2_inode_update_after_write(trans, dir, &dir_u, + ATTR_MTIME|ATTR_CTIME); + bch2_inode_update_after_write(trans, inode, &inode_u, + ATTR_MTIME); + + if (inode_u.bi_subvol) { + /* + * Subvolume deletion is asynchronous, but we still want to tell + * the VFS that it's been deleted here: + */ + set_nlink(&inode->v, 0); + } +err: + bch2_unlock_inodes(INODE_UPDATE_LOCK, dir, inode); + bch2_trans_put(trans); + + return ret; +} + +static int bch2_unlink(struct inode *vdir, struct dentry *dentry) +{ + return __bch2_unlink(vdir, dentry, false); +} + +static int bch2_symlink(struct mnt_idmap *idmap, + struct inode *vdir, struct dentry *dentry, + const char *symname) +{ + struct bch_fs *c = vdir->i_sb->s_fs_info; + struct bch_inode_info *dir = to_bch_ei(vdir), *inode; + int ret; + + inode = __bch2_create(idmap, dir, dentry, S_IFLNK|S_IRWXUGO, 0, + (subvol_inum) { 0 }, BCH_CREATE_TMPFILE); + if (IS_ERR(inode)) + return bch2_err_class(PTR_ERR(inode)); + + inode_lock(&inode->v); + ret = page_symlink(&inode->v, symname, strlen(symname) + 1); + inode_unlock(&inode->v); + + if (unlikely(ret)) + goto err; + + ret = filemap_write_and_wait_range(inode->v.i_mapping, 0, LLONG_MAX); + if (unlikely(ret)) + goto err; + + ret = __bch2_link(c, inode, dir, dentry); + if (unlikely(ret)) + goto err; + + d_instantiate(dentry, &inode->v); + return 0; +err: + iput(&inode->v); + return ret; +} + +static int bch2_mkdir(struct mnt_idmap *idmap, + struct inode *vdir, struct dentry *dentry, umode_t mode) +{ + return bch2_mknod(idmap, vdir, dentry, mode|S_IFDIR, 0); +} + +static int bch2_rename2(struct mnt_idmap *idmap, + struct inode *src_vdir, struct dentry *src_dentry, + struct inode *dst_vdir, struct dentry *dst_dentry, + unsigned flags) +{ + struct bch_fs *c = src_vdir->i_sb->s_fs_info; + struct bch_inode_info *src_dir = to_bch_ei(src_vdir); + struct bch_inode_info *dst_dir = to_bch_ei(dst_vdir); + struct bch_inode_info *src_inode = to_bch_ei(src_dentry->d_inode); + struct bch_inode_info *dst_inode = to_bch_ei(dst_dentry->d_inode); + struct bch_inode_unpacked dst_dir_u, src_dir_u; + struct bch_inode_unpacked src_inode_u, dst_inode_u; + struct btree_trans *trans; + enum bch_rename_mode mode = flags & RENAME_EXCHANGE + ? BCH_RENAME_EXCHANGE + : dst_dentry->d_inode + ? BCH_RENAME_OVERWRITE : BCH_RENAME; + int ret; + + if (flags & ~(RENAME_NOREPLACE|RENAME_EXCHANGE)) + return -EINVAL; + + if (mode == BCH_RENAME_OVERWRITE) { + ret = filemap_write_and_wait_range(src_inode->v.i_mapping, + 0, LLONG_MAX); + if (ret) + return ret; + } + + trans = bch2_trans_get(c); + + bch2_lock_inodes(INODE_UPDATE_LOCK, + src_dir, + dst_dir, + src_inode, + dst_inode); + + if (inode_attr_changing(dst_dir, src_inode, Inode_opt_project)) { + ret = bch2_fs_quota_transfer(c, src_inode, + dst_dir->ei_qid, + 1 << QTYP_PRJ, + KEY_TYPE_QUOTA_PREALLOC); + if (ret) + goto err; + } + + if (mode == BCH_RENAME_EXCHANGE && + inode_attr_changing(src_dir, dst_inode, Inode_opt_project)) { + ret = bch2_fs_quota_transfer(c, dst_inode, + src_dir->ei_qid, + 1 << QTYP_PRJ, + KEY_TYPE_QUOTA_PREALLOC); + if (ret) + goto err; + } + + ret = commit_do(trans, NULL, NULL, 0, + bch2_rename_trans(trans, + inode_inum(src_dir), &src_dir_u, + inode_inum(dst_dir), &dst_dir_u, + &src_inode_u, + &dst_inode_u, + &src_dentry->d_name, + &dst_dentry->d_name, + mode)); + if (unlikely(ret)) + goto err; + + BUG_ON(src_inode->v.i_ino != src_inode_u.bi_inum); + BUG_ON(dst_inode && + dst_inode->v.i_ino != dst_inode_u.bi_inum); + + bch2_inode_update_after_write(trans, src_dir, &src_dir_u, + ATTR_MTIME|ATTR_CTIME); + + if (src_dir != dst_dir) + bch2_inode_update_after_write(trans, dst_dir, &dst_dir_u, + ATTR_MTIME|ATTR_CTIME); + + bch2_inode_update_after_write(trans, src_inode, &src_inode_u, + ATTR_CTIME); + + if (dst_inode) + bch2_inode_update_after_write(trans, dst_inode, &dst_inode_u, + ATTR_CTIME); +err: + bch2_trans_put(trans); + + bch2_fs_quota_transfer(c, src_inode, + bch_qid(&src_inode->ei_inode), + 1 << QTYP_PRJ, + KEY_TYPE_QUOTA_NOCHECK); + if (dst_inode) + bch2_fs_quota_transfer(c, dst_inode, + bch_qid(&dst_inode->ei_inode), + 1 << QTYP_PRJ, + KEY_TYPE_QUOTA_NOCHECK); + + bch2_unlock_inodes(INODE_UPDATE_LOCK, + src_dir, + dst_dir, + src_inode, + dst_inode); + + return ret; +} + +static void bch2_setattr_copy(struct mnt_idmap *idmap, + struct bch_inode_info *inode, + struct bch_inode_unpacked *bi, + struct iattr *attr) +{ + struct bch_fs *c = inode->v.i_sb->s_fs_info; + unsigned int ia_valid = attr->ia_valid; + + if (ia_valid & ATTR_UID) + bi->bi_uid = from_kuid(i_user_ns(&inode->v), attr->ia_uid); + if (ia_valid & ATTR_GID) + bi->bi_gid = from_kgid(i_user_ns(&inode->v), attr->ia_gid); + + if (ia_valid & ATTR_SIZE) + bi->bi_size = attr->ia_size; + + if (ia_valid & ATTR_ATIME) + bi->bi_atime = timespec_to_bch2_time(c, attr->ia_atime); + if (ia_valid & ATTR_MTIME) + bi->bi_mtime = timespec_to_bch2_time(c, attr->ia_mtime); + if (ia_valid & ATTR_CTIME) + bi->bi_ctime = timespec_to_bch2_time(c, attr->ia_ctime); + + if (ia_valid & ATTR_MODE) { + umode_t mode = attr->ia_mode; + kgid_t gid = ia_valid & ATTR_GID + ? attr->ia_gid + : inode->v.i_gid; + + if (!in_group_p(gid) && + !capable_wrt_inode_uidgid(idmap, &inode->v, CAP_FSETID)) + mode &= ~S_ISGID; + bi->bi_mode = mode; + } +} + +int bch2_setattr_nonsize(struct mnt_idmap *idmap, + struct bch_inode_info *inode, + struct iattr *attr) +{ + struct bch_fs *c = inode->v.i_sb->s_fs_info; + struct bch_qid qid; + struct btree_trans *trans; + struct btree_iter inode_iter = { NULL }; + struct bch_inode_unpacked inode_u; + struct posix_acl *acl = NULL; + int ret; + + mutex_lock(&inode->ei_update_lock); + + qid = inode->ei_qid; + + if (attr->ia_valid & ATTR_UID) + qid.q[QTYP_USR] = from_kuid(i_user_ns(&inode->v), attr->ia_uid); + + if (attr->ia_valid & ATTR_GID) + qid.q[QTYP_GRP] = from_kgid(i_user_ns(&inode->v), attr->ia_gid); + + ret = bch2_fs_quota_transfer(c, inode, qid, ~0, + KEY_TYPE_QUOTA_PREALLOC); + if (ret) + goto err; + + trans = bch2_trans_get(c); +retry: + bch2_trans_begin(trans); + kfree(acl); + acl = NULL; + + ret = bch2_inode_peek(trans, &inode_iter, &inode_u, inode_inum(inode), + BTREE_ITER_INTENT); + if (ret) + goto btree_err; + + bch2_setattr_copy(idmap, inode, &inode_u, attr); + + if (attr->ia_valid & ATTR_MODE) { + ret = bch2_acl_chmod(trans, inode_inum(inode), &inode_u, + inode_u.bi_mode, &acl); + if (ret) + goto btree_err; + } + + ret = bch2_inode_write(trans, &inode_iter, &inode_u) ?: + bch2_trans_commit(trans, NULL, NULL, + BTREE_INSERT_NOFAIL); +btree_err: + bch2_trans_iter_exit(trans, &inode_iter); + + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + goto retry; + if (unlikely(ret)) + goto err_trans; + + bch2_inode_update_after_write(trans, inode, &inode_u, attr->ia_valid); + + if (acl) + set_cached_acl(&inode->v, ACL_TYPE_ACCESS, acl); +err_trans: + bch2_trans_put(trans); +err: + mutex_unlock(&inode->ei_update_lock); + + return bch2_err_class(ret); +} + +static int bch2_getattr(struct mnt_idmap *idmap, + const struct path *path, struct kstat *stat, + u32 request_mask, unsigned query_flags) +{ + struct bch_inode_info *inode = to_bch_ei(d_inode(path->dentry)); + struct bch_fs *c = inode->v.i_sb->s_fs_info; + + stat->dev = inode->v.i_sb->s_dev; + stat->ino = inode->v.i_ino; + stat->mode = inode->v.i_mode; + stat->nlink = inode->v.i_nlink; + stat->uid = inode->v.i_uid; + stat->gid = inode->v.i_gid; + stat->rdev = inode->v.i_rdev; + stat->size = i_size_read(&inode->v); + stat->atime = inode_get_atime(&inode->v); + stat->mtime = inode_get_mtime(&inode->v); + stat->ctime = inode_get_ctime(&inode->v); + stat->blksize = block_bytes(c); + stat->blocks = inode->v.i_blocks; + + if (request_mask & STATX_BTIME) { + stat->result_mask |= STATX_BTIME; + stat->btime = bch2_time_to_timespec(c, inode->ei_inode.bi_otime); + } + + if (inode->ei_inode.bi_flags & BCH_INODE_immutable) + stat->attributes |= STATX_ATTR_IMMUTABLE; + stat->attributes_mask |= STATX_ATTR_IMMUTABLE; + + if (inode->ei_inode.bi_flags & BCH_INODE_append) + stat->attributes |= STATX_ATTR_APPEND; + stat->attributes_mask |= STATX_ATTR_APPEND; + + if (inode->ei_inode.bi_flags & BCH_INODE_nodump) + stat->attributes |= STATX_ATTR_NODUMP; + stat->attributes_mask |= STATX_ATTR_NODUMP; + + return 0; +} + +static int bch2_setattr(struct mnt_idmap *idmap, + struct dentry *dentry, struct iattr *iattr) +{ + struct bch_inode_info *inode = to_bch_ei(dentry->d_inode); + int ret; + + lockdep_assert_held(&inode->v.i_rwsem); + + ret = setattr_prepare(idmap, dentry, iattr); + if (ret) + return ret; + + return iattr->ia_valid & ATTR_SIZE + ? bchfs_truncate(idmap, inode, iattr) + : bch2_setattr_nonsize(idmap, inode, iattr); +} + +static int bch2_tmpfile(struct mnt_idmap *idmap, + struct inode *vdir, struct file *file, umode_t mode) +{ + struct bch_inode_info *inode = + __bch2_create(idmap, to_bch_ei(vdir), + file->f_path.dentry, mode, 0, + (subvol_inum) { 0 }, BCH_CREATE_TMPFILE); + + if (IS_ERR(inode)) + return bch2_err_class(PTR_ERR(inode)); + + d_mark_tmpfile(file, &inode->v); + d_instantiate(file->f_path.dentry, &inode->v); + return finish_open_simple(file, 0); +} + +static int bch2_fill_extent(struct bch_fs *c, + struct fiemap_extent_info *info, + struct bkey_s_c k, unsigned flags) +{ + if (bkey_extent_is_direct_data(k.k)) { + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + const union bch_extent_entry *entry; + struct extent_ptr_decoded p; + int ret; + + if (k.k->type == KEY_TYPE_reflink_v) + flags |= FIEMAP_EXTENT_SHARED; + + bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { + int flags2 = 0; + u64 offset = p.ptr.offset; + + if (p.ptr.unwritten) + flags2 |= FIEMAP_EXTENT_UNWRITTEN; + + if (p.crc.compression_type) + flags2 |= FIEMAP_EXTENT_ENCODED; + else + offset += p.crc.offset; + + if ((offset & (block_sectors(c) - 1)) || + (k.k->size & (block_sectors(c) - 1))) + flags2 |= FIEMAP_EXTENT_NOT_ALIGNED; + + ret = fiemap_fill_next_extent(info, + bkey_start_offset(k.k) << 9, + offset << 9, + k.k->size << 9, flags|flags2); + if (ret) + return ret; + } + + return 0; + } else if (bkey_extent_is_inline_data(k.k)) { + return fiemap_fill_next_extent(info, + bkey_start_offset(k.k) << 9, + 0, k.k->size << 9, + flags| + FIEMAP_EXTENT_DATA_INLINE); + } else if (k.k->type == KEY_TYPE_reservation) { + return fiemap_fill_next_extent(info, + bkey_start_offset(k.k) << 9, + 0, k.k->size << 9, + flags| + FIEMAP_EXTENT_DELALLOC| + FIEMAP_EXTENT_UNWRITTEN); + } else { + BUG(); + } +} + +static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info, + u64 start, u64 len) +{ + struct bch_fs *c = vinode->i_sb->s_fs_info; + struct bch_inode_info *ei = to_bch_ei(vinode); + struct btree_trans *trans; + struct btree_iter iter; + struct bkey_s_c k; + struct bkey_buf cur, prev; + struct bpos end = POS(ei->v.i_ino, (start + len) >> 9); + unsigned offset_into_extent, sectors; + bool have_extent = false; + u32 snapshot; + int ret = 0; + + ret = fiemap_prep(&ei->v, info, start, &len, FIEMAP_FLAG_SYNC); + if (ret) + return ret; + + if (start + len < start) + return -EINVAL; + + start >>= 9; + + bch2_bkey_buf_init(&cur); + bch2_bkey_buf_init(&prev); + trans = bch2_trans_get(c); +retry: + bch2_trans_begin(trans); + + ret = bch2_subvolume_get_snapshot(trans, ei->ei_subvol, &snapshot); + if (ret) + goto err; + + bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, + SPOS(ei->v.i_ino, start, snapshot), 0); + + while (!(ret = btree_trans_too_many_iters(trans)) && + (k = bch2_btree_iter_peek_upto(&iter, end)).k && + !(ret = bkey_err(k))) { + enum btree_id data_btree = BTREE_ID_extents; + + if (!bkey_extent_is_data(k.k) && + k.k->type != KEY_TYPE_reservation) { + bch2_btree_iter_advance(&iter); + continue; + } + + offset_into_extent = iter.pos.offset - + bkey_start_offset(k.k); + sectors = k.k->size - offset_into_extent; + + bch2_bkey_buf_reassemble(&cur, c, k); + + ret = bch2_read_indirect_extent(trans, &data_btree, + &offset_into_extent, &cur); + if (ret) + break; + + k = bkey_i_to_s_c(cur.k); + bch2_bkey_buf_realloc(&prev, c, k.k->u64s); + + sectors = min(sectors, k.k->size - offset_into_extent); + + bch2_cut_front(POS(k.k->p.inode, + bkey_start_offset(k.k) + + offset_into_extent), + cur.k); + bch2_key_resize(&cur.k->k, sectors); + cur.k->k.p = iter.pos; + cur.k->k.p.offset += cur.k->k.size; + + if (have_extent) { + bch2_trans_unlock(trans); + ret = bch2_fill_extent(c, info, + bkey_i_to_s_c(prev.k), 0); + if (ret) + break; + } + + bkey_copy(prev.k, cur.k); + have_extent = true; + + bch2_btree_iter_set_pos(&iter, + POS(iter.pos.inode, iter.pos.offset + sectors)); + } + start = iter.pos.offset; + bch2_trans_iter_exit(trans, &iter); +err: + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + goto retry; + + if (!ret && have_extent) { + bch2_trans_unlock(trans); + ret = bch2_fill_extent(c, info, bkey_i_to_s_c(prev.k), + FIEMAP_EXTENT_LAST); + } + + bch2_trans_put(trans); + bch2_bkey_buf_exit(&cur, c); + bch2_bkey_buf_exit(&prev, c); + return ret < 0 ? ret : 0; +} + +static const struct vm_operations_struct bch_vm_ops = { + .fault = bch2_page_fault, + .map_pages = filemap_map_pages, + .page_mkwrite = bch2_page_mkwrite, +}; + +static int bch2_mmap(struct file *file, struct vm_area_struct *vma) +{ + file_accessed(file); + + vma->vm_ops = &bch_vm_ops; + return 0; +} + +/* Directories: */ + +static loff_t bch2_dir_llseek(struct file *file, loff_t offset, int whence) +{ + return generic_file_llseek_size(file, offset, whence, + S64_MAX, S64_MAX); +} + +static int bch2_vfs_readdir(struct file *file, struct dir_context *ctx) +{ + struct bch_inode_info *inode = file_bch_inode(file); + struct bch_fs *c = inode->v.i_sb->s_fs_info; + int ret; + + if (!dir_emit_dots(file, ctx)) + return 0; + + ret = bch2_readdir(c, inode_inum(inode), ctx); + if (ret) + bch_err_fn(c, ret); + + return bch2_err_class(ret); +} + +static const struct file_operations bch_file_operations = { + .llseek = bch2_llseek, + .read_iter = bch2_read_iter, + .write_iter = bch2_write_iter, + .mmap = bch2_mmap, + .open = generic_file_open, + .fsync = bch2_fsync, + .splice_read = filemap_splice_read, + .splice_write = iter_file_splice_write, + .fallocate = bch2_fallocate_dispatch, + .unlocked_ioctl = bch2_fs_file_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = bch2_compat_fs_ioctl, +#endif + .remap_file_range = bch2_remap_file_range, +}; + +static const struct inode_operations bch_file_inode_operations = { + .getattr = bch2_getattr, + .setattr = bch2_setattr, + .fiemap = bch2_fiemap, + .listxattr = bch2_xattr_list, +#ifdef CONFIG_BCACHEFS_POSIX_ACL + .get_acl = bch2_get_acl, + .set_acl = bch2_set_acl, +#endif +}; + +static const struct inode_operations bch_dir_inode_operations = { + .lookup = bch2_lookup, + .create = bch2_create, + .link = bch2_link, + .unlink = bch2_unlink, + .symlink = bch2_symlink, + .mkdir = bch2_mkdir, + .rmdir = bch2_unlink, + .mknod = bch2_mknod, + .rename = bch2_rename2, + .getattr = bch2_getattr, + .setattr = bch2_setattr, + .tmpfile = bch2_tmpfile, + .listxattr = bch2_xattr_list, +#ifdef CONFIG_BCACHEFS_POSIX_ACL + .get_acl = bch2_get_acl, + .set_acl = bch2_set_acl, +#endif +}; + +static const struct file_operations bch_dir_file_operations = { + .llseek = bch2_dir_llseek, + .read = generic_read_dir, + .iterate_shared = bch2_vfs_readdir, + .fsync = bch2_fsync, + .unlocked_ioctl = bch2_fs_file_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = bch2_compat_fs_ioctl, +#endif +}; + +static const struct inode_operations bch_symlink_inode_operations = { + .get_link = page_get_link, + .getattr = bch2_getattr, + .setattr = bch2_setattr, + .listxattr = bch2_xattr_list, +#ifdef CONFIG_BCACHEFS_POSIX_ACL + .get_acl = bch2_get_acl, + .set_acl = bch2_set_acl, +#endif +}; + +static const struct inode_operations bch_special_inode_operations = { + .getattr = bch2_getattr, + .setattr = bch2_setattr, + .listxattr = bch2_xattr_list, +#ifdef CONFIG_BCACHEFS_POSIX_ACL + .get_acl = bch2_get_acl, + .set_acl = bch2_set_acl, +#endif +}; + +static const struct address_space_operations bch_address_space_operations = { + .read_folio = bch2_read_folio, + .writepages = bch2_writepages, + .readahead = bch2_readahead, + .dirty_folio = filemap_dirty_folio, + .write_begin = bch2_write_begin, + .write_end = bch2_write_end, + .invalidate_folio = bch2_invalidate_folio, + .release_folio = bch2_release_folio, + .direct_IO = noop_direct_IO, +#ifdef CONFIG_MIGRATION + .migrate_folio = filemap_migrate_folio, +#endif + .error_remove_page = generic_error_remove_page, +}; + +struct bcachefs_fid { + u64 inum; + u32 subvol; + u32 gen; +} __packed; + +struct bcachefs_fid_with_parent { + struct bcachefs_fid fid; + struct bcachefs_fid dir; +} __packed; + +static int bcachefs_fid_valid(int fh_len, int fh_type) +{ + switch (fh_type) { + case FILEID_BCACHEFS_WITHOUT_PARENT: + return fh_len == sizeof(struct bcachefs_fid) / sizeof(u32); + case FILEID_BCACHEFS_WITH_PARENT: + return fh_len == sizeof(struct bcachefs_fid_with_parent) / sizeof(u32); + default: + return false; + } +} + +static struct bcachefs_fid bch2_inode_to_fid(struct bch_inode_info *inode) +{ + return (struct bcachefs_fid) { + .inum = inode->ei_inode.bi_inum, + .subvol = inode->ei_subvol, + .gen = inode->ei_inode.bi_generation, + }; +} + +static int bch2_encode_fh(struct inode *vinode, u32 *fh, int *len, + struct inode *vdir) +{ + struct bch_inode_info *inode = to_bch_ei(vinode); + struct bch_inode_info *dir = to_bch_ei(vdir); + int min_len; + + if (!S_ISDIR(inode->v.i_mode) && dir) { + struct bcachefs_fid_with_parent *fid = (void *) fh; + + min_len = sizeof(*fid) / sizeof(u32); + if (*len < min_len) { + *len = min_len; + return FILEID_INVALID; + } + + fid->fid = bch2_inode_to_fid(inode); + fid->dir = bch2_inode_to_fid(dir); + + *len = min_len; + return FILEID_BCACHEFS_WITH_PARENT; + } else { + struct bcachefs_fid *fid = (void *) fh; + + min_len = sizeof(*fid) / sizeof(u32); + if (*len < min_len) { + *len = min_len; + return FILEID_INVALID; + } + *fid = bch2_inode_to_fid(inode); + + *len = min_len; + return FILEID_BCACHEFS_WITHOUT_PARENT; + } +} + +static struct inode *bch2_nfs_get_inode(struct super_block *sb, + struct bcachefs_fid fid) +{ + struct bch_fs *c = sb->s_fs_info; + struct inode *vinode = bch2_vfs_inode_get(c, (subvol_inum) { + .subvol = fid.subvol, + .inum = fid.inum, + }); + if (!IS_ERR(vinode) && vinode->i_generation != fid.gen) { + iput(vinode); + vinode = ERR_PTR(-ESTALE); + } + return vinode; +} + +static struct dentry *bch2_fh_to_dentry(struct super_block *sb, struct fid *_fid, + int fh_len, int fh_type) +{ + struct bcachefs_fid *fid = (void *) _fid; + + if (!bcachefs_fid_valid(fh_len, fh_type)) + return NULL; + + return d_obtain_alias(bch2_nfs_get_inode(sb, *fid)); +} + +static struct dentry *bch2_fh_to_parent(struct super_block *sb, struct fid *_fid, + int fh_len, int fh_type) +{ + struct bcachefs_fid_with_parent *fid = (void *) _fid; + + if (!bcachefs_fid_valid(fh_len, fh_type) || + fh_type != FILEID_BCACHEFS_WITH_PARENT) + return NULL; + + return d_obtain_alias(bch2_nfs_get_inode(sb, fid->dir)); +} + +static struct dentry *bch2_get_parent(struct dentry *child) +{ + struct bch_inode_info *inode = to_bch_ei(child->d_inode); + struct bch_fs *c = inode->v.i_sb->s_fs_info; + subvol_inum parent_inum = { + .subvol = inode->ei_inode.bi_parent_subvol ?: + inode->ei_subvol, + .inum = inode->ei_inode.bi_dir, + }; + + return d_obtain_alias(bch2_vfs_inode_get(c, parent_inum)); +} + +static int bch2_get_name(struct dentry *parent, char *name, struct dentry *child) +{ + struct bch_inode_info *inode = to_bch_ei(child->d_inode); + struct bch_inode_info *dir = to_bch_ei(parent->d_inode); + struct bch_fs *c = inode->v.i_sb->s_fs_info; + struct btree_trans *trans; + struct btree_iter iter1; + struct btree_iter iter2; + struct bkey_s_c k; + struct bkey_s_c_dirent d; + struct bch_inode_unpacked inode_u; + subvol_inum target; + u32 snapshot; + struct qstr dirent_name; + unsigned name_len = 0; + int ret; + + if (!S_ISDIR(dir->v.i_mode)) + return -EINVAL; + + trans = bch2_trans_get(c); + + bch2_trans_iter_init(trans, &iter1, BTREE_ID_dirents, + POS(dir->ei_inode.bi_inum, 0), 0); + bch2_trans_iter_init(trans, &iter2, BTREE_ID_dirents, + POS(dir->ei_inode.bi_inum, 0), 0); +retry: + bch2_trans_begin(trans); + + ret = bch2_subvolume_get_snapshot(trans, dir->ei_subvol, &snapshot); + if (ret) + goto err; + + bch2_btree_iter_set_snapshot(&iter1, snapshot); + bch2_btree_iter_set_snapshot(&iter2, snapshot); + + ret = bch2_inode_find_by_inum_trans(trans, inode_inum(inode), &inode_u); + if (ret) + goto err; + + if (inode_u.bi_dir == dir->ei_inode.bi_inum) { + bch2_btree_iter_set_pos(&iter1, POS(inode_u.bi_dir, inode_u.bi_dir_offset)); + + k = bch2_btree_iter_peek_slot(&iter1); + ret = bkey_err(k); + if (ret) + goto err; + + if (k.k->type != KEY_TYPE_dirent) { + ret = -BCH_ERR_ENOENT_dirent_doesnt_match_inode; + goto err; + } + + d = bkey_s_c_to_dirent(k); + ret = bch2_dirent_read_target(trans, inode_inum(dir), d, &target); + if (ret > 0) + ret = -BCH_ERR_ENOENT_dirent_doesnt_match_inode; + if (ret) + goto err; + + if (target.subvol == inode->ei_subvol && + target.inum == inode->ei_inode.bi_inum) + goto found; + } else { + /* + * File with multiple hardlinks and our backref is to the wrong + * directory - linear search: + */ + for_each_btree_key_continue_norestart(iter2, 0, k, ret) { + if (k.k->p.inode > dir->ei_inode.bi_inum) + break; + + if (k.k->type != KEY_TYPE_dirent) + continue; + + d = bkey_s_c_to_dirent(k); + ret = bch2_dirent_read_target(trans, inode_inum(dir), d, &target); + if (ret < 0) + break; + if (ret) + continue; + + if (target.subvol == inode->ei_subvol && + target.inum == inode->ei_inode.bi_inum) + goto found; + } + } + + ret = -ENOENT; + goto err; +found: + dirent_name = bch2_dirent_get_name(d); + + name_len = min_t(unsigned, dirent_name.len, NAME_MAX); + memcpy(name, dirent_name.name, name_len); + name[name_len] = '\0'; +err: + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + goto retry; + + bch2_trans_iter_exit(trans, &iter1); + bch2_trans_iter_exit(trans, &iter2); + bch2_trans_put(trans); + + return ret; +} + +static const struct export_operations bch_export_ops = { + .encode_fh = bch2_encode_fh, + .fh_to_dentry = bch2_fh_to_dentry, + .fh_to_parent = bch2_fh_to_parent, + .get_parent = bch2_get_parent, + .get_name = bch2_get_name, +}; + +static void bch2_vfs_inode_init(struct btree_trans *trans, subvol_inum inum, + struct bch_inode_info *inode, + struct bch_inode_unpacked *bi, + struct bch_subvolume *subvol) +{ + bch2_inode_update_after_write(trans, inode, bi, ~0); + + if (BCH_SUBVOLUME_SNAP(subvol)) + set_bit(EI_INODE_SNAPSHOT, &inode->ei_flags); + else + clear_bit(EI_INODE_SNAPSHOT, &inode->ei_flags); + + inode->v.i_blocks = bi->bi_sectors; + inode->v.i_ino = bi->bi_inum; + inode->v.i_rdev = bi->bi_dev; + inode->v.i_generation = bi->bi_generation; + inode->v.i_size = bi->bi_size; + + inode->ei_flags = 0; + inode->ei_quota_reserved = 0; + inode->ei_qid = bch_qid(bi); + inode->ei_subvol = inum.subvol; + + inode->v.i_mapping->a_ops = &bch_address_space_operations; + + switch (inode->v.i_mode & S_IFMT) { + case S_IFREG: + inode->v.i_op = &bch_file_inode_operations; + inode->v.i_fop = &bch_file_operations; + break; + case S_IFDIR: + inode->v.i_op = &bch_dir_inode_operations; + inode->v.i_fop = &bch_dir_file_operations; + break; + case S_IFLNK: + inode_nohighmem(&inode->v); + inode->v.i_op = &bch_symlink_inode_operations; + break; + default: + init_special_inode(&inode->v, inode->v.i_mode, inode->v.i_rdev); + inode->v.i_op = &bch_special_inode_operations; + break; + } + + mapping_set_large_folios(inode->v.i_mapping); +} + +static struct inode *bch2_alloc_inode(struct super_block *sb) +{ + struct bch_inode_info *inode; + + inode = kmem_cache_alloc(bch2_inode_cache, GFP_NOFS); + if (!inode) + return NULL; + + inode_init_once(&inode->v); + mutex_init(&inode->ei_update_lock); + two_state_lock_init(&inode->ei_pagecache_lock); + INIT_LIST_HEAD(&inode->ei_vfs_inode_list); + mutex_init(&inode->ei_quota_lock); + + return &inode->v; +} + +static void bch2_i_callback(struct rcu_head *head) +{ + struct inode *vinode = container_of(head, struct inode, i_rcu); + struct bch_inode_info *inode = to_bch_ei(vinode); + + kmem_cache_free(bch2_inode_cache, inode); +} + +static void bch2_destroy_inode(struct inode *vinode) +{ + call_rcu(&vinode->i_rcu, bch2_i_callback); +} + +static int inode_update_times_fn(struct btree_trans *trans, + struct bch_inode_info *inode, + struct bch_inode_unpacked *bi, + void *p) +{ + struct bch_fs *c = inode->v.i_sb->s_fs_info; + + bi->bi_atime = timespec_to_bch2_time(c, inode_get_atime(&inode->v)); + bi->bi_mtime = timespec_to_bch2_time(c, inode_get_mtime(&inode->v)); + bi->bi_ctime = timespec_to_bch2_time(c, inode_get_ctime(&inode->v)); + + return 0; +} + +static int bch2_vfs_write_inode(struct inode *vinode, + struct writeback_control *wbc) +{ + struct bch_fs *c = vinode->i_sb->s_fs_info; + struct bch_inode_info *inode = to_bch_ei(vinode); + int ret; + + mutex_lock(&inode->ei_update_lock); + ret = bch2_write_inode(c, inode, inode_update_times_fn, NULL, + ATTR_ATIME|ATTR_MTIME|ATTR_CTIME); + mutex_unlock(&inode->ei_update_lock); + + return bch2_err_class(ret); +} + +static void bch2_evict_inode(struct inode *vinode) +{ + struct bch_fs *c = vinode->i_sb->s_fs_info; + struct bch_inode_info *inode = to_bch_ei(vinode); + + truncate_inode_pages_final(&inode->v.i_data); + + clear_inode(&inode->v); + + BUG_ON(!is_bad_inode(&inode->v) && inode->ei_quota_reserved); + + if (!inode->v.i_nlink && !is_bad_inode(&inode->v)) { + bch2_quota_acct(c, inode->ei_qid, Q_SPC, -((s64) inode->v.i_blocks), + KEY_TYPE_QUOTA_WARN); + bch2_quota_acct(c, inode->ei_qid, Q_INO, -1, + KEY_TYPE_QUOTA_WARN); + bch2_inode_rm(c, inode_inum(inode)); + } + + mutex_lock(&c->vfs_inodes_lock); + list_del_init(&inode->ei_vfs_inode_list); + mutex_unlock(&c->vfs_inodes_lock); +} + +void bch2_evict_subvolume_inodes(struct bch_fs *c, snapshot_id_list *s) +{ + struct bch_inode_info *inode, **i; + DARRAY(struct bch_inode_info *) grabbed; + bool clean_pass = false, this_pass_clean; + + /* + * Initially, we scan for inodes without I_DONTCACHE, then mark them to + * be pruned with d_mark_dontcache(). + * + * Once we've had a clean pass where we didn't find any inodes without + * I_DONTCACHE, we wait for them to be freed: + */ + + darray_init(&grabbed); + darray_make_room(&grabbed, 1024); +again: + cond_resched(); + this_pass_clean = true; + + mutex_lock(&c->vfs_inodes_lock); + list_for_each_entry(inode, &c->vfs_inodes_list, ei_vfs_inode_list) { + if (!snapshot_list_has_id(s, inode->ei_subvol)) + continue; + + if (!(inode->v.i_state & I_DONTCACHE) && + !(inode->v.i_state & I_FREEING) && + igrab(&inode->v)) { + this_pass_clean = false; + + if (darray_push_gfp(&grabbed, inode, GFP_ATOMIC|__GFP_NOWARN)) { + iput(&inode->v); + break; + } + } else if (clean_pass && this_pass_clean) { + wait_queue_head_t *wq = bit_waitqueue(&inode->v.i_state, __I_NEW); + DEFINE_WAIT_BIT(wait, &inode->v.i_state, __I_NEW); + + prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE); + mutex_unlock(&c->vfs_inodes_lock); + + schedule(); + finish_wait(wq, &wait.wq_entry); + goto again; + } + } + mutex_unlock(&c->vfs_inodes_lock); + + darray_for_each(grabbed, i) { + inode = *i; + d_mark_dontcache(&inode->v); + d_prune_aliases(&inode->v); + iput(&inode->v); + } + grabbed.nr = 0; + + if (!clean_pass || !this_pass_clean) { + clean_pass = this_pass_clean; + goto again; + } + + darray_exit(&grabbed); +} + +static int bch2_statfs(struct dentry *dentry, struct kstatfs *buf) +{ + struct super_block *sb = dentry->d_sb; + struct bch_fs *c = sb->s_fs_info; + struct bch_fs_usage_short usage = bch2_fs_usage_read_short(c); + unsigned shift = sb->s_blocksize_bits - 9; + /* + * this assumes inodes take up 64 bytes, which is a decent average + * number: + */ + u64 avail_inodes = ((usage.capacity - usage.used) << 3); + u64 fsid; + + buf->f_type = BCACHEFS_STATFS_MAGIC; + buf->f_bsize = sb->s_blocksize; + buf->f_blocks = usage.capacity >> shift; + buf->f_bfree = usage.free >> shift; + buf->f_bavail = avail_factor(usage.free) >> shift; + + buf->f_files = usage.nr_inodes + avail_inodes; + buf->f_ffree = avail_inodes; + + fsid = le64_to_cpup((void *) c->sb.user_uuid.b) ^ + le64_to_cpup((void *) c->sb.user_uuid.b + sizeof(u64)); + buf->f_fsid.val[0] = fsid & 0xFFFFFFFFUL; + buf->f_fsid.val[1] = (fsid >> 32) & 0xFFFFFFFFUL; + buf->f_namelen = BCH_NAME_MAX; + + return 0; +} + +static int bch2_sync_fs(struct super_block *sb, int wait) +{ + struct bch_fs *c = sb->s_fs_info; + int ret; + + if (c->opts.journal_flush_disabled) + return 0; + + if (!wait) { + bch2_journal_flush_async(&c->journal, NULL); + return 0; + } + + ret = bch2_journal_flush(&c->journal); + return bch2_err_class(ret); +} + +static struct bch_fs *bch2_path_to_fs(const char *path) +{ + struct bch_fs *c; + dev_t dev; + int ret; + + ret = lookup_bdev(path, &dev); + if (ret) + return ERR_PTR(ret); + + c = bch2_dev_to_fs(dev); + if (c) + closure_put(&c->cl); + return c ?: ERR_PTR(-ENOENT); +} + +static char **split_devs(const char *_dev_name, unsigned *nr) +{ + char *dev_name = NULL, **devs = NULL, *s; + size_t i = 0, nr_devs = 0; + + dev_name = kstrdup(_dev_name, GFP_KERNEL); + if (!dev_name) + return NULL; + + for (s = dev_name; s; s = strchr(s + 1, ':')) + nr_devs++; + + devs = kcalloc(nr_devs + 1, sizeof(const char *), GFP_KERNEL); + if (!devs) { + kfree(dev_name); + return NULL; + } + + while ((s = strsep(&dev_name, ":"))) + devs[i++] = s; + + *nr = nr_devs; + return devs; +} + +static int bch2_remount(struct super_block *sb, int *flags, char *data) +{ + struct bch_fs *c = sb->s_fs_info; + struct bch_opts opts = bch2_opts_empty(); + int ret; + + opt_set(opts, read_only, (*flags & SB_RDONLY) != 0); + + ret = bch2_parse_mount_opts(c, &opts, data); + if (ret) + goto err; + + if (opts.read_only != c->opts.read_only) { + down_write(&c->state_lock); + + if (opts.read_only) { + bch2_fs_read_only(c); + + sb->s_flags |= SB_RDONLY; + } else { + ret = bch2_fs_read_write(c); + if (ret) { + bch_err(c, "error going rw: %i", ret); + up_write(&c->state_lock); + ret = -EINVAL; + goto err; + } + + sb->s_flags &= ~SB_RDONLY; + } + + c->opts.read_only = opts.read_only; + + up_write(&c->state_lock); + } + + if (opt_defined(opts, errors)) + c->opts.errors = opts.errors; +err: + return bch2_err_class(ret); +} + +static int bch2_show_devname(struct seq_file *seq, struct dentry *root) +{ + struct bch_fs *c = root->d_sb->s_fs_info; + struct bch_dev *ca; + unsigned i; + bool first = true; + + for_each_online_member(ca, c, i) { + if (!first) + seq_putc(seq, ':'); + first = false; + seq_puts(seq, ca->disk_sb.sb_name); + } + + return 0; +} + +static int bch2_show_options(struct seq_file *seq, struct dentry *root) +{ + struct bch_fs *c = root->d_sb->s_fs_info; + enum bch_opt_id i; + struct printbuf buf = PRINTBUF; + int ret = 0; + + for (i = 0; i < bch2_opts_nr; i++) { + const struct bch_option *opt = &bch2_opt_table[i]; + u64 v = bch2_opt_get_by_id(&c->opts, i); + + if (!(opt->flags & OPT_MOUNT)) + continue; + + if (v == bch2_opt_get_by_id(&bch2_opts_default, i)) + continue; + + printbuf_reset(&buf); + bch2_opt_to_text(&buf, c, c->disk_sb.sb, opt, v, + OPT_SHOW_MOUNT_STYLE); + seq_putc(seq, ','); + seq_puts(seq, buf.buf); + } + + if (buf.allocation_failure) + ret = -ENOMEM; + printbuf_exit(&buf); + return ret; +} + +static void bch2_put_super(struct super_block *sb) +{ + struct bch_fs *c = sb->s_fs_info; + + __bch2_fs_stop(c); +} + +/* + * bcachefs doesn't currently integrate intwrite freeze protection but the + * internal write references serve the same purpose. Therefore reuse the + * read-only transition code to perform the quiesce. The caveat is that we don't + * currently have the ability to block tasks that want a write reference while + * the superblock is frozen. This is fine for now, but we should either add + * blocking support or find a way to integrate sb_start_intwrite() and friends. + */ +static int bch2_freeze(struct super_block *sb) +{ + struct bch_fs *c = sb->s_fs_info; + + down_write(&c->state_lock); + bch2_fs_read_only(c); + up_write(&c->state_lock); + return 0; +} + +static int bch2_unfreeze(struct super_block *sb) +{ + struct bch_fs *c = sb->s_fs_info; + int ret; + + if (test_bit(BCH_FS_EMERGENCY_RO, &c->flags)) + return 0; + + down_write(&c->state_lock); + ret = bch2_fs_read_write(c); + up_write(&c->state_lock); + return ret; +} + +static const struct super_operations bch_super_operations = { + .alloc_inode = bch2_alloc_inode, + .destroy_inode = bch2_destroy_inode, + .write_inode = bch2_vfs_write_inode, + .evict_inode = bch2_evict_inode, + .sync_fs = bch2_sync_fs, + .statfs = bch2_statfs, + .show_devname = bch2_show_devname, + .show_options = bch2_show_options, + .remount_fs = bch2_remount, + .put_super = bch2_put_super, + .freeze_fs = bch2_freeze, + .unfreeze_fs = bch2_unfreeze, +}; + +static int bch2_set_super(struct super_block *s, void *data) +{ + s->s_fs_info = data; + return 0; +} + +static int bch2_noset_super(struct super_block *s, void *data) +{ + return -EBUSY; +} + +static int bch2_test_super(struct super_block *s, void *data) +{ + struct bch_fs *c = s->s_fs_info; + struct bch_fs **devs = data; + unsigned i; + + if (!c) + return false; + + for (i = 0; devs[i]; i++) + if (c != devs[i]) + return false; + return true; +} + +static struct dentry *bch2_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data) +{ + struct bch_fs *c; + struct bch_dev *ca; + struct super_block *sb; + struct inode *vinode; + struct bch_opts opts = bch2_opts_empty(); + char **devs; + struct bch_fs **devs_to_fs = NULL; + unsigned i, nr_devs; + int ret; + + opt_set(opts, read_only, (flags & SB_RDONLY) != 0); + + ret = bch2_parse_mount_opts(NULL, &opts, data); + if (ret) + return ERR_PTR(ret); + + if (!dev_name || strlen(dev_name) == 0) + return ERR_PTR(-EINVAL); + + devs = split_devs(dev_name, &nr_devs); + if (!devs) + return ERR_PTR(-ENOMEM); + + devs_to_fs = kcalloc(nr_devs + 1, sizeof(void *), GFP_KERNEL); + if (!devs_to_fs) { + sb = ERR_PTR(-ENOMEM); + goto got_sb; + } + + for (i = 0; i < nr_devs; i++) + devs_to_fs[i] = bch2_path_to_fs(devs[i]); + + sb = sget(fs_type, bch2_test_super, bch2_noset_super, + flags|SB_NOSEC, devs_to_fs); + if (!IS_ERR(sb)) + goto got_sb; + + c = bch2_fs_open(devs, nr_devs, opts); + if (IS_ERR(c)) { + sb = ERR_CAST(c); + goto got_sb; + } + + /* Some options can't be parsed until after the fs is started: */ + ret = bch2_parse_mount_opts(c, &opts, data); + if (ret) { + bch2_fs_stop(c); + sb = ERR_PTR(ret); + goto got_sb; + } + + bch2_opts_apply(&c->opts, opts); + + sb = sget(fs_type, NULL, bch2_set_super, flags|SB_NOSEC, c); + if (IS_ERR(sb)) + bch2_fs_stop(c); +got_sb: + kfree(devs_to_fs); + kfree(devs[0]); + kfree(devs); + + if (IS_ERR(sb)) { + ret = PTR_ERR(sb); + ret = bch2_err_class(ret); + return ERR_PTR(ret); + } + + c = sb->s_fs_info; + + if (sb->s_root) { + if ((flags ^ sb->s_flags) & SB_RDONLY) { + ret = -EBUSY; + goto err_put_super; + } + goto out; + } + + sb->s_blocksize = block_bytes(c); + sb->s_blocksize_bits = ilog2(block_bytes(c)); + sb->s_maxbytes = MAX_LFS_FILESIZE; + sb->s_op = &bch_super_operations; + sb->s_export_op = &bch_export_ops; +#ifdef CONFIG_BCACHEFS_QUOTA + sb->s_qcop = &bch2_quotactl_operations; + sb->s_quota_types = QTYPE_MASK_USR|QTYPE_MASK_GRP|QTYPE_MASK_PRJ; +#endif + sb->s_xattr = bch2_xattr_handlers; + sb->s_magic = BCACHEFS_STATFS_MAGIC; + sb->s_time_gran = c->sb.nsec_per_time_unit; + sb->s_time_min = div_s64(S64_MIN, c->sb.time_units_per_sec) + 1; + sb->s_time_max = div_s64(S64_MAX, c->sb.time_units_per_sec); + c->vfs_sb = sb; + strscpy(sb->s_id, c->name, sizeof(sb->s_id)); + + ret = super_setup_bdi(sb); + if (ret) + goto err_put_super; + + sb->s_bdi->ra_pages = VM_READAHEAD_PAGES; + + for_each_online_member(ca, c, i) { + struct block_device *bdev = ca->disk_sb.bdev; + + /* XXX: create an anonymous device for multi device filesystems */ + sb->s_bdev = bdev; + sb->s_dev = bdev->bd_dev; + percpu_ref_put(&ca->io_ref); + break; + } + + c->dev = sb->s_dev; + +#ifdef CONFIG_BCACHEFS_POSIX_ACL + if (c->opts.acl) + sb->s_flags |= SB_POSIXACL; +#endif + + sb->s_shrink->seeks = 0; + + vinode = bch2_vfs_inode_get(c, BCACHEFS_ROOT_SUBVOL_INUM); + ret = PTR_ERR_OR_ZERO(vinode); + if (ret) { + bch_err_msg(c, ret, "mounting: error getting root inode"); + goto err_put_super; + } + + sb->s_root = d_make_root(vinode); + if (!sb->s_root) { + bch_err(c, "error mounting: error allocating root dentry"); + ret = -ENOMEM; + goto err_put_super; + } + + sb->s_flags |= SB_ACTIVE; +out: + return dget(sb->s_root); + +err_put_super: + deactivate_locked_super(sb); + return ERR_PTR(bch2_err_class(ret)); +} + +static void bch2_kill_sb(struct super_block *sb) +{ + struct bch_fs *c = sb->s_fs_info; + + generic_shutdown_super(sb); + bch2_fs_free(c); +} + +static struct file_system_type bcache_fs_type = { + .owner = THIS_MODULE, + .name = "bcachefs", + .mount = bch2_mount, + .kill_sb = bch2_kill_sb, + .fs_flags = FS_REQUIRES_DEV, +}; + +MODULE_ALIAS_FS("bcachefs"); + +void bch2_vfs_exit(void) +{ + unregister_filesystem(&bcache_fs_type); + kmem_cache_destroy(bch2_inode_cache); +} + +int __init bch2_vfs_init(void) +{ + int ret = -ENOMEM; + + bch2_inode_cache = KMEM_CACHE(bch_inode_info, SLAB_RECLAIM_ACCOUNT); + if (!bch2_inode_cache) + goto err; + + ret = register_filesystem(&bcache_fs_type); + if (ret) + goto err; + + return 0; +err: + bch2_vfs_exit(); + return ret; +} + +#endif /* NO_BCACHEFS_FS */ diff --git a/fs/bcachefs/fs.h b/fs/bcachefs/fs.h new file mode 100644 index 000000000000..5edf1d4b9e6b --- /dev/null +++ b/fs/bcachefs/fs.h @@ -0,0 +1,209 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_FS_H +#define _BCACHEFS_FS_H + +#include "inode.h" +#include "opts.h" +#include "str_hash.h" +#include "quota_types.h" +#include "two_state_shared_lock.h" + +#include <linux/seqlock.h> +#include <linux/stat.h> + +struct bch_inode_info { + struct inode v; + struct list_head ei_vfs_inode_list; + unsigned long ei_flags; + + struct mutex ei_update_lock; + u64 ei_quota_reserved; + unsigned long ei_last_dirtied; + two_state_lock_t ei_pagecache_lock; + + struct mutex ei_quota_lock; + struct bch_qid ei_qid; + + u32 ei_subvol; + + /* + * When we've been doing nocow writes we'll need to issue flushes to the + * underlying block devices + * + * XXX: a device may have had a flush issued by some other codepath. It + * would be better to keep for each device a sequence number that's + * incremented when we isusue a cache flush, and track here the sequence + * number that needs flushing. + */ + struct bch_devs_mask ei_devs_need_flush; + + /* copy of inode in btree: */ + struct bch_inode_unpacked ei_inode; +}; + +#define bch2_pagecache_add_put(i) bch2_two_state_unlock(&i->ei_pagecache_lock, 0) +#define bch2_pagecache_add_tryget(i) bch2_two_state_trylock(&i->ei_pagecache_lock, 0) +#define bch2_pagecache_add_get(i) bch2_two_state_lock(&i->ei_pagecache_lock, 0) + +#define bch2_pagecache_block_put(i) bch2_two_state_unlock(&i->ei_pagecache_lock, 1) +#define bch2_pagecache_block_get(i) bch2_two_state_lock(&i->ei_pagecache_lock, 1) + +static inline subvol_inum inode_inum(struct bch_inode_info *inode) +{ + return (subvol_inum) { + .subvol = inode->ei_subvol, + .inum = inode->ei_inode.bi_inum, + }; +} + +/* + * Set if we've gotten a btree error for this inode, and thus the vfs inode and + * btree inode may be inconsistent: + */ +#define EI_INODE_ERROR 0 + +/* + * Set in the inode is in a snapshot subvolume - we don't do quota accounting in + * those: + */ +#define EI_INODE_SNAPSHOT 1 + +#define to_bch_ei(_inode) \ + container_of_or_null(_inode, struct bch_inode_info, v) + +static inline int ptrcmp(void *l, void *r) +{ + return cmp_int(l, r); +} + +enum bch_inode_lock_op { + INODE_LOCK = (1U << 0), + INODE_PAGECACHE_BLOCK = (1U << 1), + INODE_UPDATE_LOCK = (1U << 2), +}; + +#define bch2_lock_inodes(_locks, ...) \ +do { \ + struct bch_inode_info *a[] = { NULL, __VA_ARGS__ }; \ + unsigned i; \ + \ + bubble_sort(&a[1], ARRAY_SIZE(a) - 1, ptrcmp); \ + \ + for (i = 1; i < ARRAY_SIZE(a); i++) \ + if (a[i] != a[i - 1]) { \ + if ((_locks) & INODE_LOCK) \ + down_write_nested(&a[i]->v.i_rwsem, i); \ + if ((_locks) & INODE_PAGECACHE_BLOCK) \ + bch2_pagecache_block_get(a[i]);\ + if ((_locks) & INODE_UPDATE_LOCK) \ + mutex_lock_nested(&a[i]->ei_update_lock, i);\ + } \ +} while (0) + +#define bch2_unlock_inodes(_locks, ...) \ +do { \ + struct bch_inode_info *a[] = { NULL, __VA_ARGS__ }; \ + unsigned i; \ + \ + bubble_sort(&a[1], ARRAY_SIZE(a) - 1, ptrcmp); \ + \ + for (i = 1; i < ARRAY_SIZE(a); i++) \ + if (a[i] != a[i - 1]) { \ + if ((_locks) & INODE_LOCK) \ + up_write(&a[i]->v.i_rwsem); \ + if ((_locks) & INODE_PAGECACHE_BLOCK) \ + bch2_pagecache_block_put(a[i]);\ + if ((_locks) & INODE_UPDATE_LOCK) \ + mutex_unlock(&a[i]->ei_update_lock); \ + } \ +} while (0) + +static inline struct bch_inode_info *file_bch_inode(struct file *file) +{ + return to_bch_ei(file_inode(file)); +} + +static inline bool inode_attr_changing(struct bch_inode_info *dir, + struct bch_inode_info *inode, + enum inode_opt_id id) +{ + return !(inode->ei_inode.bi_fields_set & (1 << id)) && + bch2_inode_opt_get(&dir->ei_inode, id) != + bch2_inode_opt_get(&inode->ei_inode, id); +} + +static inline bool inode_attrs_changing(struct bch_inode_info *dir, + struct bch_inode_info *inode) +{ + unsigned id; + + for (id = 0; id < Inode_opt_nr; id++) + if (inode_attr_changing(dir, inode, id)) + return true; + + return false; +} + +struct bch_inode_unpacked; + +#ifndef NO_BCACHEFS_FS + +struct bch_inode_info * +__bch2_create(struct mnt_idmap *, struct bch_inode_info *, + struct dentry *, umode_t, dev_t, subvol_inum, unsigned); + +int bch2_fs_quota_transfer(struct bch_fs *, + struct bch_inode_info *, + struct bch_qid, + unsigned, + enum quota_acct_mode); + +static inline int bch2_set_projid(struct bch_fs *c, + struct bch_inode_info *inode, + u32 projid) +{ + struct bch_qid qid = inode->ei_qid; + + qid.q[QTYP_PRJ] = projid; + + return bch2_fs_quota_transfer(c, inode, qid, + 1 << QTYP_PRJ, + KEY_TYPE_QUOTA_PREALLOC); +} + +struct inode *bch2_vfs_inode_get(struct bch_fs *, subvol_inum); + +/* returns 0 if we want to do the update, or error is passed up */ +typedef int (*inode_set_fn)(struct btree_trans *, + struct bch_inode_info *, + struct bch_inode_unpacked *, void *); + +void bch2_inode_update_after_write(struct btree_trans *, + struct bch_inode_info *, + struct bch_inode_unpacked *, + unsigned); +int __must_check bch2_write_inode(struct bch_fs *, struct bch_inode_info *, + inode_set_fn, void *, unsigned); + +int bch2_setattr_nonsize(struct mnt_idmap *, + struct bch_inode_info *, + struct iattr *); +int __bch2_unlink(struct inode *, struct dentry *, bool); + +void bch2_evict_subvolume_inodes(struct bch_fs *, snapshot_id_list *); + +void bch2_vfs_exit(void); +int bch2_vfs_init(void); + +#else + +#define bch2_inode_update_after_write(_trans, _inode, _inode_u, _fields) ({ do {} while (0); }) + +static inline void bch2_evict_subvolume_inodes(struct bch_fs *c, + snapshot_id_list *s) {} +static inline void bch2_vfs_exit(void) {} +static inline int bch2_vfs_init(void) { return 0; } + +#endif /* NO_BCACHEFS_FS */ + +#endif /* _BCACHEFS_FS_H */ diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c new file mode 100644 index 000000000000..e0c5cd119acc --- /dev/null +++ b/fs/bcachefs/fsck.c @@ -0,0 +1,2490 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" +#include "bkey_buf.h" +#include "btree_cache.h" +#include "btree_update.h" +#include "buckets.h" +#include "darray.h" +#include "dirent.h" +#include "error.h" +#include "fs-common.h" +#include "fsck.h" +#include "inode.h" +#include "keylist.h" +#include "recovery.h" +#include "snapshot.h" +#include "super.h" +#include "xattr.h" + +#include <linux/bsearch.h> +#include <linux/dcache.h> /* struct qstr */ + +#define QSTR(n) { { { .len = strlen(n) } }, .name = n } + +/* + * XXX: this is handling transaction restarts without returning + * -BCH_ERR_transaction_restart_nested, this is not how we do things anymore: + */ +static s64 bch2_count_inode_sectors(struct btree_trans *trans, u64 inum, + u32 snapshot) +{ + struct btree_iter iter; + struct bkey_s_c k; + u64 sectors = 0; + int ret; + + for_each_btree_key_upto(trans, iter, BTREE_ID_extents, + SPOS(inum, 0, snapshot), + POS(inum, U64_MAX), + 0, k, ret) + if (bkey_extent_is_allocation(k.k)) + sectors += k.k->size; + + bch2_trans_iter_exit(trans, &iter); + + return ret ?: sectors; +} + +static s64 bch2_count_subdirs(struct btree_trans *trans, u64 inum, + u32 snapshot) +{ + struct btree_iter iter; + struct bkey_s_c k; + struct bkey_s_c_dirent d; + u64 subdirs = 0; + int ret; + + for_each_btree_key_upto(trans, iter, BTREE_ID_dirents, + SPOS(inum, 0, snapshot), + POS(inum, U64_MAX), + 0, k, ret) { + if (k.k->type != KEY_TYPE_dirent) + continue; + + d = bkey_s_c_to_dirent(k); + if (d.v->d_type == DT_DIR) + subdirs++; + } + bch2_trans_iter_exit(trans, &iter); + + return ret ?: subdirs; +} + +static int __snapshot_lookup_subvol(struct btree_trans *trans, u32 snapshot, + u32 *subvol) +{ + struct bch_snapshot s; + int ret = bch2_bkey_get_val_typed(trans, BTREE_ID_snapshots, + POS(0, snapshot), 0, + snapshot, &s); + if (!ret) + *subvol = le32_to_cpu(s.subvol); + else if (bch2_err_matches(ret, ENOENT)) + bch_err(trans->c, "snapshot %u not found", snapshot); + return ret; + +} + +static int __subvol_lookup(struct btree_trans *trans, u32 subvol, + u32 *snapshot, u64 *inum) +{ + struct bch_subvolume s; + int ret; + + ret = bch2_subvolume_get(trans, subvol, false, 0, &s); + + *snapshot = le32_to_cpu(s.snapshot); + *inum = le64_to_cpu(s.inode); + return ret; +} + +static int subvol_lookup(struct btree_trans *trans, u32 subvol, + u32 *snapshot, u64 *inum) +{ + return lockrestart_do(trans, __subvol_lookup(trans, subvol, snapshot, inum)); +} + +static int lookup_first_inode(struct btree_trans *trans, u64 inode_nr, + struct bch_inode_unpacked *inode) +{ + struct btree_iter iter; + struct bkey_s_c k; + int ret; + + bch2_trans_iter_init(trans, &iter, BTREE_ID_inodes, + POS(0, inode_nr), + BTREE_ITER_ALL_SNAPSHOTS); + k = bch2_btree_iter_peek(&iter); + ret = bkey_err(k); + if (ret) + goto err; + + if (!k.k || !bkey_eq(k.k->p, POS(0, inode_nr))) { + ret = -BCH_ERR_ENOENT_inode; + goto err; + } + + ret = bch2_inode_unpack(k, inode); +err: + bch_err_msg(trans->c, ret, "fetching inode %llu", inode_nr); + bch2_trans_iter_exit(trans, &iter); + return ret; +} + +static int __lookup_inode(struct btree_trans *trans, u64 inode_nr, + struct bch_inode_unpacked *inode, + u32 *snapshot) +{ + struct btree_iter iter; + struct bkey_s_c k; + int ret; + + k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes, + SPOS(0, inode_nr, *snapshot), 0); + ret = bkey_err(k); + if (ret) + goto err; + + ret = bkey_is_inode(k.k) + ? bch2_inode_unpack(k, inode) + : -BCH_ERR_ENOENT_inode; + if (!ret) + *snapshot = iter.pos.snapshot; +err: + bch_err_msg(trans->c, ret, "fetching inode %llu:%u", inode_nr, *snapshot); + bch2_trans_iter_exit(trans, &iter); + return ret; +} + +static int lookup_inode(struct btree_trans *trans, u64 inode_nr, + struct bch_inode_unpacked *inode, + u32 *snapshot) +{ + return lockrestart_do(trans, __lookup_inode(trans, inode_nr, inode, snapshot)); +} + +static int __lookup_dirent(struct btree_trans *trans, + struct bch_hash_info hash_info, + subvol_inum dir, struct qstr *name, + u64 *target, unsigned *type) +{ + struct btree_iter iter; + struct bkey_s_c_dirent d; + int ret; + + ret = bch2_hash_lookup(trans, &iter, bch2_dirent_hash_desc, + &hash_info, dir, name, 0); + if (ret) + return ret; + + d = bkey_s_c_to_dirent(bch2_btree_iter_peek_slot(&iter)); + *target = le64_to_cpu(d.v->d_inum); + *type = d.v->d_type; + bch2_trans_iter_exit(trans, &iter); + return 0; +} + +static int __write_inode(struct btree_trans *trans, + struct bch_inode_unpacked *inode, + u32 snapshot) +{ + struct bkey_inode_buf *inode_p = + bch2_trans_kmalloc(trans, sizeof(*inode_p)); + + if (IS_ERR(inode_p)) + return PTR_ERR(inode_p); + + bch2_inode_pack(inode_p, inode); + inode_p->inode.k.p.snapshot = snapshot; + + return bch2_btree_insert_nonextent(trans, BTREE_ID_inodes, + &inode_p->inode.k_i, + BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); +} + +static int fsck_write_inode(struct btree_trans *trans, + struct bch_inode_unpacked *inode, + u32 snapshot) +{ + int ret = commit_do(trans, NULL, NULL, + BTREE_INSERT_NOFAIL| + BTREE_INSERT_LAZY_RW, + __write_inode(trans, inode, snapshot)); + if (ret) + bch_err_fn(trans->c, ret); + return ret; +} + +static int __remove_dirent(struct btree_trans *trans, struct bpos pos) +{ + struct bch_fs *c = trans->c; + struct btree_iter iter; + struct bch_inode_unpacked dir_inode; + struct bch_hash_info dir_hash_info; + int ret; + + ret = lookup_first_inode(trans, pos.inode, &dir_inode); + if (ret) + goto err; + + dir_hash_info = bch2_hash_info_init(c, &dir_inode); + + bch2_trans_iter_init(trans, &iter, BTREE_ID_dirents, pos, BTREE_ITER_INTENT); + + ret = bch2_hash_delete_at(trans, bch2_dirent_hash_desc, + &dir_hash_info, &iter, + BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); + bch2_trans_iter_exit(trans, &iter); +err: + bch_err_fn(c, ret); + return ret; +} + +/* Get lost+found, create if it doesn't exist: */ +static int lookup_lostfound(struct btree_trans *trans, u32 subvol, + struct bch_inode_unpacked *lostfound) +{ + struct bch_fs *c = trans->c; + struct bch_inode_unpacked root; + struct bch_hash_info root_hash_info; + struct qstr lostfound_str = QSTR("lost+found"); + subvol_inum root_inum = { .subvol = subvol }; + u64 inum = 0; + unsigned d_type = 0; + u32 snapshot; + int ret; + + ret = __subvol_lookup(trans, subvol, &snapshot, &root_inum.inum); + if (ret) + return ret; + + ret = __lookup_inode(trans, root_inum.inum, &root, &snapshot); + if (ret) + return ret; + + root_hash_info = bch2_hash_info_init(c, &root); + + ret = __lookup_dirent(trans, root_hash_info, root_inum, + &lostfound_str, &inum, &d_type); + if (bch2_err_matches(ret, ENOENT)) { + bch_notice(c, "creating lost+found"); + goto create_lostfound; + } + + bch_err_fn(c, ret); + if (ret) + return ret; + + if (d_type != DT_DIR) { + bch_err(c, "error looking up lost+found: not a directory"); + return -BCH_ERR_ENOENT_not_directory; + } + + /* + * The bch2_check_dirents pass has already run, dangling dirents + * shouldn't exist here: + */ + return __lookup_inode(trans, inum, lostfound, &snapshot); + +create_lostfound: + bch2_inode_init_early(c, lostfound); + + ret = bch2_create_trans(trans, root_inum, &root, + lostfound, &lostfound_str, + 0, 0, S_IFDIR|0700, 0, NULL, NULL, + (subvol_inum) { }, 0); + bch_err_msg(c, ret, "creating lost+found"); + return ret; +} + +static int __reattach_inode(struct btree_trans *trans, + struct bch_inode_unpacked *inode, + u32 inode_snapshot) +{ + struct bch_hash_info dir_hash; + struct bch_inode_unpacked lostfound; + char name_buf[20]; + struct qstr name; + u64 dir_offset = 0; + u32 subvol; + int ret; + + ret = __snapshot_lookup_subvol(trans, inode_snapshot, &subvol); + if (ret) + return ret; + + ret = lookup_lostfound(trans, subvol, &lostfound); + if (ret) + return ret; + + if (S_ISDIR(inode->bi_mode)) { + lostfound.bi_nlink++; + + ret = __write_inode(trans, &lostfound, U32_MAX); + if (ret) + return ret; + } + + dir_hash = bch2_hash_info_init(trans->c, &lostfound); + + snprintf(name_buf, sizeof(name_buf), "%llu", inode->bi_inum); + name = (struct qstr) QSTR(name_buf); + + ret = bch2_dirent_create(trans, + (subvol_inum) { + .subvol = subvol, + .inum = lostfound.bi_inum, + }, + &dir_hash, + inode_d_type(inode), + &name, inode->bi_inum, &dir_offset, + BCH_HASH_SET_MUST_CREATE); + if (ret) + return ret; + + inode->bi_dir = lostfound.bi_inum; + inode->bi_dir_offset = dir_offset; + + return __write_inode(trans, inode, inode_snapshot); +} + +static int reattach_inode(struct btree_trans *trans, + struct bch_inode_unpacked *inode, + u32 inode_snapshot) +{ + int ret = commit_do(trans, NULL, NULL, + BTREE_INSERT_LAZY_RW| + BTREE_INSERT_NOFAIL, + __reattach_inode(trans, inode, inode_snapshot)); + bch_err_msg(trans->c, ret, "reattaching inode %llu", inode->bi_inum); + return ret; +} + +static int remove_backpointer(struct btree_trans *trans, + struct bch_inode_unpacked *inode) +{ + struct btree_iter iter; + struct bkey_s_c_dirent d; + int ret; + + d = bch2_bkey_get_iter_typed(trans, &iter, BTREE_ID_dirents, + POS(inode->bi_dir, inode->bi_dir_offset), 0, + dirent); + ret = bkey_err(d) ?: + __remove_dirent(trans, d.k->p); + bch2_trans_iter_exit(trans, &iter); + return ret; +} + +struct snapshots_seen_entry { + u32 id; + u32 equiv; +}; + +struct snapshots_seen { + struct bpos pos; + DARRAY(struct snapshots_seen_entry) ids; +}; + +static inline void snapshots_seen_exit(struct snapshots_seen *s) +{ + darray_exit(&s->ids); +} + +static inline void snapshots_seen_init(struct snapshots_seen *s) +{ + memset(s, 0, sizeof(*s)); +} + +static int snapshots_seen_add_inorder(struct bch_fs *c, struct snapshots_seen *s, u32 id) +{ + struct snapshots_seen_entry *i, n = { + .id = id, + .equiv = bch2_snapshot_equiv(c, id), + }; + int ret = 0; + + darray_for_each(s->ids, i) { + if (i->id == id) + return 0; + if (i->id > id) + break; + } + + ret = darray_insert_item(&s->ids, i - s->ids.data, n); + if (ret) + bch_err(c, "error reallocating snapshots_seen table (size %zu)", + s->ids.size); + return ret; +} + +static int snapshots_seen_update(struct bch_fs *c, struct snapshots_seen *s, + enum btree_id btree_id, struct bpos pos) +{ + struct snapshots_seen_entry *i, n = { + .id = pos.snapshot, + .equiv = bch2_snapshot_equiv(c, pos.snapshot), + }; + int ret = 0; + + if (!bkey_eq(s->pos, pos)) + s->ids.nr = 0; + + s->pos = pos; + s->pos.snapshot = n.equiv; + + darray_for_each(s->ids, i) { + if (i->id == n.id) + return 0; + + /* + * We currently don't rigorously track for snapshot cleanup + * needing to be run, so it shouldn't be a fsck error yet: + */ + if (i->equiv == n.equiv) { + bch_err(c, "snapshot deletion did not finish:\n" + " duplicate keys in btree %s at %llu:%llu snapshots %u, %u (equiv %u)\n", + bch2_btree_id_str(btree_id), + pos.inode, pos.offset, + i->id, n.id, n.equiv); + set_bit(BCH_FS_NEED_DELETE_DEAD_SNAPSHOTS, &c->flags); + return bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_delete_dead_snapshots); + } + } + + ret = darray_push(&s->ids, n); + if (ret) + bch_err(c, "error reallocating snapshots_seen table (size %zu)", + s->ids.size); + return ret; +} + +/** + * key_visible_in_snapshot - returns true if @id is a descendent of @ancestor, + * and @ancestor hasn't been overwritten in @seen + * + * @c: filesystem handle + * @seen: list of snapshot ids already seen at current position + * @id: descendent snapshot id + * @ancestor: ancestor snapshot id + * + * Returns: whether key in @ancestor snapshot is visible in @id snapshot + */ +static bool key_visible_in_snapshot(struct bch_fs *c, struct snapshots_seen *seen, + u32 id, u32 ancestor) +{ + ssize_t i; + + EBUG_ON(id > ancestor); + EBUG_ON(!bch2_snapshot_is_equiv(c, id)); + EBUG_ON(!bch2_snapshot_is_equiv(c, ancestor)); + + /* @ancestor should be the snapshot most recently added to @seen */ + EBUG_ON(ancestor != seen->pos.snapshot); + EBUG_ON(ancestor != seen->ids.data[seen->ids.nr - 1].equiv); + + if (id == ancestor) + return true; + + if (!bch2_snapshot_is_ancestor(c, id, ancestor)) + return false; + + /* + * We know that @id is a descendant of @ancestor, we're checking if + * we've seen a key that overwrote @ancestor - i.e. also a descendent of + * @ascestor and with @id as a descendent. + * + * But we already know that we're scanning IDs between @id and @ancestor + * numerically, since snapshot ID lists are kept sorted, so if we find + * an id that's an ancestor of @id we're done: + */ + + for (i = seen->ids.nr - 2; + i >= 0 && seen->ids.data[i].equiv >= id; + --i) + if (bch2_snapshot_is_ancestor(c, id, seen->ids.data[i].equiv)) + return false; + + return true; +} + +/** + * ref_visible - given a key with snapshot id @src that points to a key with + * snapshot id @dst, test whether there is some snapshot in which @dst is + * visible. + * + * @c: filesystem handle + * @s: list of snapshot IDs already seen at @src + * @src: snapshot ID of src key + * @dst: snapshot ID of dst key + * Returns: true if there is some snapshot in which @dst is visible + * + * Assumes we're visiting @src keys in natural key order + */ +static bool ref_visible(struct bch_fs *c, struct snapshots_seen *s, + u32 src, u32 dst) +{ + return dst <= src + ? key_visible_in_snapshot(c, s, dst, src) + : bch2_snapshot_is_ancestor(c, src, dst); +} + +static int ref_visible2(struct bch_fs *c, + u32 src, struct snapshots_seen *src_seen, + u32 dst, struct snapshots_seen *dst_seen) +{ + src = bch2_snapshot_equiv(c, src); + dst = bch2_snapshot_equiv(c, dst); + + if (dst > src) { + swap(dst, src); + swap(dst_seen, src_seen); + } + return key_visible_in_snapshot(c, src_seen, dst, src); +} + +#define for_each_visible_inode(_c, _s, _w, _snapshot, _i) \ + for (_i = (_w)->inodes.data; _i < (_w)->inodes.data + (_w)->inodes.nr && \ + (_i)->snapshot <= (_snapshot); _i++) \ + if (key_visible_in_snapshot(_c, _s, _i->snapshot, _snapshot)) + +struct inode_walker_entry { + struct bch_inode_unpacked inode; + u32 snapshot; + bool seen_this_pos; + u64 count; +}; + +struct inode_walker { + bool first_this_inode; + bool recalculate_sums; + struct bpos last_pos; + + DARRAY(struct inode_walker_entry) inodes; +}; + +static void inode_walker_exit(struct inode_walker *w) +{ + darray_exit(&w->inodes); +} + +static struct inode_walker inode_walker_init(void) +{ + return (struct inode_walker) { 0, }; +} + +static int add_inode(struct bch_fs *c, struct inode_walker *w, + struct bkey_s_c inode) +{ + struct bch_inode_unpacked u; + + BUG_ON(bch2_inode_unpack(inode, &u)); + + return darray_push(&w->inodes, ((struct inode_walker_entry) { + .inode = u, + .snapshot = bch2_snapshot_equiv(c, inode.k->p.snapshot), + })); +} + +static int get_inodes_all_snapshots(struct btree_trans *trans, + struct inode_walker *w, u64 inum) +{ + struct bch_fs *c = trans->c; + struct btree_iter iter; + struct bkey_s_c k; + u32 restart_count = trans->restart_count; + int ret; + + w->recalculate_sums = false; + w->inodes.nr = 0; + + for_each_btree_key(trans, iter, BTREE_ID_inodes, POS(0, inum), + BTREE_ITER_ALL_SNAPSHOTS, k, ret) { + if (k.k->p.offset != inum) + break; + + if (bkey_is_inode(k.k)) + add_inode(c, w, k); + } + bch2_trans_iter_exit(trans, &iter); + + if (ret) + return ret; + + w->first_this_inode = true; + + return trans_was_restarted(trans, restart_count); +} + +static struct inode_walker_entry * +lookup_inode_for_snapshot(struct bch_fs *c, struct inode_walker *w, + u32 snapshot, bool is_whiteout) +{ + struct inode_walker_entry *i; + + snapshot = bch2_snapshot_equiv(c, snapshot); + + darray_for_each(w->inodes, i) + if (bch2_snapshot_is_ancestor(c, snapshot, i->snapshot)) + goto found; + + return NULL; +found: + BUG_ON(snapshot > i->snapshot); + + if (snapshot != i->snapshot && !is_whiteout) { + struct inode_walker_entry new = *i; + size_t pos; + int ret; + + new.snapshot = snapshot; + new.count = 0; + + bch_info(c, "have key for inode %llu:%u but have inode in ancestor snapshot %u", + w->last_pos.inode, snapshot, i->snapshot); + + while (i > w->inodes.data && i[-1].snapshot > snapshot) + --i; + + pos = i - w->inodes.data; + ret = darray_insert_item(&w->inodes, pos, new); + if (ret) + return ERR_PTR(ret); + + i = w->inodes.data + pos; + } + + return i; +} + +static struct inode_walker_entry *walk_inode(struct btree_trans *trans, + struct inode_walker *w, struct bpos pos, + bool is_whiteout) +{ + if (w->last_pos.inode != pos.inode) { + int ret = get_inodes_all_snapshots(trans, w, pos.inode); + if (ret) + return ERR_PTR(ret); + } else if (bkey_cmp(w->last_pos, pos)) { + struct inode_walker_entry *i; + + darray_for_each(w->inodes, i) + i->seen_this_pos = false; + + } + + w->last_pos = pos; + + return lookup_inode_for_snapshot(trans->c, w, pos.snapshot, is_whiteout); +} + +static int __get_visible_inodes(struct btree_trans *trans, + struct inode_walker *w, + struct snapshots_seen *s, + u64 inum) +{ + struct bch_fs *c = trans->c; + struct btree_iter iter; + struct bkey_s_c k; + int ret; + + w->inodes.nr = 0; + + for_each_btree_key_norestart(trans, iter, BTREE_ID_inodes, POS(0, inum), + BTREE_ITER_ALL_SNAPSHOTS, k, ret) { + u32 equiv = bch2_snapshot_equiv(c, k.k->p.snapshot); + + if (k.k->p.offset != inum) + break; + + if (!ref_visible(c, s, s->pos.snapshot, equiv)) + continue; + + if (bkey_is_inode(k.k)) + add_inode(c, w, k); + + if (equiv >= s->pos.snapshot) + break; + } + bch2_trans_iter_exit(trans, &iter); + + return ret; +} + +static int check_key_has_snapshot(struct btree_trans *trans, + struct btree_iter *iter, + struct bkey_s_c k) +{ + struct bch_fs *c = trans->c; + struct printbuf buf = PRINTBUF; + int ret = 0; + + if (mustfix_fsck_err_on(!bch2_snapshot_equiv(c, k.k->p.snapshot), c, + bkey_in_missing_snapshot, + "key in missing snapshot: %s", + (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) + ret = bch2_btree_delete_at(trans, iter, + BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?: 1; +fsck_err: + printbuf_exit(&buf); + return ret; +} + +static int hash_redo_key(struct btree_trans *trans, + const struct bch_hash_desc desc, + struct bch_hash_info *hash_info, + struct btree_iter *k_iter, struct bkey_s_c k) +{ + struct bkey_i *delete; + struct bkey_i *tmp; + + delete = bch2_trans_kmalloc(trans, sizeof(*delete)); + if (IS_ERR(delete)) + return PTR_ERR(delete); + + tmp = bch2_bkey_make_mut_noupdate(trans, k); + if (IS_ERR(tmp)) + return PTR_ERR(tmp); + + bkey_init(&delete->k); + delete->k.p = k_iter->pos; + return bch2_btree_iter_traverse(k_iter) ?: + bch2_trans_update(trans, k_iter, delete, 0) ?: + bch2_hash_set_snapshot(trans, desc, hash_info, + (subvol_inum) { 0, k.k->p.inode }, + k.k->p.snapshot, tmp, + BCH_HASH_SET_MUST_CREATE, + BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?: + bch2_trans_commit(trans, NULL, NULL, + BTREE_INSERT_NOFAIL| + BTREE_INSERT_LAZY_RW); +} + +static int hash_check_key(struct btree_trans *trans, + const struct bch_hash_desc desc, + struct bch_hash_info *hash_info, + struct btree_iter *k_iter, struct bkey_s_c hash_k) +{ + struct bch_fs *c = trans->c; + struct btree_iter iter = { NULL }; + struct printbuf buf = PRINTBUF; + struct bkey_s_c k; + u64 hash; + int ret = 0; + + if (hash_k.k->type != desc.key_type) + return 0; + + hash = desc.hash_bkey(hash_info, hash_k); + + if (likely(hash == hash_k.k->p.offset)) + return 0; + + if (hash_k.k->p.offset < hash) + goto bad_hash; + + for_each_btree_key_norestart(trans, iter, desc.btree_id, + SPOS(hash_k.k->p.inode, hash, hash_k.k->p.snapshot), + BTREE_ITER_SLOTS, k, ret) { + if (bkey_eq(k.k->p, hash_k.k->p)) + break; + + if (fsck_err_on(k.k->type == desc.key_type && + !desc.cmp_bkey(k, hash_k), c, + hash_table_key_duplicate, + "duplicate hash table keys:\n%s", + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, hash_k), + buf.buf))) { + ret = bch2_hash_delete_at(trans, desc, hash_info, k_iter, 0) ?: 1; + break; + } + + if (bkey_deleted(k.k)) { + bch2_trans_iter_exit(trans, &iter); + goto bad_hash; + } + } +out: + bch2_trans_iter_exit(trans, &iter); + printbuf_exit(&buf); + return ret; +bad_hash: + if (fsck_err(c, hash_table_key_wrong_offset, + "hash table key at wrong offset: btree %s inode %llu offset %llu, hashed to %llu\n%s", + bch2_btree_id_str(desc.btree_id), hash_k.k->p.inode, hash_k.k->p.offset, hash, + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, hash_k), buf.buf))) { + ret = hash_redo_key(trans, desc, hash_info, k_iter, hash_k); + bch_err_fn(c, ret); + if (ret) + return ret; + ret = -BCH_ERR_transaction_restart_nested; + } +fsck_err: + goto out; +} + +static int check_inode(struct btree_trans *trans, + struct btree_iter *iter, + struct bkey_s_c k, + struct bch_inode_unpacked *prev, + struct snapshots_seen *s, + bool full) +{ + struct bch_fs *c = trans->c; + struct bch_inode_unpacked u; + bool do_update = false; + int ret; + + ret = check_key_has_snapshot(trans, iter, k); + if (ret < 0) + goto err; + if (ret) + return 0; + + ret = snapshots_seen_update(c, s, iter->btree_id, k.k->p); + if (ret) + goto err; + + if (!bkey_is_inode(k.k)) + return 0; + + BUG_ON(bch2_inode_unpack(k, &u)); + + if (!full && + !(u.bi_flags & (BCH_INODE_i_size_dirty| + BCH_INODE_i_sectors_dirty| + BCH_INODE_unlinked))) + return 0; + + if (prev->bi_inum != u.bi_inum) + *prev = u; + + if (fsck_err_on(prev->bi_hash_seed != u.bi_hash_seed || + inode_d_type(prev) != inode_d_type(&u), + c, inode_snapshot_mismatch, + "inodes in different snapshots don't match")) { + bch_err(c, "repair not implemented yet"); + return -EINVAL; + } + + if ((u.bi_flags & (BCH_INODE_i_size_dirty|BCH_INODE_unlinked)) && + bch2_key_has_snapshot_overwrites(trans, BTREE_ID_inodes, k.k->p)) { + struct bpos new_min_pos; + + ret = bch2_propagate_key_to_snapshot_leaves(trans, iter->btree_id, k, &new_min_pos); + if (ret) + goto err; + + u.bi_flags &= ~BCH_INODE_i_size_dirty|BCH_INODE_unlinked; + + ret = __write_inode(trans, &u, iter->pos.snapshot); + bch_err_msg(c, ret, "in fsck updating inode"); + if (ret) + return ret; + + if (!bpos_eq(new_min_pos, POS_MIN)) + bch2_btree_iter_set_pos(iter, bpos_predecessor(new_min_pos)); + return 0; + } + + if (u.bi_flags & BCH_INODE_unlinked && + (!c->sb.clean || + fsck_err(c, inode_unlinked_but_clean, + "filesystem marked clean, but inode %llu unlinked", + u.bi_inum))) { + bch2_trans_unlock(trans); + bch2_fs_lazy_rw(c); + + ret = bch2_inode_rm_snapshot(trans, u.bi_inum, iter->pos.snapshot); + bch_err_msg(c, ret, "in fsck deleting inode"); + return ret; + } + + if (u.bi_flags & BCH_INODE_i_size_dirty && + (!c->sb.clean || + fsck_err(c, inode_i_size_dirty_but_clean, + "filesystem marked clean, but inode %llu has i_size dirty", + u.bi_inum))) { + bch_verbose(c, "truncating inode %llu", u.bi_inum); + + bch2_trans_unlock(trans); + bch2_fs_lazy_rw(c); + + /* + * XXX: need to truncate partial blocks too here - or ideally + * just switch units to bytes and that issue goes away + */ + ret = bch2_btree_delete_range_trans(trans, BTREE_ID_extents, + SPOS(u.bi_inum, round_up(u.bi_size, block_bytes(c)) >> 9, + iter->pos.snapshot), + POS(u.bi_inum, U64_MAX), + 0, NULL); + bch_err_msg(c, ret, "in fsck truncating inode"); + if (ret) + return ret; + + /* + * We truncated without our normal sector accounting hook, just + * make sure we recalculate it: + */ + u.bi_flags |= BCH_INODE_i_sectors_dirty; + + u.bi_flags &= ~BCH_INODE_i_size_dirty; + do_update = true; + } + + if (u.bi_flags & BCH_INODE_i_sectors_dirty && + (!c->sb.clean || + fsck_err(c, inode_i_sectors_dirty_but_clean, + "filesystem marked clean, but inode %llu has i_sectors dirty", + u.bi_inum))) { + s64 sectors; + + bch_verbose(c, "recounting sectors for inode %llu", + u.bi_inum); + + sectors = bch2_count_inode_sectors(trans, u.bi_inum, iter->pos.snapshot); + if (sectors < 0) { + bch_err_msg(c, sectors, "in fsck recounting inode sectors"); + return sectors; + } + + u.bi_sectors = sectors; + u.bi_flags &= ~BCH_INODE_i_sectors_dirty; + do_update = true; + } + + if (u.bi_flags & BCH_INODE_backptr_untrusted) { + u.bi_dir = 0; + u.bi_dir_offset = 0; + u.bi_flags &= ~BCH_INODE_backptr_untrusted; + do_update = true; + } + + if (do_update) { + ret = __write_inode(trans, &u, iter->pos.snapshot); + bch_err_msg(c, ret, "in fsck updating inode"); + if (ret) + return ret; + } +err: +fsck_err: + bch_err_fn(c, ret); + return ret; +} + +noinline_for_stack +int bch2_check_inodes(struct bch_fs *c) +{ + bool full = c->opts.fsck; + struct btree_trans *trans = bch2_trans_get(c); + struct btree_iter iter; + struct bch_inode_unpacked prev = { 0 }; + struct snapshots_seen s; + struct bkey_s_c k; + int ret; + + snapshots_seen_init(&s); + + ret = for_each_btree_key_commit(trans, iter, BTREE_ID_inodes, + POS_MIN, + BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k, + NULL, NULL, BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL, + check_inode(trans, &iter, k, &prev, &s, full)); + + snapshots_seen_exit(&s); + bch2_trans_put(trans); + bch_err_fn(c, ret); + return ret; +} + +static struct bkey_s_c_dirent dirent_get_by_pos(struct btree_trans *trans, + struct btree_iter *iter, + struct bpos pos) +{ + return bch2_bkey_get_iter_typed(trans, iter, BTREE_ID_dirents, pos, 0, dirent); +} + +static bool inode_points_to_dirent(struct bch_inode_unpacked *inode, + struct bkey_s_c_dirent d) +{ + return inode->bi_dir == d.k->p.inode && + inode->bi_dir_offset == d.k->p.offset; +} + +static bool dirent_points_to_inode(struct bkey_s_c_dirent d, + struct bch_inode_unpacked *inode) +{ + return d.v->d_type == DT_SUBVOL + ? le32_to_cpu(d.v->d_child_subvol) == inode->bi_subvol + : le64_to_cpu(d.v->d_inum) == inode->bi_inum; +} + +static int inode_backpointer_exists(struct btree_trans *trans, + struct bch_inode_unpacked *inode, + u32 snapshot) +{ + struct btree_iter iter; + struct bkey_s_c_dirent d; + int ret; + + d = dirent_get_by_pos(trans, &iter, + SPOS(inode->bi_dir, inode->bi_dir_offset, snapshot)); + ret = bkey_err(d); + if (ret) + return bch2_err_matches(ret, ENOENT) ? 0 : ret; + + ret = dirent_points_to_inode(d, inode); + bch2_trans_iter_exit(trans, &iter); + return ret; +} + +static int check_i_sectors(struct btree_trans *trans, struct inode_walker *w) +{ + struct bch_fs *c = trans->c; + struct inode_walker_entry *i; + u32 restart_count = trans->restart_count; + int ret = 0; + s64 count2; + + darray_for_each(w->inodes, i) { + if (i->inode.bi_sectors == i->count) + continue; + + count2 = bch2_count_inode_sectors(trans, w->last_pos.inode, i->snapshot); + + if (w->recalculate_sums) + i->count = count2; + + if (i->count != count2) { + bch_err(c, "fsck counted i_sectors wrong for inode %llu:%u: got %llu should be %llu", + w->last_pos.inode, i->snapshot, i->count, count2); + return -BCH_ERR_internal_fsck_err; + } + + if (fsck_err_on(!(i->inode.bi_flags & BCH_INODE_i_sectors_dirty), + c, inode_i_sectors_wrong, + "inode %llu:%u has incorrect i_sectors: got %llu, should be %llu", + w->last_pos.inode, i->snapshot, + i->inode.bi_sectors, i->count)) { + i->inode.bi_sectors = i->count; + ret = fsck_write_inode(trans, &i->inode, i->snapshot); + if (ret) + break; + } + } +fsck_err: + bch_err_fn(c, ret); + return ret ?: trans_was_restarted(trans, restart_count); +} + +struct extent_end { + u32 snapshot; + u64 offset; + struct snapshots_seen seen; +}; + +struct extent_ends { + struct bpos last_pos; + DARRAY(struct extent_end) e; +}; + +static void extent_ends_reset(struct extent_ends *extent_ends) +{ + struct extent_end *i; + + darray_for_each(extent_ends->e, i) + snapshots_seen_exit(&i->seen); + + extent_ends->e.nr = 0; +} + +static void extent_ends_exit(struct extent_ends *extent_ends) +{ + extent_ends_reset(extent_ends); + darray_exit(&extent_ends->e); +} + +static void extent_ends_init(struct extent_ends *extent_ends) +{ + memset(extent_ends, 0, sizeof(*extent_ends)); +} + +static int extent_ends_at(struct bch_fs *c, + struct extent_ends *extent_ends, + struct snapshots_seen *seen, + struct bkey_s_c k) +{ + struct extent_end *i, n = (struct extent_end) { + .offset = k.k->p.offset, + .snapshot = k.k->p.snapshot, + .seen = *seen, + }; + + n.seen.ids.data = kmemdup(seen->ids.data, + sizeof(seen->ids.data[0]) * seen->ids.size, + GFP_KERNEL); + if (!n.seen.ids.data) + return -BCH_ERR_ENOMEM_fsck_extent_ends_at; + + darray_for_each(extent_ends->e, i) { + if (i->snapshot == k.k->p.snapshot) { + snapshots_seen_exit(&i->seen); + *i = n; + return 0; + } + + if (i->snapshot >= k.k->p.snapshot) + break; + } + + return darray_insert_item(&extent_ends->e, i - extent_ends->e.data, n); +} + +static int overlapping_extents_found(struct btree_trans *trans, + enum btree_id btree, + struct bpos pos1, struct snapshots_seen *pos1_seen, + struct bkey pos2, + bool *fixed, + struct extent_end *extent_end) +{ + struct bch_fs *c = trans->c; + struct printbuf buf = PRINTBUF; + struct btree_iter iter1, iter2 = { NULL }; + struct bkey_s_c k1, k2; + int ret; + + BUG_ON(bkey_le(pos1, bkey_start_pos(&pos2))); + + bch2_trans_iter_init(trans, &iter1, btree, pos1, + BTREE_ITER_ALL_SNAPSHOTS| + BTREE_ITER_NOT_EXTENTS); + k1 = bch2_btree_iter_peek_upto(&iter1, POS(pos1.inode, U64_MAX)); + ret = bkey_err(k1); + if (ret) + goto err; + + prt_str(&buf, "\n "); + bch2_bkey_val_to_text(&buf, c, k1); + + if (!bpos_eq(pos1, k1.k->p)) { + prt_str(&buf, "\n wanted\n "); + bch2_bpos_to_text(&buf, pos1); + prt_str(&buf, "\n "); + bch2_bkey_to_text(&buf, &pos2); + + bch_err(c, "%s: error finding first overlapping extent when repairing, got%s", + __func__, buf.buf); + ret = -BCH_ERR_internal_fsck_err; + goto err; + } + + bch2_trans_copy_iter(&iter2, &iter1); + + while (1) { + bch2_btree_iter_advance(&iter2); + + k2 = bch2_btree_iter_peek_upto(&iter2, POS(pos1.inode, U64_MAX)); + ret = bkey_err(k2); + if (ret) + goto err; + + if (bpos_ge(k2.k->p, pos2.p)) + break; + } + + prt_str(&buf, "\n "); + bch2_bkey_val_to_text(&buf, c, k2); + + if (bpos_gt(k2.k->p, pos2.p) || + pos2.size != k2.k->size) { + bch_err(c, "%s: error finding seconding overlapping extent when repairing%s", + __func__, buf.buf); + ret = -BCH_ERR_internal_fsck_err; + goto err; + } + + prt_printf(&buf, "\n overwriting %s extent", + pos1.snapshot >= pos2.p.snapshot ? "first" : "second"); + + if (fsck_err(c, extent_overlapping, + "overlapping extents%s", buf.buf)) { + struct btree_iter *old_iter = &iter1; + struct disk_reservation res = { 0 }; + + if (pos1.snapshot < pos2.p.snapshot) { + old_iter = &iter2; + swap(k1, k2); + } + + trans->extra_journal_res += bch2_bkey_sectors_compressed(k2); + + ret = bch2_trans_update_extent_overwrite(trans, old_iter, + BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE, + k1, k2) ?: + bch2_trans_commit(trans, &res, NULL, + BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL); + bch2_disk_reservation_put(c, &res); + + if (ret) + goto err; + + *fixed = true; + + if (pos1.snapshot == pos2.p.snapshot) { + /* + * We overwrote the first extent, and did the overwrite + * in the same snapshot: + */ + extent_end->offset = bkey_start_offset(&pos2); + } else if (pos1.snapshot > pos2.p.snapshot) { + /* + * We overwrote the first extent in pos2's snapshot: + */ + ret = snapshots_seen_add_inorder(c, pos1_seen, pos2.p.snapshot); + } else { + /* + * We overwrote the second extent - restart + * check_extent() from the top: + */ + ret = -BCH_ERR_transaction_restart_nested; + } + } +fsck_err: +err: + bch2_trans_iter_exit(trans, &iter2); + bch2_trans_iter_exit(trans, &iter1); + printbuf_exit(&buf); + return ret; +} + +static int check_overlapping_extents(struct btree_trans *trans, + struct snapshots_seen *seen, + struct extent_ends *extent_ends, + struct bkey_s_c k, + u32 equiv, + struct btree_iter *iter, + bool *fixed) +{ + struct bch_fs *c = trans->c; + struct extent_end *i; + int ret = 0; + + /* transaction restart, running again */ + if (bpos_eq(extent_ends->last_pos, k.k->p)) + return 0; + + if (extent_ends->last_pos.inode != k.k->p.inode) + extent_ends_reset(extent_ends); + + darray_for_each(extent_ends->e, i) { + if (i->offset <= bkey_start_offset(k.k)) + continue; + + if (!ref_visible2(c, + k.k->p.snapshot, seen, + i->snapshot, &i->seen)) + continue; + + ret = overlapping_extents_found(trans, iter->btree_id, + SPOS(iter->pos.inode, + i->offset, + i->snapshot), + &i->seen, + *k.k, fixed, i); + if (ret) + goto err; + } + + ret = extent_ends_at(c, extent_ends, seen, k); + if (ret) + goto err; + + extent_ends->last_pos = k.k->p; +err: + return ret; +} + +static int check_extent_overbig(struct btree_trans *trans, struct btree_iter *iter, + struct bkey_s_c k) +{ + struct bch_fs *c = trans->c; + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + struct bch_extent_crc_unpacked crc; + const union bch_extent_entry *i; + unsigned encoded_extent_max_sectors = c->opts.encoded_extent_max >> 9; + + bkey_for_each_crc(k.k, ptrs, crc, i) + if (crc_is_encoded(crc) && + crc.uncompressed_size > encoded_extent_max_sectors) { + struct printbuf buf = PRINTBUF; + + bch2_bkey_val_to_text(&buf, c, k); + bch_err(c, "overbig encoded extent, please report this:\n %s", buf.buf); + printbuf_exit(&buf); + } + + return 0; +} + +static int check_extent(struct btree_trans *trans, struct btree_iter *iter, + struct bkey_s_c k, + struct inode_walker *inode, + struct snapshots_seen *s, + struct extent_ends *extent_ends) +{ + struct bch_fs *c = trans->c; + struct inode_walker_entry *i; + struct printbuf buf = PRINTBUF; + struct bpos equiv = k.k->p; + int ret = 0; + + equiv.snapshot = bch2_snapshot_equiv(c, k.k->p.snapshot); + + ret = check_key_has_snapshot(trans, iter, k); + if (ret) { + ret = ret < 0 ? ret : 0; + goto out; + } + + if (inode->last_pos.inode != k.k->p.inode) { + ret = check_i_sectors(trans, inode); + if (ret) + goto err; + } + + i = walk_inode(trans, inode, equiv, k.k->type == KEY_TYPE_whiteout); + ret = PTR_ERR_OR_ZERO(i); + if (ret) + goto err; + + ret = snapshots_seen_update(c, s, iter->btree_id, k.k->p); + if (ret) + goto err; + + if (k.k->type != KEY_TYPE_whiteout) { + if (fsck_err_on(!i, c, extent_in_missing_inode, + "extent in missing inode:\n %s", + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, k), buf.buf))) + goto delete; + + if (fsck_err_on(i && + !S_ISREG(i->inode.bi_mode) && + !S_ISLNK(i->inode.bi_mode), + c, extent_in_non_reg_inode, + "extent in non regular inode mode %o:\n %s", + i->inode.bi_mode, + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, k), buf.buf))) + goto delete; + + ret = check_overlapping_extents(trans, s, extent_ends, k, + equiv.snapshot, iter, + &inode->recalculate_sums); + if (ret) + goto err; + } + + /* + * Check inodes in reverse order, from oldest snapshots to newest, + * starting from the inode that matches this extent's snapshot. If we + * didn't have one, iterate over all inodes: + */ + if (!i) + i = inode->inodes.data + inode->inodes.nr - 1; + + for (; + inode->inodes.data && i >= inode->inodes.data; + --i) { + if (i->snapshot > equiv.snapshot || + !key_visible_in_snapshot(c, s, i->snapshot, equiv.snapshot)) + continue; + + if (k.k->type != KEY_TYPE_whiteout) { + if (fsck_err_on(!(i->inode.bi_flags & BCH_INODE_i_size_dirty) && + k.k->p.offset > round_up(i->inode.bi_size, block_bytes(c)) >> 9 && + !bkey_extent_is_reservation(k), + c, extent_past_end_of_inode, + "extent type past end of inode %llu:%u, i_size %llu\n %s", + i->inode.bi_inum, i->snapshot, i->inode.bi_size, + (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { + struct btree_iter iter2; + + bch2_trans_copy_iter(&iter2, iter); + bch2_btree_iter_set_snapshot(&iter2, i->snapshot); + ret = bch2_btree_iter_traverse(&iter2) ?: + bch2_btree_delete_at(trans, &iter2, + BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); + bch2_trans_iter_exit(trans, &iter2); + if (ret) + goto err; + + iter->k.type = KEY_TYPE_whiteout; + } + + if (bkey_extent_is_allocation(k.k)) + i->count += k.k->size; + } + + i->seen_this_pos = true; + } +out: +err: +fsck_err: + printbuf_exit(&buf); + bch_err_fn(c, ret); + return ret; +delete: + ret = bch2_btree_delete_at(trans, iter, BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); + goto out; +} + +/* + * Walk extents: verify that extents have a corresponding S_ISREG inode, and + * that i_size an i_sectors are consistent + */ +int bch2_check_extents(struct bch_fs *c) +{ + struct inode_walker w = inode_walker_init(); + struct snapshots_seen s; + struct btree_trans *trans = bch2_trans_get(c); + struct btree_iter iter; + struct bkey_s_c k; + struct extent_ends extent_ends; + struct disk_reservation res = { 0 }; + int ret = 0; + + snapshots_seen_init(&s); + extent_ends_init(&extent_ends); + + ret = for_each_btree_key_commit(trans, iter, BTREE_ID_extents, + POS(BCACHEFS_ROOT_INO, 0), + BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k, + &res, NULL, + BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL, ({ + bch2_disk_reservation_put(c, &res); + check_extent(trans, &iter, k, &w, &s, &extent_ends) ?: + check_extent_overbig(trans, &iter, k); + })) ?: + check_i_sectors(trans, &w); + + bch2_disk_reservation_put(c, &res); + extent_ends_exit(&extent_ends); + inode_walker_exit(&w); + snapshots_seen_exit(&s); + bch2_trans_put(trans); + + bch_err_fn(c, ret); + return ret; +} + +int bch2_check_indirect_extents(struct bch_fs *c) +{ + struct btree_trans *trans = bch2_trans_get(c); + struct btree_iter iter; + struct bkey_s_c k; + struct disk_reservation res = { 0 }; + int ret = 0; + + ret = for_each_btree_key_commit(trans, iter, BTREE_ID_reflink, + POS_MIN, + BTREE_ITER_PREFETCH, k, + &res, NULL, + BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL, ({ + bch2_disk_reservation_put(c, &res); + check_extent_overbig(trans, &iter, k); + })); + + bch2_disk_reservation_put(c, &res); + bch2_trans_put(trans); + + bch_err_fn(c, ret); + return ret; +} + +static int check_subdir_count(struct btree_trans *trans, struct inode_walker *w) +{ + struct bch_fs *c = trans->c; + struct inode_walker_entry *i; + u32 restart_count = trans->restart_count; + int ret = 0; + s64 count2; + + darray_for_each(w->inodes, i) { + if (i->inode.bi_nlink == i->count) + continue; + + count2 = bch2_count_subdirs(trans, w->last_pos.inode, i->snapshot); + if (count2 < 0) + return count2; + + if (i->count != count2) { + bch_err(c, "fsck counted subdirectories wrong: got %llu should be %llu", + i->count, count2); + i->count = count2; + if (i->inode.bi_nlink == i->count) + continue; + } + + if (fsck_err_on(i->inode.bi_nlink != i->count, + c, inode_dir_wrong_nlink, + "directory %llu:%u with wrong i_nlink: got %u, should be %llu", + w->last_pos.inode, i->snapshot, i->inode.bi_nlink, i->count)) { + i->inode.bi_nlink = i->count; + ret = fsck_write_inode(trans, &i->inode, i->snapshot); + if (ret) + break; + } + } +fsck_err: + bch_err_fn(c, ret); + return ret ?: trans_was_restarted(trans, restart_count); +} + +static int check_dirent_target(struct btree_trans *trans, + struct btree_iter *iter, + struct bkey_s_c_dirent d, + struct bch_inode_unpacked *target, + u32 target_snapshot) +{ + struct bch_fs *c = trans->c; + struct bkey_i_dirent *n; + bool backpointer_exists = true; + struct printbuf buf = PRINTBUF; + int ret = 0; + + if (!target->bi_dir && + !target->bi_dir_offset) { + target->bi_dir = d.k->p.inode; + target->bi_dir_offset = d.k->p.offset; + + ret = __write_inode(trans, target, target_snapshot); + if (ret) + goto err; + } + + if (!inode_points_to_dirent(target, d)) { + ret = inode_backpointer_exists(trans, target, d.k->p.snapshot); + if (ret < 0) + goto err; + + backpointer_exists = ret; + ret = 0; + + if (fsck_err_on(S_ISDIR(target->bi_mode) && backpointer_exists, + c, inode_dir_multiple_links, + "directory %llu with multiple links", + target->bi_inum)) { + ret = __remove_dirent(trans, d.k->p); + goto out; + } + + if (fsck_err_on(backpointer_exists && !target->bi_nlink, + c, inode_multiple_links_but_nlink_0, + "inode %llu type %s has multiple links but i_nlink 0", + target->bi_inum, bch2_d_types[d.v->d_type])) { + target->bi_nlink++; + target->bi_flags &= ~BCH_INODE_unlinked; + + ret = __write_inode(trans, target, target_snapshot); + if (ret) + goto err; + } + + if (fsck_err_on(!backpointer_exists, + c, inode_wrong_backpointer, + "inode %llu:%u has wrong backpointer:\n" + "got %llu:%llu\n" + "should be %llu:%llu", + target->bi_inum, target_snapshot, + target->bi_dir, + target->bi_dir_offset, + d.k->p.inode, + d.k->p.offset)) { + target->bi_dir = d.k->p.inode; + target->bi_dir_offset = d.k->p.offset; + + ret = __write_inode(trans, target, target_snapshot); + if (ret) + goto err; + } + } + + if (fsck_err_on(d.v->d_type != inode_d_type(target), + c, dirent_d_type_wrong, + "incorrect d_type: got %s, should be %s:\n%s", + bch2_d_type_str(d.v->d_type), + bch2_d_type_str(inode_d_type(target)), + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf))) { + n = bch2_trans_kmalloc(trans, bkey_bytes(d.k)); + ret = PTR_ERR_OR_ZERO(n); + if (ret) + goto err; + + bkey_reassemble(&n->k_i, d.s_c); + n->v.d_type = inode_d_type(target); + + ret = bch2_trans_update(trans, iter, &n->k_i, 0); + if (ret) + goto err; + + d = dirent_i_to_s_c(n); + } + + if (d.v->d_type == DT_SUBVOL && + target->bi_parent_subvol != le32_to_cpu(d.v->d_parent_subvol) && + (c->sb.version < bcachefs_metadata_version_subvol_dirent || + fsck_err(c, dirent_d_parent_subvol_wrong, + "dirent has wrong d_parent_subvol field: got %u, should be %u", + le32_to_cpu(d.v->d_parent_subvol), + target->bi_parent_subvol))) { + n = bch2_trans_kmalloc(trans, bkey_bytes(d.k)); + ret = PTR_ERR_OR_ZERO(n); + if (ret) + goto err; + + bkey_reassemble(&n->k_i, d.s_c); + n->v.d_parent_subvol = cpu_to_le32(target->bi_parent_subvol); + + ret = bch2_trans_update(trans, iter, &n->k_i, 0); + if (ret) + goto err; + + d = dirent_i_to_s_c(n); + } +out: +err: +fsck_err: + printbuf_exit(&buf); + bch_err_fn(c, ret); + return ret; +} + +static int check_dirent(struct btree_trans *trans, struct btree_iter *iter, + struct bkey_s_c k, + struct bch_hash_info *hash_info, + struct inode_walker *dir, + struct inode_walker *target, + struct snapshots_seen *s) +{ + struct bch_fs *c = trans->c; + struct bkey_s_c_dirent d; + struct inode_walker_entry *i; + struct printbuf buf = PRINTBUF; + struct bpos equiv; + int ret = 0; + + ret = check_key_has_snapshot(trans, iter, k); + if (ret) { + ret = ret < 0 ? ret : 0; + goto out; + } + + equiv = k.k->p; + equiv.snapshot = bch2_snapshot_equiv(c, k.k->p.snapshot); + + ret = snapshots_seen_update(c, s, iter->btree_id, k.k->p); + if (ret) + goto err; + + if (k.k->type == KEY_TYPE_whiteout) + goto out; + + if (dir->last_pos.inode != k.k->p.inode) { + ret = check_subdir_count(trans, dir); + if (ret) + goto err; + } + + BUG_ON(!iter->path->should_be_locked); + + i = walk_inode(trans, dir, equiv, k.k->type == KEY_TYPE_whiteout); + ret = PTR_ERR_OR_ZERO(i); + if (ret < 0) + goto err; + + if (dir->first_this_inode && dir->inodes.nr) + *hash_info = bch2_hash_info_init(c, &dir->inodes.data[0].inode); + dir->first_this_inode = false; + + if (fsck_err_on(!i, c, dirent_in_missing_dir_inode, + "dirent in nonexisting directory:\n%s", + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { + ret = bch2_btree_delete_at(trans, iter, + BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); + goto out; + } + + if (!i) + goto out; + + if (fsck_err_on(!S_ISDIR(i->inode.bi_mode), + c, dirent_in_non_dir_inode, + "dirent in non directory inode type %s:\n%s", + bch2_d_type_str(inode_d_type(&i->inode)), + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { + ret = bch2_btree_delete_at(trans, iter, 0); + goto out; + } + + ret = hash_check_key(trans, bch2_dirent_hash_desc, hash_info, iter, k); + if (ret < 0) + goto err; + if (ret) { + /* dirent has been deleted */ + ret = 0; + goto out; + } + + if (k.k->type != KEY_TYPE_dirent) + goto out; + + d = bkey_s_c_to_dirent(k); + + if (d.v->d_type == DT_SUBVOL) { + struct bch_inode_unpacked subvol_root; + u32 target_subvol = le32_to_cpu(d.v->d_child_subvol); + u32 target_snapshot; + u64 target_inum; + + ret = __subvol_lookup(trans, target_subvol, + &target_snapshot, &target_inum); + if (ret && !bch2_err_matches(ret, ENOENT)) + goto err; + + if (fsck_err_on(ret, c, dirent_to_missing_subvol, + "dirent points to missing subvolume %u", + le32_to_cpu(d.v->d_child_subvol))) { + ret = __remove_dirent(trans, d.k->p); + goto err; + } + + ret = __lookup_inode(trans, target_inum, + &subvol_root, &target_snapshot); + if (ret && !bch2_err_matches(ret, ENOENT)) + goto err; + + if (fsck_err_on(ret, c, subvol_to_missing_root, + "subvolume %u points to missing subvolume root %llu", + target_subvol, + target_inum)) { + bch_err(c, "repair not implemented yet"); + ret = -EINVAL; + goto err; + } + + if (fsck_err_on(subvol_root.bi_subvol != target_subvol, + c, subvol_root_wrong_bi_subvol, + "subvol root %llu has wrong bi_subvol field: got %u, should be %u", + target_inum, + subvol_root.bi_subvol, target_subvol)) { + subvol_root.bi_subvol = target_subvol; + ret = __write_inode(trans, &subvol_root, target_snapshot); + if (ret) + goto err; + } + + ret = check_dirent_target(trans, iter, d, &subvol_root, + target_snapshot); + if (ret) + goto err; + } else { + ret = __get_visible_inodes(trans, target, s, le64_to_cpu(d.v->d_inum)); + if (ret) + goto err; + + if (fsck_err_on(!target->inodes.nr, + c, dirent_to_missing_inode, + "dirent points to missing inode: (equiv %u)\n%s", + equiv.snapshot, + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, k), + buf.buf))) { + ret = __remove_dirent(trans, d.k->p); + if (ret) + goto err; + } + + darray_for_each(target->inodes, i) { + ret = check_dirent_target(trans, iter, d, + &i->inode, i->snapshot); + if (ret) + goto err; + } + } + + if (d.v->d_type == DT_DIR) + for_each_visible_inode(c, s, dir, equiv.snapshot, i) + i->count++; + +out: +err: +fsck_err: + printbuf_exit(&buf); + bch_err_fn(c, ret); + return ret; +} + +/* + * Walk dirents: verify that they all have a corresponding S_ISDIR inode, + * validate d_type + */ +int bch2_check_dirents(struct bch_fs *c) +{ + struct inode_walker dir = inode_walker_init(); + struct inode_walker target = inode_walker_init(); + struct snapshots_seen s; + struct bch_hash_info hash_info; + struct btree_trans *trans = bch2_trans_get(c); + struct btree_iter iter; + struct bkey_s_c k; + int ret = 0; + + snapshots_seen_init(&s); + + ret = for_each_btree_key_commit(trans, iter, BTREE_ID_dirents, + POS(BCACHEFS_ROOT_INO, 0), + BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, + k, + NULL, NULL, + BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL, + check_dirent(trans, &iter, k, &hash_info, &dir, &target, &s)); + + bch2_trans_put(trans); + snapshots_seen_exit(&s); + inode_walker_exit(&dir); + inode_walker_exit(&target); + bch_err_fn(c, ret); + return ret; +} + +static int check_xattr(struct btree_trans *trans, struct btree_iter *iter, + struct bkey_s_c k, + struct bch_hash_info *hash_info, + struct inode_walker *inode) +{ + struct bch_fs *c = trans->c; + struct inode_walker_entry *i; + int ret; + + ret = check_key_has_snapshot(trans, iter, k); + if (ret) + return ret; + + i = walk_inode(trans, inode, k.k->p, k.k->type == KEY_TYPE_whiteout); + ret = PTR_ERR_OR_ZERO(i); + if (ret) + return ret; + + if (inode->first_this_inode && inode->inodes.nr) + *hash_info = bch2_hash_info_init(c, &inode->inodes.data[0].inode); + inode->first_this_inode = false; + + if (fsck_err_on(!i, c, xattr_in_missing_inode, + "xattr for missing inode %llu", + k.k->p.inode)) + return bch2_btree_delete_at(trans, iter, 0); + + if (!i) + return 0; + + ret = hash_check_key(trans, bch2_xattr_hash_desc, hash_info, iter, k); +fsck_err: + bch_err_fn(c, ret); + return ret; +} + +/* + * Walk xattrs: verify that they all have a corresponding inode + */ +int bch2_check_xattrs(struct bch_fs *c) +{ + struct inode_walker inode = inode_walker_init(); + struct bch_hash_info hash_info; + struct btree_iter iter; + struct bkey_s_c k; + int ret = 0; + + ret = bch2_trans_run(c, + for_each_btree_key_commit(trans, iter, BTREE_ID_xattrs, + POS(BCACHEFS_ROOT_INO, 0), + BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, + k, + NULL, NULL, + BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL, + check_xattr(trans, &iter, k, &hash_info, &inode))); + bch_err_fn(c, ret); + return ret; +} + +static int check_root_trans(struct btree_trans *trans) +{ + struct bch_fs *c = trans->c; + struct bch_inode_unpacked root_inode; + u32 snapshot; + u64 inum; + int ret; + + ret = __subvol_lookup(trans, BCACHEFS_ROOT_SUBVOL, &snapshot, &inum); + if (ret && !bch2_err_matches(ret, ENOENT)) + return ret; + + if (mustfix_fsck_err_on(ret, c, root_subvol_missing, + "root subvol missing")) { + struct bkey_i_subvolume root_subvol; + + snapshot = U32_MAX; + inum = BCACHEFS_ROOT_INO; + + bkey_subvolume_init(&root_subvol.k_i); + root_subvol.k.p.offset = BCACHEFS_ROOT_SUBVOL; + root_subvol.v.flags = 0; + root_subvol.v.snapshot = cpu_to_le32(snapshot); + root_subvol.v.inode = cpu_to_le64(inum); + ret = commit_do(trans, NULL, NULL, + BTREE_INSERT_NOFAIL| + BTREE_INSERT_LAZY_RW, + bch2_btree_insert_trans(trans, BTREE_ID_subvolumes, + &root_subvol.k_i, 0)); + bch_err_msg(c, ret, "writing root subvol"); + if (ret) + goto err; + + } + + ret = __lookup_inode(trans, BCACHEFS_ROOT_INO, &root_inode, &snapshot); + if (ret && !bch2_err_matches(ret, ENOENT)) + return ret; + + if (mustfix_fsck_err_on(ret, c, root_dir_missing, + "root directory missing") || + mustfix_fsck_err_on(!S_ISDIR(root_inode.bi_mode), + c, root_inode_not_dir, + "root inode not a directory")) { + bch2_inode_init(c, &root_inode, 0, 0, S_IFDIR|0755, + 0, NULL); + root_inode.bi_inum = inum; + + ret = __write_inode(trans, &root_inode, snapshot); + bch_err_msg(c, ret, "writing root inode"); + } +err: +fsck_err: + return ret; +} + +/* Get root directory, create if it doesn't exist: */ +int bch2_check_root(struct bch_fs *c) +{ + int ret; + + ret = bch2_trans_do(c, NULL, NULL, + BTREE_INSERT_NOFAIL| + BTREE_INSERT_LAZY_RW, + check_root_trans(trans)); + bch_err_fn(c, ret); + return ret; +} + +struct pathbuf_entry { + u64 inum; + u32 snapshot; +}; + +typedef DARRAY(struct pathbuf_entry) pathbuf; + +static bool path_is_dup(pathbuf *p, u64 inum, u32 snapshot) +{ + struct pathbuf_entry *i; + + darray_for_each(*p, i) + if (i->inum == inum && + i->snapshot == snapshot) + return true; + + return false; +} + +static int path_down(struct bch_fs *c, pathbuf *p, + u64 inum, u32 snapshot) +{ + int ret = darray_push(p, ((struct pathbuf_entry) { + .inum = inum, + .snapshot = snapshot, + })); + + if (ret) + bch_err(c, "fsck: error allocating memory for pathbuf, size %zu", + p->size); + return ret; +} + +/* + * Check that a given inode is reachable from the root: + * + * XXX: we should also be verifying that inodes are in the right subvolumes + */ +static int check_path(struct btree_trans *trans, + pathbuf *p, + struct bch_inode_unpacked *inode, + u32 snapshot) +{ + struct bch_fs *c = trans->c; + int ret = 0; + + snapshot = bch2_snapshot_equiv(c, snapshot); + p->nr = 0; + + while (!(inode->bi_inum == BCACHEFS_ROOT_INO && + inode->bi_subvol == BCACHEFS_ROOT_SUBVOL)) { + struct btree_iter dirent_iter; + struct bkey_s_c_dirent d; + u32 parent_snapshot = snapshot; + + if (inode->bi_subvol) { + u64 inum; + + ret = subvol_lookup(trans, inode->bi_parent_subvol, + &parent_snapshot, &inum); + if (ret) + break; + } + + ret = lockrestart_do(trans, + PTR_ERR_OR_ZERO((d = dirent_get_by_pos(trans, &dirent_iter, + SPOS(inode->bi_dir, inode->bi_dir_offset, + parent_snapshot))).k)); + if (ret && !bch2_err_matches(ret, ENOENT)) + break; + + if (!ret && !dirent_points_to_inode(d, inode)) { + bch2_trans_iter_exit(trans, &dirent_iter); + ret = -BCH_ERR_ENOENT_dirent_doesnt_match_inode; + } + + if (bch2_err_matches(ret, ENOENT)) { + if (fsck_err(c, inode_unreachable, + "unreachable inode %llu:%u, type %s nlink %u backptr %llu:%llu", + inode->bi_inum, snapshot, + bch2_d_type_str(inode_d_type(inode)), + inode->bi_nlink, + inode->bi_dir, + inode->bi_dir_offset)) + ret = reattach_inode(trans, inode, snapshot); + break; + } + + bch2_trans_iter_exit(trans, &dirent_iter); + + if (!S_ISDIR(inode->bi_mode)) + break; + + ret = path_down(c, p, inode->bi_inum, snapshot); + if (ret) { + bch_err(c, "memory allocation failure"); + return ret; + } + + snapshot = parent_snapshot; + + ret = lookup_inode(trans, inode->bi_dir, inode, &snapshot); + if (ret) { + /* Should have been caught in dirents pass */ + bch_err(c, "error looking up parent directory: %i", ret); + break; + } + + if (path_is_dup(p, inode->bi_inum, snapshot)) { + struct pathbuf_entry *i; + + /* XXX print path */ + bch_err(c, "directory structure loop"); + + darray_for_each(*p, i) + pr_err("%llu:%u", i->inum, i->snapshot); + pr_err("%llu:%u", inode->bi_inum, snapshot); + + if (!fsck_err(c, dir_loop, + "directory structure loop")) + return 0; + + ret = commit_do(trans, NULL, NULL, + BTREE_INSERT_NOFAIL| + BTREE_INSERT_LAZY_RW, + remove_backpointer(trans, inode)); + if (ret) { + bch_err(c, "error removing dirent: %i", ret); + break; + } + + ret = reattach_inode(trans, inode, snapshot); + } + } +fsck_err: + bch_err_fn(c, ret); + return ret; +} + +/* + * Check for unreachable inodes, as well as loops in the directory structure: + * After bch2_check_dirents(), if an inode backpointer doesn't exist that means it's + * unreachable: + */ +int bch2_check_directory_structure(struct bch_fs *c) +{ + struct btree_trans *trans = bch2_trans_get(c); + struct btree_iter iter; + struct bkey_s_c k; + struct bch_inode_unpacked u; + pathbuf path = { 0, }; + int ret; + + for_each_btree_key(trans, iter, BTREE_ID_inodes, POS_MIN, + BTREE_ITER_INTENT| + BTREE_ITER_PREFETCH| + BTREE_ITER_ALL_SNAPSHOTS, k, ret) { + if (!bkey_is_inode(k.k)) + continue; + + ret = bch2_inode_unpack(k, &u); + if (ret) { + /* Should have been caught earlier in fsck: */ + bch_err(c, "error unpacking inode %llu: %i", k.k->p.offset, ret); + break; + } + + if (u.bi_flags & BCH_INODE_unlinked) + continue; + + ret = check_path(trans, &path, &u, iter.pos.snapshot); + if (ret) + break; + } + bch2_trans_iter_exit(trans, &iter); + bch2_trans_put(trans); + darray_exit(&path); + bch_err_fn(c, ret); + return ret; +} + +struct nlink_table { + size_t nr; + size_t size; + + struct nlink { + u64 inum; + u32 snapshot; + u32 count; + } *d; +}; + +static int add_nlink(struct bch_fs *c, struct nlink_table *t, + u64 inum, u32 snapshot) +{ + if (t->nr == t->size) { + size_t new_size = max_t(size_t, 128UL, t->size * 2); + void *d = kvmalloc_array(new_size, sizeof(t->d[0]), GFP_KERNEL); + + if (!d) { + bch_err(c, "fsck: error allocating memory for nlink_table, size %zu", + new_size); + return -BCH_ERR_ENOMEM_fsck_add_nlink; + } + + if (t->d) + memcpy(d, t->d, t->size * sizeof(t->d[0])); + kvfree(t->d); + + t->d = d; + t->size = new_size; + } + + + t->d[t->nr++] = (struct nlink) { + .inum = inum, + .snapshot = snapshot, + }; + + return 0; +} + +static int nlink_cmp(const void *_l, const void *_r) +{ + const struct nlink *l = _l; + const struct nlink *r = _r; + + return cmp_int(l->inum, r->inum); +} + +static void inc_link(struct bch_fs *c, struct snapshots_seen *s, + struct nlink_table *links, + u64 range_start, u64 range_end, u64 inum, u32 snapshot) +{ + struct nlink *link, key = { + .inum = inum, .snapshot = U32_MAX, + }; + + if (inum < range_start || inum >= range_end) + return; + + link = __inline_bsearch(&key, links->d, links->nr, + sizeof(links->d[0]), nlink_cmp); + if (!link) + return; + + while (link > links->d && link[0].inum == link[-1].inum) + --link; + + for (; link < links->d + links->nr && link->inum == inum; link++) + if (ref_visible(c, s, snapshot, link->snapshot)) { + link->count++; + if (link->snapshot >= snapshot) + break; + } +} + +noinline_for_stack +static int check_nlinks_find_hardlinks(struct bch_fs *c, + struct nlink_table *t, + u64 start, u64 *end) +{ + struct btree_trans *trans = bch2_trans_get(c); + struct btree_iter iter; + struct bkey_s_c k; + struct bch_inode_unpacked u; + int ret = 0; + + for_each_btree_key(trans, iter, BTREE_ID_inodes, + POS(0, start), + BTREE_ITER_INTENT| + BTREE_ITER_PREFETCH| + BTREE_ITER_ALL_SNAPSHOTS, k, ret) { + if (!bkey_is_inode(k.k)) + continue; + + /* Should never fail, checked by bch2_inode_invalid: */ + BUG_ON(bch2_inode_unpack(k, &u)); + + /* + * Backpointer and directory structure checks are sufficient for + * directories, since they can't have hardlinks: + */ + if (S_ISDIR(u.bi_mode)) + continue; + + if (!u.bi_nlink) + continue; + + ret = add_nlink(c, t, k.k->p.offset, k.k->p.snapshot); + if (ret) { + *end = k.k->p.offset; + ret = 0; + break; + } + + } + bch2_trans_iter_exit(trans, &iter); + bch2_trans_put(trans); + + if (ret) + bch_err(c, "error in fsck: btree error %i while walking inodes", ret); + + return ret; +} + +noinline_for_stack +static int check_nlinks_walk_dirents(struct bch_fs *c, struct nlink_table *links, + u64 range_start, u64 range_end) +{ + struct btree_trans *trans = bch2_trans_get(c); + struct snapshots_seen s; + struct btree_iter iter; + struct bkey_s_c k; + struct bkey_s_c_dirent d; + int ret; + + snapshots_seen_init(&s); + + for_each_btree_key(trans, iter, BTREE_ID_dirents, POS_MIN, + BTREE_ITER_INTENT| + BTREE_ITER_PREFETCH| + BTREE_ITER_ALL_SNAPSHOTS, k, ret) { + ret = snapshots_seen_update(c, &s, iter.btree_id, k.k->p); + if (ret) + break; + + switch (k.k->type) { + case KEY_TYPE_dirent: + d = bkey_s_c_to_dirent(k); + + if (d.v->d_type != DT_DIR && + d.v->d_type != DT_SUBVOL) + inc_link(c, &s, links, range_start, range_end, + le64_to_cpu(d.v->d_inum), + bch2_snapshot_equiv(c, d.k->p.snapshot)); + break; + } + } + bch2_trans_iter_exit(trans, &iter); + + if (ret) + bch_err(c, "error in fsck: btree error %i while walking dirents", ret); + + bch2_trans_put(trans); + snapshots_seen_exit(&s); + return ret; +} + +static int check_nlinks_update_inode(struct btree_trans *trans, struct btree_iter *iter, + struct bkey_s_c k, + struct nlink_table *links, + size_t *idx, u64 range_end) +{ + struct bch_fs *c = trans->c; + struct bch_inode_unpacked u; + struct nlink *link = &links->d[*idx]; + int ret = 0; + + if (k.k->p.offset >= range_end) + return 1; + + if (!bkey_is_inode(k.k)) + return 0; + + BUG_ON(bch2_inode_unpack(k, &u)); + + if (S_ISDIR(u.bi_mode)) + return 0; + + if (!u.bi_nlink) + return 0; + + while ((cmp_int(link->inum, k.k->p.offset) ?: + cmp_int(link->snapshot, k.k->p.snapshot)) < 0) { + BUG_ON(*idx == links->nr); + link = &links->d[++*idx]; + } + + if (fsck_err_on(bch2_inode_nlink_get(&u) != link->count, + c, inode_wrong_nlink, + "inode %llu type %s has wrong i_nlink (%u, should be %u)", + u.bi_inum, bch2_d_types[mode_to_type(u.bi_mode)], + bch2_inode_nlink_get(&u), link->count)) { + bch2_inode_nlink_set(&u, link->count); + ret = __write_inode(trans, &u, k.k->p.snapshot); + } +fsck_err: + return ret; +} + +noinline_for_stack +static int check_nlinks_update_hardlinks(struct bch_fs *c, + struct nlink_table *links, + u64 range_start, u64 range_end) +{ + struct btree_iter iter; + struct bkey_s_c k; + size_t idx = 0; + int ret = 0; + + ret = bch2_trans_run(c, + for_each_btree_key_commit(trans, iter, BTREE_ID_inodes, + POS(0, range_start), + BTREE_ITER_INTENT|BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k, + NULL, NULL, BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL, + check_nlinks_update_inode(trans, &iter, k, links, &idx, range_end))); + if (ret < 0) { + bch_err(c, "error in fsck: btree error %i while walking inodes", ret); + return ret; + } + + return 0; +} + +int bch2_check_nlinks(struct bch_fs *c) +{ + struct nlink_table links = { 0 }; + u64 this_iter_range_start, next_iter_range_start = 0; + int ret = 0; + + do { + this_iter_range_start = next_iter_range_start; + next_iter_range_start = U64_MAX; + + ret = check_nlinks_find_hardlinks(c, &links, + this_iter_range_start, + &next_iter_range_start); + + ret = check_nlinks_walk_dirents(c, &links, + this_iter_range_start, + next_iter_range_start); + if (ret) + break; + + ret = check_nlinks_update_hardlinks(c, &links, + this_iter_range_start, + next_iter_range_start); + if (ret) + break; + + links.nr = 0; + } while (next_iter_range_start != U64_MAX); + + kvfree(links.d); + bch_err_fn(c, ret); + return ret; +} + +static int fix_reflink_p_key(struct btree_trans *trans, struct btree_iter *iter, + struct bkey_s_c k) +{ + struct bkey_s_c_reflink_p p; + struct bkey_i_reflink_p *u; + int ret; + + if (k.k->type != KEY_TYPE_reflink_p) + return 0; + + p = bkey_s_c_to_reflink_p(k); + + if (!p.v->front_pad && !p.v->back_pad) + return 0; + + u = bch2_trans_kmalloc(trans, sizeof(*u)); + ret = PTR_ERR_OR_ZERO(u); + if (ret) + return ret; + + bkey_reassemble(&u->k_i, k); + u->v.front_pad = 0; + u->v.back_pad = 0; + + return bch2_trans_update(trans, iter, &u->k_i, BTREE_TRIGGER_NORUN); +} + +int bch2_fix_reflink_p(struct bch_fs *c) +{ + struct btree_iter iter; + struct bkey_s_c k; + int ret; + + if (c->sb.version >= bcachefs_metadata_version_reflink_p_fix) + return 0; + + ret = bch2_trans_run(c, + for_each_btree_key_commit(trans, iter, + BTREE_ID_extents, POS_MIN, + BTREE_ITER_INTENT|BTREE_ITER_PREFETCH| + BTREE_ITER_ALL_SNAPSHOTS, k, + NULL, NULL, BTREE_INSERT_NOFAIL|BTREE_INSERT_LAZY_RW, + fix_reflink_p_key(trans, &iter, k))); + bch_err_fn(c, ret); + return ret; +} diff --git a/fs/bcachefs/fsck.h b/fs/bcachefs/fsck.h new file mode 100644 index 000000000000..da991e8cf27e --- /dev/null +++ b/fs/bcachefs/fsck.h @@ -0,0 +1,15 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_FSCK_H +#define _BCACHEFS_FSCK_H + +int bch2_check_inodes(struct bch_fs *); +int bch2_check_extents(struct bch_fs *); +int bch2_check_indirect_extents(struct bch_fs *); +int bch2_check_dirents(struct bch_fs *); +int bch2_check_xattrs(struct bch_fs *); +int bch2_check_root(struct bch_fs *); +int bch2_check_directory_structure(struct bch_fs *); +int bch2_check_nlinks(struct bch_fs *); +int bch2_fix_reflink_p(struct bch_fs *); + +#endif /* _BCACHEFS_FSCK_H */ diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c new file mode 100644 index 000000000000..9309cfeecd8d --- /dev/null +++ b/fs/bcachefs/inode.c @@ -0,0 +1,1205 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" +#include "btree_key_cache.h" +#include "btree_write_buffer.h" +#include "bkey_methods.h" +#include "btree_update.h" +#include "buckets.h" +#include "compress.h" +#include "dirent.h" +#include "error.h" +#include "extents.h" +#include "extent_update.h" +#include "inode.h" +#include "str_hash.h" +#include "snapshot.h" +#include "subvolume.h" +#include "varint.h" + +#include <linux/random.h> + +#include <asm/unaligned.h> + +#define x(name, ...) #name, +const char * const bch2_inode_opts[] = { + BCH_INODE_OPTS() + NULL, +}; + +static const char * const bch2_inode_flag_strs[] = { + BCH_INODE_FLAGS() + NULL +}; +#undef x + +static const u8 byte_table[8] = { 1, 2, 3, 4, 6, 8, 10, 13 }; + +static int inode_decode_field(const u8 *in, const u8 *end, + u64 out[2], unsigned *out_bits) +{ + __be64 be[2] = { 0, 0 }; + unsigned bytes, shift; + u8 *p; + + if (in >= end) + return -1; + + if (!*in) + return -1; + + /* + * position of highest set bit indicates number of bytes: + * shift = number of bits to remove in high byte: + */ + shift = 8 - __fls(*in); /* 1 <= shift <= 8 */ + bytes = byte_table[shift - 1]; + + if (in + bytes > end) + return -1; + + p = (u8 *) be + 16 - bytes; + memcpy(p, in, bytes); + *p ^= (1 << 8) >> shift; + + out[0] = be64_to_cpu(be[0]); + out[1] = be64_to_cpu(be[1]); + *out_bits = out[0] ? 64 + fls64(out[0]) : fls64(out[1]); + + return bytes; +} + +static inline void bch2_inode_pack_inlined(struct bkey_inode_buf *packed, + const struct bch_inode_unpacked *inode) +{ + struct bkey_i_inode_v3 *k = &packed->inode; + u8 *out = k->v.fields; + u8 *end = (void *) &packed[1]; + u8 *last_nonzero_field = out; + unsigned nr_fields = 0, last_nonzero_fieldnr = 0; + unsigned bytes; + int ret; + + bkey_inode_v3_init(&packed->inode.k_i); + packed->inode.k.p.offset = inode->bi_inum; + packed->inode.v.bi_journal_seq = cpu_to_le64(inode->bi_journal_seq); + packed->inode.v.bi_hash_seed = inode->bi_hash_seed; + packed->inode.v.bi_flags = cpu_to_le64(inode->bi_flags); + packed->inode.v.bi_sectors = cpu_to_le64(inode->bi_sectors); + packed->inode.v.bi_size = cpu_to_le64(inode->bi_size); + packed->inode.v.bi_version = cpu_to_le64(inode->bi_version); + SET_INODEv3_MODE(&packed->inode.v, inode->bi_mode); + SET_INODEv3_FIELDS_START(&packed->inode.v, INODEv3_FIELDS_START_CUR); + + +#define x(_name, _bits) \ + nr_fields++; \ + \ + if (inode->_name) { \ + ret = bch2_varint_encode_fast(out, inode->_name); \ + out += ret; \ + \ + if (_bits > 64) \ + *out++ = 0; \ + \ + last_nonzero_field = out; \ + last_nonzero_fieldnr = nr_fields; \ + } else { \ + *out++ = 0; \ + \ + if (_bits > 64) \ + *out++ = 0; \ + } + + BCH_INODE_FIELDS_v3() +#undef x + BUG_ON(out > end); + + out = last_nonzero_field; + nr_fields = last_nonzero_fieldnr; + + bytes = out - (u8 *) &packed->inode.v; + set_bkey_val_bytes(&packed->inode.k, bytes); + memset_u64s_tail(&packed->inode.v, 0, bytes); + + SET_INODEv3_NR_FIELDS(&k->v, nr_fields); + + if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) { + struct bch_inode_unpacked unpacked; + + ret = bch2_inode_unpack(bkey_i_to_s_c(&packed->inode.k_i), &unpacked); + BUG_ON(ret); + BUG_ON(unpacked.bi_inum != inode->bi_inum); + BUG_ON(unpacked.bi_hash_seed != inode->bi_hash_seed); + BUG_ON(unpacked.bi_sectors != inode->bi_sectors); + BUG_ON(unpacked.bi_size != inode->bi_size); + BUG_ON(unpacked.bi_version != inode->bi_version); + BUG_ON(unpacked.bi_mode != inode->bi_mode); + +#define x(_name, _bits) if (unpacked._name != inode->_name) \ + panic("unpacked %llu should be %llu", \ + (u64) unpacked._name, (u64) inode->_name); + BCH_INODE_FIELDS_v3() +#undef x + } +} + +void bch2_inode_pack(struct bkey_inode_buf *packed, + const struct bch_inode_unpacked *inode) +{ + bch2_inode_pack_inlined(packed, inode); +} + +static noinline int bch2_inode_unpack_v1(struct bkey_s_c_inode inode, + struct bch_inode_unpacked *unpacked) +{ + const u8 *in = inode.v->fields; + const u8 *end = bkey_val_end(inode); + u64 field[2]; + unsigned fieldnr = 0, field_bits; + int ret; + +#define x(_name, _bits) \ + if (fieldnr++ == INODE_NR_FIELDS(inode.v)) { \ + unsigned offset = offsetof(struct bch_inode_unpacked, _name);\ + memset((void *) unpacked + offset, 0, \ + sizeof(*unpacked) - offset); \ + return 0; \ + } \ + \ + ret = inode_decode_field(in, end, field, &field_bits); \ + if (ret < 0) \ + return ret; \ + \ + if (field_bits > sizeof(unpacked->_name) * 8) \ + return -1; \ + \ + unpacked->_name = field[1]; \ + in += ret; + + BCH_INODE_FIELDS_v2() +#undef x + + /* XXX: signal if there were more fields than expected? */ + return 0; +} + +static int bch2_inode_unpack_v2(struct bch_inode_unpacked *unpacked, + const u8 *in, const u8 *end, + unsigned nr_fields) +{ + unsigned fieldnr = 0; + int ret; + u64 v[2]; + +#define x(_name, _bits) \ + if (fieldnr < nr_fields) { \ + ret = bch2_varint_decode_fast(in, end, &v[0]); \ + if (ret < 0) \ + return ret; \ + in += ret; \ + \ + if (_bits > 64) { \ + ret = bch2_varint_decode_fast(in, end, &v[1]); \ + if (ret < 0) \ + return ret; \ + in += ret; \ + } else { \ + v[1] = 0; \ + } \ + } else { \ + v[0] = v[1] = 0; \ + } \ + \ + unpacked->_name = v[0]; \ + if (v[1] || v[0] != unpacked->_name) \ + return -1; \ + fieldnr++; + + BCH_INODE_FIELDS_v2() +#undef x + + /* XXX: signal if there were more fields than expected? */ + return 0; +} + +static int bch2_inode_unpack_v3(struct bkey_s_c k, + struct bch_inode_unpacked *unpacked) +{ + struct bkey_s_c_inode_v3 inode = bkey_s_c_to_inode_v3(k); + const u8 *in = inode.v->fields; + const u8 *end = bkey_val_end(inode); + unsigned nr_fields = INODEv3_NR_FIELDS(inode.v); + unsigned fieldnr = 0; + int ret; + u64 v[2]; + + unpacked->bi_inum = inode.k->p.offset; + unpacked->bi_journal_seq= le64_to_cpu(inode.v->bi_journal_seq); + unpacked->bi_hash_seed = inode.v->bi_hash_seed; + unpacked->bi_flags = le64_to_cpu(inode.v->bi_flags); + unpacked->bi_sectors = le64_to_cpu(inode.v->bi_sectors); + unpacked->bi_size = le64_to_cpu(inode.v->bi_size); + unpacked->bi_version = le64_to_cpu(inode.v->bi_version); + unpacked->bi_mode = INODEv3_MODE(inode.v); + +#define x(_name, _bits) \ + if (fieldnr < nr_fields) { \ + ret = bch2_varint_decode_fast(in, end, &v[0]); \ + if (ret < 0) \ + return ret; \ + in += ret; \ + \ + if (_bits > 64) { \ + ret = bch2_varint_decode_fast(in, end, &v[1]); \ + if (ret < 0) \ + return ret; \ + in += ret; \ + } else { \ + v[1] = 0; \ + } \ + } else { \ + v[0] = v[1] = 0; \ + } \ + \ + unpacked->_name = v[0]; \ + if (v[1] || v[0] != unpacked->_name) \ + return -1; \ + fieldnr++; + + BCH_INODE_FIELDS_v3() +#undef x + + /* XXX: signal if there were more fields than expected? */ + return 0; +} + +static noinline int bch2_inode_unpack_slowpath(struct bkey_s_c k, + struct bch_inode_unpacked *unpacked) +{ + memset(unpacked, 0, sizeof(*unpacked)); + + switch (k.k->type) { + case KEY_TYPE_inode: { + struct bkey_s_c_inode inode = bkey_s_c_to_inode(k); + + unpacked->bi_inum = inode.k->p.offset; + unpacked->bi_journal_seq= 0; + unpacked->bi_hash_seed = inode.v->bi_hash_seed; + unpacked->bi_flags = le32_to_cpu(inode.v->bi_flags); + unpacked->bi_mode = le16_to_cpu(inode.v->bi_mode); + + if (INODE_NEW_VARINT(inode.v)) { + return bch2_inode_unpack_v2(unpacked, inode.v->fields, + bkey_val_end(inode), + INODE_NR_FIELDS(inode.v)); + } else { + return bch2_inode_unpack_v1(inode, unpacked); + } + break; + } + case KEY_TYPE_inode_v2: { + struct bkey_s_c_inode_v2 inode = bkey_s_c_to_inode_v2(k); + + unpacked->bi_inum = inode.k->p.offset; + unpacked->bi_journal_seq= le64_to_cpu(inode.v->bi_journal_seq); + unpacked->bi_hash_seed = inode.v->bi_hash_seed; + unpacked->bi_flags = le64_to_cpu(inode.v->bi_flags); + unpacked->bi_mode = le16_to_cpu(inode.v->bi_mode); + + return bch2_inode_unpack_v2(unpacked, inode.v->fields, + bkey_val_end(inode), + INODEv2_NR_FIELDS(inode.v)); + } + default: + BUG(); + } +} + +int bch2_inode_unpack(struct bkey_s_c k, + struct bch_inode_unpacked *unpacked) +{ + if (likely(k.k->type == KEY_TYPE_inode_v3)) + return bch2_inode_unpack_v3(k, unpacked); + return bch2_inode_unpack_slowpath(k, unpacked); +} + +static int bch2_inode_peek_nowarn(struct btree_trans *trans, + struct btree_iter *iter, + struct bch_inode_unpacked *inode, + subvol_inum inum, unsigned flags) +{ + struct bkey_s_c k; + u32 snapshot; + int ret; + + ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot); + if (ret) + return ret; + + k = bch2_bkey_get_iter(trans, iter, BTREE_ID_inodes, + SPOS(0, inum.inum, snapshot), + flags|BTREE_ITER_CACHED); + ret = bkey_err(k); + if (ret) + return ret; + + ret = bkey_is_inode(k.k) ? 0 : -BCH_ERR_ENOENT_inode; + if (ret) + goto err; + + ret = bch2_inode_unpack(k, inode); + if (ret) + goto err; + + return 0; +err: + bch2_trans_iter_exit(trans, iter); + return ret; +} + +int bch2_inode_peek(struct btree_trans *trans, + struct btree_iter *iter, + struct bch_inode_unpacked *inode, + subvol_inum inum, unsigned flags) +{ + int ret = bch2_inode_peek_nowarn(trans, iter, inode, inum, flags); + bch_err_msg(trans->c, ret, "looking up inum %u:%llu:", inum.subvol, inum.inum); + return ret; +} + +int bch2_inode_write_flags(struct btree_trans *trans, + struct btree_iter *iter, + struct bch_inode_unpacked *inode, + enum btree_update_flags flags) +{ + struct bkey_inode_buf *inode_p; + + inode_p = bch2_trans_kmalloc(trans, sizeof(*inode_p)); + if (IS_ERR(inode_p)) + return PTR_ERR(inode_p); + + bch2_inode_pack_inlined(inode_p, inode); + inode_p->inode.k.p.snapshot = iter->snapshot; + return bch2_trans_update(trans, iter, &inode_p->inode.k_i, flags); +} + +struct bkey_i *bch2_inode_to_v3(struct btree_trans *trans, struct bkey_i *k) +{ + struct bch_inode_unpacked u; + struct bkey_inode_buf *inode_p; + int ret; + + if (!bkey_is_inode(&k->k)) + return ERR_PTR(-ENOENT); + + inode_p = bch2_trans_kmalloc(trans, sizeof(*inode_p)); + if (IS_ERR(inode_p)) + return ERR_CAST(inode_p); + + ret = bch2_inode_unpack(bkey_i_to_s_c(k), &u); + if (ret) + return ERR_PTR(ret); + + bch2_inode_pack(inode_p, &u); + return &inode_p->inode.k_i; +} + +static int __bch2_inode_invalid(struct bch_fs *c, struct bkey_s_c k, struct printbuf *err) +{ + struct bch_inode_unpacked unpacked; + int ret = 0; + + bkey_fsck_err_on(k.k->p.inode, c, err, + inode_pos_inode_nonzero, + "nonzero k.p.inode"); + + bkey_fsck_err_on(k.k->p.offset < BLOCKDEV_INODE_MAX, c, err, + inode_pos_blockdev_range, + "fs inode in blockdev range"); + + bkey_fsck_err_on(bch2_inode_unpack(k, &unpacked), c, err, + inode_unpack_error, + "invalid variable length fields"); + + bkey_fsck_err_on(unpacked.bi_data_checksum >= BCH_CSUM_OPT_NR + 1, c, err, + inode_checksum_type_invalid, + "invalid data checksum type (%u >= %u", + unpacked.bi_data_checksum, BCH_CSUM_OPT_NR + 1); + + bkey_fsck_err_on(unpacked.bi_compression && + !bch2_compression_opt_valid(unpacked.bi_compression - 1), c, err, + inode_compression_type_invalid, + "invalid compression opt %u", unpacked.bi_compression - 1); + + bkey_fsck_err_on((unpacked.bi_flags & BCH_INODE_unlinked) && + unpacked.bi_nlink != 0, c, err, + inode_unlinked_but_nlink_nonzero, + "flagged as unlinked but bi_nlink != 0"); + + bkey_fsck_err_on(unpacked.bi_subvol && !S_ISDIR(unpacked.bi_mode), c, err, + inode_subvol_root_but_not_dir, + "subvolume root but not a directory"); +fsck_err: + return ret; +} + +int bch2_inode_invalid(struct bch_fs *c, struct bkey_s_c k, + enum bkey_invalid_flags flags, + struct printbuf *err) +{ + struct bkey_s_c_inode inode = bkey_s_c_to_inode(k); + int ret = 0; + + bkey_fsck_err_on(INODE_STR_HASH(inode.v) >= BCH_STR_HASH_NR, c, err, + inode_str_hash_invalid, + "invalid str hash type (%llu >= %u)", + INODE_STR_HASH(inode.v), BCH_STR_HASH_NR); + + ret = __bch2_inode_invalid(c, k, err); +fsck_err: + return ret; +} + +int bch2_inode_v2_invalid(struct bch_fs *c, struct bkey_s_c k, + enum bkey_invalid_flags flags, + struct printbuf *err) +{ + struct bkey_s_c_inode_v2 inode = bkey_s_c_to_inode_v2(k); + int ret = 0; + + bkey_fsck_err_on(INODEv2_STR_HASH(inode.v) >= BCH_STR_HASH_NR, c, err, + inode_str_hash_invalid, + "invalid str hash type (%llu >= %u)", + INODEv2_STR_HASH(inode.v), BCH_STR_HASH_NR); + + ret = __bch2_inode_invalid(c, k, err); +fsck_err: + return ret; +} + +int bch2_inode_v3_invalid(struct bch_fs *c, struct bkey_s_c k, + enum bkey_invalid_flags flags, + struct printbuf *err) +{ + struct bkey_s_c_inode_v3 inode = bkey_s_c_to_inode_v3(k); + int ret = 0; + + bkey_fsck_err_on(INODEv3_FIELDS_START(inode.v) < INODEv3_FIELDS_START_INITIAL || + INODEv3_FIELDS_START(inode.v) > bkey_val_u64s(inode.k), c, err, + inode_v3_fields_start_bad, + "invalid fields_start (got %llu, min %u max %zu)", + INODEv3_FIELDS_START(inode.v), + INODEv3_FIELDS_START_INITIAL, + bkey_val_u64s(inode.k)); + + bkey_fsck_err_on(INODEv3_STR_HASH(inode.v) >= BCH_STR_HASH_NR, c, err, + inode_str_hash_invalid, + "invalid str hash type (%llu >= %u)", + INODEv3_STR_HASH(inode.v), BCH_STR_HASH_NR); + + ret = __bch2_inode_invalid(c, k, err); +fsck_err: + return ret; +} + +static void __bch2_inode_unpacked_to_text(struct printbuf *out, + struct bch_inode_unpacked *inode) +{ + prt_printf(out, "mode=%o ", inode->bi_mode); + + prt_str(out, "flags="); + prt_bitflags(out, bch2_inode_flag_strs, inode->bi_flags & ((1U << 20) - 1)); + prt_printf(out, " (%x)", inode->bi_flags); + + prt_printf(out, " journal_seq=%llu bi_size=%llu bi_sectors=%llu bi_version=%llu", + inode->bi_journal_seq, + inode->bi_size, + inode->bi_sectors, + inode->bi_version); + +#define x(_name, _bits) \ + prt_printf(out, " "#_name "=%llu", (u64) inode->_name); + BCH_INODE_FIELDS_v3() +#undef x +} + +void bch2_inode_unpacked_to_text(struct printbuf *out, struct bch_inode_unpacked *inode) +{ + prt_printf(out, "inum: %llu ", inode->bi_inum); + __bch2_inode_unpacked_to_text(out, inode); +} + +void bch2_inode_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k) +{ + struct bch_inode_unpacked inode; + + if (bch2_inode_unpack(k, &inode)) { + prt_printf(out, "(unpack error)"); + return; + } + + __bch2_inode_unpacked_to_text(out, &inode); +} + +static inline u64 bkey_inode_flags(struct bkey_s_c k) +{ + switch (k.k->type) { + case KEY_TYPE_inode: + return le32_to_cpu(bkey_s_c_to_inode(k).v->bi_flags); + case KEY_TYPE_inode_v2: + return le64_to_cpu(bkey_s_c_to_inode_v2(k).v->bi_flags); + case KEY_TYPE_inode_v3: + return le64_to_cpu(bkey_s_c_to_inode_v3(k).v->bi_flags); + default: + return 0; + } +} + +static inline bool bkey_is_deleted_inode(struct bkey_s_c k) +{ + return bkey_inode_flags(k) & BCH_INODE_unlinked; +} + +int bch2_trans_mark_inode(struct btree_trans *trans, + enum btree_id btree_id, unsigned level, + struct bkey_s_c old, + struct bkey_i *new, + unsigned flags) +{ + int nr = bkey_is_inode(&new->k) - bkey_is_inode(old.k); + bool old_deleted = bkey_is_deleted_inode(old); + bool new_deleted = bkey_is_deleted_inode(bkey_i_to_s_c(new)); + + if (nr) { + int ret = bch2_replicas_deltas_realloc(trans, 0); + struct replicas_delta_list *d = trans->fs_usage_deltas; + + if (ret) + return ret; + + d->nr_inodes += nr; + } + + if (old_deleted != new_deleted) { + int ret = bch2_btree_bit_mod(trans, BTREE_ID_deleted_inodes, new->k.p, new_deleted); + if (ret) + return ret; + } + + return 0; +} + +int bch2_mark_inode(struct btree_trans *trans, + enum btree_id btree_id, unsigned level, + struct bkey_s_c old, struct bkey_s_c new, + unsigned flags) +{ + struct bch_fs *c = trans->c; + struct bch_fs_usage *fs_usage; + u64 journal_seq = trans->journal_res.seq; + + if (flags & BTREE_TRIGGER_INSERT) { + struct bch_inode_v3 *v = (struct bch_inode_v3 *) new.v; + + BUG_ON(!journal_seq); + BUG_ON(new.k->type != KEY_TYPE_inode_v3); + + v->bi_journal_seq = cpu_to_le64(journal_seq); + } + + if (flags & BTREE_TRIGGER_GC) { + percpu_down_read(&c->mark_lock); + preempt_disable(); + + fs_usage = fs_usage_ptr(c, journal_seq, flags & BTREE_TRIGGER_GC); + fs_usage->nr_inodes += bkey_is_inode(new.k); + fs_usage->nr_inodes -= bkey_is_inode(old.k); + + preempt_enable(); + percpu_up_read(&c->mark_lock); + } + return 0; +} + +int bch2_inode_generation_invalid(struct bch_fs *c, struct bkey_s_c k, + enum bkey_invalid_flags flags, + struct printbuf *err) +{ + int ret = 0; + + bkey_fsck_err_on(k.k->p.inode, c, err, + inode_pos_inode_nonzero, + "nonzero k.p.inode"); +fsck_err: + return ret; +} + +void bch2_inode_generation_to_text(struct printbuf *out, struct bch_fs *c, + struct bkey_s_c k) +{ + struct bkey_s_c_inode_generation gen = bkey_s_c_to_inode_generation(k); + + prt_printf(out, "generation: %u", le32_to_cpu(gen.v->bi_generation)); +} + +void bch2_inode_init_early(struct bch_fs *c, + struct bch_inode_unpacked *inode_u) +{ + enum bch_str_hash_type str_hash = + bch2_str_hash_opt_to_type(c, c->opts.str_hash); + + memset(inode_u, 0, sizeof(*inode_u)); + + /* ick */ + inode_u->bi_flags |= str_hash << INODE_STR_HASH_OFFSET; + get_random_bytes(&inode_u->bi_hash_seed, + sizeof(inode_u->bi_hash_seed)); +} + +void bch2_inode_init_late(struct bch_inode_unpacked *inode_u, u64 now, + uid_t uid, gid_t gid, umode_t mode, dev_t rdev, + struct bch_inode_unpacked *parent) +{ + inode_u->bi_mode = mode; + inode_u->bi_uid = uid; + inode_u->bi_gid = gid; + inode_u->bi_dev = rdev; + inode_u->bi_atime = now; + inode_u->bi_mtime = now; + inode_u->bi_ctime = now; + inode_u->bi_otime = now; + + if (parent && parent->bi_mode & S_ISGID) { + inode_u->bi_gid = parent->bi_gid; + if (S_ISDIR(mode)) + inode_u->bi_mode |= S_ISGID; + } + + if (parent) { +#define x(_name, ...) inode_u->bi_##_name = parent->bi_##_name; + BCH_INODE_OPTS() +#undef x + } +} + +void bch2_inode_init(struct bch_fs *c, struct bch_inode_unpacked *inode_u, + uid_t uid, gid_t gid, umode_t mode, dev_t rdev, + struct bch_inode_unpacked *parent) +{ + bch2_inode_init_early(c, inode_u); + bch2_inode_init_late(inode_u, bch2_current_time(c), + uid, gid, mode, rdev, parent); +} + +static inline u32 bkey_generation(struct bkey_s_c k) +{ + switch (k.k->type) { + case KEY_TYPE_inode: + case KEY_TYPE_inode_v2: + BUG(); + case KEY_TYPE_inode_generation: + return le32_to_cpu(bkey_s_c_to_inode_generation(k).v->bi_generation); + default: + return 0; + } +} + +/* + * This just finds an empty slot: + */ +int bch2_inode_create(struct btree_trans *trans, + struct btree_iter *iter, + struct bch_inode_unpacked *inode_u, + u32 snapshot, u64 cpu) +{ + struct bch_fs *c = trans->c; + struct bkey_s_c k; + u64 min, max, start, pos, *hint; + int ret = 0; + unsigned bits = (c->opts.inodes_32bit ? 31 : 63); + + if (c->opts.shard_inode_numbers) { + bits -= c->inode_shard_bits; + + min = (cpu << bits); + max = (cpu << bits) | ~(ULLONG_MAX << bits); + + min = max_t(u64, min, BLOCKDEV_INODE_MAX); + hint = c->unused_inode_hints + cpu; + } else { + min = BLOCKDEV_INODE_MAX; + max = ~(ULLONG_MAX << bits); + hint = c->unused_inode_hints; + } + + start = READ_ONCE(*hint); + + if (start >= max || start < min) + start = min; + + pos = start; + bch2_trans_iter_init(trans, iter, BTREE_ID_inodes, POS(0, pos), + BTREE_ITER_ALL_SNAPSHOTS| + BTREE_ITER_INTENT); +again: + while ((k = bch2_btree_iter_peek(iter)).k && + !(ret = bkey_err(k)) && + bkey_lt(k.k->p, POS(0, max))) { + if (pos < iter->pos.offset) + goto found_slot; + + /* + * We don't need to iterate over keys in every snapshot once + * we've found just one: + */ + pos = iter->pos.offset + 1; + bch2_btree_iter_set_pos(iter, POS(0, pos)); + } + + if (!ret && pos < max) + goto found_slot; + + if (!ret && start == min) + ret = -BCH_ERR_ENOSPC_inode_create; + + if (ret) { + bch2_trans_iter_exit(trans, iter); + return ret; + } + + /* Retry from start */ + pos = start = min; + bch2_btree_iter_set_pos(iter, POS(0, pos)); + goto again; +found_slot: + bch2_btree_iter_set_pos(iter, SPOS(0, pos, snapshot)); + k = bch2_btree_iter_peek_slot(iter); + ret = bkey_err(k); + if (ret) { + bch2_trans_iter_exit(trans, iter); + return ret; + } + + *hint = k.k->p.offset; + inode_u->bi_inum = k.k->p.offset; + inode_u->bi_generation = bkey_generation(k); + return 0; +} + +static int bch2_inode_delete_keys(struct btree_trans *trans, + subvol_inum inum, enum btree_id id) +{ + struct btree_iter iter; + struct bkey_s_c k; + struct bkey_i delete; + struct bpos end = POS(inum.inum, U64_MAX); + u32 snapshot; + int ret = 0; + + /* + * We're never going to be deleting partial extents, no need to use an + * extent iterator: + */ + bch2_trans_iter_init(trans, &iter, id, POS(inum.inum, 0), + BTREE_ITER_INTENT); + + while (1) { + bch2_trans_begin(trans); + + ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot); + if (ret) + goto err; + + bch2_btree_iter_set_snapshot(&iter, snapshot); + + k = bch2_btree_iter_peek_upto(&iter, end); + ret = bkey_err(k); + if (ret) + goto err; + + if (!k.k) + break; + + bkey_init(&delete.k); + delete.k.p = iter.pos; + + if (iter.flags & BTREE_ITER_IS_EXTENTS) + bch2_key_resize(&delete.k, + bpos_min(end, k.k->p).offset - + iter.pos.offset); + + ret = bch2_trans_update(trans, &iter, &delete, 0) ?: + bch2_trans_commit(trans, NULL, NULL, + BTREE_INSERT_NOFAIL); +err: + if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart)) + break; + } + + bch2_trans_iter_exit(trans, &iter); + return ret; +} + +int bch2_inode_rm(struct bch_fs *c, subvol_inum inum) +{ + struct btree_trans *trans = bch2_trans_get(c); + struct btree_iter iter = { NULL }; + struct bkey_i_inode_generation delete; + struct bch_inode_unpacked inode_u; + struct bkey_s_c k; + u32 snapshot; + int ret; + + /* + * If this was a directory, there shouldn't be any real dirents left - + * but there could be whiteouts (from hash collisions) that we should + * delete: + * + * XXX: the dirent could ideally would delete whiteouts when they're no + * longer needed + */ + ret = bch2_inode_delete_keys(trans, inum, BTREE_ID_extents) ?: + bch2_inode_delete_keys(trans, inum, BTREE_ID_xattrs) ?: + bch2_inode_delete_keys(trans, inum, BTREE_ID_dirents); + if (ret) + goto err; +retry: + bch2_trans_begin(trans); + + ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot); + if (ret) + goto err; + + k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes, + SPOS(0, inum.inum, snapshot), + BTREE_ITER_INTENT|BTREE_ITER_CACHED); + ret = bkey_err(k); + if (ret) + goto err; + + if (!bkey_is_inode(k.k)) { + bch2_fs_inconsistent(c, + "inode %llu:%u not found when deleting", + inum.inum, snapshot); + ret = -EIO; + goto err; + } + + bch2_inode_unpack(k, &inode_u); + + bkey_inode_generation_init(&delete.k_i); + delete.k.p = iter.pos; + delete.v.bi_generation = cpu_to_le32(inode_u.bi_generation + 1); + + ret = bch2_trans_update(trans, &iter, &delete.k_i, 0) ?: + bch2_trans_commit(trans, NULL, NULL, + BTREE_INSERT_NOFAIL); +err: + bch2_trans_iter_exit(trans, &iter); + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + goto retry; + + bch2_trans_put(trans); + return ret; +} + +int bch2_inode_find_by_inum_nowarn_trans(struct btree_trans *trans, + subvol_inum inum, + struct bch_inode_unpacked *inode) +{ + struct btree_iter iter; + int ret; + + ret = bch2_inode_peek_nowarn(trans, &iter, inode, inum, 0); + if (!ret) + bch2_trans_iter_exit(trans, &iter); + return ret; +} + +int bch2_inode_find_by_inum_trans(struct btree_trans *trans, + subvol_inum inum, + struct bch_inode_unpacked *inode) +{ + struct btree_iter iter; + int ret; + + ret = bch2_inode_peek(trans, &iter, inode, inum, 0); + if (!ret) + bch2_trans_iter_exit(trans, &iter); + return ret; +} + +int bch2_inode_find_by_inum(struct bch_fs *c, subvol_inum inum, + struct bch_inode_unpacked *inode) +{ + return bch2_trans_do(c, NULL, NULL, 0, + bch2_inode_find_by_inum_trans(trans, inum, inode)); +} + +int bch2_inode_nlink_inc(struct bch_inode_unpacked *bi) +{ + if (bi->bi_flags & BCH_INODE_unlinked) + bi->bi_flags &= ~BCH_INODE_unlinked; + else { + if (bi->bi_nlink == U32_MAX) + return -EINVAL; + + bi->bi_nlink++; + } + + return 0; +} + +void bch2_inode_nlink_dec(struct btree_trans *trans, struct bch_inode_unpacked *bi) +{ + if (bi->bi_nlink && (bi->bi_flags & BCH_INODE_unlinked)) { + bch2_trans_inconsistent(trans, "inode %llu unlinked but link count nonzero", + bi->bi_inum); + return; + } + + if (bi->bi_flags & BCH_INODE_unlinked) { + bch2_trans_inconsistent(trans, "inode %llu link count underflow", bi->bi_inum); + return; + } + + if (bi->bi_nlink) + bi->bi_nlink--; + else + bi->bi_flags |= BCH_INODE_unlinked; +} + +struct bch_opts bch2_inode_opts_to_opts(struct bch_inode_unpacked *inode) +{ + struct bch_opts ret = { 0 }; +#define x(_name, _bits) \ + if (inode->bi_##_name) \ + opt_set(ret, _name, inode->bi_##_name - 1); + BCH_INODE_OPTS() +#undef x + return ret; +} + +void bch2_inode_opts_get(struct bch_io_opts *opts, struct bch_fs *c, + struct bch_inode_unpacked *inode) +{ +#define x(_name, _bits) opts->_name = inode_opt_get(c, inode, _name); + BCH_INODE_OPTS() +#undef x + + if (opts->nocow) + opts->compression = opts->background_compression = opts->data_checksum = opts->erasure_code = 0; +} + +int bch2_inum_opts_get(struct btree_trans *trans, subvol_inum inum, struct bch_io_opts *opts) +{ + struct bch_inode_unpacked inode; + int ret = lockrestart_do(trans, bch2_inode_find_by_inum_trans(trans, inum, &inode)); + + if (ret) + return ret; + + bch2_inode_opts_get(opts, trans->c, &inode); + return 0; +} + +int bch2_inode_rm_snapshot(struct btree_trans *trans, u64 inum, u32 snapshot) +{ + struct bch_fs *c = trans->c; + struct btree_iter iter = { NULL }; + struct bkey_i_inode_generation delete; + struct bch_inode_unpacked inode_u; + struct bkey_s_c k; + int ret; + + do { + ret = bch2_btree_delete_range_trans(trans, BTREE_ID_extents, + SPOS(inum, 0, snapshot), + SPOS(inum, U64_MAX, snapshot), + 0, NULL) ?: + bch2_btree_delete_range_trans(trans, BTREE_ID_dirents, + SPOS(inum, 0, snapshot), + SPOS(inum, U64_MAX, snapshot), + 0, NULL) ?: + bch2_btree_delete_range_trans(trans, BTREE_ID_xattrs, + SPOS(inum, 0, snapshot), + SPOS(inum, U64_MAX, snapshot), + 0, NULL); + } while (ret == -BCH_ERR_transaction_restart_nested); + if (ret) + goto err; +retry: + bch2_trans_begin(trans); + + k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes, + SPOS(0, inum, snapshot), BTREE_ITER_INTENT); + ret = bkey_err(k); + if (ret) + goto err; + + if (!bkey_is_inode(k.k)) { + bch2_fs_inconsistent(c, + "inode %llu:%u not found when deleting", + inum, snapshot); + ret = -EIO; + goto err; + } + + bch2_inode_unpack(k, &inode_u); + + /* Subvolume root? */ + if (inode_u.bi_subvol) + bch_warn(c, "deleting inode %llu marked as unlinked, but also a subvolume root!?", inode_u.bi_inum); + + bkey_inode_generation_init(&delete.k_i); + delete.k.p = iter.pos; + delete.v.bi_generation = cpu_to_le32(inode_u.bi_generation + 1); + + ret = bch2_trans_update(trans, &iter, &delete.k_i, 0) ?: + bch2_trans_commit(trans, NULL, NULL, + BTREE_INSERT_NOFAIL); +err: + bch2_trans_iter_exit(trans, &iter); + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + goto retry; + + return ret ?: -BCH_ERR_transaction_restart_nested; +} + +static int may_delete_deleted_inode(struct btree_trans *trans, + struct btree_iter *iter, + struct bpos pos, + bool *need_another_pass) +{ + struct bch_fs *c = trans->c; + struct btree_iter inode_iter; + struct bkey_s_c k; + struct bch_inode_unpacked inode; + int ret; + + k = bch2_bkey_get_iter(trans, &inode_iter, BTREE_ID_inodes, pos, BTREE_ITER_CACHED); + ret = bkey_err(k); + if (ret) + return ret; + + ret = bkey_is_inode(k.k) ? 0 : -BCH_ERR_ENOENT_inode; + if (fsck_err_on(!bkey_is_inode(k.k), c, + deleted_inode_missing, + "nonexistent inode %llu:%u in deleted_inodes btree", + pos.offset, pos.snapshot)) + goto delete; + + ret = bch2_inode_unpack(k, &inode); + if (ret) + goto out; + + if (S_ISDIR(inode.bi_mode)) { + ret = bch2_empty_dir_snapshot(trans, pos.offset, pos.snapshot); + if (fsck_err_on(ret == -ENOTEMPTY, c, deleted_inode_is_dir, + "non empty directory %llu:%u in deleted_inodes btree", + pos.offset, pos.snapshot)) + goto delete; + if (ret) + goto out; + } + + if (fsck_err_on(!(inode.bi_flags & BCH_INODE_unlinked), c, + deleted_inode_not_unlinked, + "non-deleted inode %llu:%u in deleted_inodes btree", + pos.offset, pos.snapshot)) + goto delete; + + if (c->sb.clean && + !fsck_err(c, + deleted_inode_but_clean, + "filesystem marked as clean but have deleted inode %llu:%u", + pos.offset, pos.snapshot)) { + ret = 0; + goto out; + } + + if (bch2_snapshot_is_internal_node(c, pos.snapshot)) { + struct bpos new_min_pos; + + ret = bch2_propagate_key_to_snapshot_leaves(trans, inode_iter.btree_id, k, &new_min_pos); + if (ret) + goto out; + + inode.bi_flags &= ~BCH_INODE_unlinked; + + ret = bch2_inode_write_flags(trans, &inode_iter, &inode, + BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); + bch_err_msg(c, ret, "clearing inode unlinked flag"); + if (ret) + goto out; + + /* + * We'll need another write buffer flush to pick up the new + * unlinked inodes in the snapshot leaves: + */ + *need_another_pass = true; + goto out; + } + + ret = 1; +out: +fsck_err: + bch2_trans_iter_exit(trans, &inode_iter); + return ret; +delete: + ret = bch2_btree_bit_mod(trans, BTREE_ID_deleted_inodes, pos, false); + goto out; +} + +int bch2_delete_dead_inodes(struct bch_fs *c) +{ + struct btree_trans *trans = bch2_trans_get(c); + struct btree_iter iter; + struct bkey_s_c k; + bool need_another_pass; + int ret; +again: + need_another_pass = false; + + ret = bch2_btree_write_buffer_flush_sync(trans); + if (ret) + goto err; + + /* + * Weird transaction restart handling here because on successful delete, + * bch2_inode_rm_snapshot() will return a nested transaction restart, + * but we can't retry because the btree write buffer won't have been + * flushed and we'd spin: + */ + for_each_btree_key(trans, iter, BTREE_ID_deleted_inodes, POS_MIN, + BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k, ret) { + ret = commit_do(trans, NULL, NULL, + BTREE_INSERT_NOFAIL| + BTREE_INSERT_LAZY_RW, + may_delete_deleted_inode(trans, &iter, k.k->p, &need_another_pass)); + if (ret < 0) + break; + + if (ret) { + if (!test_bit(BCH_FS_RW, &c->flags)) { + bch2_trans_unlock(trans); + bch2_fs_lazy_rw(c); + } + + bch_verbose(c, "deleting unlinked inode %llu:%u", k.k->p.offset, k.k->p.snapshot); + + ret = bch2_inode_rm_snapshot(trans, k.k->p.offset, k.k->p.snapshot); + if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart)) + break; + } + } + bch2_trans_iter_exit(trans, &iter); + + if (!ret && need_another_pass) + goto again; +err: + bch2_trans_put(trans); + + return ret; +} diff --git a/fs/bcachefs/inode.h b/fs/bcachefs/inode.h new file mode 100644 index 000000000000..88818a332b1e --- /dev/null +++ b/fs/bcachefs/inode.h @@ -0,0 +1,217 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_INODE_H +#define _BCACHEFS_INODE_H + +#include "bkey.h" +#include "bkey_methods.h" +#include "opts.h" + +enum bkey_invalid_flags; +extern const char * const bch2_inode_opts[]; + +int bch2_inode_invalid(struct bch_fs *, struct bkey_s_c, + enum bkey_invalid_flags, struct printbuf *); +int bch2_inode_v2_invalid(struct bch_fs *, struct bkey_s_c, + enum bkey_invalid_flags, struct printbuf *); +int bch2_inode_v3_invalid(struct bch_fs *, struct bkey_s_c, + enum bkey_invalid_flags, struct printbuf *); +void bch2_inode_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); + +int bch2_trans_mark_inode(struct btree_trans *, enum btree_id, unsigned, + struct bkey_s_c, struct bkey_i *, unsigned); +int bch2_mark_inode(struct btree_trans *, enum btree_id, unsigned, + struct bkey_s_c, struct bkey_s_c, unsigned); + +#define bch2_bkey_ops_inode ((struct bkey_ops) { \ + .key_invalid = bch2_inode_invalid, \ + .val_to_text = bch2_inode_to_text, \ + .trans_trigger = bch2_trans_mark_inode, \ + .atomic_trigger = bch2_mark_inode, \ + .min_val_size = 16, \ +}) + +#define bch2_bkey_ops_inode_v2 ((struct bkey_ops) { \ + .key_invalid = bch2_inode_v2_invalid, \ + .val_to_text = bch2_inode_to_text, \ + .trans_trigger = bch2_trans_mark_inode, \ + .atomic_trigger = bch2_mark_inode, \ + .min_val_size = 32, \ +}) + +#define bch2_bkey_ops_inode_v3 ((struct bkey_ops) { \ + .key_invalid = bch2_inode_v3_invalid, \ + .val_to_text = bch2_inode_to_text, \ + .trans_trigger = bch2_trans_mark_inode, \ + .atomic_trigger = bch2_mark_inode, \ + .min_val_size = 48, \ +}) + +static inline bool bkey_is_inode(const struct bkey *k) +{ + return k->type == KEY_TYPE_inode || + k->type == KEY_TYPE_inode_v2 || + k->type == KEY_TYPE_inode_v3; +} + +int bch2_inode_generation_invalid(struct bch_fs *, struct bkey_s_c, + enum bkey_invalid_flags, struct printbuf *); +void bch2_inode_generation_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); + +#define bch2_bkey_ops_inode_generation ((struct bkey_ops) { \ + .key_invalid = bch2_inode_generation_invalid, \ + .val_to_text = bch2_inode_generation_to_text, \ + .min_val_size = 8, \ +}) + +#if 0 +typedef struct { + u64 lo; + u32 hi; +} __packed __aligned(4) u96; +#endif +typedef u64 u96; + +struct bch_inode_unpacked { + u64 bi_inum; + u64 bi_journal_seq; + __le64 bi_hash_seed; + u64 bi_size; + u64 bi_sectors; + u64 bi_version; + u32 bi_flags; + u16 bi_mode; + +#define x(_name, _bits) u##_bits _name; + BCH_INODE_FIELDS_v3() +#undef x +}; + +struct bkey_inode_buf { + struct bkey_i_inode_v3 inode; + +#define x(_name, _bits) + 8 + _bits / 8 + u8 _pad[0 + BCH_INODE_FIELDS_v3()]; +#undef x +} __packed __aligned(8); + +void bch2_inode_pack(struct bkey_inode_buf *, const struct bch_inode_unpacked *); +int bch2_inode_unpack(struct bkey_s_c, struct bch_inode_unpacked *); +struct bkey_i *bch2_inode_to_v3(struct btree_trans *, struct bkey_i *); + +void bch2_inode_unpacked_to_text(struct printbuf *, struct bch_inode_unpacked *); + +int bch2_inode_peek(struct btree_trans *, struct btree_iter *, + struct bch_inode_unpacked *, subvol_inum, unsigned); + +int bch2_inode_write_flags(struct btree_trans *, struct btree_iter *, + struct bch_inode_unpacked *, enum btree_update_flags); + +static inline int bch2_inode_write(struct btree_trans *trans, + struct btree_iter *iter, + struct bch_inode_unpacked *inode) +{ + return bch2_inode_write_flags(trans, iter, inode, 0); +} + +void bch2_inode_init_early(struct bch_fs *, + struct bch_inode_unpacked *); +void bch2_inode_init_late(struct bch_inode_unpacked *, u64, + uid_t, gid_t, umode_t, dev_t, + struct bch_inode_unpacked *); +void bch2_inode_init(struct bch_fs *, struct bch_inode_unpacked *, + uid_t, gid_t, umode_t, dev_t, + struct bch_inode_unpacked *); + +int bch2_inode_create(struct btree_trans *, struct btree_iter *, + struct bch_inode_unpacked *, u32, u64); + +int bch2_inode_rm(struct bch_fs *, subvol_inum); + +int bch2_inode_find_by_inum_nowarn_trans(struct btree_trans *, + subvol_inum, + struct bch_inode_unpacked *); +int bch2_inode_find_by_inum_trans(struct btree_trans *, subvol_inum, + struct bch_inode_unpacked *); +int bch2_inode_find_by_inum(struct bch_fs *, subvol_inum, + struct bch_inode_unpacked *); + +#define inode_opt_get(_c, _inode, _name) \ + ((_inode)->bi_##_name ? (_inode)->bi_##_name - 1 : (_c)->opts._name) + +static inline void bch2_inode_opt_set(struct bch_inode_unpacked *inode, + enum inode_opt_id id, u64 v) +{ + switch (id) { +#define x(_name, ...) \ + case Inode_opt_##_name: \ + inode->bi_##_name = v; \ + break; + BCH_INODE_OPTS() +#undef x + default: + BUG(); + } +} + +static inline u64 bch2_inode_opt_get(struct bch_inode_unpacked *inode, + enum inode_opt_id id) +{ + switch (id) { +#define x(_name, ...) \ + case Inode_opt_##_name: \ + return inode->bi_##_name; + BCH_INODE_OPTS() +#undef x + default: + BUG(); + } +} + +static inline u8 mode_to_type(umode_t mode) +{ + return (mode >> 12) & 15; +} + +static inline u8 inode_d_type(struct bch_inode_unpacked *inode) +{ + return inode->bi_subvol ? DT_SUBVOL : mode_to_type(inode->bi_mode); +} + +/* i_nlink: */ + +static inline unsigned nlink_bias(umode_t mode) +{ + return S_ISDIR(mode) ? 2 : 1; +} + +static inline unsigned bch2_inode_nlink_get(struct bch_inode_unpacked *bi) +{ + return bi->bi_flags & BCH_INODE_unlinked + ? 0 + : bi->bi_nlink + nlink_bias(bi->bi_mode); +} + +static inline void bch2_inode_nlink_set(struct bch_inode_unpacked *bi, + unsigned nlink) +{ + if (nlink) { + bi->bi_nlink = nlink - nlink_bias(bi->bi_mode); + bi->bi_flags &= ~BCH_INODE_unlinked; + } else { + bi->bi_nlink = 0; + bi->bi_flags |= BCH_INODE_unlinked; + } +} + +int bch2_inode_nlink_inc(struct bch_inode_unpacked *); +void bch2_inode_nlink_dec(struct btree_trans *, struct bch_inode_unpacked *); + +struct bch_opts bch2_inode_opts_to_opts(struct bch_inode_unpacked *); +void bch2_inode_opts_get(struct bch_io_opts *, struct bch_fs *, + struct bch_inode_unpacked *); +int bch2_inum_opts_get(struct btree_trans*, subvol_inum, struct bch_io_opts *); + +int bch2_inode_rm_snapshot(struct btree_trans *, u64, u32); +int bch2_delete_dead_inodes(struct bch_fs *); + +#endif /* _BCACHEFS_INODE_H */ diff --git a/fs/bcachefs/io_misc.c b/fs/bcachefs/io_misc.c new file mode 100644 index 000000000000..bebc11444ef5 --- /dev/null +++ b/fs/bcachefs/io_misc.c @@ -0,0 +1,524 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * io_misc.c - fallocate, fpunch, truncate: + */ + +#include "bcachefs.h" +#include "alloc_foreground.h" +#include "bkey_buf.h" +#include "btree_update.h" +#include "buckets.h" +#include "clock.h" +#include "error.h" +#include "extents.h" +#include "extent_update.h" +#include "inode.h" +#include "io_misc.h" +#include "io_write.h" +#include "logged_ops.h" +#include "rebalance.h" +#include "subvolume.h" + +/* Overwrites whatever was present with zeroes: */ +int bch2_extent_fallocate(struct btree_trans *trans, + subvol_inum inum, + struct btree_iter *iter, + u64 sectors, + struct bch_io_opts opts, + s64 *i_sectors_delta, + struct write_point_specifier write_point) +{ + struct bch_fs *c = trans->c; + struct disk_reservation disk_res = { 0 }; + struct closure cl; + struct open_buckets open_buckets = { 0 }; + struct bkey_s_c k; + struct bkey_buf old, new; + unsigned sectors_allocated = 0; + bool have_reservation = false; + bool unwritten = opts.nocow && + c->sb.version >= bcachefs_metadata_version_unwritten_extents; + int ret; + + bch2_bkey_buf_init(&old); + bch2_bkey_buf_init(&new); + closure_init_stack(&cl); + + k = bch2_btree_iter_peek_slot(iter); + ret = bkey_err(k); + if (ret) + return ret; + + sectors = min_t(u64, sectors, k.k->p.offset - iter->pos.offset); + + if (!have_reservation) { + unsigned new_replicas = + max(0, (int) opts.data_replicas - + (int) bch2_bkey_nr_ptrs_fully_allocated(k)); + /* + * Get a disk reservation before (in the nocow case) calling + * into the allocator: + */ + ret = bch2_disk_reservation_get(c, &disk_res, sectors, new_replicas, 0); + if (unlikely(ret)) + goto err; + + bch2_bkey_buf_reassemble(&old, c, k); + } + + if (have_reservation) { + if (!bch2_extents_match(k, bkey_i_to_s_c(old.k))) + goto err; + + bch2_key_resize(&new.k->k, sectors); + } else if (!unwritten) { + struct bkey_i_reservation *reservation; + + bch2_bkey_buf_realloc(&new, c, sizeof(*reservation) / sizeof(u64)); + reservation = bkey_reservation_init(new.k); + reservation->k.p = iter->pos; + bch2_key_resize(&reservation->k, sectors); + reservation->v.nr_replicas = opts.data_replicas; + } else { + struct bkey_i_extent *e; + struct bch_devs_list devs_have; + struct write_point *wp; + struct bch_extent_ptr *ptr; + + devs_have.nr = 0; + + bch2_bkey_buf_realloc(&new, c, BKEY_EXTENT_U64s_MAX); + + e = bkey_extent_init(new.k); + e->k.p = iter->pos; + + ret = bch2_alloc_sectors_start_trans(trans, + opts.foreground_target, + false, + write_point, + &devs_have, + opts.data_replicas, + opts.data_replicas, + BCH_WATERMARK_normal, 0, &cl, &wp); + if (bch2_err_matches(ret, BCH_ERR_operation_blocked)) + ret = -BCH_ERR_transaction_restart_nested; + if (ret) + goto err; + + sectors = min_t(u64, sectors, wp->sectors_free); + sectors_allocated = sectors; + + bch2_key_resize(&e->k, sectors); + + bch2_open_bucket_get(c, wp, &open_buckets); + bch2_alloc_sectors_append_ptrs(c, wp, &e->k_i, sectors, false); + bch2_alloc_sectors_done(c, wp); + + extent_for_each_ptr(extent_i_to_s(e), ptr) + ptr->unwritten = true; + } + + have_reservation = true; + + ret = bch2_extent_update(trans, inum, iter, new.k, &disk_res, + 0, i_sectors_delta, true); +err: + if (!ret && sectors_allocated) + bch2_increment_clock(c, sectors_allocated, WRITE); + + bch2_open_buckets_put(c, &open_buckets); + bch2_disk_reservation_put(c, &disk_res); + bch2_bkey_buf_exit(&new, c); + bch2_bkey_buf_exit(&old, c); + + if (closure_nr_remaining(&cl) != 1) { + bch2_trans_unlock(trans); + closure_sync(&cl); + } + + return ret; +} + +/* + * Returns -BCH_ERR_transacton_restart if we had to drop locks: + */ +int bch2_fpunch_at(struct btree_trans *trans, struct btree_iter *iter, + subvol_inum inum, u64 end, + s64 *i_sectors_delta) +{ + struct bch_fs *c = trans->c; + unsigned max_sectors = KEY_SIZE_MAX & (~0 << c->block_bits); + struct bpos end_pos = POS(inum.inum, end); + struct bkey_s_c k; + int ret = 0, ret2 = 0; + u32 snapshot; + + while (!ret || + bch2_err_matches(ret, BCH_ERR_transaction_restart)) { + struct disk_reservation disk_res = + bch2_disk_reservation_init(c, 0); + struct bkey_i delete; + + if (ret) + ret2 = ret; + + bch2_trans_begin(trans); + + ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot); + if (ret) + continue; + + bch2_btree_iter_set_snapshot(iter, snapshot); + + /* + * peek_upto() doesn't have ideal semantics for extents: + */ + k = bch2_btree_iter_peek_upto(iter, end_pos); + if (!k.k) + break; + + ret = bkey_err(k); + if (ret) + continue; + + bkey_init(&delete.k); + delete.k.p = iter->pos; + + /* create the biggest key we can */ + bch2_key_resize(&delete.k, max_sectors); + bch2_cut_back(end_pos, &delete); + + ret = bch2_extent_update(trans, inum, iter, &delete, + &disk_res, 0, i_sectors_delta, false); + bch2_disk_reservation_put(c, &disk_res); + } + + return ret ?: ret2; +} + +int bch2_fpunch(struct bch_fs *c, subvol_inum inum, u64 start, u64 end, + s64 *i_sectors_delta) +{ + struct btree_trans *trans = bch2_trans_get(c); + struct btree_iter iter; + int ret; + + bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, + POS(inum.inum, start), + BTREE_ITER_INTENT); + + ret = bch2_fpunch_at(trans, &iter, inum, end, i_sectors_delta); + + bch2_trans_iter_exit(trans, &iter); + bch2_trans_put(trans); + + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + ret = 0; + + return ret; +} + +/* truncate: */ + +void bch2_logged_op_truncate_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k) +{ + struct bkey_s_c_logged_op_truncate op = bkey_s_c_to_logged_op_truncate(k); + + prt_printf(out, "subvol=%u", le32_to_cpu(op.v->subvol)); + prt_printf(out, " inum=%llu", le64_to_cpu(op.v->inum)); + prt_printf(out, " new_i_size=%llu", le64_to_cpu(op.v->new_i_size)); +} + +static int truncate_set_isize(struct btree_trans *trans, + subvol_inum inum, + u64 new_i_size) +{ + struct btree_iter iter = { NULL }; + struct bch_inode_unpacked inode_u; + int ret; + + ret = bch2_inode_peek(trans, &iter, &inode_u, inum, BTREE_ITER_INTENT) ?: + (inode_u.bi_size = new_i_size, 0) ?: + bch2_inode_write(trans, &iter, &inode_u); + + bch2_trans_iter_exit(trans, &iter); + return ret; +} + +static int __bch2_resume_logged_op_truncate(struct btree_trans *trans, + struct bkey_i *op_k, + u64 *i_sectors_delta) +{ + struct bch_fs *c = trans->c; + struct btree_iter fpunch_iter; + struct bkey_i_logged_op_truncate *op = bkey_i_to_logged_op_truncate(op_k); + subvol_inum inum = { le32_to_cpu(op->v.subvol), le64_to_cpu(op->v.inum) }; + u64 new_i_size = le64_to_cpu(op->v.new_i_size); + int ret; + + ret = commit_do(trans, NULL, NULL, BTREE_INSERT_NOFAIL, + truncate_set_isize(trans, inum, new_i_size)); + if (ret) + goto err; + + bch2_trans_iter_init(trans, &fpunch_iter, BTREE_ID_extents, + POS(inum.inum, round_up(new_i_size, block_bytes(c)) >> 9), + BTREE_ITER_INTENT); + ret = bch2_fpunch_at(trans, &fpunch_iter, inum, U64_MAX, i_sectors_delta); + bch2_trans_iter_exit(trans, &fpunch_iter); + + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + ret = 0; +err: + bch2_logged_op_finish(trans, op_k); + return ret; +} + +int bch2_resume_logged_op_truncate(struct btree_trans *trans, struct bkey_i *op_k) +{ + return __bch2_resume_logged_op_truncate(trans, op_k, NULL); +} + +int bch2_truncate(struct bch_fs *c, subvol_inum inum, u64 new_i_size, u64 *i_sectors_delta) +{ + struct bkey_i_logged_op_truncate op; + + bkey_logged_op_truncate_init(&op.k_i); + op.v.subvol = cpu_to_le32(inum.subvol); + op.v.inum = cpu_to_le64(inum.inum); + op.v.new_i_size = cpu_to_le64(new_i_size); + + /* + * Logged ops aren't atomic w.r.t. snapshot creation: creating a + * snapshot while they're in progress, then crashing, will result in the + * resume only proceeding in one of the snapshots + */ + down_read(&c->snapshot_create_lock); + int ret = bch2_trans_run(c, + bch2_logged_op_start(trans, &op.k_i) ?: + __bch2_resume_logged_op_truncate(trans, &op.k_i, i_sectors_delta)); + up_read(&c->snapshot_create_lock); + + return ret; +} + +/* finsert/fcollapse: */ + +void bch2_logged_op_finsert_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k) +{ + struct bkey_s_c_logged_op_finsert op = bkey_s_c_to_logged_op_finsert(k); + + prt_printf(out, "subvol=%u", le32_to_cpu(op.v->subvol)); + prt_printf(out, " inum=%llu", le64_to_cpu(op.v->inum)); + prt_printf(out, " dst_offset=%lli", le64_to_cpu(op.v->dst_offset)); + prt_printf(out, " src_offset=%llu", le64_to_cpu(op.v->src_offset)); +} + +static int adjust_i_size(struct btree_trans *trans, subvol_inum inum, u64 offset, s64 len) +{ + struct btree_iter iter; + struct bch_inode_unpacked inode_u; + int ret; + + offset <<= 9; + len <<= 9; + + ret = bch2_inode_peek(trans, &iter, &inode_u, inum, BTREE_ITER_INTENT); + if (ret) + return ret; + + if (len > 0) { + if (MAX_LFS_FILESIZE - inode_u.bi_size < len) { + ret = -EFBIG; + goto err; + } + + if (offset >= inode_u.bi_size) { + ret = -EINVAL; + goto err; + } + } + + inode_u.bi_size += len; + inode_u.bi_mtime = inode_u.bi_ctime = bch2_current_time(trans->c); + + ret = bch2_inode_write(trans, &iter, &inode_u); +err: + bch2_trans_iter_exit(trans, &iter); + return ret; +} + +static int __bch2_resume_logged_op_finsert(struct btree_trans *trans, + struct bkey_i *op_k, + u64 *i_sectors_delta) +{ + struct bch_fs *c = trans->c; + struct btree_iter iter; + struct bkey_i_logged_op_finsert *op = bkey_i_to_logged_op_finsert(op_k); + subvol_inum inum = { le32_to_cpu(op->v.subvol), le64_to_cpu(op->v.inum) }; + struct bch_io_opts opts; + u64 dst_offset = le64_to_cpu(op->v.dst_offset); + u64 src_offset = le64_to_cpu(op->v.src_offset); + s64 shift = dst_offset - src_offset; + u64 len = abs(shift); + u64 pos = le64_to_cpu(op->v.pos); + bool insert = shift > 0; + int ret = 0; + + ret = bch2_inum_opts_get(trans, inum, &opts); + if (ret) + return ret; + + bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, + POS(inum.inum, 0), + BTREE_ITER_INTENT); + + switch (op->v.state) { +case LOGGED_OP_FINSERT_start: + op->v.state = LOGGED_OP_FINSERT_shift_extents; + + if (insert) { + ret = commit_do(trans, NULL, NULL, BTREE_INSERT_NOFAIL, + adjust_i_size(trans, inum, src_offset, len) ?: + bch2_logged_op_update(trans, &op->k_i)); + if (ret) + goto err; + } else { + bch2_btree_iter_set_pos(&iter, POS(inum.inum, src_offset)); + + ret = bch2_fpunch_at(trans, &iter, inum, src_offset + len, i_sectors_delta); + if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart)) + goto err; + + ret = commit_do(trans, NULL, NULL, BTREE_INSERT_NOFAIL, + bch2_logged_op_update(trans, &op->k_i)); + } + + fallthrough; +case LOGGED_OP_FINSERT_shift_extents: + while (1) { + struct disk_reservation disk_res = + bch2_disk_reservation_init(c, 0); + struct bkey_i delete, *copy; + struct bkey_s_c k; + struct bpos src_pos = POS(inum.inum, src_offset); + u32 snapshot; + + bch2_trans_begin(trans); + + ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot); + if (ret) + goto btree_err; + + bch2_btree_iter_set_snapshot(&iter, snapshot); + bch2_btree_iter_set_pos(&iter, SPOS(inum.inum, pos, snapshot)); + + k = insert + ? bch2_btree_iter_peek_prev(&iter) + : bch2_btree_iter_peek_upto(&iter, POS(inum.inum, U64_MAX)); + if ((ret = bkey_err(k))) + goto btree_err; + + if (!k.k || + k.k->p.inode != inum.inum || + bkey_le(k.k->p, POS(inum.inum, src_offset))) + break; + + copy = bch2_bkey_make_mut_noupdate(trans, k); + if ((ret = PTR_ERR_OR_ZERO(copy))) + goto btree_err; + + if (insert && + bkey_lt(bkey_start_pos(k.k), src_pos)) { + bch2_cut_front(src_pos, copy); + + /* Splitting compressed extent? */ + bch2_disk_reservation_add(c, &disk_res, + copy->k.size * + bch2_bkey_nr_ptrs_allocated(bkey_i_to_s_c(copy)), + BCH_DISK_RESERVATION_NOFAIL); + } + + bkey_init(&delete.k); + delete.k.p = copy->k.p; + delete.k.p.snapshot = snapshot; + delete.k.size = copy->k.size; + + copy->k.p.offset += shift; + copy->k.p.snapshot = snapshot; + + op->v.pos = cpu_to_le64(insert ? bkey_start_offset(&delete.k) : delete.k.p.offset); + + ret = bch2_bkey_set_needs_rebalance(c, copy, + opts.background_target, + opts.background_compression) ?: + bch2_btree_insert_trans(trans, BTREE_ID_extents, &delete, 0) ?: + bch2_btree_insert_trans(trans, BTREE_ID_extents, copy, 0) ?: + bch2_logged_op_update(trans, &op->k_i) ?: + bch2_trans_commit(trans, &disk_res, NULL, BTREE_INSERT_NOFAIL); +btree_err: + bch2_disk_reservation_put(c, &disk_res); + + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + continue; + if (ret) + goto err; + + pos = le64_to_cpu(op->v.pos); + } + + op->v.state = LOGGED_OP_FINSERT_finish; + + if (!insert) { + ret = commit_do(trans, NULL, NULL, BTREE_INSERT_NOFAIL, + adjust_i_size(trans, inum, src_offset, shift) ?: + bch2_logged_op_update(trans, &op->k_i)); + } else { + /* We need an inode update to update bi_journal_seq for fsync: */ + ret = commit_do(trans, NULL, NULL, BTREE_INSERT_NOFAIL, + adjust_i_size(trans, inum, 0, 0) ?: + bch2_logged_op_update(trans, &op->k_i)); + } + + break; +case LOGGED_OP_FINSERT_finish: + break; + } +err: + bch2_logged_op_finish(trans, op_k); + bch2_trans_iter_exit(trans, &iter); + return ret; +} + +int bch2_resume_logged_op_finsert(struct btree_trans *trans, struct bkey_i *op_k) +{ + return __bch2_resume_logged_op_finsert(trans, op_k, NULL); +} + +int bch2_fcollapse_finsert(struct bch_fs *c, subvol_inum inum, + u64 offset, u64 len, bool insert, + s64 *i_sectors_delta) +{ + struct bkey_i_logged_op_finsert op; + s64 shift = insert ? len : -len; + + bkey_logged_op_finsert_init(&op.k_i); + op.v.subvol = cpu_to_le32(inum.subvol); + op.v.inum = cpu_to_le64(inum.inum); + op.v.dst_offset = cpu_to_le64(offset + shift); + op.v.src_offset = cpu_to_le64(offset); + op.v.pos = cpu_to_le64(insert ? U64_MAX : offset); + + /* + * Logged ops aren't atomic w.r.t. snapshot creation: creating a + * snapshot while they're in progress, then crashing, will result in the + * resume only proceeding in one of the snapshots + */ + down_read(&c->snapshot_create_lock); + int ret = bch2_trans_run(c, + bch2_logged_op_start(trans, &op.k_i) ?: + __bch2_resume_logged_op_finsert(trans, &op.k_i, i_sectors_delta)); + up_read(&c->snapshot_create_lock); + + return ret; +} diff --git a/fs/bcachefs/io_misc.h b/fs/bcachefs/io_misc.h new file mode 100644 index 000000000000..9cb44a7c43c1 --- /dev/null +++ b/fs/bcachefs/io_misc.h @@ -0,0 +1,34 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_IO_MISC_H +#define _BCACHEFS_IO_MISC_H + +int bch2_extent_fallocate(struct btree_trans *, subvol_inum, struct btree_iter *, + u64, struct bch_io_opts, s64 *, + struct write_point_specifier); +int bch2_fpunch_at(struct btree_trans *, struct btree_iter *, + subvol_inum, u64, s64 *); +int bch2_fpunch(struct bch_fs *c, subvol_inum, u64, u64, s64 *); + +void bch2_logged_op_truncate_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); + +#define bch2_bkey_ops_logged_op_truncate ((struct bkey_ops) { \ + .val_to_text = bch2_logged_op_truncate_to_text, \ + .min_val_size = 24, \ +}) + +int bch2_resume_logged_op_truncate(struct btree_trans *, struct bkey_i *); + +int bch2_truncate(struct bch_fs *, subvol_inum, u64, u64 *); + +void bch2_logged_op_finsert_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); + +#define bch2_bkey_ops_logged_op_finsert ((struct bkey_ops) { \ + .val_to_text = bch2_logged_op_finsert_to_text, \ + .min_val_size = 24, \ +}) + +int bch2_resume_logged_op_finsert(struct btree_trans *, struct bkey_i *); + +int bch2_fcollapse_finsert(struct bch_fs *, subvol_inum, u64, u64, bool, s64 *); + +#endif /* _BCACHEFS_IO_MISC_H */ diff --git a/fs/bcachefs/io_read.c b/fs/bcachefs/io_read.c new file mode 100644 index 000000000000..36763865facd --- /dev/null +++ b/fs/bcachefs/io_read.c @@ -0,0 +1,1210 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Some low level IO code, and hacks for various block layer limitations + * + * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com> + * Copyright 2012 Google, Inc. + */ + +#include "bcachefs.h" +#include "alloc_background.h" +#include "alloc_foreground.h" +#include "btree_update.h" +#include "buckets.h" +#include "checksum.h" +#include "clock.h" +#include "compress.h" +#include "data_update.h" +#include "disk_groups.h" +#include "ec.h" +#include "error.h" +#include "io_read.h" +#include "io_misc.h" +#include "io_write.h" +#include "subvolume.h" +#include "trace.h" + +#include <linux/sched/mm.h> + +#ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT + +static bool bch2_target_congested(struct bch_fs *c, u16 target) +{ + const struct bch_devs_mask *devs; + unsigned d, nr = 0, total = 0; + u64 now = local_clock(), last; + s64 congested; + struct bch_dev *ca; + + if (!target) + return false; + + rcu_read_lock(); + devs = bch2_target_to_mask(c, target) ?: + &c->rw_devs[BCH_DATA_user]; + + for_each_set_bit(d, devs->d, BCH_SB_MEMBERS_MAX) { + ca = rcu_dereference(c->devs[d]); + if (!ca) + continue; + + congested = atomic_read(&ca->congested); + last = READ_ONCE(ca->congested_last); + if (time_after64(now, last)) + congested -= (now - last) >> 12; + + total += max(congested, 0LL); + nr++; + } + rcu_read_unlock(); + + return bch2_rand_range(nr * CONGESTED_MAX) < total; +} + +#else + +static bool bch2_target_congested(struct bch_fs *c, u16 target) +{ + return false; +} + +#endif + +/* Cache promotion on read */ + +struct promote_op { + struct rcu_head rcu; + u64 start_time; + + struct rhash_head hash; + struct bpos pos; + + struct data_update write; + struct bio_vec bi_inline_vecs[0]; /* must be last */ +}; + +static const struct rhashtable_params bch_promote_params = { + .head_offset = offsetof(struct promote_op, hash), + .key_offset = offsetof(struct promote_op, pos), + .key_len = sizeof(struct bpos), +}; + +static inline int should_promote(struct bch_fs *c, struct bkey_s_c k, + struct bpos pos, + struct bch_io_opts opts, + unsigned flags) +{ + BUG_ON(!opts.promote_target); + + if (!(flags & BCH_READ_MAY_PROMOTE)) + return -BCH_ERR_nopromote_may_not; + + if (bch2_bkey_has_target(c, k, opts.promote_target)) + return -BCH_ERR_nopromote_already_promoted; + + if (bkey_extent_is_unwritten(k)) + return -BCH_ERR_nopromote_unwritten; + + if (bch2_target_congested(c, opts.promote_target)) + return -BCH_ERR_nopromote_congested; + + if (rhashtable_lookup_fast(&c->promote_table, &pos, + bch_promote_params)) + return -BCH_ERR_nopromote_in_flight; + + return 0; +} + +static void promote_free(struct bch_fs *c, struct promote_op *op) +{ + int ret; + + bch2_data_update_exit(&op->write); + + ret = rhashtable_remove_fast(&c->promote_table, &op->hash, + bch_promote_params); + BUG_ON(ret); + bch2_write_ref_put(c, BCH_WRITE_REF_promote); + kfree_rcu(op, rcu); +} + +static void promote_done(struct bch_write_op *wop) +{ + struct promote_op *op = + container_of(wop, struct promote_op, write.op); + struct bch_fs *c = op->write.op.c; + + bch2_time_stats_update(&c->times[BCH_TIME_data_promote], + op->start_time); + promote_free(c, op); +} + +static void promote_start(struct promote_op *op, struct bch_read_bio *rbio) +{ + struct bio *bio = &op->write.op.wbio.bio; + + trace_and_count(op->write.op.c, read_promote, &rbio->bio); + + /* we now own pages: */ + BUG_ON(!rbio->bounce); + BUG_ON(rbio->bio.bi_vcnt > bio->bi_max_vecs); + + memcpy(bio->bi_io_vec, rbio->bio.bi_io_vec, + sizeof(struct bio_vec) * rbio->bio.bi_vcnt); + swap(bio->bi_vcnt, rbio->bio.bi_vcnt); + + bch2_data_update_read_done(&op->write, rbio->pick.crc); +} + +static struct promote_op *__promote_alloc(struct btree_trans *trans, + enum btree_id btree_id, + struct bkey_s_c k, + struct bpos pos, + struct extent_ptr_decoded *pick, + struct bch_io_opts opts, + unsigned sectors, + struct bch_read_bio **rbio) +{ + struct bch_fs *c = trans->c; + struct promote_op *op = NULL; + struct bio *bio; + unsigned pages = DIV_ROUND_UP(sectors, PAGE_SECTORS); + int ret; + + if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_promote)) + return NULL; + + op = kzalloc(sizeof(*op) + sizeof(struct bio_vec) * pages, GFP_NOFS); + if (!op) + goto err; + + op->start_time = local_clock(); + op->pos = pos; + + /* + * We don't use the mempool here because extents that aren't + * checksummed or compressed can be too big for the mempool: + */ + *rbio = kzalloc(sizeof(struct bch_read_bio) + + sizeof(struct bio_vec) * pages, + GFP_NOFS); + if (!*rbio) + goto err; + + rbio_init(&(*rbio)->bio, opts); + bio_init(&(*rbio)->bio, NULL, (*rbio)->bio.bi_inline_vecs, pages, 0); + + if (bch2_bio_alloc_pages(&(*rbio)->bio, sectors << 9, + GFP_NOFS)) + goto err; + + (*rbio)->bounce = true; + (*rbio)->split = true; + (*rbio)->kmalloc = true; + + if (rhashtable_lookup_insert_fast(&c->promote_table, &op->hash, + bch_promote_params)) + goto err; + + bio = &op->write.op.wbio.bio; + bio_init(bio, NULL, bio->bi_inline_vecs, pages, 0); + + ret = bch2_data_update_init(trans, NULL, NULL, &op->write, + writepoint_hashed((unsigned long) current), + opts, + (struct data_update_opts) { + .target = opts.promote_target, + .extra_replicas = 1, + .write_flags = BCH_WRITE_ALLOC_NOWAIT|BCH_WRITE_CACHED, + }, + btree_id, k); + /* + * possible errors: -BCH_ERR_nocow_lock_blocked, + * -BCH_ERR_ENOSPC_disk_reservation: + */ + if (ret) { + ret = rhashtable_remove_fast(&c->promote_table, &op->hash, + bch_promote_params); + BUG_ON(ret); + goto err; + } + + op->write.op.end_io = promote_done; + + return op; +err: + if (*rbio) + bio_free_pages(&(*rbio)->bio); + kfree(*rbio); + *rbio = NULL; + kfree(op); + bch2_write_ref_put(c, BCH_WRITE_REF_promote); + return NULL; +} + +noinline +static struct promote_op *promote_alloc(struct btree_trans *trans, + struct bvec_iter iter, + struct bkey_s_c k, + struct extent_ptr_decoded *pick, + struct bch_io_opts opts, + unsigned flags, + struct bch_read_bio **rbio, + bool *bounce, + bool *read_full) +{ + struct bch_fs *c = trans->c; + bool promote_full = *read_full || READ_ONCE(c->promote_whole_extents); + /* data might have to be decompressed in the write path: */ + unsigned sectors = promote_full + ? max(pick->crc.compressed_size, pick->crc.live_size) + : bvec_iter_sectors(iter); + struct bpos pos = promote_full + ? bkey_start_pos(k.k) + : POS(k.k->p.inode, iter.bi_sector); + struct promote_op *promote; + int ret; + + ret = should_promote(c, k, pos, opts, flags); + if (ret) + goto nopromote; + + promote = __promote_alloc(trans, + k.k->type == KEY_TYPE_reflink_v + ? BTREE_ID_reflink + : BTREE_ID_extents, + k, pos, pick, opts, sectors, rbio); + if (!promote) { + ret = -BCH_ERR_nopromote_enomem; + goto nopromote; + } + + *bounce = true; + *read_full = promote_full; + return promote; +nopromote: + trace_read_nopromote(c, ret); + return NULL; +} + +/* Read */ + +#define READ_RETRY_AVOID 1 +#define READ_RETRY 2 +#define READ_ERR 3 + +enum rbio_context { + RBIO_CONTEXT_NULL, + RBIO_CONTEXT_HIGHPRI, + RBIO_CONTEXT_UNBOUND, +}; + +static inline struct bch_read_bio * +bch2_rbio_parent(struct bch_read_bio *rbio) +{ + return rbio->split ? rbio->parent : rbio; +} + +__always_inline +static void bch2_rbio_punt(struct bch_read_bio *rbio, work_func_t fn, + enum rbio_context context, + struct workqueue_struct *wq) +{ + if (context <= rbio->context) { + fn(&rbio->work); + } else { + rbio->work.func = fn; + rbio->context = context; + queue_work(wq, &rbio->work); + } +} + +static inline struct bch_read_bio *bch2_rbio_free(struct bch_read_bio *rbio) +{ + BUG_ON(rbio->bounce && !rbio->split); + + if (rbio->promote) + promote_free(rbio->c, rbio->promote); + rbio->promote = NULL; + + if (rbio->bounce) + bch2_bio_free_pages_pool(rbio->c, &rbio->bio); + + if (rbio->split) { + struct bch_read_bio *parent = rbio->parent; + + if (rbio->kmalloc) + kfree(rbio); + else + bio_put(&rbio->bio); + + rbio = parent; + } + + return rbio; +} + +/* + * Only called on a top level bch_read_bio to complete an entire read request, + * not a split: + */ +static void bch2_rbio_done(struct bch_read_bio *rbio) +{ + if (rbio->start_time) + bch2_time_stats_update(&rbio->c->times[BCH_TIME_data_read], + rbio->start_time); + bio_endio(&rbio->bio); +} + +static void bch2_read_retry_nodecode(struct bch_fs *c, struct bch_read_bio *rbio, + struct bvec_iter bvec_iter, + struct bch_io_failures *failed, + unsigned flags) +{ + struct btree_trans *trans = bch2_trans_get(c); + struct btree_iter iter; + struct bkey_buf sk; + struct bkey_s_c k; + int ret; + + flags &= ~BCH_READ_LAST_FRAGMENT; + flags |= BCH_READ_MUST_CLONE; + + bch2_bkey_buf_init(&sk); + + bch2_trans_iter_init(trans, &iter, rbio->data_btree, + rbio->read_pos, BTREE_ITER_SLOTS); +retry: + rbio->bio.bi_status = 0; + + k = bch2_btree_iter_peek_slot(&iter); + if (bkey_err(k)) + goto err; + + bch2_bkey_buf_reassemble(&sk, c, k); + k = bkey_i_to_s_c(sk.k); + bch2_trans_unlock(trans); + + if (!bch2_bkey_matches_ptr(c, k, + rbio->pick.ptr, + rbio->data_pos.offset - + rbio->pick.crc.offset)) { + /* extent we wanted to read no longer exists: */ + rbio->hole = true; + goto out; + } + + ret = __bch2_read_extent(trans, rbio, bvec_iter, + rbio->read_pos, + rbio->data_btree, + k, 0, failed, flags); + if (ret == READ_RETRY) + goto retry; + if (ret) + goto err; +out: + bch2_rbio_done(rbio); + bch2_trans_iter_exit(trans, &iter); + bch2_trans_put(trans); + bch2_bkey_buf_exit(&sk, c); + return; +err: + rbio->bio.bi_status = BLK_STS_IOERR; + goto out; +} + +static void bch2_rbio_retry(struct work_struct *work) +{ + struct bch_read_bio *rbio = + container_of(work, struct bch_read_bio, work); + struct bch_fs *c = rbio->c; + struct bvec_iter iter = rbio->bvec_iter; + unsigned flags = rbio->flags; + subvol_inum inum = { + .subvol = rbio->subvol, + .inum = rbio->read_pos.inode, + }; + struct bch_io_failures failed = { .nr = 0 }; + + trace_and_count(c, read_retry, &rbio->bio); + + if (rbio->retry == READ_RETRY_AVOID) + bch2_mark_io_failure(&failed, &rbio->pick); + + rbio->bio.bi_status = 0; + + rbio = bch2_rbio_free(rbio); + + flags |= BCH_READ_IN_RETRY; + flags &= ~BCH_READ_MAY_PROMOTE; + + if (flags & BCH_READ_NODECODE) { + bch2_read_retry_nodecode(c, rbio, iter, &failed, flags); + } else { + flags &= ~BCH_READ_LAST_FRAGMENT; + flags |= BCH_READ_MUST_CLONE; + + __bch2_read(c, rbio, iter, inum, &failed, flags); + } +} + +static void bch2_rbio_error(struct bch_read_bio *rbio, int retry, + blk_status_t error) +{ + rbio->retry = retry; + + if (rbio->flags & BCH_READ_IN_RETRY) + return; + + if (retry == READ_ERR) { + rbio = bch2_rbio_free(rbio); + + rbio->bio.bi_status = error; + bch2_rbio_done(rbio); + } else { + bch2_rbio_punt(rbio, bch2_rbio_retry, + RBIO_CONTEXT_UNBOUND, system_unbound_wq); + } +} + +static int __bch2_rbio_narrow_crcs(struct btree_trans *trans, + struct bch_read_bio *rbio) +{ + struct bch_fs *c = rbio->c; + u64 data_offset = rbio->data_pos.offset - rbio->pick.crc.offset; + struct bch_extent_crc_unpacked new_crc; + struct btree_iter iter; + struct bkey_i *new; + struct bkey_s_c k; + int ret = 0; + + if (crc_is_compressed(rbio->pick.crc)) + return 0; + + k = bch2_bkey_get_iter(trans, &iter, rbio->data_btree, rbio->data_pos, + BTREE_ITER_SLOTS|BTREE_ITER_INTENT); + if ((ret = bkey_err(k))) + goto out; + + if (bversion_cmp(k.k->version, rbio->version) || + !bch2_bkey_matches_ptr(c, k, rbio->pick.ptr, data_offset)) + goto out; + + /* Extent was merged? */ + if (bkey_start_offset(k.k) < data_offset || + k.k->p.offset > data_offset + rbio->pick.crc.uncompressed_size) + goto out; + + if (bch2_rechecksum_bio(c, &rbio->bio, rbio->version, + rbio->pick.crc, NULL, &new_crc, + bkey_start_offset(k.k) - data_offset, k.k->size, + rbio->pick.crc.csum_type)) { + bch_err(c, "error verifying existing checksum while narrowing checksum (memory corruption?)"); + ret = 0; + goto out; + } + + /* + * going to be temporarily appending another checksum entry: + */ + new = bch2_trans_kmalloc(trans, bkey_bytes(k.k) + + sizeof(struct bch_extent_crc128)); + if ((ret = PTR_ERR_OR_ZERO(new))) + goto out; + + bkey_reassemble(new, k); + + if (!bch2_bkey_narrow_crcs(new, new_crc)) + goto out; + + ret = bch2_trans_update(trans, &iter, new, + BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); +out: + bch2_trans_iter_exit(trans, &iter); + return ret; +} + +static noinline void bch2_rbio_narrow_crcs(struct bch_read_bio *rbio) +{ + bch2_trans_do(rbio->c, NULL, NULL, BTREE_INSERT_NOFAIL, + __bch2_rbio_narrow_crcs(trans, rbio)); +} + +/* Inner part that may run in process context */ +static void __bch2_read_endio(struct work_struct *work) +{ + struct bch_read_bio *rbio = + container_of(work, struct bch_read_bio, work); + struct bch_fs *c = rbio->c; + struct bch_dev *ca = bch_dev_bkey_exists(c, rbio->pick.ptr.dev); + struct bio *src = &rbio->bio; + struct bio *dst = &bch2_rbio_parent(rbio)->bio; + struct bvec_iter dst_iter = rbio->bvec_iter; + struct bch_extent_crc_unpacked crc = rbio->pick.crc; + struct nonce nonce = extent_nonce(rbio->version, crc); + unsigned nofs_flags; + struct bch_csum csum; + int ret; + + nofs_flags = memalloc_nofs_save(); + + /* Reset iterator for checksumming and copying bounced data: */ + if (rbio->bounce) { + src->bi_iter.bi_size = crc.compressed_size << 9; + src->bi_iter.bi_idx = 0; + src->bi_iter.bi_bvec_done = 0; + } else { + src->bi_iter = rbio->bvec_iter; + } + + csum = bch2_checksum_bio(c, crc.csum_type, nonce, src); + if (bch2_crc_cmp(csum, rbio->pick.crc.csum) && !c->opts.no_data_io) + goto csum_err; + + /* + * XXX + * We need to rework the narrow_crcs path to deliver the read completion + * first, and then punt to a different workqueue, otherwise we're + * holding up reads while doing btree updates which is bad for memory + * reclaim. + */ + if (unlikely(rbio->narrow_crcs)) + bch2_rbio_narrow_crcs(rbio); + + if (rbio->flags & BCH_READ_NODECODE) + goto nodecode; + + /* Adjust crc to point to subset of data we want: */ + crc.offset += rbio->offset_into_extent; + crc.live_size = bvec_iter_sectors(rbio->bvec_iter); + + if (crc_is_compressed(crc)) { + ret = bch2_encrypt_bio(c, crc.csum_type, nonce, src); + if (ret) + goto decrypt_err; + + if (bch2_bio_uncompress(c, src, dst, dst_iter, crc) && + !c->opts.no_data_io) + goto decompression_err; + } else { + /* don't need to decrypt the entire bio: */ + nonce = nonce_add(nonce, crc.offset << 9); + bio_advance(src, crc.offset << 9); + + BUG_ON(src->bi_iter.bi_size < dst_iter.bi_size); + src->bi_iter.bi_size = dst_iter.bi_size; + + ret = bch2_encrypt_bio(c, crc.csum_type, nonce, src); + if (ret) + goto decrypt_err; + + if (rbio->bounce) { + struct bvec_iter src_iter = src->bi_iter; + + bio_copy_data_iter(dst, &dst_iter, src, &src_iter); + } + } + + if (rbio->promote) { + /* + * Re encrypt data we decrypted, so it's consistent with + * rbio->crc: + */ + ret = bch2_encrypt_bio(c, crc.csum_type, nonce, src); + if (ret) + goto decrypt_err; + + promote_start(rbio->promote, rbio); + rbio->promote = NULL; + } +nodecode: + if (likely(!(rbio->flags & BCH_READ_IN_RETRY))) { + rbio = bch2_rbio_free(rbio); + bch2_rbio_done(rbio); + } +out: + memalloc_nofs_restore(nofs_flags); + return; +csum_err: + /* + * Checksum error: if the bio wasn't bounced, we may have been + * reading into buffers owned by userspace (that userspace can + * scribble over) - retry the read, bouncing it this time: + */ + if (!rbio->bounce && (rbio->flags & BCH_READ_USER_MAPPED)) { + rbio->flags |= BCH_READ_MUST_BOUNCE; + bch2_rbio_error(rbio, READ_RETRY, BLK_STS_IOERR); + goto out; + } + + bch_err_inum_offset_ratelimited(ca, + rbio->read_pos.inode, + rbio->read_pos.offset << 9, + "data checksum error: expected %0llx:%0llx got %0llx:%0llx (type %s)", + rbio->pick.crc.csum.hi, rbio->pick.crc.csum.lo, + csum.hi, csum.lo, bch2_csum_types[crc.csum_type]); + bch2_io_error(ca, BCH_MEMBER_ERROR_checksum); + bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR); + goto out; +decompression_err: + bch_err_inum_offset_ratelimited(c, rbio->read_pos.inode, + rbio->read_pos.offset << 9, + "decompression error"); + bch2_rbio_error(rbio, READ_ERR, BLK_STS_IOERR); + goto out; +decrypt_err: + bch_err_inum_offset_ratelimited(c, rbio->read_pos.inode, + rbio->read_pos.offset << 9, + "decrypt error"); + bch2_rbio_error(rbio, READ_ERR, BLK_STS_IOERR); + goto out; +} + +static void bch2_read_endio(struct bio *bio) +{ + struct bch_read_bio *rbio = + container_of(bio, struct bch_read_bio, bio); + struct bch_fs *c = rbio->c; + struct bch_dev *ca = bch_dev_bkey_exists(c, rbio->pick.ptr.dev); + struct workqueue_struct *wq = NULL; + enum rbio_context context = RBIO_CONTEXT_NULL; + + if (rbio->have_ioref) { + bch2_latency_acct(ca, rbio->submit_time, READ); + percpu_ref_put(&ca->io_ref); + } + + if (!rbio->split) + rbio->bio.bi_end_io = rbio->end_io; + + if (bch2_dev_inum_io_err_on(bio->bi_status, ca, BCH_MEMBER_ERROR_read, + rbio->read_pos.inode, + rbio->read_pos.offset, + "data read error: %s", + bch2_blk_status_to_str(bio->bi_status))) { + bch2_rbio_error(rbio, READ_RETRY_AVOID, bio->bi_status); + return; + } + + if (((rbio->flags & BCH_READ_RETRY_IF_STALE) && race_fault()) || + ptr_stale(ca, &rbio->pick.ptr)) { + trace_and_count(c, read_reuse_race, &rbio->bio); + + if (rbio->flags & BCH_READ_RETRY_IF_STALE) + bch2_rbio_error(rbio, READ_RETRY, BLK_STS_AGAIN); + else + bch2_rbio_error(rbio, READ_ERR, BLK_STS_AGAIN); + return; + } + + if (rbio->narrow_crcs || + rbio->promote || + crc_is_compressed(rbio->pick.crc) || + bch2_csum_type_is_encryption(rbio->pick.crc.csum_type)) + context = RBIO_CONTEXT_UNBOUND, wq = system_unbound_wq; + else if (rbio->pick.crc.csum_type) + context = RBIO_CONTEXT_HIGHPRI, wq = system_highpri_wq; + + bch2_rbio_punt(rbio, __bch2_read_endio, context, wq); +} + +int __bch2_read_indirect_extent(struct btree_trans *trans, + unsigned *offset_into_extent, + struct bkey_buf *orig_k) +{ + struct btree_iter iter; + struct bkey_s_c k; + u64 reflink_offset; + int ret; + + reflink_offset = le64_to_cpu(bkey_i_to_reflink_p(orig_k->k)->v.idx) + + *offset_into_extent; + + k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_reflink, + POS(0, reflink_offset), 0); + ret = bkey_err(k); + if (ret) + goto err; + + if (k.k->type != KEY_TYPE_reflink_v && + k.k->type != KEY_TYPE_indirect_inline_data) { + bch_err_inum_offset_ratelimited(trans->c, + orig_k->k->k.p.inode, + orig_k->k->k.p.offset << 9, + "%llu len %u points to nonexistent indirect extent %llu", + orig_k->k->k.p.offset, + orig_k->k->k.size, + reflink_offset); + bch2_inconsistent_error(trans->c); + ret = -EIO; + goto err; + } + + *offset_into_extent = iter.pos.offset - bkey_start_offset(k.k); + bch2_bkey_buf_reassemble(orig_k, trans->c, k); +err: + bch2_trans_iter_exit(trans, &iter); + return ret; +} + +static noinline void read_from_stale_dirty_pointer(struct btree_trans *trans, + struct bkey_s_c k, + struct bch_extent_ptr ptr) +{ + struct bch_fs *c = trans->c; + struct bch_dev *ca = bch_dev_bkey_exists(c, ptr.dev); + struct btree_iter iter; + struct printbuf buf = PRINTBUF; + int ret; + + bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, + PTR_BUCKET_POS(c, &ptr), + BTREE_ITER_CACHED); + + prt_printf(&buf, "Attempting to read from stale dirty pointer:"); + printbuf_indent_add(&buf, 2); + prt_newline(&buf); + + bch2_bkey_val_to_text(&buf, c, k); + prt_newline(&buf); + + prt_printf(&buf, "memory gen: %u", *bucket_gen(ca, iter.pos.offset)); + + ret = lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_slot(&iter))); + if (!ret) { + prt_newline(&buf); + bch2_bkey_val_to_text(&buf, c, k); + } + + bch2_fs_inconsistent(c, "%s", buf.buf); + + bch2_trans_iter_exit(trans, &iter); + printbuf_exit(&buf); +} + +int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig, + struct bvec_iter iter, struct bpos read_pos, + enum btree_id data_btree, struct bkey_s_c k, + unsigned offset_into_extent, + struct bch_io_failures *failed, unsigned flags) +{ + struct bch_fs *c = trans->c; + struct extent_ptr_decoded pick; + struct bch_read_bio *rbio = NULL; + struct bch_dev *ca = NULL; + struct promote_op *promote = NULL; + bool bounce = false, read_full = false, narrow_crcs = false; + struct bpos data_pos = bkey_start_pos(k.k); + int pick_ret; + + if (bkey_extent_is_inline_data(k.k)) { + unsigned bytes = min_t(unsigned, iter.bi_size, + bkey_inline_data_bytes(k.k)); + + swap(iter.bi_size, bytes); + memcpy_to_bio(&orig->bio, iter, bkey_inline_data_p(k)); + swap(iter.bi_size, bytes); + bio_advance_iter(&orig->bio, &iter, bytes); + zero_fill_bio_iter(&orig->bio, iter); + goto out_read_done; + } +retry_pick: + pick_ret = bch2_bkey_pick_read_device(c, k, failed, &pick); + + /* hole or reservation - just zero fill: */ + if (!pick_ret) + goto hole; + + if (pick_ret < 0) { + bch_err_inum_offset_ratelimited(c, + read_pos.inode, read_pos.offset << 9, + "no device to read from"); + goto err; + } + + ca = bch_dev_bkey_exists(c, pick.ptr.dev); + + /* + * Stale dirty pointers are treated as IO errors, but @failed isn't + * allocated unless we're in the retry path - so if we're not in the + * retry path, don't check here, it'll be caught in bch2_read_endio() + * and we'll end up in the retry path: + */ + if ((flags & BCH_READ_IN_RETRY) && + !pick.ptr.cached && + unlikely(ptr_stale(ca, &pick.ptr))) { + read_from_stale_dirty_pointer(trans, k, pick.ptr); + bch2_mark_io_failure(failed, &pick); + goto retry_pick; + } + + /* + * Unlock the iterator while the btree node's lock is still in + * cache, before doing the IO: + */ + bch2_trans_unlock(trans); + + if (flags & BCH_READ_NODECODE) { + /* + * can happen if we retry, and the extent we were going to read + * has been merged in the meantime: + */ + if (pick.crc.compressed_size > orig->bio.bi_vcnt * PAGE_SECTORS) + goto hole; + + iter.bi_size = pick.crc.compressed_size << 9; + goto get_bio; + } + + if (!(flags & BCH_READ_LAST_FRAGMENT) || + bio_flagged(&orig->bio, BIO_CHAIN)) + flags |= BCH_READ_MUST_CLONE; + + narrow_crcs = !(flags & BCH_READ_IN_RETRY) && + bch2_can_narrow_extent_crcs(k, pick.crc); + + if (narrow_crcs && (flags & BCH_READ_USER_MAPPED)) + flags |= BCH_READ_MUST_BOUNCE; + + EBUG_ON(offset_into_extent + bvec_iter_sectors(iter) > k.k->size); + + if (crc_is_compressed(pick.crc) || + (pick.crc.csum_type != BCH_CSUM_none && + (bvec_iter_sectors(iter) != pick.crc.uncompressed_size || + (bch2_csum_type_is_encryption(pick.crc.csum_type) && + (flags & BCH_READ_USER_MAPPED)) || + (flags & BCH_READ_MUST_BOUNCE)))) { + read_full = true; + bounce = true; + } + + if (orig->opts.promote_target) + promote = promote_alloc(trans, iter, k, &pick, orig->opts, flags, + &rbio, &bounce, &read_full); + + if (!read_full) { + EBUG_ON(crc_is_compressed(pick.crc)); + EBUG_ON(pick.crc.csum_type && + (bvec_iter_sectors(iter) != pick.crc.uncompressed_size || + bvec_iter_sectors(iter) != pick.crc.live_size || + pick.crc.offset || + offset_into_extent)); + + data_pos.offset += offset_into_extent; + pick.ptr.offset += pick.crc.offset + + offset_into_extent; + offset_into_extent = 0; + pick.crc.compressed_size = bvec_iter_sectors(iter); + pick.crc.uncompressed_size = bvec_iter_sectors(iter); + pick.crc.offset = 0; + pick.crc.live_size = bvec_iter_sectors(iter); + } +get_bio: + if (rbio) { + /* + * promote already allocated bounce rbio: + * promote needs to allocate a bio big enough for uncompressing + * data in the write path, but we're not going to use it all + * here: + */ + EBUG_ON(rbio->bio.bi_iter.bi_size < + pick.crc.compressed_size << 9); + rbio->bio.bi_iter.bi_size = + pick.crc.compressed_size << 9; + } else if (bounce) { + unsigned sectors = pick.crc.compressed_size; + + rbio = rbio_init(bio_alloc_bioset(NULL, + DIV_ROUND_UP(sectors, PAGE_SECTORS), + 0, + GFP_NOFS, + &c->bio_read_split), + orig->opts); + + bch2_bio_alloc_pages_pool(c, &rbio->bio, sectors << 9); + rbio->bounce = true; + rbio->split = true; + } else if (flags & BCH_READ_MUST_CLONE) { + /* + * Have to clone if there were any splits, due to error + * reporting issues (if a split errored, and retrying didn't + * work, when it reports the error to its parent (us) we don't + * know if the error was from our bio, and we should retry, or + * from the whole bio, in which case we don't want to retry and + * lose the error) + */ + rbio = rbio_init(bio_alloc_clone(NULL, &orig->bio, GFP_NOFS, + &c->bio_read_split), + orig->opts); + rbio->bio.bi_iter = iter; + rbio->split = true; + } else { + rbio = orig; + rbio->bio.bi_iter = iter; + EBUG_ON(bio_flagged(&rbio->bio, BIO_CHAIN)); + } + + EBUG_ON(bio_sectors(&rbio->bio) != pick.crc.compressed_size); + + rbio->c = c; + rbio->submit_time = local_clock(); + if (rbio->split) + rbio->parent = orig; + else + rbio->end_io = orig->bio.bi_end_io; + rbio->bvec_iter = iter; + rbio->offset_into_extent= offset_into_extent; + rbio->flags = flags; + rbio->have_ioref = pick_ret > 0 && bch2_dev_get_ioref(ca, READ); + rbio->narrow_crcs = narrow_crcs; + rbio->hole = 0; + rbio->retry = 0; + rbio->context = 0; + /* XXX: only initialize this if needed */ + rbio->devs_have = bch2_bkey_devs(k); + rbio->pick = pick; + rbio->subvol = orig->subvol; + rbio->read_pos = read_pos; + rbio->data_btree = data_btree; + rbio->data_pos = data_pos; + rbio->version = k.k->version; + rbio->promote = promote; + INIT_WORK(&rbio->work, NULL); + + rbio->bio.bi_opf = orig->bio.bi_opf; + rbio->bio.bi_iter.bi_sector = pick.ptr.offset; + rbio->bio.bi_end_io = bch2_read_endio; + + if (rbio->bounce) + trace_and_count(c, read_bounce, &rbio->bio); + + this_cpu_add(c->counters[BCH_COUNTER_io_read], bio_sectors(&rbio->bio)); + bch2_increment_clock(c, bio_sectors(&rbio->bio), READ); + + /* + * If it's being moved internally, we don't want to flag it as a cache + * hit: + */ + if (pick.ptr.cached && !(flags & BCH_READ_NODECODE)) + bch2_bucket_io_time_reset(trans, pick.ptr.dev, + PTR_BUCKET_NR(ca, &pick.ptr), READ); + + if (!(flags & (BCH_READ_IN_RETRY|BCH_READ_LAST_FRAGMENT))) { + bio_inc_remaining(&orig->bio); + trace_and_count(c, read_split, &orig->bio); + } + + if (!rbio->pick.idx) { + if (!rbio->have_ioref) { + bch_err_inum_offset_ratelimited(c, + read_pos.inode, + read_pos.offset << 9, + "no device to read from"); + bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR); + goto out; + } + + this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_user], + bio_sectors(&rbio->bio)); + bio_set_dev(&rbio->bio, ca->disk_sb.bdev); + + if (unlikely(c->opts.no_data_io)) { + if (likely(!(flags & BCH_READ_IN_RETRY))) + bio_endio(&rbio->bio); + } else { + if (likely(!(flags & BCH_READ_IN_RETRY))) + submit_bio(&rbio->bio); + else + submit_bio_wait(&rbio->bio); + } + + /* + * We just submitted IO which may block, we expect relock fail + * events and shouldn't count them: + */ + trans->notrace_relock_fail = true; + } else { + /* Attempting reconstruct read: */ + if (bch2_ec_read_extent(trans, rbio)) { + bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR); + goto out; + } + + if (likely(!(flags & BCH_READ_IN_RETRY))) + bio_endio(&rbio->bio); + } +out: + if (likely(!(flags & BCH_READ_IN_RETRY))) { + return 0; + } else { + int ret; + + rbio->context = RBIO_CONTEXT_UNBOUND; + bch2_read_endio(&rbio->bio); + + ret = rbio->retry; + rbio = bch2_rbio_free(rbio); + + if (ret == READ_RETRY_AVOID) { + bch2_mark_io_failure(failed, &pick); + ret = READ_RETRY; + } + + if (!ret) + goto out_read_done; + + return ret; + } + +err: + if (flags & BCH_READ_IN_RETRY) + return READ_ERR; + + orig->bio.bi_status = BLK_STS_IOERR; + goto out_read_done; + +hole: + /* + * won't normally happen in the BCH_READ_NODECODE + * (bch2_move_extent()) path, but if we retry and the extent we wanted + * to read no longer exists we have to signal that: + */ + if (flags & BCH_READ_NODECODE) + orig->hole = true; + + zero_fill_bio_iter(&orig->bio, iter); +out_read_done: + if (flags & BCH_READ_LAST_FRAGMENT) + bch2_rbio_done(orig); + return 0; +} + +void __bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, + struct bvec_iter bvec_iter, subvol_inum inum, + struct bch_io_failures *failed, unsigned flags) +{ + struct btree_trans *trans = bch2_trans_get(c); + struct btree_iter iter; + struct bkey_buf sk; + struct bkey_s_c k; + u32 snapshot; + int ret; + + BUG_ON(flags & BCH_READ_NODECODE); + + bch2_bkey_buf_init(&sk); +retry: + bch2_trans_begin(trans); + iter = (struct btree_iter) { NULL }; + + ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot); + if (ret) + goto err; + + bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, + SPOS(inum.inum, bvec_iter.bi_sector, snapshot), + BTREE_ITER_SLOTS); + while (1) { + unsigned bytes, sectors, offset_into_extent; + enum btree_id data_btree = BTREE_ID_extents; + + /* + * read_extent -> io_time_reset may cause a transaction restart + * without returning an error, we need to check for that here: + */ + ret = bch2_trans_relock(trans); + if (ret) + break; + + bch2_btree_iter_set_pos(&iter, + POS(inum.inum, bvec_iter.bi_sector)); + + k = bch2_btree_iter_peek_slot(&iter); + ret = bkey_err(k); + if (ret) + break; + + offset_into_extent = iter.pos.offset - + bkey_start_offset(k.k); + sectors = k.k->size - offset_into_extent; + + bch2_bkey_buf_reassemble(&sk, c, k); + + ret = bch2_read_indirect_extent(trans, &data_btree, + &offset_into_extent, &sk); + if (ret) + break; + + k = bkey_i_to_s_c(sk.k); + + /* + * With indirect extents, the amount of data to read is the min + * of the original extent and the indirect extent: + */ + sectors = min(sectors, k.k->size - offset_into_extent); + + bytes = min(sectors, bvec_iter_sectors(bvec_iter)) << 9; + swap(bvec_iter.bi_size, bytes); + + if (bvec_iter.bi_size == bytes) + flags |= BCH_READ_LAST_FRAGMENT; + + ret = __bch2_read_extent(trans, rbio, bvec_iter, iter.pos, + data_btree, k, + offset_into_extent, failed, flags); + if (ret) + break; + + if (flags & BCH_READ_LAST_FRAGMENT) + break; + + swap(bvec_iter.bi_size, bytes); + bio_advance_iter(&rbio->bio, &bvec_iter, bytes); + + ret = btree_trans_too_many_iters(trans); + if (ret) + break; + } +err: + bch2_trans_iter_exit(trans, &iter); + + if (bch2_err_matches(ret, BCH_ERR_transaction_restart) || + ret == READ_RETRY || + ret == READ_RETRY_AVOID) + goto retry; + + bch2_trans_put(trans); + bch2_bkey_buf_exit(&sk, c); + + if (ret) { + bch_err_inum_offset_ratelimited(c, inum.inum, + bvec_iter.bi_sector << 9, + "read error %i from btree lookup", ret); + rbio->bio.bi_status = BLK_STS_IOERR; + bch2_rbio_done(rbio); + } +} + +void bch2_fs_io_read_exit(struct bch_fs *c) +{ + if (c->promote_table.tbl) + rhashtable_destroy(&c->promote_table); + bioset_exit(&c->bio_read_split); + bioset_exit(&c->bio_read); +} + +int bch2_fs_io_read_init(struct bch_fs *c) +{ + if (bioset_init(&c->bio_read, 1, offsetof(struct bch_read_bio, bio), + BIOSET_NEED_BVECS)) + return -BCH_ERR_ENOMEM_bio_read_init; + + if (bioset_init(&c->bio_read_split, 1, offsetof(struct bch_read_bio, bio), + BIOSET_NEED_BVECS)) + return -BCH_ERR_ENOMEM_bio_read_split_init; + + if (rhashtable_init(&c->promote_table, &bch_promote_params)) + return -BCH_ERR_ENOMEM_promote_table_init; + + return 0; +} diff --git a/fs/bcachefs/io_read.h b/fs/bcachefs/io_read.h new file mode 100644 index 000000000000..d9c18bb7d403 --- /dev/null +++ b/fs/bcachefs/io_read.h @@ -0,0 +1,158 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_IO_READ_H +#define _BCACHEFS_IO_READ_H + +#include "bkey_buf.h" + +struct bch_read_bio { + struct bch_fs *c; + u64 start_time; + u64 submit_time; + + /* + * Reads will often have to be split, and if the extent being read from + * was checksummed or compressed we'll also have to allocate bounce + * buffers and copy the data back into the original bio. + * + * If we didn't have to split, we have to save and restore the original + * bi_end_io - @split below indicates which: + */ + union { + struct bch_read_bio *parent; + bio_end_io_t *end_io; + }; + + /* + * Saved copy of bio->bi_iter, from submission time - allows us to + * resubmit on IO error, and also to copy data back to the original bio + * when we're bouncing: + */ + struct bvec_iter bvec_iter; + + unsigned offset_into_extent; + + u16 flags; + union { + struct { + u16 bounce:1, + split:1, + kmalloc:1, + have_ioref:1, + narrow_crcs:1, + hole:1, + retry:2, + context:2; + }; + u16 _state; + }; + + struct bch_devs_list devs_have; + + struct extent_ptr_decoded pick; + + /* + * pos we read from - different from data_pos for indirect extents: + */ + u32 subvol; + struct bpos read_pos; + + /* + * start pos of data we read (may not be pos of data we want) - for + * promote, narrow extents paths: + */ + enum btree_id data_btree; + struct bpos data_pos; + struct bversion version; + + struct promote_op *promote; + + struct bch_io_opts opts; + + struct work_struct work; + + struct bio bio; +}; + +#define to_rbio(_bio) container_of((_bio), struct bch_read_bio, bio) + +struct bch_devs_mask; +struct cache_promote_op; +struct extent_ptr_decoded; + +int __bch2_read_indirect_extent(struct btree_trans *, unsigned *, + struct bkey_buf *); + +static inline int bch2_read_indirect_extent(struct btree_trans *trans, + enum btree_id *data_btree, + unsigned *offset_into_extent, + struct bkey_buf *k) +{ + if (k->k->k.type != KEY_TYPE_reflink_p) + return 0; + + *data_btree = BTREE_ID_reflink; + return __bch2_read_indirect_extent(trans, offset_into_extent, k); +} + +enum bch_read_flags { + BCH_READ_RETRY_IF_STALE = 1 << 0, + BCH_READ_MAY_PROMOTE = 1 << 1, + BCH_READ_USER_MAPPED = 1 << 2, + BCH_READ_NODECODE = 1 << 3, + BCH_READ_LAST_FRAGMENT = 1 << 4, + + /* internal: */ + BCH_READ_MUST_BOUNCE = 1 << 5, + BCH_READ_MUST_CLONE = 1 << 6, + BCH_READ_IN_RETRY = 1 << 7, +}; + +int __bch2_read_extent(struct btree_trans *, struct bch_read_bio *, + struct bvec_iter, struct bpos, enum btree_id, + struct bkey_s_c, unsigned, + struct bch_io_failures *, unsigned); + +static inline void bch2_read_extent(struct btree_trans *trans, + struct bch_read_bio *rbio, struct bpos read_pos, + enum btree_id data_btree, struct bkey_s_c k, + unsigned offset_into_extent, unsigned flags) +{ + __bch2_read_extent(trans, rbio, rbio->bio.bi_iter, read_pos, + data_btree, k, offset_into_extent, NULL, flags); +} + +void __bch2_read(struct bch_fs *, struct bch_read_bio *, struct bvec_iter, + subvol_inum, struct bch_io_failures *, unsigned flags); + +static inline void bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, + subvol_inum inum) +{ + struct bch_io_failures failed = { .nr = 0 }; + + BUG_ON(rbio->_state); + + rbio->c = c; + rbio->start_time = local_clock(); + rbio->subvol = inum.subvol; + + __bch2_read(c, rbio, rbio->bio.bi_iter, inum, &failed, + BCH_READ_RETRY_IF_STALE| + BCH_READ_MAY_PROMOTE| + BCH_READ_USER_MAPPED); +} + +static inline struct bch_read_bio *rbio_init(struct bio *bio, + struct bch_io_opts opts) +{ + struct bch_read_bio *rbio = to_rbio(bio); + + rbio->_state = 0; + rbio->promote = NULL; + rbio->opts = opts; + return rbio; +} + +void bch2_fs_io_read_exit(struct bch_fs *); +int bch2_fs_io_read_init(struct bch_fs *); + +#endif /* _BCACHEFS_IO_READ_H */ diff --git a/fs/bcachefs/io_write.c b/fs/bcachefs/io_write.c new file mode 100644 index 000000000000..8ede46b1e354 --- /dev/null +++ b/fs/bcachefs/io_write.c @@ -0,0 +1,1675 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com> + * Copyright 2012 Google, Inc. + */ + +#include "bcachefs.h" +#include "alloc_foreground.h" +#include "bkey_buf.h" +#include "bset.h" +#include "btree_update.h" +#include "buckets.h" +#include "checksum.h" +#include "clock.h" +#include "compress.h" +#include "debug.h" +#include "ec.h" +#include "error.h" +#include "extent_update.h" +#include "inode.h" +#include "io_write.h" +#include "journal.h" +#include "keylist.h" +#include "move.h" +#include "nocow_locking.h" +#include "rebalance.h" +#include "subvolume.h" +#include "super.h" +#include "super-io.h" +#include "trace.h" + +#include <linux/blkdev.h> +#include <linux/prefetch.h> +#include <linux/random.h> +#include <linux/sched/mm.h> + +#ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT + +static inline void bch2_congested_acct(struct bch_dev *ca, u64 io_latency, + u64 now, int rw) +{ + u64 latency_capable = + ca->io_latency[rw].quantiles.entries[QUANTILE_IDX(1)].m; + /* ideally we'd be taking into account the device's variance here: */ + u64 latency_threshold = latency_capable << (rw == READ ? 2 : 3); + s64 latency_over = io_latency - latency_threshold; + + if (latency_threshold && latency_over > 0) { + /* + * bump up congested by approximately latency_over * 4 / + * latency_threshold - we don't need much accuracy here so don't + * bother with the divide: + */ + if (atomic_read(&ca->congested) < CONGESTED_MAX) + atomic_add(latency_over >> + max_t(int, ilog2(latency_threshold) - 2, 0), + &ca->congested); + + ca->congested_last = now; + } else if (atomic_read(&ca->congested) > 0) { + atomic_dec(&ca->congested); + } +} + +void bch2_latency_acct(struct bch_dev *ca, u64 submit_time, int rw) +{ + atomic64_t *latency = &ca->cur_latency[rw]; + u64 now = local_clock(); + u64 io_latency = time_after64(now, submit_time) + ? now - submit_time + : 0; + u64 old, new, v = atomic64_read(latency); + + do { + old = v; + + /* + * If the io latency was reasonably close to the current + * latency, skip doing the update and atomic operation - most of + * the time: + */ + if (abs((int) (old - io_latency)) < (old >> 1) && + now & ~(~0U << 5)) + break; + + new = ewma_add(old, io_latency, 5); + } while ((v = atomic64_cmpxchg(latency, old, new)) != old); + + bch2_congested_acct(ca, io_latency, now, rw); + + __bch2_time_stats_update(&ca->io_latency[rw], submit_time, now); +} + +#endif + +/* Allocate, free from mempool: */ + +void bch2_bio_free_pages_pool(struct bch_fs *c, struct bio *bio) +{ + struct bvec_iter_all iter; + struct bio_vec *bv; + + bio_for_each_segment_all(bv, bio, iter) + if (bv->bv_page != ZERO_PAGE(0)) + mempool_free(bv->bv_page, &c->bio_bounce_pages); + bio->bi_vcnt = 0; +} + +static struct page *__bio_alloc_page_pool(struct bch_fs *c, bool *using_mempool) +{ + struct page *page; + + if (likely(!*using_mempool)) { + page = alloc_page(GFP_NOFS); + if (unlikely(!page)) { + mutex_lock(&c->bio_bounce_pages_lock); + *using_mempool = true; + goto pool_alloc; + + } + } else { +pool_alloc: + page = mempool_alloc(&c->bio_bounce_pages, GFP_NOFS); + } + + return page; +} + +void bch2_bio_alloc_pages_pool(struct bch_fs *c, struct bio *bio, + size_t size) +{ + bool using_mempool = false; + + while (size) { + struct page *page = __bio_alloc_page_pool(c, &using_mempool); + unsigned len = min_t(size_t, PAGE_SIZE, size); + + BUG_ON(!bio_add_page(bio, page, len, 0)); + size -= len; + } + + if (using_mempool) + mutex_unlock(&c->bio_bounce_pages_lock); +} + +/* Extent update path: */ + +int bch2_sum_sector_overwrites(struct btree_trans *trans, + struct btree_iter *extent_iter, + struct bkey_i *new, + bool *usage_increasing, + s64 *i_sectors_delta, + s64 *disk_sectors_delta) +{ + struct bch_fs *c = trans->c; + struct btree_iter iter; + struct bkey_s_c old; + unsigned new_replicas = bch2_bkey_replicas(c, bkey_i_to_s_c(new)); + bool new_compressed = bch2_bkey_sectors_compressed(bkey_i_to_s_c(new)); + int ret = 0; + + *usage_increasing = false; + *i_sectors_delta = 0; + *disk_sectors_delta = 0; + + bch2_trans_copy_iter(&iter, extent_iter); + + for_each_btree_key_upto_continue_norestart(iter, + new->k.p, BTREE_ITER_SLOTS, old, ret) { + s64 sectors = min(new->k.p.offset, old.k->p.offset) - + max(bkey_start_offset(&new->k), + bkey_start_offset(old.k)); + + *i_sectors_delta += sectors * + (bkey_extent_is_allocation(&new->k) - + bkey_extent_is_allocation(old.k)); + + *disk_sectors_delta += sectors * bch2_bkey_nr_ptrs_allocated(bkey_i_to_s_c(new)); + *disk_sectors_delta -= new->k.p.snapshot == old.k->p.snapshot + ? sectors * bch2_bkey_nr_ptrs_fully_allocated(old) + : 0; + + if (!*usage_increasing && + (new->k.p.snapshot != old.k->p.snapshot || + new_replicas > bch2_bkey_replicas(c, old) || + (!new_compressed && bch2_bkey_sectors_compressed(old)))) + *usage_increasing = true; + + if (bkey_ge(old.k->p, new->k.p)) + break; + } + + bch2_trans_iter_exit(trans, &iter); + return ret; +} + +static inline int bch2_extent_update_i_size_sectors(struct btree_trans *trans, + struct btree_iter *extent_iter, + u64 new_i_size, + s64 i_sectors_delta) +{ + struct btree_iter iter; + struct bkey_i *k; + struct bkey_i_inode_v3 *inode; + /* + * Crazy performance optimization: + * Every extent update needs to also update the inode: the inode trigger + * will set bi->journal_seq to the journal sequence number of this + * transaction - for fsync. + * + * But if that's the only reason we're updating the inode (we're not + * updating bi_size or bi_sectors), then we don't need the inode update + * to be journalled - if we crash, the bi_journal_seq update will be + * lost, but that's fine. + */ + unsigned inode_update_flags = BTREE_UPDATE_NOJOURNAL; + int ret; + + k = bch2_bkey_get_mut_noupdate(trans, &iter, BTREE_ID_inodes, + SPOS(0, + extent_iter->pos.inode, + extent_iter->snapshot), + BTREE_ITER_CACHED); + ret = PTR_ERR_OR_ZERO(k); + if (unlikely(ret)) + return ret; + + if (unlikely(k->k.type != KEY_TYPE_inode_v3)) { + k = bch2_inode_to_v3(trans, k); + ret = PTR_ERR_OR_ZERO(k); + if (unlikely(ret)) + goto err; + } + + inode = bkey_i_to_inode_v3(k); + + if (!(le64_to_cpu(inode->v.bi_flags) & BCH_INODE_i_size_dirty) && + new_i_size > le64_to_cpu(inode->v.bi_size)) { + inode->v.bi_size = cpu_to_le64(new_i_size); + inode_update_flags = 0; + } + + if (i_sectors_delta) { + le64_add_cpu(&inode->v.bi_sectors, i_sectors_delta); + inode_update_flags = 0; + } + + if (inode->k.p.snapshot != iter.snapshot) { + inode->k.p.snapshot = iter.snapshot; + inode_update_flags = 0; + } + + ret = bch2_trans_update(trans, &iter, &inode->k_i, + BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE| + inode_update_flags); +err: + bch2_trans_iter_exit(trans, &iter); + return ret; +} + +int bch2_extent_update(struct btree_trans *trans, + subvol_inum inum, + struct btree_iter *iter, + struct bkey_i *k, + struct disk_reservation *disk_res, + u64 new_i_size, + s64 *i_sectors_delta_total, + bool check_enospc) +{ + struct bpos next_pos; + bool usage_increasing; + s64 i_sectors_delta = 0, disk_sectors_delta = 0; + int ret; + + /* + * This traverses us the iterator without changing iter->path->pos to + * search_key() (which is pos + 1 for extents): we want there to be a + * path already traversed at iter->pos because + * bch2_trans_extent_update() will use it to attempt extent merging + */ + ret = __bch2_btree_iter_traverse(iter); + if (ret) + return ret; + + ret = bch2_extent_trim_atomic(trans, iter, k); + if (ret) + return ret; + + next_pos = k->k.p; + + ret = bch2_sum_sector_overwrites(trans, iter, k, + &usage_increasing, + &i_sectors_delta, + &disk_sectors_delta); + if (ret) + return ret; + + if (disk_res && + disk_sectors_delta > (s64) disk_res->sectors) { + ret = bch2_disk_reservation_add(trans->c, disk_res, + disk_sectors_delta - disk_res->sectors, + !check_enospc || !usage_increasing + ? BCH_DISK_RESERVATION_NOFAIL : 0); + if (ret) + return ret; + } + + /* + * Note: + * We always have to do an inode update - even when i_size/i_sectors + * aren't changing - for fsync to work properly; fsync relies on + * inode->bi_journal_seq which is updated by the trigger code: + */ + ret = bch2_extent_update_i_size_sectors(trans, iter, + min(k->k.p.offset << 9, new_i_size), + i_sectors_delta) ?: + bch2_trans_update(trans, iter, k, 0) ?: + bch2_trans_commit(trans, disk_res, NULL, + BTREE_INSERT_NOCHECK_RW| + BTREE_INSERT_NOFAIL); + if (unlikely(ret)) + return ret; + + if (i_sectors_delta_total) + *i_sectors_delta_total += i_sectors_delta; + bch2_btree_iter_set_pos(iter, next_pos); + return 0; +} + +static int bch2_write_index_default(struct bch_write_op *op) +{ + struct bch_fs *c = op->c; + struct bkey_buf sk; + struct keylist *keys = &op->insert_keys; + struct bkey_i *k = bch2_keylist_front(keys); + struct btree_trans *trans = bch2_trans_get(c); + struct btree_iter iter; + subvol_inum inum = { + .subvol = op->subvol, + .inum = k->k.p.inode, + }; + int ret; + + BUG_ON(!inum.subvol); + + bch2_bkey_buf_init(&sk); + + do { + bch2_trans_begin(trans); + + k = bch2_keylist_front(keys); + bch2_bkey_buf_copy(&sk, c, k); + + ret = bch2_subvolume_get_snapshot(trans, inum.subvol, + &sk.k->k.p.snapshot); + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + continue; + if (ret) + break; + + bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, + bkey_start_pos(&sk.k->k), + BTREE_ITER_SLOTS|BTREE_ITER_INTENT); + + ret = bch2_bkey_set_needs_rebalance(c, sk.k, + op->opts.background_target, + op->opts.background_compression) ?: + bch2_extent_update(trans, inum, &iter, sk.k, + &op->res, + op->new_i_size, &op->i_sectors_delta, + op->flags & BCH_WRITE_CHECK_ENOSPC); + bch2_trans_iter_exit(trans, &iter); + + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + continue; + if (ret) + break; + + if (bkey_ge(iter.pos, k->k.p)) + bch2_keylist_pop_front(&op->insert_keys); + else + bch2_cut_front(iter.pos, k); + } while (!bch2_keylist_empty(keys)); + + bch2_trans_put(trans); + bch2_bkey_buf_exit(&sk, c); + + return ret; +} + +/* Writes */ + +void bch2_submit_wbio_replicas(struct bch_write_bio *wbio, struct bch_fs *c, + enum bch_data_type type, + const struct bkey_i *k, + bool nocow) +{ + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(k)); + const struct bch_extent_ptr *ptr; + struct bch_write_bio *n; + struct bch_dev *ca; + + BUG_ON(c->opts.nochanges); + + bkey_for_each_ptr(ptrs, ptr) { + BUG_ON(ptr->dev >= BCH_SB_MEMBERS_MAX || + !c->devs[ptr->dev]); + + ca = bch_dev_bkey_exists(c, ptr->dev); + + if (to_entry(ptr + 1) < ptrs.end) { + n = to_wbio(bio_alloc_clone(NULL, &wbio->bio, + GFP_NOFS, &ca->replica_set)); + + n->bio.bi_end_io = wbio->bio.bi_end_io; + n->bio.bi_private = wbio->bio.bi_private; + n->parent = wbio; + n->split = true; + n->bounce = false; + n->put_bio = true; + n->bio.bi_opf = wbio->bio.bi_opf; + bio_inc_remaining(&wbio->bio); + } else { + n = wbio; + n->split = false; + } + + n->c = c; + n->dev = ptr->dev; + n->have_ioref = nocow || bch2_dev_get_ioref(ca, + type == BCH_DATA_btree ? READ : WRITE); + n->nocow = nocow; + n->submit_time = local_clock(); + n->inode_offset = bkey_start_offset(&k->k); + n->bio.bi_iter.bi_sector = ptr->offset; + + if (likely(n->have_ioref)) { + this_cpu_add(ca->io_done->sectors[WRITE][type], + bio_sectors(&n->bio)); + + bio_set_dev(&n->bio, ca->disk_sb.bdev); + + if (type != BCH_DATA_btree && unlikely(c->opts.no_data_io)) { + bio_endio(&n->bio); + continue; + } + + submit_bio(&n->bio); + } else { + n->bio.bi_status = BLK_STS_REMOVED; + bio_endio(&n->bio); + } + } +} + +static void __bch2_write(struct bch_write_op *); + +static void bch2_write_done(struct closure *cl) +{ + struct bch_write_op *op = container_of(cl, struct bch_write_op, cl); + struct bch_fs *c = op->c; + + EBUG_ON(op->open_buckets.nr); + + bch2_time_stats_update(&c->times[BCH_TIME_data_write], op->start_time); + bch2_disk_reservation_put(c, &op->res); + + if (!(op->flags & BCH_WRITE_MOVE)) + bch2_write_ref_put(c, BCH_WRITE_REF_write); + bch2_keylist_free(&op->insert_keys, op->inline_keys); + + EBUG_ON(cl->parent); + closure_debug_destroy(cl); + if (op->end_io) + op->end_io(op); +} + +static noinline int bch2_write_drop_io_error_ptrs(struct bch_write_op *op) +{ + struct keylist *keys = &op->insert_keys; + struct bch_extent_ptr *ptr; + struct bkey_i *src, *dst = keys->keys, *n; + + for (src = keys->keys; src != keys->top; src = n) { + n = bkey_next(src); + + if (bkey_extent_is_direct_data(&src->k)) { + bch2_bkey_drop_ptrs(bkey_i_to_s(src), ptr, + test_bit(ptr->dev, op->failed.d)); + + if (!bch2_bkey_nr_ptrs(bkey_i_to_s_c(src))) + return -EIO; + } + + if (dst != src) + memmove_u64s_down(dst, src, src->k.u64s); + dst = bkey_next(dst); + } + + keys->top = dst; + return 0; +} + +/** + * __bch2_write_index - after a write, update index to point to new data + * @op: bch_write_op to process + */ +static void __bch2_write_index(struct bch_write_op *op) +{ + struct bch_fs *c = op->c; + struct keylist *keys = &op->insert_keys; + unsigned dev; + int ret = 0; + + if (unlikely(op->flags & BCH_WRITE_IO_ERROR)) { + ret = bch2_write_drop_io_error_ptrs(op); + if (ret) + goto err; + } + + if (!bch2_keylist_empty(keys)) { + u64 sectors_start = keylist_sectors(keys); + + ret = !(op->flags & BCH_WRITE_MOVE) + ? bch2_write_index_default(op) + : bch2_data_update_index_update(op); + + BUG_ON(bch2_err_matches(ret, BCH_ERR_transaction_restart)); + BUG_ON(keylist_sectors(keys) && !ret); + + op->written += sectors_start - keylist_sectors(keys); + + if (ret && !bch2_err_matches(ret, EROFS)) { + struct bkey_i *insert = bch2_keylist_front(&op->insert_keys); + + bch_err_inum_offset_ratelimited(c, + insert->k.p.inode, insert->k.p.offset << 9, + "write error while doing btree update: %s", + bch2_err_str(ret)); + } + + if (ret) + goto err; + } +out: + /* If some a bucket wasn't written, we can't erasure code it: */ + for_each_set_bit(dev, op->failed.d, BCH_SB_MEMBERS_MAX) + bch2_open_bucket_write_error(c, &op->open_buckets, dev); + + bch2_open_buckets_put(c, &op->open_buckets); + return; +err: + keys->top = keys->keys; + op->error = ret; + op->flags |= BCH_WRITE_DONE; + goto out; +} + +static inline void __wp_update_state(struct write_point *wp, enum write_point_state state) +{ + if (state != wp->state) { + u64 now = ktime_get_ns(); + + if (wp->last_state_change && + time_after64(now, wp->last_state_change)) + wp->time[wp->state] += now - wp->last_state_change; + wp->state = state; + wp->last_state_change = now; + } +} + +static inline void wp_update_state(struct write_point *wp, bool running) +{ + enum write_point_state state; + + state = running ? WRITE_POINT_running : + !list_empty(&wp->writes) ? WRITE_POINT_waiting_io + : WRITE_POINT_stopped; + + __wp_update_state(wp, state); +} + +static CLOSURE_CALLBACK(bch2_write_index) +{ + closure_type(op, struct bch_write_op, cl); + struct write_point *wp = op->wp; + struct workqueue_struct *wq = index_update_wq(op); + unsigned long flags; + + if ((op->flags & BCH_WRITE_DONE) && + (op->flags & BCH_WRITE_MOVE)) + bch2_bio_free_pages_pool(op->c, &op->wbio.bio); + + spin_lock_irqsave(&wp->writes_lock, flags); + if (wp->state == WRITE_POINT_waiting_io) + __wp_update_state(wp, WRITE_POINT_waiting_work); + list_add_tail(&op->wp_list, &wp->writes); + spin_unlock_irqrestore (&wp->writes_lock, flags); + + queue_work(wq, &wp->index_update_work); +} + +static inline void bch2_write_queue(struct bch_write_op *op, struct write_point *wp) +{ + op->wp = wp; + + if (wp->state == WRITE_POINT_stopped) { + spin_lock_irq(&wp->writes_lock); + __wp_update_state(wp, WRITE_POINT_waiting_io); + spin_unlock_irq(&wp->writes_lock); + } +} + +void bch2_write_point_do_index_updates(struct work_struct *work) +{ + struct write_point *wp = + container_of(work, struct write_point, index_update_work); + struct bch_write_op *op; + + while (1) { + spin_lock_irq(&wp->writes_lock); + op = list_first_entry_or_null(&wp->writes, struct bch_write_op, wp_list); + if (op) + list_del(&op->wp_list); + wp_update_state(wp, op != NULL); + spin_unlock_irq(&wp->writes_lock); + + if (!op) + break; + + op->flags |= BCH_WRITE_IN_WORKER; + + __bch2_write_index(op); + + if (!(op->flags & BCH_WRITE_DONE)) + __bch2_write(op); + else + bch2_write_done(&op->cl); + } +} + +static void bch2_write_endio(struct bio *bio) +{ + struct closure *cl = bio->bi_private; + struct bch_write_op *op = container_of(cl, struct bch_write_op, cl); + struct bch_write_bio *wbio = to_wbio(bio); + struct bch_write_bio *parent = wbio->split ? wbio->parent : NULL; + struct bch_fs *c = wbio->c; + struct bch_dev *ca = bch_dev_bkey_exists(c, wbio->dev); + + if (bch2_dev_inum_io_err_on(bio->bi_status, ca, BCH_MEMBER_ERROR_write, + op->pos.inode, + wbio->inode_offset << 9, + "data write error: %s", + bch2_blk_status_to_str(bio->bi_status))) { + set_bit(wbio->dev, op->failed.d); + op->flags |= BCH_WRITE_IO_ERROR; + } + + if (wbio->nocow) + set_bit(wbio->dev, op->devs_need_flush->d); + + if (wbio->have_ioref) { + bch2_latency_acct(ca, wbio->submit_time, WRITE); + percpu_ref_put(&ca->io_ref); + } + + if (wbio->bounce) + bch2_bio_free_pages_pool(c, bio); + + if (wbio->put_bio) + bio_put(bio); + + if (parent) + bio_endio(&parent->bio); + else + closure_put(cl); +} + +static void init_append_extent(struct bch_write_op *op, + struct write_point *wp, + struct bversion version, + struct bch_extent_crc_unpacked crc) +{ + struct bkey_i_extent *e; + + op->pos.offset += crc.uncompressed_size; + + e = bkey_extent_init(op->insert_keys.top); + e->k.p = op->pos; + e->k.size = crc.uncompressed_size; + e->k.version = version; + + if (crc.csum_type || + crc.compression_type || + crc.nonce) + bch2_extent_crc_append(&e->k_i, crc); + + bch2_alloc_sectors_append_ptrs_inlined(op->c, wp, &e->k_i, crc.compressed_size, + op->flags & BCH_WRITE_CACHED); + + bch2_keylist_push(&op->insert_keys); +} + +static struct bio *bch2_write_bio_alloc(struct bch_fs *c, + struct write_point *wp, + struct bio *src, + bool *page_alloc_failed, + void *buf) +{ + struct bch_write_bio *wbio; + struct bio *bio; + unsigned output_available = + min(wp->sectors_free << 9, src->bi_iter.bi_size); + unsigned pages = DIV_ROUND_UP(output_available + + (buf + ? ((unsigned long) buf & (PAGE_SIZE - 1)) + : 0), PAGE_SIZE); + + pages = min(pages, BIO_MAX_VECS); + + bio = bio_alloc_bioset(NULL, pages, 0, + GFP_NOFS, &c->bio_write); + wbio = wbio_init(bio); + wbio->put_bio = true; + /* copy WRITE_SYNC flag */ + wbio->bio.bi_opf = src->bi_opf; + + if (buf) { + bch2_bio_map(bio, buf, output_available); + return bio; + } + + wbio->bounce = true; + + /* + * We can't use mempool for more than c->sb.encoded_extent_max + * worth of pages, but we'd like to allocate more if we can: + */ + bch2_bio_alloc_pages_pool(c, bio, + min_t(unsigned, output_available, + c->opts.encoded_extent_max)); + + if (bio->bi_iter.bi_size < output_available) + *page_alloc_failed = + bch2_bio_alloc_pages(bio, + output_available - + bio->bi_iter.bi_size, + GFP_NOFS) != 0; + + return bio; +} + +static int bch2_write_rechecksum(struct bch_fs *c, + struct bch_write_op *op, + unsigned new_csum_type) +{ + struct bio *bio = &op->wbio.bio; + struct bch_extent_crc_unpacked new_crc; + int ret; + + /* bch2_rechecksum_bio() can't encrypt or decrypt data: */ + + if (bch2_csum_type_is_encryption(op->crc.csum_type) != + bch2_csum_type_is_encryption(new_csum_type)) + new_csum_type = op->crc.csum_type; + + ret = bch2_rechecksum_bio(c, bio, op->version, op->crc, + NULL, &new_crc, + op->crc.offset, op->crc.live_size, + new_csum_type); + if (ret) + return ret; + + bio_advance(bio, op->crc.offset << 9); + bio->bi_iter.bi_size = op->crc.live_size << 9; + op->crc = new_crc; + return 0; +} + +static int bch2_write_decrypt(struct bch_write_op *op) +{ + struct bch_fs *c = op->c; + struct nonce nonce = extent_nonce(op->version, op->crc); + struct bch_csum csum; + int ret; + + if (!bch2_csum_type_is_encryption(op->crc.csum_type)) + return 0; + + /* + * If we need to decrypt data in the write path, we'll no longer be able + * to verify the existing checksum (poly1305 mac, in this case) after + * it's decrypted - this is the last point we'll be able to reverify the + * checksum: + */ + csum = bch2_checksum_bio(c, op->crc.csum_type, nonce, &op->wbio.bio); + if (bch2_crc_cmp(op->crc.csum, csum) && !c->opts.no_data_io) + return -EIO; + + ret = bch2_encrypt_bio(c, op->crc.csum_type, nonce, &op->wbio.bio); + op->crc.csum_type = 0; + op->crc.csum = (struct bch_csum) { 0, 0 }; + return ret; +} + +static enum prep_encoded_ret { + PREP_ENCODED_OK, + PREP_ENCODED_ERR, + PREP_ENCODED_CHECKSUM_ERR, + PREP_ENCODED_DO_WRITE, +} bch2_write_prep_encoded_data(struct bch_write_op *op, struct write_point *wp) +{ + struct bch_fs *c = op->c; + struct bio *bio = &op->wbio.bio; + + if (!(op->flags & BCH_WRITE_DATA_ENCODED)) + return PREP_ENCODED_OK; + + BUG_ON(bio_sectors(bio) != op->crc.compressed_size); + + /* Can we just write the entire extent as is? */ + if (op->crc.uncompressed_size == op->crc.live_size && + op->crc.uncompressed_size <= c->opts.encoded_extent_max >> 9 && + op->crc.compressed_size <= wp->sectors_free && + (op->crc.compression_type == bch2_compression_opt_to_type(op->compression_opt) || + op->incompressible)) { + if (!crc_is_compressed(op->crc) && + op->csum_type != op->crc.csum_type && + bch2_write_rechecksum(c, op, op->csum_type) && + !c->opts.no_data_io) + return PREP_ENCODED_CHECKSUM_ERR; + + return PREP_ENCODED_DO_WRITE; + } + + /* + * If the data is compressed and we couldn't write the entire extent as + * is, we have to decompress it: + */ + if (crc_is_compressed(op->crc)) { + struct bch_csum csum; + + if (bch2_write_decrypt(op)) + return PREP_ENCODED_CHECKSUM_ERR; + + /* Last point we can still verify checksum: */ + csum = bch2_checksum_bio(c, op->crc.csum_type, + extent_nonce(op->version, op->crc), + bio); + if (bch2_crc_cmp(op->crc.csum, csum) && !c->opts.no_data_io) + return PREP_ENCODED_CHECKSUM_ERR; + + if (bch2_bio_uncompress_inplace(c, bio, &op->crc)) + return PREP_ENCODED_ERR; + } + + /* + * No longer have compressed data after this point - data might be + * encrypted: + */ + + /* + * If the data is checksummed and we're only writing a subset, + * rechecksum and adjust bio to point to currently live data: + */ + if ((op->crc.live_size != op->crc.uncompressed_size || + op->crc.csum_type != op->csum_type) && + bch2_write_rechecksum(c, op, op->csum_type) && + !c->opts.no_data_io) + return PREP_ENCODED_CHECKSUM_ERR; + + /* + * If we want to compress the data, it has to be decrypted: + */ + if ((op->compression_opt || + bch2_csum_type_is_encryption(op->crc.csum_type) != + bch2_csum_type_is_encryption(op->csum_type)) && + bch2_write_decrypt(op)) + return PREP_ENCODED_CHECKSUM_ERR; + + return PREP_ENCODED_OK; +} + +static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp, + struct bio **_dst) +{ + struct bch_fs *c = op->c; + struct bio *src = &op->wbio.bio, *dst = src; + struct bvec_iter saved_iter; + void *ec_buf; + unsigned total_output = 0, total_input = 0; + bool bounce = false; + bool page_alloc_failed = false; + int ret, more = 0; + + BUG_ON(!bio_sectors(src)); + + ec_buf = bch2_writepoint_ec_buf(c, wp); + + switch (bch2_write_prep_encoded_data(op, wp)) { + case PREP_ENCODED_OK: + break; + case PREP_ENCODED_ERR: + ret = -EIO; + goto err; + case PREP_ENCODED_CHECKSUM_ERR: + goto csum_err; + case PREP_ENCODED_DO_WRITE: + /* XXX look for bug here */ + if (ec_buf) { + dst = bch2_write_bio_alloc(c, wp, src, + &page_alloc_failed, + ec_buf); + bio_copy_data(dst, src); + bounce = true; + } + init_append_extent(op, wp, op->version, op->crc); + goto do_write; + } + + if (ec_buf || + op->compression_opt || + (op->csum_type && + !(op->flags & BCH_WRITE_PAGES_STABLE)) || + (bch2_csum_type_is_encryption(op->csum_type) && + !(op->flags & BCH_WRITE_PAGES_OWNED))) { + dst = bch2_write_bio_alloc(c, wp, src, + &page_alloc_failed, + ec_buf); + bounce = true; + } + + saved_iter = dst->bi_iter; + + do { + struct bch_extent_crc_unpacked crc = { 0 }; + struct bversion version = op->version; + size_t dst_len = 0, src_len = 0; + + if (page_alloc_failed && + dst->bi_iter.bi_size < (wp->sectors_free << 9) && + dst->bi_iter.bi_size < c->opts.encoded_extent_max) + break; + + BUG_ON(op->compression_opt && + (op->flags & BCH_WRITE_DATA_ENCODED) && + bch2_csum_type_is_encryption(op->crc.csum_type)); + BUG_ON(op->compression_opt && !bounce); + + crc.compression_type = op->incompressible + ? BCH_COMPRESSION_TYPE_incompressible + : op->compression_opt + ? bch2_bio_compress(c, dst, &dst_len, src, &src_len, + op->compression_opt) + : 0; + if (!crc_is_compressed(crc)) { + dst_len = min(dst->bi_iter.bi_size, src->bi_iter.bi_size); + dst_len = min_t(unsigned, dst_len, wp->sectors_free << 9); + + if (op->csum_type) + dst_len = min_t(unsigned, dst_len, + c->opts.encoded_extent_max); + + if (bounce) { + swap(dst->bi_iter.bi_size, dst_len); + bio_copy_data(dst, src); + swap(dst->bi_iter.bi_size, dst_len); + } + + src_len = dst_len; + } + + BUG_ON(!src_len || !dst_len); + + if (bch2_csum_type_is_encryption(op->csum_type)) { + if (bversion_zero(version)) { + version.lo = atomic64_inc_return(&c->key_version); + } else { + crc.nonce = op->nonce; + op->nonce += src_len >> 9; + } + } + + if ((op->flags & BCH_WRITE_DATA_ENCODED) && + !crc_is_compressed(crc) && + bch2_csum_type_is_encryption(op->crc.csum_type) == + bch2_csum_type_is_encryption(op->csum_type)) { + u8 compression_type = crc.compression_type; + u16 nonce = crc.nonce; + /* + * Note: when we're using rechecksum(), we need to be + * checksumming @src because it has all the data our + * existing checksum covers - if we bounced (because we + * were trying to compress), @dst will only have the + * part of the data the new checksum will cover. + * + * But normally we want to be checksumming post bounce, + * because part of the reason for bouncing is so the + * data can't be modified (by userspace) while it's in + * flight. + */ + if (bch2_rechecksum_bio(c, src, version, op->crc, + &crc, &op->crc, + src_len >> 9, + bio_sectors(src) - (src_len >> 9), + op->csum_type)) + goto csum_err; + /* + * rchecksum_bio sets compression_type on crc from op->crc, + * this isn't always correct as sometimes we're changing + * an extent from uncompressed to incompressible. + */ + crc.compression_type = compression_type; + crc.nonce = nonce; + } else { + if ((op->flags & BCH_WRITE_DATA_ENCODED) && + bch2_rechecksum_bio(c, src, version, op->crc, + NULL, &op->crc, + src_len >> 9, + bio_sectors(src) - (src_len >> 9), + op->crc.csum_type)) + goto csum_err; + + crc.compressed_size = dst_len >> 9; + crc.uncompressed_size = src_len >> 9; + crc.live_size = src_len >> 9; + + swap(dst->bi_iter.bi_size, dst_len); + ret = bch2_encrypt_bio(c, op->csum_type, + extent_nonce(version, crc), dst); + if (ret) + goto err; + + crc.csum = bch2_checksum_bio(c, op->csum_type, + extent_nonce(version, crc), dst); + crc.csum_type = op->csum_type; + swap(dst->bi_iter.bi_size, dst_len); + } + + init_append_extent(op, wp, version, crc); + + if (dst != src) + bio_advance(dst, dst_len); + bio_advance(src, src_len); + total_output += dst_len; + total_input += src_len; + } while (dst->bi_iter.bi_size && + src->bi_iter.bi_size && + wp->sectors_free && + !bch2_keylist_realloc(&op->insert_keys, + op->inline_keys, + ARRAY_SIZE(op->inline_keys), + BKEY_EXTENT_U64s_MAX)); + + more = src->bi_iter.bi_size != 0; + + dst->bi_iter = saved_iter; + + if (dst == src && more) { + BUG_ON(total_output != total_input); + + dst = bio_split(src, total_input >> 9, + GFP_NOFS, &c->bio_write); + wbio_init(dst)->put_bio = true; + /* copy WRITE_SYNC flag */ + dst->bi_opf = src->bi_opf; + } + + dst->bi_iter.bi_size = total_output; +do_write: + *_dst = dst; + return more; +csum_err: + bch_err(c, "error verifying existing checksum while rewriting existing data (memory corruption?)"); + ret = -EIO; +err: + if (to_wbio(dst)->bounce) + bch2_bio_free_pages_pool(c, dst); + if (to_wbio(dst)->put_bio) + bio_put(dst); + + return ret; +} + +static bool bch2_extent_is_writeable(struct bch_write_op *op, + struct bkey_s_c k) +{ + struct bch_fs *c = op->c; + struct bkey_s_c_extent e; + struct extent_ptr_decoded p; + const union bch_extent_entry *entry; + unsigned replicas = 0; + + if (k.k->type != KEY_TYPE_extent) + return false; + + e = bkey_s_c_to_extent(k); + extent_for_each_ptr_decode(e, p, entry) { + if (crc_is_encoded(p.crc) || p.has_ec) + return false; + + replicas += bch2_extent_ptr_durability(c, &p); + } + + return replicas >= op->opts.data_replicas; +} + +static inline void bch2_nocow_write_unlock(struct bch_write_op *op) +{ + struct bch_fs *c = op->c; + const struct bch_extent_ptr *ptr; + struct bkey_i *k; + + for_each_keylist_key(&op->insert_keys, k) { + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(k)); + + bkey_for_each_ptr(ptrs, ptr) + bch2_bucket_nocow_unlock(&c->nocow_locks, + PTR_BUCKET_POS(c, ptr), + BUCKET_NOCOW_LOCK_UPDATE); + } +} + +static int bch2_nocow_write_convert_one_unwritten(struct btree_trans *trans, + struct btree_iter *iter, + struct bkey_i *orig, + struct bkey_s_c k, + u64 new_i_size) +{ + struct bkey_i *new; + struct bkey_ptrs ptrs; + struct bch_extent_ptr *ptr; + int ret; + + if (!bch2_extents_match(bkey_i_to_s_c(orig), k)) { + /* trace this */ + return 0; + } + + new = bch2_bkey_make_mut_noupdate(trans, k); + ret = PTR_ERR_OR_ZERO(new); + if (ret) + return ret; + + bch2_cut_front(bkey_start_pos(&orig->k), new); + bch2_cut_back(orig->k.p, new); + + ptrs = bch2_bkey_ptrs(bkey_i_to_s(new)); + bkey_for_each_ptr(ptrs, ptr) + ptr->unwritten = 0; + + /* + * Note that we're not calling bch2_subvol_get_snapshot() in this path - + * that was done when we kicked off the write, and here it's important + * that we update the extent that we wrote to - even if a snapshot has + * since been created. The write is still outstanding, so we're ok + * w.r.t. snapshot atomicity: + */ + return bch2_extent_update_i_size_sectors(trans, iter, + min(new->k.p.offset << 9, new_i_size), 0) ?: + bch2_trans_update(trans, iter, new, + BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); +} + +static void bch2_nocow_write_convert_unwritten(struct bch_write_op *op) +{ + struct bch_fs *c = op->c; + struct btree_trans *trans = bch2_trans_get(c); + struct btree_iter iter; + struct bkey_i *orig; + struct bkey_s_c k; + int ret; + + for_each_keylist_key(&op->insert_keys, orig) { + ret = for_each_btree_key_upto_commit(trans, iter, BTREE_ID_extents, + bkey_start_pos(&orig->k), orig->k.p, + BTREE_ITER_INTENT, k, + NULL, NULL, BTREE_INSERT_NOFAIL, ({ + bch2_nocow_write_convert_one_unwritten(trans, &iter, orig, k, op->new_i_size); + })); + + if (ret && !bch2_err_matches(ret, EROFS)) { + struct bkey_i *insert = bch2_keylist_front(&op->insert_keys); + + bch_err_inum_offset_ratelimited(c, + insert->k.p.inode, insert->k.p.offset << 9, + "write error while doing btree update: %s", + bch2_err_str(ret)); + } + + if (ret) { + op->error = ret; + break; + } + } + + bch2_trans_put(trans); +} + +static void __bch2_nocow_write_done(struct bch_write_op *op) +{ + bch2_nocow_write_unlock(op); + + if (unlikely(op->flags & BCH_WRITE_IO_ERROR)) { + op->error = -EIO; + } else if (unlikely(op->flags & BCH_WRITE_CONVERT_UNWRITTEN)) + bch2_nocow_write_convert_unwritten(op); +} + +static CLOSURE_CALLBACK(bch2_nocow_write_done) +{ + closure_type(op, struct bch_write_op, cl); + + __bch2_nocow_write_done(op); + bch2_write_done(cl); +} + +static void bch2_nocow_write(struct bch_write_op *op) +{ + struct bch_fs *c = op->c; + struct btree_trans *trans; + struct btree_iter iter; + struct bkey_s_c k; + struct bkey_ptrs_c ptrs; + const struct bch_extent_ptr *ptr; + struct { + struct bpos b; + unsigned gen; + struct nocow_lock_bucket *l; + } buckets[BCH_REPLICAS_MAX]; + unsigned nr_buckets = 0; + u32 snapshot; + int ret, i; + + if (op->flags & BCH_WRITE_MOVE) + return; + + trans = bch2_trans_get(c); +retry: + bch2_trans_begin(trans); + + ret = bch2_subvolume_get_snapshot(trans, op->subvol, &snapshot); + if (unlikely(ret)) + goto err; + + bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, + SPOS(op->pos.inode, op->pos.offset, snapshot), + BTREE_ITER_SLOTS); + while (1) { + struct bio *bio = &op->wbio.bio; + + nr_buckets = 0; + + k = bch2_btree_iter_peek_slot(&iter); + ret = bkey_err(k); + if (ret) + break; + + /* fall back to normal cow write path? */ + if (unlikely(k.k->p.snapshot != snapshot || + !bch2_extent_is_writeable(op, k))) + break; + + if (bch2_keylist_realloc(&op->insert_keys, + op->inline_keys, + ARRAY_SIZE(op->inline_keys), + k.k->u64s)) + break; + + /* Get iorefs before dropping btree locks: */ + ptrs = bch2_bkey_ptrs_c(k); + bkey_for_each_ptr(ptrs, ptr) { + buckets[nr_buckets].b = PTR_BUCKET_POS(c, ptr); + buckets[nr_buckets].gen = ptr->gen; + buckets[nr_buckets].l = + bucket_nocow_lock(&c->nocow_locks, + bucket_to_u64(buckets[nr_buckets].b)); + + prefetch(buckets[nr_buckets].l); + + if (unlikely(!bch2_dev_get_ioref(bch_dev_bkey_exists(c, ptr->dev), WRITE))) + goto err_get_ioref; + + nr_buckets++; + + if (ptr->unwritten) + op->flags |= BCH_WRITE_CONVERT_UNWRITTEN; + } + + /* Unlock before taking nocow locks, doing IO: */ + bkey_reassemble(op->insert_keys.top, k); + bch2_trans_unlock(trans); + + bch2_cut_front(op->pos, op->insert_keys.top); + if (op->flags & BCH_WRITE_CONVERT_UNWRITTEN) + bch2_cut_back(POS(op->pos.inode, op->pos.offset + bio_sectors(bio)), op->insert_keys.top); + + for (i = 0; i < nr_buckets; i++) { + struct bch_dev *ca = bch_dev_bkey_exists(c, buckets[i].b.inode); + struct nocow_lock_bucket *l = buckets[i].l; + bool stale; + + __bch2_bucket_nocow_lock(&c->nocow_locks, l, + bucket_to_u64(buckets[i].b), + BUCKET_NOCOW_LOCK_UPDATE); + + rcu_read_lock(); + stale = gen_after(*bucket_gen(ca, buckets[i].b.offset), buckets[i].gen); + rcu_read_unlock(); + + if (unlikely(stale)) + goto err_bucket_stale; + } + + bio = &op->wbio.bio; + if (k.k->p.offset < op->pos.offset + bio_sectors(bio)) { + bio = bio_split(bio, k.k->p.offset - op->pos.offset, + GFP_KERNEL, &c->bio_write); + wbio_init(bio)->put_bio = true; + bio->bi_opf = op->wbio.bio.bi_opf; + } else { + op->flags |= BCH_WRITE_DONE; + } + + op->pos.offset += bio_sectors(bio); + op->written += bio_sectors(bio); + + bio->bi_end_io = bch2_write_endio; + bio->bi_private = &op->cl; + bio->bi_opf |= REQ_OP_WRITE; + closure_get(&op->cl); + bch2_submit_wbio_replicas(to_wbio(bio), c, BCH_DATA_user, + op->insert_keys.top, true); + + bch2_keylist_push(&op->insert_keys); + if (op->flags & BCH_WRITE_DONE) + break; + bch2_btree_iter_advance(&iter); + } +out: + bch2_trans_iter_exit(trans, &iter); +err: + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + goto retry; + + if (ret) { + bch_err_inum_offset_ratelimited(c, + op->pos.inode, + op->pos.offset << 9, + "%s: btree lookup error %s", + __func__, bch2_err_str(ret)); + op->error = ret; + op->flags |= BCH_WRITE_DONE; + } + + bch2_trans_put(trans); + + /* fallback to cow write path? */ + if (!(op->flags & BCH_WRITE_DONE)) { + closure_sync(&op->cl); + __bch2_nocow_write_done(op); + op->insert_keys.top = op->insert_keys.keys; + } else if (op->flags & BCH_WRITE_SYNC) { + closure_sync(&op->cl); + bch2_nocow_write_done(&op->cl.work); + } else { + /* + * XXX + * needs to run out of process context because ei_quota_lock is + * a mutex + */ + continue_at(&op->cl, bch2_nocow_write_done, index_update_wq(op)); + } + return; +err_get_ioref: + for (i = 0; i < nr_buckets; i++) + percpu_ref_put(&bch_dev_bkey_exists(c, buckets[i].b.inode)->io_ref); + + /* Fall back to COW path: */ + goto out; +err_bucket_stale: + while (i >= 0) { + bch2_bucket_nocow_unlock(&c->nocow_locks, + buckets[i].b, + BUCKET_NOCOW_LOCK_UPDATE); + --i; + } + for (i = 0; i < nr_buckets; i++) + percpu_ref_put(&bch_dev_bkey_exists(c, buckets[i].b.inode)->io_ref); + + /* We can retry this: */ + ret = -BCH_ERR_transaction_restart; + goto out; +} + +static void __bch2_write(struct bch_write_op *op) +{ + struct bch_fs *c = op->c; + struct write_point *wp = NULL; + struct bio *bio = NULL; + unsigned nofs_flags; + int ret; + + nofs_flags = memalloc_nofs_save(); + + if (unlikely(op->opts.nocow && c->opts.nocow_enabled)) { + bch2_nocow_write(op); + if (op->flags & BCH_WRITE_DONE) + goto out_nofs_restore; + } +again: + memset(&op->failed, 0, sizeof(op->failed)); + + do { + struct bkey_i *key_to_write; + unsigned key_to_write_offset = op->insert_keys.top_p - + op->insert_keys.keys_p; + + /* +1 for possible cache device: */ + if (op->open_buckets.nr + op->nr_replicas + 1 > + ARRAY_SIZE(op->open_buckets.v)) + break; + + if (bch2_keylist_realloc(&op->insert_keys, + op->inline_keys, + ARRAY_SIZE(op->inline_keys), + BKEY_EXTENT_U64s_MAX)) + break; + + /* + * The copygc thread is now global, which means it's no longer + * freeing up space on specific disks, which means that + * allocations for specific disks may hang arbitrarily long: + */ + ret = bch2_trans_do(c, NULL, NULL, 0, + bch2_alloc_sectors_start_trans(trans, + op->target, + op->opts.erasure_code && !(op->flags & BCH_WRITE_CACHED), + op->write_point, + &op->devs_have, + op->nr_replicas, + op->nr_replicas_required, + op->watermark, + op->flags, + (op->flags & (BCH_WRITE_ALLOC_NOWAIT| + BCH_WRITE_ONLY_SPECIFIED_DEVS)) + ? NULL : &op->cl, &wp)); + if (unlikely(ret)) { + if (bch2_err_matches(ret, BCH_ERR_operation_blocked)) + break; + + goto err; + } + + EBUG_ON(!wp); + + bch2_open_bucket_get(c, wp, &op->open_buckets); + ret = bch2_write_extent(op, wp, &bio); + + bch2_alloc_sectors_done_inlined(c, wp); +err: + if (ret <= 0) { + op->flags |= BCH_WRITE_DONE; + + if (ret < 0) { + op->error = ret; + break; + } + } + + bio->bi_end_io = bch2_write_endio; + bio->bi_private = &op->cl; + bio->bi_opf |= REQ_OP_WRITE; + + closure_get(bio->bi_private); + + key_to_write = (void *) (op->insert_keys.keys_p + + key_to_write_offset); + + bch2_submit_wbio_replicas(to_wbio(bio), c, BCH_DATA_user, + key_to_write, false); + } while (ret); + + /* + * Sync or no? + * + * If we're running asynchronously, wne may still want to block + * synchronously here if we weren't able to submit all of the IO at + * once, as that signals backpressure to the caller. + */ + if ((op->flags & BCH_WRITE_SYNC) || + (!(op->flags & BCH_WRITE_DONE) && + !(op->flags & BCH_WRITE_IN_WORKER))) { + closure_sync(&op->cl); + __bch2_write_index(op); + + if (!(op->flags & BCH_WRITE_DONE)) + goto again; + bch2_write_done(&op->cl); + } else { + bch2_write_queue(op, wp); + continue_at(&op->cl, bch2_write_index, NULL); + } +out_nofs_restore: + memalloc_nofs_restore(nofs_flags); +} + +static void bch2_write_data_inline(struct bch_write_op *op, unsigned data_len) +{ + struct bio *bio = &op->wbio.bio; + struct bvec_iter iter; + struct bkey_i_inline_data *id; + unsigned sectors; + int ret; + + op->flags |= BCH_WRITE_WROTE_DATA_INLINE; + op->flags |= BCH_WRITE_DONE; + + bch2_check_set_feature(op->c, BCH_FEATURE_inline_data); + + ret = bch2_keylist_realloc(&op->insert_keys, op->inline_keys, + ARRAY_SIZE(op->inline_keys), + BKEY_U64s + DIV_ROUND_UP(data_len, 8)); + if (ret) { + op->error = ret; + goto err; + } + + sectors = bio_sectors(bio); + op->pos.offset += sectors; + + id = bkey_inline_data_init(op->insert_keys.top); + id->k.p = op->pos; + id->k.version = op->version; + id->k.size = sectors; + + iter = bio->bi_iter; + iter.bi_size = data_len; + memcpy_from_bio(id->v.data, bio, iter); + + while (data_len & 7) + id->v.data[data_len++] = '\0'; + set_bkey_val_bytes(&id->k, data_len); + bch2_keylist_push(&op->insert_keys); + + __bch2_write_index(op); +err: + bch2_write_done(&op->cl); +} + +/** + * bch2_write() - handle a write to a cache device or flash only volume + * @cl: &bch_write_op->cl + * + * This is the starting point for any data to end up in a cache device; it could + * be from a normal write, or a writeback write, or a write to a flash only + * volume - it's also used by the moving garbage collector to compact data in + * mostly empty buckets. + * + * It first writes the data to the cache, creating a list of keys to be inserted + * (if the data won't fit in a single open bucket, there will be multiple keys); + * after the data is written it calls bch_journal, and after the keys have been + * added to the next journal write they're inserted into the btree. + * + * If op->discard is true, instead of inserting the data it invalidates the + * region of the cache represented by op->bio and op->inode. + */ +CLOSURE_CALLBACK(bch2_write) +{ + closure_type(op, struct bch_write_op, cl); + struct bio *bio = &op->wbio.bio; + struct bch_fs *c = op->c; + unsigned data_len; + + EBUG_ON(op->cl.parent); + BUG_ON(!op->nr_replicas); + BUG_ON(!op->write_point.v); + BUG_ON(bkey_eq(op->pos, POS_MAX)); + + op->start_time = local_clock(); + bch2_keylist_init(&op->insert_keys, op->inline_keys); + wbio_init(bio)->put_bio = false; + + if (bio->bi_iter.bi_size & (c->opts.block_size - 1)) { + bch_err_inum_offset_ratelimited(c, + op->pos.inode, + op->pos.offset << 9, + "misaligned write"); + op->error = -EIO; + goto err; + } + + if (c->opts.nochanges) { + op->error = -BCH_ERR_erofs_no_writes; + goto err; + } + + if (!(op->flags & BCH_WRITE_MOVE) && + !bch2_write_ref_tryget(c, BCH_WRITE_REF_write)) { + op->error = -BCH_ERR_erofs_no_writes; + goto err; + } + + this_cpu_add(c->counters[BCH_COUNTER_io_write], bio_sectors(bio)); + bch2_increment_clock(c, bio_sectors(bio), WRITE); + + data_len = min_t(u64, bio->bi_iter.bi_size, + op->new_i_size - (op->pos.offset << 9)); + + if (c->opts.inline_data && + data_len <= min(block_bytes(c) / 2, 1024U)) { + bch2_write_data_inline(op, data_len); + return; + } + + __bch2_write(op); + return; +err: + bch2_disk_reservation_put(c, &op->res); + + closure_debug_destroy(&op->cl); + if (op->end_io) + op->end_io(op); +} + +static const char * const bch2_write_flags[] = { +#define x(f) #f, + BCH_WRITE_FLAGS() +#undef x + NULL +}; + +void bch2_write_op_to_text(struct printbuf *out, struct bch_write_op *op) +{ + prt_str(out, "pos: "); + bch2_bpos_to_text(out, op->pos); + prt_newline(out); + printbuf_indent_add(out, 2); + + prt_str(out, "started: "); + bch2_pr_time_units(out, local_clock() - op->start_time); + prt_newline(out); + + prt_str(out, "flags: "); + prt_bitflags(out, bch2_write_flags, op->flags); + prt_newline(out); + + prt_printf(out, "ref: %u", closure_nr_remaining(&op->cl)); + prt_newline(out); + + printbuf_indent_sub(out, 2); +} + +void bch2_fs_io_write_exit(struct bch_fs *c) +{ + mempool_exit(&c->bio_bounce_pages); + bioset_exit(&c->bio_write); +} + +int bch2_fs_io_write_init(struct bch_fs *c) +{ + if (bioset_init(&c->bio_write, 1, offsetof(struct bch_write_bio, bio), + BIOSET_NEED_BVECS)) + return -BCH_ERR_ENOMEM_bio_write_init; + + if (mempool_init_page_pool(&c->bio_bounce_pages, + max_t(unsigned, + c->opts.btree_node_size, + c->opts.encoded_extent_max) / + PAGE_SIZE, 0)) + return -BCH_ERR_ENOMEM_bio_bounce_pages_init; + + return 0; +} diff --git a/fs/bcachefs/io_write.h b/fs/bcachefs/io_write.h new file mode 100644 index 000000000000..6c276a48f95d --- /dev/null +++ b/fs/bcachefs/io_write.h @@ -0,0 +1,109 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_IO_WRITE_H +#define _BCACHEFS_IO_WRITE_H + +#include "checksum.h" +#include "io_write_types.h" + +#define to_wbio(_bio) \ + container_of((_bio), struct bch_write_bio, bio) + +void bch2_bio_free_pages_pool(struct bch_fs *, struct bio *); +void bch2_bio_alloc_pages_pool(struct bch_fs *, struct bio *, size_t); + +#ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT +void bch2_latency_acct(struct bch_dev *, u64, int); +#else +static inline void bch2_latency_acct(struct bch_dev *ca, u64 submit_time, int rw) {} +#endif + +void bch2_submit_wbio_replicas(struct bch_write_bio *, struct bch_fs *, + enum bch_data_type, const struct bkey_i *, bool); + +#define BCH_WRITE_FLAGS() \ + x(ALLOC_NOWAIT) \ + x(CACHED) \ + x(DATA_ENCODED) \ + x(PAGES_STABLE) \ + x(PAGES_OWNED) \ + x(ONLY_SPECIFIED_DEVS) \ + x(WROTE_DATA_INLINE) \ + x(FROM_INTERNAL) \ + x(CHECK_ENOSPC) \ + x(SYNC) \ + x(MOVE) \ + x(IN_WORKER) \ + x(DONE) \ + x(IO_ERROR) \ + x(CONVERT_UNWRITTEN) + +enum __bch_write_flags { +#define x(f) __BCH_WRITE_##f, + BCH_WRITE_FLAGS() +#undef x +}; + +enum bch_write_flags { +#define x(f) BCH_WRITE_##f = BIT(__BCH_WRITE_##f), + BCH_WRITE_FLAGS() +#undef x +}; + +static inline struct workqueue_struct *index_update_wq(struct bch_write_op *op) +{ + return op->watermark == BCH_WATERMARK_copygc + ? op->c->copygc_wq + : op->c->btree_update_wq; +} + +int bch2_sum_sector_overwrites(struct btree_trans *, struct btree_iter *, + struct bkey_i *, bool *, s64 *, s64 *); +int bch2_extent_update(struct btree_trans *, subvol_inum, + struct btree_iter *, struct bkey_i *, + struct disk_reservation *, u64, s64 *, bool); + +static inline void bch2_write_op_init(struct bch_write_op *op, struct bch_fs *c, + struct bch_io_opts opts) +{ + op->c = c; + op->end_io = NULL; + op->flags = 0; + op->written = 0; + op->error = 0; + op->csum_type = bch2_data_checksum_type(c, opts); + op->compression_opt = opts.compression; + op->nr_replicas = 0; + op->nr_replicas_required = c->opts.data_replicas_required; + op->watermark = BCH_WATERMARK_normal; + op->incompressible = 0; + op->open_buckets.nr = 0; + op->devs_have.nr = 0; + op->target = 0; + op->opts = opts; + op->subvol = 0; + op->pos = POS_MAX; + op->version = ZERO_VERSION; + op->write_point = (struct write_point_specifier) { 0 }; + op->res = (struct disk_reservation) { 0 }; + op->new_i_size = U64_MAX; + op->i_sectors_delta = 0; + op->devs_need_flush = NULL; +} + +CLOSURE_CALLBACK(bch2_write); +void bch2_write_point_do_index_updates(struct work_struct *); + +static inline struct bch_write_bio *wbio_init(struct bio *bio) +{ + struct bch_write_bio *wbio = to_wbio(bio); + + memset(&wbio->wbio, 0, sizeof(wbio->wbio)); + return wbio; +} + +void bch2_write_op_to_text(struct printbuf *, struct bch_write_op *); + +void bch2_fs_io_write_exit(struct bch_fs *); +int bch2_fs_io_write_init(struct bch_fs *); + +#endif /* _BCACHEFS_IO_WRITE_H */ diff --git a/fs/bcachefs/io_write_types.h b/fs/bcachefs/io_write_types.h new file mode 100644 index 000000000000..c7f97c2c4805 --- /dev/null +++ b/fs/bcachefs/io_write_types.h @@ -0,0 +1,96 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_IO_WRITE_TYPES_H +#define _BCACHEFS_IO_WRITE_TYPES_H + +#include "alloc_types.h" +#include "btree_types.h" +#include "buckets_types.h" +#include "extents_types.h" +#include "keylist_types.h" +#include "opts.h" +#include "super_types.h" + +#include <linux/llist.h> +#include <linux/workqueue.h> + +struct bch_write_bio { + struct_group(wbio, + struct bch_fs *c; + struct bch_write_bio *parent; + + u64 submit_time; + u64 inode_offset; + + struct bch_devs_list failed; + u8 dev; + + unsigned split:1, + bounce:1, + put_bio:1, + have_ioref:1, + nocow:1, + used_mempool:1, + first_btree_write:1; + ); + + struct bio bio; +}; + +struct bch_write_op { + struct closure cl; + struct bch_fs *c; + void (*end_io)(struct bch_write_op *); + u64 start_time; + + unsigned written; /* sectors */ + u16 flags; + s16 error; /* dio write path expects it to hold -ERESTARTSYS... */ + + unsigned compression_opt:8; + unsigned csum_type:4; + unsigned nr_replicas:4; + unsigned nr_replicas_required:4; + unsigned watermark:3; + unsigned incompressible:1; + unsigned stripe_waited:1; + + struct bch_devs_list devs_have; + u16 target; + u16 nonce; + struct bch_io_opts opts; + + u32 subvol; + struct bpos pos; + struct bversion version; + + /* For BCH_WRITE_DATA_ENCODED: */ + struct bch_extent_crc_unpacked crc; + + struct write_point_specifier write_point; + + struct write_point *wp; + struct list_head wp_list; + + struct disk_reservation res; + + struct open_buckets open_buckets; + + u64 new_i_size; + s64 i_sectors_delta; + + struct bch_devs_mask failed; + + struct keylist insert_keys; + u64 inline_keys[BKEY_EXTENT_U64s_MAX * 2]; + + /* + * Bitmask of devices that have had nocow writes issued to them since + * last flush: + */ + struct bch_devs_mask *devs_need_flush; + + /* Must be last: */ + struct bch_write_bio wbio; +}; + +#endif /* _BCACHEFS_IO_WRITE_TYPES_H */ diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c new file mode 100644 index 000000000000..8cf238be6213 --- /dev/null +++ b/fs/bcachefs/journal.c @@ -0,0 +1,1439 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * bcachefs journalling code, for btree insertions + * + * Copyright 2012 Google, Inc. + */ + +#include "bcachefs.h" +#include "alloc_foreground.h" +#include "bkey_methods.h" +#include "btree_gc.h" +#include "btree_update.h" +#include "buckets.h" +#include "error.h" +#include "journal.h" +#include "journal_io.h" +#include "journal_reclaim.h" +#include "journal_sb.h" +#include "journal_seq_blacklist.h" +#include "trace.h" + +static const char * const bch2_journal_errors[] = { +#define x(n) #n, + JOURNAL_ERRORS() +#undef x + NULL +}; + +static inline bool journal_seq_unwritten(struct journal *j, u64 seq) +{ + return seq > j->seq_ondisk; +} + +static bool __journal_entry_is_open(union journal_res_state state) +{ + return state.cur_entry_offset < JOURNAL_ENTRY_CLOSED_VAL; +} + +static inline unsigned nr_unwritten_journal_entries(struct journal *j) +{ + return atomic64_read(&j->seq) - j->seq_ondisk; +} + +static bool journal_entry_is_open(struct journal *j) +{ + return __journal_entry_is_open(j->reservations); +} + +static inline struct journal_buf * +journal_seq_to_buf(struct journal *j, u64 seq) +{ + struct journal_buf *buf = NULL; + + EBUG_ON(seq > journal_cur_seq(j)); + + if (journal_seq_unwritten(j, seq)) { + buf = j->buf + (seq & JOURNAL_BUF_MASK); + EBUG_ON(le64_to_cpu(buf->data->seq) != seq); + } + return buf; +} + +static void journal_pin_list_init(struct journal_entry_pin_list *p, int count) +{ + unsigned i; + + for (i = 0; i < ARRAY_SIZE(p->list); i++) + INIT_LIST_HEAD(&p->list[i]); + INIT_LIST_HEAD(&p->flushed); + atomic_set(&p->count, count); + p->devs.nr = 0; +} + +/* + * Detect stuck journal conditions and trigger shutdown. Technically the journal + * can end up stuck for a variety of reasons, such as a blocked I/O, journal + * reservation lockup, etc. Since this is a fatal error with potentially + * unpredictable characteristics, we want to be fairly conservative before we + * decide to shut things down. + * + * Consider the journal stuck when it appears full with no ability to commit + * btree transactions, to discard journal buckets, nor acquire priority + * (reserved watermark) reservation. + */ +static inline bool +journal_error_check_stuck(struct journal *j, int error, unsigned flags) +{ + struct bch_fs *c = container_of(j, struct bch_fs, journal); + bool stuck = false; + struct printbuf buf = PRINTBUF; + + if (!(error == JOURNAL_ERR_journal_full || + error == JOURNAL_ERR_journal_pin_full) || + nr_unwritten_journal_entries(j) || + (flags & BCH_WATERMARK_MASK) != BCH_WATERMARK_reclaim) + return stuck; + + spin_lock(&j->lock); + + if (j->can_discard) { + spin_unlock(&j->lock); + return stuck; + } + + stuck = true; + + /* + * The journal shutdown path will set ->err_seq, but do it here first to + * serialize against concurrent failures and avoid duplicate error + * reports. + */ + if (j->err_seq) { + spin_unlock(&j->lock); + return stuck; + } + j->err_seq = journal_cur_seq(j); + spin_unlock(&j->lock); + + bch_err(c, "Journal stuck! Hava a pre-reservation but journal full (error %s)", + bch2_journal_errors[error]); + bch2_journal_debug_to_text(&buf, j); + bch_err(c, "%s", buf.buf); + + printbuf_reset(&buf); + bch2_journal_pins_to_text(&buf, j); + bch_err(c, "Journal pins:\n%s", buf.buf); + printbuf_exit(&buf); + + bch2_fatal_error(c); + dump_stack(); + + return stuck; +} + +/* + * Final processing when the last reference of a journal buffer has been + * dropped. Drop the pin list reference acquired at journal entry open and write + * the buffer, if requested. + */ +void bch2_journal_buf_put_final(struct journal *j, u64 seq, bool write) +{ + struct bch_fs *c = container_of(j, struct bch_fs, journal); + + lockdep_assert_held(&j->lock); + + if (__bch2_journal_pin_put(j, seq)) + bch2_journal_reclaim_fast(j); + if (write) + closure_call(&j->io, bch2_journal_write, c->io_complete_wq, NULL); +} + +/* + * Returns true if journal entry is now closed: + * + * We don't close a journal_buf until the next journal_buf is finished writing, + * and can be opened again - this also initializes the next journal_buf: + */ +static void __journal_entry_close(struct journal *j, unsigned closed_val) +{ + struct bch_fs *c = container_of(j, struct bch_fs, journal); + struct journal_buf *buf = journal_cur_buf(j); + union journal_res_state old, new; + u64 v = atomic64_read(&j->reservations.counter); + unsigned sectors; + + BUG_ON(closed_val != JOURNAL_ENTRY_CLOSED_VAL && + closed_val != JOURNAL_ENTRY_ERROR_VAL); + + lockdep_assert_held(&j->lock); + + do { + old.v = new.v = v; + new.cur_entry_offset = closed_val; + + if (old.cur_entry_offset == JOURNAL_ENTRY_ERROR_VAL || + old.cur_entry_offset == new.cur_entry_offset) + return; + } while ((v = atomic64_cmpxchg(&j->reservations.counter, + old.v, new.v)) != old.v); + + if (!__journal_entry_is_open(old)) + return; + + /* Close out old buffer: */ + buf->data->u64s = cpu_to_le32(old.cur_entry_offset); + + sectors = vstruct_blocks_plus(buf->data, c->block_bits, + buf->u64s_reserved) << c->block_bits; + BUG_ON(sectors > buf->sectors); + buf->sectors = sectors; + + /* + * We have to set last_seq here, _before_ opening a new journal entry: + * + * A threads may replace an old pin with a new pin on their current + * journal reservation - the expectation being that the journal will + * contain either what the old pin protected or what the new pin + * protects. + * + * After the old pin is dropped journal_last_seq() won't include the old + * pin, so we can only write the updated last_seq on the entry that + * contains whatever the new pin protects. + * + * Restated, we can _not_ update last_seq for a given entry if there + * could be a newer entry open with reservations/pins that have been + * taken against it. + * + * Hence, we want update/set last_seq on the current journal entry right + * before we open a new one: + */ + buf->last_seq = journal_last_seq(j); + buf->data->last_seq = cpu_to_le64(buf->last_seq); + BUG_ON(buf->last_seq > le64_to_cpu(buf->data->seq)); + + cancel_delayed_work(&j->write_work); + + bch2_journal_space_available(j); + + __bch2_journal_buf_put(j, old.idx, le64_to_cpu(buf->data->seq)); +} + +void bch2_journal_halt(struct journal *j) +{ + spin_lock(&j->lock); + __journal_entry_close(j, JOURNAL_ENTRY_ERROR_VAL); + if (!j->err_seq) + j->err_seq = journal_cur_seq(j); + journal_wake(j); + spin_unlock(&j->lock); +} + +static bool journal_entry_want_write(struct journal *j) +{ + bool ret = !journal_entry_is_open(j) || + journal_cur_seq(j) == journal_last_unwritten_seq(j); + + /* Don't close it yet if we already have a write in flight: */ + if (ret) + __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL); + else if (nr_unwritten_journal_entries(j)) { + struct journal_buf *buf = journal_cur_buf(j); + + if (!buf->flush_time) { + buf->flush_time = local_clock() ?: 1; + buf->expires = jiffies; + } + } + + return ret; +} + +bool bch2_journal_entry_close(struct journal *j) +{ + bool ret; + + spin_lock(&j->lock); + ret = journal_entry_want_write(j); + spin_unlock(&j->lock); + + return ret; +} + +/* + * should _only_ called from journal_res_get() - when we actually want a + * journal reservation - journal entry is open means journal is dirty: + */ +static int journal_entry_open(struct journal *j) +{ + struct bch_fs *c = container_of(j, struct bch_fs, journal); + struct journal_buf *buf = j->buf + + ((journal_cur_seq(j) + 1) & JOURNAL_BUF_MASK); + union journal_res_state old, new; + int u64s; + u64 v; + + lockdep_assert_held(&j->lock); + BUG_ON(journal_entry_is_open(j)); + BUG_ON(BCH_SB_CLEAN(c->disk_sb.sb)); + + if (j->blocked) + return JOURNAL_ERR_blocked; + + if (j->cur_entry_error) + return j->cur_entry_error; + + if (bch2_journal_error(j)) + return JOURNAL_ERR_insufficient_devices; /* -EROFS */ + + if (!fifo_free(&j->pin)) + return JOURNAL_ERR_journal_pin_full; + + if (nr_unwritten_journal_entries(j) == ARRAY_SIZE(j->buf)) + return JOURNAL_ERR_max_in_flight; + + BUG_ON(!j->cur_entry_sectors); + + buf->expires = + (journal_cur_seq(j) == j->flushed_seq_ondisk + ? jiffies + : j->last_flush_write) + + msecs_to_jiffies(c->opts.journal_flush_delay); + + buf->u64s_reserved = j->entry_u64s_reserved; + buf->disk_sectors = j->cur_entry_sectors; + buf->sectors = min(buf->disk_sectors, buf->buf_size >> 9); + + u64s = (int) (buf->sectors << 9) / sizeof(u64) - + journal_entry_overhead(j); + u64s = clamp_t(int, u64s, 0, JOURNAL_ENTRY_CLOSED_VAL - 1); + + if (u64s <= (ssize_t) j->early_journal_entries.nr) + return JOURNAL_ERR_journal_full; + + if (fifo_empty(&j->pin) && j->reclaim_thread) + wake_up_process(j->reclaim_thread); + + /* + * The fifo_push() needs to happen at the same time as j->seq is + * incremented for journal_last_seq() to be calculated correctly + */ + atomic64_inc(&j->seq); + journal_pin_list_init(fifo_push_ref(&j->pin), 1); + + BUG_ON(j->pin.back - 1 != atomic64_read(&j->seq)); + + BUG_ON(j->buf + (journal_cur_seq(j) & JOURNAL_BUF_MASK) != buf); + + bkey_extent_init(&buf->key); + buf->noflush = false; + buf->must_flush = false; + buf->separate_flush = false; + buf->flush_time = 0; + + memset(buf->data, 0, sizeof(*buf->data)); + buf->data->seq = cpu_to_le64(journal_cur_seq(j)); + buf->data->u64s = 0; + + if (j->early_journal_entries.nr) { + memcpy(buf->data->_data, j->early_journal_entries.data, + j->early_journal_entries.nr * sizeof(u64)); + le32_add_cpu(&buf->data->u64s, j->early_journal_entries.nr); + } + + /* + * Must be set before marking the journal entry as open: + */ + j->cur_entry_u64s = u64s; + + v = atomic64_read(&j->reservations.counter); + do { + old.v = new.v = v; + + BUG_ON(old.cur_entry_offset == JOURNAL_ENTRY_ERROR_VAL); + + new.idx++; + BUG_ON(journal_state_count(new, new.idx)); + BUG_ON(new.idx != (journal_cur_seq(j) & JOURNAL_BUF_MASK)); + + journal_state_inc(&new); + + /* Handle any already added entries */ + new.cur_entry_offset = le32_to_cpu(buf->data->u64s); + } while ((v = atomic64_cmpxchg(&j->reservations.counter, + old.v, new.v)) != old.v); + + if (j->res_get_blocked_start) + bch2_time_stats_update(j->blocked_time, + j->res_get_blocked_start); + j->res_get_blocked_start = 0; + + mod_delayed_work(c->io_complete_wq, + &j->write_work, + msecs_to_jiffies(c->opts.journal_flush_delay)); + journal_wake(j); + + if (j->early_journal_entries.nr) + darray_exit(&j->early_journal_entries); + return 0; +} + +static bool journal_quiesced(struct journal *j) +{ + bool ret = atomic64_read(&j->seq) == j->seq_ondisk; + + if (!ret) + bch2_journal_entry_close(j); + return ret; +} + +static void journal_quiesce(struct journal *j) +{ + wait_event(j->wait, journal_quiesced(j)); +} + +static void journal_write_work(struct work_struct *work) +{ + struct journal *j = container_of(work, struct journal, write_work.work); + struct bch_fs *c = container_of(j, struct bch_fs, journal); + long delta; + + spin_lock(&j->lock); + if (!__journal_entry_is_open(j->reservations)) + goto unlock; + + delta = journal_cur_buf(j)->expires - jiffies; + + if (delta > 0) + mod_delayed_work(c->io_complete_wq, &j->write_work, delta); + else + __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL); +unlock: + spin_unlock(&j->lock); +} + +static int __journal_res_get(struct journal *j, struct journal_res *res, + unsigned flags) +{ + struct bch_fs *c = container_of(j, struct bch_fs, journal); + struct journal_buf *buf; + bool can_discard; + int ret; +retry: + if (journal_res_get_fast(j, res, flags)) + return 0; + + if (bch2_journal_error(j)) + return -BCH_ERR_erofs_journal_err; + + spin_lock(&j->lock); + + /* check once more in case somebody else shut things down... */ + if (bch2_journal_error(j)) { + spin_unlock(&j->lock); + return -BCH_ERR_erofs_journal_err; + } + + /* + * Recheck after taking the lock, so we don't race with another thread + * that just did journal_entry_open() and call bch2_journal_entry_close() + * unnecessarily + */ + if (journal_res_get_fast(j, res, flags)) { + spin_unlock(&j->lock); + return 0; + } + + if ((flags & BCH_WATERMARK_MASK) < j->watermark) { + /* + * Don't want to close current journal entry, just need to + * invoke reclaim: + */ + ret = JOURNAL_ERR_journal_full; + goto unlock; + } + + /* + * If we couldn't get a reservation because the current buf filled up, + * and we had room for a bigger entry on disk, signal that we want to + * realloc the journal bufs: + */ + buf = journal_cur_buf(j); + if (journal_entry_is_open(j) && + buf->buf_size >> 9 < buf->disk_sectors && + buf->buf_size < JOURNAL_ENTRY_SIZE_MAX) + j->buf_size_want = max(j->buf_size_want, buf->buf_size << 1); + + __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL); + ret = journal_entry_open(j); + + if (ret == JOURNAL_ERR_max_in_flight) + trace_and_count(c, journal_entry_full, c); +unlock: + if ((ret && ret != JOURNAL_ERR_insufficient_devices) && + !j->res_get_blocked_start) { + j->res_get_blocked_start = local_clock() ?: 1; + trace_and_count(c, journal_full, c); + } + + can_discard = j->can_discard; + spin_unlock(&j->lock); + + if (!ret) + goto retry; + if (journal_error_check_stuck(j, ret, flags)) + ret = -BCH_ERR_journal_res_get_blocked; + + /* + * Journal is full - can't rely on reclaim from work item due to + * freezing: + */ + if ((ret == JOURNAL_ERR_journal_full || + ret == JOURNAL_ERR_journal_pin_full) && + !(flags & JOURNAL_RES_GET_NONBLOCK)) { + if (can_discard) { + bch2_journal_do_discards(j); + goto retry; + } + + if (mutex_trylock(&j->reclaim_lock)) { + bch2_journal_reclaim(j); + mutex_unlock(&j->reclaim_lock); + } + } + + return ret == JOURNAL_ERR_insufficient_devices + ? -BCH_ERR_erofs_journal_err + : -BCH_ERR_journal_res_get_blocked; +} + +/* + * Essentially the entry function to the journaling code. When bcachefs is doing + * a btree insert, it calls this function to get the current journal write. + * Journal write is the structure used set up journal writes. The calling + * function will then add its keys to the structure, queuing them for the next + * write. + * + * To ensure forward progress, the current task must not be holding any + * btree node write locks. + */ +int bch2_journal_res_get_slowpath(struct journal *j, struct journal_res *res, + unsigned flags) +{ + int ret; + + closure_wait_event(&j->async_wait, + (ret = __journal_res_get(j, res, flags)) != -BCH_ERR_journal_res_get_blocked || + (flags & JOURNAL_RES_GET_NONBLOCK)); + return ret; +} + +/* journal_entry_res: */ + +void bch2_journal_entry_res_resize(struct journal *j, + struct journal_entry_res *res, + unsigned new_u64s) +{ + union journal_res_state state; + int d = new_u64s - res->u64s; + + spin_lock(&j->lock); + + j->entry_u64s_reserved += d; + if (d <= 0) + goto out; + + j->cur_entry_u64s = max_t(int, 0, j->cur_entry_u64s - d); + smp_mb(); + state = READ_ONCE(j->reservations); + + if (state.cur_entry_offset < JOURNAL_ENTRY_CLOSED_VAL && + state.cur_entry_offset > j->cur_entry_u64s) { + j->cur_entry_u64s += d; + /* + * Not enough room in current journal entry, have to flush it: + */ + __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL); + } else { + journal_cur_buf(j)->u64s_reserved += d; + } +out: + spin_unlock(&j->lock); + res->u64s += d; +} + +/* journal flushing: */ + +/** + * bch2_journal_flush_seq_async - wait for a journal entry to be written + * @j: journal object + * @seq: seq to flush + * @parent: closure object to wait with + * Returns: 1 if @seq has already been flushed, 0 if @seq is being flushed, + * -EIO if @seq will never be flushed + * + * Like bch2_journal_wait_on_seq, except that it triggers a write immediately if + * necessary + */ +int bch2_journal_flush_seq_async(struct journal *j, u64 seq, + struct closure *parent) +{ + struct journal_buf *buf; + int ret = 0; + + if (seq <= j->flushed_seq_ondisk) + return 1; + + spin_lock(&j->lock); + + if (WARN_ONCE(seq > journal_cur_seq(j), + "requested to flush journal seq %llu, but currently at %llu", + seq, journal_cur_seq(j))) + goto out; + + /* Recheck under lock: */ + if (j->err_seq && seq >= j->err_seq) { + ret = -EIO; + goto out; + } + + if (seq <= j->flushed_seq_ondisk) { + ret = 1; + goto out; + } + + /* if seq was written, but not flushed - flush a newer one instead */ + seq = max(seq, journal_last_unwritten_seq(j)); + +recheck_need_open: + if (seq > journal_cur_seq(j)) { + struct journal_res res = { 0 }; + + if (journal_entry_is_open(j)) + __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL); + + spin_unlock(&j->lock); + + ret = bch2_journal_res_get(j, &res, jset_u64s(0), 0); + if (ret) + return ret; + + seq = res.seq; + buf = j->buf + (seq & JOURNAL_BUF_MASK); + buf->must_flush = true; + + if (!buf->flush_time) { + buf->flush_time = local_clock() ?: 1; + buf->expires = jiffies; + } + + if (parent && !closure_wait(&buf->wait, parent)) + BUG(); + + bch2_journal_res_put(j, &res); + + spin_lock(&j->lock); + goto want_write; + } + + /* + * if write was kicked off without a flush, flush the next sequence + * number instead + */ + buf = journal_seq_to_buf(j, seq); + if (buf->noflush) { + seq++; + goto recheck_need_open; + } + + buf->must_flush = true; + + if (parent && !closure_wait(&buf->wait, parent)) + BUG(); +want_write: + if (seq == journal_cur_seq(j)) + journal_entry_want_write(j); +out: + spin_unlock(&j->lock); + return ret; +} + +int bch2_journal_flush_seq(struct journal *j, u64 seq) +{ + u64 start_time = local_clock(); + int ret, ret2; + + /* + * Don't update time_stats when @seq is already flushed: + */ + if (seq <= j->flushed_seq_ondisk) + return 0; + + ret = wait_event_interruptible(j->wait, (ret2 = bch2_journal_flush_seq_async(j, seq, NULL))); + + if (!ret) + bch2_time_stats_update(j->flush_seq_time, start_time); + + return ret ?: ret2 < 0 ? ret2 : 0; +} + +/* + * bch2_journal_flush_async - if there is an open journal entry, or a journal + * still being written, write it and wait for the write to complete + */ +void bch2_journal_flush_async(struct journal *j, struct closure *parent) +{ + bch2_journal_flush_seq_async(j, atomic64_read(&j->seq), parent); +} + +int bch2_journal_flush(struct journal *j) +{ + return bch2_journal_flush_seq(j, atomic64_read(&j->seq)); +} + +/* + * bch2_journal_noflush_seq - tell the journal not to issue any flushes before + * @seq + */ +bool bch2_journal_noflush_seq(struct journal *j, u64 seq) +{ + struct bch_fs *c = container_of(j, struct bch_fs, journal); + u64 unwritten_seq; + bool ret = false; + + if (!(c->sb.features & (1ULL << BCH_FEATURE_journal_no_flush))) + return false; + + if (seq <= c->journal.flushed_seq_ondisk) + return false; + + spin_lock(&j->lock); + if (seq <= c->journal.flushed_seq_ondisk) + goto out; + + for (unwritten_seq = journal_last_unwritten_seq(j); + unwritten_seq < seq; + unwritten_seq++) { + struct journal_buf *buf = journal_seq_to_buf(j, unwritten_seq); + + /* journal write is already in flight, and was a flush write: */ + if (unwritten_seq == journal_last_unwritten_seq(j) && !buf->noflush) + goto out; + + buf->noflush = true; + } + + ret = true; +out: + spin_unlock(&j->lock); + return ret; +} + +int bch2_journal_meta(struct journal *j) +{ + struct journal_buf *buf; + struct journal_res res; + int ret; + + memset(&res, 0, sizeof(res)); + + ret = bch2_journal_res_get(j, &res, jset_u64s(0), 0); + if (ret) + return ret; + + buf = j->buf + (res.seq & JOURNAL_BUF_MASK); + buf->must_flush = true; + + if (!buf->flush_time) { + buf->flush_time = local_clock() ?: 1; + buf->expires = jiffies; + } + + bch2_journal_res_put(j, &res); + + return bch2_journal_flush_seq(j, res.seq); +} + +/* block/unlock the journal: */ + +void bch2_journal_unblock(struct journal *j) +{ + spin_lock(&j->lock); + j->blocked--; + spin_unlock(&j->lock); + + journal_wake(j); +} + +void bch2_journal_block(struct journal *j) +{ + spin_lock(&j->lock); + j->blocked++; + spin_unlock(&j->lock); + + journal_quiesce(j); +} + +/* allocate journal on a device: */ + +static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr, + bool new_fs, struct closure *cl) +{ + struct bch_fs *c = ca->fs; + struct journal_device *ja = &ca->journal; + u64 *new_bucket_seq = NULL, *new_buckets = NULL; + struct open_bucket **ob = NULL; + long *bu = NULL; + unsigned i, pos, nr_got = 0, nr_want = nr - ja->nr; + int ret = 0; + + BUG_ON(nr <= ja->nr); + + bu = kcalloc(nr_want, sizeof(*bu), GFP_KERNEL); + ob = kcalloc(nr_want, sizeof(*ob), GFP_KERNEL); + new_buckets = kcalloc(nr, sizeof(u64), GFP_KERNEL); + new_bucket_seq = kcalloc(nr, sizeof(u64), GFP_KERNEL); + if (!bu || !ob || !new_buckets || !new_bucket_seq) { + ret = -BCH_ERR_ENOMEM_set_nr_journal_buckets; + goto err_free; + } + + for (nr_got = 0; nr_got < nr_want; nr_got++) { + if (new_fs) { + bu[nr_got] = bch2_bucket_alloc_new_fs(ca); + if (bu[nr_got] < 0) { + ret = -BCH_ERR_ENOSPC_bucket_alloc; + break; + } + } else { + ob[nr_got] = bch2_bucket_alloc(c, ca, BCH_WATERMARK_normal, cl); + ret = PTR_ERR_OR_ZERO(ob[nr_got]); + if (ret) + break; + + ret = bch2_trans_run(c, + bch2_trans_mark_metadata_bucket(trans, ca, + ob[nr_got]->bucket, BCH_DATA_journal, + ca->mi.bucket_size)); + if (ret) { + bch2_open_bucket_put(c, ob[nr_got]); + bch_err_msg(c, ret, "marking new journal buckets"); + break; + } + + bu[nr_got] = ob[nr_got]->bucket; + } + } + + if (!nr_got) + goto err_free; + + /* Don't return an error if we successfully allocated some buckets: */ + ret = 0; + + if (c) { + bch2_journal_flush_all_pins(&c->journal); + bch2_journal_block(&c->journal); + mutex_lock(&c->sb_lock); + } + + memcpy(new_buckets, ja->buckets, ja->nr * sizeof(u64)); + memcpy(new_bucket_seq, ja->bucket_seq, ja->nr * sizeof(u64)); + + BUG_ON(ja->discard_idx > ja->nr); + + pos = ja->discard_idx ?: ja->nr; + + memmove(new_buckets + pos + nr_got, + new_buckets + pos, + sizeof(new_buckets[0]) * (ja->nr - pos)); + memmove(new_bucket_seq + pos + nr_got, + new_bucket_seq + pos, + sizeof(new_bucket_seq[0]) * (ja->nr - pos)); + + for (i = 0; i < nr_got; i++) { + new_buckets[pos + i] = bu[i]; + new_bucket_seq[pos + i] = 0; + } + + nr = ja->nr + nr_got; + + ret = bch2_journal_buckets_to_sb(c, ca, new_buckets, nr); + if (ret) + goto err_unblock; + + if (!new_fs) + bch2_write_super(c); + + /* Commit: */ + if (c) + spin_lock(&c->journal.lock); + + swap(new_buckets, ja->buckets); + swap(new_bucket_seq, ja->bucket_seq); + ja->nr = nr; + + if (pos <= ja->discard_idx) + ja->discard_idx = (ja->discard_idx + nr_got) % ja->nr; + if (pos <= ja->dirty_idx_ondisk) + ja->dirty_idx_ondisk = (ja->dirty_idx_ondisk + nr_got) % ja->nr; + if (pos <= ja->dirty_idx) + ja->dirty_idx = (ja->dirty_idx + nr_got) % ja->nr; + if (pos <= ja->cur_idx) + ja->cur_idx = (ja->cur_idx + nr_got) % ja->nr; + + if (c) + spin_unlock(&c->journal.lock); +err_unblock: + if (c) { + bch2_journal_unblock(&c->journal); + mutex_unlock(&c->sb_lock); + } + + if (ret && !new_fs) + for (i = 0; i < nr_got; i++) + bch2_trans_run(c, + bch2_trans_mark_metadata_bucket(trans, ca, + bu[i], BCH_DATA_free, 0)); +err_free: + if (!new_fs) + for (i = 0; i < nr_got; i++) + bch2_open_bucket_put(c, ob[i]); + + kfree(new_bucket_seq); + kfree(new_buckets); + kfree(ob); + kfree(bu); + return ret; +} + +/* + * Allocate more journal space at runtime - not currently making use if it, but + * the code works: + */ +int bch2_set_nr_journal_buckets(struct bch_fs *c, struct bch_dev *ca, + unsigned nr) +{ + struct journal_device *ja = &ca->journal; + struct closure cl; + int ret = 0; + + closure_init_stack(&cl); + + down_write(&c->state_lock); + + /* don't handle reducing nr of buckets yet: */ + if (nr < ja->nr) + goto unlock; + + while (ja->nr < nr) { + struct disk_reservation disk_res = { 0, 0, 0 }; + + /* + * note: journal buckets aren't really counted as _sectors_ used yet, so + * we don't need the disk reservation to avoid the BUG_ON() in buckets.c + * when space used goes up without a reservation - but we do need the + * reservation to ensure we'll actually be able to allocate: + * + * XXX: that's not right, disk reservations only ensure a + * filesystem-wide allocation will succeed, this is a device + * specific allocation - we can hang here: + */ + + ret = bch2_disk_reservation_get(c, &disk_res, + bucket_to_sector(ca, nr - ja->nr), 1, 0); + if (ret) + break; + + ret = __bch2_set_nr_journal_buckets(ca, nr, false, &cl); + + bch2_disk_reservation_put(c, &disk_res); + + closure_sync(&cl); + + if (ret && ret != -BCH_ERR_bucket_alloc_blocked) + break; + } + + if (ret) + bch_err_fn(c, ret); +unlock: + up_write(&c->state_lock); + return ret; +} + +int bch2_dev_journal_alloc(struct bch_dev *ca) +{ + unsigned nr; + int ret; + + if (dynamic_fault("bcachefs:add:journal_alloc")) { + ret = -BCH_ERR_ENOMEM_set_nr_journal_buckets; + goto err; + } + + /* 1/128th of the device by default: */ + nr = ca->mi.nbuckets >> 7; + + /* + * clamp journal size to 8192 buckets or 8GB (in sectors), whichever + * is smaller: + */ + nr = clamp_t(unsigned, nr, + BCH_JOURNAL_BUCKETS_MIN, + min(1 << 13, + (1 << 24) / ca->mi.bucket_size)); + + ret = __bch2_set_nr_journal_buckets(ca, nr, true, NULL); +err: + if (ret) + bch_err_fn(ca, ret); + return ret; +} + +int bch2_fs_journal_alloc(struct bch_fs *c) +{ + struct bch_dev *ca; + unsigned i; + + for_each_online_member(ca, c, i) { + if (ca->journal.nr) + continue; + + int ret = bch2_dev_journal_alloc(ca); + if (ret) { + percpu_ref_put(&ca->io_ref); + return ret; + } + } + + return 0; +} + +/* startup/shutdown: */ + +static bool bch2_journal_writing_to_device(struct journal *j, unsigned dev_idx) +{ + bool ret = false; + u64 seq; + + spin_lock(&j->lock); + for (seq = journal_last_unwritten_seq(j); + seq <= journal_cur_seq(j) && !ret; + seq++) { + struct journal_buf *buf = journal_seq_to_buf(j, seq); + + if (bch2_bkey_has_device_c(bkey_i_to_s_c(&buf->key), dev_idx)) + ret = true; + } + spin_unlock(&j->lock); + + return ret; +} + +void bch2_dev_journal_stop(struct journal *j, struct bch_dev *ca) +{ + wait_event(j->wait, !bch2_journal_writing_to_device(j, ca->dev_idx)); +} + +void bch2_fs_journal_stop(struct journal *j) +{ + bch2_journal_reclaim_stop(j); + bch2_journal_flush_all_pins(j); + + wait_event(j->wait, bch2_journal_entry_close(j)); + + /* + * Always write a new journal entry, to make sure the clock hands are up + * to date (and match the superblock) + */ + bch2_journal_meta(j); + + journal_quiesce(j); + + BUG_ON(!bch2_journal_error(j) && + test_bit(JOURNAL_REPLAY_DONE, &j->flags) && + j->last_empty_seq != journal_cur_seq(j)); + + cancel_delayed_work_sync(&j->write_work); +} + +int bch2_fs_journal_start(struct journal *j, u64 cur_seq) +{ + struct bch_fs *c = container_of(j, struct bch_fs, journal); + struct journal_entry_pin_list *p; + struct journal_replay *i, **_i; + struct genradix_iter iter; + bool had_entries = false; + unsigned ptr; + u64 last_seq = cur_seq, nr, seq; + + genradix_for_each_reverse(&c->journal_entries, iter, _i) { + i = *_i; + + if (!i || i->ignore) + continue; + + last_seq = le64_to_cpu(i->j.last_seq); + break; + } + + nr = cur_seq - last_seq; + + if (nr + 1 > j->pin.size) { + free_fifo(&j->pin); + init_fifo(&j->pin, roundup_pow_of_two(nr + 1), GFP_KERNEL); + if (!j->pin.data) { + bch_err(c, "error reallocating journal fifo (%llu open entries)", nr); + return -BCH_ERR_ENOMEM_journal_pin_fifo; + } + } + + j->replay_journal_seq = last_seq; + j->replay_journal_seq_end = cur_seq; + j->last_seq_ondisk = last_seq; + j->flushed_seq_ondisk = cur_seq - 1; + j->seq_ondisk = cur_seq - 1; + j->pin.front = last_seq; + j->pin.back = cur_seq; + atomic64_set(&j->seq, cur_seq - 1); + + fifo_for_each_entry_ptr(p, &j->pin, seq) + journal_pin_list_init(p, 1); + + genradix_for_each(&c->journal_entries, iter, _i) { + i = *_i; + + if (!i || i->ignore) + continue; + + seq = le64_to_cpu(i->j.seq); + BUG_ON(seq >= cur_seq); + + if (seq < last_seq) + continue; + + if (journal_entry_empty(&i->j)) + j->last_empty_seq = le64_to_cpu(i->j.seq); + + p = journal_seq_pin(j, seq); + + p->devs.nr = 0; + for (ptr = 0; ptr < i->nr_ptrs; ptr++) + bch2_dev_list_add_dev(&p->devs, i->ptrs[ptr].dev); + + had_entries = true; + } + + if (!had_entries) + j->last_empty_seq = cur_seq; + + spin_lock(&j->lock); + + set_bit(JOURNAL_STARTED, &j->flags); + j->last_flush_write = jiffies; + + j->reservations.idx = j->reservations.unwritten_idx = journal_cur_seq(j); + j->reservations.unwritten_idx++; + + c->last_bucket_seq_cleanup = journal_cur_seq(j); + + bch2_journal_space_available(j); + spin_unlock(&j->lock); + + return bch2_journal_reclaim_start(j); +} + +/* init/exit: */ + +void bch2_dev_journal_exit(struct bch_dev *ca) +{ + kfree(ca->journal.bio); + kfree(ca->journal.buckets); + kfree(ca->journal.bucket_seq); + + ca->journal.bio = NULL; + ca->journal.buckets = NULL; + ca->journal.bucket_seq = NULL; +} + +int bch2_dev_journal_init(struct bch_dev *ca, struct bch_sb *sb) +{ + struct journal_device *ja = &ca->journal; + struct bch_sb_field_journal *journal_buckets = + bch2_sb_field_get(sb, journal); + struct bch_sb_field_journal_v2 *journal_buckets_v2 = + bch2_sb_field_get(sb, journal_v2); + unsigned i, nr_bvecs; + + ja->nr = 0; + + if (journal_buckets_v2) { + unsigned nr = bch2_sb_field_journal_v2_nr_entries(journal_buckets_v2); + + for (i = 0; i < nr; i++) + ja->nr += le64_to_cpu(journal_buckets_v2->d[i].nr); + } else if (journal_buckets) { + ja->nr = bch2_nr_journal_buckets(journal_buckets); + } + + ja->bucket_seq = kcalloc(ja->nr, sizeof(u64), GFP_KERNEL); + if (!ja->bucket_seq) + return -BCH_ERR_ENOMEM_dev_journal_init; + + nr_bvecs = DIV_ROUND_UP(JOURNAL_ENTRY_SIZE_MAX, PAGE_SIZE); + + ca->journal.bio = bio_kmalloc(nr_bvecs, GFP_KERNEL); + if (!ca->journal.bio) + return -BCH_ERR_ENOMEM_dev_journal_init; + + bio_init(ca->journal.bio, NULL, ca->journal.bio->bi_inline_vecs, nr_bvecs, 0); + + ja->buckets = kcalloc(ja->nr, sizeof(u64), GFP_KERNEL); + if (!ja->buckets) + return -BCH_ERR_ENOMEM_dev_journal_init; + + if (journal_buckets_v2) { + unsigned nr = bch2_sb_field_journal_v2_nr_entries(journal_buckets_v2); + unsigned j, dst = 0; + + for (i = 0; i < nr; i++) + for (j = 0; j < le64_to_cpu(journal_buckets_v2->d[i].nr); j++) + ja->buckets[dst++] = + le64_to_cpu(journal_buckets_v2->d[i].start) + j; + } else if (journal_buckets) { + for (i = 0; i < ja->nr; i++) + ja->buckets[i] = le64_to_cpu(journal_buckets->buckets[i]); + } + + return 0; +} + +void bch2_fs_journal_exit(struct journal *j) +{ + unsigned i; + + darray_exit(&j->early_journal_entries); + + for (i = 0; i < ARRAY_SIZE(j->buf); i++) + kvpfree(j->buf[i].data, j->buf[i].buf_size); + free_fifo(&j->pin); +} + +int bch2_fs_journal_init(struct journal *j) +{ + static struct lock_class_key res_key; + unsigned i; + + spin_lock_init(&j->lock); + spin_lock_init(&j->err_lock); + init_waitqueue_head(&j->wait); + INIT_DELAYED_WORK(&j->write_work, journal_write_work); + init_waitqueue_head(&j->reclaim_wait); + init_waitqueue_head(&j->pin_flush_wait); + mutex_init(&j->reclaim_lock); + mutex_init(&j->discard_lock); + + lockdep_init_map(&j->res_map, "journal res", &res_key, 0); + + atomic64_set(&j->reservations.counter, + ((union journal_res_state) + { .cur_entry_offset = JOURNAL_ENTRY_CLOSED_VAL }).v); + + if (!(init_fifo(&j->pin, JOURNAL_PIN, GFP_KERNEL))) + return -BCH_ERR_ENOMEM_journal_pin_fifo; + + for (i = 0; i < ARRAY_SIZE(j->buf); i++) { + j->buf[i].buf_size = JOURNAL_ENTRY_SIZE_MIN; + j->buf[i].data = kvpmalloc(j->buf[i].buf_size, GFP_KERNEL); + if (!j->buf[i].data) + return -BCH_ERR_ENOMEM_journal_buf; + } + + j->pin.front = j->pin.back = 1; + return 0; +} + +/* debug: */ + +void __bch2_journal_debug_to_text(struct printbuf *out, struct journal *j) +{ + struct bch_fs *c = container_of(j, struct bch_fs, journal); + union journal_res_state s; + struct bch_dev *ca; + unsigned long now = jiffies; + u64 seq; + unsigned i; + + if (!out->nr_tabstops) + printbuf_tabstop_push(out, 24); + out->atomic++; + + rcu_read_lock(); + s = READ_ONCE(j->reservations); + + prt_printf(out, "dirty journal entries:\t%llu/%llu\n", fifo_used(&j->pin), j->pin.size); + prt_printf(out, "seq:\t\t\t%llu\n", journal_cur_seq(j)); + prt_printf(out, "seq_ondisk:\t\t%llu\n", j->seq_ondisk); + prt_printf(out, "last_seq:\t\t%llu\n", journal_last_seq(j)); + prt_printf(out, "last_seq_ondisk:\t%llu\n", j->last_seq_ondisk); + prt_printf(out, "flushed_seq_ondisk:\t%llu\n", j->flushed_seq_ondisk); + prt_printf(out, "watermark:\t\t%s\n", bch2_watermarks[j->watermark]); + prt_printf(out, "each entry reserved:\t%u\n", j->entry_u64s_reserved); + prt_printf(out, "nr flush writes:\t%llu\n", j->nr_flush_writes); + prt_printf(out, "nr noflush writes:\t%llu\n", j->nr_noflush_writes); + prt_printf(out, "nr direct reclaim:\t%llu\n", j->nr_direct_reclaim); + prt_printf(out, "nr background reclaim:\t%llu\n", j->nr_background_reclaim); + prt_printf(out, "reclaim kicked:\t\t%u\n", j->reclaim_kicked); + prt_printf(out, "reclaim runs in:\t%u ms\n", time_after(j->next_reclaim, now) + ? jiffies_to_msecs(j->next_reclaim - jiffies) : 0); + prt_printf(out, "current entry sectors:\t%u\n", j->cur_entry_sectors); + prt_printf(out, "current entry error:\t%s\n", bch2_journal_errors[j->cur_entry_error]); + prt_printf(out, "current entry:\t\t"); + + switch (s.cur_entry_offset) { + case JOURNAL_ENTRY_ERROR_VAL: + prt_printf(out, "error"); + break; + case JOURNAL_ENTRY_CLOSED_VAL: + prt_printf(out, "closed"); + break; + default: + prt_printf(out, "%u/%u", s.cur_entry_offset, j->cur_entry_u64s); + break; + } + + prt_newline(out); + + for (seq = journal_cur_seq(j); + seq >= journal_last_unwritten_seq(j); + --seq) { + i = seq & JOURNAL_BUF_MASK; + + prt_printf(out, "unwritten entry:"); + prt_tab(out); + prt_printf(out, "%llu", seq); + prt_newline(out); + printbuf_indent_add(out, 2); + + prt_printf(out, "refcount:"); + prt_tab(out); + prt_printf(out, "%u", journal_state_count(s, i)); + prt_newline(out); + + prt_printf(out, "sectors:"); + prt_tab(out); + prt_printf(out, "%u", j->buf[i].sectors); + prt_newline(out); + + prt_printf(out, "expires"); + prt_tab(out); + prt_printf(out, "%li jiffies", j->buf[i].expires - jiffies); + prt_newline(out); + + printbuf_indent_sub(out, 2); + } + + prt_printf(out, + "replay done:\t\t%i\n", + test_bit(JOURNAL_REPLAY_DONE, &j->flags)); + + prt_printf(out, "space:\n"); + prt_printf(out, "\tdiscarded\t%u:%u\n", + j->space[journal_space_discarded].next_entry, + j->space[journal_space_discarded].total); + prt_printf(out, "\tclean ondisk\t%u:%u\n", + j->space[journal_space_clean_ondisk].next_entry, + j->space[journal_space_clean_ondisk].total); + prt_printf(out, "\tclean\t\t%u:%u\n", + j->space[journal_space_clean].next_entry, + j->space[journal_space_clean].total); + prt_printf(out, "\ttotal\t\t%u:%u\n", + j->space[journal_space_total].next_entry, + j->space[journal_space_total].total); + + for_each_member_device_rcu(ca, c, i, + &c->rw_devs[BCH_DATA_journal]) { + struct journal_device *ja = &ca->journal; + + if (!test_bit(ca->dev_idx, c->rw_devs[BCH_DATA_journal].d)) + continue; + + if (!ja->nr) + continue; + + prt_printf(out, "dev %u:\n", i); + prt_printf(out, "\tnr\t\t%u\n", ja->nr); + prt_printf(out, "\tbucket size\t%u\n", ca->mi.bucket_size); + prt_printf(out, "\tavailable\t%u:%u\n", bch2_journal_dev_buckets_available(j, ja, journal_space_discarded), ja->sectors_free); + prt_printf(out, "\tdiscard_idx\t%u\n", ja->discard_idx); + prt_printf(out, "\tdirty_ondisk\t%u (seq %llu)\n", ja->dirty_idx_ondisk, ja->bucket_seq[ja->dirty_idx_ondisk]); + prt_printf(out, "\tdirty_idx\t%u (seq %llu)\n", ja->dirty_idx, ja->bucket_seq[ja->dirty_idx]); + prt_printf(out, "\tcur_idx\t\t%u (seq %llu)\n", ja->cur_idx, ja->bucket_seq[ja->cur_idx]); + } + + rcu_read_unlock(); + + --out->atomic; +} + +void bch2_journal_debug_to_text(struct printbuf *out, struct journal *j) +{ + spin_lock(&j->lock); + __bch2_journal_debug_to_text(out, j); + spin_unlock(&j->lock); +} + +bool bch2_journal_seq_pins_to_text(struct printbuf *out, struct journal *j, u64 *seq) +{ + struct journal_entry_pin_list *pin_list; + struct journal_entry_pin *pin; + unsigned i; + + spin_lock(&j->lock); + *seq = max(*seq, j->pin.front); + + if (*seq >= j->pin.back) { + spin_unlock(&j->lock); + return true; + } + + out->atomic++; + + pin_list = journal_seq_pin(j, *seq); + + prt_printf(out, "%llu: count %u", *seq, atomic_read(&pin_list->count)); + prt_newline(out); + printbuf_indent_add(out, 2); + + for (i = 0; i < ARRAY_SIZE(pin_list->list); i++) + list_for_each_entry(pin, &pin_list->list[i], list) { + prt_printf(out, "\t%px %ps", pin, pin->flush); + prt_newline(out); + } + + if (!list_empty(&pin_list->flushed)) { + prt_printf(out, "flushed:"); + prt_newline(out); + } + + list_for_each_entry(pin, &pin_list->flushed, list) { + prt_printf(out, "\t%px %ps", pin, pin->flush); + prt_newline(out); + } + + printbuf_indent_sub(out, 2); + + --out->atomic; + spin_unlock(&j->lock); + + return false; +} + +void bch2_journal_pins_to_text(struct printbuf *out, struct journal *j) +{ + u64 seq = 0; + + while (!bch2_journal_seq_pins_to_text(out, j, &seq)) + seq++; +} diff --git a/fs/bcachefs/journal.h b/fs/bcachefs/journal.h new file mode 100644 index 000000000000..2f768e11aec9 --- /dev/null +++ b/fs/bcachefs/journal.h @@ -0,0 +1,450 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_JOURNAL_H +#define _BCACHEFS_JOURNAL_H + +/* + * THE JOURNAL: + * + * The primary purpose of the journal is to log updates (insertions) to the + * b-tree, to avoid having to do synchronous updates to the b-tree on disk. + * + * Without the journal, the b-tree is always internally consistent on + * disk - and in fact, in the earliest incarnations bcache didn't have a journal + * but did handle unclean shutdowns by doing all index updates synchronously + * (with coalescing). + * + * Updates to interior nodes still happen synchronously and without the journal + * (for simplicity) - this may change eventually but updates to interior nodes + * are rare enough it's not a huge priority. + * + * This means the journal is relatively separate from the b-tree; it consists of + * just a list of keys and journal replay consists of just redoing those + * insertions in same order that they appear in the journal. + * + * PERSISTENCE: + * + * For synchronous updates (where we're waiting on the index update to hit + * disk), the journal entry will be written out immediately (or as soon as + * possible, if the write for the previous journal entry was still in flight). + * + * Synchronous updates are specified by passing a closure (@flush_cl) to + * bch2_btree_insert() or bch_btree_insert_node(), which then pass that parameter + * down to the journalling code. That closure will wait on the journal write to + * complete (via closure_wait()). + * + * If the index update wasn't synchronous, the journal entry will be + * written out after 10 ms have elapsed, by default (the delay_ms field + * in struct journal). + * + * JOURNAL ENTRIES: + * + * A journal entry is variable size (struct jset), it's got a fixed length + * header and then a variable number of struct jset_entry entries. + * + * Journal entries are identified by monotonically increasing 64 bit sequence + * numbers - jset->seq; other places in the code refer to this sequence number. + * + * A jset_entry entry contains one or more bkeys (which is what gets inserted + * into the b-tree). We need a container to indicate which b-tree the key is + * for; also, the roots of the various b-trees are stored in jset_entry entries + * (one for each b-tree) - this lets us add new b-tree types without changing + * the on disk format. + * + * We also keep some things in the journal header that are logically part of the + * superblock - all the things that are frequently updated. This is for future + * bcache on raw flash support; the superblock (which will become another + * journal) can't be moved or wear leveled, so it contains just enough + * information to find the main journal, and the superblock only has to be + * rewritten when we want to move/wear level the main journal. + * + * JOURNAL LAYOUT ON DISK: + * + * The journal is written to a ringbuffer of buckets (which is kept in the + * superblock); the individual buckets are not necessarily contiguous on disk + * which means that journal entries are not allowed to span buckets, but also + * that we can resize the journal at runtime if desired (unimplemented). + * + * The journal buckets exist in the same pool as all the other buckets that are + * managed by the allocator and garbage collection - garbage collection marks + * the journal buckets as metadata buckets. + * + * OPEN/DIRTY JOURNAL ENTRIES: + * + * Open/dirty journal entries are journal entries that contain b-tree updates + * that have not yet been written out to the b-tree on disk. We have to track + * which journal entries are dirty, and we also have to avoid wrapping around + * the journal and overwriting old but still dirty journal entries with new + * journal entries. + * + * On disk, this is represented with the "last_seq" field of struct jset; + * last_seq is the first sequence number that journal replay has to replay. + * + * To avoid overwriting dirty journal entries on disk, we keep a mapping (in + * journal_device->seq) of for each journal bucket, the highest sequence number + * any journal entry it contains. Then, by comparing that against last_seq we + * can determine whether that journal bucket contains dirty journal entries or + * not. + * + * To track which journal entries are dirty, we maintain a fifo of refcounts + * (where each entry corresponds to a specific sequence number) - when a ref + * goes to 0, that journal entry is no longer dirty. + * + * Journalling of index updates is done at the same time as the b-tree itself is + * being modified (see btree_insert_key()); when we add the key to the journal + * the pending b-tree write takes a ref on the journal entry the key was added + * to. If a pending b-tree write would need to take refs on multiple dirty + * journal entries, it only keeps the ref on the oldest one (since a newer + * journal entry will still be replayed if an older entry was dirty). + * + * JOURNAL FILLING UP: + * + * There are two ways the journal could fill up; either we could run out of + * space to write to, or we could have too many open journal entries and run out + * of room in the fifo of refcounts. Since those refcounts are decremented + * without any locking we can't safely resize that fifo, so we handle it the + * same way. + * + * If the journal fills up, we start flushing dirty btree nodes until we can + * allocate space for a journal write again - preferentially flushing btree + * nodes that are pinning the oldest journal entries first. + */ + +#include <linux/hash.h> + +#include "journal_types.h" + +struct bch_fs; + +static inline void journal_wake(struct journal *j) +{ + wake_up(&j->wait); + closure_wake_up(&j->async_wait); + closure_wake_up(&j->preres_wait); +} + +static inline struct journal_buf *journal_cur_buf(struct journal *j) +{ + return j->buf + j->reservations.idx; +} + +/* Sequence number of oldest dirty journal entry */ + +static inline u64 journal_last_seq(struct journal *j) +{ + return j->pin.front; +} + +static inline u64 journal_cur_seq(struct journal *j) +{ + return atomic64_read(&j->seq); +} + +static inline u64 journal_last_unwritten_seq(struct journal *j) +{ + return j->seq_ondisk + 1; +} + +static inline int journal_state_count(union journal_res_state s, int idx) +{ + switch (idx) { + case 0: return s.buf0_count; + case 1: return s.buf1_count; + case 2: return s.buf2_count; + case 3: return s.buf3_count; + } + BUG(); +} + +static inline void journal_state_inc(union journal_res_state *s) +{ + s->buf0_count += s->idx == 0; + s->buf1_count += s->idx == 1; + s->buf2_count += s->idx == 2; + s->buf3_count += s->idx == 3; +} + +/* + * Amount of space that will be taken up by some keys in the journal (i.e. + * including the jset header) + */ +static inline unsigned jset_u64s(unsigned u64s) +{ + return u64s + sizeof(struct jset_entry) / sizeof(u64); +} + +static inline int journal_entry_overhead(struct journal *j) +{ + return sizeof(struct jset) / sizeof(u64) + j->entry_u64s_reserved; +} + +static inline struct jset_entry * +bch2_journal_add_entry_noreservation(struct journal_buf *buf, size_t u64s) +{ + struct jset *jset = buf->data; + struct jset_entry *entry = vstruct_idx(jset, le32_to_cpu(jset->u64s)); + + memset(entry, 0, sizeof(*entry)); + entry->u64s = cpu_to_le16(u64s); + + le32_add_cpu(&jset->u64s, jset_u64s(u64s)); + + return entry; +} + +static inline struct jset_entry * +journal_res_entry(struct journal *j, struct journal_res *res) +{ + return vstruct_idx(j->buf[res->idx].data, res->offset); +} + +static inline unsigned journal_entry_init(struct jset_entry *entry, unsigned type, + enum btree_id id, unsigned level, + unsigned u64s) +{ + entry->u64s = cpu_to_le16(u64s); + entry->btree_id = id; + entry->level = level; + entry->type = type; + entry->pad[0] = 0; + entry->pad[1] = 0; + entry->pad[2] = 0; + return jset_u64s(u64s); +} + +static inline unsigned journal_entry_set(struct jset_entry *entry, unsigned type, + enum btree_id id, unsigned level, + const void *data, unsigned u64s) +{ + unsigned ret = journal_entry_init(entry, type, id, level, u64s); + + memcpy_u64s_small(entry->_data, data, u64s); + return ret; +} + +static inline struct jset_entry * +bch2_journal_add_entry(struct journal *j, struct journal_res *res, + unsigned type, enum btree_id id, + unsigned level, unsigned u64s) +{ + struct jset_entry *entry = journal_res_entry(j, res); + unsigned actual = journal_entry_init(entry, type, id, level, u64s); + + EBUG_ON(!res->ref); + EBUG_ON(actual > res->u64s); + + res->offset += actual; + res->u64s -= actual; + return entry; +} + +static inline bool journal_entry_empty(struct jset *j) +{ + struct jset_entry *i; + + if (j->seq != j->last_seq) + return false; + + vstruct_for_each(j, i) + if (i->type == BCH_JSET_ENTRY_btree_keys && i->u64s) + return false; + return true; +} + +/* + * Drop reference on a buffer index and return true if the count has hit zero. + */ +static inline union journal_res_state journal_state_buf_put(struct journal *j, unsigned idx) +{ + union journal_res_state s; + + s.v = atomic64_sub_return(((union journal_res_state) { + .buf0_count = idx == 0, + .buf1_count = idx == 1, + .buf2_count = idx == 2, + .buf3_count = idx == 3, + }).v, &j->reservations.counter); + return s; +} + +bool bch2_journal_entry_close(struct journal *); +void bch2_journal_buf_put_final(struct journal *, u64, bool); + +static inline void __bch2_journal_buf_put(struct journal *j, unsigned idx, u64 seq) +{ + union journal_res_state s; + + s = journal_state_buf_put(j, idx); + if (!journal_state_count(s, idx)) + bch2_journal_buf_put_final(j, seq, idx == s.unwritten_idx); +} + +static inline void bch2_journal_buf_put(struct journal *j, unsigned idx, u64 seq) +{ + union journal_res_state s; + + s = journal_state_buf_put(j, idx); + if (!journal_state_count(s, idx)) { + spin_lock(&j->lock); + bch2_journal_buf_put_final(j, seq, idx == s.unwritten_idx); + spin_unlock(&j->lock); + } +} + +/* + * This function releases the journal write structure so other threads can + * then proceed to add their keys as well. + */ +static inline void bch2_journal_res_put(struct journal *j, + struct journal_res *res) +{ + if (!res->ref) + return; + + lock_release(&j->res_map, _THIS_IP_); + + while (res->u64s) + bch2_journal_add_entry(j, res, + BCH_JSET_ENTRY_btree_keys, + 0, 0, 0); + + bch2_journal_buf_put(j, res->idx, res->seq); + + res->ref = 0; +} + +int bch2_journal_res_get_slowpath(struct journal *, struct journal_res *, + unsigned); + +/* First bits for BCH_WATERMARK: */ +enum journal_res_flags { + __JOURNAL_RES_GET_NONBLOCK = BCH_WATERMARK_BITS, + __JOURNAL_RES_GET_CHECK, +}; + +#define JOURNAL_RES_GET_NONBLOCK (1 << __JOURNAL_RES_GET_NONBLOCK) +#define JOURNAL_RES_GET_CHECK (1 << __JOURNAL_RES_GET_CHECK) + +static inline int journal_res_get_fast(struct journal *j, + struct journal_res *res, + unsigned flags) +{ + union journal_res_state old, new; + u64 v = atomic64_read(&j->reservations.counter); + + do { + old.v = new.v = v; + + /* + * Check if there is still room in the current journal + * entry: + */ + if (new.cur_entry_offset + res->u64s > j->cur_entry_u64s) + return 0; + + EBUG_ON(!journal_state_count(new, new.idx)); + + if ((flags & BCH_WATERMARK_MASK) < j->watermark) + return 0; + + new.cur_entry_offset += res->u64s; + journal_state_inc(&new); + + /* + * If the refcount would overflow, we have to wait: + * XXX - tracepoint this: + */ + if (!journal_state_count(new, new.idx)) + return 0; + + if (flags & JOURNAL_RES_GET_CHECK) + return 1; + } while ((v = atomic64_cmpxchg(&j->reservations.counter, + old.v, new.v)) != old.v); + + res->ref = true; + res->idx = old.idx; + res->offset = old.cur_entry_offset; + res->seq = le64_to_cpu(j->buf[old.idx].data->seq); + return 1; +} + +static inline int bch2_journal_res_get(struct journal *j, struct journal_res *res, + unsigned u64s, unsigned flags) +{ + int ret; + + EBUG_ON(res->ref); + EBUG_ON(!test_bit(JOURNAL_STARTED, &j->flags)); + + res->u64s = u64s; + + if (journal_res_get_fast(j, res, flags)) + goto out; + + ret = bch2_journal_res_get_slowpath(j, res, flags); + if (ret) + return ret; +out: + if (!(flags & JOURNAL_RES_GET_CHECK)) { + lock_acquire_shared(&j->res_map, 0, + (flags & JOURNAL_RES_GET_NONBLOCK) != 0, + NULL, _THIS_IP_); + EBUG_ON(!res->ref); + } + return 0; +} + +/* journal_entry_res: */ + +void bch2_journal_entry_res_resize(struct journal *, + struct journal_entry_res *, + unsigned); + +int bch2_journal_flush_seq_async(struct journal *, u64, struct closure *); +void bch2_journal_flush_async(struct journal *, struct closure *); + +int bch2_journal_flush_seq(struct journal *, u64); +int bch2_journal_flush(struct journal *); +bool bch2_journal_noflush_seq(struct journal *, u64); +int bch2_journal_meta(struct journal *); + +void bch2_journal_halt(struct journal *); + +static inline int bch2_journal_error(struct journal *j) +{ + return j->reservations.cur_entry_offset == JOURNAL_ENTRY_ERROR_VAL + ? -EIO : 0; +} + +struct bch_dev; + +static inline void bch2_journal_set_replay_done(struct journal *j) +{ + BUG_ON(!test_bit(JOURNAL_STARTED, &j->flags)); + set_bit(JOURNAL_REPLAY_DONE, &j->flags); +} + +void bch2_journal_unblock(struct journal *); +void bch2_journal_block(struct journal *); + +void __bch2_journal_debug_to_text(struct printbuf *, struct journal *); +void bch2_journal_debug_to_text(struct printbuf *, struct journal *); +void bch2_journal_pins_to_text(struct printbuf *, struct journal *); +bool bch2_journal_seq_pins_to_text(struct printbuf *, struct journal *, u64 *); + +int bch2_set_nr_journal_buckets(struct bch_fs *, struct bch_dev *, + unsigned nr); +int bch2_dev_journal_alloc(struct bch_dev *); +int bch2_fs_journal_alloc(struct bch_fs *); + +void bch2_dev_journal_stop(struct journal *, struct bch_dev *); + +void bch2_fs_journal_stop(struct journal *); +int bch2_fs_journal_start(struct journal *, u64); + +void bch2_dev_journal_exit(struct bch_dev *); +int bch2_dev_journal_init(struct bch_dev *, struct bch_sb *); +void bch2_fs_journal_exit(struct journal *); +int bch2_fs_journal_init(struct journal *); + +#endif /* _BCACHEFS_JOURNAL_H */ diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c new file mode 100644 index 000000000000..5de1b68fb8af --- /dev/null +++ b/fs/bcachefs/journal_io.c @@ -0,0 +1,1964 @@ +// SPDX-License-Identifier: GPL-2.0 +#include "bcachefs.h" +#include "alloc_background.h" +#include "alloc_foreground.h" +#include "btree_io.h" +#include "btree_update_interior.h" +#include "buckets.h" +#include "checksum.h" +#include "disk_groups.h" +#include "error.h" +#include "journal.h" +#include "journal_io.h" +#include "journal_reclaim.h" +#include "journal_seq_blacklist.h" +#include "replicas.h" +#include "sb-clean.h" +#include "trace.h" + +static struct nonce journal_nonce(const struct jset *jset) +{ + return (struct nonce) {{ + [0] = 0, + [1] = ((__le32 *) &jset->seq)[0], + [2] = ((__le32 *) &jset->seq)[1], + [3] = BCH_NONCE_JOURNAL, + }}; +} + +static bool jset_csum_good(struct bch_fs *c, struct jset *j) +{ + return bch2_checksum_type_valid(c, JSET_CSUM_TYPE(j)) && + !bch2_crc_cmp(j->csum, + csum_vstruct(c, JSET_CSUM_TYPE(j), journal_nonce(j), j)); +} + +static inline u32 journal_entry_radix_idx(struct bch_fs *c, u64 seq) +{ + return (seq - c->journal_entries_base_seq) & (~0U >> 1); +} + +static void __journal_replay_free(struct bch_fs *c, + struct journal_replay *i) +{ + struct journal_replay **p = + genradix_ptr(&c->journal_entries, + journal_entry_radix_idx(c, le64_to_cpu(i->j.seq))); + + BUG_ON(*p != i); + *p = NULL; + kvpfree(i, offsetof(struct journal_replay, j) + + vstruct_bytes(&i->j)); +} + +static void journal_replay_free(struct bch_fs *c, struct journal_replay *i) +{ + i->ignore = true; + + if (!c->opts.read_entire_journal) + __journal_replay_free(c, i); +} + +struct journal_list { + struct closure cl; + u64 last_seq; + struct mutex lock; + int ret; +}; + +#define JOURNAL_ENTRY_ADD_OK 0 +#define JOURNAL_ENTRY_ADD_OUT_OF_RANGE 5 + +/* + * Given a journal entry we just read, add it to the list of journal entries to + * be replayed: + */ +static int journal_entry_add(struct bch_fs *c, struct bch_dev *ca, + struct journal_ptr entry_ptr, + struct journal_list *jlist, struct jset *j) +{ + struct genradix_iter iter; + struct journal_replay **_i, *i, *dup; + struct journal_ptr *ptr; + size_t bytes = vstruct_bytes(j); + u64 last_seq = !JSET_NO_FLUSH(j) ? le64_to_cpu(j->last_seq) : 0; + int ret = JOURNAL_ENTRY_ADD_OK; + + /* Is this entry older than the range we need? */ + if (!c->opts.read_entire_journal && + le64_to_cpu(j->seq) < jlist->last_seq) + return JOURNAL_ENTRY_ADD_OUT_OF_RANGE; + + /* + * genradixes are indexed by a ulong, not a u64, so we can't index them + * by sequence number directly: Assume instead that they will all fall + * within the range of +-2billion of the filrst one we find. + */ + if (!c->journal_entries_base_seq) + c->journal_entries_base_seq = max_t(s64, 1, le64_to_cpu(j->seq) - S32_MAX); + + /* Drop entries we don't need anymore */ + if (last_seq > jlist->last_seq && !c->opts.read_entire_journal) { + genradix_for_each_from(&c->journal_entries, iter, _i, + journal_entry_radix_idx(c, jlist->last_seq)) { + i = *_i; + + if (!i || i->ignore) + continue; + + if (le64_to_cpu(i->j.seq) >= last_seq) + break; + journal_replay_free(c, i); + } + } + + jlist->last_seq = max(jlist->last_seq, last_seq); + + _i = genradix_ptr_alloc(&c->journal_entries, + journal_entry_radix_idx(c, le64_to_cpu(j->seq)), + GFP_KERNEL); + if (!_i) + return -BCH_ERR_ENOMEM_journal_entry_add; + + /* + * Duplicate journal entries? If so we want the one that didn't have a + * checksum error: + */ + dup = *_i; + if (dup) { + if (bytes == vstruct_bytes(&dup->j) && + !memcmp(j, &dup->j, bytes)) { + i = dup; + goto found; + } + + if (!entry_ptr.csum_good) { + i = dup; + goto found; + } + + if (!dup->csum_good) + goto replace; + + fsck_err(c, journal_entry_replicas_data_mismatch, + "found duplicate but non identical journal entries (seq %llu)", + le64_to_cpu(j->seq)); + i = dup; + goto found; + } +replace: + i = kvpmalloc(offsetof(struct journal_replay, j) + bytes, GFP_KERNEL); + if (!i) + return -BCH_ERR_ENOMEM_journal_entry_add; + + i->nr_ptrs = 0; + i->csum_good = entry_ptr.csum_good; + i->ignore = false; + unsafe_memcpy(&i->j, j, bytes, "embedded variable length struct"); + i->ptrs[i->nr_ptrs++] = entry_ptr; + + if (dup) { + if (dup->nr_ptrs >= ARRAY_SIZE(dup->ptrs)) { + bch_err(c, "found too many copies of journal entry %llu", + le64_to_cpu(i->j.seq)); + dup->nr_ptrs = ARRAY_SIZE(dup->ptrs) - 1; + } + + /* The first ptr should represent the jset we kept: */ + memcpy(i->ptrs + i->nr_ptrs, + dup->ptrs, + sizeof(dup->ptrs[0]) * dup->nr_ptrs); + i->nr_ptrs += dup->nr_ptrs; + __journal_replay_free(c, dup); + } + + *_i = i; + return 0; +found: + for (ptr = i->ptrs; ptr < i->ptrs + i->nr_ptrs; ptr++) { + if (ptr->dev == ca->dev_idx) { + bch_err(c, "duplicate journal entry %llu on same device", + le64_to_cpu(i->j.seq)); + goto out; + } + } + + if (i->nr_ptrs >= ARRAY_SIZE(i->ptrs)) { + bch_err(c, "found too many copies of journal entry %llu", + le64_to_cpu(i->j.seq)); + goto out; + } + + i->ptrs[i->nr_ptrs++] = entry_ptr; +out: +fsck_err: + return ret; +} + +/* this fills in a range with empty jset_entries: */ +static void journal_entry_null_range(void *start, void *end) +{ + struct jset_entry *entry; + + for (entry = start; entry != end; entry = vstruct_next(entry)) + memset(entry, 0, sizeof(*entry)); +} + +#define JOURNAL_ENTRY_REREAD 5 +#define JOURNAL_ENTRY_NONE 6 +#define JOURNAL_ENTRY_BAD 7 + +static void journal_entry_err_msg(struct printbuf *out, + u32 version, + struct jset *jset, + struct jset_entry *entry) +{ + prt_str(out, "invalid journal entry, version="); + bch2_version_to_text(out, version); + + if (entry) { + prt_str(out, " type="); + prt_str(out, bch2_jset_entry_types[entry->type]); + } + + if (!jset) { + prt_printf(out, " in superblock"); + } else { + + prt_printf(out, " seq=%llu", le64_to_cpu(jset->seq)); + + if (entry) + prt_printf(out, " offset=%zi/%u", + (u64 *) entry - jset->_data, + le32_to_cpu(jset->u64s)); + } + + prt_str(out, ": "); +} + +#define journal_entry_err(c, version, jset, entry, _err, msg, ...) \ +({ \ + struct printbuf _buf = PRINTBUF; \ + \ + journal_entry_err_msg(&_buf, version, jset, entry); \ + prt_printf(&_buf, msg, ##__VA_ARGS__); \ + \ + switch (flags & BKEY_INVALID_WRITE) { \ + case READ: \ + mustfix_fsck_err(c, _err, "%s", _buf.buf); \ + break; \ + case WRITE: \ + bch2_sb_error_count(c, BCH_FSCK_ERR_##_err); \ + bch_err(c, "corrupt metadata before write: %s\n", _buf.buf);\ + if (bch2_fs_inconsistent(c)) { \ + ret = -BCH_ERR_fsck_errors_not_fixed; \ + goto fsck_err; \ + } \ + break; \ + } \ + \ + printbuf_exit(&_buf); \ + true; \ +}) + +#define journal_entry_err_on(cond, ...) \ + ((cond) ? journal_entry_err(__VA_ARGS__) : false) + +#define FSCK_DELETED_KEY 5 + +static int journal_validate_key(struct bch_fs *c, + struct jset *jset, + struct jset_entry *entry, + unsigned level, enum btree_id btree_id, + struct bkey_i *k, + unsigned version, int big_endian, + enum bkey_invalid_flags flags) +{ + int write = flags & BKEY_INVALID_WRITE; + void *next = vstruct_next(entry); + struct printbuf buf = PRINTBUF; + int ret = 0; + + if (journal_entry_err_on(!k->k.u64s, + c, version, jset, entry, + journal_entry_bkey_u64s_0, + "k->u64s 0")) { + entry->u64s = cpu_to_le16((u64 *) k - entry->_data); + journal_entry_null_range(vstruct_next(entry), next); + return FSCK_DELETED_KEY; + } + + if (journal_entry_err_on((void *) bkey_next(k) > + (void *) vstruct_next(entry), + c, version, jset, entry, + journal_entry_bkey_past_end, + "extends past end of journal entry")) { + entry->u64s = cpu_to_le16((u64 *) k - entry->_data); + journal_entry_null_range(vstruct_next(entry), next); + return FSCK_DELETED_KEY; + } + + if (journal_entry_err_on(k->k.format != KEY_FORMAT_CURRENT, + c, version, jset, entry, + journal_entry_bkey_bad_format, + "bad format %u", k->k.format)) { + le16_add_cpu(&entry->u64s, -((u16) k->k.u64s)); + memmove(k, bkey_next(k), next - (void *) bkey_next(k)); + journal_entry_null_range(vstruct_next(entry), next); + return FSCK_DELETED_KEY; + } + + if (!write) + bch2_bkey_compat(level, btree_id, version, big_endian, + write, NULL, bkey_to_packed(k)); + + if (bch2_bkey_invalid(c, bkey_i_to_s_c(k), + __btree_node_type(level, btree_id), write, &buf)) { + printbuf_reset(&buf); + journal_entry_err_msg(&buf, version, jset, entry); + prt_newline(&buf); + printbuf_indent_add(&buf, 2); + + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(k)); + prt_newline(&buf); + bch2_bkey_invalid(c, bkey_i_to_s_c(k), + __btree_node_type(level, btree_id), write, &buf); + + mustfix_fsck_err(c, journal_entry_bkey_invalid, + "%s", buf.buf); + + le16_add_cpu(&entry->u64s, -((u16) k->k.u64s)); + memmove(k, bkey_next(k), next - (void *) bkey_next(k)); + journal_entry_null_range(vstruct_next(entry), next); + + printbuf_exit(&buf); + return FSCK_DELETED_KEY; + } + + if (write) + bch2_bkey_compat(level, btree_id, version, big_endian, + write, NULL, bkey_to_packed(k)); +fsck_err: + printbuf_exit(&buf); + return ret; +} + +static int journal_entry_btree_keys_validate(struct bch_fs *c, + struct jset *jset, + struct jset_entry *entry, + unsigned version, int big_endian, + enum bkey_invalid_flags flags) +{ + struct bkey_i *k = entry->start; + + while (k != vstruct_last(entry)) { + int ret = journal_validate_key(c, jset, entry, + entry->level, + entry->btree_id, + k, version, big_endian, + flags|BKEY_INVALID_JOURNAL); + if (ret == FSCK_DELETED_KEY) + continue; + + k = bkey_next(k); + } + + return 0; +} + +static void journal_entry_btree_keys_to_text(struct printbuf *out, struct bch_fs *c, + struct jset_entry *entry) +{ + struct bkey_i *k; + bool first = true; + + jset_entry_for_each_key(entry, k) { + if (!first) { + prt_newline(out); + prt_printf(out, "%s: ", bch2_jset_entry_types[entry->type]); + } + prt_printf(out, "btree=%s l=%u ", bch2_btree_id_str(entry->btree_id), entry->level); + bch2_bkey_val_to_text(out, c, bkey_i_to_s_c(k)); + first = false; + } +} + +static int journal_entry_btree_root_validate(struct bch_fs *c, + struct jset *jset, + struct jset_entry *entry, + unsigned version, int big_endian, + enum bkey_invalid_flags flags) +{ + struct bkey_i *k = entry->start; + int ret = 0; + + if (journal_entry_err_on(!entry->u64s || + le16_to_cpu(entry->u64s) != k->k.u64s, + c, version, jset, entry, + journal_entry_btree_root_bad_size, + "invalid btree root journal entry: wrong number of keys")) { + void *next = vstruct_next(entry); + /* + * we don't want to null out this jset_entry, + * just the contents, so that later we can tell + * we were _supposed_ to have a btree root + */ + entry->u64s = 0; + journal_entry_null_range(vstruct_next(entry), next); + return 0; + } + + return journal_validate_key(c, jset, entry, 1, entry->btree_id, k, + version, big_endian, flags); +fsck_err: + return ret; +} + +static void journal_entry_btree_root_to_text(struct printbuf *out, struct bch_fs *c, + struct jset_entry *entry) +{ + journal_entry_btree_keys_to_text(out, c, entry); +} + +static int journal_entry_prio_ptrs_validate(struct bch_fs *c, + struct jset *jset, + struct jset_entry *entry, + unsigned version, int big_endian, + enum bkey_invalid_flags flags) +{ + /* obsolete, don't care: */ + return 0; +} + +static void journal_entry_prio_ptrs_to_text(struct printbuf *out, struct bch_fs *c, + struct jset_entry *entry) +{ +} + +static int journal_entry_blacklist_validate(struct bch_fs *c, + struct jset *jset, + struct jset_entry *entry, + unsigned version, int big_endian, + enum bkey_invalid_flags flags) +{ + int ret = 0; + + if (journal_entry_err_on(le16_to_cpu(entry->u64s) != 1, + c, version, jset, entry, + journal_entry_blacklist_bad_size, + "invalid journal seq blacklist entry: bad size")) { + journal_entry_null_range(entry, vstruct_next(entry)); + } +fsck_err: + return ret; +} + +static void journal_entry_blacklist_to_text(struct printbuf *out, struct bch_fs *c, + struct jset_entry *entry) +{ + struct jset_entry_blacklist *bl = + container_of(entry, struct jset_entry_blacklist, entry); + + prt_printf(out, "seq=%llu", le64_to_cpu(bl->seq)); +} + +static int journal_entry_blacklist_v2_validate(struct bch_fs *c, + struct jset *jset, + struct jset_entry *entry, + unsigned version, int big_endian, + enum bkey_invalid_flags flags) +{ + struct jset_entry_blacklist_v2 *bl_entry; + int ret = 0; + + if (journal_entry_err_on(le16_to_cpu(entry->u64s) != 2, + c, version, jset, entry, + journal_entry_blacklist_v2_bad_size, + "invalid journal seq blacklist entry: bad size")) { + journal_entry_null_range(entry, vstruct_next(entry)); + goto out; + } + + bl_entry = container_of(entry, struct jset_entry_blacklist_v2, entry); + + if (journal_entry_err_on(le64_to_cpu(bl_entry->start) > + le64_to_cpu(bl_entry->end), + c, version, jset, entry, + journal_entry_blacklist_v2_start_past_end, + "invalid journal seq blacklist entry: start > end")) { + journal_entry_null_range(entry, vstruct_next(entry)); + } +out: +fsck_err: + return ret; +} + +static void journal_entry_blacklist_v2_to_text(struct printbuf *out, struct bch_fs *c, + struct jset_entry *entry) +{ + struct jset_entry_blacklist_v2 *bl = + container_of(entry, struct jset_entry_blacklist_v2, entry); + + prt_printf(out, "start=%llu end=%llu", + le64_to_cpu(bl->start), + le64_to_cpu(bl->end)); +} + +static int journal_entry_usage_validate(struct bch_fs *c, + struct jset *jset, + struct jset_entry *entry, + unsigned version, int big_endian, + enum bkey_invalid_flags flags) +{ + struct jset_entry_usage *u = + container_of(entry, struct jset_entry_usage, entry); + unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64); + int ret = 0; + + if (journal_entry_err_on(bytes < sizeof(*u), + c, version, jset, entry, + journal_entry_usage_bad_size, + "invalid journal entry usage: bad size")) { + journal_entry_null_range(entry, vstruct_next(entry)); + return ret; + } + +fsck_err: + return ret; +} + +static void journal_entry_usage_to_text(struct printbuf *out, struct bch_fs *c, + struct jset_entry *entry) +{ + struct jset_entry_usage *u = + container_of(entry, struct jset_entry_usage, entry); + + prt_printf(out, "type=%s v=%llu", + bch2_fs_usage_types[u->entry.btree_id], + le64_to_cpu(u->v)); +} + +static int journal_entry_data_usage_validate(struct bch_fs *c, + struct jset *jset, + struct jset_entry *entry, + unsigned version, int big_endian, + enum bkey_invalid_flags flags) +{ + struct jset_entry_data_usage *u = + container_of(entry, struct jset_entry_data_usage, entry); + unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64); + struct printbuf err = PRINTBUF; + int ret = 0; + + if (journal_entry_err_on(bytes < sizeof(*u) || + bytes < sizeof(*u) + u->r.nr_devs, + c, version, jset, entry, + journal_entry_data_usage_bad_size, + "invalid journal entry usage: bad size")) { + journal_entry_null_range(entry, vstruct_next(entry)); + goto out; + } + + if (journal_entry_err_on(bch2_replicas_entry_validate(&u->r, c->disk_sb.sb, &err), + c, version, jset, entry, + journal_entry_data_usage_bad_size, + "invalid journal entry usage: %s", err.buf)) { + journal_entry_null_range(entry, vstruct_next(entry)); + goto out; + } +out: +fsck_err: + printbuf_exit(&err); + return ret; +} + +static void journal_entry_data_usage_to_text(struct printbuf *out, struct bch_fs *c, + struct jset_entry *entry) +{ + struct jset_entry_data_usage *u = + container_of(entry, struct jset_entry_data_usage, entry); + + bch2_replicas_entry_to_text(out, &u->r); + prt_printf(out, "=%llu", le64_to_cpu(u->v)); +} + +static int journal_entry_clock_validate(struct bch_fs *c, + struct jset *jset, + struct jset_entry *entry, + unsigned version, int big_endian, + enum bkey_invalid_flags flags) +{ + struct jset_entry_clock *clock = + container_of(entry, struct jset_entry_clock, entry); + unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64); + int ret = 0; + + if (journal_entry_err_on(bytes != sizeof(*clock), + c, version, jset, entry, + journal_entry_clock_bad_size, + "bad size")) { + journal_entry_null_range(entry, vstruct_next(entry)); + return ret; + } + + if (journal_entry_err_on(clock->rw > 1, + c, version, jset, entry, + journal_entry_clock_bad_rw, + "bad rw")) { + journal_entry_null_range(entry, vstruct_next(entry)); + return ret; + } + +fsck_err: + return ret; +} + +static void journal_entry_clock_to_text(struct printbuf *out, struct bch_fs *c, + struct jset_entry *entry) +{ + struct jset_entry_clock *clock = + container_of(entry, struct jset_entry_clock, entry); + + prt_printf(out, "%s=%llu", clock->rw ? "write" : "read", le64_to_cpu(clock->time)); +} + +static int journal_entry_dev_usage_validate(struct bch_fs *c, + struct jset *jset, + struct jset_entry *entry, + unsigned version, int big_endian, + enum bkey_invalid_flags flags) +{ + struct jset_entry_dev_usage *u = + container_of(entry, struct jset_entry_dev_usage, entry); + unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64); + unsigned expected = sizeof(*u); + unsigned dev; + int ret = 0; + + if (journal_entry_err_on(bytes < expected, + c, version, jset, entry, + journal_entry_dev_usage_bad_size, + "bad size (%u < %u)", + bytes, expected)) { + journal_entry_null_range(entry, vstruct_next(entry)); + return ret; + } + + dev = le32_to_cpu(u->dev); + + if (journal_entry_err_on(!bch2_dev_exists2(c, dev), + c, version, jset, entry, + journal_entry_dev_usage_bad_dev, + "bad dev")) { + journal_entry_null_range(entry, vstruct_next(entry)); + return ret; + } + + if (journal_entry_err_on(u->pad, + c, version, jset, entry, + journal_entry_dev_usage_bad_pad, + "bad pad")) { + journal_entry_null_range(entry, vstruct_next(entry)); + return ret; + } + +fsck_err: + return ret; +} + +static void journal_entry_dev_usage_to_text(struct printbuf *out, struct bch_fs *c, + struct jset_entry *entry) +{ + struct jset_entry_dev_usage *u = + container_of(entry, struct jset_entry_dev_usage, entry); + unsigned i, nr_types = jset_entry_dev_usage_nr_types(u); + + prt_printf(out, "dev=%u", le32_to_cpu(u->dev)); + + for (i = 0; i < nr_types; i++) { + if (i < BCH_DATA_NR) + prt_printf(out, " %s", bch2_data_types[i]); + else + prt_printf(out, " (unknown data type %u)", i); + prt_printf(out, ": buckets=%llu sectors=%llu fragmented=%llu", + le64_to_cpu(u->d[i].buckets), + le64_to_cpu(u->d[i].sectors), + le64_to_cpu(u->d[i].fragmented)); + } + + prt_printf(out, " buckets_ec: %llu", le64_to_cpu(u->buckets_ec)); +} + +static int journal_entry_log_validate(struct bch_fs *c, + struct jset *jset, + struct jset_entry *entry, + unsigned version, int big_endian, + enum bkey_invalid_flags flags) +{ + return 0; +} + +static void journal_entry_log_to_text(struct printbuf *out, struct bch_fs *c, + struct jset_entry *entry) +{ + struct jset_entry_log *l = container_of(entry, struct jset_entry_log, entry); + unsigned bytes = vstruct_bytes(entry) - offsetof(struct jset_entry_log, d); + + prt_printf(out, "%.*s", bytes, l->d); +} + +static int journal_entry_overwrite_validate(struct bch_fs *c, + struct jset *jset, + struct jset_entry *entry, + unsigned version, int big_endian, + enum bkey_invalid_flags flags) +{ + return journal_entry_btree_keys_validate(c, jset, entry, + version, big_endian, READ); +} + +static void journal_entry_overwrite_to_text(struct printbuf *out, struct bch_fs *c, + struct jset_entry *entry) +{ + journal_entry_btree_keys_to_text(out, c, entry); +} + +struct jset_entry_ops { + int (*validate)(struct bch_fs *, struct jset *, + struct jset_entry *, unsigned, int, + enum bkey_invalid_flags); + void (*to_text)(struct printbuf *, struct bch_fs *, struct jset_entry *); +}; + +static const struct jset_entry_ops bch2_jset_entry_ops[] = { +#define x(f, nr) \ + [BCH_JSET_ENTRY_##f] = (struct jset_entry_ops) { \ + .validate = journal_entry_##f##_validate, \ + .to_text = journal_entry_##f##_to_text, \ + }, + BCH_JSET_ENTRY_TYPES() +#undef x +}; + +int bch2_journal_entry_validate(struct bch_fs *c, + struct jset *jset, + struct jset_entry *entry, + unsigned version, int big_endian, + enum bkey_invalid_flags flags) +{ + return entry->type < BCH_JSET_ENTRY_NR + ? bch2_jset_entry_ops[entry->type].validate(c, jset, entry, + version, big_endian, flags) + : 0; +} + +void bch2_journal_entry_to_text(struct printbuf *out, struct bch_fs *c, + struct jset_entry *entry) +{ + if (entry->type < BCH_JSET_ENTRY_NR) { + prt_printf(out, "%s: ", bch2_jset_entry_types[entry->type]); + bch2_jset_entry_ops[entry->type].to_text(out, c, entry); + } else { + prt_printf(out, "(unknown type %u)", entry->type); + } +} + +static int jset_validate_entries(struct bch_fs *c, struct jset *jset, + enum bkey_invalid_flags flags) +{ + struct jset_entry *entry; + unsigned version = le32_to_cpu(jset->version); + int ret = 0; + + vstruct_for_each(jset, entry) { + if (journal_entry_err_on(vstruct_next(entry) > vstruct_last(jset), + c, version, jset, entry, + journal_entry_past_jset_end, + "journal entry extends past end of jset")) { + jset->u64s = cpu_to_le32((u64 *) entry - jset->_data); + break; + } + + ret = bch2_journal_entry_validate(c, jset, entry, + version, JSET_BIG_ENDIAN(jset), flags); + if (ret) + break; + } +fsck_err: + return ret; +} + +static int jset_validate(struct bch_fs *c, + struct bch_dev *ca, + struct jset *jset, u64 sector, + enum bkey_invalid_flags flags) +{ + unsigned version; + int ret = 0; + + if (le64_to_cpu(jset->magic) != jset_magic(c)) + return JOURNAL_ENTRY_NONE; + + version = le32_to_cpu(jset->version); + if (journal_entry_err_on(!bch2_version_compatible(version), + c, version, jset, NULL, + jset_unsupported_version, + "%s sector %llu seq %llu: incompatible journal entry version %u.%u", + ca ? ca->name : c->name, + sector, le64_to_cpu(jset->seq), + BCH_VERSION_MAJOR(version), + BCH_VERSION_MINOR(version))) { + /* don't try to continue: */ + return -EINVAL; + } + + if (journal_entry_err_on(!bch2_checksum_type_valid(c, JSET_CSUM_TYPE(jset)), + c, version, jset, NULL, + jset_unknown_csum, + "%s sector %llu seq %llu: journal entry with unknown csum type %llu", + ca ? ca->name : c->name, + sector, le64_to_cpu(jset->seq), + JSET_CSUM_TYPE(jset))) + ret = JOURNAL_ENTRY_BAD; + + /* last_seq is ignored when JSET_NO_FLUSH is true */ + if (journal_entry_err_on(!JSET_NO_FLUSH(jset) && + le64_to_cpu(jset->last_seq) > le64_to_cpu(jset->seq), + c, version, jset, NULL, + jset_last_seq_newer_than_seq, + "invalid journal entry: last_seq > seq (%llu > %llu)", + le64_to_cpu(jset->last_seq), + le64_to_cpu(jset->seq))) { + jset->last_seq = jset->seq; + return JOURNAL_ENTRY_BAD; + } + + ret = jset_validate_entries(c, jset, flags); +fsck_err: + return ret; +} + +static int jset_validate_early(struct bch_fs *c, + struct bch_dev *ca, + struct jset *jset, u64 sector, + unsigned bucket_sectors_left, + unsigned sectors_read) +{ + size_t bytes = vstruct_bytes(jset); + unsigned version; + enum bkey_invalid_flags flags = BKEY_INVALID_JOURNAL; + int ret = 0; + + if (le64_to_cpu(jset->magic) != jset_magic(c)) + return JOURNAL_ENTRY_NONE; + + version = le32_to_cpu(jset->version); + if (journal_entry_err_on(!bch2_version_compatible(version), + c, version, jset, NULL, + jset_unsupported_version, + "%s sector %llu seq %llu: unknown journal entry version %u.%u", + ca ? ca->name : c->name, + sector, le64_to_cpu(jset->seq), + BCH_VERSION_MAJOR(version), + BCH_VERSION_MINOR(version))) { + /* don't try to continue: */ + return -EINVAL; + } + + if (bytes > (sectors_read << 9) && + sectors_read < bucket_sectors_left) + return JOURNAL_ENTRY_REREAD; + + if (journal_entry_err_on(bytes > bucket_sectors_left << 9, + c, version, jset, NULL, + jset_past_bucket_end, + "%s sector %llu seq %llu: journal entry too big (%zu bytes)", + ca ? ca->name : c->name, + sector, le64_to_cpu(jset->seq), bytes)) + le32_add_cpu(&jset->u64s, + -((bytes - (bucket_sectors_left << 9)) / 8)); +fsck_err: + return ret; +} + +struct journal_read_buf { + void *data; + size_t size; +}; + +static int journal_read_buf_realloc(struct journal_read_buf *b, + size_t new_size) +{ + void *n; + + /* the bios are sized for this many pages, max: */ + if (new_size > JOURNAL_ENTRY_SIZE_MAX) + return -BCH_ERR_ENOMEM_journal_read_buf_realloc; + + new_size = roundup_pow_of_two(new_size); + n = kvpmalloc(new_size, GFP_KERNEL); + if (!n) + return -BCH_ERR_ENOMEM_journal_read_buf_realloc; + + kvpfree(b->data, b->size); + b->data = n; + b->size = new_size; + return 0; +} + +static int journal_read_bucket(struct bch_dev *ca, + struct journal_read_buf *buf, + struct journal_list *jlist, + unsigned bucket) +{ + struct bch_fs *c = ca->fs; + struct journal_device *ja = &ca->journal; + struct jset *j = NULL; + unsigned sectors, sectors_read = 0; + u64 offset = bucket_to_sector(ca, ja->buckets[bucket]), + end = offset + ca->mi.bucket_size; + bool saw_bad = false, csum_good; + int ret = 0; + + pr_debug("reading %u", bucket); + + while (offset < end) { + if (!sectors_read) { + struct bio *bio; + unsigned nr_bvecs; +reread: + sectors_read = min_t(unsigned, + end - offset, buf->size >> 9); + nr_bvecs = buf_pages(buf->data, sectors_read << 9); + + bio = bio_kmalloc(nr_bvecs, GFP_KERNEL); + bio_init(bio, ca->disk_sb.bdev, bio->bi_inline_vecs, nr_bvecs, REQ_OP_READ); + + bio->bi_iter.bi_sector = offset; + bch2_bio_map(bio, buf->data, sectors_read << 9); + + ret = submit_bio_wait(bio); + kfree(bio); + + if (bch2_dev_io_err_on(ret, ca, BCH_MEMBER_ERROR_read, + "journal read error: sector %llu", + offset) || + bch2_meta_read_fault("journal")) { + /* + * We don't error out of the recovery process + * here, since the relevant journal entry may be + * found on a different device, and missing or + * no journal entries will be handled later + */ + return 0; + } + + j = buf->data; + } + + ret = jset_validate_early(c, ca, j, offset, + end - offset, sectors_read); + switch (ret) { + case 0: + sectors = vstruct_sectors(j, c->block_bits); + break; + case JOURNAL_ENTRY_REREAD: + if (vstruct_bytes(j) > buf->size) { + ret = journal_read_buf_realloc(buf, + vstruct_bytes(j)); + if (ret) + return ret; + } + goto reread; + case JOURNAL_ENTRY_NONE: + if (!saw_bad) + return 0; + /* + * On checksum error we don't really trust the size + * field of the journal entry we read, so try reading + * again at next block boundary: + */ + sectors = block_sectors(c); + goto next_block; + default: + return ret; + } + + /* + * This happens sometimes if we don't have discards on - + * when we've partially overwritten a bucket with new + * journal entries. We don't need the rest of the + * bucket: + */ + if (le64_to_cpu(j->seq) < ja->bucket_seq[bucket]) + return 0; + + ja->bucket_seq[bucket] = le64_to_cpu(j->seq); + + csum_good = jset_csum_good(c, j); + if (bch2_dev_io_err_on(!csum_good, ca, BCH_MEMBER_ERROR_checksum, + "journal checksum error")) + saw_bad = true; + + ret = bch2_encrypt(c, JSET_CSUM_TYPE(j), journal_nonce(j), + j->encrypted_start, + vstruct_end(j) - (void *) j->encrypted_start); + bch2_fs_fatal_err_on(ret, c, + "error decrypting journal entry: %i", ret); + + mutex_lock(&jlist->lock); + ret = journal_entry_add(c, ca, (struct journal_ptr) { + .csum_good = csum_good, + .dev = ca->dev_idx, + .bucket = bucket, + .bucket_offset = offset - + bucket_to_sector(ca, ja->buckets[bucket]), + .sector = offset, + }, jlist, j); + mutex_unlock(&jlist->lock); + + switch (ret) { + case JOURNAL_ENTRY_ADD_OK: + break; + case JOURNAL_ENTRY_ADD_OUT_OF_RANGE: + break; + default: + return ret; + } +next_block: + pr_debug("next"); + offset += sectors; + sectors_read -= sectors; + j = ((void *) j) + (sectors << 9); + } + + return 0; +} + +static CLOSURE_CALLBACK(bch2_journal_read_device) +{ + closure_type(ja, struct journal_device, read); + struct bch_dev *ca = container_of(ja, struct bch_dev, journal); + struct bch_fs *c = ca->fs; + struct journal_list *jlist = + container_of(cl->parent, struct journal_list, cl); + struct journal_replay *r, **_r; + struct genradix_iter iter; + struct journal_read_buf buf = { NULL, 0 }; + unsigned i; + int ret = 0; + + if (!ja->nr) + goto out; + + ret = journal_read_buf_realloc(&buf, PAGE_SIZE); + if (ret) + goto err; + + pr_debug("%u journal buckets", ja->nr); + + for (i = 0; i < ja->nr; i++) { + ret = journal_read_bucket(ca, &buf, jlist, i); + if (ret) + goto err; + } + + ja->sectors_free = ca->mi.bucket_size; + + mutex_lock(&jlist->lock); + genradix_for_each_reverse(&c->journal_entries, iter, _r) { + r = *_r; + + if (!r) + continue; + + for (i = 0; i < r->nr_ptrs; i++) { + if (r->ptrs[i].dev == ca->dev_idx) { + unsigned wrote = bucket_remainder(ca, r->ptrs[i].sector) + + vstruct_sectors(&r->j, c->block_bits); + + ja->cur_idx = r->ptrs[i].bucket; + ja->sectors_free = ca->mi.bucket_size - wrote; + goto found; + } + } + } +found: + mutex_unlock(&jlist->lock); + + if (ja->bucket_seq[ja->cur_idx] && + ja->sectors_free == ca->mi.bucket_size) { +#if 0 + /* + * Debug code for ZNS support, where we (probably) want to be + * correlated where we stopped in the journal to the zone write + * points: + */ + bch_err(c, "ja->sectors_free == ca->mi.bucket_size"); + bch_err(c, "cur_idx %u/%u", ja->cur_idx, ja->nr); + for (i = 0; i < 3; i++) { + unsigned idx = (ja->cur_idx + ja->nr - 1 + i) % ja->nr; + + bch_err(c, "bucket_seq[%u] = %llu", idx, ja->bucket_seq[idx]); + } +#endif + ja->sectors_free = 0; + } + + /* + * Set dirty_idx to indicate the entire journal is full and needs to be + * reclaimed - journal reclaim will immediately reclaim whatever isn't + * pinned when it first runs: + */ + ja->discard_idx = ja->dirty_idx_ondisk = + ja->dirty_idx = (ja->cur_idx + 1) % ja->nr; +out: + bch_verbose(c, "journal read done on device %s, ret %i", ca->name, ret); + kvpfree(buf.data, buf.size); + percpu_ref_put(&ca->io_ref); + closure_return(cl); + return; +err: + mutex_lock(&jlist->lock); + jlist->ret = ret; + mutex_unlock(&jlist->lock); + goto out; +} + +void bch2_journal_ptrs_to_text(struct printbuf *out, struct bch_fs *c, + struct journal_replay *j) +{ + unsigned i; + + for (i = 0; i < j->nr_ptrs; i++) { + struct bch_dev *ca = bch_dev_bkey_exists(c, j->ptrs[i].dev); + u64 offset; + + div64_u64_rem(j->ptrs[i].sector, ca->mi.bucket_size, &offset); + + if (i) + prt_printf(out, " "); + prt_printf(out, "%u:%u:%u (sector %llu)", + j->ptrs[i].dev, + j->ptrs[i].bucket, + j->ptrs[i].bucket_offset, + j->ptrs[i].sector); + } +} + +int bch2_journal_read(struct bch_fs *c, + u64 *last_seq, + u64 *blacklist_seq, + u64 *start_seq) +{ + struct journal_list jlist; + struct journal_replay *i, **_i, *prev = NULL; + struct genradix_iter radix_iter; + struct bch_dev *ca; + unsigned iter; + struct printbuf buf = PRINTBUF; + bool degraded = false, last_write_torn = false; + u64 seq; + int ret = 0; + + closure_init_stack(&jlist.cl); + mutex_init(&jlist.lock); + jlist.last_seq = 0; + jlist.ret = 0; + + for_each_member_device(ca, c, iter) { + if (!c->opts.fsck && + !(bch2_dev_has_data(c, ca) & (1 << BCH_DATA_journal))) + continue; + + if ((ca->mi.state == BCH_MEMBER_STATE_rw || + ca->mi.state == BCH_MEMBER_STATE_ro) && + percpu_ref_tryget(&ca->io_ref)) + closure_call(&ca->journal.read, + bch2_journal_read_device, + system_unbound_wq, + &jlist.cl); + else + degraded = true; + } + + closure_sync(&jlist.cl); + + if (jlist.ret) + return jlist.ret; + + *last_seq = 0; + *start_seq = 0; + *blacklist_seq = 0; + + /* + * Find most recent flush entry, and ignore newer non flush entries - + * those entries will be blacklisted: + */ + genradix_for_each_reverse(&c->journal_entries, radix_iter, _i) { + enum bkey_invalid_flags flags = BKEY_INVALID_JOURNAL; + + i = *_i; + + if (!i || i->ignore) + continue; + + if (!*start_seq) + *blacklist_seq = *start_seq = le64_to_cpu(i->j.seq) + 1; + + if (JSET_NO_FLUSH(&i->j)) { + i->ignore = true; + continue; + } + + if (!last_write_torn && !i->csum_good) { + last_write_torn = true; + i->ignore = true; + continue; + } + + if (journal_entry_err_on(le64_to_cpu(i->j.last_seq) > le64_to_cpu(i->j.seq), + c, le32_to_cpu(i->j.version), &i->j, NULL, + jset_last_seq_newer_than_seq, + "invalid journal entry: last_seq > seq (%llu > %llu)", + le64_to_cpu(i->j.last_seq), + le64_to_cpu(i->j.seq))) + i->j.last_seq = i->j.seq; + + *last_seq = le64_to_cpu(i->j.last_seq); + *blacklist_seq = le64_to_cpu(i->j.seq) + 1; + break; + } + + if (!*start_seq) { + bch_info(c, "journal read done, but no entries found"); + return 0; + } + + if (!*last_seq) { + fsck_err(c, dirty_but_no_journal_entries_post_drop_nonflushes, + "journal read done, but no entries found after dropping non-flushes"); + return 0; + } + + bch_info(c, "journal read done, replaying entries %llu-%llu", + *last_seq, *blacklist_seq - 1); + + if (*start_seq != *blacklist_seq) + bch_info(c, "dropped unflushed entries %llu-%llu", + *blacklist_seq, *start_seq - 1); + + /* Drop blacklisted entries and entries older than last_seq: */ + genradix_for_each(&c->journal_entries, radix_iter, _i) { + i = *_i; + + if (!i || i->ignore) + continue; + + seq = le64_to_cpu(i->j.seq); + if (seq < *last_seq) { + journal_replay_free(c, i); + continue; + } + + if (bch2_journal_seq_is_blacklisted(c, seq, true)) { + fsck_err_on(!JSET_NO_FLUSH(&i->j), c, + jset_seq_blacklisted, + "found blacklisted journal entry %llu", seq); + i->ignore = true; + } + } + + /* Check for missing entries: */ + seq = *last_seq; + genradix_for_each(&c->journal_entries, radix_iter, _i) { + i = *_i; + + if (!i || i->ignore) + continue; + + BUG_ON(seq > le64_to_cpu(i->j.seq)); + + while (seq < le64_to_cpu(i->j.seq)) { + u64 missing_start, missing_end; + struct printbuf buf1 = PRINTBUF, buf2 = PRINTBUF; + + while (seq < le64_to_cpu(i->j.seq) && + bch2_journal_seq_is_blacklisted(c, seq, false)) + seq++; + + if (seq == le64_to_cpu(i->j.seq)) + break; + + missing_start = seq; + + while (seq < le64_to_cpu(i->j.seq) && + !bch2_journal_seq_is_blacklisted(c, seq, false)) + seq++; + + if (prev) { + bch2_journal_ptrs_to_text(&buf1, c, prev); + prt_printf(&buf1, " size %zu", vstruct_sectors(&prev->j, c->block_bits)); + } else + prt_printf(&buf1, "(none)"); + bch2_journal_ptrs_to_text(&buf2, c, i); + + missing_end = seq - 1; + fsck_err(c, journal_entries_missing, + "journal entries %llu-%llu missing! (replaying %llu-%llu)\n" + " prev at %s\n" + " next at %s", + missing_start, missing_end, + *last_seq, *blacklist_seq - 1, + buf1.buf, buf2.buf); + + printbuf_exit(&buf1); + printbuf_exit(&buf2); + } + + prev = i; + seq++; + } + + genradix_for_each(&c->journal_entries, radix_iter, _i) { + struct bch_replicas_padded replicas = { + .e.data_type = BCH_DATA_journal, + .e.nr_required = 1, + }; + unsigned ptr; + + i = *_i; + if (!i || i->ignore) + continue; + + for (ptr = 0; ptr < i->nr_ptrs; ptr++) { + ca = bch_dev_bkey_exists(c, i->ptrs[ptr].dev); + + if (!i->ptrs[ptr].csum_good) + bch_err_dev_offset(ca, i->ptrs[ptr].sector, + "invalid journal checksum, seq %llu%s", + le64_to_cpu(i->j.seq), + i->csum_good ? " (had good copy on another device)" : ""); + } + + ret = jset_validate(c, + bch_dev_bkey_exists(c, i->ptrs[0].dev), + &i->j, + i->ptrs[0].sector, + READ); + if (ret) + goto err; + + for (ptr = 0; ptr < i->nr_ptrs; ptr++) + replicas.e.devs[replicas.e.nr_devs++] = i->ptrs[ptr].dev; + + bch2_replicas_entry_sort(&replicas.e); + + printbuf_reset(&buf); + bch2_replicas_entry_to_text(&buf, &replicas.e); + + if (!degraded && + !bch2_replicas_marked(c, &replicas.e) && + (le64_to_cpu(i->j.seq) == *last_seq || + fsck_err(c, journal_entry_replicas_not_marked, + "superblock not marked as containing replicas for journal entry %llu\n %s", + le64_to_cpu(i->j.seq), buf.buf))) { + ret = bch2_mark_replicas(c, &replicas.e); + if (ret) + goto err; + } + } +err: +fsck_err: + printbuf_exit(&buf); + return ret; +} + +/* journal write: */ + +static void __journal_write_alloc(struct journal *j, + struct journal_buf *w, + struct dev_alloc_list *devs_sorted, + unsigned sectors, + unsigned *replicas, + unsigned replicas_want) +{ + struct bch_fs *c = container_of(j, struct bch_fs, journal); + struct journal_device *ja; + struct bch_dev *ca; + unsigned i; + + if (*replicas >= replicas_want) + return; + + for (i = 0; i < devs_sorted->nr; i++) { + ca = rcu_dereference(c->devs[devs_sorted->devs[i]]); + if (!ca) + continue; + + ja = &ca->journal; + + /* + * Check that we can use this device, and aren't already using + * it: + */ + if (!ca->mi.durability || + ca->mi.state != BCH_MEMBER_STATE_rw || + !ja->nr || + bch2_bkey_has_device_c(bkey_i_to_s_c(&w->key), ca->dev_idx) || + sectors > ja->sectors_free) + continue; + + bch2_dev_stripe_increment(ca, &j->wp.stripe); + + bch2_bkey_append_ptr(&w->key, + (struct bch_extent_ptr) { + .offset = bucket_to_sector(ca, + ja->buckets[ja->cur_idx]) + + ca->mi.bucket_size - + ja->sectors_free, + .dev = ca->dev_idx, + }); + + ja->sectors_free -= sectors; + ja->bucket_seq[ja->cur_idx] = le64_to_cpu(w->data->seq); + + *replicas += ca->mi.durability; + + if (*replicas >= replicas_want) + break; + } +} + +/** + * journal_write_alloc - decide where to write next journal entry + * + * @j: journal object + * @w: journal buf (entry to be written) + * + * Returns: 0 on success, or -EROFS on failure + */ +static int journal_write_alloc(struct journal *j, struct journal_buf *w) +{ + struct bch_fs *c = container_of(j, struct bch_fs, journal); + struct bch_devs_mask devs; + struct journal_device *ja; + struct bch_dev *ca; + struct dev_alloc_list devs_sorted; + unsigned sectors = vstruct_sectors(w->data, c->block_bits); + unsigned target = c->opts.metadata_target ?: + c->opts.foreground_target; + unsigned i, replicas = 0, replicas_want = + READ_ONCE(c->opts.metadata_replicas); + + rcu_read_lock(); +retry: + devs = target_rw_devs(c, BCH_DATA_journal, target); + + devs_sorted = bch2_dev_alloc_list(c, &j->wp.stripe, &devs); + + __journal_write_alloc(j, w, &devs_sorted, + sectors, &replicas, replicas_want); + + if (replicas >= replicas_want) + goto done; + + for (i = 0; i < devs_sorted.nr; i++) { + ca = rcu_dereference(c->devs[devs_sorted.devs[i]]); + if (!ca) + continue; + + ja = &ca->journal; + + if (sectors > ja->sectors_free && + sectors <= ca->mi.bucket_size && + bch2_journal_dev_buckets_available(j, ja, + journal_space_discarded)) { + ja->cur_idx = (ja->cur_idx + 1) % ja->nr; + ja->sectors_free = ca->mi.bucket_size; + + /* + * ja->bucket_seq[ja->cur_idx] must always have + * something sensible: + */ + ja->bucket_seq[ja->cur_idx] = le64_to_cpu(w->data->seq); + } + } + + __journal_write_alloc(j, w, &devs_sorted, + sectors, &replicas, replicas_want); + + if (replicas < replicas_want && target) { + /* Retry from all devices: */ + target = 0; + goto retry; + } +done: + rcu_read_unlock(); + + BUG_ON(bkey_val_u64s(&w->key.k) > BCH_REPLICAS_MAX); + + return replicas >= c->opts.metadata_replicas_required ? 0 : -EROFS; +} + +static void journal_buf_realloc(struct journal *j, struct journal_buf *buf) +{ + /* we aren't holding j->lock: */ + unsigned new_size = READ_ONCE(j->buf_size_want); + void *new_buf; + + if (buf->buf_size >= new_size) + return; + + new_buf = kvpmalloc(new_size, GFP_NOFS|__GFP_NOWARN); + if (!new_buf) + return; + + memcpy(new_buf, buf->data, buf->buf_size); + + spin_lock(&j->lock); + swap(buf->data, new_buf); + swap(buf->buf_size, new_size); + spin_unlock(&j->lock); + + kvpfree(new_buf, new_size); +} + +static inline struct journal_buf *journal_last_unwritten_buf(struct journal *j) +{ + return j->buf + (journal_last_unwritten_seq(j) & JOURNAL_BUF_MASK); +} + +static CLOSURE_CALLBACK(journal_write_done) +{ + closure_type(j, struct journal, io); + struct bch_fs *c = container_of(j, struct bch_fs, journal); + struct journal_buf *w = journal_last_unwritten_buf(j); + struct bch_replicas_padded replicas; + union journal_res_state old, new; + u64 v, seq; + int err = 0; + + bch2_time_stats_update(!JSET_NO_FLUSH(w->data) + ? j->flush_write_time + : j->noflush_write_time, j->write_start_time); + + if (!w->devs_written.nr) { + bch_err(c, "unable to write journal to sufficient devices"); + err = -EIO; + } else { + bch2_devlist_to_replicas(&replicas.e, BCH_DATA_journal, + w->devs_written); + if (bch2_mark_replicas(c, &replicas.e)) + err = -EIO; + } + + if (err) + bch2_fatal_error(c); + + spin_lock(&j->lock); + seq = le64_to_cpu(w->data->seq); + + if (seq >= j->pin.front) + journal_seq_pin(j, seq)->devs = w->devs_written; + + if (!err) { + if (!JSET_NO_FLUSH(w->data)) { + j->flushed_seq_ondisk = seq; + j->last_seq_ondisk = w->last_seq; + + bch2_do_discards(c); + closure_wake_up(&c->freelist_wait); + + bch2_reset_alloc_cursors(c); + } + } else if (!j->err_seq || seq < j->err_seq) + j->err_seq = seq; + + j->seq_ondisk = seq; + + /* + * Updating last_seq_ondisk may let bch2_journal_reclaim_work() discard + * more buckets: + * + * Must come before signaling write completion, for + * bch2_fs_journal_stop(): + */ + if (j->watermark != BCH_WATERMARK_stripe) + journal_reclaim_kick(&c->journal); + + /* also must come before signalling write completion: */ + closure_debug_destroy(cl); + + v = atomic64_read(&j->reservations.counter); + do { + old.v = new.v = v; + BUG_ON(journal_state_count(new, new.unwritten_idx)); + + new.unwritten_idx++; + } while ((v = atomic64_cmpxchg(&j->reservations.counter, + old.v, new.v)) != old.v); + + bch2_journal_reclaim_fast(j); + bch2_journal_space_available(j); + + closure_wake_up(&w->wait); + journal_wake(j); + + if (!journal_state_count(new, new.unwritten_idx) && + journal_last_unwritten_seq(j) <= journal_cur_seq(j)) { + spin_unlock(&j->lock); + closure_call(&j->io, bch2_journal_write, c->io_complete_wq, NULL); + } else if (journal_last_unwritten_seq(j) == journal_cur_seq(j) && + new.cur_entry_offset < JOURNAL_ENTRY_CLOSED_VAL) { + struct journal_buf *buf = journal_cur_buf(j); + long delta = buf->expires - jiffies; + + /* + * We don't close a journal entry to write it while there's + * previous entries still in flight - the current journal entry + * might want to be written now: + */ + + spin_unlock(&j->lock); + mod_delayed_work(c->io_complete_wq, &j->write_work, max(0L, delta)); + } else { + spin_unlock(&j->lock); + } +} + +static void journal_write_endio(struct bio *bio) +{ + struct bch_dev *ca = bio->bi_private; + struct journal *j = &ca->fs->journal; + struct journal_buf *w = journal_last_unwritten_buf(j); + unsigned long flags; + + if (bch2_dev_io_err_on(bio->bi_status, ca, BCH_MEMBER_ERROR_write, + "error writing journal entry %llu: %s", + le64_to_cpu(w->data->seq), + bch2_blk_status_to_str(bio->bi_status)) || + bch2_meta_write_fault("journal")) { + spin_lock_irqsave(&j->err_lock, flags); + bch2_dev_list_drop_dev(&w->devs_written, ca->dev_idx); + spin_unlock_irqrestore(&j->err_lock, flags); + } + + closure_put(&j->io); + percpu_ref_put(&ca->io_ref); +} + +static CLOSURE_CALLBACK(do_journal_write) +{ + closure_type(j, struct journal, io); + struct bch_fs *c = container_of(j, struct bch_fs, journal); + struct bch_dev *ca; + struct journal_buf *w = journal_last_unwritten_buf(j); + struct bch_extent_ptr *ptr; + struct bio *bio; + unsigned sectors = vstruct_sectors(w->data, c->block_bits); + + extent_for_each_ptr(bkey_i_to_s_extent(&w->key), ptr) { + ca = bch_dev_bkey_exists(c, ptr->dev); + if (!percpu_ref_tryget(&ca->io_ref)) { + /* XXX: fix this */ + bch_err(c, "missing device for journal write\n"); + continue; + } + + this_cpu_add(ca->io_done->sectors[WRITE][BCH_DATA_journal], + sectors); + + bio = ca->journal.bio; + bio_reset(bio, ca->disk_sb.bdev, REQ_OP_WRITE|REQ_SYNC|REQ_META); + bio->bi_iter.bi_sector = ptr->offset; + bio->bi_end_io = journal_write_endio; + bio->bi_private = ca; + + BUG_ON(bio->bi_iter.bi_sector == ca->prev_journal_sector); + ca->prev_journal_sector = bio->bi_iter.bi_sector; + + if (!JSET_NO_FLUSH(w->data)) + bio->bi_opf |= REQ_FUA; + if (!JSET_NO_FLUSH(w->data) && !w->separate_flush) + bio->bi_opf |= REQ_PREFLUSH; + + bch2_bio_map(bio, w->data, sectors << 9); + + trace_and_count(c, journal_write, bio); + closure_bio_submit(bio, cl); + + ca->journal.bucket_seq[ca->journal.cur_idx] = + le64_to_cpu(w->data->seq); + } + + continue_at(cl, journal_write_done, c->io_complete_wq); +} + +static int bch2_journal_write_prep(struct journal *j, struct journal_buf *w) +{ + struct bch_fs *c = container_of(j, struct bch_fs, journal); + struct jset_entry *start, *end, *i, *next, *prev = NULL; + struct jset *jset = w->data; + unsigned sectors, bytes, u64s; + bool validate_before_checksum = false; + unsigned long btree_roots_have = 0; + int ret; + + /* + * Simple compaction, dropping empty jset_entries (from journal + * reservations that weren't fully used) and merging jset_entries that + * can be. + * + * If we wanted to be really fancy here, we could sort all the keys in + * the jset and drop keys that were overwritten - probably not worth it: + */ + vstruct_for_each_safe(jset, i, next) { + unsigned u64s = le16_to_cpu(i->u64s); + + /* Empty entry: */ + if (!u64s) + continue; + + /* + * New btree roots are set by journalling them; when the journal + * entry gets written we have to propagate them to + * c->btree_roots + * + * But, every journal entry we write has to contain all the + * btree roots (at least for now); so after we copy btree roots + * to c->btree_roots we have to get any missing btree roots and + * add them to this journal entry: + */ + if (i->type == BCH_JSET_ENTRY_btree_root) { + bch2_journal_entry_to_btree_root(c, i); + __set_bit(i->btree_id, &btree_roots_have); + } + + /* Can we merge with previous entry? */ + if (prev && + i->btree_id == prev->btree_id && + i->level == prev->level && + i->type == prev->type && + i->type == BCH_JSET_ENTRY_btree_keys && + le16_to_cpu(prev->u64s) + u64s <= U16_MAX) { + memmove_u64s_down(vstruct_next(prev), + i->_data, + u64s); + le16_add_cpu(&prev->u64s, u64s); + continue; + } + + /* Couldn't merge, move i into new position (after prev): */ + prev = prev ? vstruct_next(prev) : jset->start; + if (i != prev) + memmove_u64s_down(prev, i, jset_u64s(u64s)); + } + + prev = prev ? vstruct_next(prev) : jset->start; + jset->u64s = cpu_to_le32((u64 *) prev - jset->_data); + + start = end = vstruct_last(jset); + + end = bch2_btree_roots_to_journal_entries(c, end, btree_roots_have); + + bch2_journal_super_entries_add_common(c, &end, + le64_to_cpu(jset->seq)); + u64s = (u64 *) end - (u64 *) start; + BUG_ON(u64s > j->entry_u64s_reserved); + + le32_add_cpu(&jset->u64s, u64s); + + sectors = vstruct_sectors(jset, c->block_bits); + bytes = vstruct_bytes(jset); + + if (sectors > w->sectors) { + bch2_fs_fatal_error(c, "aieeee! journal write overran available space, %zu > %u (extra %u reserved %u/%u)", + vstruct_bytes(jset), w->sectors << 9, + u64s, w->u64s_reserved, j->entry_u64s_reserved); + return -EINVAL; + } + + jset->magic = cpu_to_le64(jset_magic(c)); + jset->version = cpu_to_le32(c->sb.version); + + SET_JSET_BIG_ENDIAN(jset, CPU_BIG_ENDIAN); + SET_JSET_CSUM_TYPE(jset, bch2_meta_checksum_type(c)); + + if (!JSET_NO_FLUSH(jset) && journal_entry_empty(jset)) + j->last_empty_seq = le64_to_cpu(jset->seq); + + if (bch2_csum_type_is_encryption(JSET_CSUM_TYPE(jset))) + validate_before_checksum = true; + + if (le32_to_cpu(jset->version) < bcachefs_metadata_version_current) + validate_before_checksum = true; + + if (validate_before_checksum && + (ret = jset_validate(c, NULL, jset, 0, WRITE))) + return ret; + + ret = bch2_encrypt(c, JSET_CSUM_TYPE(jset), journal_nonce(jset), + jset->encrypted_start, + vstruct_end(jset) - (void *) jset->encrypted_start); + if (bch2_fs_fatal_err_on(ret, c, + "error decrypting journal entry: %i", ret)) + return ret; + + jset->csum = csum_vstruct(c, JSET_CSUM_TYPE(jset), + journal_nonce(jset), jset); + + if (!validate_before_checksum && + (ret = jset_validate(c, NULL, jset, 0, WRITE))) + return ret; + + memset((void *) jset + bytes, 0, (sectors << 9) - bytes); + return 0; +} + +static int bch2_journal_write_pick_flush(struct journal *j, struct journal_buf *w) +{ + struct bch_fs *c = container_of(j, struct bch_fs, journal); + int error = bch2_journal_error(j); + + /* + * If the journal is in an error state - we did an emergency shutdown - + * we prefer to continue doing journal writes. We just mark them as + * noflush so they'll never be used, but they'll still be visible by the + * list_journal tool - this helps in debugging. + * + * There's a caveat: the first journal write after marking the + * superblock dirty must always be a flush write, because on startup + * from a clean shutdown we didn't necessarily read the journal and the + * new journal write might overwrite whatever was in the journal + * previously - we can't leave the journal without any flush writes in + * it. + * + * So if we're in an error state, and we're still starting up, we don't + * write anything at all. + */ + if (error && test_bit(JOURNAL_NEED_FLUSH_WRITE, &j->flags)) + return -EIO; + + if (error || + w->noflush || + (!w->must_flush && + (jiffies - j->last_flush_write) < msecs_to_jiffies(c->opts.journal_flush_delay) && + test_bit(JOURNAL_MAY_SKIP_FLUSH, &j->flags))) { + w->noflush = true; + SET_JSET_NO_FLUSH(w->data, true); + w->data->last_seq = 0; + w->last_seq = 0; + + j->nr_noflush_writes++; + } else { + j->last_flush_write = jiffies; + j->nr_flush_writes++; + clear_bit(JOURNAL_NEED_FLUSH_WRITE, &j->flags); + } + + return 0; +} + +CLOSURE_CALLBACK(bch2_journal_write) +{ + closure_type(j, struct journal, io); + struct bch_fs *c = container_of(j, struct bch_fs, journal); + struct bch_dev *ca; + struct journal_buf *w = journal_last_unwritten_buf(j); + struct bch_replicas_padded replicas; + struct bio *bio; + struct printbuf journal_debug_buf = PRINTBUF; + unsigned i, nr_rw_members = 0; + int ret; + + BUG_ON(BCH_SB_CLEAN(c->disk_sb.sb)); + + j->write_start_time = local_clock(); + + spin_lock(&j->lock); + ret = bch2_journal_write_pick_flush(j, w); + spin_unlock(&j->lock); + if (ret) + goto err; + + journal_buf_realloc(j, w); + + ret = bch2_journal_write_prep(j, w); + if (ret) + goto err; + + while (1) { + spin_lock(&j->lock); + ret = journal_write_alloc(j, w); + if (!ret || !j->can_discard) + break; + + spin_unlock(&j->lock); + bch2_journal_do_discards(j); + } + + if (ret) { + __bch2_journal_debug_to_text(&journal_debug_buf, j); + spin_unlock(&j->lock); + bch_err(c, "Unable to allocate journal write:\n%s", + journal_debug_buf.buf); + printbuf_exit(&journal_debug_buf); + goto err; + } + + /* + * write is allocated, no longer need to account for it in + * bch2_journal_space_available(): + */ + w->sectors = 0; + + /* + * journal entry has been compacted and allocated, recalculate space + * available: + */ + bch2_journal_space_available(j); + spin_unlock(&j->lock); + + w->devs_written = bch2_bkey_devs(bkey_i_to_s_c(&w->key)); + + if (c->opts.nochanges) + goto no_io; + + for_each_rw_member(ca, c, i) + nr_rw_members++; + + if (nr_rw_members > 1) + w->separate_flush = true; + + /* + * Mark journal replicas before we submit the write to guarantee + * recovery will find the journal entries after a crash. + */ + bch2_devlist_to_replicas(&replicas.e, BCH_DATA_journal, + w->devs_written); + ret = bch2_mark_replicas(c, &replicas.e); + if (ret) + goto err; + + if (!JSET_NO_FLUSH(w->data) && w->separate_flush) { + for_each_rw_member(ca, c, i) { + percpu_ref_get(&ca->io_ref); + + bio = ca->journal.bio; + bio_reset(bio, ca->disk_sb.bdev, REQ_OP_FLUSH); + bio->bi_end_io = journal_write_endio; + bio->bi_private = ca; + closure_bio_submit(bio, cl); + } + } + + continue_at(cl, do_journal_write, c->io_complete_wq); + return; +no_io: + continue_at(cl, journal_write_done, c->io_complete_wq); + return; +err: + bch2_fatal_error(c); + continue_at(cl, journal_write_done, c->io_complete_wq); +} diff --git a/fs/bcachefs/journal_io.h b/fs/bcachefs/journal_io.h new file mode 100644 index 000000000000..c035e7c108e1 --- /dev/null +++ b/fs/bcachefs/journal_io.h @@ -0,0 +1,65 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_JOURNAL_IO_H +#define _BCACHEFS_JOURNAL_IO_H + +/* + * Only used for holding the journal entries we read in btree_journal_read() + * during cache_registration + */ +struct journal_replay { + struct journal_ptr { + bool csum_good; + u8 dev; + u32 bucket; + u32 bucket_offset; + u64 sector; + } ptrs[BCH_REPLICAS_MAX]; + unsigned nr_ptrs; + + bool csum_good; + bool ignore; + /* must be last: */ + struct jset j; +}; + +static inline struct jset_entry *__jset_entry_type_next(struct jset *jset, + struct jset_entry *entry, unsigned type) +{ + while (entry < vstruct_last(jset)) { + if (entry->type == type) + return entry; + + entry = vstruct_next(entry); + } + + return NULL; +} + +#define for_each_jset_entry_type(entry, jset, type) \ + for (entry = (jset)->start; \ + (entry = __jset_entry_type_next(jset, entry, type)); \ + entry = vstruct_next(entry)) + +#define jset_entry_for_each_key(_e, _k) \ + for (_k = (_e)->start; \ + _k < vstruct_last(_e); \ + _k = bkey_next(_k)) + +#define for_each_jset_key(k, entry, jset) \ + for_each_jset_entry_type(entry, jset, BCH_JSET_ENTRY_btree_keys)\ + jset_entry_for_each_key(entry, k) + +int bch2_journal_entry_validate(struct bch_fs *, struct jset *, + struct jset_entry *, unsigned, int, + enum bkey_invalid_flags); +void bch2_journal_entry_to_text(struct printbuf *, struct bch_fs *, + struct jset_entry *); + +void bch2_journal_ptrs_to_text(struct printbuf *, struct bch_fs *, + struct journal_replay *); + +int bch2_journal_read(struct bch_fs *, u64 *, u64 *, u64 *); + +CLOSURE_CALLBACK(bch2_journal_write); + +#endif /* _BCACHEFS_JOURNAL_IO_H */ diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c new file mode 100644 index 000000000000..ec712104addb --- /dev/null +++ b/fs/bcachefs/journal_reclaim.c @@ -0,0 +1,867 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" +#include "btree_key_cache.h" +#include "btree_update.h" +#include "buckets.h" +#include "errcode.h" +#include "error.h" +#include "journal.h" +#include "journal_io.h" +#include "journal_reclaim.h" +#include "replicas.h" +#include "sb-members.h" +#include "trace.h" + +#include <linux/kthread.h> +#include <linux/sched/mm.h> + +/* Free space calculations: */ + +static unsigned journal_space_from(struct journal_device *ja, + enum journal_space_from from) +{ + switch (from) { + case journal_space_discarded: + return ja->discard_idx; + case journal_space_clean_ondisk: + return ja->dirty_idx_ondisk; + case journal_space_clean: + return ja->dirty_idx; + default: + BUG(); + } +} + +unsigned bch2_journal_dev_buckets_available(struct journal *j, + struct journal_device *ja, + enum journal_space_from from) +{ + unsigned available = (journal_space_from(ja, from) - + ja->cur_idx - 1 + ja->nr) % ja->nr; + + /* + * Don't use the last bucket unless writing the new last_seq + * will make another bucket available: + */ + if (available && ja->dirty_idx_ondisk == ja->dirty_idx) + --available; + + return available; +} + +static inline void journal_set_watermark(struct journal *j, bool low_on_space) +{ + unsigned watermark = BCH_WATERMARK_stripe; + + if (low_on_space) + watermark = max_t(unsigned, watermark, BCH_WATERMARK_reclaim); + if (fifo_free(&j->pin) < j->pin.size / 4) + watermark = max_t(unsigned, watermark, BCH_WATERMARK_reclaim); + + if (watermark == j->watermark) + return; + + swap(watermark, j->watermark); + if (watermark > j->watermark) + journal_wake(j); +} + +static struct journal_space +journal_dev_space_available(struct journal *j, struct bch_dev *ca, + enum journal_space_from from) +{ + struct journal_device *ja = &ca->journal; + unsigned sectors, buckets, unwritten; + u64 seq; + + if (from == journal_space_total) + return (struct journal_space) { + .next_entry = ca->mi.bucket_size, + .total = ca->mi.bucket_size * ja->nr, + }; + + buckets = bch2_journal_dev_buckets_available(j, ja, from); + sectors = ja->sectors_free; + + /* + * We that we don't allocate the space for a journal entry + * until we write it out - thus, account for it here: + */ + for (seq = journal_last_unwritten_seq(j); + seq <= journal_cur_seq(j); + seq++) { + unwritten = j->buf[seq & JOURNAL_BUF_MASK].sectors; + + if (!unwritten) + continue; + + /* entry won't fit on this device, skip: */ + if (unwritten > ca->mi.bucket_size) + continue; + + if (unwritten >= sectors) { + if (!buckets) { + sectors = 0; + break; + } + + buckets--; + sectors = ca->mi.bucket_size; + } + + sectors -= unwritten; + } + + if (sectors < ca->mi.bucket_size && buckets) { + buckets--; + sectors = ca->mi.bucket_size; + } + + return (struct journal_space) { + .next_entry = sectors, + .total = sectors + buckets * ca->mi.bucket_size, + }; +} + +static struct journal_space __journal_space_available(struct journal *j, unsigned nr_devs_want, + enum journal_space_from from) +{ + struct bch_fs *c = container_of(j, struct bch_fs, journal); + struct bch_dev *ca; + unsigned i, pos, nr_devs = 0; + struct journal_space space, dev_space[BCH_SB_MEMBERS_MAX]; + + BUG_ON(nr_devs_want > ARRAY_SIZE(dev_space)); + + rcu_read_lock(); + for_each_member_device_rcu(ca, c, i, + &c->rw_devs[BCH_DATA_journal]) { + if (!ca->journal.nr) + continue; + + space = journal_dev_space_available(j, ca, from); + if (!space.next_entry) + continue; + + for (pos = 0; pos < nr_devs; pos++) + if (space.total > dev_space[pos].total) + break; + + array_insert_item(dev_space, nr_devs, pos, space); + } + rcu_read_unlock(); + + if (nr_devs < nr_devs_want) + return (struct journal_space) { 0, 0 }; + + /* + * We sorted largest to smallest, and we want the smallest out of the + * @nr_devs_want largest devices: + */ + return dev_space[nr_devs_want - 1]; +} + +void bch2_journal_space_available(struct journal *j) +{ + struct bch_fs *c = container_of(j, struct bch_fs, journal); + struct bch_dev *ca; + unsigned clean, clean_ondisk, total; + unsigned max_entry_size = min(j->buf[0].buf_size >> 9, + j->buf[1].buf_size >> 9); + unsigned i, nr_online = 0, nr_devs_want; + bool can_discard = false; + int ret = 0; + + lockdep_assert_held(&j->lock); + + rcu_read_lock(); + for_each_member_device_rcu(ca, c, i, + &c->rw_devs[BCH_DATA_journal]) { + struct journal_device *ja = &ca->journal; + + if (!ja->nr) + continue; + + while (ja->dirty_idx != ja->cur_idx && + ja->bucket_seq[ja->dirty_idx] < journal_last_seq(j)) + ja->dirty_idx = (ja->dirty_idx + 1) % ja->nr; + + while (ja->dirty_idx_ondisk != ja->dirty_idx && + ja->bucket_seq[ja->dirty_idx_ondisk] < j->last_seq_ondisk) + ja->dirty_idx_ondisk = (ja->dirty_idx_ondisk + 1) % ja->nr; + + if (ja->discard_idx != ja->dirty_idx_ondisk) + can_discard = true; + + max_entry_size = min_t(unsigned, max_entry_size, ca->mi.bucket_size); + nr_online++; + } + rcu_read_unlock(); + + j->can_discard = can_discard; + + if (nr_online < c->opts.metadata_replicas_required) { + ret = JOURNAL_ERR_insufficient_devices; + goto out; + } + + nr_devs_want = min_t(unsigned, nr_online, c->opts.metadata_replicas); + + for (i = 0; i < journal_space_nr; i++) + j->space[i] = __journal_space_available(j, nr_devs_want, i); + + clean_ondisk = j->space[journal_space_clean_ondisk].total; + clean = j->space[journal_space_clean].total; + total = j->space[journal_space_total].total; + + if (!j->space[journal_space_discarded].next_entry) + ret = JOURNAL_ERR_journal_full; + + if ((j->space[journal_space_clean_ondisk].next_entry < + j->space[journal_space_clean_ondisk].total) && + (clean - clean_ondisk <= total / 8) && + (clean_ondisk * 2 > clean)) + set_bit(JOURNAL_MAY_SKIP_FLUSH, &j->flags); + else + clear_bit(JOURNAL_MAY_SKIP_FLUSH, &j->flags); + + journal_set_watermark(j, clean * 4 <= total); +out: + j->cur_entry_sectors = !ret ? j->space[journal_space_discarded].next_entry : 0; + j->cur_entry_error = ret; + + if (!ret) + journal_wake(j); +} + +/* Discards - last part of journal reclaim: */ + +static bool should_discard_bucket(struct journal *j, struct journal_device *ja) +{ + bool ret; + + spin_lock(&j->lock); + ret = ja->discard_idx != ja->dirty_idx_ondisk; + spin_unlock(&j->lock); + + return ret; +} + +/* + * Advance ja->discard_idx as long as it points to buckets that are no longer + * dirty, issuing discards if necessary: + */ +void bch2_journal_do_discards(struct journal *j) +{ + struct bch_fs *c = container_of(j, struct bch_fs, journal); + struct bch_dev *ca; + unsigned iter; + + mutex_lock(&j->discard_lock); + + for_each_rw_member(ca, c, iter) { + struct journal_device *ja = &ca->journal; + + while (should_discard_bucket(j, ja)) { + if (!c->opts.nochanges && + ca->mi.discard && + bdev_max_discard_sectors(ca->disk_sb.bdev)) + blkdev_issue_discard(ca->disk_sb.bdev, + bucket_to_sector(ca, + ja->buckets[ja->discard_idx]), + ca->mi.bucket_size, GFP_NOFS); + + spin_lock(&j->lock); + ja->discard_idx = (ja->discard_idx + 1) % ja->nr; + + bch2_journal_space_available(j); + spin_unlock(&j->lock); + } + } + + mutex_unlock(&j->discard_lock); +} + +/* + * Journal entry pinning - machinery for holding a reference on a given journal + * entry, holding it open to ensure it gets replayed during recovery: + */ + +void bch2_journal_reclaim_fast(struct journal *j) +{ + bool popped = false; + + lockdep_assert_held(&j->lock); + + /* + * Unpin journal entries whose reference counts reached zero, meaning + * all btree nodes got written out + */ + while (!fifo_empty(&j->pin) && + !atomic_read(&fifo_peek_front(&j->pin).count)) { + j->pin.front++; + popped = true; + } + + if (popped) + bch2_journal_space_available(j); +} + +bool __bch2_journal_pin_put(struct journal *j, u64 seq) +{ + struct journal_entry_pin_list *pin_list = journal_seq_pin(j, seq); + + return atomic_dec_and_test(&pin_list->count); +} + +void bch2_journal_pin_put(struct journal *j, u64 seq) +{ + if (__bch2_journal_pin_put(j, seq)) { + spin_lock(&j->lock); + bch2_journal_reclaim_fast(j); + spin_unlock(&j->lock); + } +} + +static inline bool __journal_pin_drop(struct journal *j, + struct journal_entry_pin *pin) +{ + struct journal_entry_pin_list *pin_list; + + if (!journal_pin_active(pin)) + return false; + + if (j->flush_in_progress == pin) + j->flush_in_progress_dropped = true; + + pin_list = journal_seq_pin(j, pin->seq); + pin->seq = 0; + list_del_init(&pin->list); + + /* + * Unpinning a journal entry may make journal_next_bucket() succeed, if + * writing a new last_seq will now make another bucket available: + */ + return atomic_dec_and_test(&pin_list->count) && + pin_list == &fifo_peek_front(&j->pin); +} + +void bch2_journal_pin_drop(struct journal *j, + struct journal_entry_pin *pin) +{ + spin_lock(&j->lock); + if (__journal_pin_drop(j, pin)) + bch2_journal_reclaim_fast(j); + spin_unlock(&j->lock); +} + +static enum journal_pin_type journal_pin_type(journal_pin_flush_fn fn) +{ + if (fn == bch2_btree_node_flush0 || + fn == bch2_btree_node_flush1) + return JOURNAL_PIN_btree; + else if (fn == bch2_btree_key_cache_journal_flush) + return JOURNAL_PIN_key_cache; + else + return JOURNAL_PIN_other; +} + +void bch2_journal_pin_set(struct journal *j, u64 seq, + struct journal_entry_pin *pin, + journal_pin_flush_fn flush_fn) +{ + struct journal_entry_pin_list *pin_list; + bool reclaim; + + spin_lock(&j->lock); + + if (seq < journal_last_seq(j)) { + /* + * bch2_journal_pin_copy() raced with bch2_journal_pin_drop() on + * the src pin - with the pin dropped, the entry to pin might no + * longer to exist, but that means there's no longer anything to + * copy and we can bail out here: + */ + spin_unlock(&j->lock); + return; + } + + pin_list = journal_seq_pin(j, seq); + + reclaim = __journal_pin_drop(j, pin); + + atomic_inc(&pin_list->count); + pin->seq = seq; + pin->flush = flush_fn; + + if (flush_fn) + list_add(&pin->list, &pin_list->list[journal_pin_type(flush_fn)]); + else + list_add(&pin->list, &pin_list->flushed); + + if (reclaim) + bch2_journal_reclaim_fast(j); + spin_unlock(&j->lock); + + /* + * If the journal is currently full, we might want to call flush_fn + * immediately: + */ + journal_wake(j); +} + +/** + * bch2_journal_pin_flush: ensure journal pin callback is no longer running + * @j: journal object + * @pin: pin to flush + */ +void bch2_journal_pin_flush(struct journal *j, struct journal_entry_pin *pin) +{ + BUG_ON(journal_pin_active(pin)); + + wait_event(j->pin_flush_wait, j->flush_in_progress != pin); +} + +/* + * Journal reclaim: flush references to open journal entries to reclaim space in + * the journal + * + * May be done by the journal code in the background as needed to free up space + * for more journal entries, or as part of doing a clean shutdown, or to migrate + * data off of a specific device: + */ + +static struct journal_entry_pin * +journal_get_next_pin(struct journal *j, + u64 seq_to_flush, + unsigned allowed_below_seq, + unsigned allowed_above_seq, + u64 *seq) +{ + struct journal_entry_pin_list *pin_list; + struct journal_entry_pin *ret = NULL; + unsigned i; + + fifo_for_each_entry_ptr(pin_list, &j->pin, *seq) { + if (*seq > seq_to_flush && !allowed_above_seq) + break; + + for (i = 0; i < JOURNAL_PIN_NR; i++) + if ((((1U << i) & allowed_below_seq) && *seq <= seq_to_flush) || + ((1U << i) & allowed_above_seq)) { + ret = list_first_entry_or_null(&pin_list->list[i], + struct journal_entry_pin, list); + if (ret) + return ret; + } + } + + return NULL; +} + +/* returns true if we did work */ +static size_t journal_flush_pins(struct journal *j, + u64 seq_to_flush, + unsigned allowed_below_seq, + unsigned allowed_above_seq, + unsigned min_any, + unsigned min_key_cache) +{ + struct journal_entry_pin *pin; + size_t nr_flushed = 0; + journal_pin_flush_fn flush_fn; + u64 seq; + int err; + + lockdep_assert_held(&j->reclaim_lock); + + while (1) { + unsigned allowed_above = allowed_above_seq; + unsigned allowed_below = allowed_below_seq; + + if (min_any) { + allowed_above |= ~0; + allowed_below |= ~0; + } + + if (min_key_cache) { + allowed_above |= 1U << JOURNAL_PIN_key_cache; + allowed_below |= 1U << JOURNAL_PIN_key_cache; + } + + cond_resched(); + + j->last_flushed = jiffies; + + spin_lock(&j->lock); + pin = journal_get_next_pin(j, seq_to_flush, allowed_below, allowed_above, &seq); + if (pin) { + BUG_ON(j->flush_in_progress); + j->flush_in_progress = pin; + j->flush_in_progress_dropped = false; + flush_fn = pin->flush; + } + spin_unlock(&j->lock); + + if (!pin) + break; + + if (min_key_cache && pin->flush == bch2_btree_key_cache_journal_flush) + min_key_cache--; + + if (min_any) + min_any--; + + err = flush_fn(j, pin, seq); + + spin_lock(&j->lock); + /* Pin might have been dropped or rearmed: */ + if (likely(!err && !j->flush_in_progress_dropped)) + list_move(&pin->list, &journal_seq_pin(j, seq)->flushed); + j->flush_in_progress = NULL; + j->flush_in_progress_dropped = false; + spin_unlock(&j->lock); + + wake_up(&j->pin_flush_wait); + + if (err) + break; + + nr_flushed++; + } + + return nr_flushed; +} + +static u64 journal_seq_to_flush(struct journal *j) +{ + struct bch_fs *c = container_of(j, struct bch_fs, journal); + struct bch_dev *ca; + u64 seq_to_flush = 0; + unsigned iter; + + spin_lock(&j->lock); + + for_each_rw_member(ca, c, iter) { + struct journal_device *ja = &ca->journal; + unsigned nr_buckets, bucket_to_flush; + + if (!ja->nr) + continue; + + /* Try to keep the journal at most half full: */ + nr_buckets = ja->nr / 2; + + nr_buckets = min(nr_buckets, ja->nr); + + bucket_to_flush = (ja->cur_idx + nr_buckets) % ja->nr; + seq_to_flush = max(seq_to_flush, + ja->bucket_seq[bucket_to_flush]); + } + + /* Also flush if the pin fifo is more than half full */ + seq_to_flush = max_t(s64, seq_to_flush, + (s64) journal_cur_seq(j) - + (j->pin.size >> 1)); + spin_unlock(&j->lock); + + return seq_to_flush; +} + +/** + * __bch2_journal_reclaim - free up journal buckets + * @j: journal object + * @direct: direct or background reclaim? + * @kicked: requested to run since we last ran? + * Returns: 0 on success, or -EIO if the journal has been shutdown + * + * Background journal reclaim writes out btree nodes. It should be run + * early enough so that we never completely run out of journal buckets. + * + * High watermarks for triggering background reclaim: + * - FIFO has fewer than 512 entries left + * - fewer than 25% journal buckets free + * + * Background reclaim runs until low watermarks are reached: + * - FIFO has more than 1024 entries left + * - more than 50% journal buckets free + * + * As long as a reclaim can complete in the time it takes to fill up + * 512 journal entries or 25% of all journal buckets, then + * journal_next_bucket() should not stall. + */ +static int __bch2_journal_reclaim(struct journal *j, bool direct, bool kicked) +{ + struct bch_fs *c = container_of(j, struct bch_fs, journal); + bool kthread = (current->flags & PF_KTHREAD) != 0; + u64 seq_to_flush; + size_t min_nr, min_key_cache, nr_flushed; + unsigned flags; + int ret = 0; + + /* + * We can't invoke memory reclaim while holding the reclaim_lock - + * journal reclaim is required to make progress for memory reclaim + * (cleaning the caches), so we can't get stuck in memory reclaim while + * we're holding the reclaim lock: + */ + lockdep_assert_held(&j->reclaim_lock); + flags = memalloc_noreclaim_save(); + + do { + if (kthread && kthread_should_stop()) + break; + + if (bch2_journal_error(j)) { + ret = -EIO; + break; + } + + bch2_journal_do_discards(j); + + seq_to_flush = journal_seq_to_flush(j); + min_nr = 0; + + /* + * If it's been longer than j->reclaim_delay_ms since we last flushed, + * make sure to flush at least one journal pin: + */ + if (time_after(jiffies, j->last_flushed + + msecs_to_jiffies(c->opts.journal_reclaim_delay))) + min_nr = 1; + + if (j->watermark != BCH_WATERMARK_stripe) + min_nr = 1; + + if (atomic_read(&c->btree_cache.dirty) * 2 > c->btree_cache.used) + min_nr = 1; + + min_key_cache = min(bch2_nr_btree_keys_need_flush(c), (size_t) 128); + + trace_and_count(c, journal_reclaim_start, c, + direct, kicked, + min_nr, min_key_cache, + atomic_read(&c->btree_cache.dirty), + c->btree_cache.used, + atomic_long_read(&c->btree_key_cache.nr_dirty), + atomic_long_read(&c->btree_key_cache.nr_keys)); + + nr_flushed = journal_flush_pins(j, seq_to_flush, + ~0, 0, + min_nr, min_key_cache); + + if (direct) + j->nr_direct_reclaim += nr_flushed; + else + j->nr_background_reclaim += nr_flushed; + trace_and_count(c, journal_reclaim_finish, c, nr_flushed); + + if (nr_flushed) + wake_up(&j->reclaim_wait); + } while ((min_nr || min_key_cache) && nr_flushed && !direct); + + memalloc_noreclaim_restore(flags); + + return ret; +} + +int bch2_journal_reclaim(struct journal *j) +{ + return __bch2_journal_reclaim(j, true, true); +} + +static int bch2_journal_reclaim_thread(void *arg) +{ + struct journal *j = arg; + struct bch_fs *c = container_of(j, struct bch_fs, journal); + unsigned long delay, now; + bool journal_empty; + int ret = 0; + + set_freezable(); + + j->last_flushed = jiffies; + + while (!ret && !kthread_should_stop()) { + bool kicked = j->reclaim_kicked; + + j->reclaim_kicked = false; + + mutex_lock(&j->reclaim_lock); + ret = __bch2_journal_reclaim(j, false, kicked); + mutex_unlock(&j->reclaim_lock); + + now = jiffies; + delay = msecs_to_jiffies(c->opts.journal_reclaim_delay); + j->next_reclaim = j->last_flushed + delay; + + if (!time_in_range(j->next_reclaim, now, now + delay)) + j->next_reclaim = now + delay; + + while (1) { + set_current_state(TASK_INTERRUPTIBLE|TASK_FREEZABLE); + if (kthread_should_stop()) + break; + if (j->reclaim_kicked) + break; + + spin_lock(&j->lock); + journal_empty = fifo_empty(&j->pin); + spin_unlock(&j->lock); + + if (journal_empty) + schedule(); + else if (time_after(j->next_reclaim, jiffies)) + schedule_timeout(j->next_reclaim - jiffies); + else + break; + } + __set_current_state(TASK_RUNNING); + } + + return 0; +} + +void bch2_journal_reclaim_stop(struct journal *j) +{ + struct task_struct *p = j->reclaim_thread; + + j->reclaim_thread = NULL; + + if (p) { + kthread_stop(p); + put_task_struct(p); + } +} + +int bch2_journal_reclaim_start(struct journal *j) +{ + struct bch_fs *c = container_of(j, struct bch_fs, journal); + struct task_struct *p; + int ret; + + if (j->reclaim_thread) + return 0; + + p = kthread_create(bch2_journal_reclaim_thread, j, + "bch-reclaim/%s", c->name); + ret = PTR_ERR_OR_ZERO(p); + if (ret) { + bch_err_msg(c, ret, "creating journal reclaim thread"); + return ret; + } + + get_task_struct(p); + j->reclaim_thread = p; + wake_up_process(p); + return 0; +} + +static int journal_flush_done(struct journal *j, u64 seq_to_flush, + bool *did_work) +{ + int ret; + + ret = bch2_journal_error(j); + if (ret) + return ret; + + mutex_lock(&j->reclaim_lock); + + if (journal_flush_pins(j, seq_to_flush, + (1U << JOURNAL_PIN_key_cache)| + (1U << JOURNAL_PIN_other), 0, 0, 0) || + journal_flush_pins(j, seq_to_flush, + (1U << JOURNAL_PIN_btree), 0, 0, 0)) + *did_work = true; + + if (seq_to_flush > journal_cur_seq(j)) + bch2_journal_entry_close(j); + + spin_lock(&j->lock); + /* + * If journal replay hasn't completed, the unreplayed journal entries + * hold refs on their corresponding sequence numbers + */ + ret = !test_bit(JOURNAL_REPLAY_DONE, &j->flags) || + journal_last_seq(j) > seq_to_flush || + !fifo_used(&j->pin); + + spin_unlock(&j->lock); + mutex_unlock(&j->reclaim_lock); + + return ret; +} + +bool bch2_journal_flush_pins(struct journal *j, u64 seq_to_flush) +{ + bool did_work = false; + + if (!test_bit(JOURNAL_STARTED, &j->flags)) + return false; + + closure_wait_event(&j->async_wait, + journal_flush_done(j, seq_to_flush, &did_work)); + + return did_work; +} + +int bch2_journal_flush_device_pins(struct journal *j, int dev_idx) +{ + struct bch_fs *c = container_of(j, struct bch_fs, journal); + struct journal_entry_pin_list *p; + u64 iter, seq = 0; + int ret = 0; + + spin_lock(&j->lock); + fifo_for_each_entry_ptr(p, &j->pin, iter) + if (dev_idx >= 0 + ? bch2_dev_list_has_dev(p->devs, dev_idx) + : p->devs.nr < c->opts.metadata_replicas) + seq = iter; + spin_unlock(&j->lock); + + bch2_journal_flush_pins(j, seq); + + ret = bch2_journal_error(j); + if (ret) + return ret; + + mutex_lock(&c->replicas_gc_lock); + bch2_replicas_gc_start(c, 1 << BCH_DATA_journal); + + /* + * Now that we've populated replicas_gc, write to the journal to mark + * active journal devices. This handles the case where the journal might + * be empty. Otherwise we could clear all journal replicas and + * temporarily put the fs into an unrecoverable state. Journal recovery + * expects to find devices marked for journal data on unclean mount. + */ + ret = bch2_journal_meta(&c->journal); + if (ret) + goto err; + + seq = 0; + spin_lock(&j->lock); + while (!ret) { + struct bch_replicas_padded replicas; + + seq = max(seq, journal_last_seq(j)); + if (seq >= j->pin.back) + break; + bch2_devlist_to_replicas(&replicas.e, BCH_DATA_journal, + journal_seq_pin(j, seq)->devs); + seq++; + + spin_unlock(&j->lock); + ret = bch2_mark_replicas(c, &replicas.e); + spin_lock(&j->lock); + } + spin_unlock(&j->lock); +err: + ret = bch2_replicas_gc_end(c, ret); + mutex_unlock(&c->replicas_gc_lock); + + return ret; +} diff --git a/fs/bcachefs/journal_reclaim.h b/fs/bcachefs/journal_reclaim.h new file mode 100644 index 000000000000..494d1a6eddb0 --- /dev/null +++ b/fs/bcachefs/journal_reclaim.h @@ -0,0 +1,87 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_JOURNAL_RECLAIM_H +#define _BCACHEFS_JOURNAL_RECLAIM_H + +#define JOURNAL_PIN (32 * 1024) + +static inline void journal_reclaim_kick(struct journal *j) +{ + struct task_struct *p = READ_ONCE(j->reclaim_thread); + + j->reclaim_kicked = true; + if (p) + wake_up_process(p); +} + +unsigned bch2_journal_dev_buckets_available(struct journal *, + struct journal_device *, + enum journal_space_from); +void bch2_journal_space_available(struct journal *); + +static inline bool journal_pin_active(struct journal_entry_pin *pin) +{ + return pin->seq != 0; +} + +static inline struct journal_entry_pin_list * +journal_seq_pin(struct journal *j, u64 seq) +{ + EBUG_ON(seq < j->pin.front || seq >= j->pin.back); + + return &j->pin.data[seq & j->pin.mask]; +} + +void bch2_journal_reclaim_fast(struct journal *); +bool __bch2_journal_pin_put(struct journal *, u64); +void bch2_journal_pin_put(struct journal *, u64); +void bch2_journal_pin_drop(struct journal *, struct journal_entry_pin *); + +void bch2_journal_pin_set(struct journal *, u64, struct journal_entry_pin *, + journal_pin_flush_fn); + +static inline void bch2_journal_pin_add(struct journal *j, u64 seq, + struct journal_entry_pin *pin, + journal_pin_flush_fn flush_fn) +{ + if (unlikely(!journal_pin_active(pin) || pin->seq > seq)) + bch2_journal_pin_set(j, seq, pin, flush_fn); +} + +static inline void bch2_journal_pin_copy(struct journal *j, + struct journal_entry_pin *dst, + struct journal_entry_pin *src, + journal_pin_flush_fn flush_fn) +{ + /* Guard against racing with journal_pin_drop(src): */ + u64 seq = READ_ONCE(src->seq); + + if (seq) + bch2_journal_pin_add(j, seq, dst, flush_fn); +} + +static inline void bch2_journal_pin_update(struct journal *j, u64 seq, + struct journal_entry_pin *pin, + journal_pin_flush_fn flush_fn) +{ + if (unlikely(!journal_pin_active(pin) || pin->seq < seq)) + bch2_journal_pin_set(j, seq, pin, flush_fn); +} + +void bch2_journal_pin_flush(struct journal *, struct journal_entry_pin *); + +void bch2_journal_do_discards(struct journal *); +int bch2_journal_reclaim(struct journal *); + +void bch2_journal_reclaim_stop(struct journal *); +int bch2_journal_reclaim_start(struct journal *); + +bool bch2_journal_flush_pins(struct journal *, u64); + +static inline bool bch2_journal_flush_all_pins(struct journal *j) +{ + return bch2_journal_flush_pins(j, U64_MAX); +} + +int bch2_journal_flush_device_pins(struct journal *, int); + +#endif /* _BCACHEFS_JOURNAL_RECLAIM_H */ diff --git a/fs/bcachefs/journal_sb.c b/fs/bcachefs/journal_sb.c new file mode 100644 index 000000000000..ae4fb8c3a2bc --- /dev/null +++ b/fs/bcachefs/journal_sb.c @@ -0,0 +1,219 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" +#include "journal_sb.h" +#include "darray.h" + +#include <linux/sort.h> + +/* BCH_SB_FIELD_journal: */ + +static int u64_cmp(const void *_l, const void *_r) +{ + const u64 *l = _l; + const u64 *r = _r; + + return cmp_int(*l, *r); +} + +static int bch2_sb_journal_validate(struct bch_sb *sb, + struct bch_sb_field *f, + struct printbuf *err) +{ + struct bch_sb_field_journal *journal = field_to_type(f, journal); + struct bch_member m = bch2_sb_member_get(sb, sb->dev_idx); + int ret = -BCH_ERR_invalid_sb_journal; + unsigned nr; + unsigned i; + u64 *b; + + nr = bch2_nr_journal_buckets(journal); + if (!nr) + return 0; + + b = kmalloc_array(nr, sizeof(u64), GFP_KERNEL); + if (!b) + return -BCH_ERR_ENOMEM_sb_journal_validate; + + for (i = 0; i < nr; i++) + b[i] = le64_to_cpu(journal->buckets[i]); + + sort(b, nr, sizeof(u64), u64_cmp, NULL); + + if (!b[0]) { + prt_printf(err, "journal bucket at sector 0"); + goto err; + } + + if (b[0] < le16_to_cpu(m.first_bucket)) { + prt_printf(err, "journal bucket %llu before first bucket %u", + b[0], le16_to_cpu(m.first_bucket)); + goto err; + } + + if (b[nr - 1] >= le64_to_cpu(m.nbuckets)) { + prt_printf(err, "journal bucket %llu past end of device (nbuckets %llu)", + b[nr - 1], le64_to_cpu(m.nbuckets)); + goto err; + } + + for (i = 0; i + 1 < nr; i++) + if (b[i] == b[i + 1]) { + prt_printf(err, "duplicate journal buckets %llu", b[i]); + goto err; + } + + ret = 0; +err: + kfree(b); + return ret; +} + +static void bch2_sb_journal_to_text(struct printbuf *out, struct bch_sb *sb, + struct bch_sb_field *f) +{ + struct bch_sb_field_journal *journal = field_to_type(f, journal); + unsigned i, nr = bch2_nr_journal_buckets(journal); + + prt_printf(out, "Buckets: "); + for (i = 0; i < nr; i++) + prt_printf(out, " %llu", le64_to_cpu(journal->buckets[i])); + prt_newline(out); +} + +const struct bch_sb_field_ops bch_sb_field_ops_journal = { + .validate = bch2_sb_journal_validate, + .to_text = bch2_sb_journal_to_text, +}; + +struct u64_range { + u64 start; + u64 end; +}; + +static int u64_range_cmp(const void *_l, const void *_r) +{ + const struct u64_range *l = _l; + const struct u64_range *r = _r; + + return cmp_int(l->start, r->start); +} + +static int bch2_sb_journal_v2_validate(struct bch_sb *sb, + struct bch_sb_field *f, + struct printbuf *err) +{ + struct bch_sb_field_journal_v2 *journal = field_to_type(f, journal_v2); + struct bch_member m = bch2_sb_member_get(sb, sb->dev_idx); + int ret = -BCH_ERR_invalid_sb_journal; + unsigned nr; + unsigned i; + struct u64_range *b; + + nr = bch2_sb_field_journal_v2_nr_entries(journal); + if (!nr) + return 0; + + b = kmalloc_array(nr, sizeof(*b), GFP_KERNEL); + if (!b) + return -BCH_ERR_ENOMEM_sb_journal_v2_validate; + + for (i = 0; i < nr; i++) { + b[i].start = le64_to_cpu(journal->d[i].start); + b[i].end = b[i].start + le64_to_cpu(journal->d[i].nr); + } + + sort(b, nr, sizeof(*b), u64_range_cmp, NULL); + + if (!b[0].start) { + prt_printf(err, "journal bucket at sector 0"); + goto err; + } + + if (b[0].start < le16_to_cpu(m.first_bucket)) { + prt_printf(err, "journal bucket %llu before first bucket %u", + b[0].start, le16_to_cpu(m.first_bucket)); + goto err; + } + + if (b[nr - 1].end > le64_to_cpu(m.nbuckets)) { + prt_printf(err, "journal bucket %llu past end of device (nbuckets %llu)", + b[nr - 1].end - 1, le64_to_cpu(m.nbuckets)); + goto err; + } + + for (i = 0; i + 1 < nr; i++) { + if (b[i].end > b[i + 1].start) { + prt_printf(err, "duplicate journal buckets in ranges %llu-%llu, %llu-%llu", + b[i].start, b[i].end, b[i + 1].start, b[i + 1].end); + goto err; + } + } + + ret = 0; +err: + kfree(b); + return ret; +} + +static void bch2_sb_journal_v2_to_text(struct printbuf *out, struct bch_sb *sb, + struct bch_sb_field *f) +{ + struct bch_sb_field_journal_v2 *journal = field_to_type(f, journal_v2); + unsigned i, nr = bch2_sb_field_journal_v2_nr_entries(journal); + + prt_printf(out, "Buckets: "); + for (i = 0; i < nr; i++) + prt_printf(out, " %llu-%llu", + le64_to_cpu(journal->d[i].start), + le64_to_cpu(journal->d[i].start) + le64_to_cpu(journal->d[i].nr)); + prt_newline(out); +} + +const struct bch_sb_field_ops bch_sb_field_ops_journal_v2 = { + .validate = bch2_sb_journal_v2_validate, + .to_text = bch2_sb_journal_v2_to_text, +}; + +int bch2_journal_buckets_to_sb(struct bch_fs *c, struct bch_dev *ca, + u64 *buckets, unsigned nr) +{ + struct bch_sb_field_journal_v2 *j; + unsigned i, dst = 0, nr_compacted = 1; + + if (c) + lockdep_assert_held(&c->sb_lock); + + if (!nr) { + bch2_sb_field_delete(&ca->disk_sb, BCH_SB_FIELD_journal); + bch2_sb_field_delete(&ca->disk_sb, BCH_SB_FIELD_journal_v2); + return 0; + } + + for (i = 0; i + 1 < nr; i++) + if (buckets[i] + 1 != buckets[i + 1]) + nr_compacted++; + + j = bch2_sb_field_resize(&ca->disk_sb, journal_v2, + (sizeof(*j) + sizeof(j->d[0]) * nr_compacted) / sizeof(u64)); + if (!j) + return -BCH_ERR_ENOSPC_sb_journal; + + bch2_sb_field_delete(&ca->disk_sb, BCH_SB_FIELD_journal); + + j->d[dst].start = cpu_to_le64(buckets[0]); + j->d[dst].nr = cpu_to_le64(1); + + for (i = 1; i < nr; i++) { + if (buckets[i] == buckets[i - 1] + 1) { + le64_add_cpu(&j->d[dst].nr, 1); + } else { + dst++; + j->d[dst].start = cpu_to_le64(buckets[i]); + j->d[dst].nr = cpu_to_le64(1); + } + } + + BUG_ON(dst + 1 != nr_compacted); + return 0; +} diff --git a/fs/bcachefs/journal_sb.h b/fs/bcachefs/journal_sb.h new file mode 100644 index 000000000000..ba40a7e8d90a --- /dev/null +++ b/fs/bcachefs/journal_sb.h @@ -0,0 +1,24 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#include "super-io.h" +#include "vstructs.h" + +static inline unsigned bch2_nr_journal_buckets(struct bch_sb_field_journal *j) +{ + return j + ? (__le64 *) vstruct_end(&j->field) - j->buckets + : 0; +} + +static inline unsigned bch2_sb_field_journal_v2_nr_entries(struct bch_sb_field_journal_v2 *j) +{ + if (!j) + return 0; + + return (struct bch_sb_field_journal_v2_entry *) vstruct_end(&j->field) - &j->d[0]; +} + +extern const struct bch_sb_field_ops bch_sb_field_ops_journal; +extern const struct bch_sb_field_ops bch_sb_field_ops_journal_v2; + +int bch2_journal_buckets_to_sb(struct bch_fs *, struct bch_dev *, u64 *, unsigned); diff --git a/fs/bcachefs/journal_seq_blacklist.c b/fs/bcachefs/journal_seq_blacklist.c new file mode 100644 index 000000000000..f9d9aa95bf3a --- /dev/null +++ b/fs/bcachefs/journal_seq_blacklist.c @@ -0,0 +1,320 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" +#include "btree_iter.h" +#include "eytzinger.h" +#include "journal_seq_blacklist.h" +#include "super-io.h" + +/* + * journal_seq_blacklist machinery: + * + * To guarantee order of btree updates after a crash, we need to detect when a + * btree node entry (bset) is newer than the newest journal entry that was + * successfully written, and ignore it - effectively ignoring any btree updates + * that didn't make it into the journal. + * + * If we didn't do this, we might have two btree nodes, a and b, both with + * updates that weren't written to the journal yet: if b was updated after a, + * but b was flushed and not a - oops; on recovery we'll find that the updates + * to b happened, but not the updates to a that happened before it. + * + * Ignoring bsets that are newer than the newest journal entry is always safe, + * because everything they contain will also have been journalled - and must + * still be present in the journal on disk until a journal entry has been + * written _after_ that bset was written. + * + * To accomplish this, bsets record the newest journal sequence number they + * contain updates for; then, on startup, the btree code queries the journal + * code to ask "Is this sequence number newer than the newest journal entry? If + * so, ignore it." + * + * When this happens, we must blacklist that journal sequence number: the + * journal must not write any entries with that sequence number, and it must + * record that it was blacklisted so that a) on recovery we don't think we have + * missing journal entries and b) so that the btree code continues to ignore + * that bset, until that btree node is rewritten. + */ + +static unsigned sb_blacklist_u64s(unsigned nr) +{ + struct bch_sb_field_journal_seq_blacklist *bl; + + return (sizeof(*bl) + sizeof(bl->start[0]) * nr) / sizeof(u64); +} + +static struct bch_sb_field_journal_seq_blacklist * +blacklist_entry_try_merge(struct bch_fs *c, + struct bch_sb_field_journal_seq_blacklist *bl, + unsigned i) +{ + unsigned nr = blacklist_nr_entries(bl); + + if (le64_to_cpu(bl->start[i].end) >= + le64_to_cpu(bl->start[i + 1].start)) { + bl->start[i].end = bl->start[i + 1].end; + --nr; + memmove(&bl->start[i], + &bl->start[i + 1], + sizeof(bl->start[0]) * (nr - i)); + + bl = bch2_sb_field_resize(&c->disk_sb, journal_seq_blacklist, + sb_blacklist_u64s(nr)); + BUG_ON(!bl); + } + + return bl; +} + +static bool bl_entry_contig_or_overlaps(struct journal_seq_blacklist_entry *e, + u64 start, u64 end) +{ + return !(end < le64_to_cpu(e->start) || le64_to_cpu(e->end) < start); +} + +int bch2_journal_seq_blacklist_add(struct bch_fs *c, u64 start, u64 end) +{ + struct bch_sb_field_journal_seq_blacklist *bl; + unsigned i, nr; + int ret = 0; + + mutex_lock(&c->sb_lock); + bl = bch2_sb_field_get(c->disk_sb.sb, journal_seq_blacklist); + nr = blacklist_nr_entries(bl); + + for (i = 0; i < nr; i++) { + struct journal_seq_blacklist_entry *e = + bl->start + i; + + if (bl_entry_contig_or_overlaps(e, start, end)) { + e->start = cpu_to_le64(min(start, le64_to_cpu(e->start))); + e->end = cpu_to_le64(max(end, le64_to_cpu(e->end))); + + if (i + 1 < nr) + bl = blacklist_entry_try_merge(c, + bl, i); + if (i) + bl = blacklist_entry_try_merge(c, + bl, i - 1); + goto out_write_sb; + } + } + + bl = bch2_sb_field_resize(&c->disk_sb, journal_seq_blacklist, + sb_blacklist_u64s(nr + 1)); + if (!bl) { + ret = -BCH_ERR_ENOSPC_sb_journal_seq_blacklist; + goto out; + } + + bl->start[nr].start = cpu_to_le64(start); + bl->start[nr].end = cpu_to_le64(end); +out_write_sb: + c->disk_sb.sb->features[0] |= cpu_to_le64(1ULL << BCH_FEATURE_journal_seq_blacklist_v3); + + ret = bch2_write_super(c); +out: + mutex_unlock(&c->sb_lock); + + return ret ?: bch2_blacklist_table_initialize(c); +} + +static int journal_seq_blacklist_table_cmp(const void *_l, + const void *_r, size_t size) +{ + const struct journal_seq_blacklist_table_entry *l = _l; + const struct journal_seq_blacklist_table_entry *r = _r; + + return cmp_int(l->start, r->start); +} + +bool bch2_journal_seq_is_blacklisted(struct bch_fs *c, u64 seq, + bool dirty) +{ + struct journal_seq_blacklist_table *t = c->journal_seq_blacklist_table; + struct journal_seq_blacklist_table_entry search = { .start = seq }; + int idx; + + if (!t) + return false; + + idx = eytzinger0_find_le(t->entries, t->nr, + sizeof(t->entries[0]), + journal_seq_blacklist_table_cmp, + &search); + if (idx < 0) + return false; + + BUG_ON(t->entries[idx].start > seq); + + if (seq >= t->entries[idx].end) + return false; + + if (dirty) + t->entries[idx].dirty = true; + return true; +} + +int bch2_blacklist_table_initialize(struct bch_fs *c) +{ + struct bch_sb_field_journal_seq_blacklist *bl = + bch2_sb_field_get(c->disk_sb.sb, journal_seq_blacklist); + struct journal_seq_blacklist_table *t; + unsigned i, nr = blacklist_nr_entries(bl); + + if (!bl) + return 0; + + t = kzalloc(sizeof(*t) + sizeof(t->entries[0]) * nr, + GFP_KERNEL); + if (!t) + return -BCH_ERR_ENOMEM_blacklist_table_init; + + t->nr = nr; + + for (i = 0; i < nr; i++) { + t->entries[i].start = le64_to_cpu(bl->start[i].start); + t->entries[i].end = le64_to_cpu(bl->start[i].end); + } + + eytzinger0_sort(t->entries, + t->nr, + sizeof(t->entries[0]), + journal_seq_blacklist_table_cmp, + NULL); + + kfree(c->journal_seq_blacklist_table); + c->journal_seq_blacklist_table = t; + return 0; +} + +static int bch2_sb_journal_seq_blacklist_validate(struct bch_sb *sb, + struct bch_sb_field *f, + struct printbuf *err) +{ + struct bch_sb_field_journal_seq_blacklist *bl = + field_to_type(f, journal_seq_blacklist); + unsigned i, nr = blacklist_nr_entries(bl); + + for (i = 0; i < nr; i++) { + struct journal_seq_blacklist_entry *e = bl->start + i; + + if (le64_to_cpu(e->start) >= + le64_to_cpu(e->end)) { + prt_printf(err, "entry %u start >= end (%llu >= %llu)", + i, le64_to_cpu(e->start), le64_to_cpu(e->end)); + return -BCH_ERR_invalid_sb_journal_seq_blacklist; + } + + if (i + 1 < nr && + le64_to_cpu(e[0].end) > + le64_to_cpu(e[1].start)) { + prt_printf(err, "entry %u out of order with next entry (%llu > %llu)", + i + 1, le64_to_cpu(e[0].end), le64_to_cpu(e[1].start)); + return -BCH_ERR_invalid_sb_journal_seq_blacklist; + } + } + + return 0; +} + +static void bch2_sb_journal_seq_blacklist_to_text(struct printbuf *out, + struct bch_sb *sb, + struct bch_sb_field *f) +{ + struct bch_sb_field_journal_seq_blacklist *bl = + field_to_type(f, journal_seq_blacklist); + struct journal_seq_blacklist_entry *i; + unsigned nr = blacklist_nr_entries(bl); + + for (i = bl->start; i < bl->start + nr; i++) { + if (i != bl->start) + prt_printf(out, " "); + + prt_printf(out, "%llu-%llu", + le64_to_cpu(i->start), + le64_to_cpu(i->end)); + } + prt_newline(out); +} + +const struct bch_sb_field_ops bch_sb_field_ops_journal_seq_blacklist = { + .validate = bch2_sb_journal_seq_blacklist_validate, + .to_text = bch2_sb_journal_seq_blacklist_to_text +}; + +void bch2_blacklist_entries_gc(struct work_struct *work) +{ + struct bch_fs *c = container_of(work, struct bch_fs, + journal_seq_blacklist_gc_work); + struct journal_seq_blacklist_table *t; + struct bch_sb_field_journal_seq_blacklist *bl; + struct journal_seq_blacklist_entry *src, *dst; + struct btree_trans *trans = bch2_trans_get(c); + unsigned i, nr, new_nr; + int ret; + + for (i = 0; i < BTREE_ID_NR; i++) { + struct btree_iter iter; + struct btree *b; + + bch2_trans_node_iter_init(trans, &iter, i, POS_MIN, + 0, 0, BTREE_ITER_PREFETCH); +retry: + bch2_trans_begin(trans); + + b = bch2_btree_iter_peek_node(&iter); + + while (!(ret = PTR_ERR_OR_ZERO(b)) && + b && + !test_bit(BCH_FS_STOPPING, &c->flags)) + b = bch2_btree_iter_next_node(&iter); + + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + goto retry; + + bch2_trans_iter_exit(trans, &iter); + } + + bch2_trans_put(trans); + if (ret) + return; + + mutex_lock(&c->sb_lock); + bl = bch2_sb_field_get(c->disk_sb.sb, journal_seq_blacklist); + if (!bl) + goto out; + + nr = blacklist_nr_entries(bl); + dst = bl->start; + + t = c->journal_seq_blacklist_table; + BUG_ON(nr != t->nr); + + for (src = bl->start, i = eytzinger0_first(t->nr); + src < bl->start + nr; + src++, i = eytzinger0_next(i, nr)) { + BUG_ON(t->entries[i].start != le64_to_cpu(src->start)); + BUG_ON(t->entries[i].end != le64_to_cpu(src->end)); + + if (t->entries[i].dirty) + *dst++ = *src; + } + + new_nr = dst - bl->start; + + bch_info(c, "nr blacklist entries was %u, now %u", nr, new_nr); + + if (new_nr != nr) { + bl = bch2_sb_field_resize(&c->disk_sb, journal_seq_blacklist, + new_nr ? sb_blacklist_u64s(new_nr) : 0); + BUG_ON(new_nr && !bl); + + if (!new_nr) + c->disk_sb.sb->features[0] &= cpu_to_le64(~(1ULL << BCH_FEATURE_journal_seq_blacklist_v3)); + + bch2_write_super(c); + } +out: + mutex_unlock(&c->sb_lock); +} diff --git a/fs/bcachefs/journal_seq_blacklist.h b/fs/bcachefs/journal_seq_blacklist.h new file mode 100644 index 000000000000..afb886ec8e25 --- /dev/null +++ b/fs/bcachefs/journal_seq_blacklist.h @@ -0,0 +1,22 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_JOURNAL_SEQ_BLACKLIST_H +#define _BCACHEFS_JOURNAL_SEQ_BLACKLIST_H + +static inline unsigned +blacklist_nr_entries(struct bch_sb_field_journal_seq_blacklist *bl) +{ + return bl + ? ((vstruct_end(&bl->field) - (void *) &bl->start[0]) / + sizeof(struct journal_seq_blacklist_entry)) + : 0; +} + +bool bch2_journal_seq_is_blacklisted(struct bch_fs *, u64, bool); +int bch2_journal_seq_blacklist_add(struct bch_fs *c, u64, u64); +int bch2_blacklist_table_initialize(struct bch_fs *); + +extern const struct bch_sb_field_ops bch_sb_field_ops_journal_seq_blacklist; + +void bch2_blacklist_entries_gc(struct work_struct *); + +#endif /* _BCACHEFS_JOURNAL_SEQ_BLACKLIST_H */ diff --git a/fs/bcachefs/journal_types.h b/fs/bcachefs/journal_types.h new file mode 100644 index 000000000000..a756b69582e3 --- /dev/null +++ b/fs/bcachefs/journal_types.h @@ -0,0 +1,319 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_JOURNAL_TYPES_H +#define _BCACHEFS_JOURNAL_TYPES_H + +#include <linux/cache.h> +#include <linux/workqueue.h> + +#include "alloc_types.h" +#include "super_types.h" +#include "fifo.h" + +#define JOURNAL_BUF_BITS 2 +#define JOURNAL_BUF_NR (1U << JOURNAL_BUF_BITS) +#define JOURNAL_BUF_MASK (JOURNAL_BUF_NR - 1) + +/* + * We put JOURNAL_BUF_NR of these in struct journal; we used them for writes to + * the journal that are being staged or in flight. + */ +struct journal_buf { + struct jset *data; + + __BKEY_PADDED(key, BCH_REPLICAS_MAX); + struct bch_devs_list devs_written; + + struct closure_waitlist wait; + u64 last_seq; /* copy of data->last_seq */ + long expires; + u64 flush_time; + + unsigned buf_size; /* size in bytes of @data */ + unsigned sectors; /* maximum size for current entry */ + unsigned disk_sectors; /* maximum size entry could have been, if + buf_size was bigger */ + unsigned u64s_reserved; + bool noflush; /* write has already been kicked off, and was noflush */ + bool must_flush; /* something wants a flush */ + bool separate_flush; +}; + +/* + * Something that makes a journal entry dirty - i.e. a btree node that has to be + * flushed: + */ + +enum journal_pin_type { + JOURNAL_PIN_btree, + JOURNAL_PIN_key_cache, + JOURNAL_PIN_other, + JOURNAL_PIN_NR, +}; + +struct journal_entry_pin_list { + struct list_head list[JOURNAL_PIN_NR]; + struct list_head flushed; + atomic_t count; + struct bch_devs_list devs; +}; + +struct journal; +struct journal_entry_pin; +typedef int (*journal_pin_flush_fn)(struct journal *j, + struct journal_entry_pin *, u64); + +struct journal_entry_pin { + struct list_head list; + journal_pin_flush_fn flush; + u64 seq; +}; + +struct journal_res { + bool ref; + u8 idx; + u16 u64s; + u32 offset; + u64 seq; +}; + +union journal_res_state { + struct { + atomic64_t counter; + }; + + struct { + u64 v; + }; + + struct { + u64 cur_entry_offset:20, + idx:2, + unwritten_idx:2, + buf0_count:10, + buf1_count:10, + buf2_count:10, + buf3_count:10; + }; +}; + +/* bytes: */ +#define JOURNAL_ENTRY_SIZE_MIN (64U << 10) /* 64k */ +#define JOURNAL_ENTRY_SIZE_MAX (4U << 20) /* 4M */ + +/* + * We stash some journal state as sentinal values in cur_entry_offset: + * note - cur_entry_offset is in units of u64s + */ +#define JOURNAL_ENTRY_OFFSET_MAX ((1U << 20) - 1) + +#define JOURNAL_ENTRY_CLOSED_VAL (JOURNAL_ENTRY_OFFSET_MAX - 1) +#define JOURNAL_ENTRY_ERROR_VAL (JOURNAL_ENTRY_OFFSET_MAX) + +struct journal_space { + /* Units of 512 bytes sectors: */ + unsigned next_entry; /* How big the next journal entry can be */ + unsigned total; +}; + +enum journal_space_from { + journal_space_discarded, + journal_space_clean_ondisk, + journal_space_clean, + journal_space_total, + journal_space_nr, +}; + +enum journal_flags { + JOURNAL_REPLAY_DONE, + JOURNAL_STARTED, + JOURNAL_MAY_SKIP_FLUSH, + JOURNAL_NEED_FLUSH_WRITE, +}; + +/* Reasons we may fail to get a journal reservation: */ +#define JOURNAL_ERRORS() \ + x(ok) \ + x(blocked) \ + x(max_in_flight) \ + x(journal_full) \ + x(journal_pin_full) \ + x(journal_stuck) \ + x(insufficient_devices) + +enum journal_errors { +#define x(n) JOURNAL_ERR_##n, + JOURNAL_ERRORS() +#undef x +}; + +typedef DARRAY(u64) darray_u64; + +/* Embedded in struct bch_fs */ +struct journal { + /* Fastpath stuff up front: */ + struct { + + union journal_res_state reservations; + enum bch_watermark watermark; + + } __aligned(SMP_CACHE_BYTES); + + unsigned long flags; + + /* Max size of current journal entry */ + unsigned cur_entry_u64s; + unsigned cur_entry_sectors; + + /* Reserved space in journal entry to be used just prior to write */ + unsigned entry_u64s_reserved; + + + /* + * 0, or -ENOSPC if waiting on journal reclaim, or -EROFS if + * insufficient devices: + */ + enum journal_errors cur_entry_error; + + unsigned buf_size_want; + /* + * We may queue up some things to be journalled (log messages) before + * the journal has actually started - stash them here: + */ + darray_u64 early_journal_entries; + + /* + * Two journal entries -- one is currently open for new entries, the + * other is possibly being written out. + */ + struct journal_buf buf[JOURNAL_BUF_NR]; + + spinlock_t lock; + + /* if nonzero, we may not open a new journal entry: */ + unsigned blocked; + + /* Used when waiting because the journal was full */ + wait_queue_head_t wait; + struct closure_waitlist async_wait; + struct closure_waitlist preres_wait; + + struct closure io; + struct delayed_work write_work; + + /* Sequence number of most recent journal entry (last entry in @pin) */ + atomic64_t seq; + + /* seq, last_seq from the most recent journal entry successfully written */ + u64 seq_ondisk; + u64 flushed_seq_ondisk; + u64 last_seq_ondisk; + u64 err_seq; + u64 last_empty_seq; + + /* + * FIFO of journal entries whose btree updates have not yet been + * written out. + * + * Each entry is a reference count. The position in the FIFO is the + * entry's sequence number relative to @seq. + * + * The journal entry itself holds a reference count, put when the + * journal entry is written out. Each btree node modified by the journal + * entry also holds a reference count, put when the btree node is + * written. + * + * When a reference count reaches zero, the journal entry is no longer + * needed. When all journal entries in the oldest journal bucket are no + * longer needed, the bucket can be discarded and reused. + */ + struct { + u64 front, back, size, mask; + struct journal_entry_pin_list *data; + } pin; + + struct journal_space space[journal_space_nr]; + + u64 replay_journal_seq; + u64 replay_journal_seq_end; + + struct write_point wp; + spinlock_t err_lock; + + struct mutex reclaim_lock; + /* + * Used for waiting until journal reclaim has freed up space in the + * journal: + */ + wait_queue_head_t reclaim_wait; + struct task_struct *reclaim_thread; + bool reclaim_kicked; + unsigned long next_reclaim; + u64 nr_direct_reclaim; + u64 nr_background_reclaim; + + unsigned long last_flushed; + struct journal_entry_pin *flush_in_progress; + bool flush_in_progress_dropped; + wait_queue_head_t pin_flush_wait; + + /* protects advancing ja->discard_idx: */ + struct mutex discard_lock; + bool can_discard; + + unsigned long last_flush_write; + + u64 res_get_blocked_start; + u64 write_start_time; + + u64 nr_flush_writes; + u64 nr_noflush_writes; + + struct bch2_time_stats *flush_write_time; + struct bch2_time_stats *noflush_write_time; + struct bch2_time_stats *blocked_time; + struct bch2_time_stats *flush_seq_time; + +#ifdef CONFIG_DEBUG_LOCK_ALLOC + struct lockdep_map res_map; +#endif +} __aligned(SMP_CACHE_BYTES); + +/* + * Embedded in struct bch_dev. First three fields refer to the array of journal + * buckets, in bch_sb. + */ +struct journal_device { + /* + * For each journal bucket, contains the max sequence number of the + * journal writes it contains - so we know when a bucket can be reused. + */ + u64 *bucket_seq; + + unsigned sectors_free; + + /* + * discard_idx <= dirty_idx_ondisk <= dirty_idx <= cur_idx: + */ + unsigned discard_idx; /* Next bucket to discard */ + unsigned dirty_idx_ondisk; + unsigned dirty_idx; + unsigned cur_idx; /* Journal bucket we're currently writing to */ + unsigned nr; + + u64 *buckets; + + /* Bio for journal reads/writes to this device */ + struct bio *bio; + + /* for bch_journal_read_device */ + struct closure read; +}; + +/* + * journal_entry_res - reserve space in every journal entry: + */ +struct journal_entry_res { + unsigned u64s; +}; + +#endif /* _BCACHEFS_JOURNAL_TYPES_H */ diff --git a/fs/bcachefs/keylist.c b/fs/bcachefs/keylist.c new file mode 100644 index 000000000000..5699cd4873c8 --- /dev/null +++ b/fs/bcachefs/keylist.c @@ -0,0 +1,52 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" +#include "bkey.h" +#include "keylist.h" + +int bch2_keylist_realloc(struct keylist *l, u64 *inline_u64s, + size_t nr_inline_u64s, size_t new_u64s) +{ + size_t oldsize = bch2_keylist_u64s(l); + size_t newsize = oldsize + new_u64s; + u64 *old_buf = l->keys_p == inline_u64s ? NULL : l->keys_p; + u64 *new_keys; + + newsize = roundup_pow_of_two(newsize); + + if (newsize <= nr_inline_u64s || + (old_buf && roundup_pow_of_two(oldsize) == newsize)) + return 0; + + new_keys = krealloc(old_buf, sizeof(u64) * newsize, GFP_NOFS); + if (!new_keys) + return -ENOMEM; + + if (!old_buf) + memcpy_u64s(new_keys, inline_u64s, oldsize); + + l->keys_p = new_keys; + l->top_p = new_keys + oldsize; + + return 0; +} + +void bch2_keylist_pop_front(struct keylist *l) +{ + l->top_p -= bch2_keylist_front(l)->k.u64s; + + memmove_u64s_down(l->keys, + bkey_next(l->keys), + bch2_keylist_u64s(l)); +} + +#ifdef CONFIG_BCACHEFS_DEBUG +void bch2_verify_keylist_sorted(struct keylist *l) +{ + struct bkey_i *k; + + for_each_keylist_key(l, k) + BUG_ON(bkey_next(k) != l->top && + bpos_ge(k->k.p, bkey_next(k)->k.p)); +} +#endif diff --git a/fs/bcachefs/keylist.h b/fs/bcachefs/keylist.h new file mode 100644 index 000000000000..fe759c7031e0 --- /dev/null +++ b/fs/bcachefs/keylist.h @@ -0,0 +1,74 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_KEYLIST_H +#define _BCACHEFS_KEYLIST_H + +#include "keylist_types.h" + +int bch2_keylist_realloc(struct keylist *, u64 *, size_t, size_t); +void bch2_keylist_pop_front(struct keylist *); + +static inline void bch2_keylist_init(struct keylist *l, u64 *inline_keys) +{ + l->top_p = l->keys_p = inline_keys; +} + +static inline void bch2_keylist_free(struct keylist *l, u64 *inline_keys) +{ + if (l->keys_p != inline_keys) + kfree(l->keys_p); +} + +static inline void bch2_keylist_push(struct keylist *l) +{ + l->top = bkey_next(l->top); +} + +static inline void bch2_keylist_add(struct keylist *l, const struct bkey_i *k) +{ + bkey_copy(l->top, k); + bch2_keylist_push(l); +} + +static inline bool bch2_keylist_empty(struct keylist *l) +{ + return l->top == l->keys; +} + +static inline size_t bch2_keylist_u64s(struct keylist *l) +{ + return l->top_p - l->keys_p; +} + +static inline size_t bch2_keylist_bytes(struct keylist *l) +{ + return bch2_keylist_u64s(l) * sizeof(u64); +} + +static inline struct bkey_i *bch2_keylist_front(struct keylist *l) +{ + return l->keys; +} + +#define for_each_keylist_key(_keylist, _k) \ + for (_k = (_keylist)->keys; \ + _k != (_keylist)->top; \ + _k = bkey_next(_k)) + +static inline u64 keylist_sectors(struct keylist *keys) +{ + struct bkey_i *k; + u64 ret = 0; + + for_each_keylist_key(keys, k) + ret += k->k.size; + + return ret; +} + +#ifdef CONFIG_BCACHEFS_DEBUG +void bch2_verify_keylist_sorted(struct keylist *); +#else +static inline void bch2_verify_keylist_sorted(struct keylist *l) {} +#endif + +#endif /* _BCACHEFS_KEYLIST_H */ diff --git a/fs/bcachefs/keylist_types.h b/fs/bcachefs/keylist_types.h new file mode 100644 index 000000000000..4b3ff7d8a875 --- /dev/null +++ b/fs/bcachefs/keylist_types.h @@ -0,0 +1,16 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_KEYLIST_TYPES_H +#define _BCACHEFS_KEYLIST_TYPES_H + +struct keylist { + union { + struct bkey_i *keys; + u64 *keys_p; + }; + union { + struct bkey_i *top; + u64 *top_p; + }; +}; + +#endif /* _BCACHEFS_KEYLIST_TYPES_H */ diff --git a/fs/bcachefs/logged_ops.c b/fs/bcachefs/logged_ops.c new file mode 100644 index 000000000000..8640f7dee0de --- /dev/null +++ b/fs/bcachefs/logged_ops.c @@ -0,0 +1,112 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" +#include "bkey_buf.h" +#include "btree_update.h" +#include "error.h" +#include "io_misc.h" +#include "logged_ops.h" +#include "super.h" + +struct bch_logged_op_fn { + u8 type; + int (*resume)(struct btree_trans *, struct bkey_i *); +}; + +static const struct bch_logged_op_fn logged_op_fns[] = { +#define x(n) { \ + .type = KEY_TYPE_logged_op_##n, \ + .resume = bch2_resume_logged_op_##n, \ +}, + BCH_LOGGED_OPS() +#undef x +}; + +static const struct bch_logged_op_fn *logged_op_fn(enum bch_bkey_type type) +{ + for (unsigned i = 0; i < ARRAY_SIZE(logged_op_fns); i++) + if (logged_op_fns[i].type == type) + return logged_op_fns + i; + return NULL; +} + +static int resume_logged_op(struct btree_trans *trans, struct btree_iter *iter, + struct bkey_s_c k) +{ + struct bch_fs *c = trans->c; + const struct bch_logged_op_fn *fn = logged_op_fn(k.k->type); + struct bkey_buf sk; + u32 restart_count = trans->restart_count; + int ret; + + if (!fn) + return 0; + + bch2_bkey_buf_init(&sk); + bch2_bkey_buf_reassemble(&sk, c, k); + + ret = drop_locks_do(trans, (bch2_fs_lazy_rw(c), 0)) ?: + fn->resume(trans, sk.k) ?: trans_was_restarted(trans, restart_count); + + bch2_bkey_buf_exit(&sk, c); + return ret; +} + +int bch2_resume_logged_ops(struct bch_fs *c) +{ + struct btree_iter iter; + struct bkey_s_c k; + int ret; + + ret = bch2_trans_run(c, + for_each_btree_key2(trans, iter, + BTREE_ID_logged_ops, POS_MIN, BTREE_ITER_PREFETCH, k, + resume_logged_op(trans, &iter, k))); + if (ret) + bch_err_fn(c, ret); + return ret; +} + +static int __bch2_logged_op_start(struct btree_trans *trans, struct bkey_i *k) +{ + struct btree_iter iter; + int ret; + + ret = bch2_bkey_get_empty_slot(trans, &iter, BTREE_ID_logged_ops, POS_MAX); + if (ret) + return ret; + + k->k.p = iter.pos; + + ret = bch2_trans_update(trans, &iter, k, 0); + bch2_trans_iter_exit(trans, &iter); + return ret; +} + +int bch2_logged_op_start(struct btree_trans *trans, struct bkey_i *k) +{ + return commit_do(trans, NULL, NULL, BTREE_INSERT_NOFAIL, + __bch2_logged_op_start(trans, k)); +} + +void bch2_logged_op_finish(struct btree_trans *trans, struct bkey_i *k) +{ + int ret = commit_do(trans, NULL, NULL, BTREE_INSERT_NOFAIL, + bch2_btree_delete(trans, BTREE_ID_logged_ops, k->k.p, 0)); + /* + * This needs to be a fatal error because we've left an unfinished + * operation in the logged ops btree. + * + * We should only ever see an error here if the filesystem has already + * been shut down, but make sure of that here: + */ + if (ret) { + struct bch_fs *c = trans->c; + struct printbuf buf = PRINTBUF; + + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(k)); + bch2_fs_fatal_error(c, "%s: error deleting logged operation %s: %s", + __func__, buf.buf, bch2_err_str(ret)); + printbuf_exit(&buf); + } +} diff --git a/fs/bcachefs/logged_ops.h b/fs/bcachefs/logged_ops.h new file mode 100644 index 000000000000..4d1e786a27a8 --- /dev/null +++ b/fs/bcachefs/logged_ops.h @@ -0,0 +1,20 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_LOGGED_OPS_H +#define _BCACHEFS_LOGGED_OPS_H + +#include "bkey.h" + +#define BCH_LOGGED_OPS() \ + x(truncate) \ + x(finsert) + +static inline int bch2_logged_op_update(struct btree_trans *trans, struct bkey_i *op) +{ + return bch2_btree_insert_nonextent(trans, BTREE_ID_logged_ops, op, 0); +} + +int bch2_resume_logged_ops(struct bch_fs *); +int bch2_logged_op_start(struct btree_trans *, struct bkey_i *); +void bch2_logged_op_finish(struct btree_trans *, struct bkey_i *); + +#endif /* _BCACHEFS_LOGGED_OPS_H */ diff --git a/fs/bcachefs/lru.c b/fs/bcachefs/lru.c new file mode 100644 index 000000000000..a5cc0ed195d6 --- /dev/null +++ b/fs/bcachefs/lru.c @@ -0,0 +1,164 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" +#include "alloc_background.h" +#include "btree_iter.h" +#include "btree_update.h" +#include "btree_write_buffer.h" +#include "error.h" +#include "lru.h" +#include "recovery.h" + +/* KEY_TYPE_lru is obsolete: */ +int bch2_lru_invalid(struct bch_fs *c, struct bkey_s_c k, + enum bkey_invalid_flags flags, + struct printbuf *err) +{ + int ret = 0; + + bkey_fsck_err_on(!lru_pos_time(k.k->p), c, err, + lru_entry_at_time_0, + "lru entry at time=0"); +fsck_err: + return ret; +} + +void bch2_lru_to_text(struct printbuf *out, struct bch_fs *c, + struct bkey_s_c k) +{ + const struct bch_lru *lru = bkey_s_c_to_lru(k).v; + + prt_printf(out, "idx %llu", le64_to_cpu(lru->idx)); +} + +void bch2_lru_pos_to_text(struct printbuf *out, struct bpos lru) +{ + prt_printf(out, "%llu:%llu -> %llu:%llu", + lru_pos_id(lru), + lru_pos_time(lru), + u64_to_bucket(lru.offset).inode, + u64_to_bucket(lru.offset).offset); +} + +static int __bch2_lru_set(struct btree_trans *trans, u16 lru_id, + u64 dev_bucket, u64 time, bool set) +{ + return time + ? bch2_btree_bit_mod(trans, BTREE_ID_lru, + lru_pos(lru_id, dev_bucket, time), set) + : 0; +} + +int bch2_lru_del(struct btree_trans *trans, u16 lru_id, u64 dev_bucket, u64 time) +{ + return __bch2_lru_set(trans, lru_id, dev_bucket, time, KEY_TYPE_deleted); +} + +int bch2_lru_set(struct btree_trans *trans, u16 lru_id, u64 dev_bucket, u64 time) +{ + return __bch2_lru_set(trans, lru_id, dev_bucket, time, KEY_TYPE_set); +} + +int bch2_lru_change(struct btree_trans *trans, + u16 lru_id, u64 dev_bucket, + u64 old_time, u64 new_time) +{ + if (old_time == new_time) + return 0; + + return bch2_lru_del(trans, lru_id, dev_bucket, old_time) ?: + bch2_lru_set(trans, lru_id, dev_bucket, new_time); +} + +static const char * const bch2_lru_types[] = { +#define x(n) #n, + BCH_LRU_TYPES() +#undef x + NULL +}; + +static int bch2_check_lru_key(struct btree_trans *trans, + struct btree_iter *lru_iter, + struct bkey_s_c lru_k, + struct bpos *last_flushed_pos) +{ + struct bch_fs *c = trans->c; + struct btree_iter iter; + struct bkey_s_c k; + struct bch_alloc_v4 a_convert; + const struct bch_alloc_v4 *a; + struct printbuf buf1 = PRINTBUF; + struct printbuf buf2 = PRINTBUF; + enum bch_lru_type type = lru_type(lru_k); + struct bpos alloc_pos = u64_to_bucket(lru_k.k->p.offset); + u64 idx; + int ret; + + if (fsck_err_on(!bch2_dev_bucket_exists(c, alloc_pos), c, + lru_entry_to_invalid_bucket, + "lru key points to nonexistent device:bucket %llu:%llu", + alloc_pos.inode, alloc_pos.offset)) + return bch2_btree_delete_at(trans, lru_iter, 0); + + k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_alloc, alloc_pos, 0); + ret = bkey_err(k); + if (ret) + goto err; + + a = bch2_alloc_to_v4(k, &a_convert); + + switch (type) { + case BCH_LRU_read: + idx = alloc_lru_idx_read(*a); + break; + case BCH_LRU_fragmentation: + idx = a->fragmentation_lru; + break; + } + + if (lru_k.k->type != KEY_TYPE_set || + lru_pos_time(lru_k.k->p) != idx) { + if (!bpos_eq(*last_flushed_pos, lru_k.k->p)) { + *last_flushed_pos = lru_k.k->p; + ret = bch2_btree_write_buffer_flush_sync(trans) ?: + -BCH_ERR_transaction_restart_write_buffer_flush; + goto out; + } + + if (c->opts.reconstruct_alloc || + fsck_err(c, lru_entry_bad, + "incorrect lru entry: lru %s time %llu\n" + " %s\n" + " for %s", + bch2_lru_types[type], + lru_pos_time(lru_k.k->p), + (bch2_bkey_val_to_text(&buf1, c, lru_k), buf1.buf), + (bch2_bkey_val_to_text(&buf2, c, k), buf2.buf))) + ret = bch2_btree_delete_at(trans, lru_iter, 0); + } +out: +err: +fsck_err: + bch2_trans_iter_exit(trans, &iter); + printbuf_exit(&buf2); + printbuf_exit(&buf1); + return ret; +} + +int bch2_check_lrus(struct bch_fs *c) +{ + struct btree_iter iter; + struct bkey_s_c k; + struct bpos last_flushed_pos = POS_MIN; + int ret = 0; + + ret = bch2_trans_run(c, + for_each_btree_key_commit(trans, iter, + BTREE_ID_lru, POS_MIN, BTREE_ITER_PREFETCH, k, + NULL, NULL, BTREE_INSERT_NOFAIL|BTREE_INSERT_LAZY_RW, + bch2_check_lru_key(trans, &iter, k, &last_flushed_pos))); + if (ret) + bch_err_fn(c, ret); + return ret; + +} diff --git a/fs/bcachefs/lru.h b/fs/bcachefs/lru.h new file mode 100644 index 000000000000..429dca816df5 --- /dev/null +++ b/fs/bcachefs/lru.h @@ -0,0 +1,69 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_LRU_H +#define _BCACHEFS_LRU_H + +#define LRU_TIME_BITS 48 +#define LRU_TIME_MAX ((1ULL << LRU_TIME_BITS) - 1) + +static inline u64 lru_pos_id(struct bpos pos) +{ + return pos.inode >> LRU_TIME_BITS; +} + +static inline u64 lru_pos_time(struct bpos pos) +{ + return pos.inode & ~(~0ULL << LRU_TIME_BITS); +} + +static inline struct bpos lru_pos(u16 lru_id, u64 dev_bucket, u64 time) +{ + struct bpos pos = POS(((u64) lru_id << LRU_TIME_BITS)|time, dev_bucket); + + EBUG_ON(time > LRU_TIME_MAX); + EBUG_ON(lru_pos_id(pos) != lru_id); + EBUG_ON(lru_pos_time(pos) != time); + EBUG_ON(pos.offset != dev_bucket); + + return pos; +} + +#define BCH_LRU_TYPES() \ + x(read) \ + x(fragmentation) + +enum bch_lru_type { +#define x(n) BCH_LRU_##n, + BCH_LRU_TYPES() +#undef x +}; + +#define BCH_LRU_FRAGMENTATION_START ((1U << 16) - 1) + +static inline enum bch_lru_type lru_type(struct bkey_s_c l) +{ + u16 lru_id = l.k->p.inode >> 48; + + if (lru_id == BCH_LRU_FRAGMENTATION_START) + return BCH_LRU_fragmentation; + return BCH_LRU_read; +} + +int bch2_lru_invalid(struct bch_fs *, struct bkey_s_c, + enum bkey_invalid_flags, struct printbuf *); +void bch2_lru_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); + +void bch2_lru_pos_to_text(struct printbuf *, struct bpos); + +#define bch2_bkey_ops_lru ((struct bkey_ops) { \ + .key_invalid = bch2_lru_invalid, \ + .val_to_text = bch2_lru_to_text, \ + .min_val_size = 8, \ +}) + +int bch2_lru_del(struct btree_trans *, u16, u64, u64); +int bch2_lru_set(struct btree_trans *, u16, u64, u64); +int bch2_lru_change(struct btree_trans *, u16, u64, u64, u64); + +int bch2_check_lrus(struct bch_fs *); + +#endif /* _BCACHEFS_LRU_H */ diff --git a/fs/bcachefs/mean_and_variance.c b/fs/bcachefs/mean_and_variance.c new file mode 100644 index 000000000000..1f0801e2e565 --- /dev/null +++ b/fs/bcachefs/mean_and_variance.c @@ -0,0 +1,159 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Functions for incremental mean and variance. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * Copyright © 2022 Daniel B. Hill + * + * Author: Daniel B. Hill <daniel@gluo.nz> + * + * Description: + * + * This is includes some incremental algorithms for mean and variance calculation + * + * Derived from the paper: https://fanf2.user.srcf.net/hermes/doc/antiforgery/stats.pdf + * + * Create a struct and if it's the weighted variant set the w field (weight = 2^k). + * + * Use mean_and_variance[_weighted]_update() on the struct to update it's state. + * + * Use the mean_and_variance[_weighted]_get_* functions to calculate the mean and variance, some computation + * is deferred to these functions for performance reasons. + * + * see lib/math/mean_and_variance_test.c for examples of usage. + * + * DO NOT access the mean and variance fields of the weighted variants directly. + * DO NOT change the weight after calling update. + */ + +#include <linux/bug.h> +#include <linux/compiler.h> +#include <linux/export.h> +#include <linux/limits.h> +#include <linux/math.h> +#include <linux/math64.h> +#include <linux/module.h> + +#include "mean_and_variance.h" + +u128_u u128_div(u128_u n, u64 d) +{ + u128_u r; + u64 rem; + u64 hi = u128_hi(n); + u64 lo = u128_lo(n); + u64 h = hi & ((u64) U32_MAX << 32); + u64 l = (hi & (u64) U32_MAX) << 32; + + r = u128_shl(u64_to_u128(div64_u64_rem(h, d, &rem)), 64); + r = u128_add(r, u128_shl(u64_to_u128(div64_u64_rem(l + (rem << 32), d, &rem)), 32)); + r = u128_add(r, u64_to_u128(div64_u64_rem(lo + (rem << 32), d, &rem))); + return r; +} +EXPORT_SYMBOL_GPL(u128_div); + +/** + * mean_and_variance_get_mean() - get mean from @s + */ +s64 mean_and_variance_get_mean(struct mean_and_variance s) +{ + return s.n ? div64_u64(s.sum, s.n) : 0; +} +EXPORT_SYMBOL_GPL(mean_and_variance_get_mean); + +/** + * mean_and_variance_get_variance() - get variance from @s1 + * + * see linked pdf equation 12. + */ +u64 mean_and_variance_get_variance(struct mean_and_variance s1) +{ + if (s1.n) { + u128_u s2 = u128_div(s1.sum_squares, s1.n); + u64 s3 = abs(mean_and_variance_get_mean(s1)); + + return u128_lo(u128_sub(s2, u128_square(s3))); + } else { + return 0; + } +} +EXPORT_SYMBOL_GPL(mean_and_variance_get_variance); + +/** + * mean_and_variance_get_stddev() - get standard deviation from @s + */ +u32 mean_and_variance_get_stddev(struct mean_and_variance s) +{ + return int_sqrt64(mean_and_variance_get_variance(s)); +} +EXPORT_SYMBOL_GPL(mean_and_variance_get_stddev); + +/** + * mean_and_variance_weighted_update() - exponentially weighted variant of mean_and_variance_update() + * @s1: .. + * @s2: .. + * + * see linked pdf: function derived from equations 140-143 where alpha = 2^w. + * values are stored bitshifted for performance and added precision. + */ +void mean_and_variance_weighted_update(struct mean_and_variance_weighted *s, s64 x) +{ + // previous weighted variance. + u8 w = s->weight; + u64 var_w0 = s->variance; + // new value weighted. + s64 x_w = x << w; + s64 diff_w = x_w - s->mean; + s64 diff = fast_divpow2(diff_w, w); + // new mean weighted. + s64 u_w1 = s->mean + diff; + + if (!s->init) { + s->mean = x_w; + s->variance = 0; + } else { + s->mean = u_w1; + s->variance = ((var_w0 << w) - var_w0 + ((diff_w * (x_w - u_w1)) >> w)) >> w; + } + s->init = true; +} +EXPORT_SYMBOL_GPL(mean_and_variance_weighted_update); + +/** + * mean_and_variance_weighted_get_mean() - get mean from @s + */ +s64 mean_and_variance_weighted_get_mean(struct mean_and_variance_weighted s) +{ + return fast_divpow2(s.mean, s.weight); +} +EXPORT_SYMBOL_GPL(mean_and_variance_weighted_get_mean); + +/** + * mean_and_variance_weighted_get_variance() -- get variance from @s + */ +u64 mean_and_variance_weighted_get_variance(struct mean_and_variance_weighted s) +{ + // always positive don't need fast divpow2 + return s.variance >> s.weight; +} +EXPORT_SYMBOL_GPL(mean_and_variance_weighted_get_variance); + +/** + * mean_and_variance_weighted_get_stddev() - get standard deviation from @s + */ +u32 mean_and_variance_weighted_get_stddev(struct mean_and_variance_weighted s) +{ + return int_sqrt64(mean_and_variance_weighted_get_variance(s)); +} +EXPORT_SYMBOL_GPL(mean_and_variance_weighted_get_stddev); + +MODULE_AUTHOR("Daniel B. Hill"); +MODULE_LICENSE("GPL"); diff --git a/fs/bcachefs/mean_and_variance.h b/fs/bcachefs/mean_and_variance.h new file mode 100644 index 000000000000..647505010b39 --- /dev/null +++ b/fs/bcachefs/mean_and_variance.h @@ -0,0 +1,198 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef MEAN_AND_VARIANCE_H_ +#define MEAN_AND_VARIANCE_H_ + +#include <linux/types.h> +#include <linux/limits.h> +#include <linux/math.h> +#include <linux/math64.h> + +#define SQRT_U64_MAX 4294967295ULL + +/* + * u128_u: u128 user mode, because not all architectures support a real int128 + * type + */ + +#ifdef __SIZEOF_INT128__ + +typedef struct { + unsigned __int128 v; +} __aligned(16) u128_u; + +static inline u128_u u64_to_u128(u64 a) +{ + return (u128_u) { .v = a }; +} + +static inline u64 u128_lo(u128_u a) +{ + return a.v; +} + +static inline u64 u128_hi(u128_u a) +{ + return a.v >> 64; +} + +static inline u128_u u128_add(u128_u a, u128_u b) +{ + a.v += b.v; + return a; +} + +static inline u128_u u128_sub(u128_u a, u128_u b) +{ + a.v -= b.v; + return a; +} + +static inline u128_u u128_shl(u128_u a, s8 shift) +{ + a.v <<= shift; + return a; +} + +static inline u128_u u128_square(u64 a) +{ + u128_u b = u64_to_u128(a); + + b.v *= b.v; + return b; +} + +#else + +typedef struct { + u64 hi, lo; +} __aligned(16) u128_u; + +/* conversions */ + +static inline u128_u u64_to_u128(u64 a) +{ + return (u128_u) { .lo = a }; +} + +static inline u64 u128_lo(u128_u a) +{ + return a.lo; +} + +static inline u64 u128_hi(u128_u a) +{ + return a.hi; +} + +/* arithmetic */ + +static inline u128_u u128_add(u128_u a, u128_u b) +{ + u128_u c; + + c.lo = a.lo + b.lo; + c.hi = a.hi + b.hi + (c.lo < a.lo); + return c; +} + +static inline u128_u u128_sub(u128_u a, u128_u b) +{ + u128_u c; + + c.lo = a.lo - b.lo; + c.hi = a.hi - b.hi - (c.lo > a.lo); + return c; +} + +static inline u128_u u128_shl(u128_u i, s8 shift) +{ + u128_u r; + + r.lo = i.lo << shift; + if (shift < 64) + r.hi = (i.hi << shift) | (i.lo >> (64 - shift)); + else { + r.hi = i.lo << (shift - 64); + r.lo = 0; + } + return r; +} + +static inline u128_u u128_square(u64 i) +{ + u128_u r; + u64 h = i >> 32, l = i & U32_MAX; + + r = u128_shl(u64_to_u128(h*h), 64); + r = u128_add(r, u128_shl(u64_to_u128(h*l), 32)); + r = u128_add(r, u128_shl(u64_to_u128(l*h), 32)); + r = u128_add(r, u64_to_u128(l*l)); + return r; +} + +#endif + +static inline u128_u u64s_to_u128(u64 hi, u64 lo) +{ + u128_u c = u64_to_u128(hi); + + c = u128_shl(c, 64); + c = u128_add(c, u64_to_u128(lo)); + return c; +} + +u128_u u128_div(u128_u n, u64 d); + +struct mean_and_variance { + s64 n; + s64 sum; + u128_u sum_squares; +}; + +/* expontentially weighted variant */ +struct mean_and_variance_weighted { + bool init; + u8 weight; /* base 2 logarithim */ + s64 mean; + u64 variance; +}; + +/** + * fast_divpow2() - fast approximation for n / (1 << d) + * @n: numerator + * @d: the power of 2 denominator. + * + * note: this rounds towards 0. + */ +static inline s64 fast_divpow2(s64 n, u8 d) +{ + return (n + ((n < 0) ? ((1 << d) - 1) : 0)) >> d; +} + +/** + * mean_and_variance_update() - update a mean_and_variance struct @s1 with a new sample @v1 + * and return it. + * @s1: the mean_and_variance to update. + * @v1: the new sample. + * + * see linked pdf equation 12. + */ +static inline void +mean_and_variance_update(struct mean_and_variance *s, s64 v) +{ + s->n++; + s->sum += v; + s->sum_squares = u128_add(s->sum_squares, u128_square(abs(v))); +} + +s64 mean_and_variance_get_mean(struct mean_and_variance s); +u64 mean_and_variance_get_variance(struct mean_and_variance s1); +u32 mean_and_variance_get_stddev(struct mean_and_variance s); + +void mean_and_variance_weighted_update(struct mean_and_variance_weighted *s, s64 v); + +s64 mean_and_variance_weighted_get_mean(struct mean_and_variance_weighted s); +u64 mean_and_variance_weighted_get_variance(struct mean_and_variance_weighted s); +u32 mean_and_variance_weighted_get_stddev(struct mean_and_variance_weighted s); + +#endif // MEAN_AND_VAIRANCE_H_ diff --git a/fs/bcachefs/mean_and_variance_test.c b/fs/bcachefs/mean_and_variance_test.c new file mode 100644 index 000000000000..019583c3ca0e --- /dev/null +++ b/fs/bcachefs/mean_and_variance_test.c @@ -0,0 +1,240 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <kunit/test.h> + +#include "mean_and_variance.h" + +#define MAX_SQR (SQRT_U64_MAX*SQRT_U64_MAX) + +static void mean_and_variance_basic_test(struct kunit *test) +{ + struct mean_and_variance s = {}; + + mean_and_variance_update(&s, 2); + mean_and_variance_update(&s, 2); + + KUNIT_EXPECT_EQ(test, mean_and_variance_get_mean(s), 2); + KUNIT_EXPECT_EQ(test, mean_and_variance_get_variance(s), 0); + KUNIT_EXPECT_EQ(test, s.n, 2); + + mean_and_variance_update(&s, 4); + mean_and_variance_update(&s, 4); + + KUNIT_EXPECT_EQ(test, mean_and_variance_get_mean(s), 3); + KUNIT_EXPECT_EQ(test, mean_and_variance_get_variance(s), 1); + KUNIT_EXPECT_EQ(test, s.n, 4); +} + +/* + * Test values computed using a spreadsheet from the psuedocode at the bottom: + * https://fanf2.user.srcf.net/hermes/doc/antiforgery/stats.pdf + */ + +static void mean_and_variance_weighted_test(struct kunit *test) +{ + struct mean_and_variance_weighted s = { .weight = 2 }; + + mean_and_variance_weighted_update(&s, 10); + KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s), 10); + KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s), 0); + + mean_and_variance_weighted_update(&s, 20); + KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s), 12); + KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s), 18); + + mean_and_variance_weighted_update(&s, 30); + KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s), 16); + KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s), 72); + + s = (struct mean_and_variance_weighted) { .weight = 2 }; + + mean_and_variance_weighted_update(&s, -10); + KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s), -10); + KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s), 0); + + mean_and_variance_weighted_update(&s, -20); + KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s), -12); + KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s), 18); + + mean_and_variance_weighted_update(&s, -30); + KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s), -16); + KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s), 72); +} + +static void mean_and_variance_weighted_advanced_test(struct kunit *test) +{ + struct mean_and_variance_weighted s = { .weight = 8 }; + s64 i; + + for (i = 10; i <= 100; i += 10) + mean_and_variance_weighted_update(&s, i); + + KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s), 11); + KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s), 107); + + s = (struct mean_and_variance_weighted) { .weight = 8 }; + + for (i = -10; i >= -100; i -= 10) + mean_and_variance_weighted_update(&s, i); + + KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s), -11); + KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s), 107); +} + +static void do_mean_and_variance_test(struct kunit *test, + s64 initial_value, + s64 initial_n, + s64 n, + unsigned weight, + s64 *data, + s64 *mean, + s64 *stddev, + s64 *weighted_mean, + s64 *weighted_stddev) +{ + struct mean_and_variance mv = {}; + struct mean_and_variance_weighted vw = { .weight = weight }; + + for (unsigned i = 0; i < initial_n; i++) { + mean_and_variance_update(&mv, initial_value); + mean_and_variance_weighted_update(&vw, initial_value); + + KUNIT_EXPECT_EQ(test, mean_and_variance_get_mean(mv), initial_value); + KUNIT_EXPECT_EQ(test, mean_and_variance_get_stddev(mv), 0); + KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(vw), initial_value); + KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_stddev(vw),0); + } + + for (unsigned i = 0; i < n; i++) { + mean_and_variance_update(&mv, data[i]); + mean_and_variance_weighted_update(&vw, data[i]); + + KUNIT_EXPECT_EQ(test, mean_and_variance_get_mean(mv), mean[i]); + KUNIT_EXPECT_EQ(test, mean_and_variance_get_stddev(mv), stddev[i]); + KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(vw), weighted_mean[i]); + KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_stddev(vw),weighted_stddev[i]); + } + + KUNIT_EXPECT_EQ(test, mv.n, initial_n + n); +} + +/* Test behaviour with a single outlier, then back to steady state: */ +static void mean_and_variance_test_1(struct kunit *test) +{ + s64 d[] = { 100, 10, 10, 10, 10, 10, 10 }; + s64 mean[] = { 22, 21, 20, 19, 18, 17, 16 }; + s64 stddev[] = { 32, 29, 28, 27, 26, 25, 24 }; + s64 weighted_mean[] = { 32, 27, 22, 19, 17, 15, 14 }; + s64 weighted_stddev[] = { 38, 35, 31, 27, 24, 21, 18 }; + + do_mean_and_variance_test(test, 10, 6, ARRAY_SIZE(d), 2, + d, mean, stddev, weighted_mean, weighted_stddev); +} + +static void mean_and_variance_test_2(struct kunit *test) +{ + s64 d[] = { 100, 10, 10, 10, 10, 10, 10 }; + s64 mean[] = { 10, 10, 10, 10, 10, 10, 10 }; + s64 stddev[] = { 9, 9, 9, 9, 9, 9, 9 }; + s64 weighted_mean[] = { 32, 27, 22, 19, 17, 15, 14 }; + s64 weighted_stddev[] = { 38, 35, 31, 27, 24, 21, 18 }; + + do_mean_and_variance_test(test, 10, 6, ARRAY_SIZE(d), 2, + d, mean, stddev, weighted_mean, weighted_stddev); +} + +/* Test behaviour where we switch from one steady state to another: */ +static void mean_and_variance_test_3(struct kunit *test) +{ + s64 d[] = { 100, 100, 100, 100, 100 }; + s64 mean[] = { 22, 32, 40, 46, 50 }; + s64 stddev[] = { 32, 39, 42, 44, 45 }; + s64 weighted_mean[] = { 32, 49, 61, 71, 78 }; + s64 weighted_stddev[] = { 38, 44, 44, 41, 38 }; + + do_mean_and_variance_test(test, 10, 6, ARRAY_SIZE(d), 2, + d, mean, stddev, weighted_mean, weighted_stddev); +} + +static void mean_and_variance_test_4(struct kunit *test) +{ + s64 d[] = { 100, 100, 100, 100, 100 }; + s64 mean[] = { 10, 11, 12, 13, 14 }; + s64 stddev[] = { 9, 13, 15, 17, 19 }; + s64 weighted_mean[] = { 32, 49, 61, 71, 78 }; + s64 weighted_stddev[] = { 38, 44, 44, 41, 38 }; + + do_mean_and_variance_test(test, 10, 6, ARRAY_SIZE(d), 2, + d, mean, stddev, weighted_mean, weighted_stddev); +} + +static void mean_and_variance_fast_divpow2(struct kunit *test) +{ + s64 i; + u8 d; + + for (i = 0; i < 100; i++) { + d = 0; + KUNIT_EXPECT_EQ(test, fast_divpow2(i, d), div_u64(i, 1LLU << d)); + KUNIT_EXPECT_EQ(test, abs(fast_divpow2(-i, d)), div_u64(i, 1LLU << d)); + for (d = 1; d < 32; d++) { + KUNIT_EXPECT_EQ_MSG(test, abs(fast_divpow2(i, d)), + div_u64(i, 1 << d), "%lld %u", i, d); + KUNIT_EXPECT_EQ_MSG(test, abs(fast_divpow2(-i, d)), + div_u64(i, 1 << d), "%lld %u", -i, d); + } + } +} + +static void mean_and_variance_u128_basic_test(struct kunit *test) +{ + u128_u a = u64s_to_u128(0, U64_MAX); + u128_u a1 = u64s_to_u128(0, 1); + u128_u b = u64s_to_u128(1, 0); + u128_u c = u64s_to_u128(0, 1LLU << 63); + u128_u c2 = u64s_to_u128(U64_MAX, U64_MAX); + + KUNIT_EXPECT_EQ(test, u128_hi(u128_add(a, a1)), 1); + KUNIT_EXPECT_EQ(test, u128_lo(u128_add(a, a1)), 0); + KUNIT_EXPECT_EQ(test, u128_hi(u128_add(a1, a)), 1); + KUNIT_EXPECT_EQ(test, u128_lo(u128_add(a1, a)), 0); + + KUNIT_EXPECT_EQ(test, u128_lo(u128_sub(b, a1)), U64_MAX); + KUNIT_EXPECT_EQ(test, u128_hi(u128_sub(b, a1)), 0); + + KUNIT_EXPECT_EQ(test, u128_hi(u128_shl(c, 1)), 1); + KUNIT_EXPECT_EQ(test, u128_lo(u128_shl(c, 1)), 0); + + KUNIT_EXPECT_EQ(test, u128_hi(u128_square(U64_MAX)), U64_MAX - 1); + KUNIT_EXPECT_EQ(test, u128_lo(u128_square(U64_MAX)), 1); + + KUNIT_EXPECT_EQ(test, u128_lo(u128_div(b, 2)), 1LLU << 63); + + KUNIT_EXPECT_EQ(test, u128_hi(u128_div(c2, 2)), U64_MAX >> 1); + KUNIT_EXPECT_EQ(test, u128_lo(u128_div(c2, 2)), U64_MAX); + + KUNIT_EXPECT_EQ(test, u128_hi(u128_div(u128_shl(u64_to_u128(U64_MAX), 32), 2)), U32_MAX >> 1); + KUNIT_EXPECT_EQ(test, u128_lo(u128_div(u128_shl(u64_to_u128(U64_MAX), 32), 2)), U64_MAX << 31); +} + +static struct kunit_case mean_and_variance_test_cases[] = { + KUNIT_CASE(mean_and_variance_fast_divpow2), + KUNIT_CASE(mean_and_variance_u128_basic_test), + KUNIT_CASE(mean_and_variance_basic_test), + KUNIT_CASE(mean_and_variance_weighted_test), + KUNIT_CASE(mean_and_variance_weighted_advanced_test), + KUNIT_CASE(mean_and_variance_test_1), + KUNIT_CASE(mean_and_variance_test_2), + KUNIT_CASE(mean_and_variance_test_3), + KUNIT_CASE(mean_and_variance_test_4), + {} +}; + +static struct kunit_suite mean_and_variance_test_suite = { + .name = "mean and variance tests", + .test_cases = mean_and_variance_test_cases +}; + +kunit_test_suite(mean_and_variance_test_suite); + +MODULE_AUTHOR("Daniel B. Hill"); +MODULE_LICENSE("GPL"); diff --git a/fs/bcachefs/migrate.c b/fs/bcachefs/migrate.c new file mode 100644 index 000000000000..e3a51f6d6c9b --- /dev/null +++ b/fs/bcachefs/migrate.c @@ -0,0 +1,179 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Code for moving data off a device. + */ + +#include "bcachefs.h" +#include "bkey_buf.h" +#include "btree_update.h" +#include "btree_update_interior.h" +#include "buckets.h" +#include "errcode.h" +#include "extents.h" +#include "io_write.h" +#include "journal.h" +#include "keylist.h" +#include "migrate.h" +#include "move.h" +#include "replicas.h" +#include "super-io.h" + +static int drop_dev_ptrs(struct bch_fs *c, struct bkey_s k, + unsigned dev_idx, int flags, bool metadata) +{ + unsigned replicas = metadata ? c->opts.metadata_replicas : c->opts.data_replicas; + unsigned lost = metadata ? BCH_FORCE_IF_METADATA_LOST : BCH_FORCE_IF_DATA_LOST; + unsigned degraded = metadata ? BCH_FORCE_IF_METADATA_DEGRADED : BCH_FORCE_IF_DATA_DEGRADED; + unsigned nr_good; + + bch2_bkey_drop_device(k, dev_idx); + + nr_good = bch2_bkey_durability(c, k.s_c); + if ((!nr_good && !(flags & lost)) || + (nr_good < replicas && !(flags & degraded))) + return -EINVAL; + + return 0; +} + +static int bch2_dev_usrdata_drop_key(struct btree_trans *trans, + struct btree_iter *iter, + struct bkey_s_c k, + unsigned dev_idx, + int flags) +{ + struct bch_fs *c = trans->c; + struct bkey_i *n; + int ret; + + if (!bch2_bkey_has_device_c(k, dev_idx)) + return 0; + + n = bch2_bkey_make_mut(trans, iter, &k, BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); + ret = PTR_ERR_OR_ZERO(n); + if (ret) + return ret; + + ret = drop_dev_ptrs(c, bkey_i_to_s(n), dev_idx, flags, false); + if (ret) + return ret; + + /* + * If the new extent no longer has any pointers, bch2_extent_normalize() + * will do the appropriate thing with it (turning it into a + * KEY_TYPE_error key, or just a discard if it was a cached extent) + */ + bch2_extent_normalize(c, bkey_i_to_s(n)); + + /* + * Since we're not inserting through an extent iterator + * (BTREE_ITER_ALL_SNAPSHOTS iterators aren't extent iterators), + * we aren't using the extent overwrite path to delete, we're + * just using the normal key deletion path: + */ + if (bkey_deleted(&n->k)) + n->k.size = 0; + return 0; +} + +static int bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags) +{ + struct btree_trans *trans = bch2_trans_get(c); + struct btree_iter iter; + struct bkey_s_c k; + enum btree_id id; + int ret = 0; + + for (id = 0; id < BTREE_ID_NR; id++) { + if (!btree_type_has_ptrs(id)) + continue; + + ret = for_each_btree_key_commit(trans, iter, id, POS_MIN, + BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k, + NULL, NULL, BTREE_INSERT_NOFAIL, + bch2_dev_usrdata_drop_key(trans, &iter, k, dev_idx, flags)); + if (ret) + break; + } + + bch2_trans_put(trans); + + return ret; +} + +static int bch2_dev_metadata_drop(struct bch_fs *c, unsigned dev_idx, int flags) +{ + struct btree_trans *trans; + struct btree_iter iter; + struct closure cl; + struct btree *b; + struct bkey_buf k; + unsigned id; + int ret; + + /* don't handle this yet: */ + if (flags & BCH_FORCE_IF_METADATA_LOST) + return -EINVAL; + + trans = bch2_trans_get(c); + bch2_bkey_buf_init(&k); + closure_init_stack(&cl); + + for (id = 0; id < BTREE_ID_NR; id++) { + bch2_trans_node_iter_init(trans, &iter, id, POS_MIN, 0, 0, + BTREE_ITER_PREFETCH); +retry: + ret = 0; + while (bch2_trans_begin(trans), + (b = bch2_btree_iter_peek_node(&iter)) && + !(ret = PTR_ERR_OR_ZERO(b))) { + if (!bch2_bkey_has_device_c(bkey_i_to_s_c(&b->key), dev_idx)) + goto next; + + bch2_bkey_buf_copy(&k, c, &b->key); + + ret = drop_dev_ptrs(c, bkey_i_to_s(k.k), + dev_idx, flags, true); + if (ret) { + bch_err(c, "Cannot drop device without losing data"); + break; + } + + ret = bch2_btree_node_update_key(trans, &iter, b, k.k, 0, false); + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) { + ret = 0; + continue; + } + + if (ret) { + bch_err_msg(c, ret, "updating btree node key"); + break; + } +next: + bch2_btree_iter_next_node(&iter); + } + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + goto retry; + + bch2_trans_iter_exit(trans, &iter); + + if (ret) + goto err; + } + + bch2_btree_interior_updates_flush(c); + ret = 0; +err: + bch2_bkey_buf_exit(&k, c); + bch2_trans_put(trans); + + BUG_ON(bch2_err_matches(ret, BCH_ERR_transaction_restart)); + + return ret; +} + +int bch2_dev_data_drop(struct bch_fs *c, unsigned dev_idx, int flags) +{ + return bch2_dev_usrdata_drop(c, dev_idx, flags) ?: + bch2_dev_metadata_drop(c, dev_idx, flags); +} diff --git a/fs/bcachefs/migrate.h b/fs/bcachefs/migrate.h new file mode 100644 index 000000000000..027efaa0d575 --- /dev/null +++ b/fs/bcachefs/migrate.h @@ -0,0 +1,7 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_MIGRATE_H +#define _BCACHEFS_MIGRATE_H + +int bch2_dev_data_drop(struct bch_fs *, unsigned, int); + +#endif /* _BCACHEFS_MIGRATE_H */ diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c new file mode 100644 index 000000000000..54830ee0ed88 --- /dev/null +++ b/fs/bcachefs/move.c @@ -0,0 +1,1154 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" +#include "alloc_background.h" +#include "alloc_foreground.h" +#include "backpointers.h" +#include "bkey_buf.h" +#include "btree_gc.h" +#include "btree_update.h" +#include "btree_update_interior.h" +#include "btree_write_buffer.h" +#include "disk_groups.h" +#include "ec.h" +#include "errcode.h" +#include "error.h" +#include "inode.h" +#include "io_read.h" +#include "io_write.h" +#include "journal_reclaim.h" +#include "keylist.h" +#include "move.h" +#include "replicas.h" +#include "snapshot.h" +#include "super-io.h" +#include "trace.h" + +#include <linux/ioprio.h> +#include <linux/kthread.h> + +static void trace_move_extent2(struct bch_fs *c, struct bkey_s_c k) +{ + if (trace_move_extent_enabled()) { + struct printbuf buf = PRINTBUF; + + bch2_bkey_val_to_text(&buf, c, k); + trace_move_extent(c, buf.buf); + printbuf_exit(&buf); + } +} + +static void trace_move_extent_read2(struct bch_fs *c, struct bkey_s_c k) +{ + if (trace_move_extent_read_enabled()) { + struct printbuf buf = PRINTBUF; + + bch2_bkey_val_to_text(&buf, c, k); + trace_move_extent_read(c, buf.buf); + printbuf_exit(&buf); + } +} + +struct moving_io { + struct list_head read_list; + struct list_head io_list; + struct move_bucket_in_flight *b; + struct closure cl; + bool read_completed; + + unsigned read_sectors; + unsigned write_sectors; + + struct bch_read_bio rbio; + + struct data_update write; + /* Must be last since it is variable size */ + struct bio_vec bi_inline_vecs[0]; +}; + +static void move_free(struct moving_io *io) +{ + struct moving_context *ctxt = io->write.ctxt; + + if (io->b) + atomic_dec(&io->b->count); + + bch2_data_update_exit(&io->write); + + mutex_lock(&ctxt->lock); + list_del(&io->io_list); + wake_up(&ctxt->wait); + mutex_unlock(&ctxt->lock); + + kfree(io); +} + +static void move_write_done(struct bch_write_op *op) +{ + struct moving_io *io = container_of(op, struct moving_io, write.op); + struct moving_context *ctxt = io->write.ctxt; + + if (io->write.op.error) + ctxt->write_error = true; + + atomic_sub(io->write_sectors, &io->write.ctxt->write_sectors); + atomic_dec(&io->write.ctxt->write_ios); + move_free(io); + closure_put(&ctxt->cl); +} + +static void move_write(struct moving_io *io) +{ + if (unlikely(io->rbio.bio.bi_status || io->rbio.hole)) { + move_free(io); + return; + } + + closure_get(&io->write.ctxt->cl); + atomic_add(io->write_sectors, &io->write.ctxt->write_sectors); + atomic_inc(&io->write.ctxt->write_ios); + + bch2_data_update_read_done(&io->write, io->rbio.pick.crc); +} + +struct moving_io *bch2_moving_ctxt_next_pending_write(struct moving_context *ctxt) +{ + struct moving_io *io = + list_first_entry_or_null(&ctxt->reads, struct moving_io, read_list); + + return io && io->read_completed ? io : NULL; +} + +static void move_read_endio(struct bio *bio) +{ + struct moving_io *io = container_of(bio, struct moving_io, rbio.bio); + struct moving_context *ctxt = io->write.ctxt; + + atomic_sub(io->read_sectors, &ctxt->read_sectors); + atomic_dec(&ctxt->read_ios); + io->read_completed = true; + + wake_up(&ctxt->wait); + closure_put(&ctxt->cl); +} + +void bch2_moving_ctxt_do_pending_writes(struct moving_context *ctxt) +{ + struct moving_io *io; + + while ((io = bch2_moving_ctxt_next_pending_write(ctxt))) { + bch2_trans_unlock_long(ctxt->trans); + list_del(&io->read_list); + move_write(io); + } +} + +void bch2_move_ctxt_wait_for_io(struct moving_context *ctxt) +{ + unsigned sectors_pending = atomic_read(&ctxt->write_sectors); + + move_ctxt_wait_event(ctxt, + !atomic_read(&ctxt->write_sectors) || + atomic_read(&ctxt->write_sectors) != sectors_pending); +} + +static void bch2_moving_ctxt_flush_all(struct moving_context *ctxt) +{ + move_ctxt_wait_event(ctxt, list_empty(&ctxt->reads)); + bch2_trans_unlock_long(ctxt->trans); + closure_sync(&ctxt->cl); +} + +void bch2_moving_ctxt_exit(struct moving_context *ctxt) +{ + struct bch_fs *c = ctxt->trans->c; + + bch2_moving_ctxt_flush_all(ctxt); + + EBUG_ON(atomic_read(&ctxt->write_sectors)); + EBUG_ON(atomic_read(&ctxt->write_ios)); + EBUG_ON(atomic_read(&ctxt->read_sectors)); + EBUG_ON(atomic_read(&ctxt->read_ios)); + + mutex_lock(&c->moving_context_lock); + list_del(&ctxt->list); + mutex_unlock(&c->moving_context_lock); + + bch2_trans_put(ctxt->trans); + memset(ctxt, 0, sizeof(*ctxt)); +} + +void bch2_moving_ctxt_init(struct moving_context *ctxt, + struct bch_fs *c, + struct bch_ratelimit *rate, + struct bch_move_stats *stats, + struct write_point_specifier wp, + bool wait_on_copygc) +{ + memset(ctxt, 0, sizeof(*ctxt)); + + ctxt->trans = bch2_trans_get(c); + ctxt->fn = (void *) _RET_IP_; + ctxt->rate = rate; + ctxt->stats = stats; + ctxt->wp = wp; + ctxt->wait_on_copygc = wait_on_copygc; + + closure_init_stack(&ctxt->cl); + + mutex_init(&ctxt->lock); + INIT_LIST_HEAD(&ctxt->reads); + INIT_LIST_HEAD(&ctxt->ios); + init_waitqueue_head(&ctxt->wait); + + mutex_lock(&c->moving_context_lock); + list_add(&ctxt->list, &c->moving_context_list); + mutex_unlock(&c->moving_context_lock); +} + +void bch2_move_stats_exit(struct bch_move_stats *stats, struct bch_fs *c) +{ + trace_move_data(c, stats); +} + +void bch2_move_stats_init(struct bch_move_stats *stats, char *name) +{ + memset(stats, 0, sizeof(*stats)); + stats->data_type = BCH_DATA_user; + scnprintf(stats->name, sizeof(stats->name), "%s", name); +} + +int bch2_move_extent(struct moving_context *ctxt, + struct move_bucket_in_flight *bucket_in_flight, + struct btree_iter *iter, + struct bkey_s_c k, + struct bch_io_opts io_opts, + struct data_update_opts data_opts) +{ + struct btree_trans *trans = ctxt->trans; + struct bch_fs *c = trans->c; + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + struct moving_io *io; + const union bch_extent_entry *entry; + struct extent_ptr_decoded p; + unsigned sectors = k.k->size, pages; + int ret = -ENOMEM; + + if (ctxt->stats) + ctxt->stats->pos = BBPOS(iter->btree_id, iter->pos); + trace_move_extent2(c, k); + + bch2_data_update_opts_normalize(k, &data_opts); + + if (!data_opts.rewrite_ptrs && + !data_opts.extra_replicas) { + if (data_opts.kill_ptrs) + return bch2_extent_drop_ptrs(trans, iter, k, data_opts); + return 0; + } + + /* + * Before memory allocations & taking nocow locks in + * bch2_data_update_init(): + */ + bch2_trans_unlock(trans); + + /* write path might have to decompress data: */ + bkey_for_each_ptr_decode(k.k, ptrs, p, entry) + sectors = max_t(unsigned, sectors, p.crc.uncompressed_size); + + pages = DIV_ROUND_UP(sectors, PAGE_SECTORS); + io = kzalloc(sizeof(struct moving_io) + + sizeof(struct bio_vec) * pages, GFP_KERNEL); + if (!io) + goto err; + + INIT_LIST_HEAD(&io->io_list); + io->write.ctxt = ctxt; + io->read_sectors = k.k->size; + io->write_sectors = k.k->size; + + bio_init(&io->write.op.wbio.bio, NULL, io->bi_inline_vecs, pages, 0); + bio_set_prio(&io->write.op.wbio.bio, + IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0)); + + if (bch2_bio_alloc_pages(&io->write.op.wbio.bio, sectors << 9, + GFP_KERNEL)) + goto err_free; + + io->rbio.c = c; + io->rbio.opts = io_opts; + bio_init(&io->rbio.bio, NULL, io->bi_inline_vecs, pages, 0); + io->rbio.bio.bi_vcnt = pages; + bio_set_prio(&io->rbio.bio, IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0)); + io->rbio.bio.bi_iter.bi_size = sectors << 9; + + io->rbio.bio.bi_opf = REQ_OP_READ; + io->rbio.bio.bi_iter.bi_sector = bkey_start_offset(k.k); + io->rbio.bio.bi_end_io = move_read_endio; + + ret = bch2_data_update_init(trans, iter, ctxt, &io->write, ctxt->wp, + io_opts, data_opts, iter->btree_id, k); + if (ret) + goto err_free_pages; + + io->write.op.end_io = move_write_done; + + if (ctxt->rate) + bch2_ratelimit_increment(ctxt->rate, k.k->size); + + if (ctxt->stats) { + atomic64_inc(&ctxt->stats->keys_moved); + atomic64_add(k.k->size, &ctxt->stats->sectors_moved); + } + + if (bucket_in_flight) { + io->b = bucket_in_flight; + atomic_inc(&io->b->count); + } + + this_cpu_add(c->counters[BCH_COUNTER_io_move], k.k->size); + this_cpu_add(c->counters[BCH_COUNTER_move_extent_read], k.k->size); + trace_move_extent_read2(c, k); + + mutex_lock(&ctxt->lock); + atomic_add(io->read_sectors, &ctxt->read_sectors); + atomic_inc(&ctxt->read_ios); + + list_add_tail(&io->read_list, &ctxt->reads); + list_add_tail(&io->io_list, &ctxt->ios); + mutex_unlock(&ctxt->lock); + + /* + * dropped by move_read_endio() - guards against use after free of + * ctxt when doing wakeup + */ + closure_get(&ctxt->cl); + bch2_read_extent(trans, &io->rbio, + bkey_start_pos(k.k), + iter->btree_id, k, 0, + BCH_READ_NODECODE| + BCH_READ_LAST_FRAGMENT); + return 0; +err_free_pages: + bio_free_pages(&io->write.op.wbio.bio); +err_free: + kfree(io); +err: + if (ret == -BCH_ERR_data_update_done) + return 0; + + if (bch2_err_matches(ret, EROFS) || + bch2_err_matches(ret, BCH_ERR_transaction_restart)) + return ret; + + this_cpu_inc(c->counters[BCH_COUNTER_move_extent_start_fail]); + if (trace_move_extent_start_fail_enabled()) { + struct printbuf buf = PRINTBUF; + + bch2_bkey_val_to_text(&buf, c, k); + prt_str(&buf, ": "); + prt_str(&buf, bch2_err_str(ret)); + trace_move_extent_start_fail(c, buf.buf); + printbuf_exit(&buf); + } + return ret; +} + +struct bch_io_opts *bch2_move_get_io_opts(struct btree_trans *trans, + struct per_snapshot_io_opts *io_opts, + struct bkey_s_c extent_k) +{ + struct bch_fs *c = trans->c; + u32 restart_count = trans->restart_count; + int ret = 0; + + if (io_opts->cur_inum != extent_k.k->p.inode) { + struct btree_iter iter; + struct bkey_s_c k; + + io_opts->d.nr = 0; + + for_each_btree_key(trans, iter, BTREE_ID_inodes, POS(0, extent_k.k->p.inode), + BTREE_ITER_ALL_SNAPSHOTS, k, ret) { + if (k.k->p.offset != extent_k.k->p.inode) + break; + + if (!bkey_is_inode(k.k)) + continue; + + struct bch_inode_unpacked inode; + BUG_ON(bch2_inode_unpack(k, &inode)); + + struct snapshot_io_opts_entry e = { .snapshot = k.k->p.snapshot }; + bch2_inode_opts_get(&e.io_opts, trans->c, &inode); + + ret = darray_push(&io_opts->d, e); + if (ret) + break; + } + bch2_trans_iter_exit(trans, &iter); + io_opts->cur_inum = extent_k.k->p.inode; + } + + ret = ret ?: trans_was_restarted(trans, restart_count); + if (ret) + return ERR_PTR(ret); + + if (extent_k.k->p.snapshot) { + struct snapshot_io_opts_entry *i; + darray_for_each(io_opts->d, i) + if (bch2_snapshot_is_ancestor(c, extent_k.k->p.snapshot, i->snapshot)) + return &i->io_opts; + } + + return &io_opts->fs_io_opts; +} + +int bch2_move_get_io_opts_one(struct btree_trans *trans, + struct bch_io_opts *io_opts, + struct bkey_s_c extent_k) +{ + struct btree_iter iter; + struct bkey_s_c k; + int ret; + + /* reflink btree? */ + if (!extent_k.k->p.inode) { + *io_opts = bch2_opts_to_inode_opts(trans->c->opts); + return 0; + } + + k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes, + SPOS(0, extent_k.k->p.inode, extent_k.k->p.snapshot), + BTREE_ITER_CACHED); + ret = bkey_err(k); + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + return ret; + + if (!ret && bkey_is_inode(k.k)) { + struct bch_inode_unpacked inode; + bch2_inode_unpack(k, &inode); + bch2_inode_opts_get(io_opts, trans->c, &inode); + } else { + *io_opts = bch2_opts_to_inode_opts(trans->c->opts); + } + + bch2_trans_iter_exit(trans, &iter); + return 0; +} + +int bch2_move_ratelimit(struct moving_context *ctxt) +{ + struct bch_fs *c = ctxt->trans->c; + bool is_kthread = current->flags & PF_KTHREAD; + u64 delay; + + if (ctxt->wait_on_copygc && c->copygc_running) { + bch2_moving_ctxt_flush_all(ctxt); + wait_event_killable(c->copygc_running_wq, + !c->copygc_running || + (is_kthread && kthread_should_stop())); + } + + do { + delay = ctxt->rate ? bch2_ratelimit_delay(ctxt->rate) : 0; + + if (is_kthread && kthread_should_stop()) + return 1; + + if (delay) + move_ctxt_wait_event_timeout(ctxt, + freezing(current) || + (is_kthread && kthread_should_stop()), + delay); + + if (unlikely(freezing(current))) { + bch2_moving_ctxt_flush_all(ctxt); + try_to_freeze(); + } + } while (delay); + + /* + * XXX: these limits really ought to be per device, SSDs and hard drives + * will want different limits + */ + move_ctxt_wait_event(ctxt, + atomic_read(&ctxt->write_sectors) < c->opts.move_bytes_in_flight >> 9 && + atomic_read(&ctxt->read_sectors) < c->opts.move_bytes_in_flight >> 9 && + atomic_read(&ctxt->write_ios) < c->opts.move_ios_in_flight && + atomic_read(&ctxt->read_ios) < c->opts.move_ios_in_flight); + + return 0; +} + +static int bch2_move_data_btree(struct moving_context *ctxt, + struct bpos start, + struct bpos end, + move_pred_fn pred, void *arg, + enum btree_id btree_id) +{ + struct btree_trans *trans = ctxt->trans; + struct bch_fs *c = trans->c; + struct per_snapshot_io_opts snapshot_io_opts; + struct bch_io_opts *io_opts; + struct bkey_buf sk; + struct btree_iter iter; + struct bkey_s_c k; + struct data_update_opts data_opts; + int ret = 0, ret2; + + per_snapshot_io_opts_init(&snapshot_io_opts, c); + bch2_bkey_buf_init(&sk); + + if (ctxt->stats) { + ctxt->stats->data_type = BCH_DATA_user; + ctxt->stats->pos = BBPOS(btree_id, start); + } + + bch2_trans_iter_init(trans, &iter, btree_id, start, + BTREE_ITER_PREFETCH| + BTREE_ITER_ALL_SNAPSHOTS); + + if (ctxt->rate) + bch2_ratelimit_reset(ctxt->rate); + + while (!bch2_move_ratelimit(ctxt)) { + bch2_trans_begin(trans); + + k = bch2_btree_iter_peek(&iter); + if (!k.k) + break; + + ret = bkey_err(k); + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + continue; + if (ret) + break; + + if (bkey_ge(bkey_start_pos(k.k), end)) + break; + + if (ctxt->stats) + ctxt->stats->pos = BBPOS(iter.btree_id, iter.pos); + + if (!bkey_extent_is_direct_data(k.k)) + goto next_nondata; + + io_opts = bch2_move_get_io_opts(trans, &snapshot_io_opts, k); + ret = PTR_ERR_OR_ZERO(io_opts); + if (ret) + continue; + + memset(&data_opts, 0, sizeof(data_opts)); + if (!pred(c, arg, k, io_opts, &data_opts)) + goto next; + + /* + * The iterator gets unlocked by __bch2_read_extent - need to + * save a copy of @k elsewhere: + */ + bch2_bkey_buf_reassemble(&sk, c, k); + k = bkey_i_to_s_c(sk.k); + + ret2 = bch2_move_extent(ctxt, NULL, &iter, k, *io_opts, data_opts); + if (ret2) { + if (bch2_err_matches(ret2, BCH_ERR_transaction_restart)) + continue; + + if (ret2 == -ENOMEM) { + /* memory allocation failure, wait for some IO to finish */ + bch2_move_ctxt_wait_for_io(ctxt); + continue; + } + + /* XXX signal failure */ + goto next; + } +next: + if (ctxt->stats) + atomic64_add(k.k->size, &ctxt->stats->sectors_seen); +next_nondata: + bch2_btree_iter_advance(&iter); + } + + bch2_trans_iter_exit(trans, &iter); + bch2_bkey_buf_exit(&sk, c); + per_snapshot_io_opts_exit(&snapshot_io_opts); + + return ret; +} + +int __bch2_move_data(struct moving_context *ctxt, + struct bbpos start, + struct bbpos end, + move_pred_fn pred, void *arg) +{ + struct bch_fs *c = ctxt->trans->c; + enum btree_id id; + int ret = 0; + + for (id = start.btree; + id <= min_t(unsigned, end.btree, btree_id_nr_alive(c) - 1); + id++) { + ctxt->stats->pos = BBPOS(id, POS_MIN); + + if (!btree_type_has_ptrs(id) || + !bch2_btree_id_root(c, id)->b) + continue; + + ret = bch2_move_data_btree(ctxt, + id == start.btree ? start.pos : POS_MIN, + id == end.btree ? end.pos : POS_MAX, + pred, arg, id); + if (ret) + break; + } + + return ret; +} + +int bch2_move_data(struct bch_fs *c, + struct bbpos start, + struct bbpos end, + struct bch_ratelimit *rate, + struct bch_move_stats *stats, + struct write_point_specifier wp, + bool wait_on_copygc, + move_pred_fn pred, void *arg) +{ + + struct moving_context ctxt; + int ret; + + bch2_moving_ctxt_init(&ctxt, c, rate, stats, wp, wait_on_copygc); + ret = __bch2_move_data(&ctxt, start, end, pred, arg); + bch2_moving_ctxt_exit(&ctxt); + + return ret; +} + +int __bch2_evacuate_bucket(struct moving_context *ctxt, + struct move_bucket_in_flight *bucket_in_flight, + struct bpos bucket, int gen, + struct data_update_opts _data_opts) +{ + struct btree_trans *trans = ctxt->trans; + struct bch_fs *c = trans->c; + bool is_kthread = current->flags & PF_KTHREAD; + struct bch_io_opts io_opts = bch2_opts_to_inode_opts(c->opts); + struct btree_iter iter; + struct bkey_buf sk; + struct bch_backpointer bp; + struct bch_alloc_v4 a_convert; + const struct bch_alloc_v4 *a; + struct bkey_s_c k; + struct data_update_opts data_opts; + unsigned dirty_sectors, bucket_size; + u64 fragmentation; + struct bpos bp_pos = POS_MIN; + int ret = 0; + + trace_bucket_evacuate(c, &bucket); + + bch2_bkey_buf_init(&sk); + + /* + * We're not run in a context that handles transaction restarts: + */ + bch2_trans_begin(trans); + + bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, + bucket, BTREE_ITER_CACHED); + ret = lockrestart_do(trans, + bkey_err(k = bch2_btree_iter_peek_slot(&iter))); + bch2_trans_iter_exit(trans, &iter); + + if (ret) { + bch_err_msg(c, ret, "looking up alloc key"); + goto err; + } + + a = bch2_alloc_to_v4(k, &a_convert); + dirty_sectors = a->dirty_sectors; + bucket_size = bch_dev_bkey_exists(c, bucket.inode)->mi.bucket_size; + fragmentation = a->fragmentation_lru; + + ret = bch2_btree_write_buffer_flush(trans); + if (ret) { + bch_err_msg(c, ret, "flushing btree write buffer"); + goto err; + } + + while (!(ret = bch2_move_ratelimit(ctxt))) { + if (is_kthread && kthread_should_stop()) + break; + + bch2_trans_begin(trans); + + ret = bch2_get_next_backpointer(trans, bucket, gen, + &bp_pos, &bp, + BTREE_ITER_CACHED); + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + continue; + if (ret) + goto err; + if (bkey_eq(bp_pos, POS_MAX)) + break; + + if (!bp.level) { + const struct bch_extent_ptr *ptr; + unsigned i = 0; + + k = bch2_backpointer_get_key(trans, &iter, bp_pos, bp, 0); + ret = bkey_err(k); + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + continue; + if (ret) + goto err; + if (!k.k) + goto next; + + bch2_bkey_buf_reassemble(&sk, c, k); + k = bkey_i_to_s_c(sk.k); + + ret = bch2_move_get_io_opts_one(trans, &io_opts, k); + if (ret) { + bch2_trans_iter_exit(trans, &iter); + continue; + } + + data_opts = _data_opts; + data_opts.target = io_opts.background_target; + data_opts.rewrite_ptrs = 0; + + bkey_for_each_ptr(bch2_bkey_ptrs_c(k), ptr) { + if (ptr->dev == bucket.inode) { + data_opts.rewrite_ptrs |= 1U << i; + if (ptr->cached) { + bch2_trans_iter_exit(trans, &iter); + goto next; + } + } + i++; + } + + ret = bch2_move_extent(ctxt, bucket_in_flight, + &iter, k, io_opts, data_opts); + bch2_trans_iter_exit(trans, &iter); + + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + continue; + if (ret == -ENOMEM) { + /* memory allocation failure, wait for some IO to finish */ + bch2_move_ctxt_wait_for_io(ctxt); + continue; + } + if (ret) + goto err; + + if (ctxt->stats) + atomic64_add(k.k->size, &ctxt->stats->sectors_seen); + } else { + struct btree *b; + + b = bch2_backpointer_get_node(trans, &iter, bp_pos, bp); + ret = PTR_ERR_OR_ZERO(b); + if (ret == -BCH_ERR_backpointer_to_overwritten_btree_node) + continue; + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + continue; + if (ret) + goto err; + if (!b) + goto next; + + ret = bch2_btree_node_rewrite(trans, &iter, b, 0); + bch2_trans_iter_exit(trans, &iter); + + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + continue; + if (ret) + goto err; + + if (ctxt->rate) + bch2_ratelimit_increment(ctxt->rate, + c->opts.btree_node_size >> 9); + if (ctxt->stats) { + atomic64_add(c->opts.btree_node_size >> 9, &ctxt->stats->sectors_seen); + atomic64_add(c->opts.btree_node_size >> 9, &ctxt->stats->sectors_moved); + } + } +next: + bp_pos = bpos_nosnap_successor(bp_pos); + } + + trace_evacuate_bucket(c, &bucket, dirty_sectors, bucket_size, fragmentation, ret); +err: + bch2_bkey_buf_exit(&sk, c); + return ret; +} + +int bch2_evacuate_bucket(struct bch_fs *c, + struct bpos bucket, int gen, + struct data_update_opts data_opts, + struct bch_ratelimit *rate, + struct bch_move_stats *stats, + struct write_point_specifier wp, + bool wait_on_copygc) +{ + struct moving_context ctxt; + int ret; + + bch2_moving_ctxt_init(&ctxt, c, rate, stats, wp, wait_on_copygc); + ret = __bch2_evacuate_bucket(&ctxt, NULL, bucket, gen, data_opts); + bch2_moving_ctxt_exit(&ctxt); + + return ret; +} + +typedef bool (*move_btree_pred)(struct bch_fs *, void *, + struct btree *, struct bch_io_opts *, + struct data_update_opts *); + +static int bch2_move_btree(struct bch_fs *c, + enum btree_id start_btree_id, struct bpos start_pos, + enum btree_id end_btree_id, struct bpos end_pos, + move_btree_pred pred, void *arg, + struct bch_move_stats *stats) +{ + bool kthread = (current->flags & PF_KTHREAD) != 0; + struct bch_io_opts io_opts = bch2_opts_to_inode_opts(c->opts); + struct moving_context ctxt; + struct btree_trans *trans; + struct btree_iter iter; + struct btree *b; + enum btree_id id; + struct data_update_opts data_opts; + int ret = 0; + + bch2_moving_ctxt_init(&ctxt, c, NULL, stats, + writepoint_ptr(&c->btree_write_point), + true); + trans = ctxt.trans; + + stats->data_type = BCH_DATA_btree; + + for (id = start_btree_id; + id <= min_t(unsigned, end_btree_id, btree_id_nr_alive(c) - 1); + id++) { + stats->pos = BBPOS(id, POS_MIN); + + if (!bch2_btree_id_root(c, id)->b) + continue; + + bch2_trans_node_iter_init(trans, &iter, id, POS_MIN, 0, 0, + BTREE_ITER_PREFETCH); +retry: + ret = 0; + while (bch2_trans_begin(trans), + (b = bch2_btree_iter_peek_node(&iter)) && + !(ret = PTR_ERR_OR_ZERO(b))) { + if (kthread && kthread_should_stop()) + break; + + if ((cmp_int(id, end_btree_id) ?: + bpos_cmp(b->key.k.p, end_pos)) > 0) + break; + + stats->pos = BBPOS(iter.btree_id, iter.pos); + + if (!pred(c, arg, b, &io_opts, &data_opts)) + goto next; + + ret = bch2_btree_node_rewrite(trans, &iter, b, 0) ?: ret; + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + continue; + if (ret) + break; +next: + bch2_btree_iter_next_node(&iter); + } + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + goto retry; + + bch2_trans_iter_exit(trans, &iter); + + if (kthread && kthread_should_stop()) + break; + } + + bch_err_fn(c, ret); + bch2_moving_ctxt_exit(&ctxt); + bch2_btree_interior_updates_flush(c); + + return ret; +} + +static bool rereplicate_pred(struct bch_fs *c, void *arg, + struct bkey_s_c k, + struct bch_io_opts *io_opts, + struct data_update_opts *data_opts) +{ + unsigned nr_good = bch2_bkey_durability(c, k); + unsigned replicas = bkey_is_btree_ptr(k.k) + ? c->opts.metadata_replicas + : io_opts->data_replicas; + + if (!nr_good || nr_good >= replicas) + return false; + + data_opts->target = 0; + data_opts->extra_replicas = replicas - nr_good; + data_opts->btree_insert_flags = 0; + return true; +} + +static bool migrate_pred(struct bch_fs *c, void *arg, + struct bkey_s_c k, + struct bch_io_opts *io_opts, + struct data_update_opts *data_opts) +{ + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + const struct bch_extent_ptr *ptr; + struct bch_ioctl_data *op = arg; + unsigned i = 0; + + data_opts->rewrite_ptrs = 0; + data_opts->target = 0; + data_opts->extra_replicas = 0; + data_opts->btree_insert_flags = 0; + + bkey_for_each_ptr(ptrs, ptr) { + if (ptr->dev == op->migrate.dev) + data_opts->rewrite_ptrs |= 1U << i; + i++; + } + + return data_opts->rewrite_ptrs != 0; +} + +static bool rereplicate_btree_pred(struct bch_fs *c, void *arg, + struct btree *b, + struct bch_io_opts *io_opts, + struct data_update_opts *data_opts) +{ + return rereplicate_pred(c, arg, bkey_i_to_s_c(&b->key), io_opts, data_opts); +} + +static bool migrate_btree_pred(struct bch_fs *c, void *arg, + struct btree *b, + struct bch_io_opts *io_opts, + struct data_update_opts *data_opts) +{ + return migrate_pred(c, arg, bkey_i_to_s_c(&b->key), io_opts, data_opts); +} + +static bool bformat_needs_redo(struct bkey_format *f) +{ + unsigned i; + + for (i = 0; i < f->nr_fields; i++) { + unsigned unpacked_bits = bch2_bkey_format_current.bits_per_field[i]; + u64 unpacked_mask = ~((~0ULL << 1) << (unpacked_bits - 1)); + u64 field_offset = le64_to_cpu(f->field_offset[i]); + + if (f->bits_per_field[i] > unpacked_bits) + return true; + + if ((f->bits_per_field[i] == unpacked_bits) && field_offset) + return true; + + if (((field_offset + ((1ULL << f->bits_per_field[i]) - 1)) & + unpacked_mask) < + field_offset) + return true; + } + + return false; +} + +static bool rewrite_old_nodes_pred(struct bch_fs *c, void *arg, + struct btree *b, + struct bch_io_opts *io_opts, + struct data_update_opts *data_opts) +{ + if (b->version_ondisk != c->sb.version || + btree_node_need_rewrite(b) || + bformat_needs_redo(&b->format)) { + data_opts->target = 0; + data_opts->extra_replicas = 0; + data_opts->btree_insert_flags = 0; + return true; + } + + return false; +} + +int bch2_scan_old_btree_nodes(struct bch_fs *c, struct bch_move_stats *stats) +{ + int ret; + + ret = bch2_move_btree(c, + 0, POS_MIN, + BTREE_ID_NR, SPOS_MAX, + rewrite_old_nodes_pred, c, stats); + if (!ret) { + mutex_lock(&c->sb_lock); + c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_extents_above_btree_updates_done); + c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_bformat_overflow_done); + c->disk_sb.sb->version_min = c->disk_sb.sb->version; + bch2_write_super(c); + mutex_unlock(&c->sb_lock); + } + + bch_err_fn(c, ret); + return ret; +} + +int bch2_data_job(struct bch_fs *c, + struct bch_move_stats *stats, + struct bch_ioctl_data op) +{ + int ret = 0; + + switch (op.op) { + case BCH_DATA_OP_REREPLICATE: + bch2_move_stats_init(stats, "rereplicate"); + stats->data_type = BCH_DATA_journal; + ret = bch2_journal_flush_device_pins(&c->journal, -1); + + ret = bch2_move_btree(c, + op.start_btree, op.start_pos, + op.end_btree, op.end_pos, + rereplicate_btree_pred, c, stats) ?: ret; + ret = bch2_replicas_gc2(c) ?: ret; + + ret = bch2_move_data(c, + (struct bbpos) { op.start_btree, op.start_pos }, + (struct bbpos) { op.end_btree, op.end_pos }, + NULL, + stats, + writepoint_hashed((unsigned long) current), + true, + rereplicate_pred, c) ?: ret; + ret = bch2_replicas_gc2(c) ?: ret; + + bch2_move_stats_exit(stats, c); + break; + case BCH_DATA_OP_MIGRATE: + if (op.migrate.dev >= c->sb.nr_devices) + return -EINVAL; + + bch2_move_stats_init(stats, "migrate"); + stats->data_type = BCH_DATA_journal; + ret = bch2_journal_flush_device_pins(&c->journal, op.migrate.dev); + + ret = bch2_move_btree(c, + op.start_btree, op.start_pos, + op.end_btree, op.end_pos, + migrate_btree_pred, &op, stats) ?: ret; + ret = bch2_replicas_gc2(c) ?: ret; + + ret = bch2_move_data(c, + (struct bbpos) { op.start_btree, op.start_pos }, + (struct bbpos) { op.end_btree, op.end_pos }, + NULL, + stats, + writepoint_hashed((unsigned long) current), + true, + migrate_pred, &op) ?: ret; + ret = bch2_replicas_gc2(c) ?: ret; + + bch2_move_stats_exit(stats, c); + break; + case BCH_DATA_OP_REWRITE_OLD_NODES: + bch2_move_stats_init(stats, "rewrite_old_nodes"); + ret = bch2_scan_old_btree_nodes(c, stats); + bch2_move_stats_exit(stats, c); + break; + default: + ret = -EINVAL; + } + + return ret; +} + +void bch2_move_stats_to_text(struct printbuf *out, struct bch_move_stats *stats) +{ + prt_printf(out, "%s: data type=%s pos=", + stats->name, + bch2_data_types[stats->data_type]); + bch2_bbpos_to_text(out, stats->pos); + prt_newline(out); + printbuf_indent_add(out, 2); + + prt_str(out, "keys moved: "); + prt_u64(out, atomic64_read(&stats->keys_moved)); + prt_newline(out); + + prt_str(out, "keys raced: "); + prt_u64(out, atomic64_read(&stats->keys_raced)); + prt_newline(out); + + prt_str(out, "bytes seen: "); + prt_human_readable_u64(out, atomic64_read(&stats->sectors_seen) << 9); + prt_newline(out); + + prt_str(out, "bytes moved: "); + prt_human_readable_u64(out, atomic64_read(&stats->sectors_moved) << 9); + prt_newline(out); + + prt_str(out, "bytes raced: "); + prt_human_readable_u64(out, atomic64_read(&stats->sectors_raced) << 9); + prt_newline(out); + + printbuf_indent_sub(out, 2); +} + +static void bch2_moving_ctxt_to_text(struct printbuf *out, struct bch_fs *c, struct moving_context *ctxt) +{ + struct moving_io *io; + + bch2_move_stats_to_text(out, ctxt->stats); + printbuf_indent_add(out, 2); + + prt_printf(out, "reads: ios %u/%u sectors %u/%u", + atomic_read(&ctxt->read_ios), + c->opts.move_ios_in_flight, + atomic_read(&ctxt->read_sectors), + c->opts.move_bytes_in_flight >> 9); + prt_newline(out); + + prt_printf(out, "writes: ios %u/%u sectors %u/%u", + atomic_read(&ctxt->write_ios), + c->opts.move_ios_in_flight, + atomic_read(&ctxt->write_sectors), + c->opts.move_bytes_in_flight >> 9); + prt_newline(out); + + printbuf_indent_add(out, 2); + + mutex_lock(&ctxt->lock); + list_for_each_entry(io, &ctxt->ios, io_list) + bch2_write_op_to_text(out, &io->write.op); + mutex_unlock(&ctxt->lock); + + printbuf_indent_sub(out, 4); +} + +void bch2_fs_moving_ctxts_to_text(struct printbuf *out, struct bch_fs *c) +{ + struct moving_context *ctxt; + + mutex_lock(&c->moving_context_lock); + list_for_each_entry(ctxt, &c->moving_context_list, list) + bch2_moving_ctxt_to_text(out, c, ctxt); + mutex_unlock(&c->moving_context_lock); +} + +void bch2_fs_move_init(struct bch_fs *c) +{ + INIT_LIST_HEAD(&c->moving_context_list); + mutex_init(&c->moving_context_lock); +} diff --git a/fs/bcachefs/move.h b/fs/bcachefs/move.h new file mode 100644 index 000000000000..0906aa2d1de2 --- /dev/null +++ b/fs/bcachefs/move.h @@ -0,0 +1,158 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_MOVE_H +#define _BCACHEFS_MOVE_H + +#include "bbpos.h" +#include "bcachefs_ioctl.h" +#include "btree_iter.h" +#include "buckets.h" +#include "data_update.h" +#include "move_types.h" + +struct bch_read_bio; + +struct moving_context { + struct btree_trans *trans; + struct list_head list; + void *fn; + + struct bch_ratelimit *rate; + struct bch_move_stats *stats; + struct write_point_specifier wp; + bool wait_on_copygc; + bool write_error; + + /* For waiting on outstanding reads and writes: */ + struct closure cl; + + struct mutex lock; + struct list_head reads; + struct list_head ios; + + /* in flight sectors: */ + atomic_t read_sectors; + atomic_t write_sectors; + atomic_t read_ios; + atomic_t write_ios; + + wait_queue_head_t wait; +}; + +#define move_ctxt_wait_event_timeout(_ctxt, _cond, _timeout) \ +({ \ + int _ret = 0; \ + while (true) { \ + bool cond_finished = false; \ + bch2_moving_ctxt_do_pending_writes(_ctxt); \ + \ + if (_cond) \ + break; \ + bch2_trans_unlock_long((_ctxt)->trans); \ + _ret = __wait_event_timeout((_ctxt)->wait, \ + bch2_moving_ctxt_next_pending_write(_ctxt) || \ + (cond_finished = (_cond)), _timeout); \ + if (_ret || ( cond_finished)) \ + break; \ + } \ + _ret; \ +}) + +#define move_ctxt_wait_event(_ctxt, _cond) \ +do { \ + bool cond_finished = false; \ + bch2_moving_ctxt_do_pending_writes(_ctxt); \ + \ + if (_cond) \ + break; \ + bch2_trans_unlock_long((_ctxt)->trans); \ + __wait_event((_ctxt)->wait, \ + bch2_moving_ctxt_next_pending_write(_ctxt) || \ + (cond_finished = (_cond))); \ + if (cond_finished) \ + break; \ +} while (1) + +typedef bool (*move_pred_fn)(struct bch_fs *, void *, struct bkey_s_c, + struct bch_io_opts *, struct data_update_opts *); + +void bch2_moving_ctxt_exit(struct moving_context *); +void bch2_moving_ctxt_init(struct moving_context *, struct bch_fs *, + struct bch_ratelimit *, struct bch_move_stats *, + struct write_point_specifier, bool); +struct moving_io *bch2_moving_ctxt_next_pending_write(struct moving_context *); +void bch2_moving_ctxt_do_pending_writes(struct moving_context *); +void bch2_move_ctxt_wait_for_io(struct moving_context *); +int bch2_move_ratelimit(struct moving_context *); + +/* Inodes in different snapshots may have different IO options: */ +struct snapshot_io_opts_entry { + u32 snapshot; + struct bch_io_opts io_opts; +}; + +struct per_snapshot_io_opts { + u64 cur_inum; + struct bch_io_opts fs_io_opts; + DARRAY(struct snapshot_io_opts_entry) d; +}; + +static inline void per_snapshot_io_opts_init(struct per_snapshot_io_opts *io_opts, struct bch_fs *c) +{ + memset(io_opts, 0, sizeof(*io_opts)); + io_opts->fs_io_opts = bch2_opts_to_inode_opts(c->opts); +} + +static inline void per_snapshot_io_opts_exit(struct per_snapshot_io_opts *io_opts) +{ + darray_exit(&io_opts->d); +} + +struct bch_io_opts *bch2_move_get_io_opts(struct btree_trans *, + struct per_snapshot_io_opts *, struct bkey_s_c); +int bch2_move_get_io_opts_one(struct btree_trans *, struct bch_io_opts *, struct bkey_s_c); + +int bch2_scan_old_btree_nodes(struct bch_fs *, struct bch_move_stats *); + +int bch2_move_extent(struct moving_context *, + struct move_bucket_in_flight *, + struct btree_iter *, + struct bkey_s_c, + struct bch_io_opts, + struct data_update_opts); + +int __bch2_move_data(struct moving_context *, + struct bbpos, + struct bbpos, + move_pred_fn, void *); +int bch2_move_data(struct bch_fs *, + struct bbpos start, + struct bbpos end, + struct bch_ratelimit *, + struct bch_move_stats *, + struct write_point_specifier, + bool, + move_pred_fn, void *); + +int __bch2_evacuate_bucket(struct moving_context *, + struct move_bucket_in_flight *, + struct bpos, int, + struct data_update_opts); +int bch2_evacuate_bucket(struct bch_fs *, struct bpos, int, + struct data_update_opts, + struct bch_ratelimit *, + struct bch_move_stats *, + struct write_point_specifier, + bool); +int bch2_data_job(struct bch_fs *, + struct bch_move_stats *, + struct bch_ioctl_data); + +void bch2_move_stats_to_text(struct printbuf *, struct bch_move_stats *); +void bch2_move_stats_exit(struct bch_move_stats *, struct bch_fs *); +void bch2_move_stats_init(struct bch_move_stats *, char *); + +void bch2_fs_moving_ctxts_to_text(struct printbuf *, struct bch_fs *); + +void bch2_fs_move_init(struct bch_fs *); + +#endif /* _BCACHEFS_MOVE_H */ diff --git a/fs/bcachefs/move_types.h b/fs/bcachefs/move_types.h new file mode 100644 index 000000000000..e22841ef31e4 --- /dev/null +++ b/fs/bcachefs/move_types.h @@ -0,0 +1,36 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_MOVE_TYPES_H +#define _BCACHEFS_MOVE_TYPES_H + +#include "bbpos_types.h" + +struct bch_move_stats { + enum bch_data_type data_type; + struct bbpos pos; + char name[32]; + + atomic64_t keys_moved; + atomic64_t keys_raced; + atomic64_t sectors_seen; + atomic64_t sectors_moved; + atomic64_t sectors_raced; +}; + +struct move_bucket_key { + struct bpos bucket; + u8 gen; +}; + +struct move_bucket { + struct move_bucket_key k; + unsigned sectors; +}; + +struct move_bucket_in_flight { + struct move_bucket_in_flight *next; + struct rhash_head hash; + struct move_bucket bucket; + atomic_t count; +}; + +#endif /* _BCACHEFS_MOVE_TYPES_H */ diff --git a/fs/bcachefs/movinggc.c b/fs/bcachefs/movinggc.c new file mode 100644 index 000000000000..a84e79f79e5e --- /dev/null +++ b/fs/bcachefs/movinggc.c @@ -0,0 +1,431 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Moving/copying garbage collector + * + * Copyright 2012 Google, Inc. + */ + +#include "bcachefs.h" +#include "alloc_background.h" +#include "alloc_foreground.h" +#include "btree_iter.h" +#include "btree_update.h" +#include "btree_write_buffer.h" +#include "buckets.h" +#include "clock.h" +#include "errcode.h" +#include "error.h" +#include "lru.h" +#include "move.h" +#include "movinggc.h" +#include "trace.h" + +#include <linux/freezer.h> +#include <linux/kthread.h> +#include <linux/math64.h> +#include <linux/sched/task.h> +#include <linux/wait.h> + +struct buckets_in_flight { + struct rhashtable table; + struct move_bucket_in_flight *first; + struct move_bucket_in_flight *last; + size_t nr; + size_t sectors; +}; + +static const struct rhashtable_params bch_move_bucket_params = { + .head_offset = offsetof(struct move_bucket_in_flight, hash), + .key_offset = offsetof(struct move_bucket_in_flight, bucket.k), + .key_len = sizeof(struct move_bucket_key), +}; + +static struct move_bucket_in_flight * +move_bucket_in_flight_add(struct buckets_in_flight *list, struct move_bucket b) +{ + struct move_bucket_in_flight *new = kzalloc(sizeof(*new), GFP_KERNEL); + int ret; + + if (!new) + return ERR_PTR(-ENOMEM); + + new->bucket = b; + + ret = rhashtable_lookup_insert_fast(&list->table, &new->hash, + bch_move_bucket_params); + if (ret) { + kfree(new); + return ERR_PTR(ret); + } + + if (!list->first) + list->first = new; + else + list->last->next = new; + + list->last = new; + list->nr++; + list->sectors += b.sectors; + return new; +} + +static int bch2_bucket_is_movable(struct btree_trans *trans, + struct move_bucket *b, u64 time) +{ + struct btree_iter iter; + struct bkey_s_c k; + struct bch_alloc_v4 _a; + const struct bch_alloc_v4 *a; + int ret; + + if (bch2_bucket_is_open(trans->c, + b->k.bucket.inode, + b->k.bucket.offset)) + return 0; + + k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_alloc, + b->k.bucket, BTREE_ITER_CACHED); + ret = bkey_err(k); + if (ret) + return ret; + + a = bch2_alloc_to_v4(k, &_a); + b->k.gen = a->gen; + b->sectors = a->dirty_sectors; + + ret = data_type_movable(a->data_type) && + a->fragmentation_lru && + a->fragmentation_lru <= time; + + bch2_trans_iter_exit(trans, &iter); + return ret; +} + +static void move_buckets_wait(struct moving_context *ctxt, + struct buckets_in_flight *list, + bool flush) +{ + struct move_bucket_in_flight *i; + int ret; + + while ((i = list->first)) { + if (flush) + move_ctxt_wait_event(ctxt, !atomic_read(&i->count)); + + if (atomic_read(&i->count)) + break; + + list->first = i->next; + if (!list->first) + list->last = NULL; + + list->nr--; + list->sectors -= i->bucket.sectors; + + ret = rhashtable_remove_fast(&list->table, &i->hash, + bch_move_bucket_params); + BUG_ON(ret); + kfree(i); + } + + bch2_trans_unlock_long(ctxt->trans); +} + +static bool bucket_in_flight(struct buckets_in_flight *list, + struct move_bucket_key k) +{ + return rhashtable_lookup_fast(&list->table, &k, bch_move_bucket_params); +} + +typedef DARRAY(struct move_bucket) move_buckets; + +static int bch2_copygc_get_buckets(struct moving_context *ctxt, + struct buckets_in_flight *buckets_in_flight, + move_buckets *buckets) +{ + struct btree_trans *trans = ctxt->trans; + struct bch_fs *c = trans->c; + struct btree_iter iter; + struct bkey_s_c k; + size_t nr_to_get = max_t(size_t, 16U, buckets_in_flight->nr / 4); + size_t saw = 0, in_flight = 0, not_movable = 0, sectors = 0; + int ret; + + move_buckets_wait(ctxt, buckets_in_flight, false); + + ret = bch2_btree_write_buffer_flush(trans); + if (bch2_fs_fatal_err_on(ret, c, "%s: error %s from bch2_btree_write_buffer_flush()", + __func__, bch2_err_str(ret))) + return ret; + + ret = for_each_btree_key2_upto(trans, iter, BTREE_ID_lru, + lru_pos(BCH_LRU_FRAGMENTATION_START, 0, 0), + lru_pos(BCH_LRU_FRAGMENTATION_START, U64_MAX, LRU_TIME_MAX), + 0, k, ({ + struct move_bucket b = { .k.bucket = u64_to_bucket(k.k->p.offset) }; + int ret2 = 0; + + saw++; + + if (!bch2_bucket_is_movable(trans, &b, lru_pos_time(k.k->p))) + not_movable++; + else if (bucket_in_flight(buckets_in_flight, b.k)) + in_flight++; + else { + ret2 = darray_push(buckets, b) ?: buckets->nr >= nr_to_get; + if (ret2 >= 0) + sectors += b.sectors; + } + ret2; + })); + + pr_debug("have: %zu (%zu) saw %zu in flight %zu not movable %zu got %zu (%zu)/%zu buckets ret %i", + buckets_in_flight->nr, buckets_in_flight->sectors, + saw, in_flight, not_movable, buckets->nr, sectors, nr_to_get, ret); + + return ret < 0 ? ret : 0; +} + +noinline +static int bch2_copygc(struct moving_context *ctxt, + struct buckets_in_flight *buckets_in_flight, + bool *did_work) +{ + struct btree_trans *trans = ctxt->trans; + struct bch_fs *c = trans->c; + struct data_update_opts data_opts = { + .btree_insert_flags = BCH_WATERMARK_copygc, + }; + move_buckets buckets = { 0 }; + struct move_bucket_in_flight *f; + struct move_bucket *i; + u64 moved = atomic64_read(&ctxt->stats->sectors_moved); + int ret = 0; + + ret = bch2_copygc_get_buckets(ctxt, buckets_in_flight, &buckets); + if (ret) + goto err; + + darray_for_each(buckets, i) { + if (kthread_should_stop() || freezing(current)) + break; + + f = move_bucket_in_flight_add(buckets_in_flight, *i); + ret = PTR_ERR_OR_ZERO(f); + if (ret == -EEXIST) { /* rare race: copygc_get_buckets returned same bucket more than once */ + ret = 0; + continue; + } + if (ret == -ENOMEM) { /* flush IO, continue later */ + ret = 0; + break; + } + + ret = __bch2_evacuate_bucket(ctxt, f, f->bucket.k.bucket, + f->bucket.k.gen, data_opts); + if (ret) + goto err; + + *did_work = true; + } +err: + darray_exit(&buckets); + + /* no entries in LRU btree found, or got to end: */ + if (bch2_err_matches(ret, ENOENT)) + ret = 0; + + if (ret < 0 && !bch2_err_matches(ret, EROFS)) + bch_err_msg(c, ret, "from bch2_move_data()"); + + moved = atomic64_read(&ctxt->stats->sectors_moved) - moved; + trace_and_count(c, copygc, c, moved, 0, 0, 0); + return ret; +} + +/* + * Copygc runs when the amount of fragmented data is above some arbitrary + * threshold: + * + * The threshold at the limit - when the device is full - is the amount of space + * we reserved in bch2_recalc_capacity; we can't have more than that amount of + * disk space stranded due to fragmentation and store everything we have + * promised to store. + * + * But we don't want to be running copygc unnecessarily when the device still + * has plenty of free space - rather, we want copygc to smoothly run every so + * often and continually reduce the amount of fragmented space as the device + * fills up. So, we increase the threshold by half the current free space. + */ +unsigned long bch2_copygc_wait_amount(struct bch_fs *c) +{ + struct bch_dev *ca; + unsigned dev_idx; + s64 wait = S64_MAX, fragmented_allowed, fragmented; + unsigned i; + + for_each_rw_member(ca, c, dev_idx) { + struct bch_dev_usage usage = bch2_dev_usage_read(ca); + + fragmented_allowed = ((__dev_buckets_available(ca, usage, BCH_WATERMARK_stripe) * + ca->mi.bucket_size) >> 1); + fragmented = 0; + + for (i = 0; i < BCH_DATA_NR; i++) + if (data_type_movable(i)) + fragmented += usage.d[i].fragmented; + + wait = min(wait, max(0LL, fragmented_allowed - fragmented)); + } + + return wait; +} + +void bch2_copygc_wait_to_text(struct printbuf *out, struct bch_fs *c) +{ + prt_printf(out, "Currently waiting for: "); + prt_human_readable_u64(out, max(0LL, c->copygc_wait - + atomic64_read(&c->io_clock[WRITE].now)) << 9); + prt_newline(out); + + prt_printf(out, "Currently waiting since: "); + prt_human_readable_u64(out, max(0LL, + atomic64_read(&c->io_clock[WRITE].now) - + c->copygc_wait_at) << 9); + prt_newline(out); + + prt_printf(out, "Currently calculated wait: "); + prt_human_readable_u64(out, bch2_copygc_wait_amount(c)); + prt_newline(out); +} + +static int bch2_copygc_thread(void *arg) +{ + struct bch_fs *c = arg; + struct moving_context ctxt; + struct bch_move_stats move_stats; + struct io_clock *clock = &c->io_clock[WRITE]; + struct buckets_in_flight *buckets; + u64 last, wait; + int ret = 0; + + buckets = kzalloc(sizeof(struct buckets_in_flight), GFP_KERNEL); + if (!buckets) + return -ENOMEM; + ret = rhashtable_init(&buckets->table, &bch_move_bucket_params); + if (ret) { + kfree(buckets); + bch_err_msg(c, ret, "allocating copygc buckets in flight"); + return ret; + } + + set_freezable(); + + bch2_move_stats_init(&move_stats, "copygc"); + bch2_moving_ctxt_init(&ctxt, c, NULL, &move_stats, + writepoint_ptr(&c->copygc_write_point), + false); + + while (!ret && !kthread_should_stop()) { + bool did_work = false; + + bch2_trans_unlock_long(ctxt.trans); + cond_resched(); + + if (!c->copy_gc_enabled) { + move_buckets_wait(&ctxt, buckets, true); + kthread_wait_freezable(c->copy_gc_enabled); + } + + if (unlikely(freezing(current))) { + move_buckets_wait(&ctxt, buckets, true); + __refrigerator(false); + continue; + } + + last = atomic64_read(&clock->now); + wait = bch2_copygc_wait_amount(c); + + if (wait > clock->max_slop) { + c->copygc_wait_at = last; + c->copygc_wait = last + wait; + move_buckets_wait(&ctxt, buckets, true); + trace_and_count(c, copygc_wait, c, wait, last + wait); + bch2_kthread_io_clock_wait(clock, last + wait, + MAX_SCHEDULE_TIMEOUT); + continue; + } + + c->copygc_wait = 0; + + c->copygc_running = true; + ret = bch2_copygc(&ctxt, buckets, &did_work); + c->copygc_running = false; + + wake_up(&c->copygc_running_wq); + + if (!wait && !did_work) { + u64 min_member_capacity = bch2_min_rw_member_capacity(c); + + if (min_member_capacity == U64_MAX) + min_member_capacity = 128 * 2048; + + bch2_trans_unlock_long(ctxt.trans); + bch2_kthread_io_clock_wait(clock, last + (min_member_capacity >> 6), + MAX_SCHEDULE_TIMEOUT); + } + } + + move_buckets_wait(&ctxt, buckets, true); + + rhashtable_destroy(&buckets->table); + kfree(buckets); + bch2_moving_ctxt_exit(&ctxt); + bch2_move_stats_exit(&move_stats, c); + + return 0; +} + +void bch2_copygc_stop(struct bch_fs *c) +{ + if (c->copygc_thread) { + kthread_stop(c->copygc_thread); + put_task_struct(c->copygc_thread); + } + c->copygc_thread = NULL; +} + +int bch2_copygc_start(struct bch_fs *c) +{ + struct task_struct *t; + int ret; + + if (c->copygc_thread) + return 0; + + if (c->opts.nochanges) + return 0; + + if (bch2_fs_init_fault("copygc_start")) + return -ENOMEM; + + t = kthread_create(bch2_copygc_thread, c, "bch-copygc/%s", c->name); + ret = PTR_ERR_OR_ZERO(t); + if (ret) { + bch_err_msg(c, ret, "creating copygc thread"); + return ret; + } + + get_task_struct(t); + + c->copygc_thread = t; + wake_up_process(c->copygc_thread); + + return 0; +} + +void bch2_fs_copygc_init(struct bch_fs *c) +{ + init_waitqueue_head(&c->copygc_running_wq); + c->copygc_running = false; +} diff --git a/fs/bcachefs/movinggc.h b/fs/bcachefs/movinggc.h new file mode 100644 index 000000000000..ea181fef5bc9 --- /dev/null +++ b/fs/bcachefs/movinggc.h @@ -0,0 +1,12 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_MOVINGGC_H +#define _BCACHEFS_MOVINGGC_H + +unsigned long bch2_copygc_wait_amount(struct bch_fs *); +void bch2_copygc_wait_to_text(struct printbuf *, struct bch_fs *); + +void bch2_copygc_stop(struct bch_fs *); +int bch2_copygc_start(struct bch_fs *); +void bch2_fs_copygc_init(struct bch_fs *); + +#endif /* _BCACHEFS_MOVINGGC_H */ diff --git a/fs/bcachefs/nocow_locking.c b/fs/bcachefs/nocow_locking.c new file mode 100644 index 000000000000..3c21981a4a1c --- /dev/null +++ b/fs/bcachefs/nocow_locking.c @@ -0,0 +1,144 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" +#include "bkey_methods.h" +#include "nocow_locking.h" +#include "util.h" + +#include <linux/closure.h> + +bool bch2_bucket_nocow_is_locked(struct bucket_nocow_lock_table *t, struct bpos bucket) +{ + u64 dev_bucket = bucket_to_u64(bucket); + struct nocow_lock_bucket *l = bucket_nocow_lock(t, dev_bucket); + unsigned i; + + for (i = 0; i < ARRAY_SIZE(l->b); i++) + if (l->b[i] == dev_bucket && atomic_read(&l->l[i])) + return true; + return false; +} + +#define sign(v) (v < 0 ? -1 : v > 0 ? 1 : 0) + +void bch2_bucket_nocow_unlock(struct bucket_nocow_lock_table *t, struct bpos bucket, int flags) +{ + u64 dev_bucket = bucket_to_u64(bucket); + struct nocow_lock_bucket *l = bucket_nocow_lock(t, dev_bucket); + int lock_val = flags ? 1 : -1; + unsigned i; + + for (i = 0; i < ARRAY_SIZE(l->b); i++) + if (l->b[i] == dev_bucket) { + int v = atomic_sub_return(lock_val, &l->l[i]); + + BUG_ON(v && sign(v) != lock_val); + if (!v) + closure_wake_up(&l->wait); + return; + } + + BUG(); +} + +bool __bch2_bucket_nocow_trylock(struct nocow_lock_bucket *l, + u64 dev_bucket, int flags) +{ + int v, lock_val = flags ? 1 : -1; + unsigned i; + + spin_lock(&l->lock); + + for (i = 0; i < ARRAY_SIZE(l->b); i++) + if (l->b[i] == dev_bucket) + goto got_entry; + + for (i = 0; i < ARRAY_SIZE(l->b); i++) + if (!atomic_read(&l->l[i])) { + l->b[i] = dev_bucket; + goto take_lock; + } +fail: + spin_unlock(&l->lock); + return false; +got_entry: + v = atomic_read(&l->l[i]); + if (lock_val > 0 ? v < 0 : v > 0) + goto fail; +take_lock: + v = atomic_read(&l->l[i]); + /* Overflow? */ + if (v && sign(v + lock_val) != sign(v)) + goto fail; + + atomic_add(lock_val, &l->l[i]); + spin_unlock(&l->lock); + return true; +} + +void __bch2_bucket_nocow_lock(struct bucket_nocow_lock_table *t, + struct nocow_lock_bucket *l, + u64 dev_bucket, int flags) +{ + if (!__bch2_bucket_nocow_trylock(l, dev_bucket, flags)) { + struct bch_fs *c = container_of(t, struct bch_fs, nocow_locks); + u64 start_time = local_clock(); + + __closure_wait_event(&l->wait, __bch2_bucket_nocow_trylock(l, dev_bucket, flags)); + bch2_time_stats_update(&c->times[BCH_TIME_nocow_lock_contended], start_time); + } +} + +void bch2_nocow_locks_to_text(struct printbuf *out, struct bucket_nocow_lock_table *t) + +{ + unsigned i, nr_zero = 0; + struct nocow_lock_bucket *l; + + for (l = t->l; l < t->l + ARRAY_SIZE(t->l); l++) { + unsigned v = 0; + + for (i = 0; i < ARRAY_SIZE(l->l); i++) + v |= atomic_read(&l->l[i]); + + if (!v) { + nr_zero++; + continue; + } + + if (nr_zero) + prt_printf(out, "(%u empty entries)\n", nr_zero); + nr_zero = 0; + + for (i = 0; i < ARRAY_SIZE(l->l); i++) { + int v = atomic_read(&l->l[i]); + if (v) { + bch2_bpos_to_text(out, u64_to_bucket(l->b[i])); + prt_printf(out, ": %s %u ", v < 0 ? "copy" : "update", abs(v)); + } + } + prt_newline(out); + } + + if (nr_zero) + prt_printf(out, "(%u empty entries)\n", nr_zero); +} + +void bch2_fs_nocow_locking_exit(struct bch_fs *c) +{ + struct bucket_nocow_lock_table *t = &c->nocow_locks; + + for (struct nocow_lock_bucket *l = t->l; l < t->l + ARRAY_SIZE(t->l); l++) + for (unsigned j = 0; j < ARRAY_SIZE(l->l); j++) + BUG_ON(atomic_read(&l->l[j])); +} + +int bch2_fs_nocow_locking_init(struct bch_fs *c) +{ + struct bucket_nocow_lock_table *t = &c->nocow_locks; + + for (struct nocow_lock_bucket *l = t->l; l < t->l + ARRAY_SIZE(t->l); l++) + spin_lock_init(&l->lock); + + return 0; +} diff --git a/fs/bcachefs/nocow_locking.h b/fs/bcachefs/nocow_locking.h new file mode 100644 index 000000000000..f9d6a426a960 --- /dev/null +++ b/fs/bcachefs/nocow_locking.h @@ -0,0 +1,50 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_NOCOW_LOCKING_H +#define _BCACHEFS_NOCOW_LOCKING_H + +#include "bcachefs.h" +#include "alloc_background.h" +#include "nocow_locking_types.h" + +#include <linux/hash.h> + +static inline struct nocow_lock_bucket *bucket_nocow_lock(struct bucket_nocow_lock_table *t, + u64 dev_bucket) +{ + unsigned h = hash_64(dev_bucket, BUCKET_NOCOW_LOCKS_BITS); + + return t->l + (h & (BUCKET_NOCOW_LOCKS - 1)); +} + +#define BUCKET_NOCOW_LOCK_UPDATE (1 << 0) + +bool bch2_bucket_nocow_is_locked(struct bucket_nocow_lock_table *, struct bpos); +void bch2_bucket_nocow_unlock(struct bucket_nocow_lock_table *, struct bpos, int); +bool __bch2_bucket_nocow_trylock(struct nocow_lock_bucket *, u64, int); +void __bch2_bucket_nocow_lock(struct bucket_nocow_lock_table *, + struct nocow_lock_bucket *, u64, int); + +static inline void bch2_bucket_nocow_lock(struct bucket_nocow_lock_table *t, + struct bpos bucket, int flags) +{ + u64 dev_bucket = bucket_to_u64(bucket); + struct nocow_lock_bucket *l = bucket_nocow_lock(t, dev_bucket); + + __bch2_bucket_nocow_lock(t, l, dev_bucket, flags); +} + +static inline bool bch2_bucket_nocow_trylock(struct bucket_nocow_lock_table *t, + struct bpos bucket, int flags) +{ + u64 dev_bucket = bucket_to_u64(bucket); + struct nocow_lock_bucket *l = bucket_nocow_lock(t, dev_bucket); + + return __bch2_bucket_nocow_trylock(l, dev_bucket, flags); +} + +void bch2_nocow_locks_to_text(struct printbuf *, struct bucket_nocow_lock_table *); + +void bch2_fs_nocow_locking_exit(struct bch_fs *); +int bch2_fs_nocow_locking_init(struct bch_fs *); + +#endif /* _BCACHEFS_NOCOW_LOCKING_H */ diff --git a/fs/bcachefs/nocow_locking_types.h b/fs/bcachefs/nocow_locking_types.h new file mode 100644 index 000000000000..bd12bf677924 --- /dev/null +++ b/fs/bcachefs/nocow_locking_types.h @@ -0,0 +1,20 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_NOCOW_LOCKING_TYPES_H +#define _BCACHEFS_NOCOW_LOCKING_TYPES_H + +#define BUCKET_NOCOW_LOCKS_BITS 10 +#define BUCKET_NOCOW_LOCKS (1U << BUCKET_NOCOW_LOCKS_BITS) + +struct nocow_lock_bucket { + struct closure_waitlist wait; + spinlock_t lock; + u64 b[4]; + atomic_t l[4]; +} __aligned(SMP_CACHE_BYTES); + +struct bucket_nocow_lock_table { + struct nocow_lock_bucket l[BUCKET_NOCOW_LOCKS]; +}; + +#endif /* _BCACHEFS_NOCOW_LOCKING_TYPES_H */ + diff --git a/fs/bcachefs/opts.c b/fs/bcachefs/opts.c new file mode 100644 index 000000000000..8dd4046cca41 --- /dev/null +++ b/fs/bcachefs/opts.c @@ -0,0 +1,602 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <linux/kernel.h> + +#include "bcachefs.h" +#include "compress.h" +#include "disk_groups.h" +#include "error.h" +#include "opts.h" +#include "super-io.h" +#include "util.h" + +#define x(t, n, ...) [n] = #t, + +const char * const bch2_error_actions[] = { + BCH_ERROR_ACTIONS() + NULL +}; + +const char * const bch2_fsck_fix_opts[] = { + BCH_FIX_ERRORS_OPTS() + NULL +}; + +const char * const bch2_version_upgrade_opts[] = { + BCH_VERSION_UPGRADE_OPTS() + NULL +}; + +const char * const bch2_sb_features[] = { + BCH_SB_FEATURES() + NULL +}; + +const char * const bch2_sb_compat[] = { + BCH_SB_COMPAT() + NULL +}; + +const char * const __bch2_btree_ids[] = { + BCH_BTREE_IDS() + NULL +}; + +const char * const bch2_csum_types[] = { + BCH_CSUM_TYPES() + NULL +}; + +const char * const bch2_csum_opts[] = { + BCH_CSUM_OPTS() + NULL +}; + +const char * const bch2_compression_types[] = { + BCH_COMPRESSION_TYPES() + NULL +}; + +const char * const bch2_compression_opts[] = { + BCH_COMPRESSION_OPTS() + NULL +}; + +const char * const bch2_str_hash_types[] = { + BCH_STR_HASH_TYPES() + NULL +}; + +const char * const bch2_str_hash_opts[] = { + BCH_STR_HASH_OPTS() + NULL +}; + +const char * const bch2_data_types[] = { + BCH_DATA_TYPES() + NULL +}; + +const char * const bch2_member_states[] = { + BCH_MEMBER_STATES() + NULL +}; + +const char * const bch2_jset_entry_types[] = { + BCH_JSET_ENTRY_TYPES() + NULL +}; + +const char * const bch2_fs_usage_types[] = { + BCH_FS_USAGE_TYPES() + NULL +}; + +#undef x + +static int bch2_opt_fix_errors_parse(struct bch_fs *c, const char *val, u64 *res, + struct printbuf *err) +{ + if (!val) { + *res = FSCK_FIX_yes; + } else { + int ret = match_string(bch2_fsck_fix_opts, -1, val); + + if (ret < 0 && err) + prt_str(err, "fix_errors: invalid selection"); + if (ret < 0) + return ret; + *res = ret; + } + + return 0; +} + +static void bch2_opt_fix_errors_to_text(struct printbuf *out, + struct bch_fs *c, + struct bch_sb *sb, + u64 v) +{ + prt_str(out, bch2_fsck_fix_opts[v]); +} + +#define bch2_opt_fix_errors (struct bch_opt_fn) { \ + .parse = bch2_opt_fix_errors_parse, \ + .to_text = bch2_opt_fix_errors_to_text, \ +} + +const char * const bch2_d_types[BCH_DT_MAX] = { + [DT_UNKNOWN] = "unknown", + [DT_FIFO] = "fifo", + [DT_CHR] = "chr", + [DT_DIR] = "dir", + [DT_BLK] = "blk", + [DT_REG] = "reg", + [DT_LNK] = "lnk", + [DT_SOCK] = "sock", + [DT_WHT] = "whiteout", + [DT_SUBVOL] = "subvol", +}; + +u64 BCH2_NO_SB_OPT(const struct bch_sb *sb) +{ + BUG(); +} + +void SET_BCH2_NO_SB_OPT(struct bch_sb *sb, u64 v) +{ + BUG(); +} + +void bch2_opts_apply(struct bch_opts *dst, struct bch_opts src) +{ +#define x(_name, ...) \ + if (opt_defined(src, _name)) \ + opt_set(*dst, _name, src._name); + + BCH_OPTS() +#undef x +} + +bool bch2_opt_defined_by_id(const struct bch_opts *opts, enum bch_opt_id id) +{ + switch (id) { +#define x(_name, ...) \ + case Opt_##_name: \ + return opt_defined(*opts, _name); + BCH_OPTS() +#undef x + default: + BUG(); + } +} + +u64 bch2_opt_get_by_id(const struct bch_opts *opts, enum bch_opt_id id) +{ + switch (id) { +#define x(_name, ...) \ + case Opt_##_name: \ + return opts->_name; + BCH_OPTS() +#undef x + default: + BUG(); + } +} + +void bch2_opt_set_by_id(struct bch_opts *opts, enum bch_opt_id id, u64 v) +{ + switch (id) { +#define x(_name, ...) \ + case Opt_##_name: \ + opt_set(*opts, _name, v); \ + break; + BCH_OPTS() +#undef x + default: + BUG(); + } +} + +const struct bch_option bch2_opt_table[] = { +#define OPT_BOOL() .type = BCH_OPT_BOOL, .min = 0, .max = 2 +#define OPT_UINT(_min, _max) .type = BCH_OPT_UINT, \ + .min = _min, .max = _max +#define OPT_STR(_choices) .type = BCH_OPT_STR, \ + .min = 0, .max = ARRAY_SIZE(_choices), \ + .choices = _choices +#define OPT_FN(_fn) .type = BCH_OPT_FN, .fn = _fn + +#define x(_name, _bits, _flags, _type, _sb_opt, _default, _hint, _help) \ + [Opt_##_name] = { \ + .attr = { \ + .name = #_name, \ + .mode = (_flags) & OPT_RUNTIME ? 0644 : 0444, \ + }, \ + .flags = _flags, \ + .hint = _hint, \ + .help = _help, \ + .get_sb = _sb_opt, \ + .set_sb = SET_##_sb_opt, \ + _type \ + }, + + BCH_OPTS() +#undef x +}; + +int bch2_opt_lookup(const char *name) +{ + const struct bch_option *i; + + for (i = bch2_opt_table; + i < bch2_opt_table + ARRAY_SIZE(bch2_opt_table); + i++) + if (!strcmp(name, i->attr.name)) + return i - bch2_opt_table; + + return -1; +} + +struct synonym { + const char *s1, *s2; +}; + +static const struct synonym bch_opt_synonyms[] = { + { "quota", "usrquota" }, +}; + +static int bch2_mount_opt_lookup(const char *name) +{ + const struct synonym *i; + + for (i = bch_opt_synonyms; + i < bch_opt_synonyms + ARRAY_SIZE(bch_opt_synonyms); + i++) + if (!strcmp(name, i->s1)) + name = i->s2; + + return bch2_opt_lookup(name); +} + +int bch2_opt_validate(const struct bch_option *opt, u64 v, struct printbuf *err) +{ + if (v < opt->min) { + if (err) + prt_printf(err, "%s: too small (min %llu)", + opt->attr.name, opt->min); + return -BCH_ERR_ERANGE_option_too_small; + } + + if (opt->max && v >= opt->max) { + if (err) + prt_printf(err, "%s: too big (max %llu)", + opt->attr.name, opt->max); + return -BCH_ERR_ERANGE_option_too_big; + } + + if ((opt->flags & OPT_SB_FIELD_SECTORS) && (v & 511)) { + if (err) + prt_printf(err, "%s: not a multiple of 512", + opt->attr.name); + return -EINVAL; + } + + if ((opt->flags & OPT_MUST_BE_POW_2) && !is_power_of_2(v)) { + if (err) + prt_printf(err, "%s: must be a power of two", + opt->attr.name); + return -EINVAL; + } + + if (opt->fn.validate) + return opt->fn.validate(v, err); + + return 0; +} + +int bch2_opt_parse(struct bch_fs *c, + const struct bch_option *opt, + const char *val, u64 *res, + struct printbuf *err) +{ + ssize_t ret; + + switch (opt->type) { + case BCH_OPT_BOOL: + if (val) { + ret = kstrtou64(val, 10, res); + } else { + ret = 0; + *res = 1; + } + + if (ret < 0 || (*res != 0 && *res != 1)) { + if (err) + prt_printf(err, "%s: must be bool", opt->attr.name); + return ret; + } + break; + case BCH_OPT_UINT: + if (!val) { + prt_printf(err, "%s: required value", + opt->attr.name); + return -EINVAL; + } + + ret = opt->flags & OPT_HUMAN_READABLE + ? bch2_strtou64_h(val, res) + : kstrtou64(val, 10, res); + if (ret < 0) { + if (err) + prt_printf(err, "%s: must be a number", + opt->attr.name); + return ret; + } + break; + case BCH_OPT_STR: + if (!val) { + prt_printf(err, "%s: required value", + opt->attr.name); + return -EINVAL; + } + + ret = match_string(opt->choices, -1, val); + if (ret < 0) { + if (err) + prt_printf(err, "%s: invalid selection", + opt->attr.name); + return ret; + } + + *res = ret; + break; + case BCH_OPT_FN: + ret = opt->fn.parse(c, val, res, err); + if (ret < 0) { + if (err) + prt_printf(err, "%s: parse error", + opt->attr.name); + return ret; + } + } + + return bch2_opt_validate(opt, *res, err); +} + +void bch2_opt_to_text(struct printbuf *out, + struct bch_fs *c, struct bch_sb *sb, + const struct bch_option *opt, u64 v, + unsigned flags) +{ + if (flags & OPT_SHOW_MOUNT_STYLE) { + if (opt->type == BCH_OPT_BOOL) { + prt_printf(out, "%s%s", + v ? "" : "no", + opt->attr.name); + return; + } + + prt_printf(out, "%s=", opt->attr.name); + } + + switch (opt->type) { + case BCH_OPT_BOOL: + case BCH_OPT_UINT: + if (opt->flags & OPT_HUMAN_READABLE) + prt_human_readable_u64(out, v); + else + prt_printf(out, "%lli", v); + break; + case BCH_OPT_STR: + if (flags & OPT_SHOW_FULL_LIST) + prt_string_option(out, opt->choices, v); + else + prt_str(out, opt->choices[v]); + break; + case BCH_OPT_FN: + opt->fn.to_text(out, c, sb, v); + break; + default: + BUG(); + } +} + +int bch2_opt_check_may_set(struct bch_fs *c, int id, u64 v) +{ + int ret = 0; + + switch (id) { + case Opt_compression: + case Opt_background_compression: + ret = bch2_check_set_has_compressed_data(c, v); + break; + case Opt_erasure_code: + if (v) + bch2_check_set_feature(c, BCH_FEATURE_ec); + break; + } + + return ret; +} + +int bch2_opts_check_may_set(struct bch_fs *c) +{ + unsigned i; + int ret; + + for (i = 0; i < bch2_opts_nr; i++) { + ret = bch2_opt_check_may_set(c, i, + bch2_opt_get_by_id(&c->opts, i)); + if (ret) + return ret; + } + + return 0; +} + +int bch2_parse_mount_opts(struct bch_fs *c, struct bch_opts *opts, + char *options) +{ + char *copied_opts, *copied_opts_start; + char *opt, *name, *val; + int ret, id; + struct printbuf err = PRINTBUF; + u64 v; + + if (!options) + return 0; + + /* + * sys_fsconfig() is now occasionally providing us with option lists + * starting with a comma - weird. + */ + if (*options == ',') + options++; + + copied_opts = kstrdup(options, GFP_KERNEL); + if (!copied_opts) + return -1; + copied_opts_start = copied_opts; + + while ((opt = strsep(&copied_opts, ",")) != NULL) { + name = strsep(&opt, "="); + val = opt; + + id = bch2_mount_opt_lookup(name); + + /* Check for the form "noopt", negation of a boolean opt: */ + if (id < 0 && + !val && + !strncmp("no", name, 2)) { + id = bch2_mount_opt_lookup(name + 2); + val = "0"; + } + + /* Unknown options are ignored: */ + if (id < 0) + continue; + + if (!(bch2_opt_table[id].flags & OPT_MOUNT)) + goto bad_opt; + + if (id == Opt_acl && + !IS_ENABLED(CONFIG_BCACHEFS_POSIX_ACL)) + goto bad_opt; + + if ((id == Opt_usrquota || + id == Opt_grpquota) && + !IS_ENABLED(CONFIG_BCACHEFS_QUOTA)) + goto bad_opt; + + ret = bch2_opt_parse(c, &bch2_opt_table[id], val, &v, &err); + if (ret < 0) + goto bad_val; + + bch2_opt_set_by_id(opts, id, v); + } + + ret = 0; + goto out; + +bad_opt: + pr_err("Bad mount option %s", name); + ret = -1; + goto out; +bad_val: + pr_err("Invalid mount option %s", err.buf); + ret = -1; + goto out; +out: + kfree(copied_opts_start); + printbuf_exit(&err); + return ret; +} + +u64 bch2_opt_from_sb(struct bch_sb *sb, enum bch_opt_id id) +{ + const struct bch_option *opt = bch2_opt_table + id; + u64 v; + + v = opt->get_sb(sb); + + if (opt->flags & OPT_SB_FIELD_ILOG2) + v = 1ULL << v; + + if (opt->flags & OPT_SB_FIELD_SECTORS) + v <<= 9; + + return v; +} + +/* + * Initial options from superblock - here we don't want any options undefined, + * any options the superblock doesn't specify are set to 0: + */ +int bch2_opts_from_sb(struct bch_opts *opts, struct bch_sb *sb) +{ + unsigned id; + + for (id = 0; id < bch2_opts_nr; id++) { + const struct bch_option *opt = bch2_opt_table + id; + + if (opt->get_sb == BCH2_NO_SB_OPT) + continue; + + bch2_opt_set_by_id(opts, id, bch2_opt_from_sb(sb, id)); + } + + return 0; +} + +void __bch2_opt_set_sb(struct bch_sb *sb, const struct bch_option *opt, u64 v) +{ + if (opt->set_sb == SET_BCH2_NO_SB_OPT) + return; + + if (opt->flags & OPT_SB_FIELD_SECTORS) + v >>= 9; + + if (opt->flags & OPT_SB_FIELD_ILOG2) + v = ilog2(v); + + opt->set_sb(sb, v); +} + +void bch2_opt_set_sb(struct bch_fs *c, const struct bch_option *opt, u64 v) +{ + if (opt->set_sb == SET_BCH2_NO_SB_OPT) + return; + + mutex_lock(&c->sb_lock); + __bch2_opt_set_sb(c->disk_sb.sb, opt, v); + bch2_write_super(c); + mutex_unlock(&c->sb_lock); +} + +/* io opts: */ + +struct bch_io_opts bch2_opts_to_inode_opts(struct bch_opts src) +{ + return (struct bch_io_opts) { +#define x(_name, _bits) ._name = src._name, + BCH_INODE_OPTS() +#undef x + }; +} + +bool bch2_opt_is_inode_opt(enum bch_opt_id id) +{ + static const enum bch_opt_id inode_opt_list[] = { +#define x(_name, _bits) Opt_##_name, + BCH_INODE_OPTS() +#undef x + }; + unsigned i; + + for (i = 0; i < ARRAY_SIZE(inode_opt_list); i++) + if (inode_opt_list[i] == id) + return true; + + return false; +} diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h new file mode 100644 index 000000000000..8526f177450a --- /dev/null +++ b/fs/bcachefs/opts.h @@ -0,0 +1,564 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_OPTS_H +#define _BCACHEFS_OPTS_H + +#include <linux/bug.h> +#include <linux/log2.h> +#include <linux/string.h> +#include <linux/sysfs.h> +#include "bcachefs_format.h" + +struct bch_fs; + +extern const char * const bch2_error_actions[]; +extern const char * const bch2_fsck_fix_opts[]; +extern const char * const bch2_version_upgrade_opts[]; +extern const char * const bch2_sb_features[]; +extern const char * const bch2_sb_compat[]; +extern const char * const __bch2_btree_ids[]; +extern const char * const bch2_csum_types[]; +extern const char * const bch2_csum_opts[]; +extern const char * const bch2_compression_types[]; +extern const char * const bch2_compression_opts[]; +extern const char * const bch2_str_hash_types[]; +extern const char * const bch2_str_hash_opts[]; +extern const char * const bch2_data_types[]; +extern const char * const bch2_member_states[]; +extern const char * const bch2_jset_entry_types[]; +extern const char * const bch2_fs_usage_types[]; +extern const char * const bch2_d_types[]; + +static inline const char *bch2_d_type_str(unsigned d_type) +{ + return (d_type < BCH_DT_MAX ? bch2_d_types[d_type] : NULL) ?: "(bad d_type)"; +} + +/* + * Mount options; we also store defaults in the superblock. + * + * Also exposed via sysfs: if an option is writeable, and it's also stored in + * the superblock, changing it via sysfs (currently? might change this) also + * updates the superblock. + * + * We store options as signed integers, where -1 means undefined. This means we + * can pass the mount options to bch2_fs_alloc() as a whole struct, and then only + * apply the options from that struct that are defined. + */ + +/* dummy option, for options that aren't stored in the superblock */ +u64 BCH2_NO_SB_OPT(const struct bch_sb *); +void SET_BCH2_NO_SB_OPT(struct bch_sb *, u64); + +/* When can be set: */ +enum opt_flags { + OPT_FS = (1 << 0), /* Filesystem option */ + OPT_DEVICE = (1 << 1), /* Device option */ + OPT_INODE = (1 << 2), /* Inode option */ + OPT_FORMAT = (1 << 3), /* May be specified at format time */ + OPT_MOUNT = (1 << 4), /* May be specified at mount time */ + OPT_RUNTIME = (1 << 5), /* May be specified at runtime */ + OPT_HUMAN_READABLE = (1 << 6), + OPT_MUST_BE_POW_2 = (1 << 7), /* Must be power of 2 */ + OPT_SB_FIELD_SECTORS = (1 << 8),/* Superblock field is >> 9 of actual value */ + OPT_SB_FIELD_ILOG2 = (1 << 9), /* Superblock field is ilog2 of actual value */ +}; + +enum opt_type { + BCH_OPT_BOOL, + BCH_OPT_UINT, + BCH_OPT_STR, + BCH_OPT_FN, +}; + +struct bch_opt_fn { + int (*parse)(struct bch_fs *, const char *, u64 *, struct printbuf *); + void (*to_text)(struct printbuf *, struct bch_fs *, struct bch_sb *, u64); + int (*validate)(u64, struct printbuf *); +}; + +/** + * x(name, shortopt, type, in mem type, mode, sb_opt) + * + * @name - name of mount option, sysfs attribute, and struct bch_opts + * member + * + * @mode - when opt may be set + * + * @sb_option - name of corresponding superblock option + * + * @type - one of OPT_BOOL, OPT_UINT, OPT_STR + */ + +/* + * XXX: add fields for + * - default value + * - helptext + */ + +#ifdef __KERNEL__ +#define RATELIMIT_ERRORS_DEFAULT true +#else +#define RATELIMIT_ERRORS_DEFAULT false +#endif + +#ifdef CONFIG_BCACHEFS_DEBUG +#define BCACHEFS_VERBOSE_DEFAULT true +#else +#define BCACHEFS_VERBOSE_DEFAULT false +#endif + +#define BCH_FIX_ERRORS_OPTS() \ + x(exit, 0) \ + x(yes, 1) \ + x(no, 2) \ + x(ask, 3) + +enum fsck_err_opts { +#define x(t, n) FSCK_FIX_##t, + BCH_FIX_ERRORS_OPTS() +#undef x +}; + +#define BCH_OPTS() \ + x(block_size, u16, \ + OPT_FS|OPT_FORMAT| \ + OPT_HUMAN_READABLE|OPT_MUST_BE_POW_2|OPT_SB_FIELD_SECTORS, \ + OPT_UINT(512, 1U << 16), \ + BCH_SB_BLOCK_SIZE, 8, \ + "size", NULL) \ + x(btree_node_size, u32, \ + OPT_FS|OPT_FORMAT| \ + OPT_HUMAN_READABLE|OPT_MUST_BE_POW_2|OPT_SB_FIELD_SECTORS, \ + OPT_UINT(512, 1U << 20), \ + BCH_SB_BTREE_NODE_SIZE, 512, \ + "size", "Btree node size, default 256k") \ + x(errors, u8, \ + OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ + OPT_STR(bch2_error_actions), \ + BCH_SB_ERROR_ACTION, BCH_ON_ERROR_ro, \ + NULL, "Action to take on filesystem error") \ + x(metadata_replicas, u8, \ + OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ + OPT_UINT(1, BCH_REPLICAS_MAX), \ + BCH_SB_META_REPLICAS_WANT, 1, \ + "#", "Number of metadata replicas") \ + x(data_replicas, u8, \ + OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ + OPT_UINT(1, BCH_REPLICAS_MAX), \ + BCH_SB_DATA_REPLICAS_WANT, 1, \ + "#", "Number of data replicas") \ + x(metadata_replicas_required, u8, \ + OPT_FS|OPT_FORMAT|OPT_MOUNT, \ + OPT_UINT(1, BCH_REPLICAS_MAX), \ + BCH_SB_META_REPLICAS_REQ, 1, \ + "#", NULL) \ + x(data_replicas_required, u8, \ + OPT_FS|OPT_FORMAT|OPT_MOUNT, \ + OPT_UINT(1, BCH_REPLICAS_MAX), \ + BCH_SB_DATA_REPLICAS_REQ, 1, \ + "#", NULL) \ + x(encoded_extent_max, u32, \ + OPT_FS|OPT_FORMAT| \ + OPT_HUMAN_READABLE|OPT_MUST_BE_POW_2|OPT_SB_FIELD_SECTORS|OPT_SB_FIELD_ILOG2,\ + OPT_UINT(4096, 2U << 20), \ + BCH_SB_ENCODED_EXTENT_MAX_BITS, 64 << 10, \ + "size", "Maximum size of checksummed/compressed extents")\ + x(metadata_checksum, u8, \ + OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ + OPT_STR(bch2_csum_opts), \ + BCH_SB_META_CSUM_TYPE, BCH_CSUM_OPT_crc32c, \ + NULL, NULL) \ + x(data_checksum, u8, \ + OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ + OPT_STR(bch2_csum_opts), \ + BCH_SB_DATA_CSUM_TYPE, BCH_CSUM_OPT_crc32c, \ + NULL, NULL) \ + x(compression, u8, \ + OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ + OPT_FN(bch2_opt_compression), \ + BCH_SB_COMPRESSION_TYPE, BCH_COMPRESSION_OPT_none, \ + NULL, NULL) \ + x(background_compression, u8, \ + OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ + OPT_FN(bch2_opt_compression), \ + BCH_SB_BACKGROUND_COMPRESSION_TYPE,BCH_COMPRESSION_OPT_none, \ + NULL, NULL) \ + x(str_hash, u8, \ + OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ + OPT_STR(bch2_str_hash_opts), \ + BCH_SB_STR_HASH_TYPE, BCH_STR_HASH_OPT_siphash, \ + NULL, "Hash function for directory entries and xattrs")\ + x(metadata_target, u16, \ + OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ + OPT_FN(bch2_opt_target), \ + BCH_SB_METADATA_TARGET, 0, \ + "(target)", "Device or label for metadata writes") \ + x(foreground_target, u16, \ + OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ + OPT_FN(bch2_opt_target), \ + BCH_SB_FOREGROUND_TARGET, 0, \ + "(target)", "Device or label for foreground writes") \ + x(background_target, u16, \ + OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ + OPT_FN(bch2_opt_target), \ + BCH_SB_BACKGROUND_TARGET, 0, \ + "(target)", "Device or label to move data to in the background")\ + x(promote_target, u16, \ + OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ + OPT_FN(bch2_opt_target), \ + BCH_SB_PROMOTE_TARGET, 0, \ + "(target)", "Device or label to promote data to on read") \ + x(erasure_code, u16, \ + OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ + OPT_BOOL(), \ + BCH_SB_ERASURE_CODE, false, \ + NULL, "Enable erasure coding (DO NOT USE YET)") \ + x(inodes_32bit, u8, \ + OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ + OPT_BOOL(), \ + BCH_SB_INODE_32BIT, true, \ + NULL, "Constrain inode numbers to 32 bits") \ + x(shard_inode_numbers, u8, \ + OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ + OPT_BOOL(), \ + BCH_SB_SHARD_INUMS, true, \ + NULL, "Shard new inode numbers by CPU id") \ + x(inodes_use_key_cache, u8, \ + OPT_FS|OPT_FORMAT|OPT_MOUNT, \ + OPT_BOOL(), \ + BCH_SB_INODES_USE_KEY_CACHE, true, \ + NULL, "Use the btree key cache for the inodes btree") \ + x(btree_node_mem_ptr_optimization, u8, \ + OPT_FS|OPT_MOUNT|OPT_RUNTIME, \ + OPT_BOOL(), \ + BCH2_NO_SB_OPT, true, \ + NULL, "Stash pointer to in memory btree node in btree ptr")\ + x(btree_write_buffer_size, u32, \ + OPT_FS|OPT_MOUNT, \ + OPT_UINT(16, (1U << 20) - 1), \ + BCH2_NO_SB_OPT, 1U << 13, \ + NULL, "Number of btree write buffer entries") \ + x(gc_reserve_percent, u8, \ + OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ + OPT_UINT(5, 21), \ + BCH_SB_GC_RESERVE, 8, \ + "%", "Percentage of disk space to reserve for copygc")\ + x(gc_reserve_bytes, u64, \ + OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME| \ + OPT_HUMAN_READABLE|OPT_SB_FIELD_SECTORS, \ + OPT_UINT(0, U64_MAX), \ + BCH_SB_GC_RESERVE_BYTES, 0, \ + "%", "Amount of disk space to reserve for copygc\n" \ + "Takes precedence over gc_reserve_percent if set")\ + x(root_reserve_percent, u8, \ + OPT_FS|OPT_FORMAT|OPT_MOUNT, \ + OPT_UINT(0, 100), \ + BCH_SB_ROOT_RESERVE, 0, \ + "%", "Percentage of disk space to reserve for superuser")\ + x(wide_macs, u8, \ + OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ + OPT_BOOL(), \ + BCH_SB_128_BIT_MACS, false, \ + NULL, "Store full 128 bits of cryptographic MACs, instead of 80")\ + x(inline_data, u8, \ + OPT_FS|OPT_MOUNT|OPT_RUNTIME, \ + OPT_BOOL(), \ + BCH2_NO_SB_OPT, true, \ + NULL, "Enable inline data extents") \ + x(acl, u8, \ + OPT_FS|OPT_FORMAT|OPT_MOUNT, \ + OPT_BOOL(), \ + BCH_SB_POSIX_ACL, true, \ + NULL, "Enable POSIX acls") \ + x(usrquota, u8, \ + OPT_FS|OPT_FORMAT|OPT_MOUNT, \ + OPT_BOOL(), \ + BCH_SB_USRQUOTA, false, \ + NULL, "Enable user quotas") \ + x(grpquota, u8, \ + OPT_FS|OPT_FORMAT|OPT_MOUNT, \ + OPT_BOOL(), \ + BCH_SB_GRPQUOTA, false, \ + NULL, "Enable group quotas") \ + x(prjquota, u8, \ + OPT_FS|OPT_FORMAT|OPT_MOUNT, \ + OPT_BOOL(), \ + BCH_SB_PRJQUOTA, false, \ + NULL, "Enable project quotas") \ + x(degraded, u8, \ + OPT_FS|OPT_MOUNT, \ + OPT_BOOL(), \ + BCH2_NO_SB_OPT, false, \ + NULL, "Allow mounting in degraded mode") \ + x(very_degraded, u8, \ + OPT_FS|OPT_MOUNT, \ + OPT_BOOL(), \ + BCH2_NO_SB_OPT, false, \ + NULL, "Allow mounting in when data will be missing") \ + x(discard, u8, \ + OPT_FS|OPT_MOUNT|OPT_DEVICE, \ + OPT_BOOL(), \ + BCH2_NO_SB_OPT, true, \ + NULL, "Enable discard/TRIM support") \ + x(verbose, u8, \ + OPT_FS|OPT_MOUNT|OPT_RUNTIME, \ + OPT_BOOL(), \ + BCH2_NO_SB_OPT, BCACHEFS_VERBOSE_DEFAULT, \ + NULL, "Extra debugging information during mount/recovery")\ + x(journal_flush_delay, u32, \ + OPT_FS|OPT_MOUNT|OPT_RUNTIME, \ + OPT_UINT(1, U32_MAX), \ + BCH_SB_JOURNAL_FLUSH_DELAY, 1000, \ + NULL, "Delay in milliseconds before automatic journal commits")\ + x(journal_flush_disabled, u8, \ + OPT_FS|OPT_MOUNT|OPT_RUNTIME, \ + OPT_BOOL(), \ + BCH_SB_JOURNAL_FLUSH_DISABLED,false, \ + NULL, "Disable journal flush on sync/fsync\n" \ + "If enabled, writes can be lost, but only since the\n"\ + "last journal write (default 1 second)") \ + x(journal_reclaim_delay, u32, \ + OPT_FS|OPT_MOUNT|OPT_RUNTIME, \ + OPT_UINT(0, U32_MAX), \ + BCH_SB_JOURNAL_RECLAIM_DELAY, 100, \ + NULL, "Delay in milliseconds before automatic journal reclaim")\ + x(move_bytes_in_flight, u32, \ + OPT_HUMAN_READABLE|OPT_FS|OPT_MOUNT|OPT_RUNTIME, \ + OPT_UINT(1024, U32_MAX), \ + BCH2_NO_SB_OPT, 1U << 20, \ + NULL, "Maximum Amount of IO to keep in flight by the move path")\ + x(move_ios_in_flight, u32, \ + OPT_FS|OPT_MOUNT|OPT_RUNTIME, \ + OPT_UINT(1, 1024), \ + BCH2_NO_SB_OPT, 32, \ + NULL, "Maximum number of IOs to keep in flight by the move path")\ + x(fsck, u8, \ + OPT_FS|OPT_MOUNT, \ + OPT_BOOL(), \ + BCH2_NO_SB_OPT, false, \ + NULL, "Run fsck on mount") \ + x(fix_errors, u8, \ + OPT_FS|OPT_MOUNT, \ + OPT_FN(bch2_opt_fix_errors), \ + BCH2_NO_SB_OPT, FSCK_FIX_exit, \ + NULL, "Fix errors during fsck without asking") \ + x(ratelimit_errors, u8, \ + OPT_FS|OPT_MOUNT, \ + OPT_BOOL(), \ + BCH2_NO_SB_OPT, RATELIMIT_ERRORS_DEFAULT, \ + NULL, "Ratelimit error messages during fsck") \ + x(nochanges, u8, \ + OPT_FS|OPT_MOUNT, \ + OPT_BOOL(), \ + BCH2_NO_SB_OPT, false, \ + NULL, "Super read only mode - no writes at all will be issued,\n"\ + "even if we have to replay the journal") \ + x(norecovery, u8, \ + OPT_FS|OPT_MOUNT, \ + OPT_BOOL(), \ + BCH2_NO_SB_OPT, false, \ + NULL, "Don't replay the journal") \ + x(keep_journal, u8, \ + 0, \ + OPT_BOOL(), \ + BCH2_NO_SB_OPT, false, \ + NULL, "Don't free journal entries/keys after startup")\ + x(read_entire_journal, u8, \ + 0, \ + OPT_BOOL(), \ + BCH2_NO_SB_OPT, false, \ + NULL, "Read all journal entries, not just dirty ones")\ + x(read_journal_only, u8, \ + 0, \ + OPT_BOOL(), \ + BCH2_NO_SB_OPT, false, \ + NULL, "Only read the journal, skip the rest of recovery")\ + x(journal_transaction_names, u8, \ + OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ + OPT_BOOL(), \ + BCH_SB_JOURNAL_TRANSACTION_NAMES, true, \ + NULL, "Log transaction function names in journal") \ + x(noexcl, u8, \ + OPT_FS|OPT_MOUNT, \ + OPT_BOOL(), \ + BCH2_NO_SB_OPT, false, \ + NULL, "Don't open device in exclusive mode") \ + x(direct_io, u8, \ + OPT_FS|OPT_MOUNT, \ + OPT_BOOL(), \ + BCH2_NO_SB_OPT, true, \ + NULL, "Use O_DIRECT (userspace only)") \ + x(sb, u64, \ + OPT_MOUNT, \ + OPT_UINT(0, S64_MAX), \ + BCH2_NO_SB_OPT, BCH_SB_SECTOR, \ + "offset", "Sector offset of superblock") \ + x(read_only, u8, \ + OPT_FS, \ + OPT_BOOL(), \ + BCH2_NO_SB_OPT, false, \ + NULL, NULL) \ + x(nostart, u8, \ + 0, \ + OPT_BOOL(), \ + BCH2_NO_SB_OPT, false, \ + NULL, "Don\'t start filesystem, only open devices") \ + x(reconstruct_alloc, u8, \ + OPT_FS|OPT_MOUNT, \ + OPT_BOOL(), \ + BCH2_NO_SB_OPT, false, \ + NULL, "Reconstruct alloc btree") \ + x(version_upgrade, u8, \ + OPT_FS|OPT_MOUNT, \ + OPT_STR(bch2_version_upgrade_opts), \ + BCH_SB_VERSION_UPGRADE, BCH_VERSION_UPGRADE_compatible, \ + NULL, "Set superblock to latest version,\n" \ + "allowing any new features to be used") \ + x(buckets_nouse, u8, \ + 0, \ + OPT_BOOL(), \ + BCH2_NO_SB_OPT, false, \ + NULL, "Allocate the buckets_nouse bitmap") \ + x(project, u8, \ + OPT_INODE, \ + OPT_BOOL(), \ + BCH2_NO_SB_OPT, false, \ + NULL, NULL) \ + x(nocow, u8, \ + OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME|OPT_INODE, \ + OPT_BOOL(), \ + BCH_SB_NOCOW, false, \ + NULL, "Nocow mode: Writes will be done in place when possible.\n"\ + "Snapshots and reflink will still caused writes to be COW\n"\ + "Implicitly disables data checksumming, compression and encryption")\ + x(nocow_enabled, u8, \ + OPT_FS|OPT_MOUNT, \ + OPT_BOOL(), \ + BCH2_NO_SB_OPT, true, \ + NULL, "Enable nocow mode: enables runtime locking in\n"\ + "data move path needed if nocow will ever be in use\n")\ + x(no_data_io, u8, \ + OPT_MOUNT, \ + OPT_BOOL(), \ + BCH2_NO_SB_OPT, false, \ + NULL, "Skip submit_bio() for data reads and writes, " \ + "for performance testing purposes") \ + x(fs_size, u64, \ + OPT_DEVICE, \ + OPT_UINT(0, S64_MAX), \ + BCH2_NO_SB_OPT, 0, \ + "size", "Size of filesystem on device") \ + x(bucket, u32, \ + OPT_DEVICE, \ + OPT_UINT(0, S64_MAX), \ + BCH2_NO_SB_OPT, 0, \ + "size", "Size of filesystem on device") \ + x(durability, u8, \ + OPT_DEVICE, \ + OPT_UINT(0, BCH_REPLICAS_MAX), \ + BCH2_NO_SB_OPT, 1, \ + "n", "Data written to this device will be considered\n"\ + "to have already been replicated n times") + +struct bch_opts { +#define x(_name, _bits, ...) unsigned _name##_defined:1; + BCH_OPTS() +#undef x + +#define x(_name, _bits, ...) _bits _name; + BCH_OPTS() +#undef x +}; + +static const __maybe_unused struct bch_opts bch2_opts_default = { +#define x(_name, _bits, _mode, _type, _sb_opt, _default, ...) \ + ._name##_defined = true, \ + ._name = _default, \ + + BCH_OPTS() +#undef x +}; + +#define opt_defined(_opts, _name) ((_opts)._name##_defined) + +#define opt_get(_opts, _name) \ + (opt_defined(_opts, _name) ? (_opts)._name : bch2_opts_default._name) + +#define opt_set(_opts, _name, _v) \ +do { \ + (_opts)._name##_defined = true; \ + (_opts)._name = _v; \ +} while (0) + +static inline struct bch_opts bch2_opts_empty(void) +{ + return (struct bch_opts) { 0 }; +} + +void bch2_opts_apply(struct bch_opts *, struct bch_opts); + +enum bch_opt_id { +#define x(_name, ...) Opt_##_name, + BCH_OPTS() +#undef x + bch2_opts_nr +}; + +struct bch_fs; +struct printbuf; + +struct bch_option { + struct attribute attr; + u64 (*get_sb)(const struct bch_sb *); + void (*set_sb)(struct bch_sb *, u64); + enum opt_type type; + enum opt_flags flags; + u64 min, max; + + const char * const *choices; + + struct bch_opt_fn fn; + + const char *hint; + const char *help; + +}; + +extern const struct bch_option bch2_opt_table[]; + +bool bch2_opt_defined_by_id(const struct bch_opts *, enum bch_opt_id); +u64 bch2_opt_get_by_id(const struct bch_opts *, enum bch_opt_id); +void bch2_opt_set_by_id(struct bch_opts *, enum bch_opt_id, u64); + +u64 bch2_opt_from_sb(struct bch_sb *, enum bch_opt_id); +int bch2_opts_from_sb(struct bch_opts *, struct bch_sb *); +void __bch2_opt_set_sb(struct bch_sb *, const struct bch_option *, u64); +void bch2_opt_set_sb(struct bch_fs *, const struct bch_option *, u64); + +int bch2_opt_lookup(const char *); +int bch2_opt_validate(const struct bch_option *, u64, struct printbuf *); +int bch2_opt_parse(struct bch_fs *, const struct bch_option *, + const char *, u64 *, struct printbuf *); + +#define OPT_SHOW_FULL_LIST (1 << 0) +#define OPT_SHOW_MOUNT_STYLE (1 << 1) + +void bch2_opt_to_text(struct printbuf *, struct bch_fs *, struct bch_sb *, + const struct bch_option *, u64, unsigned); + +int bch2_opt_check_may_set(struct bch_fs *, int, u64); +int bch2_opts_check_may_set(struct bch_fs *); +int bch2_parse_mount_opts(struct bch_fs *, struct bch_opts *, char *); + +/* inode opts: */ + +struct bch_io_opts { +#define x(_name, _bits) u##_bits _name; + BCH_INODE_OPTS() +#undef x +}; + +struct bch_io_opts bch2_opts_to_inode_opts(struct bch_opts); +bool bch2_opt_is_inode_opt(enum bch_opt_id); + +#endif /* _BCACHEFS_OPTS_H */ diff --git a/fs/bcachefs/printbuf.c b/fs/bcachefs/printbuf.c new file mode 100644 index 000000000000..5e653eb81d54 --- /dev/null +++ b/fs/bcachefs/printbuf.c @@ -0,0 +1,425 @@ +// SPDX-License-Identifier: LGPL-2.1+ +/* Copyright (C) 2022 Kent Overstreet */ + +#include <linux/err.h> +#include <linux/export.h> +#include <linux/kernel.h> +#include <linux/slab.h> +#include <linux/string_helpers.h> + +#include "printbuf.h" + +static inline unsigned printbuf_linelen(struct printbuf *buf) +{ + return buf->pos - buf->last_newline; +} + +int bch2_printbuf_make_room(struct printbuf *out, unsigned extra) +{ + unsigned new_size; + char *buf; + + if (!out->heap_allocated) + return 0; + + /* Reserved space for terminating nul: */ + extra += 1; + + if (out->pos + extra < out->size) + return 0; + + new_size = roundup_pow_of_two(out->size + extra); + + /* + * Note: output buffer must be freeable with kfree(), it's not required + * that the user use printbuf_exit(). + */ + buf = krealloc(out->buf, new_size, !out->atomic ? GFP_KERNEL : GFP_NOWAIT); + + if (!buf) { + out->allocation_failure = true; + return -ENOMEM; + } + + out->buf = buf; + out->size = new_size; + return 0; +} + +void bch2_prt_vprintf(struct printbuf *out, const char *fmt, va_list args) +{ + int len; + + do { + va_list args2; + + va_copy(args2, args); + len = vsnprintf(out->buf + out->pos, printbuf_remaining(out), fmt, args2); + } while (len + 1 >= printbuf_remaining(out) && + !bch2_printbuf_make_room(out, len + 1)); + + len = min_t(size_t, len, + printbuf_remaining(out) ? printbuf_remaining(out) - 1 : 0); + out->pos += len; +} + +void bch2_prt_printf(struct printbuf *out, const char *fmt, ...) +{ + va_list args; + int len; + + do { + va_start(args, fmt); + len = vsnprintf(out->buf + out->pos, printbuf_remaining(out), fmt, args); + va_end(args); + } while (len + 1 >= printbuf_remaining(out) && + !bch2_printbuf_make_room(out, len + 1)); + + len = min_t(size_t, len, + printbuf_remaining(out) ? printbuf_remaining(out) - 1 : 0); + out->pos += len; +} + +/** + * bch2_printbuf_str() - returns printbuf's buf as a C string, guaranteed to be + * null terminated + * @buf: printbuf to terminate + * Returns: Printbuf contents, as a nul terminated C string + */ +const char *bch2_printbuf_str(const struct printbuf *buf) +{ + /* + * If we've written to a printbuf then it's guaranteed to be a null + * terminated string - but if we haven't, then we might not have + * allocated a buffer at all: + */ + return buf->pos + ? buf->buf + : ""; +} + +/** + * bch2_printbuf_exit() - exit a printbuf, freeing memory it owns and poisoning it + * against accidental use. + * @buf: printbuf to exit + */ +void bch2_printbuf_exit(struct printbuf *buf) +{ + if (buf->heap_allocated) { + kfree(buf->buf); + buf->buf = ERR_PTR(-EINTR); /* poison value */ + } +} + +void bch2_printbuf_tabstops_reset(struct printbuf *buf) +{ + buf->nr_tabstops = 0; +} + +void bch2_printbuf_tabstop_pop(struct printbuf *buf) +{ + if (buf->nr_tabstops) + --buf->nr_tabstops; +} + +/* + * bch2_printbuf_tabstop_set() - add a tabstop, n spaces from the previous tabstop + * + * @buf: printbuf to control + * @spaces: number of spaces from previous tabpstop + * + * In the future this function may allocate memory if setting more than + * PRINTBUF_INLINE_TABSTOPS or setting tabstops more than 255 spaces from start + * of line. + */ +int bch2_printbuf_tabstop_push(struct printbuf *buf, unsigned spaces) +{ + unsigned prev_tabstop = buf->nr_tabstops + ? buf->_tabstops[buf->nr_tabstops - 1] + : 0; + + if (WARN_ON(buf->nr_tabstops >= ARRAY_SIZE(buf->_tabstops))) + return -EINVAL; + + buf->_tabstops[buf->nr_tabstops++] = prev_tabstop + spaces; + buf->has_indent_or_tabstops = true; + return 0; +} + +/** + * bch2_printbuf_indent_add() - add to the current indent level + * + * @buf: printbuf to control + * @spaces: number of spaces to add to the current indent level + * + * Subsequent lines, and the current line if the output position is at the start + * of the current line, will be indented by @spaces more spaces. + */ +void bch2_printbuf_indent_add(struct printbuf *buf, unsigned spaces) +{ + if (WARN_ON_ONCE(buf->indent + spaces < buf->indent)) + spaces = 0; + + buf->indent += spaces; + prt_chars(buf, ' ', spaces); + + buf->has_indent_or_tabstops = true; +} + +/** + * bch2_printbuf_indent_sub() - subtract from the current indent level + * + * @buf: printbuf to control + * @spaces: number of spaces to subtract from the current indent level + * + * Subsequent lines, and the current line if the output position is at the start + * of the current line, will be indented by @spaces less spaces. + */ +void bch2_printbuf_indent_sub(struct printbuf *buf, unsigned spaces) +{ + if (WARN_ON_ONCE(spaces > buf->indent)) + spaces = buf->indent; + + if (buf->last_newline + buf->indent == buf->pos) { + buf->pos -= spaces; + printbuf_nul_terminate(buf); + } + buf->indent -= spaces; + + if (!buf->indent && !buf->nr_tabstops) + buf->has_indent_or_tabstops = false; +} + +void bch2_prt_newline(struct printbuf *buf) +{ + unsigned i; + + bch2_printbuf_make_room(buf, 1 + buf->indent); + + __prt_char(buf, '\n'); + + buf->last_newline = buf->pos; + + for (i = 0; i < buf->indent; i++) + __prt_char(buf, ' '); + + printbuf_nul_terminate(buf); + + buf->last_field = buf->pos; + buf->cur_tabstop = 0; +} + +/* + * Returns spaces from start of line, if set, or 0 if unset: + */ +static inline unsigned cur_tabstop(struct printbuf *buf) +{ + return buf->cur_tabstop < buf->nr_tabstops + ? buf->_tabstops[buf->cur_tabstop] + : 0; +} + +static void __prt_tab(struct printbuf *out) +{ + int spaces = max_t(int, 0, cur_tabstop(out) - printbuf_linelen(out)); + + prt_chars(out, ' ', spaces); + + out->last_field = out->pos; + out->cur_tabstop++; +} + +/** + * bch2_prt_tab() - Advance printbuf to the next tabstop + * @out: printbuf to control + * + * Advance output to the next tabstop by printing spaces. + */ +void bch2_prt_tab(struct printbuf *out) +{ + if (WARN_ON(!cur_tabstop(out))) + return; + + __prt_tab(out); +} + +static void __prt_tab_rjust(struct printbuf *buf) +{ + unsigned move = buf->pos - buf->last_field; + int pad = (int) cur_tabstop(buf) - (int) printbuf_linelen(buf); + + if (pad > 0) { + bch2_printbuf_make_room(buf, pad); + + if (buf->last_field + pad < buf->size) + memmove(buf->buf + buf->last_field + pad, + buf->buf + buf->last_field, + min(move, buf->size - 1 - buf->last_field - pad)); + + if (buf->last_field < buf->size) + memset(buf->buf + buf->last_field, ' ', + min((unsigned) pad, buf->size - buf->last_field)); + + buf->pos += pad; + printbuf_nul_terminate(buf); + } + + buf->last_field = buf->pos; + buf->cur_tabstop++; +} + +/** + * bch2_prt_tab_rjust - Advance printbuf to the next tabstop, right justifying + * previous output + * + * @buf: printbuf to control + * + * Advance output to the next tabstop by inserting spaces immediately after the + * previous tabstop, right justifying previously outputted text. + */ +void bch2_prt_tab_rjust(struct printbuf *buf) +{ + if (WARN_ON(!cur_tabstop(buf))) + return; + + __prt_tab_rjust(buf); +} + +/** + * bch2_prt_bytes_indented() - Print an array of chars, handling embedded control characters + * + * @out: output printbuf + * @str: string to print + * @count: number of bytes to print + * + * The following contol characters are handled as so: + * \n: prt_newline newline that obeys current indent level + * \t: prt_tab advance to next tabstop + * \r: prt_tab_rjust advance to next tabstop, with right justification + */ +void bch2_prt_bytes_indented(struct printbuf *out, const char *str, unsigned count) +{ + const char *unprinted_start = str; + const char *end = str + count; + + if (!out->has_indent_or_tabstops || out->suppress_indent_tabstop_handling) { + prt_bytes(out, str, count); + return; + } + + while (str != end) { + switch (*str) { + case '\n': + prt_bytes(out, unprinted_start, str - unprinted_start); + unprinted_start = str + 1; + bch2_prt_newline(out); + break; + case '\t': + if (likely(cur_tabstop(out))) { + prt_bytes(out, unprinted_start, str - unprinted_start); + unprinted_start = str + 1; + __prt_tab(out); + } + break; + case '\r': + if (likely(cur_tabstop(out))) { + prt_bytes(out, unprinted_start, str - unprinted_start); + unprinted_start = str + 1; + __prt_tab_rjust(out); + } + break; + } + + str++; + } + + prt_bytes(out, unprinted_start, str - unprinted_start); +} + +/** + * bch2_prt_human_readable_u64() - Print out a u64 in human readable units + * @out: output printbuf + * @v: integer to print + * + * Units of 2^10 (default) or 10^3 are controlled via @out->si_units + */ +void bch2_prt_human_readable_u64(struct printbuf *out, u64 v) +{ + bch2_printbuf_make_room(out, 10); + out->pos += string_get_size(v, 1, !out->si_units, + out->buf + out->pos, + printbuf_remaining_size(out)); +} + +/** + * bch2_prt_human_readable_s64() - Print out a s64 in human readable units + * @out: output printbuf + * @v: integer to print + * + * Units of 2^10 (default) or 10^3 are controlled via @out->si_units + */ +void bch2_prt_human_readable_s64(struct printbuf *out, s64 v) +{ + if (v < 0) + prt_char(out, '-'); + bch2_prt_human_readable_u64(out, abs(v)); +} + +/** + * bch2_prt_units_u64() - Print out a u64 according to printbuf unit options + * @out: output printbuf + * @v: integer to print + * + * Units are either raw (default), or human reabable units (controlled via + * @buf->human_readable_units) + */ +void bch2_prt_units_u64(struct printbuf *out, u64 v) +{ + if (out->human_readable_units) + bch2_prt_human_readable_u64(out, v); + else + bch2_prt_printf(out, "%llu", v); +} + +/** + * bch2_prt_units_s64() - Print out a s64 according to printbuf unit options + * @out: output printbuf + * @v: integer to print + * + * Units are either raw (default), or human reabable units (controlled via + * @buf->human_readable_units) + */ +void bch2_prt_units_s64(struct printbuf *out, s64 v) +{ + if (v < 0) + prt_char(out, '-'); + bch2_prt_units_u64(out, abs(v)); +} + +void bch2_prt_string_option(struct printbuf *out, + const char * const list[], + size_t selected) +{ + size_t i; + + for (i = 0; list[i]; i++) + bch2_prt_printf(out, i == selected ? "[%s] " : "%s ", list[i]); +} + +void bch2_prt_bitflags(struct printbuf *out, + const char * const list[], u64 flags) +{ + unsigned bit, nr = 0; + bool first = true; + + while (list[nr]) + nr++; + + while (flags && (bit = __ffs64(flags)) < nr) { + if (!first) + bch2_prt_printf(out, ","); + first = false; + bch2_prt_printf(out, "%s", list[bit]); + flags ^= BIT_ULL(bit); + } +} diff --git a/fs/bcachefs/printbuf.h b/fs/bcachefs/printbuf.h new file mode 100644 index 000000000000..2191423d9f22 --- /dev/null +++ b/fs/bcachefs/printbuf.h @@ -0,0 +1,284 @@ +/* SPDX-License-Identifier: LGPL-2.1+ */ +/* Copyright (C) 2022 Kent Overstreet */ + +#ifndef _BCACHEFS_PRINTBUF_H +#define _BCACHEFS_PRINTBUF_H + +/* + * Printbufs: Simple strings for printing to, with optional heap allocation + * + * This code has provisions for use in userspace, to aid in making other code + * portable between kernelspace and userspace. + * + * Basic example: + * struct printbuf buf = PRINTBUF; + * + * prt_printf(&buf, "foo="); + * foo_to_text(&buf, foo); + * printk("%s", buf.buf); + * printbuf_exit(&buf); + * + * Or + * struct printbuf buf = PRINTBUF_EXTERN(char_buf, char_buf_size) + * + * We can now write pretty printers instead of writing code that dumps + * everything to the kernel log buffer, and then those pretty-printers can be + * used by other code that outputs to kernel log, sysfs, debugfs, etc. + * + * Memory allocation: Outputing to a printbuf may allocate memory. This + * allocation is done with GFP_KERNEL, by default: use the newer + * memalloc_*_(save|restore) functions as needed. + * + * Since no equivalent yet exists for GFP_ATOMIC/GFP_NOWAIT, memory allocations + * will be done with GFP_NOWAIT if printbuf->atomic is nonzero. + * + * It's allowed to grab the output buffer and free it later with kfree() instead + * of using printbuf_exit(), if the user just needs a heap allocated string at + * the end. + * + * Memory allocation failures: We don't return errors directly, because on + * memory allocation failure we usually don't want to bail out and unwind - we + * want to print what we've got, on a best-effort basis. But code that does want + * to return -ENOMEM may check printbuf.allocation_failure. + * + * Indenting, tabstops: + * + * To aid is writing multi-line pretty printers spread across multiple + * functions, printbufs track the current indent level. + * + * printbuf_indent_push() and printbuf_indent_pop() increase and decrease the current indent + * level, respectively. + * + * To use tabstops, set printbuf->tabstops[]; they are in units of spaces, from + * start of line. Once set, prt_tab() will output spaces up to the next tabstop. + * prt_tab_rjust() will also advance the current line of text up to the next + * tabstop, but it does so by shifting text since the previous tabstop up to the + * next tabstop - right justifying it. + * + * Make sure you use prt_newline() instead of \n in the format string for indent + * level and tabstops to work corretly. + * + * Output units: printbuf->units exists to tell pretty-printers how to output + * numbers: a raw value (e.g. directly from a superblock field), as bytes, or as + * human readable bytes. prt_units() obeys it. + */ + +#include <linux/kernel.h> +#include <linux/string.h> + +enum printbuf_si { + PRINTBUF_UNITS_2, /* use binary powers of 2^10 */ + PRINTBUF_UNITS_10, /* use powers of 10^3 (standard SI) */ +}; + +#define PRINTBUF_INLINE_TABSTOPS 6 + +struct printbuf { + char *buf; + unsigned size; + unsigned pos; + unsigned last_newline; + unsigned last_field; + unsigned indent; + /* + * If nonzero, allocations will be done with GFP_ATOMIC: + */ + u8 atomic; + bool allocation_failure:1; + bool heap_allocated:1; + enum printbuf_si si_units:1; + bool human_readable_units:1; + bool has_indent_or_tabstops:1; + bool suppress_indent_tabstop_handling:1; + u8 nr_tabstops; + + /* + * Do not modify directly: use printbuf_tabstop_add(), + * printbuf_tabstop_get() + */ + u8 cur_tabstop; + u8 _tabstops[PRINTBUF_INLINE_TABSTOPS]; +}; + +int bch2_printbuf_make_room(struct printbuf *, unsigned); +__printf(2, 3) void bch2_prt_printf(struct printbuf *out, const char *fmt, ...); +__printf(2, 0) void bch2_prt_vprintf(struct printbuf *out, const char *fmt, va_list); +const char *bch2_printbuf_str(const struct printbuf *); +void bch2_printbuf_exit(struct printbuf *); + +void bch2_printbuf_tabstops_reset(struct printbuf *); +void bch2_printbuf_tabstop_pop(struct printbuf *); +int bch2_printbuf_tabstop_push(struct printbuf *, unsigned); + +void bch2_printbuf_indent_add(struct printbuf *, unsigned); +void bch2_printbuf_indent_sub(struct printbuf *, unsigned); + +void bch2_prt_newline(struct printbuf *); +void bch2_prt_tab(struct printbuf *); +void bch2_prt_tab_rjust(struct printbuf *); + +void bch2_prt_bytes_indented(struct printbuf *, const char *, unsigned); +void bch2_prt_human_readable_u64(struct printbuf *, u64); +void bch2_prt_human_readable_s64(struct printbuf *, s64); +void bch2_prt_units_u64(struct printbuf *, u64); +void bch2_prt_units_s64(struct printbuf *, s64); +void bch2_prt_string_option(struct printbuf *, const char * const[], size_t); +void bch2_prt_bitflags(struct printbuf *, const char * const[], u64); + +/* Initializer for a heap allocated printbuf: */ +#define PRINTBUF ((struct printbuf) { .heap_allocated = true }) + +/* Initializer a printbuf that points to an external buffer: */ +#define PRINTBUF_EXTERN(_buf, _size) \ +((struct printbuf) { \ + .buf = _buf, \ + .size = _size, \ +}) + +/* + * Returns size remaining of output buffer: + */ +static inline unsigned printbuf_remaining_size(struct printbuf *out) +{ + return out->pos < out->size ? out->size - out->pos : 0; +} + +/* + * Returns number of characters we can print to the output buffer - i.e. + * excluding the terminating nul: + */ +static inline unsigned printbuf_remaining(struct printbuf *out) +{ + return out->pos < out->size ? out->size - out->pos - 1 : 0; +} + +static inline unsigned printbuf_written(struct printbuf *out) +{ + return out->size ? min(out->pos, out->size - 1) : 0; +} + +/* + * Returns true if output was truncated: + */ +static inline bool printbuf_overflowed(struct printbuf *out) +{ + return out->pos >= out->size; +} + +static inline void printbuf_nul_terminate(struct printbuf *out) +{ + bch2_printbuf_make_room(out, 1); + + if (out->pos < out->size) + out->buf[out->pos] = 0; + else if (out->size) + out->buf[out->size - 1] = 0; +} + +/* Doesn't call bch2_printbuf_make_room(), doesn't nul terminate: */ +static inline void __prt_char_reserved(struct printbuf *out, char c) +{ + if (printbuf_remaining(out)) + out->buf[out->pos] = c; + out->pos++; +} + +/* Doesn't nul terminate: */ +static inline void __prt_char(struct printbuf *out, char c) +{ + bch2_printbuf_make_room(out, 1); + __prt_char_reserved(out, c); +} + +static inline void prt_char(struct printbuf *out, char c) +{ + __prt_char(out, c); + printbuf_nul_terminate(out); +} + +static inline void __prt_chars_reserved(struct printbuf *out, char c, unsigned n) +{ + unsigned i, can_print = min(n, printbuf_remaining(out)); + + for (i = 0; i < can_print; i++) + out->buf[out->pos++] = c; + out->pos += n - can_print; +} + +static inline void prt_chars(struct printbuf *out, char c, unsigned n) +{ + bch2_printbuf_make_room(out, n); + __prt_chars_reserved(out, c, n); + printbuf_nul_terminate(out); +} + +static inline void prt_bytes(struct printbuf *out, const void *b, unsigned n) +{ + unsigned i, can_print; + + bch2_printbuf_make_room(out, n); + + can_print = min(n, printbuf_remaining(out)); + + for (i = 0; i < can_print; i++) + out->buf[out->pos++] = ((char *) b)[i]; + out->pos += n - can_print; + + printbuf_nul_terminate(out); +} + +static inline void prt_str(struct printbuf *out, const char *str) +{ + prt_bytes(out, str, strlen(str)); +} + +static inline void prt_str_indented(struct printbuf *out, const char *str) +{ + bch2_prt_bytes_indented(out, str, strlen(str)); +} + +static inline void prt_hex_byte(struct printbuf *out, u8 byte) +{ + bch2_printbuf_make_room(out, 2); + __prt_char_reserved(out, hex_asc_hi(byte)); + __prt_char_reserved(out, hex_asc_lo(byte)); + printbuf_nul_terminate(out); +} + +static inline void prt_hex_byte_upper(struct printbuf *out, u8 byte) +{ + bch2_printbuf_make_room(out, 2); + __prt_char_reserved(out, hex_asc_upper_hi(byte)); + __prt_char_reserved(out, hex_asc_upper_lo(byte)); + printbuf_nul_terminate(out); +} + +/** + * printbuf_reset - re-use a printbuf without freeing and re-initializing it: + */ +static inline void printbuf_reset(struct printbuf *buf) +{ + buf->pos = 0; + buf->allocation_failure = 0; + buf->indent = 0; + buf->nr_tabstops = 0; + buf->cur_tabstop = 0; +} + +/** + * printbuf_atomic_inc - mark as entering an atomic section + */ +static inline void printbuf_atomic_inc(struct printbuf *buf) +{ + buf->atomic++; +} + +/** + * printbuf_atomic_inc - mark as leaving an atomic section + */ +static inline void printbuf_atomic_dec(struct printbuf *buf) +{ + buf->atomic--; +} + +#endif /* _BCACHEFS_PRINTBUF_H */ diff --git a/fs/bcachefs/quota.c b/fs/bcachefs/quota.c new file mode 100644 index 000000000000..a54647c36b85 --- /dev/null +++ b/fs/bcachefs/quota.c @@ -0,0 +1,979 @@ +// SPDX-License-Identifier: GPL-2.0 +#include "bcachefs.h" +#include "btree_update.h" +#include "errcode.h" +#include "error.h" +#include "inode.h" +#include "quota.h" +#include "snapshot.h" +#include "super-io.h" + +static const char * const bch2_quota_types[] = { + "user", + "group", + "project", +}; + +static const char * const bch2_quota_counters[] = { + "space", + "inodes", +}; + +static int bch2_sb_quota_validate(struct bch_sb *sb, struct bch_sb_field *f, + struct printbuf *err) +{ + struct bch_sb_field_quota *q = field_to_type(f, quota); + + if (vstruct_bytes(&q->field) < sizeof(*q)) { + prt_printf(err, "wrong size (got %zu should be %zu)", + vstruct_bytes(&q->field), sizeof(*q)); + return -BCH_ERR_invalid_sb_quota; + } + + return 0; +} + +static void bch2_sb_quota_to_text(struct printbuf *out, struct bch_sb *sb, + struct bch_sb_field *f) +{ + struct bch_sb_field_quota *q = field_to_type(f, quota); + unsigned qtyp, counter; + + for (qtyp = 0; qtyp < ARRAY_SIZE(q->q); qtyp++) { + prt_printf(out, "%s: flags %llx", + bch2_quota_types[qtyp], + le64_to_cpu(q->q[qtyp].flags)); + + for (counter = 0; counter < Q_COUNTERS; counter++) + prt_printf(out, " %s timelimit %u warnlimit %u", + bch2_quota_counters[counter], + le32_to_cpu(q->q[qtyp].c[counter].timelimit), + le32_to_cpu(q->q[qtyp].c[counter].warnlimit)); + + prt_newline(out); + } +} + +const struct bch_sb_field_ops bch_sb_field_ops_quota = { + .validate = bch2_sb_quota_validate, + .to_text = bch2_sb_quota_to_text, +}; + +int bch2_quota_invalid(struct bch_fs *c, struct bkey_s_c k, + enum bkey_invalid_flags flags, + struct printbuf *err) +{ + int ret = 0; + + bkey_fsck_err_on(k.k->p.inode >= QTYP_NR, c, err, + quota_type_invalid, + "invalid quota type (%llu >= %u)", + k.k->p.inode, QTYP_NR); +fsck_err: + return ret; +} + +void bch2_quota_to_text(struct printbuf *out, struct bch_fs *c, + struct bkey_s_c k) +{ + struct bkey_s_c_quota dq = bkey_s_c_to_quota(k); + unsigned i; + + for (i = 0; i < Q_COUNTERS; i++) + prt_printf(out, "%s hardlimit %llu softlimit %llu", + bch2_quota_counters[i], + le64_to_cpu(dq.v->c[i].hardlimit), + le64_to_cpu(dq.v->c[i].softlimit)); +} + +#ifdef CONFIG_BCACHEFS_QUOTA + +#include <linux/cred.h> +#include <linux/fs.h> +#include <linux/quota.h> + +static void qc_info_to_text(struct printbuf *out, struct qc_info *i) +{ + printbuf_tabstops_reset(out); + printbuf_tabstop_push(out, 20); + + prt_str(out, "i_fieldmask"); + prt_tab(out); + prt_printf(out, "%x", i->i_fieldmask); + prt_newline(out); + + prt_str(out, "i_flags"); + prt_tab(out); + prt_printf(out, "%u", i->i_flags); + prt_newline(out); + + prt_str(out, "i_spc_timelimit"); + prt_tab(out); + prt_printf(out, "%u", i->i_spc_timelimit); + prt_newline(out); + + prt_str(out, "i_ino_timelimit"); + prt_tab(out); + prt_printf(out, "%u", i->i_ino_timelimit); + prt_newline(out); + + prt_str(out, "i_rt_spc_timelimit"); + prt_tab(out); + prt_printf(out, "%u", i->i_rt_spc_timelimit); + prt_newline(out); + + prt_str(out, "i_spc_warnlimit"); + prt_tab(out); + prt_printf(out, "%u", i->i_spc_warnlimit); + prt_newline(out); + + prt_str(out, "i_ino_warnlimit"); + prt_tab(out); + prt_printf(out, "%u", i->i_ino_warnlimit); + prt_newline(out); + + prt_str(out, "i_rt_spc_warnlimit"); + prt_tab(out); + prt_printf(out, "%u", i->i_rt_spc_warnlimit); + prt_newline(out); +} + +static void qc_dqblk_to_text(struct printbuf *out, struct qc_dqblk *q) +{ + printbuf_tabstops_reset(out); + printbuf_tabstop_push(out, 20); + + prt_str(out, "d_fieldmask"); + prt_tab(out); + prt_printf(out, "%x", q->d_fieldmask); + prt_newline(out); + + prt_str(out, "d_spc_hardlimit"); + prt_tab(out); + prt_printf(out, "%llu", q->d_spc_hardlimit); + prt_newline(out); + + prt_str(out, "d_spc_softlimit"); + prt_tab(out); + prt_printf(out, "%llu", q->d_spc_softlimit); + prt_newline(out); + + prt_str(out, "d_ino_hardlimit"); + prt_tab(out); + prt_printf(out, "%llu", q->d_ino_hardlimit); + prt_newline(out); + + prt_str(out, "d_ino_softlimit"); + prt_tab(out); + prt_printf(out, "%llu", q->d_ino_softlimit); + prt_newline(out); + + prt_str(out, "d_space"); + prt_tab(out); + prt_printf(out, "%llu", q->d_space); + prt_newline(out); + + prt_str(out, "d_ino_count"); + prt_tab(out); + prt_printf(out, "%llu", q->d_ino_count); + prt_newline(out); + + prt_str(out, "d_ino_timer"); + prt_tab(out); + prt_printf(out, "%llu", q->d_ino_timer); + prt_newline(out); + + prt_str(out, "d_spc_timer"); + prt_tab(out); + prt_printf(out, "%llu", q->d_spc_timer); + prt_newline(out); + + prt_str(out, "d_ino_warns"); + prt_tab(out); + prt_printf(out, "%i", q->d_ino_warns); + prt_newline(out); + + prt_str(out, "d_spc_warns"); + prt_tab(out); + prt_printf(out, "%i", q->d_spc_warns); + prt_newline(out); +} + +static inline unsigned __next_qtype(unsigned i, unsigned qtypes) +{ + qtypes >>= i; + return qtypes ? i + __ffs(qtypes) : QTYP_NR; +} + +#define for_each_set_qtype(_c, _i, _q, _qtypes) \ + for (_i = 0; \ + (_i = __next_qtype(_i, _qtypes), \ + _q = &(_c)->quotas[_i], \ + _i < QTYP_NR); \ + _i++) + +static bool ignore_hardlimit(struct bch_memquota_type *q) +{ + if (capable(CAP_SYS_RESOURCE)) + return true; +#if 0 + struct mem_dqinfo *info = &sb_dqopt(dquot->dq_sb)->info[dquot->dq_id.type]; + + return capable(CAP_SYS_RESOURCE) && + (info->dqi_format->qf_fmt_id != QFMT_VFS_OLD || + !(info->dqi_flags & DQF_ROOT_SQUASH)); +#endif + return false; +} + +enum quota_msg { + SOFTWARN, /* Softlimit reached */ + SOFTLONGWARN, /* Grace time expired */ + HARDWARN, /* Hardlimit reached */ + + HARDBELOW, /* Usage got below inode hardlimit */ + SOFTBELOW, /* Usage got below inode softlimit */ +}; + +static int quota_nl[][Q_COUNTERS] = { + [HARDWARN][Q_SPC] = QUOTA_NL_BHARDWARN, + [SOFTLONGWARN][Q_SPC] = QUOTA_NL_BSOFTLONGWARN, + [SOFTWARN][Q_SPC] = QUOTA_NL_BSOFTWARN, + [HARDBELOW][Q_SPC] = QUOTA_NL_BHARDBELOW, + [SOFTBELOW][Q_SPC] = QUOTA_NL_BSOFTBELOW, + + [HARDWARN][Q_INO] = QUOTA_NL_IHARDWARN, + [SOFTLONGWARN][Q_INO] = QUOTA_NL_ISOFTLONGWARN, + [SOFTWARN][Q_INO] = QUOTA_NL_ISOFTWARN, + [HARDBELOW][Q_INO] = QUOTA_NL_IHARDBELOW, + [SOFTBELOW][Q_INO] = QUOTA_NL_ISOFTBELOW, +}; + +struct quota_msgs { + u8 nr; + struct { + u8 qtype; + u8 msg; + } m[QTYP_NR * Q_COUNTERS]; +}; + +static void prepare_msg(unsigned qtype, + enum quota_counters counter, + struct quota_msgs *msgs, + enum quota_msg msg_type) +{ + BUG_ON(msgs->nr >= ARRAY_SIZE(msgs->m)); + + msgs->m[msgs->nr].qtype = qtype; + msgs->m[msgs->nr].msg = quota_nl[msg_type][counter]; + msgs->nr++; +} + +static void prepare_warning(struct memquota_counter *qc, + unsigned qtype, + enum quota_counters counter, + struct quota_msgs *msgs, + enum quota_msg msg_type) +{ + if (qc->warning_issued & (1 << msg_type)) + return; + + prepare_msg(qtype, counter, msgs, msg_type); +} + +static void flush_warnings(struct bch_qid qid, + struct super_block *sb, + struct quota_msgs *msgs) +{ + unsigned i; + + for (i = 0; i < msgs->nr; i++) + quota_send_warning(make_kqid(&init_user_ns, msgs->m[i].qtype, qid.q[i]), + sb->s_dev, msgs->m[i].msg); +} + +static int bch2_quota_check_limit(struct bch_fs *c, + unsigned qtype, + struct bch_memquota *mq, + struct quota_msgs *msgs, + enum quota_counters counter, + s64 v, + enum quota_acct_mode mode) +{ + struct bch_memquota_type *q = &c->quotas[qtype]; + struct memquota_counter *qc = &mq->c[counter]; + u64 n = qc->v + v; + + BUG_ON((s64) n < 0); + + if (mode == KEY_TYPE_QUOTA_NOCHECK) + return 0; + + if (v <= 0) { + if (n < qc->hardlimit && + (qc->warning_issued & (1 << HARDWARN))) { + qc->warning_issued &= ~(1 << HARDWARN); + prepare_msg(qtype, counter, msgs, HARDBELOW); + } + + if (n < qc->softlimit && + (qc->warning_issued & (1 << SOFTWARN))) { + qc->warning_issued &= ~(1 << SOFTWARN); + prepare_msg(qtype, counter, msgs, SOFTBELOW); + } + + qc->warning_issued = 0; + return 0; + } + + if (qc->hardlimit && + qc->hardlimit < n && + !ignore_hardlimit(q)) { + prepare_warning(qc, qtype, counter, msgs, HARDWARN); + return -EDQUOT; + } + + if (qc->softlimit && + qc->softlimit < n) { + if (qc->timer == 0) { + qc->timer = ktime_get_real_seconds() + q->limits[counter].timelimit; + prepare_warning(qc, qtype, counter, msgs, SOFTWARN); + } else if (ktime_get_real_seconds() >= qc->timer && + !ignore_hardlimit(q)) { + prepare_warning(qc, qtype, counter, msgs, SOFTLONGWARN); + return -EDQUOT; + } + } + + return 0; +} + +int bch2_quota_acct(struct bch_fs *c, struct bch_qid qid, + enum quota_counters counter, s64 v, + enum quota_acct_mode mode) +{ + unsigned qtypes = enabled_qtypes(c); + struct bch_memquota_type *q; + struct bch_memquota *mq[QTYP_NR]; + struct quota_msgs msgs; + unsigned i; + int ret = 0; + + memset(&msgs, 0, sizeof(msgs)); + + for_each_set_qtype(c, i, q, qtypes) { + mq[i] = genradix_ptr_alloc(&q->table, qid.q[i], GFP_KERNEL); + if (!mq[i]) + return -ENOMEM; + } + + for_each_set_qtype(c, i, q, qtypes) + mutex_lock_nested(&q->lock, i); + + for_each_set_qtype(c, i, q, qtypes) { + ret = bch2_quota_check_limit(c, i, mq[i], &msgs, counter, v, mode); + if (ret) + goto err; + } + + for_each_set_qtype(c, i, q, qtypes) + mq[i]->c[counter].v += v; +err: + for_each_set_qtype(c, i, q, qtypes) + mutex_unlock(&q->lock); + + flush_warnings(qid, c->vfs_sb, &msgs); + + return ret; +} + +static void __bch2_quota_transfer(struct bch_memquota *src_q, + struct bch_memquota *dst_q, + enum quota_counters counter, s64 v) +{ + BUG_ON(v > src_q->c[counter].v); + BUG_ON(v + dst_q->c[counter].v < v); + + src_q->c[counter].v -= v; + dst_q->c[counter].v += v; +} + +int bch2_quota_transfer(struct bch_fs *c, unsigned qtypes, + struct bch_qid dst, + struct bch_qid src, u64 space, + enum quota_acct_mode mode) +{ + struct bch_memquota_type *q; + struct bch_memquota *src_q[3], *dst_q[3]; + struct quota_msgs msgs; + unsigned i; + int ret = 0; + + qtypes &= enabled_qtypes(c); + + memset(&msgs, 0, sizeof(msgs)); + + for_each_set_qtype(c, i, q, qtypes) { + src_q[i] = genradix_ptr_alloc(&q->table, src.q[i], GFP_KERNEL); + dst_q[i] = genradix_ptr_alloc(&q->table, dst.q[i], GFP_KERNEL); + if (!src_q[i] || !dst_q[i]) + return -ENOMEM; + } + + for_each_set_qtype(c, i, q, qtypes) + mutex_lock_nested(&q->lock, i); + + for_each_set_qtype(c, i, q, qtypes) { + ret = bch2_quota_check_limit(c, i, dst_q[i], &msgs, Q_SPC, + dst_q[i]->c[Q_SPC].v + space, + mode); + if (ret) + goto err; + + ret = bch2_quota_check_limit(c, i, dst_q[i], &msgs, Q_INO, + dst_q[i]->c[Q_INO].v + 1, + mode); + if (ret) + goto err; + } + + for_each_set_qtype(c, i, q, qtypes) { + __bch2_quota_transfer(src_q[i], dst_q[i], Q_SPC, space); + __bch2_quota_transfer(src_q[i], dst_q[i], Q_INO, 1); + } + +err: + for_each_set_qtype(c, i, q, qtypes) + mutex_unlock(&q->lock); + + flush_warnings(dst, c->vfs_sb, &msgs); + + return ret; +} + +static int __bch2_quota_set(struct bch_fs *c, struct bkey_s_c k, + struct qc_dqblk *qdq) +{ + struct bkey_s_c_quota dq; + struct bch_memquota_type *q; + struct bch_memquota *mq; + unsigned i; + + BUG_ON(k.k->p.inode >= QTYP_NR); + + if (!((1U << k.k->p.inode) & enabled_qtypes(c))) + return 0; + + switch (k.k->type) { + case KEY_TYPE_quota: + dq = bkey_s_c_to_quota(k); + q = &c->quotas[k.k->p.inode]; + + mutex_lock(&q->lock); + mq = genradix_ptr_alloc(&q->table, k.k->p.offset, GFP_KERNEL); + if (!mq) { + mutex_unlock(&q->lock); + return -ENOMEM; + } + + for (i = 0; i < Q_COUNTERS; i++) { + mq->c[i].hardlimit = le64_to_cpu(dq.v->c[i].hardlimit); + mq->c[i].softlimit = le64_to_cpu(dq.v->c[i].softlimit); + } + + if (qdq && qdq->d_fieldmask & QC_SPC_TIMER) + mq->c[Q_SPC].timer = qdq->d_spc_timer; + if (qdq && qdq->d_fieldmask & QC_SPC_WARNS) + mq->c[Q_SPC].warns = qdq->d_spc_warns; + if (qdq && qdq->d_fieldmask & QC_INO_TIMER) + mq->c[Q_INO].timer = qdq->d_ino_timer; + if (qdq && qdq->d_fieldmask & QC_INO_WARNS) + mq->c[Q_INO].warns = qdq->d_ino_warns; + + mutex_unlock(&q->lock); + } + + return 0; +} + +void bch2_fs_quota_exit(struct bch_fs *c) +{ + unsigned i; + + for (i = 0; i < ARRAY_SIZE(c->quotas); i++) + genradix_free(&c->quotas[i].table); +} + +void bch2_fs_quota_init(struct bch_fs *c) +{ + unsigned i; + + for (i = 0; i < ARRAY_SIZE(c->quotas); i++) + mutex_init(&c->quotas[i].lock); +} + +static struct bch_sb_field_quota *bch2_sb_get_or_create_quota(struct bch_sb_handle *sb) +{ + struct bch_sb_field_quota *sb_quota = bch2_sb_field_get(sb->sb, quota); + + if (sb_quota) + return sb_quota; + + sb_quota = bch2_sb_field_resize(sb, quota, sizeof(*sb_quota) / sizeof(u64)); + if (sb_quota) { + unsigned qtype, qc; + + for (qtype = 0; qtype < QTYP_NR; qtype++) + for (qc = 0; qc < Q_COUNTERS; qc++) + sb_quota->q[qtype].c[qc].timelimit = + cpu_to_le32(7 * 24 * 60 * 60); + } + + return sb_quota; +} + +static void bch2_sb_quota_read(struct bch_fs *c) +{ + struct bch_sb_field_quota *sb_quota; + unsigned i, j; + + sb_quota = bch2_sb_field_get(c->disk_sb.sb, quota); + if (!sb_quota) + return; + + for (i = 0; i < QTYP_NR; i++) { + struct bch_memquota_type *q = &c->quotas[i]; + + for (j = 0; j < Q_COUNTERS; j++) { + q->limits[j].timelimit = + le32_to_cpu(sb_quota->q[i].c[j].timelimit); + q->limits[j].warnlimit = + le32_to_cpu(sb_quota->q[i].c[j].warnlimit); + } + } +} + +static int bch2_fs_quota_read_inode(struct btree_trans *trans, + struct btree_iter *iter, + struct bkey_s_c k) +{ + struct bch_fs *c = trans->c; + struct bch_inode_unpacked u; + struct bch_snapshot_tree s_t; + int ret; + + ret = bch2_snapshot_tree_lookup(trans, + bch2_snapshot_tree(c, k.k->p.snapshot), &s_t); + bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), c, + "%s: snapshot tree %u not found", __func__, + snapshot_t(c, k.k->p.snapshot)->tree); + if (ret) + return ret; + + if (!s_t.master_subvol) + goto advance; + + ret = bch2_inode_find_by_inum_nowarn_trans(trans, + (subvol_inum) { + le32_to_cpu(s_t.master_subvol), + k.k->p.offset, + }, &u); + /* + * Inode might be deleted in this snapshot - the easiest way to handle + * that is to just skip it here: + */ + if (bch2_err_matches(ret, ENOENT)) + goto advance; + + if (ret) + return ret; + + bch2_quota_acct(c, bch_qid(&u), Q_SPC, u.bi_sectors, + KEY_TYPE_QUOTA_NOCHECK); + bch2_quota_acct(c, bch_qid(&u), Q_INO, 1, + KEY_TYPE_QUOTA_NOCHECK); +advance: + bch2_btree_iter_set_pos(iter, bpos_nosnap_successor(iter->pos)); + return 0; +} + +int bch2_fs_quota_read(struct bch_fs *c) +{ + struct bch_sb_field_quota *sb_quota; + struct btree_trans *trans; + struct btree_iter iter; + struct bkey_s_c k; + int ret; + + mutex_lock(&c->sb_lock); + sb_quota = bch2_sb_get_or_create_quota(&c->disk_sb); + if (!sb_quota) { + mutex_unlock(&c->sb_lock); + return -BCH_ERR_ENOSPC_sb_quota; + } + + bch2_sb_quota_read(c); + mutex_unlock(&c->sb_lock); + + trans = bch2_trans_get(c); + + ret = for_each_btree_key2(trans, iter, BTREE_ID_quotas, + POS_MIN, BTREE_ITER_PREFETCH, k, + __bch2_quota_set(c, k, NULL)) ?: + for_each_btree_key2(trans, iter, BTREE_ID_inodes, + POS_MIN, BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k, + bch2_fs_quota_read_inode(trans, &iter, k)); + + bch2_trans_put(trans); + + if (ret) + bch_err_fn(c, ret); + return ret; +} + +/* Enable/disable/delete quotas for an entire filesystem: */ + +static int bch2_quota_enable(struct super_block *sb, unsigned uflags) +{ + struct bch_fs *c = sb->s_fs_info; + struct bch_sb_field_quota *sb_quota; + int ret = 0; + + if (sb->s_flags & SB_RDONLY) + return -EROFS; + + /* Accounting must be enabled at mount time: */ + if (uflags & (FS_QUOTA_UDQ_ACCT|FS_QUOTA_GDQ_ACCT|FS_QUOTA_PDQ_ACCT)) + return -EINVAL; + + /* Can't enable enforcement without accounting: */ + if ((uflags & FS_QUOTA_UDQ_ENFD) && !c->opts.usrquota) + return -EINVAL; + + if ((uflags & FS_QUOTA_GDQ_ENFD) && !c->opts.grpquota) + return -EINVAL; + + if (uflags & FS_QUOTA_PDQ_ENFD && !c->opts.prjquota) + return -EINVAL; + + mutex_lock(&c->sb_lock); + sb_quota = bch2_sb_get_or_create_quota(&c->disk_sb); + if (!sb_quota) { + ret = -BCH_ERR_ENOSPC_sb_quota; + goto unlock; + } + + if (uflags & FS_QUOTA_UDQ_ENFD) + SET_BCH_SB_USRQUOTA(c->disk_sb.sb, true); + + if (uflags & FS_QUOTA_GDQ_ENFD) + SET_BCH_SB_GRPQUOTA(c->disk_sb.sb, true); + + if (uflags & FS_QUOTA_PDQ_ENFD) + SET_BCH_SB_PRJQUOTA(c->disk_sb.sb, true); + + bch2_write_super(c); +unlock: + mutex_unlock(&c->sb_lock); + + return bch2_err_class(ret); +} + +static int bch2_quota_disable(struct super_block *sb, unsigned uflags) +{ + struct bch_fs *c = sb->s_fs_info; + + if (sb->s_flags & SB_RDONLY) + return -EROFS; + + mutex_lock(&c->sb_lock); + if (uflags & FS_QUOTA_UDQ_ENFD) + SET_BCH_SB_USRQUOTA(c->disk_sb.sb, false); + + if (uflags & FS_QUOTA_GDQ_ENFD) + SET_BCH_SB_GRPQUOTA(c->disk_sb.sb, false); + + if (uflags & FS_QUOTA_PDQ_ENFD) + SET_BCH_SB_PRJQUOTA(c->disk_sb.sb, false); + + bch2_write_super(c); + mutex_unlock(&c->sb_lock); + + return 0; +} + +static int bch2_quota_remove(struct super_block *sb, unsigned uflags) +{ + struct bch_fs *c = sb->s_fs_info; + int ret; + + if (sb->s_flags & SB_RDONLY) + return -EROFS; + + if (uflags & FS_USER_QUOTA) { + if (c->opts.usrquota) + return -EINVAL; + + ret = bch2_btree_delete_range(c, BTREE_ID_quotas, + POS(QTYP_USR, 0), + POS(QTYP_USR, U64_MAX), + 0, NULL); + if (ret) + return ret; + } + + if (uflags & FS_GROUP_QUOTA) { + if (c->opts.grpquota) + return -EINVAL; + + ret = bch2_btree_delete_range(c, BTREE_ID_quotas, + POS(QTYP_GRP, 0), + POS(QTYP_GRP, U64_MAX), + 0, NULL); + if (ret) + return ret; + } + + if (uflags & FS_PROJ_QUOTA) { + if (c->opts.prjquota) + return -EINVAL; + + ret = bch2_btree_delete_range(c, BTREE_ID_quotas, + POS(QTYP_PRJ, 0), + POS(QTYP_PRJ, U64_MAX), + 0, NULL); + if (ret) + return ret; + } + + return 0; +} + +/* + * Return quota status information, such as enforcements, quota file inode + * numbers etc. + */ +static int bch2_quota_get_state(struct super_block *sb, struct qc_state *state) +{ + struct bch_fs *c = sb->s_fs_info; + unsigned qtypes = enabled_qtypes(c); + unsigned i; + + memset(state, 0, sizeof(*state)); + + for (i = 0; i < QTYP_NR; i++) { + state->s_state[i].flags |= QCI_SYSFILE; + + if (!(qtypes & (1 << i))) + continue; + + state->s_state[i].flags |= QCI_ACCT_ENABLED; + + state->s_state[i].spc_timelimit = c->quotas[i].limits[Q_SPC].timelimit; + state->s_state[i].spc_warnlimit = c->quotas[i].limits[Q_SPC].warnlimit; + + state->s_state[i].ino_timelimit = c->quotas[i].limits[Q_INO].timelimit; + state->s_state[i].ino_warnlimit = c->quotas[i].limits[Q_INO].warnlimit; + } + + return 0; +} + +/* + * Adjust quota timers & warnings + */ +static int bch2_quota_set_info(struct super_block *sb, int type, + struct qc_info *info) +{ + struct bch_fs *c = sb->s_fs_info; + struct bch_sb_field_quota *sb_quota; + int ret = 0; + + if (0) { + struct printbuf buf = PRINTBUF; + + qc_info_to_text(&buf, info); + pr_info("setting:\n%s", buf.buf); + printbuf_exit(&buf); + } + + if (sb->s_flags & SB_RDONLY) + return -EROFS; + + if (type >= QTYP_NR) + return -EINVAL; + + if (!((1 << type) & enabled_qtypes(c))) + return -ESRCH; + + if (info->i_fieldmask & + ~(QC_SPC_TIMER|QC_INO_TIMER|QC_SPC_WARNS|QC_INO_WARNS)) + return -EINVAL; + + mutex_lock(&c->sb_lock); + sb_quota = bch2_sb_get_or_create_quota(&c->disk_sb); + if (!sb_quota) { + ret = -BCH_ERR_ENOSPC_sb_quota; + goto unlock; + } + + if (info->i_fieldmask & QC_SPC_TIMER) + sb_quota->q[type].c[Q_SPC].timelimit = + cpu_to_le32(info->i_spc_timelimit); + + if (info->i_fieldmask & QC_SPC_WARNS) + sb_quota->q[type].c[Q_SPC].warnlimit = + cpu_to_le32(info->i_spc_warnlimit); + + if (info->i_fieldmask & QC_INO_TIMER) + sb_quota->q[type].c[Q_INO].timelimit = + cpu_to_le32(info->i_ino_timelimit); + + if (info->i_fieldmask & QC_INO_WARNS) + sb_quota->q[type].c[Q_INO].warnlimit = + cpu_to_le32(info->i_ino_warnlimit); + + bch2_sb_quota_read(c); + + bch2_write_super(c); +unlock: + mutex_unlock(&c->sb_lock); + + return bch2_err_class(ret); +} + +/* Get/set individual quotas: */ + +static void __bch2_quota_get(struct qc_dqblk *dst, struct bch_memquota *src) +{ + dst->d_space = src->c[Q_SPC].v << 9; + dst->d_spc_hardlimit = src->c[Q_SPC].hardlimit << 9; + dst->d_spc_softlimit = src->c[Q_SPC].softlimit << 9; + dst->d_spc_timer = src->c[Q_SPC].timer; + dst->d_spc_warns = src->c[Q_SPC].warns; + + dst->d_ino_count = src->c[Q_INO].v; + dst->d_ino_hardlimit = src->c[Q_INO].hardlimit; + dst->d_ino_softlimit = src->c[Q_INO].softlimit; + dst->d_ino_timer = src->c[Q_INO].timer; + dst->d_ino_warns = src->c[Q_INO].warns; +} + +static int bch2_get_quota(struct super_block *sb, struct kqid kqid, + struct qc_dqblk *qdq) +{ + struct bch_fs *c = sb->s_fs_info; + struct bch_memquota_type *q = &c->quotas[kqid.type]; + qid_t qid = from_kqid(&init_user_ns, kqid); + struct bch_memquota *mq; + + memset(qdq, 0, sizeof(*qdq)); + + mutex_lock(&q->lock); + mq = genradix_ptr(&q->table, qid); + if (mq) + __bch2_quota_get(qdq, mq); + mutex_unlock(&q->lock); + + return 0; +} + +static int bch2_get_next_quota(struct super_block *sb, struct kqid *kqid, + struct qc_dqblk *qdq) +{ + struct bch_fs *c = sb->s_fs_info; + struct bch_memquota_type *q = &c->quotas[kqid->type]; + qid_t qid = from_kqid(&init_user_ns, *kqid); + struct genradix_iter iter; + struct bch_memquota *mq; + int ret = 0; + + mutex_lock(&q->lock); + + genradix_for_each_from(&q->table, iter, mq, qid) + if (memcmp(mq, page_address(ZERO_PAGE(0)), sizeof(*mq))) { + __bch2_quota_get(qdq, mq); + *kqid = make_kqid(current_user_ns(), kqid->type, iter.pos); + goto found; + } + + ret = -ENOENT; +found: + mutex_unlock(&q->lock); + return bch2_err_class(ret); +} + +static int bch2_set_quota_trans(struct btree_trans *trans, + struct bkey_i_quota *new_quota, + struct qc_dqblk *qdq) +{ + struct btree_iter iter; + struct bkey_s_c k; + int ret; + + k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_quotas, new_quota->k.p, + BTREE_ITER_SLOTS|BTREE_ITER_INTENT); + ret = bkey_err(k); + if (unlikely(ret)) + return ret; + + if (k.k->type == KEY_TYPE_quota) + new_quota->v = *bkey_s_c_to_quota(k).v; + + if (qdq->d_fieldmask & QC_SPC_SOFT) + new_quota->v.c[Q_SPC].softlimit = cpu_to_le64(qdq->d_spc_softlimit >> 9); + if (qdq->d_fieldmask & QC_SPC_HARD) + new_quota->v.c[Q_SPC].hardlimit = cpu_to_le64(qdq->d_spc_hardlimit >> 9); + + if (qdq->d_fieldmask & QC_INO_SOFT) + new_quota->v.c[Q_INO].softlimit = cpu_to_le64(qdq->d_ino_softlimit); + if (qdq->d_fieldmask & QC_INO_HARD) + new_quota->v.c[Q_INO].hardlimit = cpu_to_le64(qdq->d_ino_hardlimit); + + ret = bch2_trans_update(trans, &iter, &new_quota->k_i, 0); + bch2_trans_iter_exit(trans, &iter); + return ret; +} + +static int bch2_set_quota(struct super_block *sb, struct kqid qid, + struct qc_dqblk *qdq) +{ + struct bch_fs *c = sb->s_fs_info; + struct bkey_i_quota new_quota; + int ret; + + if (0) { + struct printbuf buf = PRINTBUF; + + qc_dqblk_to_text(&buf, qdq); + pr_info("setting:\n%s", buf.buf); + printbuf_exit(&buf); + } + + if (sb->s_flags & SB_RDONLY) + return -EROFS; + + bkey_quota_init(&new_quota.k_i); + new_quota.k.p = POS(qid.type, from_kqid(&init_user_ns, qid)); + + ret = bch2_trans_do(c, NULL, NULL, 0, + bch2_set_quota_trans(trans, &new_quota, qdq)) ?: + __bch2_quota_set(c, bkey_i_to_s_c(&new_quota.k_i), qdq); + + return bch2_err_class(ret); +} + +const struct quotactl_ops bch2_quotactl_operations = { + .quota_enable = bch2_quota_enable, + .quota_disable = bch2_quota_disable, + .rm_xquota = bch2_quota_remove, + + .get_state = bch2_quota_get_state, + .set_info = bch2_quota_set_info, + + .get_dqblk = bch2_get_quota, + .get_nextdqblk = bch2_get_next_quota, + .set_dqblk = bch2_set_quota, +}; + +#endif /* CONFIG_BCACHEFS_QUOTA */ diff --git a/fs/bcachefs/quota.h b/fs/bcachefs/quota.h new file mode 100644 index 000000000000..884f601f41c4 --- /dev/null +++ b/fs/bcachefs/quota.h @@ -0,0 +1,74 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_QUOTA_H +#define _BCACHEFS_QUOTA_H + +#include "inode.h" +#include "quota_types.h" + +enum bkey_invalid_flags; +extern const struct bch_sb_field_ops bch_sb_field_ops_quota; + +int bch2_quota_invalid(struct bch_fs *, struct bkey_s_c, + enum bkey_invalid_flags, struct printbuf *); +void bch2_quota_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); + +#define bch2_bkey_ops_quota ((struct bkey_ops) { \ + .key_invalid = bch2_quota_invalid, \ + .val_to_text = bch2_quota_to_text, \ + .min_val_size = 32, \ +}) + +static inline struct bch_qid bch_qid(struct bch_inode_unpacked *u) +{ + return (struct bch_qid) { + .q[QTYP_USR] = u->bi_uid, + .q[QTYP_GRP] = u->bi_gid, + .q[QTYP_PRJ] = u->bi_project ? u->bi_project - 1 : 0, + }; +} + +static inline unsigned enabled_qtypes(struct bch_fs *c) +{ + return ((c->opts.usrquota << QTYP_USR)| + (c->opts.grpquota << QTYP_GRP)| + (c->opts.prjquota << QTYP_PRJ)); +} + +#ifdef CONFIG_BCACHEFS_QUOTA + +int bch2_quota_acct(struct bch_fs *, struct bch_qid, enum quota_counters, + s64, enum quota_acct_mode); + +int bch2_quota_transfer(struct bch_fs *, unsigned, struct bch_qid, + struct bch_qid, u64, enum quota_acct_mode); + +void bch2_fs_quota_exit(struct bch_fs *); +void bch2_fs_quota_init(struct bch_fs *); +int bch2_fs_quota_read(struct bch_fs *); + +extern const struct quotactl_ops bch2_quotactl_operations; + +#else + +static inline int bch2_quota_acct(struct bch_fs *c, struct bch_qid qid, + enum quota_counters counter, s64 v, + enum quota_acct_mode mode) +{ + return 0; +} + +static inline int bch2_quota_transfer(struct bch_fs *c, unsigned qtypes, + struct bch_qid dst, + struct bch_qid src, u64 space, + enum quota_acct_mode mode) +{ + return 0; +} + +static inline void bch2_fs_quota_exit(struct bch_fs *c) {} +static inline void bch2_fs_quota_init(struct bch_fs *c) {} +static inline int bch2_fs_quota_read(struct bch_fs *c) { return 0; } + +#endif + +#endif /* _BCACHEFS_QUOTA_H */ diff --git a/fs/bcachefs/quota_types.h b/fs/bcachefs/quota_types.h new file mode 100644 index 000000000000..6a136083d389 --- /dev/null +++ b/fs/bcachefs/quota_types.h @@ -0,0 +1,43 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_QUOTA_TYPES_H +#define _BCACHEFS_QUOTA_TYPES_H + +#include <linux/generic-radix-tree.h> + +struct bch_qid { + u32 q[QTYP_NR]; +}; + +enum quota_acct_mode { + KEY_TYPE_QUOTA_PREALLOC, + KEY_TYPE_QUOTA_WARN, + KEY_TYPE_QUOTA_NOCHECK, +}; + +struct memquota_counter { + u64 v; + u64 hardlimit; + u64 softlimit; + s64 timer; + int warns; + int warning_issued; +}; + +struct bch_memquota { + struct memquota_counter c[Q_COUNTERS]; +}; + +typedef GENRADIX(struct bch_memquota) bch_memquota_table; + +struct quota_limit { + u32 timelimit; + u32 warnlimit; +}; + +struct bch_memquota_type { + struct quota_limit limits[Q_COUNTERS]; + bch_memquota_table table; + struct mutex lock; +}; + +#endif /* _BCACHEFS_QUOTA_TYPES_H */ diff --git a/fs/bcachefs/rebalance.c b/fs/bcachefs/rebalance.c new file mode 100644 index 000000000000..3319190b8d9c --- /dev/null +++ b/fs/bcachefs/rebalance.c @@ -0,0 +1,464 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" +#include "alloc_background.h" +#include "alloc_foreground.h" +#include "btree_iter.h" +#include "btree_update.h" +#include "btree_write_buffer.h" +#include "buckets.h" +#include "clock.h" +#include "compress.h" +#include "disk_groups.h" +#include "errcode.h" +#include "error.h" +#include "inode.h" +#include "move.h" +#include "rebalance.h" +#include "subvolume.h" +#include "super-io.h" +#include "trace.h" + +#include <linux/freezer.h> +#include <linux/kthread.h> +#include <linux/sched/cputime.h> + +#define REBALANCE_WORK_SCAN_OFFSET (U64_MAX - 1) + +static const char * const bch2_rebalance_state_strs[] = { +#define x(t) #t, + BCH_REBALANCE_STATES() + NULL +#undef x +}; + +static int __bch2_set_rebalance_needs_scan(struct btree_trans *trans, u64 inum) +{ + struct btree_iter iter; + struct bkey_s_c k; + struct bkey_i_cookie *cookie; + u64 v; + int ret; + + bch2_trans_iter_init(trans, &iter, BTREE_ID_rebalance_work, + SPOS(inum, REBALANCE_WORK_SCAN_OFFSET, U32_MAX), + BTREE_ITER_INTENT); + k = bch2_btree_iter_peek_slot(&iter); + ret = bkey_err(k); + if (ret) + goto err; + + v = k.k->type == KEY_TYPE_cookie + ? le64_to_cpu(bkey_s_c_to_cookie(k).v->cookie) + : 0; + + cookie = bch2_trans_kmalloc(trans, sizeof(*cookie)); + ret = PTR_ERR_OR_ZERO(cookie); + if (ret) + goto err; + + bkey_cookie_init(&cookie->k_i); + cookie->k.p = iter.pos; + cookie->v.cookie = cpu_to_le64(v + 1); + + ret = bch2_trans_update(trans, &iter, &cookie->k_i, 0); +err: + bch2_trans_iter_exit(trans, &iter); + return ret; +} + +int bch2_set_rebalance_needs_scan(struct bch_fs *c, u64 inum) +{ + int ret = bch2_trans_do(c, NULL, NULL, BTREE_INSERT_NOFAIL|BTREE_INSERT_LAZY_RW, + __bch2_set_rebalance_needs_scan(trans, inum)); + rebalance_wakeup(c); + return ret; +} + +int bch2_set_fs_needs_rebalance(struct bch_fs *c) +{ + return bch2_set_rebalance_needs_scan(c, 0); +} + +static int bch2_clear_rebalance_needs_scan(struct btree_trans *trans, u64 inum, u64 cookie) +{ + struct btree_iter iter; + struct bkey_s_c k; + u64 v; + int ret; + + bch2_trans_iter_init(trans, &iter, BTREE_ID_rebalance_work, + SPOS(inum, REBALANCE_WORK_SCAN_OFFSET, U32_MAX), + BTREE_ITER_INTENT); + k = bch2_btree_iter_peek_slot(&iter); + ret = bkey_err(k); + if (ret) + goto err; + + v = k.k->type == KEY_TYPE_cookie + ? le64_to_cpu(bkey_s_c_to_cookie(k).v->cookie) + : 0; + + if (v == cookie) + ret = bch2_btree_delete_at(trans, &iter, 0); +err: + bch2_trans_iter_exit(trans, &iter); + return ret; +} + +static struct bkey_s_c next_rebalance_entry(struct btree_trans *trans, + struct btree_iter *work_iter) +{ + return !kthread_should_stop() + ? bch2_btree_iter_peek(work_iter) + : bkey_s_c_null; +} + +static int bch2_bkey_clear_needs_rebalance(struct btree_trans *trans, + struct btree_iter *iter, + struct bkey_s_c k) +{ + struct bkey_i *n = bch2_bkey_make_mut(trans, iter, &k, 0); + int ret = PTR_ERR_OR_ZERO(n); + if (ret) + return ret; + + extent_entry_drop(bkey_i_to_s(n), + (void *) bch2_bkey_rebalance_opts(bkey_i_to_s_c(n))); + return bch2_trans_commit(trans, NULL, NULL, BTREE_INSERT_NOFAIL); +} + +static struct bkey_s_c next_rebalance_extent(struct btree_trans *trans, + struct bpos work_pos, + struct btree_iter *extent_iter, + struct data_update_opts *data_opts) +{ + struct bch_fs *c = trans->c; + struct bkey_s_c k; + + bch2_trans_iter_exit(trans, extent_iter); + bch2_trans_iter_init(trans, extent_iter, + work_pos.inode ? BTREE_ID_extents : BTREE_ID_reflink, + work_pos, + BTREE_ITER_ALL_SNAPSHOTS); + k = bch2_btree_iter_peek_slot(extent_iter); + if (bkey_err(k)) + return k; + + const struct bch_extent_rebalance *r = k.k ? bch2_bkey_rebalance_opts(k) : NULL; + if (!r) { + /* raced due to btree write buffer, nothing to do */ + return bkey_s_c_null; + } + + memset(data_opts, 0, sizeof(*data_opts)); + + data_opts->rewrite_ptrs = + bch2_bkey_ptrs_need_rebalance(c, k, r->target, r->compression); + data_opts->target = r->target; + + if (!data_opts->rewrite_ptrs) { + /* + * device we would want to write to offline? devices in target + * changed? + * + * We'll now need a full scan before this extent is picked up + * again: + */ + int ret = bch2_bkey_clear_needs_rebalance(trans, extent_iter, k); + if (ret) + return bkey_s_c_err(ret); + return bkey_s_c_null; + } + + return k; +} + +noinline_for_stack +static int do_rebalance_extent(struct moving_context *ctxt, + struct bpos work_pos, + struct btree_iter *extent_iter) +{ + struct btree_trans *trans = ctxt->trans; + struct bch_fs *c = trans->c; + struct bch_fs_rebalance *r = &trans->c->rebalance; + struct data_update_opts data_opts; + struct bch_io_opts io_opts; + struct bkey_s_c k; + struct bkey_buf sk; + int ret; + + ctxt->stats = &r->work_stats; + r->state = BCH_REBALANCE_working; + + bch2_bkey_buf_init(&sk); + + ret = bkey_err(k = next_rebalance_extent(trans, work_pos, + extent_iter, &data_opts)); + if (ret || !k.k) + goto out; + + ret = bch2_move_get_io_opts_one(trans, &io_opts, k); + if (ret) + goto out; + + atomic64_add(k.k->size, &ctxt->stats->sectors_seen); + + /* + * The iterator gets unlocked by __bch2_read_extent - need to + * save a copy of @k elsewhere: + */ + bch2_bkey_buf_reassemble(&sk, c, k); + k = bkey_i_to_s_c(sk.k); + + ret = bch2_move_extent(ctxt, NULL, extent_iter, k, io_opts, data_opts); + if (ret) { + if (bch2_err_matches(ret, ENOMEM)) { + /* memory allocation failure, wait for some IO to finish */ + bch2_move_ctxt_wait_for_io(ctxt); + ret = -BCH_ERR_transaction_restart_nested; + } + + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + goto out; + + /* skip it and continue, XXX signal failure */ + ret = 0; + } +out: + bch2_bkey_buf_exit(&sk, c); + return ret; +} + +static bool rebalance_pred(struct bch_fs *c, void *arg, + struct bkey_s_c k, + struct bch_io_opts *io_opts, + struct data_update_opts *data_opts) +{ + unsigned target, compression; + + if (k.k->p.inode) { + target = io_opts->background_target; + compression = io_opts->background_compression ?: io_opts->compression; + } else { + const struct bch_extent_rebalance *r = bch2_bkey_rebalance_opts(k); + + target = r ? r->target : io_opts->background_target; + compression = r ? r->compression : + (io_opts->background_compression ?: io_opts->compression); + } + + data_opts->rewrite_ptrs = bch2_bkey_ptrs_need_rebalance(c, k, target, compression); + data_opts->target = target; + return data_opts->rewrite_ptrs != 0; +} + +static int do_rebalance_scan(struct moving_context *ctxt, u64 inum, u64 cookie) +{ + struct btree_trans *trans = ctxt->trans; + struct bch_fs_rebalance *r = &trans->c->rebalance; + int ret; + + bch2_move_stats_init(&r->scan_stats, "rebalance_scan"); + ctxt->stats = &r->scan_stats; + + if (!inum) { + r->scan_start = BBPOS_MIN; + r->scan_end = BBPOS_MAX; + } else { + r->scan_start = BBPOS(BTREE_ID_extents, POS(inum, 0)); + r->scan_end = BBPOS(BTREE_ID_extents, POS(inum, U64_MAX)); + } + + r->state = BCH_REBALANCE_scanning; + + ret = __bch2_move_data(ctxt, r->scan_start, r->scan_end, rebalance_pred, NULL) ?: + commit_do(trans, NULL, NULL, BTREE_INSERT_NOFAIL, + bch2_clear_rebalance_needs_scan(trans, inum, cookie)); + + bch2_move_stats_exit(&r->scan_stats, trans->c); + return ret; +} + +static void rebalance_wait(struct bch_fs *c) +{ + struct bch_fs_rebalance *r = &c->rebalance; + struct io_clock *clock = &c->io_clock[WRITE]; + u64 now = atomic64_read(&clock->now); + u64 min_member_capacity = bch2_min_rw_member_capacity(c); + + if (min_member_capacity == U64_MAX) + min_member_capacity = 128 * 2048; + + r->wait_iotime_end = now + (min_member_capacity >> 6); + + if (r->state != BCH_REBALANCE_waiting) { + r->wait_iotime_start = now; + r->wait_wallclock_start = ktime_get_real_ns(); + r->state = BCH_REBALANCE_waiting; + } + + bch2_kthread_io_clock_wait(clock, r->wait_iotime_end, MAX_SCHEDULE_TIMEOUT); +} + +static int do_rebalance(struct moving_context *ctxt) +{ + struct btree_trans *trans = ctxt->trans; + struct bch_fs *c = trans->c; + struct bch_fs_rebalance *r = &c->rebalance; + struct btree_iter rebalance_work_iter, extent_iter = { NULL }; + struct bkey_s_c k; + int ret = 0; + + bch2_move_stats_init(&r->work_stats, "rebalance_work"); + bch2_move_stats_init(&r->scan_stats, "rebalance_scan"); + + bch2_trans_iter_init(trans, &rebalance_work_iter, + BTREE_ID_rebalance_work, POS_MIN, + BTREE_ITER_ALL_SNAPSHOTS); + + while (!bch2_move_ratelimit(ctxt) && + !kthread_wait_freezable(r->enabled)) { + bch2_trans_begin(trans); + + ret = bkey_err(k = next_rebalance_entry(trans, &rebalance_work_iter)); + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + continue; + if (ret || !k.k) + break; + + ret = k.k->type == KEY_TYPE_cookie + ? do_rebalance_scan(ctxt, k.k->p.inode, + le64_to_cpu(bkey_s_c_to_cookie(k).v->cookie)) + : do_rebalance_extent(ctxt, k.k->p, &extent_iter); + + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + continue; + if (ret) + break; + + bch2_btree_iter_advance(&rebalance_work_iter); + } + + bch2_trans_iter_exit(trans, &extent_iter); + bch2_trans_iter_exit(trans, &rebalance_work_iter); + bch2_move_stats_exit(&r->scan_stats, c); + + if (!ret && + !kthread_should_stop() && + !atomic64_read(&r->work_stats.sectors_seen) && + !atomic64_read(&r->scan_stats.sectors_seen)) { + bch2_trans_unlock_long(trans); + rebalance_wait(c); + } + + if (!bch2_err_matches(ret, EROFS)) + bch_err_fn(c, ret); + return ret; +} + +static int bch2_rebalance_thread(void *arg) +{ + struct bch_fs *c = arg; + struct bch_fs_rebalance *r = &c->rebalance; + struct moving_context ctxt; + int ret; + + set_freezable(); + + bch2_moving_ctxt_init(&ctxt, c, NULL, &r->work_stats, + writepoint_ptr(&c->rebalance_write_point), + true); + + while (!kthread_should_stop() && + !(ret = do_rebalance(&ctxt))) + ; + + bch2_moving_ctxt_exit(&ctxt); + + return 0; +} + +void bch2_rebalance_status_to_text(struct printbuf *out, struct bch_fs *c) +{ + struct bch_fs_rebalance *r = &c->rebalance; + + prt_str(out, bch2_rebalance_state_strs[r->state]); + prt_newline(out); + printbuf_indent_add(out, 2); + + switch (r->state) { + case BCH_REBALANCE_waiting: { + u64 now = atomic64_read(&c->io_clock[WRITE].now); + + prt_str(out, "io wait duration: "); + bch2_prt_human_readable_s64(out, r->wait_iotime_end - r->wait_iotime_start); + prt_newline(out); + + prt_str(out, "io wait remaining: "); + bch2_prt_human_readable_s64(out, r->wait_iotime_end - now); + prt_newline(out); + + prt_str(out, "duration waited: "); + bch2_pr_time_units(out, ktime_get_real_ns() - r->wait_wallclock_start); + prt_newline(out); + break; + } + case BCH_REBALANCE_working: + bch2_move_stats_to_text(out, &r->work_stats); + break; + case BCH_REBALANCE_scanning: + bch2_move_stats_to_text(out, &r->scan_stats); + break; + } + prt_newline(out); + printbuf_indent_sub(out, 2); +} + +void bch2_rebalance_stop(struct bch_fs *c) +{ + struct task_struct *p; + + c->rebalance.pd.rate.rate = UINT_MAX; + bch2_ratelimit_reset(&c->rebalance.pd.rate); + + p = rcu_dereference_protected(c->rebalance.thread, 1); + c->rebalance.thread = NULL; + + if (p) { + /* for sychronizing with rebalance_wakeup() */ + synchronize_rcu(); + + kthread_stop(p); + put_task_struct(p); + } +} + +int bch2_rebalance_start(struct bch_fs *c) +{ + struct task_struct *p; + int ret; + + if (c->rebalance.thread) + return 0; + + if (c->opts.nochanges) + return 0; + + p = kthread_create(bch2_rebalance_thread, c, "bch-rebalance/%s", c->name); + ret = PTR_ERR_OR_ZERO(p); + if (ret) { + bch_err_msg(c, ret, "creating rebalance thread"); + return ret; + } + + get_task_struct(p); + rcu_assign_pointer(c->rebalance.thread, p); + wake_up_process(p); + return 0; +} + +void bch2_fs_rebalance_init(struct bch_fs *c) +{ + bch2_pd_controller_init(&c->rebalance.pd); +} diff --git a/fs/bcachefs/rebalance.h b/fs/bcachefs/rebalance.h new file mode 100644 index 000000000000..28a52638f16c --- /dev/null +++ b/fs/bcachefs/rebalance.h @@ -0,0 +1,27 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_REBALANCE_H +#define _BCACHEFS_REBALANCE_H + +#include "rebalance_types.h" + +int bch2_set_rebalance_needs_scan(struct bch_fs *, u64 inum); +int bch2_set_fs_needs_rebalance(struct bch_fs *); + +static inline void rebalance_wakeup(struct bch_fs *c) +{ + struct task_struct *p; + + rcu_read_lock(); + p = rcu_dereference(c->rebalance.thread); + if (p) + wake_up_process(p); + rcu_read_unlock(); +} + +void bch2_rebalance_status_to_text(struct printbuf *, struct bch_fs *); + +void bch2_rebalance_stop(struct bch_fs *); +int bch2_rebalance_start(struct bch_fs *); +void bch2_fs_rebalance_init(struct bch_fs *); + +#endif /* _BCACHEFS_REBALANCE_H */ diff --git a/fs/bcachefs/rebalance_types.h b/fs/bcachefs/rebalance_types.h new file mode 100644 index 000000000000..0fffb536c1d0 --- /dev/null +++ b/fs/bcachefs/rebalance_types.h @@ -0,0 +1,37 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_REBALANCE_TYPES_H +#define _BCACHEFS_REBALANCE_TYPES_H + +#include "bbpos_types.h" +#include "move_types.h" + +#define BCH_REBALANCE_STATES() \ + x(waiting) \ + x(working) \ + x(scanning) + +enum bch_rebalance_states { +#define x(t) BCH_REBALANCE_##t, + BCH_REBALANCE_STATES() +#undef x +}; + +struct bch_fs_rebalance { + struct task_struct __rcu *thread; + struct bch_pd_controller pd; + + enum bch_rebalance_states state; + u64 wait_iotime_start; + u64 wait_iotime_end; + u64 wait_wallclock_start; + + struct bch_move_stats work_stats; + + struct bbpos scan_start; + struct bbpos scan_end; + struct bch_move_stats scan_stats; + + unsigned enabled:1; +}; + +#endif /* _BCACHEFS_REBALANCE_TYPES_H */ diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c new file mode 100644 index 000000000000..c7d9074c82d9 --- /dev/null +++ b/fs/bcachefs/recovery.c @@ -0,0 +1,1060 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" +#include "backpointers.h" +#include "bkey_buf.h" +#include "alloc_background.h" +#include "btree_gc.h" +#include "btree_journal_iter.h" +#include "btree_update.h" +#include "btree_update_interior.h" +#include "btree_io.h" +#include "buckets.h" +#include "dirent.h" +#include "ec.h" +#include "errcode.h" +#include "error.h" +#include "fs-common.h" +#include "fsck.h" +#include "journal_io.h" +#include "journal_reclaim.h" +#include "journal_seq_blacklist.h" +#include "lru.h" +#include "logged_ops.h" +#include "move.h" +#include "quota.h" +#include "rebalance.h" +#include "recovery.h" +#include "replicas.h" +#include "sb-clean.h" +#include "snapshot.h" +#include "subvolume.h" +#include "super-io.h" + +#include <linux/sort.h> +#include <linux/stat.h> + +#define QSTR(n) { { { .len = strlen(n) } }, .name = n } + +static bool btree_id_is_alloc(enum btree_id id) +{ + switch (id) { + case BTREE_ID_alloc: + case BTREE_ID_backpointers: + case BTREE_ID_need_discard: + case BTREE_ID_freespace: + case BTREE_ID_bucket_gens: + return true; + default: + return false; + } +} + +/* for -o reconstruct_alloc: */ +static void drop_alloc_keys(struct journal_keys *keys) +{ + size_t src, dst; + + for (src = 0, dst = 0; src < keys->nr; src++) + if (!btree_id_is_alloc(keys->d[src].btree_id)) + keys->d[dst++] = keys->d[src]; + + keys->nr = dst; +} + +/* + * Btree node pointers have a field to stack a pointer to the in memory btree + * node; we need to zero out this field when reading in btree nodes, or when + * reading in keys from the journal: + */ +static void zero_out_btree_mem_ptr(struct journal_keys *keys) +{ + struct journal_key *i; + + for (i = keys->d; i < keys->d + keys->nr; i++) + if (i->k->k.type == KEY_TYPE_btree_ptr_v2) + bkey_i_to_btree_ptr_v2(i->k)->v.mem_ptr = 0; +} + +/* journal replay: */ + +static void replay_now_at(struct journal *j, u64 seq) +{ + BUG_ON(seq < j->replay_journal_seq); + + seq = min(seq, j->replay_journal_seq_end); + + while (j->replay_journal_seq < seq) + bch2_journal_pin_put(j, j->replay_journal_seq++); +} + +static int bch2_journal_replay_key(struct btree_trans *trans, + struct journal_key *k) +{ + struct btree_iter iter; + unsigned iter_flags = + BTREE_ITER_INTENT| + BTREE_ITER_NOT_EXTENTS; + unsigned update_flags = BTREE_TRIGGER_NORUN; + int ret; + + /* + * BTREE_UPDATE_KEY_CACHE_RECLAIM disables key cache lookup/update to + * keep the key cache coherent with the underlying btree. Nothing + * besides the allocator is doing updates yet so we don't need key cache + * coherency for non-alloc btrees, and key cache fills for snapshots + * btrees use BTREE_ITER_FILTER_SNAPSHOTS, which isn't available until + * the snapshots recovery pass runs. + */ + if (!k->level && k->btree_id == BTREE_ID_alloc) + iter_flags |= BTREE_ITER_CACHED; + else + update_flags |= BTREE_UPDATE_KEY_CACHE_RECLAIM; + + bch2_trans_node_iter_init(trans, &iter, k->btree_id, k->k->k.p, + BTREE_MAX_DEPTH, k->level, + iter_flags); + ret = bch2_btree_iter_traverse(&iter); + if (ret) + goto out; + + /* Must be checked with btree locked: */ + if (k->overwritten) + goto out; + + ret = bch2_trans_update(trans, &iter, k->k, update_flags); +out: + bch2_trans_iter_exit(trans, &iter); + return ret; +} + +static int journal_sort_seq_cmp(const void *_l, const void *_r) +{ + const struct journal_key *l = *((const struct journal_key **)_l); + const struct journal_key *r = *((const struct journal_key **)_r); + + return cmp_int(l->journal_seq, r->journal_seq); +} + +static int bch2_journal_replay(struct bch_fs *c) +{ + struct journal_keys *keys = &c->journal_keys; + struct journal_key **keys_sorted, *k; + struct journal *j = &c->journal; + u64 start_seq = c->journal_replay_seq_start; + u64 end_seq = c->journal_replay_seq_start; + size_t i; + int ret = 0; + + move_gap(keys->d, keys->nr, keys->size, keys->gap, keys->nr); + keys->gap = keys->nr; + + keys_sorted = kvmalloc_array(keys->nr, sizeof(*keys_sorted), GFP_KERNEL); + if (!keys_sorted) + return -BCH_ERR_ENOMEM_journal_replay; + + for (i = 0; i < keys->nr; i++) + keys_sorted[i] = &keys->d[i]; + + sort(keys_sorted, keys->nr, + sizeof(keys_sorted[0]), + journal_sort_seq_cmp, NULL); + + if (keys->nr) { + ret = bch2_journal_log_msg(c, "Starting journal replay (%zu keys in entries %llu-%llu)", + keys->nr, start_seq, end_seq); + if (ret) + goto err; + } + + BUG_ON(!atomic_read(&keys->ref)); + + for (i = 0; i < keys->nr; i++) { + k = keys_sorted[i]; + + cond_resched(); + + replay_now_at(j, k->journal_seq); + + ret = bch2_trans_do(c, NULL, NULL, + BTREE_INSERT_LAZY_RW| + BTREE_INSERT_NOFAIL| + (!k->allocated + ? BTREE_INSERT_JOURNAL_REPLAY|BCH_WATERMARK_reclaim + : 0), + bch2_journal_replay_key(trans, k)); + if (ret) { + bch_err(c, "journal replay: error while replaying key at btree %s level %u: %s", + bch2_btree_id_str(k->btree_id), k->level, bch2_err_str(ret)); + goto err; + } + } + + if (!c->opts.keep_journal) + bch2_journal_keys_put_initial(c); + + replay_now_at(j, j->replay_journal_seq_end); + j->replay_journal_seq = 0; + + bch2_journal_set_replay_done(j); + bch2_journal_flush_all_pins(j); + ret = bch2_journal_error(j); + + if (keys->nr && !ret) + bch2_journal_log_msg(c, "journal replay finished"); +err: + kvfree(keys_sorted); + + if (ret) + bch_err_fn(c, ret); + return ret; +} + +/* journal replay early: */ + +static int journal_replay_entry_early(struct bch_fs *c, + struct jset_entry *entry) +{ + int ret = 0; + + switch (entry->type) { + case BCH_JSET_ENTRY_btree_root: { + struct btree_root *r; + + while (entry->btree_id >= c->btree_roots_extra.nr + BTREE_ID_NR) { + ret = darray_push(&c->btree_roots_extra, (struct btree_root) { NULL }); + if (ret) + return ret; + } + + r = bch2_btree_id_root(c, entry->btree_id); + + if (entry->u64s) { + r->level = entry->level; + bkey_copy(&r->key, (struct bkey_i *) entry->start); + r->error = 0; + } else { + r->error = -EIO; + } + r->alive = true; + break; + } + case BCH_JSET_ENTRY_usage: { + struct jset_entry_usage *u = + container_of(entry, struct jset_entry_usage, entry); + + switch (entry->btree_id) { + case BCH_FS_USAGE_reserved: + if (entry->level < BCH_REPLICAS_MAX) + c->usage_base->persistent_reserved[entry->level] = + le64_to_cpu(u->v); + break; + case BCH_FS_USAGE_inodes: + c->usage_base->nr_inodes = le64_to_cpu(u->v); + break; + case BCH_FS_USAGE_key_version: + atomic64_set(&c->key_version, + le64_to_cpu(u->v)); + break; + } + + break; + } + case BCH_JSET_ENTRY_data_usage: { + struct jset_entry_data_usage *u = + container_of(entry, struct jset_entry_data_usage, entry); + + ret = bch2_replicas_set_usage(c, &u->r, + le64_to_cpu(u->v)); + break; + } + case BCH_JSET_ENTRY_dev_usage: { + struct jset_entry_dev_usage *u = + container_of(entry, struct jset_entry_dev_usage, entry); + struct bch_dev *ca = bch_dev_bkey_exists(c, le32_to_cpu(u->dev)); + unsigned i, nr_types = jset_entry_dev_usage_nr_types(u); + + ca->usage_base->buckets_ec = le64_to_cpu(u->buckets_ec); + + for (i = 0; i < min_t(unsigned, nr_types, BCH_DATA_NR); i++) { + ca->usage_base->d[i].buckets = le64_to_cpu(u->d[i].buckets); + ca->usage_base->d[i].sectors = le64_to_cpu(u->d[i].sectors); + ca->usage_base->d[i].fragmented = le64_to_cpu(u->d[i].fragmented); + } + + break; + } + case BCH_JSET_ENTRY_blacklist: { + struct jset_entry_blacklist *bl_entry = + container_of(entry, struct jset_entry_blacklist, entry); + + ret = bch2_journal_seq_blacklist_add(c, + le64_to_cpu(bl_entry->seq), + le64_to_cpu(bl_entry->seq) + 1); + break; + } + case BCH_JSET_ENTRY_blacklist_v2: { + struct jset_entry_blacklist_v2 *bl_entry = + container_of(entry, struct jset_entry_blacklist_v2, entry); + + ret = bch2_journal_seq_blacklist_add(c, + le64_to_cpu(bl_entry->start), + le64_to_cpu(bl_entry->end) + 1); + break; + } + case BCH_JSET_ENTRY_clock: { + struct jset_entry_clock *clock = + container_of(entry, struct jset_entry_clock, entry); + + atomic64_set(&c->io_clock[clock->rw].now, le64_to_cpu(clock->time)); + } + } + + return ret; +} + +static int journal_replay_early(struct bch_fs *c, + struct bch_sb_field_clean *clean) +{ + struct jset_entry *entry; + int ret; + + if (clean) { + for (entry = clean->start; + entry != vstruct_end(&clean->field); + entry = vstruct_next(entry)) { + ret = journal_replay_entry_early(c, entry); + if (ret) + return ret; + } + } else { + struct genradix_iter iter; + struct journal_replay *i, **_i; + + genradix_for_each(&c->journal_entries, iter, _i) { + i = *_i; + + if (!i || i->ignore) + continue; + + vstruct_for_each(&i->j, entry) { + ret = journal_replay_entry_early(c, entry); + if (ret) + return ret; + } + } + } + + bch2_fs_usage_initialize(c); + + return 0; +} + +/* sb clean section: */ + +static int read_btree_roots(struct bch_fs *c) +{ + unsigned i; + int ret = 0; + + for (i = 0; i < btree_id_nr_alive(c); i++) { + struct btree_root *r = bch2_btree_id_root(c, i); + + if (!r->alive) + continue; + + if (btree_id_is_alloc(i) && + c->opts.reconstruct_alloc) { + c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info); + continue; + } + + if (r->error) { + __fsck_err(c, + btree_id_is_alloc(i) + ? FSCK_CAN_IGNORE : 0, + btree_root_bkey_invalid, + "invalid btree root %s", + bch2_btree_id_str(i)); + if (i == BTREE_ID_alloc) + c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info); + } + + ret = bch2_btree_root_read(c, i, &r->key, r->level); + if (ret) { + fsck_err(c, + btree_root_read_error, + "error reading btree root %s", + bch2_btree_id_str(i)); + if (btree_id_is_alloc(i)) + c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info); + ret = 0; + } + } + + for (i = 0; i < BTREE_ID_NR; i++) { + struct btree_root *r = bch2_btree_id_root(c, i); + + if (!r->b) { + r->alive = false; + r->level = 0; + bch2_btree_root_alloc(c, i); + } + } +fsck_err: + return ret; +} + +static int bch2_initialize_subvolumes(struct bch_fs *c) +{ + struct bkey_i_snapshot_tree root_tree; + struct bkey_i_snapshot root_snapshot; + struct bkey_i_subvolume root_volume; + int ret; + + bkey_snapshot_tree_init(&root_tree.k_i); + root_tree.k.p.offset = 1; + root_tree.v.master_subvol = cpu_to_le32(1); + root_tree.v.root_snapshot = cpu_to_le32(U32_MAX); + + bkey_snapshot_init(&root_snapshot.k_i); + root_snapshot.k.p.offset = U32_MAX; + root_snapshot.v.flags = 0; + root_snapshot.v.parent = 0; + root_snapshot.v.subvol = cpu_to_le32(BCACHEFS_ROOT_SUBVOL); + root_snapshot.v.tree = cpu_to_le32(1); + SET_BCH_SNAPSHOT_SUBVOL(&root_snapshot.v, true); + + bkey_subvolume_init(&root_volume.k_i); + root_volume.k.p.offset = BCACHEFS_ROOT_SUBVOL; + root_volume.v.flags = 0; + root_volume.v.snapshot = cpu_to_le32(U32_MAX); + root_volume.v.inode = cpu_to_le64(BCACHEFS_ROOT_INO); + + ret = bch2_btree_insert(c, BTREE_ID_snapshot_trees, &root_tree.k_i, NULL, 0) ?: + bch2_btree_insert(c, BTREE_ID_snapshots, &root_snapshot.k_i, NULL, 0) ?: + bch2_btree_insert(c, BTREE_ID_subvolumes, &root_volume.k_i, NULL, 0); + if (ret) + bch_err_fn(c, ret); + return ret; +} + +static int __bch2_fs_upgrade_for_subvolumes(struct btree_trans *trans) +{ + struct btree_iter iter; + struct bkey_s_c k; + struct bch_inode_unpacked inode; + int ret; + + k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes, + SPOS(0, BCACHEFS_ROOT_INO, U32_MAX), 0); + ret = bkey_err(k); + if (ret) + return ret; + + if (!bkey_is_inode(k.k)) { + bch_err(trans->c, "root inode not found"); + ret = -BCH_ERR_ENOENT_inode; + goto err; + } + + ret = bch2_inode_unpack(k, &inode); + BUG_ON(ret); + + inode.bi_subvol = BCACHEFS_ROOT_SUBVOL; + + ret = bch2_inode_write(trans, &iter, &inode); +err: + bch2_trans_iter_exit(trans, &iter); + return ret; +} + +/* set bi_subvol on root inode */ +noinline_for_stack +static int bch2_fs_upgrade_for_subvolumes(struct bch_fs *c) +{ + int ret = bch2_trans_do(c, NULL, NULL, BTREE_INSERT_LAZY_RW, + __bch2_fs_upgrade_for_subvolumes(trans)); + if (ret) + bch_err_fn(c, ret); + return ret; +} + +const char * const bch2_recovery_passes[] = { +#define x(_fn, _when) #_fn, + BCH_RECOVERY_PASSES() +#undef x + NULL +}; + +static int bch2_check_allocations(struct bch_fs *c) +{ + return bch2_gc(c, true, c->opts.norecovery); +} + +static int bch2_set_may_go_rw(struct bch_fs *c) +{ + set_bit(BCH_FS_MAY_GO_RW, &c->flags); + return 0; +} + +struct recovery_pass_fn { + int (*fn)(struct bch_fs *); + unsigned when; +}; + +static struct recovery_pass_fn recovery_pass_fns[] = { +#define x(_fn, _when) { .fn = bch2_##_fn, .when = _when }, + BCH_RECOVERY_PASSES() +#undef x +}; + +static void check_version_upgrade(struct bch_fs *c) +{ + unsigned latest_compatible = bch2_latest_compatible_version(c->sb.version); + unsigned latest_version = bcachefs_metadata_version_current; + unsigned old_version = c->sb.version_upgrade_complete ?: c->sb.version; + unsigned new_version = 0; + u64 recovery_passes; + + if (old_version < bcachefs_metadata_required_upgrade_below) { + if (c->opts.version_upgrade == BCH_VERSION_UPGRADE_incompatible || + latest_compatible < bcachefs_metadata_required_upgrade_below) + new_version = latest_version; + else + new_version = latest_compatible; + } else { + switch (c->opts.version_upgrade) { + case BCH_VERSION_UPGRADE_compatible: + new_version = latest_compatible; + break; + case BCH_VERSION_UPGRADE_incompatible: + new_version = latest_version; + break; + case BCH_VERSION_UPGRADE_none: + new_version = old_version; + break; + } + } + + if (new_version > old_version) { + struct printbuf buf = PRINTBUF; + + if (old_version < bcachefs_metadata_required_upgrade_below) + prt_str(&buf, "Version upgrade required:\n"); + + if (old_version != c->sb.version) { + prt_str(&buf, "Version upgrade from "); + bch2_version_to_text(&buf, c->sb.version_upgrade_complete); + prt_str(&buf, " to "); + bch2_version_to_text(&buf, c->sb.version); + prt_str(&buf, " incomplete\n"); + } + + prt_printf(&buf, "Doing %s version upgrade from ", + BCH_VERSION_MAJOR(old_version) != BCH_VERSION_MAJOR(new_version) + ? "incompatible" : "compatible"); + bch2_version_to_text(&buf, old_version); + prt_str(&buf, " to "); + bch2_version_to_text(&buf, new_version); + prt_newline(&buf); + + recovery_passes = bch2_upgrade_recovery_passes(c, old_version, new_version); + if (recovery_passes) { + if ((recovery_passes & RECOVERY_PASS_ALL_FSCK) == RECOVERY_PASS_ALL_FSCK) + prt_str(&buf, "fsck required"); + else { + prt_str(&buf, "running recovery passes: "); + prt_bitflags(&buf, bch2_recovery_passes, recovery_passes); + } + + c->recovery_passes_explicit |= recovery_passes; + c->opts.fix_errors = FSCK_FIX_yes; + } + + bch_info(c, "%s", buf.buf); + + mutex_lock(&c->sb_lock); + bch2_sb_upgrade(c, new_version); + mutex_unlock(&c->sb_lock); + + printbuf_exit(&buf); + } +} + +u64 bch2_fsck_recovery_passes(void) +{ + u64 ret = 0; + + for (unsigned i = 0; i < ARRAY_SIZE(recovery_pass_fns); i++) + if (recovery_pass_fns[i].when & PASS_FSCK) + ret |= BIT_ULL(i); + return ret; +} + +static bool should_run_recovery_pass(struct bch_fs *c, enum bch_recovery_pass pass) +{ + struct recovery_pass_fn *p = recovery_pass_fns + c->curr_recovery_pass; + + if (c->opts.norecovery && pass > BCH_RECOVERY_PASS_snapshots_read) + return false; + if (c->recovery_passes_explicit & BIT_ULL(pass)) + return true; + if ((p->when & PASS_FSCK) && c->opts.fsck) + return true; + if ((p->when & PASS_UNCLEAN) && !c->sb.clean) + return true; + if (p->when & PASS_ALWAYS) + return true; + return false; +} + +static int bch2_run_recovery_pass(struct bch_fs *c, enum bch_recovery_pass pass) +{ + int ret; + + c->curr_recovery_pass = pass; + + if (should_run_recovery_pass(c, pass)) { + struct recovery_pass_fn *p = recovery_pass_fns + pass; + + if (!(p->when & PASS_SILENT)) + printk(KERN_INFO bch2_log_msg(c, "%s..."), + bch2_recovery_passes[pass]); + ret = p->fn(c); + if (ret) + return ret; + if (!(p->when & PASS_SILENT)) + printk(KERN_CONT " done\n"); + + c->recovery_passes_complete |= BIT_ULL(pass); + } + + return 0; +} + +static int bch2_run_recovery_passes(struct bch_fs *c) +{ + int ret = 0; + + while (c->curr_recovery_pass < ARRAY_SIZE(recovery_pass_fns)) { + ret = bch2_run_recovery_pass(c, c->curr_recovery_pass); + if (bch2_err_matches(ret, BCH_ERR_restart_recovery)) + continue; + if (ret) + break; + c->curr_recovery_pass++; + } + + return ret; +} + +int bch2_fs_recovery(struct bch_fs *c) +{ + struct bch_sb_field_clean *clean = NULL; + struct jset *last_journal_entry = NULL; + u64 last_seq = 0, blacklist_seq, journal_seq; + bool write_sb = false; + int ret = 0; + + if (c->sb.clean) { + clean = bch2_read_superblock_clean(c); + ret = PTR_ERR_OR_ZERO(clean); + if (ret) + goto err; + + bch_info(c, "recovering from clean shutdown, journal seq %llu", + le64_to_cpu(clean->journal_seq)); + } else { + bch_info(c, "recovering from unclean shutdown"); + } + + if (!(c->sb.features & (1ULL << BCH_FEATURE_new_extent_overwrite))) { + bch_err(c, "feature new_extent_overwrite not set, filesystem no longer supported"); + ret = -EINVAL; + goto err; + } + + if (!c->sb.clean && + !(c->sb.features & (1ULL << BCH_FEATURE_extents_above_btree_updates))) { + bch_err(c, "filesystem needs recovery from older version; run fsck from older bcachefs-tools to fix"); + ret = -EINVAL; + goto err; + } + + if (c->opts.fsck || !(c->opts.nochanges && c->opts.norecovery)) + check_version_upgrade(c); + + if (c->opts.fsck && c->opts.norecovery) { + bch_err(c, "cannot select both norecovery and fsck"); + ret = -EINVAL; + goto err; + } + + ret = bch2_blacklist_table_initialize(c); + if (ret) { + bch_err(c, "error initializing blacklist table"); + goto err; + } + + if (!c->sb.clean || c->opts.fsck || c->opts.keep_journal) { + struct genradix_iter iter; + struct journal_replay **i; + + bch_verbose(c, "starting journal read"); + ret = bch2_journal_read(c, &last_seq, &blacklist_seq, &journal_seq); + if (ret) + goto err; + + /* + * note: cmd_list_journal needs the blacklist table fully up to date so + * it can asterisk ignored journal entries: + */ + if (c->opts.read_journal_only) + goto out; + + genradix_for_each_reverse(&c->journal_entries, iter, i) + if (*i && !(*i)->ignore) { + last_journal_entry = &(*i)->j; + break; + } + + if (mustfix_fsck_err_on(c->sb.clean && + last_journal_entry && + !journal_entry_empty(last_journal_entry), c, + clean_but_journal_not_empty, + "filesystem marked clean but journal not empty")) { + c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info); + SET_BCH_SB_CLEAN(c->disk_sb.sb, false); + c->sb.clean = false; + } + + if (!last_journal_entry) { + fsck_err_on(!c->sb.clean, c, + dirty_but_no_journal_entries, + "no journal entries found"); + if (clean) + goto use_clean; + + genradix_for_each_reverse(&c->journal_entries, iter, i) + if (*i) { + last_journal_entry = &(*i)->j; + (*i)->ignore = false; + /* + * This was probably a NO_FLUSH entry, + * so last_seq was garbage - but we know + * we're only using a single journal + * entry, set it here: + */ + (*i)->j.last_seq = (*i)->j.seq; + break; + } + } + + ret = bch2_journal_keys_sort(c); + if (ret) + goto err; + + if (c->sb.clean && last_journal_entry) { + ret = bch2_verify_superblock_clean(c, &clean, + last_journal_entry); + if (ret) + goto err; + } + } else { +use_clean: + if (!clean) { + bch_err(c, "no superblock clean section found"); + ret = -BCH_ERR_fsck_repair_impossible; + goto err; + + } + blacklist_seq = journal_seq = le64_to_cpu(clean->journal_seq) + 1; + } + + c->journal_replay_seq_start = last_seq; + c->journal_replay_seq_end = blacklist_seq - 1; + + if (c->opts.reconstruct_alloc) { + c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info); + drop_alloc_keys(&c->journal_keys); + } + + zero_out_btree_mem_ptr(&c->journal_keys); + + ret = journal_replay_early(c, clean); + if (ret) + goto err; + + /* + * After an unclean shutdown, skip then next few journal sequence + * numbers as they may have been referenced by btree writes that + * happened before their corresponding journal writes - those btree + * writes need to be ignored, by skipping and blacklisting the next few + * journal sequence numbers: + */ + if (!c->sb.clean) + journal_seq += 8; + + if (blacklist_seq != journal_seq) { + ret = bch2_journal_log_msg(c, "blacklisting entries %llu-%llu", + blacklist_seq, journal_seq) ?: + bch2_journal_seq_blacklist_add(c, + blacklist_seq, journal_seq); + if (ret) { + bch_err(c, "error creating new journal seq blacklist entry"); + goto err; + } + } + + ret = bch2_journal_log_msg(c, "starting journal at entry %llu, replaying %llu-%llu", + journal_seq, last_seq, blacklist_seq - 1) ?: + bch2_fs_journal_start(&c->journal, journal_seq); + if (ret) + goto err; + + if (c->opts.reconstruct_alloc) + bch2_journal_log_msg(c, "dropping alloc info"); + + /* + * Skip past versions that might have possibly been used (as nonces), + * but hadn't had their pointers written: + */ + if (c->sb.encryption_type && !c->sb.clean) + atomic64_add(1 << 16, &c->key_version); + + ret = read_btree_roots(c); + if (ret) + goto err; + + if (c->opts.fsck && + (IS_ENABLED(CONFIG_BCACHEFS_DEBUG) || + BCH_SB_HAS_TOPOLOGY_ERRORS(c->disk_sb.sb))) + c->recovery_passes_explicit |= BIT_ULL(BCH_RECOVERY_PASS_check_topology); + + ret = bch2_run_recovery_passes(c); + if (ret) + goto err; + + /* If we fixed errors, verify that fs is actually clean now: */ + if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG) && + test_bit(BCH_FS_ERRORS_FIXED, &c->flags) && + !test_bit(BCH_FS_ERRORS_NOT_FIXED, &c->flags) && + !test_bit(BCH_FS_ERROR, &c->flags)) { + bch_info(c, "Fixed errors, running fsck a second time to verify fs is clean"); + clear_bit(BCH_FS_ERRORS_FIXED, &c->flags); + + c->curr_recovery_pass = BCH_RECOVERY_PASS_check_alloc_info; + + ret = bch2_run_recovery_passes(c); + if (ret) + goto err; + + if (test_bit(BCH_FS_ERRORS_FIXED, &c->flags) || + test_bit(BCH_FS_ERRORS_NOT_FIXED, &c->flags)) { + bch_err(c, "Second fsck run was not clean"); + set_bit(BCH_FS_ERRORS_NOT_FIXED, &c->flags); + } + + set_bit(BCH_FS_ERRORS_FIXED, &c->flags); + } + + if (enabled_qtypes(c)) { + bch_verbose(c, "reading quotas"); + ret = bch2_fs_quota_read(c); + if (ret) + goto err; + bch_verbose(c, "quotas done"); + } + + mutex_lock(&c->sb_lock); + if (BCH_SB_VERSION_UPGRADE_COMPLETE(c->disk_sb.sb) != c->sb.version) { + SET_BCH_SB_VERSION_UPGRADE_COMPLETE(c->disk_sb.sb, c->sb.version); + write_sb = true; + } + + if (!test_bit(BCH_FS_ERROR, &c->flags)) { + c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_alloc_info); + write_sb = true; + } + + if (c->opts.fsck && + !test_bit(BCH_FS_ERROR, &c->flags) && + !test_bit(BCH_FS_ERRORS_NOT_FIXED, &c->flags)) { + SET_BCH_SB_HAS_ERRORS(c->disk_sb.sb, 0); + SET_BCH_SB_HAS_TOPOLOGY_ERRORS(c->disk_sb.sb, 0); + write_sb = true; + } + + if (write_sb) + bch2_write_super(c); + mutex_unlock(&c->sb_lock); + + if (!(c->sb.compat & (1ULL << BCH_COMPAT_extents_above_btree_updates_done)) || + c->sb.version_min < bcachefs_metadata_version_btree_ptr_sectors_written) { + struct bch_move_stats stats; + + bch2_move_stats_init(&stats, "recovery"); + + bch_info(c, "scanning for old btree nodes"); + ret = bch2_fs_read_write(c) ?: + bch2_scan_old_btree_nodes(c, &stats); + if (ret) + goto err; + bch_info(c, "scanning for old btree nodes done"); + } + + if (c->journal_seq_blacklist_table && + c->journal_seq_blacklist_table->nr > 128) + queue_work(system_long_wq, &c->journal_seq_blacklist_gc_work); + + ret = 0; +out: + set_bit(BCH_FS_FSCK_DONE, &c->flags); + bch2_flush_fsck_errs(c); + + if (!c->opts.keep_journal && + test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags)) + bch2_journal_keys_put_initial(c); + kfree(clean); + + if (!ret && test_bit(BCH_FS_NEED_DELETE_DEAD_SNAPSHOTS, &c->flags)) { + bch2_fs_read_write_early(c); + bch2_delete_dead_snapshots_async(c); + } + + if (ret) + bch_err_fn(c, ret); + return ret; +err: +fsck_err: + bch2_fs_emergency_read_only(c); + goto out; +} + +int bch2_fs_initialize(struct bch_fs *c) +{ + struct bch_inode_unpacked root_inode, lostfound_inode; + struct bkey_inode_buf packed_inode; + struct qstr lostfound = QSTR("lost+found"); + struct bch_dev *ca; + unsigned i; + int ret; + + bch_notice(c, "initializing new filesystem"); + + mutex_lock(&c->sb_lock); + c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_extents_above_btree_updates_done); + c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_bformat_overflow_done); + + bch2_sb_maybe_downgrade(c); + + if (c->opts.version_upgrade != BCH_VERSION_UPGRADE_none) { + bch2_sb_upgrade(c, bcachefs_metadata_version_current); + SET_BCH_SB_VERSION_UPGRADE_COMPLETE(c->disk_sb.sb, bcachefs_metadata_version_current); + bch2_write_super(c); + } + mutex_unlock(&c->sb_lock); + + c->curr_recovery_pass = ARRAY_SIZE(recovery_pass_fns); + set_bit(BCH_FS_MAY_GO_RW, &c->flags); + set_bit(BCH_FS_FSCK_DONE, &c->flags); + + for (i = 0; i < BTREE_ID_NR; i++) + bch2_btree_root_alloc(c, i); + + for_each_member_device(ca, c, i) + bch2_dev_usage_init(ca); + + ret = bch2_fs_journal_alloc(c); + if (ret) + goto err; + + /* + * journal_res_get() will crash if called before this has + * set up the journal.pin FIFO and journal.cur pointer: + */ + bch2_fs_journal_start(&c->journal, 1); + bch2_journal_set_replay_done(&c->journal); + + ret = bch2_fs_read_write_early(c); + if (ret) + goto err; + + /* + * Write out the superblock and journal buckets, now that we can do + * btree updates + */ + bch_verbose(c, "marking superblocks"); + ret = bch2_trans_mark_dev_sbs(c); + bch_err_msg(c, ret, "marking superblocks"); + if (ret) + goto err; + + for_each_online_member(ca, c, i) + ca->new_fs_bucket_idx = 0; + + ret = bch2_fs_freespace_init(c); + if (ret) + goto err; + + ret = bch2_initialize_subvolumes(c); + if (ret) + goto err; + + bch_verbose(c, "reading snapshots table"); + ret = bch2_snapshots_read(c); + if (ret) + goto err; + bch_verbose(c, "reading snapshots done"); + + bch2_inode_init(c, &root_inode, 0, 0, S_IFDIR|0755, 0, NULL); + root_inode.bi_inum = BCACHEFS_ROOT_INO; + root_inode.bi_subvol = BCACHEFS_ROOT_SUBVOL; + bch2_inode_pack(&packed_inode, &root_inode); + packed_inode.inode.k.p.snapshot = U32_MAX; + + ret = bch2_btree_insert(c, BTREE_ID_inodes, &packed_inode.inode.k_i, NULL, 0); + if (ret) { + bch_err_msg(c, ret, "creating root directory"); + goto err; + } + + bch2_inode_init_early(c, &lostfound_inode); + + ret = bch2_trans_do(c, NULL, NULL, 0, + bch2_create_trans(trans, + BCACHEFS_ROOT_SUBVOL_INUM, + &root_inode, &lostfound_inode, + &lostfound, + 0, 0, S_IFDIR|0700, 0, + NULL, NULL, (subvol_inum) { 0 }, 0)); + if (ret) { + bch_err_msg(c, ret, "creating lost+found"); + goto err; + } + + if (enabled_qtypes(c)) { + ret = bch2_fs_quota_read(c); + if (ret) + goto err; + } + + ret = bch2_journal_flush(&c->journal); + if (ret) { + bch_err_msg(c, ret, "writing first journal entry"); + goto err; + } + + mutex_lock(&c->sb_lock); + SET_BCH_SB_INITIALIZED(c->disk_sb.sb, true); + SET_BCH_SB_CLEAN(c->disk_sb.sb, false); + + bch2_write_super(c); + mutex_unlock(&c->sb_lock); + + return 0; +err: + bch_err_fn(ca, ret); + return ret; +} diff --git a/fs/bcachefs/recovery.h b/fs/bcachefs/recovery.h new file mode 100644 index 000000000000..d266aae90200 --- /dev/null +++ b/fs/bcachefs/recovery.h @@ -0,0 +1,36 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_RECOVERY_H +#define _BCACHEFS_RECOVERY_H + +extern const char * const bch2_recovery_passes[]; + +/* + * For when we need to rewind recovery passes and run a pass we skipped: + */ +static inline int bch2_run_explicit_recovery_pass(struct bch_fs *c, + enum bch_recovery_pass pass) +{ + if (c->recovery_passes_explicit & BIT_ULL(pass)) + return 0; + + bch_info(c, "running explicit recovery pass %s (%u), currently at %s (%u)", + bch2_recovery_passes[pass], pass, + bch2_recovery_passes[c->curr_recovery_pass], c->curr_recovery_pass); + + c->recovery_passes_explicit |= BIT_ULL(pass); + + if (c->curr_recovery_pass >= pass) { + c->curr_recovery_pass = pass; + c->recovery_passes_complete &= (1ULL << pass) >> 1; + return -BCH_ERR_restart_recovery; + } else { + return 0; + } +} + +u64 bch2_fsck_recovery_passes(void); + +int bch2_fs_recovery(struct bch_fs *); +int bch2_fs_initialize(struct bch_fs *); + +#endif /* _BCACHEFS_RECOVERY_H */ diff --git a/fs/bcachefs/recovery_types.h b/fs/bcachefs/recovery_types.h new file mode 100644 index 000000000000..515e3d62c2ac --- /dev/null +++ b/fs/bcachefs/recovery_types.h @@ -0,0 +1,53 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_RECOVERY_TYPES_H +#define _BCACHEFS_RECOVERY_TYPES_H + +#define PASS_SILENT BIT(0) +#define PASS_FSCK BIT(1) +#define PASS_UNCLEAN BIT(2) +#define PASS_ALWAYS BIT(3) + +#define BCH_RECOVERY_PASSES() \ + x(alloc_read, PASS_ALWAYS) \ + x(stripes_read, PASS_ALWAYS) \ + x(initialize_subvolumes, 0) \ + x(snapshots_read, PASS_ALWAYS) \ + x(check_topology, 0) \ + x(check_allocations, PASS_FSCK) \ + x(trans_mark_dev_sbs, PASS_ALWAYS|PASS_SILENT) \ + x(fs_journal_alloc, PASS_ALWAYS|PASS_SILENT) \ + x(set_may_go_rw, PASS_ALWAYS|PASS_SILENT) \ + x(journal_replay, PASS_ALWAYS) \ + x(check_alloc_info, PASS_FSCK) \ + x(check_lrus, PASS_FSCK) \ + x(check_btree_backpointers, PASS_FSCK) \ + x(check_backpointers_to_extents,PASS_FSCK) \ + x(check_extents_to_backpointers,PASS_FSCK) \ + x(check_alloc_to_lru_refs, PASS_FSCK) \ + x(fs_freespace_init, PASS_ALWAYS|PASS_SILENT) \ + x(bucket_gens_init, 0) \ + x(check_snapshot_trees, PASS_FSCK) \ + x(check_snapshots, PASS_FSCK) \ + x(check_subvols, PASS_FSCK) \ + x(delete_dead_snapshots, PASS_FSCK) \ + x(fs_upgrade_for_subvolumes, 0) \ + x(resume_logged_ops, PASS_ALWAYS) \ + x(check_inodes, PASS_FSCK) \ + x(check_extents, PASS_FSCK) \ + x(check_indirect_extents, PASS_FSCK) \ + x(check_dirents, PASS_FSCK) \ + x(check_xattrs, PASS_FSCK) \ + x(check_root, PASS_FSCK) \ + x(check_directory_structure, PASS_FSCK) \ + x(check_nlinks, PASS_FSCK) \ + x(delete_dead_inodes, PASS_FSCK|PASS_UNCLEAN) \ + x(fix_reflink_p, 0) \ + x(set_fs_needs_rebalance, 0) \ + +enum bch_recovery_pass { +#define x(n, when) BCH_RECOVERY_PASS_##n, + BCH_RECOVERY_PASSES() +#undef x +}; + +#endif /* _BCACHEFS_RECOVERY_TYPES_H */ diff --git a/fs/bcachefs/reflink.c b/fs/bcachefs/reflink.c new file mode 100644 index 000000000000..37d16e04e671 --- /dev/null +++ b/fs/bcachefs/reflink.c @@ -0,0 +1,414 @@ +// SPDX-License-Identifier: GPL-2.0 +#include "bcachefs.h" +#include "bkey_buf.h" +#include "btree_update.h" +#include "buckets.h" +#include "extents.h" +#include "inode.h" +#include "io_misc.h" +#include "io_write.h" +#include "rebalance.h" +#include "reflink.h" +#include "subvolume.h" +#include "super-io.h" + +#include <linux/sched/signal.h> + +static inline unsigned bkey_type_to_indirect(const struct bkey *k) +{ + switch (k->type) { + case KEY_TYPE_extent: + return KEY_TYPE_reflink_v; + case KEY_TYPE_inline_data: + return KEY_TYPE_indirect_inline_data; + default: + return 0; + } +} + +/* reflink pointers */ + +int bch2_reflink_p_invalid(struct bch_fs *c, struct bkey_s_c k, + enum bkey_invalid_flags flags, + struct printbuf *err) +{ + struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k); + + if (c->sb.version >= bcachefs_metadata_version_reflink_p_fix && + le64_to_cpu(p.v->idx) < le32_to_cpu(p.v->front_pad)) { + prt_printf(err, "idx < front_pad (%llu < %u)", + le64_to_cpu(p.v->idx), le32_to_cpu(p.v->front_pad)); + return -EINVAL; + } + + return 0; +} + +void bch2_reflink_p_to_text(struct printbuf *out, struct bch_fs *c, + struct bkey_s_c k) +{ + struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k); + + prt_printf(out, "idx %llu front_pad %u back_pad %u", + le64_to_cpu(p.v->idx), + le32_to_cpu(p.v->front_pad), + le32_to_cpu(p.v->back_pad)); +} + +bool bch2_reflink_p_merge(struct bch_fs *c, struct bkey_s _l, struct bkey_s_c _r) +{ + struct bkey_s_reflink_p l = bkey_s_to_reflink_p(_l); + struct bkey_s_c_reflink_p r = bkey_s_c_to_reflink_p(_r); + + /* + * Disabled for now, the triggers code needs to be reworked for merging + * of reflink pointers to work: + */ + return false; + + if (le64_to_cpu(l.v->idx) + l.k->size != le64_to_cpu(r.v->idx)) + return false; + + bch2_key_resize(l.k, l.k->size + r.k->size); + return true; +} + +/* indirect extents */ + +int bch2_reflink_v_invalid(struct bch_fs *c, struct bkey_s_c k, + enum bkey_invalid_flags flags, + struct printbuf *err) +{ + return bch2_bkey_ptrs_invalid(c, k, flags, err); +} + +void bch2_reflink_v_to_text(struct printbuf *out, struct bch_fs *c, + struct bkey_s_c k) +{ + struct bkey_s_c_reflink_v r = bkey_s_c_to_reflink_v(k); + + prt_printf(out, "refcount: %llu ", le64_to_cpu(r.v->refcount)); + + bch2_bkey_ptrs_to_text(out, c, k); +} + +#if 0 +Currently disabled, needs to be debugged: + +bool bch2_reflink_v_merge(struct bch_fs *c, struct bkey_s _l, struct bkey_s_c _r) +{ + struct bkey_s_reflink_v l = bkey_s_to_reflink_v(_l); + struct bkey_s_c_reflink_v r = bkey_s_c_to_reflink_v(_r); + + return l.v->refcount == r.v->refcount && bch2_extent_merge(c, _l, _r); +} +#endif + +static inline void check_indirect_extent_deleting(struct bkey_i *new, unsigned *flags) +{ + if ((*flags & BTREE_TRIGGER_INSERT) && !*bkey_refcount(new)) { + new->k.type = KEY_TYPE_deleted; + new->k.size = 0; + set_bkey_val_u64s(&new->k, 0);; + *flags &= ~BTREE_TRIGGER_INSERT; + } +} + +int bch2_trans_mark_reflink_v(struct btree_trans *trans, + enum btree_id btree_id, unsigned level, + struct bkey_s_c old, struct bkey_i *new, + unsigned flags) +{ + check_indirect_extent_deleting(new, &flags); + + if (old.k->type == KEY_TYPE_reflink_v && + new->k.type == KEY_TYPE_reflink_v && + old.k->u64s == new->k.u64s && + !memcmp(bkey_s_c_to_reflink_v(old).v->start, + bkey_i_to_reflink_v(new)->v.start, + bkey_val_bytes(&new->k) - 8)) + return 0; + + return bch2_trans_mark_extent(trans, btree_id, level, old, new, flags); +} + +/* indirect inline data */ + +int bch2_indirect_inline_data_invalid(struct bch_fs *c, struct bkey_s_c k, + enum bkey_invalid_flags flags, + struct printbuf *err) +{ + return 0; +} + +void bch2_indirect_inline_data_to_text(struct printbuf *out, + struct bch_fs *c, struct bkey_s_c k) +{ + struct bkey_s_c_indirect_inline_data d = bkey_s_c_to_indirect_inline_data(k); + unsigned datalen = bkey_inline_data_bytes(k.k); + + prt_printf(out, "refcount %llu datalen %u: %*phN", + le64_to_cpu(d.v->refcount), datalen, + min(datalen, 32U), d.v->data); +} + +int bch2_trans_mark_indirect_inline_data(struct btree_trans *trans, + enum btree_id btree_id, unsigned level, + struct bkey_s_c old, struct bkey_i *new, + unsigned flags) +{ + check_indirect_extent_deleting(new, &flags); + + return 0; +} + +static int bch2_make_extent_indirect(struct btree_trans *trans, + struct btree_iter *extent_iter, + struct bkey_i *orig) +{ + struct bch_fs *c = trans->c; + struct btree_iter reflink_iter = { NULL }; + struct bkey_s_c k; + struct bkey_i *r_v; + struct bkey_i_reflink_p *r_p; + __le64 *refcount; + int ret; + + if (orig->k.type == KEY_TYPE_inline_data) + bch2_check_set_feature(c, BCH_FEATURE_reflink_inline_data); + + bch2_trans_iter_init(trans, &reflink_iter, BTREE_ID_reflink, POS_MAX, + BTREE_ITER_INTENT); + k = bch2_btree_iter_peek_prev(&reflink_iter); + ret = bkey_err(k); + if (ret) + goto err; + + r_v = bch2_trans_kmalloc(trans, sizeof(__le64) + bkey_bytes(&orig->k)); + ret = PTR_ERR_OR_ZERO(r_v); + if (ret) + goto err; + + bkey_init(&r_v->k); + r_v->k.type = bkey_type_to_indirect(&orig->k); + r_v->k.p = reflink_iter.pos; + bch2_key_resize(&r_v->k, orig->k.size); + r_v->k.version = orig->k.version; + + set_bkey_val_bytes(&r_v->k, sizeof(__le64) + bkey_val_bytes(&orig->k)); + + refcount = bkey_refcount(r_v); + *refcount = 0; + memcpy(refcount + 1, &orig->v, bkey_val_bytes(&orig->k)); + + ret = bch2_trans_update(trans, &reflink_iter, r_v, 0); + if (ret) + goto err; + + /* + * orig is in a bkey_buf which statically allocates 5 64s for the val, + * so we know it will be big enough: + */ + orig->k.type = KEY_TYPE_reflink_p; + r_p = bkey_i_to_reflink_p(orig); + set_bkey_val_bytes(&r_p->k, sizeof(r_p->v)); + + /* FORTIFY_SOURCE is broken here, and doesn't provide unsafe_memset() */ +#if !defined(__NO_FORTIFY) && defined(__OPTIMIZE__) && defined(CONFIG_FORTIFY_SOURCE) + __underlying_memset(&r_p->v, 0, sizeof(r_p->v)); +#else + memset(&r_p->v, 0, sizeof(r_p->v)); +#endif + + r_p->v.idx = cpu_to_le64(bkey_start_offset(&r_v->k)); + + ret = bch2_trans_update(trans, extent_iter, &r_p->k_i, + BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); +err: + bch2_trans_iter_exit(trans, &reflink_iter); + + return ret; +} + +static struct bkey_s_c get_next_src(struct btree_iter *iter, struct bpos end) +{ + struct bkey_s_c k; + int ret; + + for_each_btree_key_upto_continue_norestart(*iter, end, 0, k, ret) { + if (bkey_extent_is_unwritten(k)) + continue; + + if (bkey_extent_is_data(k.k)) + return k; + } + + if (bkey_ge(iter->pos, end)) + bch2_btree_iter_set_pos(iter, end); + return ret ? bkey_s_c_err(ret) : bkey_s_c_null; +} + +s64 bch2_remap_range(struct bch_fs *c, + subvol_inum dst_inum, u64 dst_offset, + subvol_inum src_inum, u64 src_offset, + u64 remap_sectors, + u64 new_i_size, s64 *i_sectors_delta) +{ + struct btree_trans *trans; + struct btree_iter dst_iter, src_iter; + struct bkey_s_c src_k; + struct bkey_buf new_dst, new_src; + struct bpos dst_start = POS(dst_inum.inum, dst_offset); + struct bpos src_start = POS(src_inum.inum, src_offset); + struct bpos dst_end = dst_start, src_end = src_start; + struct bch_io_opts opts; + struct bpos src_want; + u64 dst_done = 0; + u32 dst_snapshot, src_snapshot; + int ret = 0, ret2 = 0; + + if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_reflink)) + return -BCH_ERR_erofs_no_writes; + + bch2_check_set_feature(c, BCH_FEATURE_reflink); + + dst_end.offset += remap_sectors; + src_end.offset += remap_sectors; + + bch2_bkey_buf_init(&new_dst); + bch2_bkey_buf_init(&new_src); + trans = bch2_trans_get(c); + + ret = bch2_inum_opts_get(trans, src_inum, &opts); + if (ret) + goto err; + + bch2_trans_iter_init(trans, &src_iter, BTREE_ID_extents, src_start, + BTREE_ITER_INTENT); + bch2_trans_iter_init(trans, &dst_iter, BTREE_ID_extents, dst_start, + BTREE_ITER_INTENT); + + while ((ret == 0 || + bch2_err_matches(ret, BCH_ERR_transaction_restart)) && + bkey_lt(dst_iter.pos, dst_end)) { + struct disk_reservation disk_res = { 0 }; + + bch2_trans_begin(trans); + + if (fatal_signal_pending(current)) { + ret = -EINTR; + break; + } + + ret = bch2_subvolume_get_snapshot(trans, src_inum.subvol, + &src_snapshot); + if (ret) + continue; + + bch2_btree_iter_set_snapshot(&src_iter, src_snapshot); + + ret = bch2_subvolume_get_snapshot(trans, dst_inum.subvol, + &dst_snapshot); + if (ret) + continue; + + bch2_btree_iter_set_snapshot(&dst_iter, dst_snapshot); + + dst_done = dst_iter.pos.offset - dst_start.offset; + src_want = POS(src_start.inode, src_start.offset + dst_done); + bch2_btree_iter_set_pos(&src_iter, src_want); + + src_k = get_next_src(&src_iter, src_end); + ret = bkey_err(src_k); + if (ret) + continue; + + if (bkey_lt(src_want, src_iter.pos)) { + ret = bch2_fpunch_at(trans, &dst_iter, dst_inum, + min(dst_end.offset, + dst_iter.pos.offset + + src_iter.pos.offset - src_want.offset), + i_sectors_delta); + continue; + } + + if (src_k.k->type != KEY_TYPE_reflink_p) { + bch2_btree_iter_set_pos_to_extent_start(&src_iter); + + bch2_bkey_buf_reassemble(&new_src, c, src_k); + src_k = bkey_i_to_s_c(new_src.k); + + ret = bch2_make_extent_indirect(trans, &src_iter, + new_src.k); + if (ret) + continue; + + BUG_ON(src_k.k->type != KEY_TYPE_reflink_p); + } + + if (src_k.k->type == KEY_TYPE_reflink_p) { + struct bkey_s_c_reflink_p src_p = + bkey_s_c_to_reflink_p(src_k); + struct bkey_i_reflink_p *dst_p = + bkey_reflink_p_init(new_dst.k); + + u64 offset = le64_to_cpu(src_p.v->idx) + + (src_want.offset - + bkey_start_offset(src_k.k)); + + dst_p->v.idx = cpu_to_le64(offset); + } else { + BUG(); + } + + new_dst.k->k.p = dst_iter.pos; + bch2_key_resize(&new_dst.k->k, + min(src_k.k->p.offset - src_want.offset, + dst_end.offset - dst_iter.pos.offset)); + + ret = bch2_bkey_set_needs_rebalance(c, new_dst.k, + opts.background_target, + opts.background_compression) ?: + bch2_extent_update(trans, dst_inum, &dst_iter, + new_dst.k, &disk_res, + new_i_size, i_sectors_delta, + true); + bch2_disk_reservation_put(c, &disk_res); + } + bch2_trans_iter_exit(trans, &dst_iter); + bch2_trans_iter_exit(trans, &src_iter); + + BUG_ON(!ret && !bkey_eq(dst_iter.pos, dst_end)); + BUG_ON(bkey_gt(dst_iter.pos, dst_end)); + + dst_done = dst_iter.pos.offset - dst_start.offset; + new_i_size = min(dst_iter.pos.offset << 9, new_i_size); + + do { + struct bch_inode_unpacked inode_u; + struct btree_iter inode_iter = { NULL }; + + bch2_trans_begin(trans); + + ret2 = bch2_inode_peek(trans, &inode_iter, &inode_u, + dst_inum, BTREE_ITER_INTENT); + + if (!ret2 && + inode_u.bi_size < new_i_size) { + inode_u.bi_size = new_i_size; + ret2 = bch2_inode_write(trans, &inode_iter, &inode_u) ?: + bch2_trans_commit(trans, NULL, NULL, + BTREE_INSERT_NOFAIL); + } + + bch2_trans_iter_exit(trans, &inode_iter); + } while (bch2_err_matches(ret2, BCH_ERR_transaction_restart)); +err: + bch2_trans_put(trans); + bch2_bkey_buf_exit(&new_src, c); + bch2_bkey_buf_exit(&new_dst, c); + + bch2_write_ref_put(c, BCH_WRITE_REF_reflink); + + return dst_done ?: ret ?: ret2; +} diff --git a/fs/bcachefs/reflink.h b/fs/bcachefs/reflink.h new file mode 100644 index 000000000000..8ccf3f9c4939 --- /dev/null +++ b/fs/bcachefs/reflink.h @@ -0,0 +1,81 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_REFLINK_H +#define _BCACHEFS_REFLINK_H + +enum bkey_invalid_flags; + +int bch2_reflink_p_invalid(struct bch_fs *, struct bkey_s_c, + enum bkey_invalid_flags, struct printbuf *); +void bch2_reflink_p_to_text(struct printbuf *, struct bch_fs *, + struct bkey_s_c); +bool bch2_reflink_p_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c); + +#define bch2_bkey_ops_reflink_p ((struct bkey_ops) { \ + .key_invalid = bch2_reflink_p_invalid, \ + .val_to_text = bch2_reflink_p_to_text, \ + .key_merge = bch2_reflink_p_merge, \ + .trans_trigger = bch2_trans_mark_reflink_p, \ + .atomic_trigger = bch2_mark_reflink_p, \ + .min_val_size = 16, \ +}) + +int bch2_reflink_v_invalid(struct bch_fs *, struct bkey_s_c, + enum bkey_invalid_flags, struct printbuf *); +void bch2_reflink_v_to_text(struct printbuf *, struct bch_fs *, + struct bkey_s_c); +int bch2_trans_mark_reflink_v(struct btree_trans *, enum btree_id, unsigned, + struct bkey_s_c, struct bkey_i *, unsigned); + +#define bch2_bkey_ops_reflink_v ((struct bkey_ops) { \ + .key_invalid = bch2_reflink_v_invalid, \ + .val_to_text = bch2_reflink_v_to_text, \ + .swab = bch2_ptr_swab, \ + .trans_trigger = bch2_trans_mark_reflink_v, \ + .atomic_trigger = bch2_mark_extent, \ + .min_val_size = 8, \ +}) + +int bch2_indirect_inline_data_invalid(struct bch_fs *, struct bkey_s_c, + enum bkey_invalid_flags, struct printbuf *); +void bch2_indirect_inline_data_to_text(struct printbuf *, + struct bch_fs *, struct bkey_s_c); +int bch2_trans_mark_indirect_inline_data(struct btree_trans *, + enum btree_id, unsigned, + struct bkey_s_c, struct bkey_i *, + unsigned); + +#define bch2_bkey_ops_indirect_inline_data ((struct bkey_ops) { \ + .key_invalid = bch2_indirect_inline_data_invalid, \ + .val_to_text = bch2_indirect_inline_data_to_text, \ + .trans_trigger = bch2_trans_mark_indirect_inline_data, \ + .min_val_size = 8, \ +}) + +static inline const __le64 *bkey_refcount_c(struct bkey_s_c k) +{ + switch (k.k->type) { + case KEY_TYPE_reflink_v: + return &bkey_s_c_to_reflink_v(k).v->refcount; + case KEY_TYPE_indirect_inline_data: + return &bkey_s_c_to_indirect_inline_data(k).v->refcount; + default: + return NULL; + } +} + +static inline __le64 *bkey_refcount(struct bkey_i *k) +{ + switch (k->k.type) { + case KEY_TYPE_reflink_v: + return &bkey_i_to_reflink_v(k)->v.refcount; + case KEY_TYPE_indirect_inline_data: + return &bkey_i_to_indirect_inline_data(k)->v.refcount; + default: + return NULL; + } +} + +s64 bch2_remap_range(struct bch_fs *, subvol_inum, u64, + subvol_inum, u64, u64, u64, s64 *); + +#endif /* _BCACHEFS_REFLINK_H */ diff --git a/fs/bcachefs/replicas.c b/fs/bcachefs/replicas.c new file mode 100644 index 000000000000..2008fe8bf706 --- /dev/null +++ b/fs/bcachefs/replicas.c @@ -0,0 +1,1059 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" +#include "buckets.h" +#include "journal.h" +#include "replicas.h" +#include "super-io.h" + +static int bch2_cpu_replicas_to_sb_replicas(struct bch_fs *, + struct bch_replicas_cpu *); + +/* Replicas tracking - in memory: */ + +static void verify_replicas_entry(struct bch_replicas_entry *e) +{ +#ifdef CONFIG_BCACHEFS_DEBUG + unsigned i; + + BUG_ON(e->data_type >= BCH_DATA_NR); + BUG_ON(!e->nr_devs); + BUG_ON(e->nr_required > 1 && + e->nr_required >= e->nr_devs); + + for (i = 0; i + 1 < e->nr_devs; i++) + BUG_ON(e->devs[i] >= e->devs[i + 1]); +#endif +} + +void bch2_replicas_entry_sort(struct bch_replicas_entry *e) +{ + bubble_sort(e->devs, e->nr_devs, u8_cmp); +} + +static void bch2_cpu_replicas_sort(struct bch_replicas_cpu *r) +{ + eytzinger0_sort(r->entries, r->nr, r->entry_size, memcmp, NULL); +} + +static void bch2_replicas_entry_v0_to_text(struct printbuf *out, + struct bch_replicas_entry_v0 *e) +{ + unsigned i; + + if (e->data_type < BCH_DATA_NR) + prt_printf(out, "%s", bch2_data_types[e->data_type]); + else + prt_printf(out, "(invalid data type %u)", e->data_type); + + prt_printf(out, ": %u [", e->nr_devs); + for (i = 0; i < e->nr_devs; i++) + prt_printf(out, i ? " %u" : "%u", e->devs[i]); + prt_printf(out, "]"); +} + +void bch2_replicas_entry_to_text(struct printbuf *out, + struct bch_replicas_entry *e) +{ + unsigned i; + + if (e->data_type < BCH_DATA_NR) + prt_printf(out, "%s", bch2_data_types[e->data_type]); + else + prt_printf(out, "(invalid data type %u)", e->data_type); + + prt_printf(out, ": %u/%u [", e->nr_required, e->nr_devs); + for (i = 0; i < e->nr_devs; i++) + prt_printf(out, i ? " %u" : "%u", e->devs[i]); + prt_printf(out, "]"); +} + +int bch2_replicas_entry_validate(struct bch_replicas_entry *r, + struct bch_sb *sb, + struct printbuf *err) +{ + if (!r->nr_devs) { + prt_printf(err, "no devices in entry "); + goto bad; + } + + if (r->nr_required > 1 && + r->nr_required >= r->nr_devs) { + prt_printf(err, "bad nr_required in entry "); + goto bad; + } + + for (unsigned i = 0; i < r->nr_devs; i++) + if (!bch2_dev_exists(sb, r->devs[i])) { + prt_printf(err, "invalid device %u in entry ", r->devs[i]); + goto bad; + } + + return 0; +bad: + bch2_replicas_entry_to_text(err, r); + return -BCH_ERR_invalid_replicas_entry; +} + +void bch2_cpu_replicas_to_text(struct printbuf *out, + struct bch_replicas_cpu *r) +{ + struct bch_replicas_entry *e; + bool first = true; + + for_each_cpu_replicas_entry(r, e) { + if (!first) + prt_printf(out, " "); + first = false; + + bch2_replicas_entry_to_text(out, e); + } +} + +static void extent_to_replicas(struct bkey_s_c k, + struct bch_replicas_entry *r) +{ + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + const union bch_extent_entry *entry; + struct extent_ptr_decoded p; + + r->nr_required = 1; + + bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { + if (p.ptr.cached) + continue; + + if (!p.has_ec) + r->devs[r->nr_devs++] = p.ptr.dev; + else + r->nr_required = 0; + } +} + +static void stripe_to_replicas(struct bkey_s_c k, + struct bch_replicas_entry *r) +{ + struct bkey_s_c_stripe s = bkey_s_c_to_stripe(k); + const struct bch_extent_ptr *ptr; + + r->nr_required = s.v->nr_blocks - s.v->nr_redundant; + + for (ptr = s.v->ptrs; + ptr < s.v->ptrs + s.v->nr_blocks; + ptr++) + r->devs[r->nr_devs++] = ptr->dev; +} + +void bch2_bkey_to_replicas(struct bch_replicas_entry *e, + struct bkey_s_c k) +{ + e->nr_devs = 0; + + switch (k.k->type) { + case KEY_TYPE_btree_ptr: + case KEY_TYPE_btree_ptr_v2: + e->data_type = BCH_DATA_btree; + extent_to_replicas(k, e); + break; + case KEY_TYPE_extent: + case KEY_TYPE_reflink_v: + e->data_type = BCH_DATA_user; + extent_to_replicas(k, e); + break; + case KEY_TYPE_stripe: + e->data_type = BCH_DATA_parity; + stripe_to_replicas(k, e); + break; + } + + bch2_replicas_entry_sort(e); +} + +void bch2_devlist_to_replicas(struct bch_replicas_entry *e, + enum bch_data_type data_type, + struct bch_devs_list devs) +{ + unsigned i; + + BUG_ON(!data_type || + data_type == BCH_DATA_sb || + data_type >= BCH_DATA_NR); + + e->data_type = data_type; + e->nr_devs = 0; + e->nr_required = 1; + + for (i = 0; i < devs.nr; i++) + e->devs[e->nr_devs++] = devs.devs[i]; + + bch2_replicas_entry_sort(e); +} + +static struct bch_replicas_cpu +cpu_replicas_add_entry(struct bch_fs *c, + struct bch_replicas_cpu *old, + struct bch_replicas_entry *new_entry) +{ + unsigned i; + struct bch_replicas_cpu new = { + .nr = old->nr + 1, + .entry_size = max_t(unsigned, old->entry_size, + replicas_entry_bytes(new_entry)), + }; + + for (i = 0; i < new_entry->nr_devs; i++) + BUG_ON(!bch2_dev_exists2(c, new_entry->devs[i])); + + BUG_ON(!new_entry->data_type); + verify_replicas_entry(new_entry); + + new.entries = kcalloc(new.nr, new.entry_size, GFP_KERNEL); + if (!new.entries) + return new; + + for (i = 0; i < old->nr; i++) + memcpy(cpu_replicas_entry(&new, i), + cpu_replicas_entry(old, i), + old->entry_size); + + memcpy(cpu_replicas_entry(&new, old->nr), + new_entry, + replicas_entry_bytes(new_entry)); + + bch2_cpu_replicas_sort(&new); + return new; +} + +static inline int __replicas_entry_idx(struct bch_replicas_cpu *r, + struct bch_replicas_entry *search) +{ + int idx, entry_size = replicas_entry_bytes(search); + + if (unlikely(entry_size > r->entry_size)) + return -1; + + verify_replicas_entry(search); + +#define entry_cmp(_l, _r, size) memcmp(_l, _r, entry_size) + idx = eytzinger0_find(r->entries, r->nr, r->entry_size, + entry_cmp, search); +#undef entry_cmp + + return idx < r->nr ? idx : -1; +} + +int bch2_replicas_entry_idx(struct bch_fs *c, + struct bch_replicas_entry *search) +{ + bch2_replicas_entry_sort(search); + + return __replicas_entry_idx(&c->replicas, search); +} + +static bool __replicas_has_entry(struct bch_replicas_cpu *r, + struct bch_replicas_entry *search) +{ + return __replicas_entry_idx(r, search) >= 0; +} + +bool bch2_replicas_marked(struct bch_fs *c, + struct bch_replicas_entry *search) +{ + bool marked; + + if (!search->nr_devs) + return true; + + verify_replicas_entry(search); + + percpu_down_read(&c->mark_lock); + marked = __replicas_has_entry(&c->replicas, search) && + (likely((!c->replicas_gc.entries)) || + __replicas_has_entry(&c->replicas_gc, search)); + percpu_up_read(&c->mark_lock); + + return marked; +} + +static void __replicas_table_update(struct bch_fs_usage *dst, + struct bch_replicas_cpu *dst_r, + struct bch_fs_usage *src, + struct bch_replicas_cpu *src_r) +{ + int src_idx, dst_idx; + + *dst = *src; + + for (src_idx = 0; src_idx < src_r->nr; src_idx++) { + if (!src->replicas[src_idx]) + continue; + + dst_idx = __replicas_entry_idx(dst_r, + cpu_replicas_entry(src_r, src_idx)); + BUG_ON(dst_idx < 0); + + dst->replicas[dst_idx] = src->replicas[src_idx]; + } +} + +static void __replicas_table_update_pcpu(struct bch_fs_usage __percpu *dst_p, + struct bch_replicas_cpu *dst_r, + struct bch_fs_usage __percpu *src_p, + struct bch_replicas_cpu *src_r) +{ + unsigned src_nr = sizeof(struct bch_fs_usage) / sizeof(u64) + src_r->nr; + struct bch_fs_usage *dst, *src = (void *) + bch2_acc_percpu_u64s((u64 __percpu *) src_p, src_nr); + + preempt_disable(); + dst = this_cpu_ptr(dst_p); + preempt_enable(); + + __replicas_table_update(dst, dst_r, src, src_r); +} + +/* + * Resize filesystem accounting: + */ +static int replicas_table_update(struct bch_fs *c, + struct bch_replicas_cpu *new_r) +{ + struct bch_fs_usage __percpu *new_usage[JOURNAL_BUF_NR]; + struct bch_fs_usage_online *new_scratch = NULL; + struct bch_fs_usage __percpu *new_gc = NULL; + struct bch_fs_usage *new_base = NULL; + unsigned i, bytes = sizeof(struct bch_fs_usage) + + sizeof(u64) * new_r->nr; + unsigned scratch_bytes = sizeof(struct bch_fs_usage_online) + + sizeof(u64) * new_r->nr; + int ret = 0; + + memset(new_usage, 0, sizeof(new_usage)); + + for (i = 0; i < ARRAY_SIZE(new_usage); i++) + if (!(new_usage[i] = __alloc_percpu_gfp(bytes, + sizeof(u64), GFP_KERNEL))) + goto err; + + if (!(new_base = kzalloc(bytes, GFP_KERNEL)) || + !(new_scratch = kmalloc(scratch_bytes, GFP_KERNEL)) || + (c->usage_gc && + !(new_gc = __alloc_percpu_gfp(bytes, sizeof(u64), GFP_KERNEL)))) + goto err; + + for (i = 0; i < ARRAY_SIZE(new_usage); i++) + if (c->usage[i]) + __replicas_table_update_pcpu(new_usage[i], new_r, + c->usage[i], &c->replicas); + if (c->usage_base) + __replicas_table_update(new_base, new_r, + c->usage_base, &c->replicas); + if (c->usage_gc) + __replicas_table_update_pcpu(new_gc, new_r, + c->usage_gc, &c->replicas); + + for (i = 0; i < ARRAY_SIZE(new_usage); i++) + swap(c->usage[i], new_usage[i]); + swap(c->usage_base, new_base); + swap(c->usage_scratch, new_scratch); + swap(c->usage_gc, new_gc); + swap(c->replicas, *new_r); +out: + free_percpu(new_gc); + kfree(new_scratch); + for (i = 0; i < ARRAY_SIZE(new_usage); i++) + free_percpu(new_usage[i]); + kfree(new_base); + return ret; +err: + bch_err(c, "error updating replicas table: memory allocation failure"); + ret = -BCH_ERR_ENOMEM_replicas_table; + goto out; +} + +static unsigned reserve_journal_replicas(struct bch_fs *c, + struct bch_replicas_cpu *r) +{ + struct bch_replicas_entry *e; + unsigned journal_res_u64s = 0; + + /* nr_inodes: */ + journal_res_u64s += + DIV_ROUND_UP(sizeof(struct jset_entry_usage), sizeof(u64)); + + /* key_version: */ + journal_res_u64s += + DIV_ROUND_UP(sizeof(struct jset_entry_usage), sizeof(u64)); + + /* persistent_reserved: */ + journal_res_u64s += + DIV_ROUND_UP(sizeof(struct jset_entry_usage), sizeof(u64)) * + BCH_REPLICAS_MAX; + + for_each_cpu_replicas_entry(r, e) + journal_res_u64s += + DIV_ROUND_UP(sizeof(struct jset_entry_data_usage) + + e->nr_devs, sizeof(u64)); + return journal_res_u64s; +} + +noinline +static int bch2_mark_replicas_slowpath(struct bch_fs *c, + struct bch_replicas_entry *new_entry) +{ + struct bch_replicas_cpu new_r, new_gc; + int ret = 0; + + verify_replicas_entry(new_entry); + + memset(&new_r, 0, sizeof(new_r)); + memset(&new_gc, 0, sizeof(new_gc)); + + mutex_lock(&c->sb_lock); + + if (c->replicas_gc.entries && + !__replicas_has_entry(&c->replicas_gc, new_entry)) { + new_gc = cpu_replicas_add_entry(c, &c->replicas_gc, new_entry); + if (!new_gc.entries) { + ret = -BCH_ERR_ENOMEM_cpu_replicas; + goto err; + } + } + + if (!__replicas_has_entry(&c->replicas, new_entry)) { + new_r = cpu_replicas_add_entry(c, &c->replicas, new_entry); + if (!new_r.entries) { + ret = -BCH_ERR_ENOMEM_cpu_replicas; + goto err; + } + + ret = bch2_cpu_replicas_to_sb_replicas(c, &new_r); + if (ret) + goto err; + + bch2_journal_entry_res_resize(&c->journal, + &c->replicas_journal_res, + reserve_journal_replicas(c, &new_r)); + } + + if (!new_r.entries && + !new_gc.entries) + goto out; + + /* allocations done, now commit: */ + + if (new_r.entries) + bch2_write_super(c); + + /* don't update in memory replicas until changes are persistent */ + percpu_down_write(&c->mark_lock); + if (new_r.entries) + ret = replicas_table_update(c, &new_r); + if (new_gc.entries) + swap(new_gc, c->replicas_gc); + percpu_up_write(&c->mark_lock); +out: + mutex_unlock(&c->sb_lock); + + kfree(new_r.entries); + kfree(new_gc.entries); + + return ret; +err: + bch_err_msg(c, ret, "adding replicas entry"); + goto out; +} + +int bch2_mark_replicas(struct bch_fs *c, struct bch_replicas_entry *r) +{ + return likely(bch2_replicas_marked(c, r)) + ? 0 : bch2_mark_replicas_slowpath(c, r); +} + +/* replicas delta list: */ + +int bch2_replicas_delta_list_mark(struct bch_fs *c, + struct replicas_delta_list *r) +{ + struct replicas_delta *d = r->d; + struct replicas_delta *top = (void *) r->d + r->used; + int ret = 0; + + for (d = r->d; !ret && d != top; d = replicas_delta_next(d)) + ret = bch2_mark_replicas(c, &d->r); + return ret; +} + +/* + * Old replicas_gc mechanism: only used for journal replicas entries now, should + * die at some point: + */ + +int bch2_replicas_gc_end(struct bch_fs *c, int ret) +{ + lockdep_assert_held(&c->replicas_gc_lock); + + mutex_lock(&c->sb_lock); + percpu_down_write(&c->mark_lock); + + ret = ret ?: + bch2_cpu_replicas_to_sb_replicas(c, &c->replicas_gc) ?: + replicas_table_update(c, &c->replicas_gc); + + kfree(c->replicas_gc.entries); + c->replicas_gc.entries = NULL; + + percpu_up_write(&c->mark_lock); + + if (!ret) + bch2_write_super(c); + + mutex_unlock(&c->sb_lock); + + return ret; +} + +int bch2_replicas_gc_start(struct bch_fs *c, unsigned typemask) +{ + struct bch_replicas_entry *e; + unsigned i = 0; + + lockdep_assert_held(&c->replicas_gc_lock); + + mutex_lock(&c->sb_lock); + BUG_ON(c->replicas_gc.entries); + + c->replicas_gc.nr = 0; + c->replicas_gc.entry_size = 0; + + for_each_cpu_replicas_entry(&c->replicas, e) + if (!((1 << e->data_type) & typemask)) { + c->replicas_gc.nr++; + c->replicas_gc.entry_size = + max_t(unsigned, c->replicas_gc.entry_size, + replicas_entry_bytes(e)); + } + + c->replicas_gc.entries = kcalloc(c->replicas_gc.nr, + c->replicas_gc.entry_size, + GFP_KERNEL); + if (!c->replicas_gc.entries) { + mutex_unlock(&c->sb_lock); + bch_err(c, "error allocating c->replicas_gc"); + return -BCH_ERR_ENOMEM_replicas_gc; + } + + for_each_cpu_replicas_entry(&c->replicas, e) + if (!((1 << e->data_type) & typemask)) + memcpy(cpu_replicas_entry(&c->replicas_gc, i++), + e, c->replicas_gc.entry_size); + + bch2_cpu_replicas_sort(&c->replicas_gc); + mutex_unlock(&c->sb_lock); + + return 0; +} + +/* + * New much simpler mechanism for clearing out unneeded replicas entries - drop + * replicas entries that have 0 sectors used. + * + * However, we don't track sector counts for journal usage, so this doesn't drop + * any BCH_DATA_journal entries; the old bch2_replicas_gc_(start|end) mechanism + * is retained for that. + */ +int bch2_replicas_gc2(struct bch_fs *c) +{ + struct bch_replicas_cpu new = { 0 }; + unsigned i, nr; + int ret = 0; + + bch2_journal_meta(&c->journal); +retry: + nr = READ_ONCE(c->replicas.nr); + new.entry_size = READ_ONCE(c->replicas.entry_size); + new.entries = kcalloc(nr, new.entry_size, GFP_KERNEL); + if (!new.entries) { + bch_err(c, "error allocating c->replicas_gc"); + return -BCH_ERR_ENOMEM_replicas_gc; + } + + mutex_lock(&c->sb_lock); + percpu_down_write(&c->mark_lock); + + if (nr != c->replicas.nr || + new.entry_size != c->replicas.entry_size) { + percpu_up_write(&c->mark_lock); + mutex_unlock(&c->sb_lock); + kfree(new.entries); + goto retry; + } + + for (i = 0; i < c->replicas.nr; i++) { + struct bch_replicas_entry *e = + cpu_replicas_entry(&c->replicas, i); + + if (e->data_type == BCH_DATA_journal || + c->usage_base->replicas[i] || + percpu_u64_get(&c->usage[0]->replicas[i]) || + percpu_u64_get(&c->usage[1]->replicas[i]) || + percpu_u64_get(&c->usage[2]->replicas[i]) || + percpu_u64_get(&c->usage[3]->replicas[i])) + memcpy(cpu_replicas_entry(&new, new.nr++), + e, new.entry_size); + } + + bch2_cpu_replicas_sort(&new); + + ret = bch2_cpu_replicas_to_sb_replicas(c, &new) ?: + replicas_table_update(c, &new); + + kfree(new.entries); + + percpu_up_write(&c->mark_lock); + + if (!ret) + bch2_write_super(c); + + mutex_unlock(&c->sb_lock); + + return ret; +} + +int bch2_replicas_set_usage(struct bch_fs *c, + struct bch_replicas_entry *r, + u64 sectors) +{ + int ret, idx = bch2_replicas_entry_idx(c, r); + + if (idx < 0) { + struct bch_replicas_cpu n; + + n = cpu_replicas_add_entry(c, &c->replicas, r); + if (!n.entries) + return -BCH_ERR_ENOMEM_cpu_replicas; + + ret = replicas_table_update(c, &n); + if (ret) + return ret; + + kfree(n.entries); + + idx = bch2_replicas_entry_idx(c, r); + BUG_ON(ret < 0); + } + + c->usage_base->replicas[idx] = sectors; + + return 0; +} + +/* Replicas tracking - superblock: */ + +static int +__bch2_sb_replicas_to_cpu_replicas(struct bch_sb_field_replicas *sb_r, + struct bch_replicas_cpu *cpu_r) +{ + struct bch_replicas_entry *e, *dst; + unsigned nr = 0, entry_size = 0, idx = 0; + + for_each_replicas_entry(sb_r, e) { + entry_size = max_t(unsigned, entry_size, + replicas_entry_bytes(e)); + nr++; + } + + cpu_r->entries = kcalloc(nr, entry_size, GFP_KERNEL); + if (!cpu_r->entries) + return -BCH_ERR_ENOMEM_cpu_replicas; + + cpu_r->nr = nr; + cpu_r->entry_size = entry_size; + + for_each_replicas_entry(sb_r, e) { + dst = cpu_replicas_entry(cpu_r, idx++); + memcpy(dst, e, replicas_entry_bytes(e)); + bch2_replicas_entry_sort(dst); + } + + return 0; +} + +static int +__bch2_sb_replicas_v0_to_cpu_replicas(struct bch_sb_field_replicas_v0 *sb_r, + struct bch_replicas_cpu *cpu_r) +{ + struct bch_replicas_entry_v0 *e; + unsigned nr = 0, entry_size = 0, idx = 0; + + for_each_replicas_entry(sb_r, e) { + entry_size = max_t(unsigned, entry_size, + replicas_entry_bytes(e)); + nr++; + } + + entry_size += sizeof(struct bch_replicas_entry) - + sizeof(struct bch_replicas_entry_v0); + + cpu_r->entries = kcalloc(nr, entry_size, GFP_KERNEL); + if (!cpu_r->entries) + return -BCH_ERR_ENOMEM_cpu_replicas; + + cpu_r->nr = nr; + cpu_r->entry_size = entry_size; + + for_each_replicas_entry(sb_r, e) { + struct bch_replicas_entry *dst = + cpu_replicas_entry(cpu_r, idx++); + + dst->data_type = e->data_type; + dst->nr_devs = e->nr_devs; + dst->nr_required = 1; + memcpy(dst->devs, e->devs, e->nr_devs); + bch2_replicas_entry_sort(dst); + } + + return 0; +} + +int bch2_sb_replicas_to_cpu_replicas(struct bch_fs *c) +{ + struct bch_sb_field_replicas *sb_v1; + struct bch_sb_field_replicas_v0 *sb_v0; + struct bch_replicas_cpu new_r = { 0, 0, NULL }; + int ret = 0; + + if ((sb_v1 = bch2_sb_field_get(c->disk_sb.sb, replicas))) + ret = __bch2_sb_replicas_to_cpu_replicas(sb_v1, &new_r); + else if ((sb_v0 = bch2_sb_field_get(c->disk_sb.sb, replicas_v0))) + ret = __bch2_sb_replicas_v0_to_cpu_replicas(sb_v0, &new_r); + if (ret) + return ret; + + bch2_cpu_replicas_sort(&new_r); + + percpu_down_write(&c->mark_lock); + + ret = replicas_table_update(c, &new_r); + percpu_up_write(&c->mark_lock); + + kfree(new_r.entries); + + return 0; +} + +static int bch2_cpu_replicas_to_sb_replicas_v0(struct bch_fs *c, + struct bch_replicas_cpu *r) +{ + struct bch_sb_field_replicas_v0 *sb_r; + struct bch_replicas_entry_v0 *dst; + struct bch_replicas_entry *src; + size_t bytes; + + bytes = sizeof(struct bch_sb_field_replicas); + + for_each_cpu_replicas_entry(r, src) + bytes += replicas_entry_bytes(src) - 1; + + sb_r = bch2_sb_field_resize(&c->disk_sb, replicas_v0, + DIV_ROUND_UP(bytes, sizeof(u64))); + if (!sb_r) + return -BCH_ERR_ENOSPC_sb_replicas; + + bch2_sb_field_delete(&c->disk_sb, BCH_SB_FIELD_replicas); + sb_r = bch2_sb_field_get(c->disk_sb.sb, replicas_v0); + + memset(&sb_r->entries, 0, + vstruct_end(&sb_r->field) - + (void *) &sb_r->entries); + + dst = sb_r->entries; + for_each_cpu_replicas_entry(r, src) { + dst->data_type = src->data_type; + dst->nr_devs = src->nr_devs; + memcpy(dst->devs, src->devs, src->nr_devs); + + dst = replicas_entry_next(dst); + + BUG_ON((void *) dst > vstruct_end(&sb_r->field)); + } + + return 0; +} + +static int bch2_cpu_replicas_to_sb_replicas(struct bch_fs *c, + struct bch_replicas_cpu *r) +{ + struct bch_sb_field_replicas *sb_r; + struct bch_replicas_entry *dst, *src; + bool need_v1 = false; + size_t bytes; + + bytes = sizeof(struct bch_sb_field_replicas); + + for_each_cpu_replicas_entry(r, src) { + bytes += replicas_entry_bytes(src); + if (src->nr_required != 1) + need_v1 = true; + } + + if (!need_v1) + return bch2_cpu_replicas_to_sb_replicas_v0(c, r); + + sb_r = bch2_sb_field_resize(&c->disk_sb, replicas, + DIV_ROUND_UP(bytes, sizeof(u64))); + if (!sb_r) + return -BCH_ERR_ENOSPC_sb_replicas; + + bch2_sb_field_delete(&c->disk_sb, BCH_SB_FIELD_replicas_v0); + sb_r = bch2_sb_field_get(c->disk_sb.sb, replicas); + + memset(&sb_r->entries, 0, + vstruct_end(&sb_r->field) - + (void *) &sb_r->entries); + + dst = sb_r->entries; + for_each_cpu_replicas_entry(r, src) { + memcpy(dst, src, replicas_entry_bytes(src)); + + dst = replicas_entry_next(dst); + + BUG_ON((void *) dst > vstruct_end(&sb_r->field)); + } + + return 0; +} + +static int bch2_cpu_replicas_validate(struct bch_replicas_cpu *cpu_r, + struct bch_sb *sb, + struct printbuf *err) +{ + unsigned i; + + sort_cmp_size(cpu_r->entries, + cpu_r->nr, + cpu_r->entry_size, + memcmp, NULL); + + for (i = 0; i < cpu_r->nr; i++) { + struct bch_replicas_entry *e = + cpu_replicas_entry(cpu_r, i); + + int ret = bch2_replicas_entry_validate(e, sb, err); + if (ret) + return ret; + + if (i + 1 < cpu_r->nr) { + struct bch_replicas_entry *n = + cpu_replicas_entry(cpu_r, i + 1); + + BUG_ON(memcmp(e, n, cpu_r->entry_size) > 0); + + if (!memcmp(e, n, cpu_r->entry_size)) { + prt_printf(err, "duplicate replicas entry "); + bch2_replicas_entry_to_text(err, e); + return -BCH_ERR_invalid_sb_replicas; + } + } + } + + return 0; +} + +static int bch2_sb_replicas_validate(struct bch_sb *sb, struct bch_sb_field *f, + struct printbuf *err) +{ + struct bch_sb_field_replicas *sb_r = field_to_type(f, replicas); + struct bch_replicas_cpu cpu_r; + int ret; + + ret = __bch2_sb_replicas_to_cpu_replicas(sb_r, &cpu_r); + if (ret) + return ret; + + ret = bch2_cpu_replicas_validate(&cpu_r, sb, err); + kfree(cpu_r.entries); + return ret; +} + +static void bch2_sb_replicas_to_text(struct printbuf *out, + struct bch_sb *sb, + struct bch_sb_field *f) +{ + struct bch_sb_field_replicas *r = field_to_type(f, replicas); + struct bch_replicas_entry *e; + bool first = true; + + for_each_replicas_entry(r, e) { + if (!first) + prt_printf(out, " "); + first = false; + + bch2_replicas_entry_to_text(out, e); + } + prt_newline(out); +} + +const struct bch_sb_field_ops bch_sb_field_ops_replicas = { + .validate = bch2_sb_replicas_validate, + .to_text = bch2_sb_replicas_to_text, +}; + +static int bch2_sb_replicas_v0_validate(struct bch_sb *sb, struct bch_sb_field *f, + struct printbuf *err) +{ + struct bch_sb_field_replicas_v0 *sb_r = field_to_type(f, replicas_v0); + struct bch_replicas_cpu cpu_r; + int ret; + + ret = __bch2_sb_replicas_v0_to_cpu_replicas(sb_r, &cpu_r); + if (ret) + return ret; + + ret = bch2_cpu_replicas_validate(&cpu_r, sb, err); + kfree(cpu_r.entries); + return ret; +} + +static void bch2_sb_replicas_v0_to_text(struct printbuf *out, + struct bch_sb *sb, + struct bch_sb_field *f) +{ + struct bch_sb_field_replicas_v0 *sb_r = field_to_type(f, replicas_v0); + struct bch_replicas_entry_v0 *e; + bool first = true; + + for_each_replicas_entry(sb_r, e) { + if (!first) + prt_printf(out, " "); + first = false; + + bch2_replicas_entry_v0_to_text(out, e); + } + prt_newline(out); +} + +const struct bch_sb_field_ops bch_sb_field_ops_replicas_v0 = { + .validate = bch2_sb_replicas_v0_validate, + .to_text = bch2_sb_replicas_v0_to_text, +}; + +/* Query replicas: */ + +bool bch2_have_enough_devs(struct bch_fs *c, struct bch_devs_mask devs, + unsigned flags, bool print) +{ + struct bch_replicas_entry *e; + bool ret = true; + + percpu_down_read(&c->mark_lock); + for_each_cpu_replicas_entry(&c->replicas, e) { + unsigned i, nr_online = 0, nr_failed = 0, dflags = 0; + bool metadata = e->data_type < BCH_DATA_user; + + if (e->data_type == BCH_DATA_cached) + continue; + + for (i = 0; i < e->nr_devs; i++) { + struct bch_dev *ca = bch_dev_bkey_exists(c, e->devs[i]); + + nr_online += test_bit(e->devs[i], devs.d); + nr_failed += ca->mi.state == BCH_MEMBER_STATE_failed; + } + + if (nr_failed == e->nr_devs) + continue; + + if (nr_online < e->nr_required) + dflags |= metadata + ? BCH_FORCE_IF_METADATA_LOST + : BCH_FORCE_IF_DATA_LOST; + + if (nr_online < e->nr_devs) + dflags |= metadata + ? BCH_FORCE_IF_METADATA_DEGRADED + : BCH_FORCE_IF_DATA_DEGRADED; + + if (dflags & ~flags) { + if (print) { + struct printbuf buf = PRINTBUF; + + bch2_replicas_entry_to_text(&buf, e); + bch_err(c, "insufficient devices online (%u) for replicas entry %s", + nr_online, buf.buf); + printbuf_exit(&buf); + } + ret = false; + break; + } + + } + percpu_up_read(&c->mark_lock); + + return ret; +} + +unsigned bch2_sb_dev_has_data(struct bch_sb *sb, unsigned dev) +{ + struct bch_sb_field_replicas *replicas; + struct bch_sb_field_replicas_v0 *replicas_v0; + unsigned i, data_has = 0; + + replicas = bch2_sb_field_get(sb, replicas); + replicas_v0 = bch2_sb_field_get(sb, replicas_v0); + + if (replicas) { + struct bch_replicas_entry *r; + + for_each_replicas_entry(replicas, r) + for (i = 0; i < r->nr_devs; i++) + if (r->devs[i] == dev) + data_has |= 1 << r->data_type; + } else if (replicas_v0) { + struct bch_replicas_entry_v0 *r; + + for_each_replicas_entry_v0(replicas_v0, r) + for (i = 0; i < r->nr_devs; i++) + if (r->devs[i] == dev) + data_has |= 1 << r->data_type; + } + + + return data_has; +} + +unsigned bch2_dev_has_data(struct bch_fs *c, struct bch_dev *ca) +{ + unsigned ret; + + mutex_lock(&c->sb_lock); + ret = bch2_sb_dev_has_data(c->disk_sb.sb, ca->dev_idx); + mutex_unlock(&c->sb_lock); + + return ret; +} + +void bch2_fs_replicas_exit(struct bch_fs *c) +{ + unsigned i; + + kfree(c->usage_scratch); + for (i = 0; i < ARRAY_SIZE(c->usage); i++) + free_percpu(c->usage[i]); + kfree(c->usage_base); + kfree(c->replicas.entries); + kfree(c->replicas_gc.entries); + + mempool_exit(&c->replicas_delta_pool); +} + +int bch2_fs_replicas_init(struct bch_fs *c) +{ + bch2_journal_entry_res_resize(&c->journal, + &c->replicas_journal_res, + reserve_journal_replicas(c, &c->replicas)); + + return mempool_init_kmalloc_pool(&c->replicas_delta_pool, 1, + REPLICAS_DELTA_LIST_MAX) ?: + replicas_table_update(c, &c->replicas); +} diff --git a/fs/bcachefs/replicas.h b/fs/bcachefs/replicas.h new file mode 100644 index 000000000000..f70a642775d1 --- /dev/null +++ b/fs/bcachefs/replicas.h @@ -0,0 +1,93 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_REPLICAS_H +#define _BCACHEFS_REPLICAS_H + +#include "bkey.h" +#include "eytzinger.h" +#include "replicas_types.h" + +void bch2_replicas_entry_sort(struct bch_replicas_entry *); +void bch2_replicas_entry_to_text(struct printbuf *, + struct bch_replicas_entry *); +int bch2_replicas_entry_validate(struct bch_replicas_entry *, + struct bch_sb *, struct printbuf *); +void bch2_cpu_replicas_to_text(struct printbuf *, struct bch_replicas_cpu *); + +static inline struct bch_replicas_entry * +cpu_replicas_entry(struct bch_replicas_cpu *r, unsigned i) +{ + return (void *) r->entries + r->entry_size * i; +} + +int bch2_replicas_entry_idx(struct bch_fs *, + struct bch_replicas_entry *); + +void bch2_devlist_to_replicas(struct bch_replicas_entry *, + enum bch_data_type, + struct bch_devs_list); +bool bch2_replicas_marked(struct bch_fs *, struct bch_replicas_entry *); +int bch2_mark_replicas(struct bch_fs *, + struct bch_replicas_entry *); + +static inline struct replicas_delta * +replicas_delta_next(struct replicas_delta *d) +{ + return (void *) d + replicas_entry_bytes(&d->r) + 8; +} + +int bch2_replicas_delta_list_mark(struct bch_fs *, struct replicas_delta_list *); + +void bch2_bkey_to_replicas(struct bch_replicas_entry *, struct bkey_s_c); + +static inline void bch2_replicas_entry_cached(struct bch_replicas_entry *e, + unsigned dev) +{ + e->data_type = BCH_DATA_cached; + e->nr_devs = 1; + e->nr_required = 1; + e->devs[0] = dev; +} + +bool bch2_have_enough_devs(struct bch_fs *, struct bch_devs_mask, + unsigned, bool); + +unsigned bch2_sb_dev_has_data(struct bch_sb *, unsigned); +unsigned bch2_dev_has_data(struct bch_fs *, struct bch_dev *); + +int bch2_replicas_gc_end(struct bch_fs *, int); +int bch2_replicas_gc_start(struct bch_fs *, unsigned); +int bch2_replicas_gc2(struct bch_fs *); + +int bch2_replicas_set_usage(struct bch_fs *, + struct bch_replicas_entry *, + u64); + +#define for_each_cpu_replicas_entry(_r, _i) \ + for (_i = (_r)->entries; \ + (void *) (_i) < (void *) (_r)->entries + (_r)->nr * (_r)->entry_size;\ + _i = (void *) (_i) + (_r)->entry_size) + +/* iterate over superblock replicas - used by userspace tools: */ + +#define replicas_entry_next(_i) \ + ((typeof(_i)) ((void *) (_i) + replicas_entry_bytes(_i))) + +#define for_each_replicas_entry(_r, _i) \ + for (_i = (_r)->entries; \ + (void *) (_i) < vstruct_end(&(_r)->field) && (_i)->data_type;\ + (_i) = replicas_entry_next(_i)) + +#define for_each_replicas_entry_v0(_r, _i) \ + for (_i = (_r)->entries; \ + (void *) (_i) < vstruct_end(&(_r)->field) && (_i)->data_type;\ + (_i) = replicas_entry_next(_i)) + +int bch2_sb_replicas_to_cpu_replicas(struct bch_fs *); + +extern const struct bch_sb_field_ops bch_sb_field_ops_replicas; +extern const struct bch_sb_field_ops bch_sb_field_ops_replicas_v0; + +void bch2_fs_replicas_exit(struct bch_fs *); +int bch2_fs_replicas_init(struct bch_fs *); + +#endif /* _BCACHEFS_REPLICAS_H */ diff --git a/fs/bcachefs/replicas_types.h b/fs/bcachefs/replicas_types.h new file mode 100644 index 000000000000..5cfff489bbc3 --- /dev/null +++ b/fs/bcachefs/replicas_types.h @@ -0,0 +1,27 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_REPLICAS_TYPES_H +#define _BCACHEFS_REPLICAS_TYPES_H + +struct bch_replicas_cpu { + unsigned nr; + unsigned entry_size; + struct bch_replicas_entry *entries; +}; + +struct replicas_delta { + s64 delta; + struct bch_replicas_entry r; +} __packed; + +struct replicas_delta_list { + unsigned size; + unsigned used; + + struct {} memset_start; + u64 nr_inodes; + u64 persistent_reserved[BCH_REPLICAS_MAX]; + struct {} memset_end; + struct replicas_delta d[0]; +}; + +#endif /* _BCACHEFS_REPLICAS_TYPES_H */ diff --git a/fs/bcachefs/sb-clean.c b/fs/bcachefs/sb-clean.c new file mode 100644 index 000000000000..e151ada1c8bd --- /dev/null +++ b/fs/bcachefs/sb-clean.c @@ -0,0 +1,398 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" +#include "btree_update_interior.h" +#include "buckets.h" +#include "error.h" +#include "journal_io.h" +#include "replicas.h" +#include "sb-clean.h" +#include "super-io.h" + +/* + * BCH_SB_FIELD_clean: + * + * Btree roots, and a few other things, are recovered from the journal after an + * unclean shutdown - but after a clean shutdown, to avoid having to read the + * journal, we can store them in the superblock. + * + * bch_sb_field_clean simply contains a list of journal entries, stored exactly + * as they would be in the journal: + */ + +int bch2_sb_clean_validate_late(struct bch_fs *c, struct bch_sb_field_clean *clean, + int write) +{ + struct jset_entry *entry; + int ret; + + for (entry = clean->start; + entry < (struct jset_entry *) vstruct_end(&clean->field); + entry = vstruct_next(entry)) { + ret = bch2_journal_entry_validate(c, NULL, entry, + le16_to_cpu(c->disk_sb.sb->version), + BCH_SB_BIG_ENDIAN(c->disk_sb.sb), + write); + if (ret) + return ret; + } + + return 0; +} + +static struct bkey_i *btree_root_find(struct bch_fs *c, + struct bch_sb_field_clean *clean, + struct jset *j, + enum btree_id id, unsigned *level) +{ + struct bkey_i *k; + struct jset_entry *entry, *start, *end; + + if (clean) { + start = clean->start; + end = vstruct_end(&clean->field); + } else { + start = j->start; + end = vstruct_last(j); + } + + for (entry = start; entry < end; entry = vstruct_next(entry)) + if (entry->type == BCH_JSET_ENTRY_btree_root && + entry->btree_id == id) + goto found; + + return NULL; +found: + if (!entry->u64s) + return ERR_PTR(-EINVAL); + + k = entry->start; + *level = entry->level; + return k; +} + +int bch2_verify_superblock_clean(struct bch_fs *c, + struct bch_sb_field_clean **cleanp, + struct jset *j) +{ + unsigned i; + struct bch_sb_field_clean *clean = *cleanp; + struct printbuf buf1 = PRINTBUF; + struct printbuf buf2 = PRINTBUF; + int ret = 0; + + if (mustfix_fsck_err_on(j->seq != clean->journal_seq, c, + sb_clean_journal_seq_mismatch, + "superblock journal seq (%llu) doesn't match journal (%llu) after clean shutdown", + le64_to_cpu(clean->journal_seq), + le64_to_cpu(j->seq))) { + kfree(clean); + *cleanp = NULL; + return 0; + } + + for (i = 0; i < BTREE_ID_NR; i++) { + struct bkey_i *k1, *k2; + unsigned l1 = 0, l2 = 0; + + k1 = btree_root_find(c, clean, NULL, i, &l1); + k2 = btree_root_find(c, NULL, j, i, &l2); + + if (!k1 && !k2) + continue; + + printbuf_reset(&buf1); + printbuf_reset(&buf2); + + if (k1) + bch2_bkey_val_to_text(&buf1, c, bkey_i_to_s_c(k1)); + else + prt_printf(&buf1, "(none)"); + + if (k2) + bch2_bkey_val_to_text(&buf2, c, bkey_i_to_s_c(k2)); + else + prt_printf(&buf2, "(none)"); + + mustfix_fsck_err_on(!k1 || !k2 || + IS_ERR(k1) || + IS_ERR(k2) || + k1->k.u64s != k2->k.u64s || + memcmp(k1, k2, bkey_bytes(&k1->k)) || + l1 != l2, c, + sb_clean_btree_root_mismatch, + "superblock btree root %u doesn't match journal after clean shutdown\n" + "sb: l=%u %s\n" + "journal: l=%u %s\n", i, + l1, buf1.buf, + l2, buf2.buf); + } +fsck_err: + printbuf_exit(&buf2); + printbuf_exit(&buf1); + return ret; +} + +struct bch_sb_field_clean *bch2_read_superblock_clean(struct bch_fs *c) +{ + struct bch_sb_field_clean *clean, *sb_clean; + int ret; + + mutex_lock(&c->sb_lock); + sb_clean = bch2_sb_field_get(c->disk_sb.sb, clean); + + if (fsck_err_on(!sb_clean, c, + sb_clean_missing, + "superblock marked clean but clean section not present")) { + SET_BCH_SB_CLEAN(c->disk_sb.sb, false); + c->sb.clean = false; + mutex_unlock(&c->sb_lock); + return NULL; + } + + clean = kmemdup(sb_clean, vstruct_bytes(&sb_clean->field), + GFP_KERNEL); + if (!clean) { + mutex_unlock(&c->sb_lock); + return ERR_PTR(-BCH_ERR_ENOMEM_read_superblock_clean); + } + + ret = bch2_sb_clean_validate_late(c, clean, READ); + if (ret) { + mutex_unlock(&c->sb_lock); + return ERR_PTR(ret); + } + + mutex_unlock(&c->sb_lock); + + return clean; +fsck_err: + mutex_unlock(&c->sb_lock); + return ERR_PTR(ret); +} + +static struct jset_entry *jset_entry_init(struct jset_entry **end, size_t size) +{ + struct jset_entry *entry = *end; + unsigned u64s = DIV_ROUND_UP(size, sizeof(u64)); + + memset(entry, 0, u64s * sizeof(u64)); + /* + * The u64s field counts from the start of data, ignoring the shared + * fields. + */ + entry->u64s = cpu_to_le16(u64s - 1); + + *end = vstruct_next(*end); + return entry; +} + +void bch2_journal_super_entries_add_common(struct bch_fs *c, + struct jset_entry **end, + u64 journal_seq) +{ + struct bch_dev *ca; + unsigned i, dev; + + percpu_down_read(&c->mark_lock); + + if (!journal_seq) { + for (i = 0; i < ARRAY_SIZE(c->usage); i++) + bch2_fs_usage_acc_to_base(c, i); + } else { + bch2_fs_usage_acc_to_base(c, journal_seq & JOURNAL_BUF_MASK); + } + + { + struct jset_entry_usage *u = + container_of(jset_entry_init(end, sizeof(*u)), + struct jset_entry_usage, entry); + + u->entry.type = BCH_JSET_ENTRY_usage; + u->entry.btree_id = BCH_FS_USAGE_inodes; + u->v = cpu_to_le64(c->usage_base->nr_inodes); + } + + { + struct jset_entry_usage *u = + container_of(jset_entry_init(end, sizeof(*u)), + struct jset_entry_usage, entry); + + u->entry.type = BCH_JSET_ENTRY_usage; + u->entry.btree_id = BCH_FS_USAGE_key_version; + u->v = cpu_to_le64(atomic64_read(&c->key_version)); + } + + for (i = 0; i < BCH_REPLICAS_MAX; i++) { + struct jset_entry_usage *u = + container_of(jset_entry_init(end, sizeof(*u)), + struct jset_entry_usage, entry); + + u->entry.type = BCH_JSET_ENTRY_usage; + u->entry.btree_id = BCH_FS_USAGE_reserved; + u->entry.level = i; + u->v = cpu_to_le64(c->usage_base->persistent_reserved[i]); + } + + for (i = 0; i < c->replicas.nr; i++) { + struct bch_replicas_entry *e = + cpu_replicas_entry(&c->replicas, i); + struct jset_entry_data_usage *u = + container_of(jset_entry_init(end, sizeof(*u) + e->nr_devs), + struct jset_entry_data_usage, entry); + + u->entry.type = BCH_JSET_ENTRY_data_usage; + u->v = cpu_to_le64(c->usage_base->replicas[i]); + unsafe_memcpy(&u->r, e, replicas_entry_bytes(e), + "embedded variable length struct"); + } + + for_each_member_device(ca, c, dev) { + unsigned b = sizeof(struct jset_entry_dev_usage) + + sizeof(struct jset_entry_dev_usage_type) * BCH_DATA_NR; + struct jset_entry_dev_usage *u = + container_of(jset_entry_init(end, b), + struct jset_entry_dev_usage, entry); + + u->entry.type = BCH_JSET_ENTRY_dev_usage; + u->dev = cpu_to_le32(dev); + u->buckets_ec = cpu_to_le64(ca->usage_base->buckets_ec); + + for (i = 0; i < BCH_DATA_NR; i++) { + u->d[i].buckets = cpu_to_le64(ca->usage_base->d[i].buckets); + u->d[i].sectors = cpu_to_le64(ca->usage_base->d[i].sectors); + u->d[i].fragmented = cpu_to_le64(ca->usage_base->d[i].fragmented); + } + } + + percpu_up_read(&c->mark_lock); + + for (i = 0; i < 2; i++) { + struct jset_entry_clock *clock = + container_of(jset_entry_init(end, sizeof(*clock)), + struct jset_entry_clock, entry); + + clock->entry.type = BCH_JSET_ENTRY_clock; + clock->rw = i; + clock->time = cpu_to_le64(atomic64_read(&c->io_clock[i].now)); + } +} + +static int bch2_sb_clean_validate(struct bch_sb *sb, + struct bch_sb_field *f, + struct printbuf *err) +{ + struct bch_sb_field_clean *clean = field_to_type(f, clean); + + if (vstruct_bytes(&clean->field) < sizeof(*clean)) { + prt_printf(err, "wrong size (got %zu should be %zu)", + vstruct_bytes(&clean->field), sizeof(*clean)); + return -BCH_ERR_invalid_sb_clean; + } + + return 0; +} + +static void bch2_sb_clean_to_text(struct printbuf *out, struct bch_sb *sb, + struct bch_sb_field *f) +{ + struct bch_sb_field_clean *clean = field_to_type(f, clean); + struct jset_entry *entry; + + prt_printf(out, "flags: %x", le32_to_cpu(clean->flags)); + prt_newline(out); + prt_printf(out, "journal_seq: %llu", le64_to_cpu(clean->journal_seq)); + prt_newline(out); + + for (entry = clean->start; + entry != vstruct_end(&clean->field); + entry = vstruct_next(entry)) { + if (entry->type == BCH_JSET_ENTRY_btree_keys && + !entry->u64s) + continue; + + bch2_journal_entry_to_text(out, NULL, entry); + prt_newline(out); + } +} + +const struct bch_sb_field_ops bch_sb_field_ops_clean = { + .validate = bch2_sb_clean_validate, + .to_text = bch2_sb_clean_to_text, +}; + +int bch2_fs_mark_dirty(struct bch_fs *c) +{ + int ret; + + /* + * Unconditionally write superblock, to verify it hasn't changed before + * we go rw: + */ + + mutex_lock(&c->sb_lock); + SET_BCH_SB_CLEAN(c->disk_sb.sb, false); + + bch2_sb_maybe_downgrade(c); + c->disk_sb.sb->features[0] |= cpu_to_le64(BCH_SB_FEATURES_ALWAYS); + + ret = bch2_write_super(c); + mutex_unlock(&c->sb_lock); + + return ret; +} + +void bch2_fs_mark_clean(struct bch_fs *c) +{ + struct bch_sb_field_clean *sb_clean; + struct jset_entry *entry; + unsigned u64s; + int ret; + + mutex_lock(&c->sb_lock); + if (BCH_SB_CLEAN(c->disk_sb.sb)) + goto out; + + SET_BCH_SB_CLEAN(c->disk_sb.sb, true); + + c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_alloc_info); + c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_alloc_metadata); + c->disk_sb.sb->features[0] &= cpu_to_le64(~(1ULL << BCH_FEATURE_extents_above_btree_updates)); + c->disk_sb.sb->features[0] &= cpu_to_le64(~(1ULL << BCH_FEATURE_btree_updates_journalled)); + + u64s = sizeof(*sb_clean) / sizeof(u64) + c->journal.entry_u64s_reserved; + + sb_clean = bch2_sb_field_resize(&c->disk_sb, clean, u64s); + if (!sb_clean) { + bch_err(c, "error resizing superblock while setting filesystem clean"); + goto out; + } + + sb_clean->flags = 0; + sb_clean->journal_seq = cpu_to_le64(atomic64_read(&c->journal.seq)); + + /* Trying to catch outstanding bug: */ + BUG_ON(le64_to_cpu(sb_clean->journal_seq) > S64_MAX); + + entry = sb_clean->start; + bch2_journal_super_entries_add_common(c, &entry, 0); + entry = bch2_btree_roots_to_journal_entries(c, entry, 0); + BUG_ON((void *) entry > vstruct_end(&sb_clean->field)); + + memset(entry, 0, + vstruct_end(&sb_clean->field) - (void *) entry); + + /* + * this should be in the write path, and we should be validating every + * superblock section: + */ + ret = bch2_sb_clean_validate_late(c, sb_clean, WRITE); + if (ret) { + bch_err(c, "error writing marking filesystem clean: validate error"); + goto out; + } + + bch2_write_super(c); +out: + mutex_unlock(&c->sb_lock); +} diff --git a/fs/bcachefs/sb-clean.h b/fs/bcachefs/sb-clean.h new file mode 100644 index 000000000000..71caef281239 --- /dev/null +++ b/fs/bcachefs/sb-clean.h @@ -0,0 +1,16 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_SB_CLEAN_H +#define _BCACHEFS_SB_CLEAN_H + +int bch2_sb_clean_validate_late(struct bch_fs *, struct bch_sb_field_clean *, int); +int bch2_verify_superblock_clean(struct bch_fs *, struct bch_sb_field_clean **, + struct jset *); +struct bch_sb_field_clean *bch2_read_superblock_clean(struct bch_fs *); +void bch2_journal_super_entries_add_common(struct bch_fs *, struct jset_entry **, u64); + +extern const struct bch_sb_field_ops bch_sb_field_ops_clean; + +int bch2_fs_mark_dirty(struct bch_fs *); +void bch2_fs_mark_clean(struct bch_fs *); + +#endif /* _BCACHEFS_SB_CLEAN_H */ diff --git a/fs/bcachefs/sb-errors.c b/fs/bcachefs/sb-errors.c new file mode 100644 index 000000000000..f0930ab7f036 --- /dev/null +++ b/fs/bcachefs/sb-errors.c @@ -0,0 +1,172 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" +#include "sb-errors.h" +#include "super-io.h" + +static const char * const bch2_sb_error_strs[] = { +#define x(t, n, ...) [n] = #t, + BCH_SB_ERRS() + NULL +}; + +static void bch2_sb_error_id_to_text(struct printbuf *out, enum bch_sb_error_id id) +{ + if (id < BCH_SB_ERR_MAX) + prt_str(out, bch2_sb_error_strs[id]); + else + prt_printf(out, "(unknown error %u)", id); +} + +static inline unsigned bch2_sb_field_errors_nr_entries(struct bch_sb_field_errors *e) +{ + return e + ? (bch2_sb_field_bytes(&e->field) - sizeof(*e)) / sizeof(e->entries[0]) + : 0; +} + +static inline unsigned bch2_sb_field_errors_u64s(unsigned nr) +{ + return (sizeof(struct bch_sb_field_errors) + + sizeof(struct bch_sb_field_error_entry) * nr) / sizeof(u64); +} + +static int bch2_sb_errors_validate(struct bch_sb *sb, struct bch_sb_field *f, + struct printbuf *err) +{ + struct bch_sb_field_errors *e = field_to_type(f, errors); + unsigned i, nr = bch2_sb_field_errors_nr_entries(e); + + for (i = 0; i < nr; i++) { + if (!BCH_SB_ERROR_ENTRY_NR(&e->entries[i])) { + prt_printf(err, "entry with count 0 (id "); + bch2_sb_error_id_to_text(err, BCH_SB_ERROR_ENTRY_ID(&e->entries[i])); + prt_printf(err, ")"); + return -BCH_ERR_invalid_sb_errors; + } + + if (i + 1 < nr && + BCH_SB_ERROR_ENTRY_ID(&e->entries[i]) >= + BCH_SB_ERROR_ENTRY_ID(&e->entries[i + 1])) { + prt_printf(err, "entries out of order"); + return -BCH_ERR_invalid_sb_errors; + } + } + + return 0; +} + +static void bch2_sb_errors_to_text(struct printbuf *out, struct bch_sb *sb, + struct bch_sb_field *f) +{ + struct bch_sb_field_errors *e = field_to_type(f, errors); + unsigned i, nr = bch2_sb_field_errors_nr_entries(e); + + if (out->nr_tabstops <= 1) + printbuf_tabstop_push(out, 16); + + for (i = 0; i < nr; i++) { + bch2_sb_error_id_to_text(out, BCH_SB_ERROR_ENTRY_ID(&e->entries[i])); + prt_tab(out); + prt_u64(out, BCH_SB_ERROR_ENTRY_NR(&e->entries[i])); + prt_tab(out); + bch2_prt_datetime(out, le64_to_cpu(e->entries[i].last_error_time)); + prt_newline(out); + } +} + +const struct bch_sb_field_ops bch_sb_field_ops_errors = { + .validate = bch2_sb_errors_validate, + .to_text = bch2_sb_errors_to_text, +}; + +void bch2_sb_error_count(struct bch_fs *c, enum bch_sb_error_id err) +{ + bch_sb_errors_cpu *e = &c->fsck_error_counts; + struct bch_sb_error_entry_cpu n = { + .id = err, + .nr = 1, + .last_error_time = ktime_get_real_seconds() + }; + unsigned i; + + mutex_lock(&c->fsck_error_counts_lock); + for (i = 0; i < e->nr; i++) { + if (err == e->data[i].id) { + e->data[i].nr++; + e->data[i].last_error_time = n.last_error_time; + goto out; + } + if (err < e->data[i].id) + break; + } + + if (darray_make_room(e, 1)) + goto out; + + darray_insert_item(e, i, n); +out: + mutex_unlock(&c->fsck_error_counts_lock); +} + +void bch2_sb_errors_from_cpu(struct bch_fs *c) +{ + bch_sb_errors_cpu *src = &c->fsck_error_counts; + struct bch_sb_field_errors *dst = + bch2_sb_field_resize(&c->disk_sb, errors, + bch2_sb_field_errors_u64s(src->nr)); + unsigned i; + + if (!dst) + return; + + for (i = 0; i < src->nr; i++) { + SET_BCH_SB_ERROR_ENTRY_ID(&dst->entries[i], src->data[i].id); + SET_BCH_SB_ERROR_ENTRY_NR(&dst->entries[i], src->data[i].nr); + dst->entries[i].last_error_time = cpu_to_le64(src->data[i].last_error_time); + } +} + +static int bch2_sb_errors_to_cpu(struct bch_fs *c) +{ + struct bch_sb_field_errors *src = bch2_sb_field_get(c->disk_sb.sb, errors); + bch_sb_errors_cpu *dst = &c->fsck_error_counts; + unsigned i, nr = bch2_sb_field_errors_nr_entries(src); + int ret; + + if (!nr) + return 0; + + mutex_lock(&c->fsck_error_counts_lock); + ret = darray_make_room(dst, nr); + if (ret) + goto err; + + dst->nr = nr; + + for (i = 0; i < nr; i++) { + dst->data[i].id = BCH_SB_ERROR_ENTRY_ID(&src->entries[i]); + dst->data[i].nr = BCH_SB_ERROR_ENTRY_NR(&src->entries[i]); + dst->data[i].last_error_time = le64_to_cpu(src->entries[i].last_error_time); + } +err: + mutex_unlock(&c->fsck_error_counts_lock); + + return ret; +} + +void bch2_fs_sb_errors_exit(struct bch_fs *c) +{ + darray_exit(&c->fsck_error_counts); +} + +void bch2_fs_sb_errors_init_early(struct bch_fs *c) +{ + mutex_init(&c->fsck_error_counts_lock); + darray_init(&c->fsck_error_counts); +} + +int bch2_fs_sb_errors_init(struct bch_fs *c) +{ + return bch2_sb_errors_to_cpu(c); +} diff --git a/fs/bcachefs/sb-errors.h b/fs/bcachefs/sb-errors.h new file mode 100644 index 000000000000..5a09a53966be --- /dev/null +++ b/fs/bcachefs/sb-errors.h @@ -0,0 +1,270 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_SB_ERRORS_H +#define _BCACHEFS_SB_ERRORS_H + +#include "sb-errors_types.h" + +#define BCH_SB_ERRS() \ + x(clean_but_journal_not_empty, 0) \ + x(dirty_but_no_journal_entries, 1) \ + x(dirty_but_no_journal_entries_post_drop_nonflushes, 2) \ + x(sb_clean_journal_seq_mismatch, 3) \ + x(sb_clean_btree_root_mismatch, 4) \ + x(sb_clean_missing, 5) \ + x(jset_unsupported_version, 6) \ + x(jset_unknown_csum, 7) \ + x(jset_last_seq_newer_than_seq, 8) \ + x(jset_past_bucket_end, 9) \ + x(jset_seq_blacklisted, 10) \ + x(journal_entries_missing, 11) \ + x(journal_entry_replicas_not_marked, 12) \ + x(journal_entry_past_jset_end, 13) \ + x(journal_entry_replicas_data_mismatch, 14) \ + x(journal_entry_bkey_u64s_0, 15) \ + x(journal_entry_bkey_past_end, 16) \ + x(journal_entry_bkey_bad_format, 17) \ + x(journal_entry_bkey_invalid, 18) \ + x(journal_entry_btree_root_bad_size, 19) \ + x(journal_entry_blacklist_bad_size, 20) \ + x(journal_entry_blacklist_v2_bad_size, 21) \ + x(journal_entry_blacklist_v2_start_past_end, 22) \ + x(journal_entry_usage_bad_size, 23) \ + x(journal_entry_data_usage_bad_size, 24) \ + x(journal_entry_clock_bad_size, 25) \ + x(journal_entry_clock_bad_rw, 26) \ + x(journal_entry_dev_usage_bad_size, 27) \ + x(journal_entry_dev_usage_bad_dev, 28) \ + x(journal_entry_dev_usage_bad_pad, 29) \ + x(btree_node_unreadable, 30) \ + x(btree_node_fault_injected, 31) \ + x(btree_node_bad_magic, 32) \ + x(btree_node_bad_seq, 33) \ + x(btree_node_unsupported_version, 34) \ + x(btree_node_bset_older_than_sb_min, 35) \ + x(btree_node_bset_newer_than_sb, 36) \ + x(btree_node_data_missing, 37) \ + x(btree_node_bset_after_end, 38) \ + x(btree_node_replicas_sectors_written_mismatch, 39) \ + x(btree_node_replicas_data_mismatch, 40) \ + x(bset_unknown_csum, 41) \ + x(bset_bad_csum, 42) \ + x(bset_past_end_of_btree_node, 43) \ + x(bset_wrong_sector_offset, 44) \ + x(bset_empty, 45) \ + x(bset_bad_seq, 46) \ + x(bset_blacklisted_journal_seq, 47) \ + x(first_bset_blacklisted_journal_seq, 48) \ + x(btree_node_bad_btree, 49) \ + x(btree_node_bad_level, 50) \ + x(btree_node_bad_min_key, 51) \ + x(btree_node_bad_max_key, 52) \ + x(btree_node_bad_format, 53) \ + x(btree_node_bkey_past_bset_end, 54) \ + x(btree_node_bkey_bad_format, 55) \ + x(btree_node_bad_bkey, 56) \ + x(btree_node_bkey_out_of_order, 57) \ + x(btree_root_bkey_invalid, 58) \ + x(btree_root_read_error, 59) \ + x(btree_root_bad_min_key, 50) \ + x(btree_root_bad_max_key, 61) \ + x(btree_node_read_error, 62) \ + x(btree_node_topology_bad_min_key, 63) \ + x(btree_node_topology_bad_max_key, 64) \ + x(btree_node_topology_overwritten_by_prev_node, 65) \ + x(btree_node_topology_overwritten_by_next_node, 66) \ + x(btree_node_topology_interior_node_empty, 67) \ + x(fs_usage_hidden_wrong, 68) \ + x(fs_usage_btree_wrong, 69) \ + x(fs_usage_data_wrong, 70) \ + x(fs_usage_cached_wrong, 71) \ + x(fs_usage_reserved_wrong, 72) \ + x(fs_usage_persistent_reserved_wrong, 73) \ + x(fs_usage_nr_inodes_wrong, 74) \ + x(fs_usage_replicas_wrong, 75) \ + x(dev_usage_buckets_wrong, 76) \ + x(dev_usage_sectors_wrong, 77) \ + x(dev_usage_fragmented_wrong, 78) \ + x(dev_usage_buckets_ec_wrong, 79) \ + x(bkey_version_in_future, 80) \ + x(bkey_u64s_too_small, 81) \ + x(bkey_invalid_type_for_btree, 82) \ + x(bkey_extent_size_zero, 83) \ + x(bkey_extent_size_greater_than_offset, 84) \ + x(bkey_size_nonzero, 85) \ + x(bkey_snapshot_nonzero, 86) \ + x(bkey_snapshot_zero, 87) \ + x(bkey_at_pos_max, 88) \ + x(bkey_before_start_of_btree_node, 89) \ + x(bkey_after_end_of_btree_node, 90) \ + x(bkey_val_size_nonzero, 91) \ + x(bkey_val_size_too_small, 92) \ + x(alloc_v1_val_size_bad, 93) \ + x(alloc_v2_unpack_error, 94) \ + x(alloc_v3_unpack_error, 95) \ + x(alloc_v4_val_size_bad, 96) \ + x(alloc_v4_backpointers_start_bad, 97) \ + x(alloc_key_data_type_bad, 98) \ + x(alloc_key_empty_but_have_data, 99) \ + x(alloc_key_dirty_sectors_0, 100) \ + x(alloc_key_data_type_inconsistency, 101) \ + x(alloc_key_to_missing_dev_bucket, 102) \ + x(alloc_key_cached_inconsistency, 103) \ + x(alloc_key_cached_but_read_time_zero, 104) \ + x(alloc_key_to_missing_lru_entry, 105) \ + x(alloc_key_data_type_wrong, 106) \ + x(alloc_key_gen_wrong, 107) \ + x(alloc_key_dirty_sectors_wrong, 108) \ + x(alloc_key_cached_sectors_wrong, 109) \ + x(alloc_key_stripe_wrong, 110) \ + x(alloc_key_stripe_redundancy_wrong, 111) \ + x(bucket_sector_count_overflow, 112) \ + x(bucket_metadata_type_mismatch, 113) \ + x(need_discard_key_wrong, 114) \ + x(freespace_key_wrong, 115) \ + x(freespace_hole_missing, 116) \ + x(bucket_gens_val_size_bad, 117) \ + x(bucket_gens_key_wrong, 118) \ + x(bucket_gens_hole_wrong, 119) \ + x(bucket_gens_to_invalid_dev, 120) \ + x(bucket_gens_to_invalid_buckets, 121) \ + x(bucket_gens_nonzero_for_invalid_buckets, 122) \ + x(need_discard_freespace_key_to_invalid_dev_bucket, 123) \ + x(need_discard_freespace_key_bad, 124) \ + x(backpointer_pos_wrong, 125) \ + x(backpointer_to_missing_device, 126) \ + x(backpointer_to_missing_alloc, 127) \ + x(backpointer_to_missing_ptr, 128) \ + x(lru_entry_at_time_0, 129) \ + x(lru_entry_to_invalid_bucket, 130) \ + x(lru_entry_bad, 131) \ + x(btree_ptr_val_too_big, 132) \ + x(btree_ptr_v2_val_too_big, 133) \ + x(btree_ptr_has_non_ptr, 134) \ + x(extent_ptrs_invalid_entry, 135) \ + x(extent_ptrs_no_ptrs, 136) \ + x(extent_ptrs_too_many_ptrs, 137) \ + x(extent_ptrs_redundant_crc, 138) \ + x(extent_ptrs_redundant_stripe, 139) \ + x(extent_ptrs_unwritten, 140) \ + x(extent_ptrs_written_and_unwritten, 141) \ + x(ptr_to_invalid_device, 142) \ + x(ptr_to_duplicate_device, 143) \ + x(ptr_after_last_bucket, 144) \ + x(ptr_before_first_bucket, 145) \ + x(ptr_spans_multiple_buckets, 146) \ + x(ptr_to_missing_backpointer, 147) \ + x(ptr_to_missing_alloc_key, 148) \ + x(ptr_to_missing_replicas_entry, 149) \ + x(ptr_to_missing_stripe, 150) \ + x(ptr_to_incorrect_stripe, 151) \ + x(ptr_gen_newer_than_bucket_gen, 152) \ + x(ptr_too_stale, 153) \ + x(stale_dirty_ptr, 154) \ + x(ptr_bucket_data_type_mismatch, 155) \ + x(ptr_cached_and_erasure_coded, 156) \ + x(ptr_crc_uncompressed_size_too_small, 157) \ + x(ptr_crc_csum_type_unknown, 158) \ + x(ptr_crc_compression_type_unknown, 159) \ + x(ptr_crc_redundant, 160) \ + x(ptr_crc_uncompressed_size_too_big, 161) \ + x(ptr_crc_nonce_mismatch, 162) \ + x(ptr_stripe_redundant, 163) \ + x(reservation_key_nr_replicas_invalid, 164) \ + x(reflink_v_refcount_wrong, 165) \ + x(reflink_p_to_missing_reflink_v, 166) \ + x(stripe_pos_bad, 167) \ + x(stripe_val_size_bad, 168) \ + x(stripe_sector_count_wrong, 169) \ + x(snapshot_tree_pos_bad, 170) \ + x(snapshot_tree_to_missing_snapshot, 171) \ + x(snapshot_tree_to_missing_subvol, 172) \ + x(snapshot_tree_to_wrong_subvol, 173) \ + x(snapshot_tree_to_snapshot_subvol, 174) \ + x(snapshot_pos_bad, 175) \ + x(snapshot_parent_bad, 176) \ + x(snapshot_children_not_normalized, 177) \ + x(snapshot_child_duplicate, 178) \ + x(snapshot_child_bad, 179) \ + x(snapshot_skiplist_not_normalized, 180) \ + x(snapshot_skiplist_bad, 181) \ + x(snapshot_should_not_have_subvol, 182) \ + x(snapshot_to_bad_snapshot_tree, 183) \ + x(snapshot_bad_depth, 184) \ + x(snapshot_bad_skiplist, 185) \ + x(subvol_pos_bad, 186) \ + x(subvol_not_master_and_not_snapshot, 187) \ + x(subvol_to_missing_root, 188) \ + x(subvol_root_wrong_bi_subvol, 189) \ + x(bkey_in_missing_snapshot, 190) \ + x(inode_pos_inode_nonzero, 191) \ + x(inode_pos_blockdev_range, 192) \ + x(inode_unpack_error, 193) \ + x(inode_str_hash_invalid, 194) \ + x(inode_v3_fields_start_bad, 195) \ + x(inode_snapshot_mismatch, 196) \ + x(inode_unlinked_but_clean, 197) \ + x(inode_unlinked_but_nlink_nonzero, 198) \ + x(inode_checksum_type_invalid, 199) \ + x(inode_compression_type_invalid, 200) \ + x(inode_subvol_root_but_not_dir, 201) \ + x(inode_i_size_dirty_but_clean, 202) \ + x(inode_i_sectors_dirty_but_clean, 203) \ + x(inode_i_sectors_wrong, 204) \ + x(inode_dir_wrong_nlink, 205) \ + x(inode_dir_multiple_links, 206) \ + x(inode_multiple_links_but_nlink_0, 207) \ + x(inode_wrong_backpointer, 208) \ + x(inode_wrong_nlink, 209) \ + x(inode_unreachable, 210) \ + x(deleted_inode_but_clean, 211) \ + x(deleted_inode_missing, 212) \ + x(deleted_inode_is_dir, 213) \ + x(deleted_inode_not_unlinked, 214) \ + x(extent_overlapping, 215) \ + x(extent_in_missing_inode, 216) \ + x(extent_in_non_reg_inode, 217) \ + x(extent_past_end_of_inode, 218) \ + x(dirent_empty_name, 219) \ + x(dirent_val_too_big, 220) \ + x(dirent_name_too_long, 221) \ + x(dirent_name_embedded_nul, 222) \ + x(dirent_name_dot_or_dotdot, 223) \ + x(dirent_name_has_slash, 224) \ + x(dirent_d_type_wrong, 225) \ + x(dirent_d_parent_subvol_wrong, 226) \ + x(dirent_in_missing_dir_inode, 227) \ + x(dirent_in_non_dir_inode, 228) \ + x(dirent_to_missing_inode, 229) \ + x(dirent_to_missing_subvol, 230) \ + x(dirent_to_itself, 231) \ + x(quota_type_invalid, 232) \ + x(xattr_val_size_too_small, 233) \ + x(xattr_val_size_too_big, 234) \ + x(xattr_invalid_type, 235) \ + x(xattr_name_invalid_chars, 236) \ + x(xattr_in_missing_inode, 237) \ + x(root_subvol_missing, 238) \ + x(root_dir_missing, 239) \ + x(root_inode_not_dir, 240) \ + x(dir_loop, 241) \ + x(hash_table_key_duplicate, 242) \ + x(hash_table_key_wrong_offset, 243) + +enum bch_sb_error_id { +#define x(t, n) BCH_FSCK_ERR_##t = n, + BCH_SB_ERRS() +#undef x + BCH_SB_ERR_MAX +}; + +extern const struct bch_sb_field_ops bch_sb_field_ops_errors; + +void bch2_sb_error_count(struct bch_fs *, enum bch_sb_error_id); + +void bch2_sb_errors_from_cpu(struct bch_fs *); + +void bch2_fs_sb_errors_exit(struct bch_fs *); +void bch2_fs_sb_errors_init_early(struct bch_fs *); +int bch2_fs_sb_errors_init(struct bch_fs *); + +#endif /* _BCACHEFS_SB_ERRORS_H */ diff --git a/fs/bcachefs/sb-errors_types.h b/fs/bcachefs/sb-errors_types.h new file mode 100644 index 000000000000..b1c099843a39 --- /dev/null +++ b/fs/bcachefs/sb-errors_types.h @@ -0,0 +1,16 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_SB_ERRORS_TYPES_H +#define _BCACHEFS_SB_ERRORS_TYPES_H + +#include "darray.h" + +struct bch_sb_error_entry_cpu { + u64 id:16, + nr:48; + u64 last_error_time; +}; + +typedef DARRAY(struct bch_sb_error_entry_cpu) bch_sb_errors_cpu; + +#endif /* _BCACHEFS_SB_ERRORS_TYPES_H */ + diff --git a/fs/bcachefs/sb-members.c b/fs/bcachefs/sb-members.c new file mode 100644 index 000000000000..bed0f857fe5b --- /dev/null +++ b/fs/bcachefs/sb-members.c @@ -0,0 +1,420 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" +#include "disk_groups.h" +#include "opts.h" +#include "replicas.h" +#include "sb-members.h" +#include "super-io.h" + +#define x(t, n, ...) [n] = #t, +static const char * const bch2_iops_measurements[] = { + BCH_IOPS_MEASUREMENTS() + NULL +}; + +char * const bch2_member_error_strs[] = { + BCH_MEMBER_ERROR_TYPES() + NULL +}; +#undef x + +/* Code for bch_sb_field_members_v1: */ + +struct bch_member *bch2_members_v2_get_mut(struct bch_sb *sb, int i) +{ + return __bch2_members_v2_get_mut(bch2_sb_field_get(sb, members_v2), i); +} + +static struct bch_member members_v2_get(struct bch_sb_field_members_v2 *mi, int i) +{ + struct bch_member ret, *p = __bch2_members_v2_get_mut(mi, i); + memset(&ret, 0, sizeof(ret)); + memcpy(&ret, p, min_t(size_t, le16_to_cpu(mi->member_bytes), sizeof(ret))); + return ret; +} + +static struct bch_member *members_v1_get_mut(struct bch_sb_field_members_v1 *mi, int i) +{ + return (void *) mi->_members + (i * BCH_MEMBER_V1_BYTES); +} + +static struct bch_member members_v1_get(struct bch_sb_field_members_v1 *mi, int i) +{ + struct bch_member ret, *p = members_v1_get_mut(mi, i); + memset(&ret, 0, sizeof(ret)); + memcpy(&ret, p, min_t(size_t, BCH_MEMBER_V1_BYTES, sizeof(ret))); + return ret; +} + +struct bch_member bch2_sb_member_get(struct bch_sb *sb, int i) +{ + struct bch_sb_field_members_v2 *mi2 = bch2_sb_field_get(sb, members_v2); + if (mi2) + return members_v2_get(mi2, i); + struct bch_sb_field_members_v1 *mi1 = bch2_sb_field_get(sb, members_v1); + return members_v1_get(mi1, i); +} + +static int sb_members_v2_resize_entries(struct bch_fs *c) +{ + struct bch_sb_field_members_v2 *mi = bch2_sb_field_get(c->disk_sb.sb, members_v2); + + if (le16_to_cpu(mi->member_bytes) < sizeof(struct bch_member)) { + unsigned u64s = DIV_ROUND_UP((sizeof(*mi) + sizeof(mi->_members[0]) * + c->disk_sb.sb->nr_devices), 8); + + mi = bch2_sb_field_resize(&c->disk_sb, members_v2, u64s); + if (!mi) + return -BCH_ERR_ENOSPC_sb_members_v2; + + for (int i = c->disk_sb.sb->nr_devices - 1; i >= 0; --i) { + void *dst = (void *) mi->_members + (i * sizeof(struct bch_member)); + memmove(dst, __bch2_members_v2_get_mut(mi, i), le16_to_cpu(mi->member_bytes)); + memset(dst + le16_to_cpu(mi->member_bytes), + 0, (sizeof(struct bch_member) - le16_to_cpu(mi->member_bytes))); + } + mi->member_bytes = cpu_to_le16(sizeof(struct bch_member)); + } + return 0; +} + +int bch2_sb_members_v2_init(struct bch_fs *c) +{ + struct bch_sb_field_members_v1 *mi1; + struct bch_sb_field_members_v2 *mi2; + + if (!bch2_sb_field_get(c->disk_sb.sb, members_v2)) { + mi2 = bch2_sb_field_resize(&c->disk_sb, members_v2, + DIV_ROUND_UP(sizeof(*mi2) + + sizeof(struct bch_member) * c->sb.nr_devices, + sizeof(u64))); + mi1 = bch2_sb_field_get(c->disk_sb.sb, members_v1); + memcpy(&mi2->_members[0], &mi1->_members[0], + BCH_MEMBER_V1_BYTES * c->sb.nr_devices); + memset(&mi2->pad[0], 0, sizeof(mi2->pad)); + mi2->member_bytes = cpu_to_le16(BCH_MEMBER_V1_BYTES); + } + + return sb_members_v2_resize_entries(c); +} + +int bch2_sb_members_cpy_v2_v1(struct bch_sb_handle *disk_sb) +{ + struct bch_sb_field_members_v1 *mi1; + struct bch_sb_field_members_v2 *mi2; + + mi1 = bch2_sb_field_resize(disk_sb, members_v1, + DIV_ROUND_UP(sizeof(*mi1) + BCH_MEMBER_V1_BYTES * + disk_sb->sb->nr_devices, sizeof(u64))); + if (!mi1) + return -BCH_ERR_ENOSPC_sb_members; + + mi2 = bch2_sb_field_get(disk_sb->sb, members_v2); + + for (unsigned i = 0; i < disk_sb->sb->nr_devices; i++) + memcpy(members_v1_get_mut(mi1, i), __bch2_members_v2_get_mut(mi2, i), BCH_MEMBER_V1_BYTES); + + return 0; +} + +static int validate_member(struct printbuf *err, + struct bch_member m, + struct bch_sb *sb, + int i) +{ + if (le64_to_cpu(m.nbuckets) > LONG_MAX) { + prt_printf(err, "device %u: too many buckets (got %llu, max %lu)", + i, le64_to_cpu(m.nbuckets), LONG_MAX); + return -BCH_ERR_invalid_sb_members; + } + + if (le64_to_cpu(m.nbuckets) - + le16_to_cpu(m.first_bucket) < BCH_MIN_NR_NBUCKETS) { + prt_printf(err, "device %u: not enough buckets (got %llu, max %u)", + i, le64_to_cpu(m.nbuckets), BCH_MIN_NR_NBUCKETS); + return -BCH_ERR_invalid_sb_members; + } + + if (le16_to_cpu(m.bucket_size) < + le16_to_cpu(sb->block_size)) { + prt_printf(err, "device %u: bucket size %u smaller than block size %u", + i, le16_to_cpu(m.bucket_size), le16_to_cpu(sb->block_size)); + return -BCH_ERR_invalid_sb_members; + } + + if (le16_to_cpu(m.bucket_size) < + BCH_SB_BTREE_NODE_SIZE(sb)) { + prt_printf(err, "device %u: bucket size %u smaller than btree node size %llu", + i, le16_to_cpu(m.bucket_size), BCH_SB_BTREE_NODE_SIZE(sb)); + return -BCH_ERR_invalid_sb_members; + } + + return 0; +} + +static void member_to_text(struct printbuf *out, + struct bch_member m, + struct bch_sb_field_disk_groups *gi, + struct bch_sb *sb, + int i) +{ + unsigned data_have = bch2_sb_dev_has_data(sb, i); + u64 bucket_size = le16_to_cpu(m.bucket_size); + u64 device_size = le64_to_cpu(m.nbuckets) * bucket_size; + + if (!bch2_member_exists(&m)) + return; + + prt_printf(out, "Device:"); + prt_tab(out); + prt_printf(out, "%u", i); + prt_newline(out); + + printbuf_indent_add(out, 2); + + prt_printf(out, "Label:"); + prt_tab(out); + if (BCH_MEMBER_GROUP(&m)) { + unsigned idx = BCH_MEMBER_GROUP(&m) - 1; + + if (idx < disk_groups_nr(gi)) + prt_printf(out, "%s (%u)", + gi->entries[idx].label, idx); + else + prt_printf(out, "(bad disk labels section)"); + } else { + prt_printf(out, "(none)"); + } + prt_newline(out); + + prt_printf(out, "UUID:"); + prt_tab(out); + pr_uuid(out, m.uuid.b); + prt_newline(out); + + prt_printf(out, "Size:"); + prt_tab(out); + prt_units_u64(out, device_size << 9); + prt_newline(out); + + for (unsigned i = 0; i < BCH_MEMBER_ERROR_NR; i++) { + prt_printf(out, "%s errors:", bch2_member_error_strs[i]); + prt_tab(out); + prt_u64(out, le64_to_cpu(m.errors[i])); + prt_newline(out); + } + + for (unsigned i = 0; i < BCH_IOPS_NR; i++) { + prt_printf(out, "%s iops:", bch2_iops_measurements[i]); + prt_tab(out); + prt_printf(out, "%u", le32_to_cpu(m.iops[i])); + prt_newline(out); + } + + prt_printf(out, "Bucket size:"); + prt_tab(out); + prt_units_u64(out, bucket_size << 9); + prt_newline(out); + + prt_printf(out, "First bucket:"); + prt_tab(out); + prt_printf(out, "%u", le16_to_cpu(m.first_bucket)); + prt_newline(out); + + prt_printf(out, "Buckets:"); + prt_tab(out); + prt_printf(out, "%llu", le64_to_cpu(m.nbuckets)); + prt_newline(out); + + prt_printf(out, "Last mount:"); + prt_tab(out); + if (m.last_mount) + bch2_prt_datetime(out, le64_to_cpu(m.last_mount)); + else + prt_printf(out, "(never)"); + prt_newline(out); + + prt_printf(out, "State:"); + prt_tab(out); + prt_printf(out, "%s", + BCH_MEMBER_STATE(&m) < BCH_MEMBER_STATE_NR + ? bch2_member_states[BCH_MEMBER_STATE(&m)] + : "unknown"); + prt_newline(out); + + prt_printf(out, "Data allowed:"); + prt_tab(out); + if (BCH_MEMBER_DATA_ALLOWED(&m)) + prt_bitflags(out, bch2_data_types, BCH_MEMBER_DATA_ALLOWED(&m)); + else + prt_printf(out, "(none)"); + prt_newline(out); + + prt_printf(out, "Has data:"); + prt_tab(out); + if (data_have) + prt_bitflags(out, bch2_data_types, data_have); + else + prt_printf(out, "(none)"); + prt_newline(out); + + prt_printf(out, "Discard:"); + prt_tab(out); + prt_printf(out, "%llu", BCH_MEMBER_DISCARD(&m)); + prt_newline(out); + + prt_printf(out, "Freespace initialized:"); + prt_tab(out); + prt_printf(out, "%llu", BCH_MEMBER_FREESPACE_INITIALIZED(&m)); + prt_newline(out); + + printbuf_indent_sub(out, 2); +} + +static int bch2_sb_members_v1_validate(struct bch_sb *sb, + struct bch_sb_field *f, + struct printbuf *err) +{ + struct bch_sb_field_members_v1 *mi = field_to_type(f, members_v1); + unsigned i; + + if ((void *) members_v1_get_mut(mi, sb->nr_devices) > vstruct_end(&mi->field)) { + prt_printf(err, "too many devices for section size"); + return -BCH_ERR_invalid_sb_members; + } + + for (i = 0; i < sb->nr_devices; i++) { + struct bch_member m = members_v1_get(mi, i); + + int ret = validate_member(err, m, sb, i); + if (ret) + return ret; + } + + return 0; +} + +static void bch2_sb_members_v1_to_text(struct printbuf *out, struct bch_sb *sb, + struct bch_sb_field *f) +{ + struct bch_sb_field_members_v1 *mi = field_to_type(f, members_v1); + struct bch_sb_field_disk_groups *gi = bch2_sb_field_get(sb, disk_groups); + unsigned i; + + for (i = 0; i < sb->nr_devices; i++) + member_to_text(out, members_v1_get(mi, i), gi, sb, i); +} + +const struct bch_sb_field_ops bch_sb_field_ops_members_v1 = { + .validate = bch2_sb_members_v1_validate, + .to_text = bch2_sb_members_v1_to_text, +}; + +static void bch2_sb_members_v2_to_text(struct printbuf *out, struct bch_sb *sb, + struct bch_sb_field *f) +{ + struct bch_sb_field_members_v2 *mi = field_to_type(f, members_v2); + struct bch_sb_field_disk_groups *gi = bch2_sb_field_get(sb, disk_groups); + unsigned i; + + for (i = 0; i < sb->nr_devices; i++) + member_to_text(out, members_v2_get(mi, i), gi, sb, i); +} + +static int bch2_sb_members_v2_validate(struct bch_sb *sb, + struct bch_sb_field *f, + struct printbuf *err) +{ + struct bch_sb_field_members_v2 *mi = field_to_type(f, members_v2); + size_t mi_bytes = (void *) __bch2_members_v2_get_mut(mi, sb->nr_devices) - + (void *) mi; + + if (mi_bytes > vstruct_bytes(&mi->field)) { + prt_printf(err, "section too small (%zu > %zu)", + mi_bytes, vstruct_bytes(&mi->field)); + return -BCH_ERR_invalid_sb_members; + } + + for (unsigned i = 0; i < sb->nr_devices; i++) { + int ret = validate_member(err, members_v2_get(mi, i), sb, i); + if (ret) + return ret; + } + + return 0; +} + +const struct bch_sb_field_ops bch_sb_field_ops_members_v2 = { + .validate = bch2_sb_members_v2_validate, + .to_text = bch2_sb_members_v2_to_text, +}; + +void bch2_sb_members_from_cpu(struct bch_fs *c) +{ + struct bch_sb_field_members_v2 *mi = bch2_sb_field_get(c->disk_sb.sb, members_v2); + struct bch_dev *ca; + unsigned i, e; + + rcu_read_lock(); + for_each_member_device_rcu(ca, c, i, NULL) { + struct bch_member *m = __bch2_members_v2_get_mut(mi, i); + + for (e = 0; e < BCH_MEMBER_ERROR_NR; e++) + m->errors[e] = cpu_to_le64(atomic64_read(&ca->errors[e])); + } + rcu_read_unlock(); +} + +void bch2_dev_io_errors_to_text(struct printbuf *out, struct bch_dev *ca) +{ + struct bch_fs *c = ca->fs; + struct bch_member m; + + mutex_lock(&ca->fs->sb_lock); + m = bch2_sb_member_get(c->disk_sb.sb, ca->dev_idx); + mutex_unlock(&ca->fs->sb_lock); + + printbuf_tabstop_push(out, 12); + + prt_str(out, "IO errors since filesystem creation"); + prt_newline(out); + + printbuf_indent_add(out, 2); + for (unsigned i = 0; i < BCH_MEMBER_ERROR_NR; i++) { + prt_printf(out, "%s:", bch2_member_error_strs[i]); + prt_tab(out); + prt_u64(out, atomic64_read(&ca->errors[i])); + prt_newline(out); + } + printbuf_indent_sub(out, 2); + + prt_str(out, "IO errors since "); + bch2_pr_time_units(out, (ktime_get_real_seconds() - le64_to_cpu(m.errors_reset_time)) * NSEC_PER_SEC); + prt_str(out, " ago"); + prt_newline(out); + + printbuf_indent_add(out, 2); + for (unsigned i = 0; i < BCH_MEMBER_ERROR_NR; i++) { + prt_printf(out, "%s:", bch2_member_error_strs[i]); + prt_tab(out); + prt_u64(out, atomic64_read(&ca->errors[i]) - le64_to_cpu(m.errors_at_reset[i])); + prt_newline(out); + } + printbuf_indent_sub(out, 2); +} + +void bch2_dev_errors_reset(struct bch_dev *ca) +{ + struct bch_fs *c = ca->fs; + struct bch_member *m; + + mutex_lock(&c->sb_lock); + m = bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx); + for (unsigned i = 0; i < ARRAY_SIZE(m->errors_at_reset); i++) + m->errors_at_reset[i] = cpu_to_le64(atomic64_read(&ca->errors[i])); + m->errors_reset_time = ktime_get_real_seconds(); + + bch2_write_super(c); + mutex_unlock(&c->sb_lock); +} diff --git a/fs/bcachefs/sb-members.h b/fs/bcachefs/sb-members.h new file mode 100644 index 000000000000..03613e3eb8e3 --- /dev/null +++ b/fs/bcachefs/sb-members.h @@ -0,0 +1,227 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_SB_MEMBERS_H +#define _BCACHEFS_SB_MEMBERS_H + +extern char * const bch2_member_error_strs[]; + +static inline struct bch_member * +__bch2_members_v2_get_mut(struct bch_sb_field_members_v2 *mi, unsigned i) +{ + return (void *) mi->_members + (i * le16_to_cpu(mi->member_bytes)); +} + +int bch2_sb_members_v2_init(struct bch_fs *c); +int bch2_sb_members_cpy_v2_v1(struct bch_sb_handle *disk_sb); +struct bch_member *bch2_members_v2_get_mut(struct bch_sb *sb, int i); +struct bch_member bch2_sb_member_get(struct bch_sb *sb, int i); + +static inline bool bch2_dev_is_online(struct bch_dev *ca) +{ + return !percpu_ref_is_zero(&ca->io_ref); +} + +static inline bool bch2_dev_is_readable(struct bch_dev *ca) +{ + return bch2_dev_is_online(ca) && + ca->mi.state != BCH_MEMBER_STATE_failed; +} + +static inline bool bch2_dev_get_ioref(struct bch_dev *ca, int rw) +{ + if (!percpu_ref_tryget(&ca->io_ref)) + return false; + + if (ca->mi.state == BCH_MEMBER_STATE_rw || + (ca->mi.state == BCH_MEMBER_STATE_ro && rw == READ)) + return true; + + percpu_ref_put(&ca->io_ref); + return false; +} + +static inline unsigned dev_mask_nr(const struct bch_devs_mask *devs) +{ + return bitmap_weight(devs->d, BCH_SB_MEMBERS_MAX); +} + +static inline bool bch2_dev_list_has_dev(struct bch_devs_list devs, + unsigned dev) +{ + unsigned i; + + for (i = 0; i < devs.nr; i++) + if (devs.devs[i] == dev) + return true; + + return false; +} + +static inline void bch2_dev_list_drop_dev(struct bch_devs_list *devs, + unsigned dev) +{ + unsigned i; + + for (i = 0; i < devs->nr; i++) + if (devs->devs[i] == dev) { + array_remove_item(devs->devs, devs->nr, i); + return; + } +} + +static inline void bch2_dev_list_add_dev(struct bch_devs_list *devs, + unsigned dev) +{ + if (!bch2_dev_list_has_dev(*devs, dev)) { + BUG_ON(devs->nr >= ARRAY_SIZE(devs->devs)); + devs->devs[devs->nr++] = dev; + } +} + +static inline struct bch_devs_list bch2_dev_list_single(unsigned dev) +{ + return (struct bch_devs_list) { .nr = 1, .devs[0] = dev }; +} + +static inline struct bch_dev *__bch2_next_dev(struct bch_fs *c, unsigned *iter, + const struct bch_devs_mask *mask) +{ + struct bch_dev *ca = NULL; + + while ((*iter = mask + ? find_next_bit(mask->d, c->sb.nr_devices, *iter) + : *iter) < c->sb.nr_devices && + !(ca = rcu_dereference_check(c->devs[*iter], + lockdep_is_held(&c->state_lock)))) + (*iter)++; + + return ca; +} + +#define for_each_member_device_rcu(ca, c, iter, mask) \ + for ((iter) = 0; ((ca) = __bch2_next_dev((c), &(iter), mask)); (iter)++) + +static inline struct bch_dev *bch2_get_next_dev(struct bch_fs *c, unsigned *iter) +{ + struct bch_dev *ca; + + rcu_read_lock(); + if ((ca = __bch2_next_dev(c, iter, NULL))) + percpu_ref_get(&ca->ref); + rcu_read_unlock(); + + return ca; +} + +/* + * If you break early, you must drop your ref on the current device + */ +#define for_each_member_device(ca, c, iter) \ + for ((iter) = 0; \ + (ca = bch2_get_next_dev(c, &(iter))); \ + percpu_ref_put(&ca->ref), (iter)++) + +static inline struct bch_dev *bch2_get_next_online_dev(struct bch_fs *c, + unsigned *iter, + int state_mask) +{ + struct bch_dev *ca; + + rcu_read_lock(); + while ((ca = __bch2_next_dev(c, iter, NULL)) && + (!((1 << ca->mi.state) & state_mask) || + !percpu_ref_tryget(&ca->io_ref))) + (*iter)++; + rcu_read_unlock(); + + return ca; +} + +#define __for_each_online_member(ca, c, iter, state_mask) \ + for ((iter) = 0; \ + (ca = bch2_get_next_online_dev(c, &(iter), state_mask)); \ + percpu_ref_put(&ca->io_ref), (iter)++) + +#define for_each_online_member(ca, c, iter) \ + __for_each_online_member(ca, c, iter, ~0) + +#define for_each_rw_member(ca, c, iter) \ + __for_each_online_member(ca, c, iter, 1 << BCH_MEMBER_STATE_rw) + +#define for_each_readable_member(ca, c, iter) \ + __for_each_online_member(ca, c, iter, \ + (1 << BCH_MEMBER_STATE_rw)|(1 << BCH_MEMBER_STATE_ro)) + +/* + * If a key exists that references a device, the device won't be going away and + * we can omit rcu_read_lock(): + */ +static inline struct bch_dev *bch_dev_bkey_exists(const struct bch_fs *c, unsigned idx) +{ + EBUG_ON(idx >= c->sb.nr_devices || !c->devs[idx]); + + return rcu_dereference_check(c->devs[idx], 1); +} + +static inline struct bch_dev *bch_dev_locked(struct bch_fs *c, unsigned idx) +{ + EBUG_ON(idx >= c->sb.nr_devices || !c->devs[idx]); + + return rcu_dereference_protected(c->devs[idx], + lockdep_is_held(&c->sb_lock) || + lockdep_is_held(&c->state_lock)); +} + +/* XXX kill, move to struct bch_fs */ +static inline struct bch_devs_mask bch2_online_devs(struct bch_fs *c) +{ + struct bch_devs_mask devs; + struct bch_dev *ca; + unsigned i; + + memset(&devs, 0, sizeof(devs)); + for_each_online_member(ca, c, i) + __set_bit(ca->dev_idx, devs.d); + return devs; +} + +extern const struct bch_sb_field_ops bch_sb_field_ops_members_v1; +extern const struct bch_sb_field_ops bch_sb_field_ops_members_v2; + +static inline bool bch2_member_exists(struct bch_member *m) +{ + return !bch2_is_zero(&m->uuid, sizeof(m->uuid)); +} + +static inline bool bch2_dev_exists(struct bch_sb *sb, unsigned dev) +{ + if (dev < sb->nr_devices) { + struct bch_member m = bch2_sb_member_get(sb, dev); + return bch2_member_exists(&m); + } + return false; +} + +static inline struct bch_member_cpu bch2_mi_to_cpu(struct bch_member *mi) +{ + return (struct bch_member_cpu) { + .nbuckets = le64_to_cpu(mi->nbuckets), + .first_bucket = le16_to_cpu(mi->first_bucket), + .bucket_size = le16_to_cpu(mi->bucket_size), + .group = BCH_MEMBER_GROUP(mi), + .state = BCH_MEMBER_STATE(mi), + .discard = BCH_MEMBER_DISCARD(mi), + .data_allowed = BCH_MEMBER_DATA_ALLOWED(mi), + .durability = BCH_MEMBER_DURABILITY(mi) + ? BCH_MEMBER_DURABILITY(mi) - 1 + : 1, + .freespace_initialized = BCH_MEMBER_FREESPACE_INITIALIZED(mi), + .valid = bch2_member_exists(mi), + }; +} + +void bch2_sb_members_from_cpu(struct bch_fs *); + +void bch2_dev_io_errors_to_text(struct printbuf *, struct bch_dev *); +void bch2_dev_errors_reset(struct bch_dev *); + +#endif /* _BCACHEFS_SB_MEMBERS_H */ diff --git a/fs/bcachefs/seqmutex.h b/fs/bcachefs/seqmutex.h new file mode 100644 index 000000000000..c1860d8163fb --- /dev/null +++ b/fs/bcachefs/seqmutex.h @@ -0,0 +1,48 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_SEQMUTEX_H +#define _BCACHEFS_SEQMUTEX_H + +#include <linux/mutex.h> + +struct seqmutex { + struct mutex lock; + u32 seq; +}; + +#define seqmutex_init(_lock) mutex_init(&(_lock)->lock) + +static inline bool seqmutex_trylock(struct seqmutex *lock) +{ + return mutex_trylock(&lock->lock); +} + +static inline void seqmutex_lock(struct seqmutex *lock) +{ + mutex_lock(&lock->lock); +} + +static inline void seqmutex_unlock(struct seqmutex *lock) +{ + lock->seq++; + mutex_unlock(&lock->lock); +} + +static inline u32 seqmutex_seq(struct seqmutex *lock) +{ + return lock->seq; +} + +static inline bool seqmutex_relock(struct seqmutex *lock, u32 seq) +{ + if (lock->seq != seq || !mutex_trylock(&lock->lock)) + return false; + + if (lock->seq != seq) { + mutex_unlock(&lock->lock); + return false; + } + + return true; +} + +#endif /* _BCACHEFS_SEQMUTEX_H */ diff --git a/fs/bcachefs/siphash.c b/fs/bcachefs/siphash.c new file mode 100644 index 000000000000..dc1a27cc31cd --- /dev/null +++ b/fs/bcachefs/siphash.c @@ -0,0 +1,173 @@ +// SPDX-License-Identifier: BSD-3-Clause +/* $OpenBSD: siphash.c,v 1.3 2015/02/20 11:51:03 tedu Exp $ */ + +/*- + * Copyright (c) 2013 Andre Oppermann <andre@FreeBSD.org> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. The name of the author may not be used to endorse or promote + * products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* + * SipHash is a family of PRFs SipHash-c-d where the integer parameters c and d + * are the number of compression rounds and the number of finalization rounds. + * A compression round is identical to a finalization round and this round + * function is called SipRound. Given a 128-bit key k and a (possibly empty) + * byte string m, SipHash-c-d returns a 64-bit value SipHash-c-d(k; m). + * + * Implemented from the paper "SipHash: a fast short-input PRF", 2012.09.18, + * by Jean-Philippe Aumasson and Daniel J. Bernstein, + * Permanent Document ID b9a943a805fbfc6fde808af9fc0ecdfa + * https://131002.net/siphash/siphash.pdf + * https://131002.net/siphash/ + */ + +#include <asm/byteorder.h> +#include <asm/unaligned.h> +#include <linux/bitops.h> +#include <linux/string.h> + +#include "siphash.h" + +static void SipHash_Rounds(SIPHASH_CTX *ctx, int rounds) +{ + while (rounds--) { + ctx->v[0] += ctx->v[1]; + ctx->v[2] += ctx->v[3]; + ctx->v[1] = rol64(ctx->v[1], 13); + ctx->v[3] = rol64(ctx->v[3], 16); + + ctx->v[1] ^= ctx->v[0]; + ctx->v[3] ^= ctx->v[2]; + ctx->v[0] = rol64(ctx->v[0], 32); + + ctx->v[2] += ctx->v[1]; + ctx->v[0] += ctx->v[3]; + ctx->v[1] = rol64(ctx->v[1], 17); + ctx->v[3] = rol64(ctx->v[3], 21); + + ctx->v[1] ^= ctx->v[2]; + ctx->v[3] ^= ctx->v[0]; + ctx->v[2] = rol64(ctx->v[2], 32); + } +} + +static void SipHash_CRounds(SIPHASH_CTX *ctx, const void *ptr, int rounds) +{ + u64 m = get_unaligned_le64(ptr); + + ctx->v[3] ^= m; + SipHash_Rounds(ctx, rounds); + ctx->v[0] ^= m; +} + +void SipHash_Init(SIPHASH_CTX *ctx, const SIPHASH_KEY *key) +{ + u64 k0, k1; + + k0 = le64_to_cpu(key->k0); + k1 = le64_to_cpu(key->k1); + + ctx->v[0] = 0x736f6d6570736575ULL ^ k0; + ctx->v[1] = 0x646f72616e646f6dULL ^ k1; + ctx->v[2] = 0x6c7967656e657261ULL ^ k0; + ctx->v[3] = 0x7465646279746573ULL ^ k1; + + memset(ctx->buf, 0, sizeof(ctx->buf)); + ctx->bytes = 0; +} + +void SipHash_Update(SIPHASH_CTX *ctx, int rc, int rf, + const void *src, size_t len) +{ + const u8 *ptr = src; + size_t left, used; + + if (len == 0) + return; + + used = ctx->bytes % sizeof(ctx->buf); + ctx->bytes += len; + + if (used > 0) { + left = sizeof(ctx->buf) - used; + + if (len >= left) { + memcpy(&ctx->buf[used], ptr, left); + SipHash_CRounds(ctx, ctx->buf, rc); + len -= left; + ptr += left; + } else { + memcpy(&ctx->buf[used], ptr, len); + return; + } + } + + while (len >= sizeof(ctx->buf)) { + SipHash_CRounds(ctx, ptr, rc); + len -= sizeof(ctx->buf); + ptr += sizeof(ctx->buf); + } + + if (len > 0) + memcpy(&ctx->buf[used], ptr, len); +} + +void SipHash_Final(void *dst, SIPHASH_CTX *ctx, int rc, int rf) +{ + u64 r; + + r = SipHash_End(ctx, rc, rf); + + *((__le64 *) dst) = cpu_to_le64(r); +} + +u64 SipHash_End(SIPHASH_CTX *ctx, int rc, int rf) +{ + u64 r; + size_t left, used; + + used = ctx->bytes % sizeof(ctx->buf); + left = sizeof(ctx->buf) - used; + memset(&ctx->buf[used], 0, left - 1); + ctx->buf[7] = ctx->bytes; + + SipHash_CRounds(ctx, ctx->buf, rc); + ctx->v[2] ^= 0xff; + SipHash_Rounds(ctx, rf); + + r = (ctx->v[0] ^ ctx->v[1]) ^ (ctx->v[2] ^ ctx->v[3]); + memset(ctx, 0, sizeof(*ctx)); + return r; +} + +u64 SipHash(const SIPHASH_KEY *key, int rc, int rf, const void *src, size_t len) +{ + SIPHASH_CTX ctx; + + SipHash_Init(&ctx, key); + SipHash_Update(&ctx, rc, rf, src, len); + return SipHash_End(&ctx, rc, rf); +} diff --git a/fs/bcachefs/siphash.h b/fs/bcachefs/siphash.h new file mode 100644 index 000000000000..3dfaf34a43b2 --- /dev/null +++ b/fs/bcachefs/siphash.h @@ -0,0 +1,87 @@ +/* SPDX-License-Identifier: BSD-3-Clause */ +/* $OpenBSD: siphash.h,v 1.5 2015/02/20 11:51:03 tedu Exp $ */ +/*- + * Copyright (c) 2013 Andre Oppermann <andre@FreeBSD.org> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. The name of the author may not be used to endorse or promote + * products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +/* + * SipHash is a family of pseudorandom functions (a.k.a. keyed hash functions) + * optimized for speed on short messages returning a 64bit hash/digest value. + * + * The number of rounds is defined during the initialization: + * SipHash24_Init() for the fast and resonable strong version + * SipHash48_Init() for the strong version (half as fast) + * + * struct SIPHASH_CTX ctx; + * SipHash24_Init(&ctx); + * SipHash_SetKey(&ctx, "16bytes long key"); + * SipHash_Update(&ctx, pointer_to_string, length_of_string); + * SipHash_Final(output, &ctx); + */ + +#ifndef _SIPHASH_H_ +#define _SIPHASH_H_ + +#include <linux/types.h> + +#define SIPHASH_BLOCK_LENGTH 8 +#define SIPHASH_KEY_LENGTH 16 +#define SIPHASH_DIGEST_LENGTH 8 + +typedef struct _SIPHASH_CTX { + u64 v[4]; + u8 buf[SIPHASH_BLOCK_LENGTH]; + u32 bytes; +} SIPHASH_CTX; + +typedef struct { + __le64 k0; + __le64 k1; +} SIPHASH_KEY; + +void SipHash_Init(SIPHASH_CTX *, const SIPHASH_KEY *); +void SipHash_Update(SIPHASH_CTX *, int, int, const void *, size_t); +u64 SipHash_End(SIPHASH_CTX *, int, int); +void SipHash_Final(void *, SIPHASH_CTX *, int, int); +u64 SipHash(const SIPHASH_KEY *, int, int, const void *, size_t); + +#define SipHash24_Init(_c, _k) SipHash_Init((_c), (_k)) +#define SipHash24_Update(_c, _p, _l) SipHash_Update((_c), 2, 4, (_p), (_l)) +#define SipHash24_End(_d) SipHash_End((_d), 2, 4) +#define SipHash24_Final(_d, _c) SipHash_Final((_d), (_c), 2, 4) +#define SipHash24(_k, _p, _l) SipHash((_k), 2, 4, (_p), (_l)) + +#define SipHash48_Init(_c, _k) SipHash_Init((_c), (_k)) +#define SipHash48_Update(_c, _p, _l) SipHash_Update((_c), 4, 8, (_p), (_l)) +#define SipHash48_End(_d) SipHash_End((_d), 4, 8) +#define SipHash48_Final(_d, _c) SipHash_Final((_d), (_c), 4, 8) +#define SipHash48(_k, _p, _l) SipHash((_k), 4, 8, (_p), (_l)) + +#endif /* _SIPHASH_H_ */ diff --git a/fs/bcachefs/six.c b/fs/bcachefs/six.c new file mode 100644 index 000000000000..97790445e67a --- /dev/null +++ b/fs/bcachefs/six.c @@ -0,0 +1,920 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <linux/export.h> +#include <linux/log2.h> +#include <linux/percpu.h> +#include <linux/preempt.h> +#include <linux/rcupdate.h> +#include <linux/sched.h> +#include <linux/sched/clock.h> +#include <linux/sched/rt.h> +#include <linux/sched/task.h> +#include <linux/slab.h> + +#include <trace/events/lock.h> + +#include "six.h" + +#ifdef DEBUG +#define EBUG_ON(cond) BUG_ON(cond) +#else +#define EBUG_ON(cond) do {} while (0) +#endif + +#define six_acquire(l, t, r, ip) lock_acquire(l, 0, t, r, 1, NULL, ip) +#define six_release(l, ip) lock_release(l, ip) + +static void do_six_unlock_type(struct six_lock *lock, enum six_lock_type type); + +#define SIX_LOCK_HELD_read_OFFSET 0 +#define SIX_LOCK_HELD_read ~(~0U << 26) +#define SIX_LOCK_HELD_intent (1U << 26) +#define SIX_LOCK_HELD_write (1U << 27) +#define SIX_LOCK_WAITING_read (1U << (28 + SIX_LOCK_read)) +#define SIX_LOCK_WAITING_write (1U << (28 + SIX_LOCK_write)) +#define SIX_LOCK_NOSPIN (1U << 31) + +struct six_lock_vals { + /* Value we add to the lock in order to take the lock: */ + u32 lock_val; + + /* If the lock has this value (used as a mask), taking the lock fails: */ + u32 lock_fail; + + /* Mask that indicates lock is held for this type: */ + u32 held_mask; + + /* Waitlist we wakeup when releasing the lock: */ + enum six_lock_type unlock_wakeup; +}; + +static const struct six_lock_vals l[] = { + [SIX_LOCK_read] = { + .lock_val = 1U << SIX_LOCK_HELD_read_OFFSET, + .lock_fail = SIX_LOCK_HELD_write, + .held_mask = SIX_LOCK_HELD_read, + .unlock_wakeup = SIX_LOCK_write, + }, + [SIX_LOCK_intent] = { + .lock_val = SIX_LOCK_HELD_intent, + .lock_fail = SIX_LOCK_HELD_intent, + .held_mask = SIX_LOCK_HELD_intent, + .unlock_wakeup = SIX_LOCK_intent, + }, + [SIX_LOCK_write] = { + .lock_val = SIX_LOCK_HELD_write, + .lock_fail = SIX_LOCK_HELD_read, + .held_mask = SIX_LOCK_HELD_write, + .unlock_wakeup = SIX_LOCK_read, + }, +}; + +static inline void six_set_bitmask(struct six_lock *lock, u32 mask) +{ + if ((atomic_read(&lock->state) & mask) != mask) + atomic_or(mask, &lock->state); +} + +static inline void six_clear_bitmask(struct six_lock *lock, u32 mask) +{ + if (atomic_read(&lock->state) & mask) + atomic_and(~mask, &lock->state); +} + +static inline void six_set_owner(struct six_lock *lock, enum six_lock_type type, + u32 old, struct task_struct *owner) +{ + if (type != SIX_LOCK_intent) + return; + + if (!(old & SIX_LOCK_HELD_intent)) { + EBUG_ON(lock->owner); + lock->owner = owner; + } else { + EBUG_ON(lock->owner != current); + } +} + +static inline unsigned pcpu_read_count(struct six_lock *lock) +{ + unsigned read_count = 0; + int cpu; + + for_each_possible_cpu(cpu) + read_count += *per_cpu_ptr(lock->readers, cpu); + return read_count; +} + +/* + * __do_six_trylock() - main trylock routine + * + * Returns 1 on success, 0 on failure + * + * In percpu reader mode, a failed trylock may cause a spurious trylock failure + * for anoter thread taking the competing lock type, and we may havve to do a + * wakeup: when a wakeup is required, we return -1 - wakeup_type. + */ +static int __do_six_trylock(struct six_lock *lock, enum six_lock_type type, + struct task_struct *task, bool try) +{ + int ret; + u32 old; + + EBUG_ON(type == SIX_LOCK_write && lock->owner != task); + EBUG_ON(type == SIX_LOCK_write && + (try != !(atomic_read(&lock->state) & SIX_LOCK_HELD_write))); + + /* + * Percpu reader mode: + * + * The basic idea behind this algorithm is that you can implement a lock + * between two threads without any atomics, just memory barriers: + * + * For two threads you'll need two variables, one variable for "thread a + * has the lock" and another for "thread b has the lock". + * + * To take the lock, a thread sets its variable indicating that it holds + * the lock, then issues a full memory barrier, then reads from the + * other thread's variable to check if the other thread thinks it has + * the lock. If we raced, we backoff and retry/sleep. + * + * Failure to take the lock may cause a spurious trylock failure in + * another thread, because we temporarily set the lock to indicate that + * we held it. This would be a problem for a thread in six_lock(), when + * they are calling trylock after adding themself to the waitlist and + * prior to sleeping. + * + * Therefore, if we fail to get the lock, and there were waiters of the + * type we conflict with, we will have to issue a wakeup. + * + * Since we may be called under wait_lock (and by the wakeup code + * itself), we return that the wakeup has to be done instead of doing it + * here. + */ + if (type == SIX_LOCK_read && lock->readers) { + preempt_disable(); + this_cpu_inc(*lock->readers); /* signal that we own lock */ + + smp_mb(); + + old = atomic_read(&lock->state); + ret = !(old & l[type].lock_fail); + + this_cpu_sub(*lock->readers, !ret); + preempt_enable(); + + if (!ret) { + smp_mb(); + if (atomic_read(&lock->state) & SIX_LOCK_WAITING_write) + ret = -1 - SIX_LOCK_write; + } + } else if (type == SIX_LOCK_write && lock->readers) { + if (try) { + atomic_add(SIX_LOCK_HELD_write, &lock->state); + smp_mb__after_atomic(); + } + + ret = !pcpu_read_count(lock); + + if (try && !ret) { + old = atomic_sub_return(SIX_LOCK_HELD_write, &lock->state); + if (old & SIX_LOCK_WAITING_read) + ret = -1 - SIX_LOCK_read; + } + } else { + old = atomic_read(&lock->state); + do { + ret = !(old & l[type].lock_fail); + if (!ret || (type == SIX_LOCK_write && !try)) { + smp_mb(); + break; + } + } while (!atomic_try_cmpxchg_acquire(&lock->state, &old, old + l[type].lock_val)); + + EBUG_ON(ret && !(atomic_read(&lock->state) & l[type].held_mask)); + } + + if (ret > 0) + six_set_owner(lock, type, old, task); + + EBUG_ON(type == SIX_LOCK_write && try && ret <= 0 && + (atomic_read(&lock->state) & SIX_LOCK_HELD_write)); + + return ret; +} + +static void __six_lock_wakeup(struct six_lock *lock, enum six_lock_type lock_type) +{ + struct six_lock_waiter *w, *next; + struct task_struct *task; + bool saw_one; + int ret; +again: + ret = 0; + saw_one = false; + raw_spin_lock(&lock->wait_lock); + + list_for_each_entry_safe(w, next, &lock->wait_list, list) { + if (w->lock_want != lock_type) + continue; + + if (saw_one && lock_type != SIX_LOCK_read) + goto unlock; + saw_one = true; + + ret = __do_six_trylock(lock, lock_type, w->task, false); + if (ret <= 0) + goto unlock; + + /* + * Similar to percpu_rwsem_wake_function(), we need to guard + * against the wakee noticing w->lock_acquired, returning, and + * then exiting before we do the wakeup: + */ + task = get_task_struct(w->task); + __list_del(w->list.prev, w->list.next); + /* + * The release barrier here ensures the ordering of the + * __list_del before setting w->lock_acquired; @w is on the + * stack of the thread doing the waiting and will be reused + * after it sees w->lock_acquired with no other locking: + * pairs with smp_load_acquire() in six_lock_slowpath() + */ + smp_store_release(&w->lock_acquired, true); + wake_up_process(task); + put_task_struct(task); + } + + six_clear_bitmask(lock, SIX_LOCK_WAITING_read << lock_type); +unlock: + raw_spin_unlock(&lock->wait_lock); + + if (ret < 0) { + lock_type = -ret - 1; + goto again; + } +} + +__always_inline +static void six_lock_wakeup(struct six_lock *lock, u32 state, + enum six_lock_type lock_type) +{ + if (lock_type == SIX_LOCK_write && (state & SIX_LOCK_HELD_read)) + return; + + if (!(state & (SIX_LOCK_WAITING_read << lock_type))) + return; + + __six_lock_wakeup(lock, lock_type); +} + +__always_inline +static bool do_six_trylock(struct six_lock *lock, enum six_lock_type type, bool try) +{ + int ret; + + ret = __do_six_trylock(lock, type, current, try); + if (ret < 0) + __six_lock_wakeup(lock, -ret - 1); + + return ret > 0; +} + +/** + * six_trylock_ip - attempt to take a six lock without blocking + * @lock: lock to take + * @type: SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write + * @ip: ip parameter for lockdep/lockstat, i.e. _THIS_IP_ + * + * Return: true on success, false on failure. + */ +bool six_trylock_ip(struct six_lock *lock, enum six_lock_type type, unsigned long ip) +{ + if (!do_six_trylock(lock, type, true)) + return false; + + if (type != SIX_LOCK_write) + six_acquire(&lock->dep_map, 1, type == SIX_LOCK_read, ip); + return true; +} +EXPORT_SYMBOL_GPL(six_trylock_ip); + +/** + * six_relock_ip - attempt to re-take a lock that was held previously + * @lock: lock to take + * @type: SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write + * @seq: lock sequence number obtained from six_lock_seq() while lock was + * held previously + * @ip: ip parameter for lockdep/lockstat, i.e. _THIS_IP_ + * + * Return: true on success, false on failure. + */ +bool six_relock_ip(struct six_lock *lock, enum six_lock_type type, + unsigned seq, unsigned long ip) +{ + if (six_lock_seq(lock) != seq || !six_trylock_ip(lock, type, ip)) + return false; + + if (six_lock_seq(lock) != seq) { + six_unlock_ip(lock, type, ip); + return false; + } + + return true; +} +EXPORT_SYMBOL_GPL(six_relock_ip); + +#ifdef CONFIG_SIX_LOCK_SPIN_ON_OWNER + +static inline bool six_can_spin_on_owner(struct six_lock *lock) +{ + struct task_struct *owner; + bool ret; + + if (need_resched()) + return false; + + rcu_read_lock(); + owner = READ_ONCE(lock->owner); + ret = !owner || owner_on_cpu(owner); + rcu_read_unlock(); + + return ret; +} + +static inline bool six_spin_on_owner(struct six_lock *lock, + struct task_struct *owner, + u64 end_time) +{ + bool ret = true; + unsigned loop = 0; + + rcu_read_lock(); + while (lock->owner == owner) { + /* + * Ensure we emit the owner->on_cpu, dereference _after_ + * checking lock->owner still matches owner. If that fails, + * owner might point to freed memory. If it still matches, + * the rcu_read_lock() ensures the memory stays valid. + */ + barrier(); + + if (!owner_on_cpu(owner) || need_resched()) { + ret = false; + break; + } + + if (!(++loop & 0xf) && (time_after64(sched_clock(), end_time))) { + six_set_bitmask(lock, SIX_LOCK_NOSPIN); + ret = false; + break; + } + + cpu_relax(); + } + rcu_read_unlock(); + + return ret; +} + +static inline bool six_optimistic_spin(struct six_lock *lock, enum six_lock_type type) +{ + struct task_struct *task = current; + u64 end_time; + + if (type == SIX_LOCK_write) + return false; + + preempt_disable(); + if (!six_can_spin_on_owner(lock)) + goto fail; + + if (!osq_lock(&lock->osq)) + goto fail; + + end_time = sched_clock() + 10 * NSEC_PER_USEC; + + while (1) { + struct task_struct *owner; + + /* + * If there's an owner, wait for it to either + * release the lock or go to sleep. + */ + owner = READ_ONCE(lock->owner); + if (owner && !six_spin_on_owner(lock, owner, end_time)) + break; + + if (do_six_trylock(lock, type, false)) { + osq_unlock(&lock->osq); + preempt_enable(); + return true; + } + + /* + * When there's no owner, we might have preempted between the + * owner acquiring the lock and setting the owner field. If + * we're an RT task that will live-lock because we won't let + * the owner complete. + */ + if (!owner && (need_resched() || rt_task(task))) + break; + + /* + * The cpu_relax() call is a compiler barrier which forces + * everything in this loop to be re-loaded. We don't need + * memory barriers as we'll eventually observe the right + * values at the cost of a few extra spins. + */ + cpu_relax(); + } + + osq_unlock(&lock->osq); +fail: + preempt_enable(); + + /* + * If we fell out of the spin path because of need_resched(), + * reschedule now, before we try-lock again. This avoids getting + * scheduled out right after we obtained the lock. + */ + if (need_resched()) + schedule(); + + return false; +} + +#else /* CONFIG_SIX_LOCK_SPIN_ON_OWNER */ + +static inline bool six_optimistic_spin(struct six_lock *lock, enum six_lock_type type) +{ + return false; +} + +#endif + +noinline +static int six_lock_slowpath(struct six_lock *lock, enum six_lock_type type, + struct six_lock_waiter *wait, + six_lock_should_sleep_fn should_sleep_fn, void *p, + unsigned long ip) +{ + int ret = 0; + + if (type == SIX_LOCK_write) { + EBUG_ON(atomic_read(&lock->state) & SIX_LOCK_HELD_write); + atomic_add(SIX_LOCK_HELD_write, &lock->state); + smp_mb__after_atomic(); + } + + trace_contention_begin(lock, 0); + lock_contended(&lock->dep_map, ip); + + if (six_optimistic_spin(lock, type)) + goto out; + + wait->task = current; + wait->lock_want = type; + wait->lock_acquired = false; + + raw_spin_lock(&lock->wait_lock); + six_set_bitmask(lock, SIX_LOCK_WAITING_read << type); + /* + * Retry taking the lock after taking waitlist lock, in case we raced + * with an unlock: + */ + ret = __do_six_trylock(lock, type, current, false); + if (ret <= 0) { + wait->start_time = local_clock(); + + if (!list_empty(&lock->wait_list)) { + struct six_lock_waiter *last = + list_last_entry(&lock->wait_list, + struct six_lock_waiter, list); + + if (time_before_eq64(wait->start_time, last->start_time)) + wait->start_time = last->start_time + 1; + } + + list_add_tail(&wait->list, &lock->wait_list); + } + raw_spin_unlock(&lock->wait_lock); + + if (unlikely(ret > 0)) { + ret = 0; + goto out; + } + + if (unlikely(ret < 0)) { + __six_lock_wakeup(lock, -ret - 1); + ret = 0; + } + + while (1) { + set_current_state(TASK_UNINTERRUPTIBLE); + + /* + * Ensures that writes to the waitlist entry happen after we see + * wait->lock_acquired: pairs with the smp_store_release in + * __six_lock_wakeup + */ + if (smp_load_acquire(&wait->lock_acquired)) + break; + + ret = should_sleep_fn ? should_sleep_fn(lock, p) : 0; + if (unlikely(ret)) { + bool acquired; + + /* + * If should_sleep_fn() returns an error, we are + * required to return that error even if we already + * acquired the lock - should_sleep_fn() might have + * modified external state (e.g. when the deadlock cycle + * detector in bcachefs issued a transaction restart) + */ + raw_spin_lock(&lock->wait_lock); + acquired = wait->lock_acquired; + if (!acquired) + list_del(&wait->list); + raw_spin_unlock(&lock->wait_lock); + + if (unlikely(acquired)) + do_six_unlock_type(lock, type); + break; + } + + schedule(); + } + + __set_current_state(TASK_RUNNING); +out: + if (ret && type == SIX_LOCK_write) { + six_clear_bitmask(lock, SIX_LOCK_HELD_write); + six_lock_wakeup(lock, atomic_read(&lock->state), SIX_LOCK_read); + } + trace_contention_end(lock, 0); + + return ret; +} + +/** + * six_lock_ip_waiter - take a lock, with full waitlist interface + * @lock: lock to take + * @type: SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write + * @wait: pointer to wait object, which will be added to lock's waitlist + * @should_sleep_fn: callback run after adding to waitlist, immediately prior + * to scheduling + * @p: passed through to @should_sleep_fn + * @ip: ip parameter for lockdep/lockstat, i.e. _THIS_IP_ + * + * This is the most general six_lock() variant, with parameters to support full + * cycle detection for deadlock avoidance. + * + * The code calling this function must implement tracking of held locks, and the + * @wait object should be embedded into the struct that tracks held locks - + * which must also be accessible in a thread-safe way. + * + * @should_sleep_fn should invoke the cycle detector; it should walk each + * lock's waiters, and for each waiter recursively walk their held locks. + * + * When this function must block, @wait will be added to @lock's waitlist before + * calling trylock, and before calling @should_sleep_fn, and @wait will not be + * removed from the lock waitlist until the lock has been successfully acquired, + * or we abort. + * + * @wait.start_time will be monotonically increasing for any given waitlist, and + * thus may be used as a loop cursor. + * + * Return: 0 on success, or the return code from @should_sleep_fn on failure. + */ +int six_lock_ip_waiter(struct six_lock *lock, enum six_lock_type type, + struct six_lock_waiter *wait, + six_lock_should_sleep_fn should_sleep_fn, void *p, + unsigned long ip) +{ + int ret; + + wait->start_time = 0; + + if (type != SIX_LOCK_write) + six_acquire(&lock->dep_map, 0, type == SIX_LOCK_read, ip); + + ret = do_six_trylock(lock, type, true) ? 0 + : six_lock_slowpath(lock, type, wait, should_sleep_fn, p, ip); + + if (ret && type != SIX_LOCK_write) + six_release(&lock->dep_map, ip); + if (!ret) + lock_acquired(&lock->dep_map, ip); + + return ret; +} +EXPORT_SYMBOL_GPL(six_lock_ip_waiter); + +__always_inline +static void do_six_unlock_type(struct six_lock *lock, enum six_lock_type type) +{ + u32 state; + + if (type == SIX_LOCK_intent) + lock->owner = NULL; + + if (type == SIX_LOCK_read && + lock->readers) { + smp_mb(); /* unlock barrier */ + this_cpu_dec(*lock->readers); + smp_mb(); /* between unlocking and checking for waiters */ + state = atomic_read(&lock->state); + } else { + u32 v = l[type].lock_val; + + if (type != SIX_LOCK_read) + v += atomic_read(&lock->state) & SIX_LOCK_NOSPIN; + + EBUG_ON(!(atomic_read(&lock->state) & l[type].held_mask)); + state = atomic_sub_return_release(v, &lock->state); + } + + six_lock_wakeup(lock, state, l[type].unlock_wakeup); +} + +/** + * six_unlock_ip - drop a six lock + * @lock: lock to unlock + * @type: SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write + * @ip: ip parameter for lockdep/lockstat, i.e. _THIS_IP_ + * + * When a lock is held multiple times (because six_lock_incement()) was used), + * this decrements the 'lock held' counter by one. + * + * For example: + * six_lock_read(&foo->lock); read count 1 + * six_lock_increment(&foo->lock, SIX_LOCK_read); read count 2 + * six_lock_unlock(&foo->lock, SIX_LOCK_read); read count 1 + * six_lock_unlock(&foo->lock, SIX_LOCK_read); read count 0 + */ +void six_unlock_ip(struct six_lock *lock, enum six_lock_type type, unsigned long ip) +{ + EBUG_ON(type == SIX_LOCK_write && + !(atomic_read(&lock->state) & SIX_LOCK_HELD_intent)); + EBUG_ON((type == SIX_LOCK_write || + type == SIX_LOCK_intent) && + lock->owner != current); + + if (type != SIX_LOCK_write) + six_release(&lock->dep_map, ip); + else + lock->seq++; + + if (type == SIX_LOCK_intent && + lock->intent_lock_recurse) { + --lock->intent_lock_recurse; + return; + } + + do_six_unlock_type(lock, type); +} +EXPORT_SYMBOL_GPL(six_unlock_ip); + +/** + * six_lock_downgrade - convert an intent lock to a read lock + * @lock: lock to dowgrade + * + * @lock will have read count incremented and intent count decremented + */ +void six_lock_downgrade(struct six_lock *lock) +{ + six_lock_increment(lock, SIX_LOCK_read); + six_unlock_intent(lock); +} +EXPORT_SYMBOL_GPL(six_lock_downgrade); + +/** + * six_lock_tryupgrade - attempt to convert read lock to an intent lock + * @lock: lock to upgrade + * + * On success, @lock will have intent count incremented and read count + * decremented + * + * Return: true on success, false on failure + */ +bool six_lock_tryupgrade(struct six_lock *lock) +{ + u32 old = atomic_read(&lock->state), new; + + do { + new = old; + + if (new & SIX_LOCK_HELD_intent) + return false; + + if (!lock->readers) { + EBUG_ON(!(new & SIX_LOCK_HELD_read)); + new -= l[SIX_LOCK_read].lock_val; + } + + new |= SIX_LOCK_HELD_intent; + } while (!atomic_try_cmpxchg_acquire(&lock->state, &old, new)); + + if (lock->readers) + this_cpu_dec(*lock->readers); + + six_set_owner(lock, SIX_LOCK_intent, old, current); + + return true; +} +EXPORT_SYMBOL_GPL(six_lock_tryupgrade); + +/** + * six_trylock_convert - attempt to convert a held lock from one type to another + * @lock: lock to upgrade + * @from: SIX_LOCK_read or SIX_LOCK_intent + * @to: SIX_LOCK_read or SIX_LOCK_intent + * + * On success, @lock will have intent count incremented and read count + * decremented + * + * Return: true on success, false on failure + */ +bool six_trylock_convert(struct six_lock *lock, + enum six_lock_type from, + enum six_lock_type to) +{ + EBUG_ON(to == SIX_LOCK_write || from == SIX_LOCK_write); + + if (to == from) + return true; + + if (to == SIX_LOCK_read) { + six_lock_downgrade(lock); + return true; + } else { + return six_lock_tryupgrade(lock); + } +} +EXPORT_SYMBOL_GPL(six_trylock_convert); + +/** + * six_lock_increment - increase held lock count on a lock that is already held + * @lock: lock to increment + * @type: SIX_LOCK_read or SIX_LOCK_intent + * + * @lock must already be held, with a lock type that is greater than or equal to + * @type + * + * A corresponding six_unlock_type() call will be required for @lock to be fully + * unlocked. + */ +void six_lock_increment(struct six_lock *lock, enum six_lock_type type) +{ + six_acquire(&lock->dep_map, 0, type == SIX_LOCK_read, _RET_IP_); + + /* XXX: assert already locked, and that we don't overflow: */ + + switch (type) { + case SIX_LOCK_read: + if (lock->readers) { + this_cpu_inc(*lock->readers); + } else { + EBUG_ON(!(atomic_read(&lock->state) & + (SIX_LOCK_HELD_read| + SIX_LOCK_HELD_intent))); + atomic_add(l[type].lock_val, &lock->state); + } + break; + case SIX_LOCK_intent: + EBUG_ON(!(atomic_read(&lock->state) & SIX_LOCK_HELD_intent)); + lock->intent_lock_recurse++; + break; + case SIX_LOCK_write: + BUG(); + break; + } +} +EXPORT_SYMBOL_GPL(six_lock_increment); + +/** + * six_lock_wakeup_all - wake up all waiters on @lock + * @lock: lock to wake up waiters for + * + * Wakeing up waiters will cause them to re-run should_sleep_fn, which may then + * abort the lock operation. + * + * This function is never needed in a bug-free program; it's only useful in + * debug code, e.g. to determine if a cycle detector is at fault. + */ +void six_lock_wakeup_all(struct six_lock *lock) +{ + u32 state = atomic_read(&lock->state); + struct six_lock_waiter *w; + + six_lock_wakeup(lock, state, SIX_LOCK_read); + six_lock_wakeup(lock, state, SIX_LOCK_intent); + six_lock_wakeup(lock, state, SIX_LOCK_write); + + raw_spin_lock(&lock->wait_lock); + list_for_each_entry(w, &lock->wait_list, list) + wake_up_process(w->task); + raw_spin_unlock(&lock->wait_lock); +} +EXPORT_SYMBOL_GPL(six_lock_wakeup_all); + +/** + * six_lock_counts - return held lock counts, for each lock type + * @lock: lock to return counters for + * + * Return: the number of times a lock is held for read, intent and write. + */ +struct six_lock_count six_lock_counts(struct six_lock *lock) +{ + struct six_lock_count ret; + + ret.n[SIX_LOCK_read] = !lock->readers + ? atomic_read(&lock->state) & SIX_LOCK_HELD_read + : pcpu_read_count(lock); + ret.n[SIX_LOCK_intent] = !!(atomic_read(&lock->state) & SIX_LOCK_HELD_intent) + + lock->intent_lock_recurse; + ret.n[SIX_LOCK_write] = !!(atomic_read(&lock->state) & SIX_LOCK_HELD_write); + + return ret; +} +EXPORT_SYMBOL_GPL(six_lock_counts); + +/** + * six_lock_readers_add - directly manipulate reader count of a lock + * @lock: lock to add/subtract readers for + * @nr: reader count to add/subtract + * + * When an upper layer is implementing lock reentrency, we may have both read + * and intent locks on the same lock. + * + * When we need to take a write lock, the read locks will cause self-deadlock, + * because six locks themselves do not track which read locks are held by the + * current thread and which are held by a different thread - it does no + * per-thread tracking of held locks. + * + * The upper layer that is tracking held locks may however, if trylock() has + * failed, count up its own read locks, subtract them, take the write lock, and + * then re-add them. + * + * As in any other situation when taking a write lock, @lock must be held for + * intent one (or more) times, so @lock will never be left unlocked. + */ +void six_lock_readers_add(struct six_lock *lock, int nr) +{ + if (lock->readers) { + this_cpu_add(*lock->readers, nr); + } else { + EBUG_ON((int) (atomic_read(&lock->state) & SIX_LOCK_HELD_read) + nr < 0); + /* reader count starts at bit 0 */ + atomic_add(nr, &lock->state); + } +} +EXPORT_SYMBOL_GPL(six_lock_readers_add); + +/** + * six_lock_exit - release resources held by a lock prior to freeing + * @lock: lock to exit + * + * When a lock was initialized in percpu mode (SIX_OLCK_INIT_PCPU), this is + * required to free the percpu read counts. + */ +void six_lock_exit(struct six_lock *lock) +{ + WARN_ON(lock->readers && pcpu_read_count(lock)); + WARN_ON(atomic_read(&lock->state) & SIX_LOCK_HELD_read); + + free_percpu(lock->readers); + lock->readers = NULL; +} +EXPORT_SYMBOL_GPL(six_lock_exit); + +void __six_lock_init(struct six_lock *lock, const char *name, + struct lock_class_key *key, enum six_lock_init_flags flags) +{ + atomic_set(&lock->state, 0); + raw_spin_lock_init(&lock->wait_lock); + INIT_LIST_HEAD(&lock->wait_list); +#ifdef CONFIG_DEBUG_LOCK_ALLOC + debug_check_no_locks_freed((void *) lock, sizeof(*lock)); + lockdep_init_map(&lock->dep_map, name, key, 0); +#endif + + /* + * Don't assume that we have real percpu variables available in + * userspace: + */ +#ifdef __KERNEL__ + if (flags & SIX_LOCK_INIT_PCPU) { + /* + * We don't return an error here on memory allocation failure + * since percpu is an optimization, and locks will work with the + * same semantics in non-percpu mode: callers can check for + * failure if they wish by checking lock->readers, but generally + * will not want to treat it as an error. + */ + lock->readers = alloc_percpu(unsigned); + } +#endif +} +EXPORT_SYMBOL_GPL(__six_lock_init); diff --git a/fs/bcachefs/six.h b/fs/bcachefs/six.h new file mode 100644 index 000000000000..4c268b0b8316 --- /dev/null +++ b/fs/bcachefs/six.h @@ -0,0 +1,393 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef _LINUX_SIX_H +#define _LINUX_SIX_H + +/** + * DOC: SIX locks overview + * + * Shared/intent/exclusive locks: sleepable read/write locks, like rw semaphores + * but with an additional state: read/shared, intent, exclusive/write + * + * The purpose of the intent state is to allow for greater concurrency on tree + * structures without deadlocking. In general, a read can't be upgraded to a + * write lock without deadlocking, so an operation that updates multiple nodes + * will have to take write locks for the full duration of the operation. + * + * But by adding an intent state, which is exclusive with other intent locks but + * not with readers, we can take intent locks at thte start of the operation, + * and then take write locks only for the actual update to each individual + * nodes, without deadlocking. + * + * Example usage: + * six_lock_read(&foo->lock); + * six_unlock_read(&foo->lock); + * + * An intent lock must be held before taking a write lock: + * six_lock_intent(&foo->lock); + * six_lock_write(&foo->lock); + * six_unlock_write(&foo->lock); + * six_unlock_intent(&foo->lock); + * + * Other operations: + * six_trylock_read() + * six_trylock_intent() + * six_trylock_write() + * + * six_lock_downgrade() convert from intent to read + * six_lock_tryupgrade() attempt to convert from read to intent, may fail + * + * There are also interfaces that take the lock type as an enum: + * + * six_lock_type(&foo->lock, SIX_LOCK_read); + * six_trylock_convert(&foo->lock, SIX_LOCK_read, SIX_LOCK_intent) + * six_lock_type(&foo->lock, SIX_LOCK_write); + * six_unlock_type(&foo->lock, SIX_LOCK_write); + * six_unlock_type(&foo->lock, SIX_LOCK_intent); + * + * Lock sequence numbers - unlock(), relock(): + * + * Locks embed sequences numbers, which are incremented on write lock/unlock. + * This allows locks to be dropped and the retaken iff the state they protect + * hasn't changed; this makes it much easier to avoid holding locks while e.g. + * doing IO or allocating memory. + * + * Example usage: + * six_lock_read(&foo->lock); + * u32 seq = six_lock_seq(&foo->lock); + * six_unlock_read(&foo->lock); + * + * some_operation_that_may_block(); + * + * if (six_relock_read(&foo->lock, seq)) { ... } + * + * If the relock operation succeeds, it is as if the lock was never unlocked. + * + * Reentrancy: + * + * Six locks are not by themselves reentrent, but have counters for both the + * read and intent states that can be used to provide reentrency by an upper + * layer that tracks held locks. If a lock is known to already be held in the + * read or intent state, six_lock_increment() can be used to bump the "lock + * held in this state" counter, increasing the number of unlock calls that + * will be required to fully unlock it. + * + * Example usage: + * six_lock_read(&foo->lock); + * six_lock_increment(&foo->lock, SIX_LOCK_read); + * six_unlock_read(&foo->lock); + * six_unlock_read(&foo->lock); + * foo->lock is now fully unlocked. + * + * Since the intent state supercedes read, it's legal to increment the read + * counter when holding an intent lock, but not the reverse. + * + * A lock may only be held once for write: six_lock_increment(.., SIX_LOCK_write) + * is not legal. + * + * should_sleep_fn: + * + * There is a six_lock() variant that takes a function pointer that is called + * immediately prior to schedule() when blocking, and may return an error to + * abort. + * + * One possible use for this feature is when objects being locked are part of + * a cache and may reused, and lock ordering is based on a property of the + * object that will change when the object is reused - i.e. logical key order. + * + * If looking up an object in the cache may race with object reuse, and lock + * ordering is required to prevent deadlock, object reuse may change the + * correct lock order for that object and cause a deadlock. should_sleep_fn + * can be used to check if the object is still the object we want and avoid + * this deadlock. + * + * Wait list entry interface: + * + * There is a six_lock() variant, six_lock_waiter(), that takes a pointer to a + * wait list entry. By embedding six_lock_waiter into another object, and by + * traversing lock waitlists, it is then possible for an upper layer to + * implement full cycle detection for deadlock avoidance. + * + * should_sleep_fn should be used for invoking the cycle detector, walking the + * graph of held locks to check for a deadlock. The upper layer must track + * held locks for each thread, and each thread's held locks must be reachable + * from its six_lock_waiter object. + * + * six_lock_waiter() will add the wait object to the waitlist re-trying taking + * the lock, and before calling should_sleep_fn, and the wait object will not + * be removed from the waitlist until either the lock has been successfully + * acquired, or we aborted because should_sleep_fn returned an error. + * + * Also, six_lock_waiter contains a timestamp, and waiters on a waitlist will + * have timestamps in strictly ascending order - this is so the timestamp can + * be used as a cursor for lock graph traverse. + */ + +#include <linux/lockdep.h> +#include <linux/sched.h> +#include <linux/types.h> + +#ifdef CONFIG_SIX_LOCK_SPIN_ON_OWNER +#include <linux/osq_lock.h> +#endif + +enum six_lock_type { + SIX_LOCK_read, + SIX_LOCK_intent, + SIX_LOCK_write, +}; + +struct six_lock { + atomic_t state; + u32 seq; + unsigned intent_lock_recurse; + struct task_struct *owner; + unsigned __percpu *readers; +#ifdef CONFIG_SIX_LOCK_SPIN_ON_OWNER + struct optimistic_spin_queue osq; +#endif + raw_spinlock_t wait_lock; + struct list_head wait_list; +#ifdef CONFIG_DEBUG_LOCK_ALLOC + struct lockdep_map dep_map; +#endif +}; + +struct six_lock_waiter { + struct list_head list; + struct task_struct *task; + enum six_lock_type lock_want; + bool lock_acquired; + u64 start_time; +}; + +typedef int (*six_lock_should_sleep_fn)(struct six_lock *lock, void *); + +void six_lock_exit(struct six_lock *lock); + +enum six_lock_init_flags { + SIX_LOCK_INIT_PCPU = 1U << 0, +}; + +void __six_lock_init(struct six_lock *lock, const char *name, + struct lock_class_key *key, enum six_lock_init_flags flags); + +/** + * six_lock_init - initialize a six lock + * @lock: lock to initialize + * @flags: optional flags, i.e. SIX_LOCK_INIT_PCPU + */ +#define six_lock_init(lock, flags) \ +do { \ + static struct lock_class_key __key; \ + \ + __six_lock_init((lock), #lock, &__key, flags); \ +} while (0) + +/** + * six_lock_seq - obtain current lock sequence number + * @lock: six_lock to obtain sequence number for + * + * @lock should be held for read or intent, and not write + * + * By saving the lock sequence number, we can unlock @lock and then (typically + * after some blocking operation) attempt to relock it: the relock will succeed + * if the sequence number hasn't changed, meaning no write locks have been taken + * and state corresponding to what @lock protects is still valid. + */ +static inline u32 six_lock_seq(const struct six_lock *lock) +{ + return lock->seq; +} + +bool six_trylock_ip(struct six_lock *lock, enum six_lock_type type, unsigned long ip); + +/** + * six_trylock_type - attempt to take a six lock without blocking + * @lock: lock to take + * @type: SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write + * + * Return: true on success, false on failure. + */ +static inline bool six_trylock_type(struct six_lock *lock, enum six_lock_type type) +{ + return six_trylock_ip(lock, type, _THIS_IP_); +} + +int six_lock_ip_waiter(struct six_lock *lock, enum six_lock_type type, + struct six_lock_waiter *wait, + six_lock_should_sleep_fn should_sleep_fn, void *p, + unsigned long ip); + +/** + * six_lock_waiter - take a lock, with full waitlist interface + * @lock: lock to take + * @type: SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write + * @wait: pointer to wait object, which will be added to lock's waitlist + * @should_sleep_fn: callback run after adding to waitlist, immediately prior + * to scheduling + * @p: passed through to @should_sleep_fn + * + * This is a convenience wrapper around six_lock_ip_waiter(), see that function + * for full documentation. + * + * Return: 0 on success, or the return code from @should_sleep_fn on failure. + */ +static inline int six_lock_waiter(struct six_lock *lock, enum six_lock_type type, + struct six_lock_waiter *wait, + six_lock_should_sleep_fn should_sleep_fn, void *p) +{ + return six_lock_ip_waiter(lock, type, wait, should_sleep_fn, p, _THIS_IP_); +} + +/** + * six_lock_ip - take a six lock lock + * @lock: lock to take + * @type: SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write + * @should_sleep_fn: callback run after adding to waitlist, immediately prior + * to scheduling + * @p: passed through to @should_sleep_fn + * @ip: ip parameter for lockdep/lockstat, i.e. _THIS_IP_ + * + * Return: 0 on success, or the return code from @should_sleep_fn on failure. + */ +static inline int six_lock_ip(struct six_lock *lock, enum six_lock_type type, + six_lock_should_sleep_fn should_sleep_fn, void *p, + unsigned long ip) +{ + struct six_lock_waiter wait; + + return six_lock_ip_waiter(lock, type, &wait, should_sleep_fn, p, ip); +} + +/** + * six_lock_type - take a six lock lock + * @lock: lock to take + * @type: SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write + * @should_sleep_fn: callback run after adding to waitlist, immediately prior + * to scheduling + * @p: passed through to @should_sleep_fn + * + * Return: 0 on success, or the return code from @should_sleep_fn on failure. + */ +static inline int six_lock_type(struct six_lock *lock, enum six_lock_type type, + six_lock_should_sleep_fn should_sleep_fn, void *p) +{ + struct six_lock_waiter wait; + + return six_lock_ip_waiter(lock, type, &wait, should_sleep_fn, p, _THIS_IP_); +} + +bool six_relock_ip(struct six_lock *lock, enum six_lock_type type, + unsigned seq, unsigned long ip); + +/** + * six_relock_type - attempt to re-take a lock that was held previously + * @lock: lock to take + * @type: SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write + * @seq: lock sequence number obtained from six_lock_seq() while lock was + * held previously + * + * Return: true on success, false on failure. + */ +static inline bool six_relock_type(struct six_lock *lock, enum six_lock_type type, + unsigned seq) +{ + return six_relock_ip(lock, type, seq, _THIS_IP_); +} + +void six_unlock_ip(struct six_lock *lock, enum six_lock_type type, unsigned long ip); + +/** + * six_unlock_type - drop a six lock + * @lock: lock to unlock + * @type: SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write + * + * When a lock is held multiple times (because six_lock_incement()) was used), + * this decrements the 'lock held' counter by one. + * + * For example: + * six_lock_read(&foo->lock); read count 1 + * six_lock_increment(&foo->lock, SIX_LOCK_read); read count 2 + * six_lock_unlock(&foo->lock, SIX_LOCK_read); read count 1 + * six_lock_unlock(&foo->lock, SIX_LOCK_read); read count 0 + */ +static inline void six_unlock_type(struct six_lock *lock, enum six_lock_type type) +{ + six_unlock_ip(lock, type, _THIS_IP_); +} + +#define __SIX_LOCK(type) \ +static inline bool six_trylock_ip_##type(struct six_lock *lock, unsigned long ip)\ +{ \ + return six_trylock_ip(lock, SIX_LOCK_##type, ip); \ +} \ + \ +static inline bool six_trylock_##type(struct six_lock *lock) \ +{ \ + return six_trylock_ip(lock, SIX_LOCK_##type, _THIS_IP_); \ +} \ + \ +static inline int six_lock_ip_waiter_##type(struct six_lock *lock, \ + struct six_lock_waiter *wait, \ + six_lock_should_sleep_fn should_sleep_fn, void *p,\ + unsigned long ip) \ +{ \ + return six_lock_ip_waiter(lock, SIX_LOCK_##type, wait, should_sleep_fn, p, ip);\ +} \ + \ +static inline int six_lock_ip_##type(struct six_lock *lock, \ + six_lock_should_sleep_fn should_sleep_fn, void *p, \ + unsigned long ip) \ +{ \ + return six_lock_ip(lock, SIX_LOCK_##type, should_sleep_fn, p, ip);\ +} \ + \ +static inline bool six_relock_ip_##type(struct six_lock *lock, u32 seq, unsigned long ip)\ +{ \ + return six_relock_ip(lock, SIX_LOCK_##type, seq, ip); \ +} \ + \ +static inline bool six_relock_##type(struct six_lock *lock, u32 seq) \ +{ \ + return six_relock_ip(lock, SIX_LOCK_##type, seq, _THIS_IP_); \ +} \ + \ +static inline int six_lock_##type(struct six_lock *lock, \ + six_lock_should_sleep_fn fn, void *p)\ +{ \ + return six_lock_ip_##type(lock, fn, p, _THIS_IP_); \ +} \ + \ +static inline void six_unlock_ip_##type(struct six_lock *lock, unsigned long ip) \ +{ \ + six_unlock_ip(lock, SIX_LOCK_##type, ip); \ +} \ + \ +static inline void six_unlock_##type(struct six_lock *lock) \ +{ \ + six_unlock_ip(lock, SIX_LOCK_##type, _THIS_IP_); \ +} + +__SIX_LOCK(read) +__SIX_LOCK(intent) +__SIX_LOCK(write) +#undef __SIX_LOCK + +void six_lock_downgrade(struct six_lock *); +bool six_lock_tryupgrade(struct six_lock *); +bool six_trylock_convert(struct six_lock *, enum six_lock_type, + enum six_lock_type); + +void six_lock_increment(struct six_lock *, enum six_lock_type); + +void six_lock_wakeup_all(struct six_lock *); + +struct six_lock_count { + unsigned n[3]; +}; + +struct six_lock_count six_lock_counts(struct six_lock *); +void six_lock_readers_add(struct six_lock *, int); + +#endif /* _LINUX_SIX_H */ diff --git a/fs/bcachefs/snapshot.c b/fs/bcachefs/snapshot.c new file mode 100644 index 000000000000..5dac038f0851 --- /dev/null +++ b/fs/bcachefs/snapshot.c @@ -0,0 +1,1713 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" +#include "bkey_buf.h" +#include "btree_key_cache.h" +#include "btree_update.h" +#include "buckets.h" +#include "errcode.h" +#include "error.h" +#include "fs.h" +#include "snapshot.h" + +#include <linux/random.h> + +/* + * Snapshot trees: + * + * Keys in BTREE_ID_snapshot_trees identify a whole tree of snapshot nodes; they + * exist to provide a stable identifier for the whole lifetime of a snapshot + * tree. + */ + +void bch2_snapshot_tree_to_text(struct printbuf *out, struct bch_fs *c, + struct bkey_s_c k) +{ + struct bkey_s_c_snapshot_tree t = bkey_s_c_to_snapshot_tree(k); + + prt_printf(out, "subvol %u root snapshot %u", + le32_to_cpu(t.v->master_subvol), + le32_to_cpu(t.v->root_snapshot)); +} + +int bch2_snapshot_tree_invalid(struct bch_fs *c, struct bkey_s_c k, + enum bkey_invalid_flags flags, + struct printbuf *err) +{ + int ret = 0; + + bkey_fsck_err_on(bkey_gt(k.k->p, POS(0, U32_MAX)) || + bkey_lt(k.k->p, POS(0, 1)), c, err, + snapshot_tree_pos_bad, + "bad pos"); +fsck_err: + return ret; +} + +int bch2_snapshot_tree_lookup(struct btree_trans *trans, u32 id, + struct bch_snapshot_tree *s) +{ + int ret = bch2_bkey_get_val_typed(trans, BTREE_ID_snapshot_trees, POS(0, id), + BTREE_ITER_WITH_UPDATES, snapshot_tree, s); + + if (bch2_err_matches(ret, ENOENT)) + ret = -BCH_ERR_ENOENT_snapshot_tree; + return ret; +} + +struct bkey_i_snapshot_tree * +__bch2_snapshot_tree_create(struct btree_trans *trans) +{ + struct btree_iter iter; + int ret = bch2_bkey_get_empty_slot(trans, &iter, + BTREE_ID_snapshot_trees, POS(0, U32_MAX)); + struct bkey_i_snapshot_tree *s_t; + + if (ret == -BCH_ERR_ENOSPC_btree_slot) + ret = -BCH_ERR_ENOSPC_snapshot_tree; + if (ret) + return ERR_PTR(ret); + + s_t = bch2_bkey_alloc(trans, &iter, 0, snapshot_tree); + ret = PTR_ERR_OR_ZERO(s_t); + bch2_trans_iter_exit(trans, &iter); + return ret ? ERR_PTR(ret) : s_t; +} + +static int bch2_snapshot_tree_create(struct btree_trans *trans, + u32 root_id, u32 subvol_id, u32 *tree_id) +{ + struct bkey_i_snapshot_tree *n_tree = + __bch2_snapshot_tree_create(trans); + + if (IS_ERR(n_tree)) + return PTR_ERR(n_tree); + + n_tree->v.master_subvol = cpu_to_le32(subvol_id); + n_tree->v.root_snapshot = cpu_to_le32(root_id); + *tree_id = n_tree->k.p.offset; + return 0; +} + +/* Snapshot nodes: */ + +static bool bch2_snapshot_is_ancestor_early(struct bch_fs *c, u32 id, u32 ancestor) +{ + struct snapshot_table *t; + + rcu_read_lock(); + t = rcu_dereference(c->snapshots); + + while (id && id < ancestor) + id = __snapshot_t(t, id)->parent; + rcu_read_unlock(); + + return id == ancestor; +} + +static inline u32 get_ancestor_below(struct snapshot_table *t, u32 id, u32 ancestor) +{ + const struct snapshot_t *s = __snapshot_t(t, id); + + if (s->skip[2] <= ancestor) + return s->skip[2]; + if (s->skip[1] <= ancestor) + return s->skip[1]; + if (s->skip[0] <= ancestor) + return s->skip[0]; + return s->parent; +} + +bool __bch2_snapshot_is_ancestor(struct bch_fs *c, u32 id, u32 ancestor) +{ + struct snapshot_table *t; + bool ret; + + EBUG_ON(c->curr_recovery_pass <= BCH_RECOVERY_PASS_check_snapshots); + + rcu_read_lock(); + t = rcu_dereference(c->snapshots); + + while (id && id < ancestor - IS_ANCESTOR_BITMAP) + id = get_ancestor_below(t, id, ancestor); + + if (id && id < ancestor) { + ret = test_bit(ancestor - id - 1, __snapshot_t(t, id)->is_ancestor); + + EBUG_ON(ret != bch2_snapshot_is_ancestor_early(c, id, ancestor)); + } else { + ret = id == ancestor; + } + + rcu_read_unlock(); + + return ret; +} + +static noinline struct snapshot_t *__snapshot_t_mut(struct bch_fs *c, u32 id) +{ + size_t idx = U32_MAX - id; + size_t new_size; + struct snapshot_table *new, *old; + + new_size = max(16UL, roundup_pow_of_two(idx + 1)); + + new = kvzalloc(struct_size(new, s, new_size), GFP_KERNEL); + if (!new) + return NULL; + + old = rcu_dereference_protected(c->snapshots, true); + if (old) + memcpy(new->s, + rcu_dereference_protected(c->snapshots, true)->s, + sizeof(new->s[0]) * c->snapshot_table_size); + + rcu_assign_pointer(c->snapshots, new); + c->snapshot_table_size = new_size; + kvfree_rcu_mightsleep(old); + + return &rcu_dereference_protected(c->snapshots, true)->s[idx]; +} + +static inline struct snapshot_t *snapshot_t_mut(struct bch_fs *c, u32 id) +{ + size_t idx = U32_MAX - id; + + lockdep_assert_held(&c->snapshot_table_lock); + + if (likely(idx < c->snapshot_table_size)) + return &rcu_dereference_protected(c->snapshots, true)->s[idx]; + + return __snapshot_t_mut(c, id); +} + +void bch2_snapshot_to_text(struct printbuf *out, struct bch_fs *c, + struct bkey_s_c k) +{ + struct bkey_s_c_snapshot s = bkey_s_c_to_snapshot(k); + + prt_printf(out, "is_subvol %llu deleted %llu parent %10u children %10u %10u subvol %u tree %u", + BCH_SNAPSHOT_SUBVOL(s.v), + BCH_SNAPSHOT_DELETED(s.v), + le32_to_cpu(s.v->parent), + le32_to_cpu(s.v->children[0]), + le32_to_cpu(s.v->children[1]), + le32_to_cpu(s.v->subvol), + le32_to_cpu(s.v->tree)); + + if (bkey_val_bytes(k.k) > offsetof(struct bch_snapshot, depth)) + prt_printf(out, " depth %u skiplist %u %u %u", + le32_to_cpu(s.v->depth), + le32_to_cpu(s.v->skip[0]), + le32_to_cpu(s.v->skip[1]), + le32_to_cpu(s.v->skip[2])); +} + +int bch2_snapshot_invalid(struct bch_fs *c, struct bkey_s_c k, + enum bkey_invalid_flags flags, + struct printbuf *err) +{ + struct bkey_s_c_snapshot s; + u32 i, id; + int ret = 0; + + bkey_fsck_err_on(bkey_gt(k.k->p, POS(0, U32_MAX)) || + bkey_lt(k.k->p, POS(0, 1)), c, err, + snapshot_pos_bad, + "bad pos"); + + s = bkey_s_c_to_snapshot(k); + + id = le32_to_cpu(s.v->parent); + bkey_fsck_err_on(id && id <= k.k->p.offset, c, err, + snapshot_parent_bad, + "bad parent node (%u <= %llu)", + id, k.k->p.offset); + + bkey_fsck_err_on(le32_to_cpu(s.v->children[0]) < le32_to_cpu(s.v->children[1]), c, err, + snapshot_children_not_normalized, + "children not normalized"); + + bkey_fsck_err_on(s.v->children[0] && s.v->children[0] == s.v->children[1], c, err, + snapshot_child_duplicate, + "duplicate child nodes"); + + for (i = 0; i < 2; i++) { + id = le32_to_cpu(s.v->children[i]); + + bkey_fsck_err_on(id >= k.k->p.offset, c, err, + snapshot_child_bad, + "bad child node (%u >= %llu)", + id, k.k->p.offset); + } + + if (bkey_val_bytes(k.k) > offsetof(struct bch_snapshot, skip)) { + bkey_fsck_err_on(le32_to_cpu(s.v->skip[0]) > le32_to_cpu(s.v->skip[1]) || + le32_to_cpu(s.v->skip[1]) > le32_to_cpu(s.v->skip[2]), c, err, + snapshot_skiplist_not_normalized, + "skiplist not normalized"); + + for (i = 0; i < ARRAY_SIZE(s.v->skip); i++) { + id = le32_to_cpu(s.v->skip[i]); + + bkey_fsck_err_on(id && id < le32_to_cpu(s.v->parent), c, err, + snapshot_skiplist_bad, + "bad skiplist node %u", id); + } + } +fsck_err: + return ret; +} + +static void __set_is_ancestor_bitmap(struct bch_fs *c, u32 id) +{ + struct snapshot_t *t = snapshot_t_mut(c, id); + u32 parent = id; + + while ((parent = bch2_snapshot_parent_early(c, parent)) && + parent - id - 1 < IS_ANCESTOR_BITMAP) + __set_bit(parent - id - 1, t->is_ancestor); +} + +static void set_is_ancestor_bitmap(struct bch_fs *c, u32 id) +{ + mutex_lock(&c->snapshot_table_lock); + __set_is_ancestor_bitmap(c, id); + mutex_unlock(&c->snapshot_table_lock); +} + +int bch2_mark_snapshot(struct btree_trans *trans, + enum btree_id btree, unsigned level, + struct bkey_s_c old, struct bkey_s_c new, + unsigned flags) +{ + struct bch_fs *c = trans->c; + struct snapshot_t *t; + u32 id = new.k->p.offset; + int ret = 0; + + mutex_lock(&c->snapshot_table_lock); + + t = snapshot_t_mut(c, id); + if (!t) { + ret = -BCH_ERR_ENOMEM_mark_snapshot; + goto err; + } + + if (new.k->type == KEY_TYPE_snapshot) { + struct bkey_s_c_snapshot s = bkey_s_c_to_snapshot(new); + + t->parent = le32_to_cpu(s.v->parent); + t->children[0] = le32_to_cpu(s.v->children[0]); + t->children[1] = le32_to_cpu(s.v->children[1]); + t->subvol = BCH_SNAPSHOT_SUBVOL(s.v) ? le32_to_cpu(s.v->subvol) : 0; + t->tree = le32_to_cpu(s.v->tree); + + if (bkey_val_bytes(s.k) > offsetof(struct bch_snapshot, depth)) { + t->depth = le32_to_cpu(s.v->depth); + t->skip[0] = le32_to_cpu(s.v->skip[0]); + t->skip[1] = le32_to_cpu(s.v->skip[1]); + t->skip[2] = le32_to_cpu(s.v->skip[2]); + } else { + t->depth = 0; + t->skip[0] = 0; + t->skip[1] = 0; + t->skip[2] = 0; + } + + __set_is_ancestor_bitmap(c, id); + + if (BCH_SNAPSHOT_DELETED(s.v)) { + set_bit(BCH_FS_NEED_DELETE_DEAD_SNAPSHOTS, &c->flags); + if (c->curr_recovery_pass > BCH_RECOVERY_PASS_delete_dead_snapshots) + bch2_delete_dead_snapshots_async(c); + } + } else { + memset(t, 0, sizeof(*t)); + } +err: + mutex_unlock(&c->snapshot_table_lock); + return ret; +} + +int bch2_snapshot_lookup(struct btree_trans *trans, u32 id, + struct bch_snapshot *s) +{ + return bch2_bkey_get_val_typed(trans, BTREE_ID_snapshots, POS(0, id), + BTREE_ITER_WITH_UPDATES, snapshot, s); +} + +static int bch2_snapshot_live(struct btree_trans *trans, u32 id) +{ + struct bch_snapshot v; + int ret; + + if (!id) + return 0; + + ret = bch2_snapshot_lookup(trans, id, &v); + if (bch2_err_matches(ret, ENOENT)) + bch_err(trans->c, "snapshot node %u not found", id); + if (ret) + return ret; + + return !BCH_SNAPSHOT_DELETED(&v); +} + +/* + * If @k is a snapshot with just one live child, it's part of a linear chain, + * which we consider to be an equivalence class: and then after snapshot + * deletion cleanup, there should only be a single key at a given position in + * this equivalence class. + * + * This sets the equivalence class of @k to be the child's equivalence class, if + * it's part of such a linear chain: this correctly sets equivalence classes on + * startup if we run leaf to root (i.e. in natural key order). + */ +static int bch2_snapshot_set_equiv(struct btree_trans *trans, struct bkey_s_c k) +{ + struct bch_fs *c = trans->c; + unsigned i, nr_live = 0, live_idx = 0; + struct bkey_s_c_snapshot snap; + u32 id = k.k->p.offset, child[2]; + + if (k.k->type != KEY_TYPE_snapshot) + return 0; + + snap = bkey_s_c_to_snapshot(k); + + child[0] = le32_to_cpu(snap.v->children[0]); + child[1] = le32_to_cpu(snap.v->children[1]); + + for (i = 0; i < 2; i++) { + int ret = bch2_snapshot_live(trans, child[i]); + + if (ret < 0) + return ret; + + if (ret) + live_idx = i; + nr_live += ret; + } + + mutex_lock(&c->snapshot_table_lock); + + snapshot_t_mut(c, id)->equiv = nr_live == 1 + ? snapshot_t_mut(c, child[live_idx])->equiv + : id; + + mutex_unlock(&c->snapshot_table_lock); + + return 0; +} + +/* fsck: */ + +static u32 bch2_snapshot_child(struct bch_fs *c, u32 id, unsigned child) +{ + return snapshot_t(c, id)->children[child]; +} + +static u32 bch2_snapshot_left_child(struct bch_fs *c, u32 id) +{ + return bch2_snapshot_child(c, id, 0); +} + +static u32 bch2_snapshot_right_child(struct bch_fs *c, u32 id) +{ + return bch2_snapshot_child(c, id, 1); +} + +static u32 bch2_snapshot_tree_next(struct bch_fs *c, u32 id) +{ + u32 n, parent; + + n = bch2_snapshot_left_child(c, id); + if (n) + return n; + + while ((parent = bch2_snapshot_parent(c, id))) { + n = bch2_snapshot_right_child(c, parent); + if (n && n != id) + return n; + id = parent; + } + + return 0; +} + +static u32 bch2_snapshot_tree_oldest_subvol(struct bch_fs *c, u32 snapshot_root) +{ + u32 id = snapshot_root; + u32 subvol = 0, s; + + while (id) { + s = snapshot_t(c, id)->subvol; + + if (s && (!subvol || s < subvol)) + subvol = s; + + id = bch2_snapshot_tree_next(c, id); + } + + return subvol; +} + +static int bch2_snapshot_tree_master_subvol(struct btree_trans *trans, + u32 snapshot_root, u32 *subvol_id) +{ + struct bch_fs *c = trans->c; + struct btree_iter iter; + struct bkey_s_c k; + struct bkey_s_c_subvolume s; + bool found = false; + int ret; + + for_each_btree_key_norestart(trans, iter, BTREE_ID_subvolumes, POS_MIN, + 0, k, ret) { + if (k.k->type != KEY_TYPE_subvolume) + continue; + + s = bkey_s_c_to_subvolume(k); + if (!bch2_snapshot_is_ancestor(c, le32_to_cpu(s.v->snapshot), snapshot_root)) + continue; + if (!BCH_SUBVOLUME_SNAP(s.v)) { + *subvol_id = s.k->p.offset; + found = true; + break; + } + } + + bch2_trans_iter_exit(trans, &iter); + + if (!ret && !found) { + struct bkey_i_subvolume *u; + + *subvol_id = bch2_snapshot_tree_oldest_subvol(c, snapshot_root); + + u = bch2_bkey_get_mut_typed(trans, &iter, + BTREE_ID_subvolumes, POS(0, *subvol_id), + 0, subvolume); + ret = PTR_ERR_OR_ZERO(u); + if (ret) + return ret; + + SET_BCH_SUBVOLUME_SNAP(&u->v, false); + } + + return ret; +} + +static int check_snapshot_tree(struct btree_trans *trans, + struct btree_iter *iter, + struct bkey_s_c k) +{ + struct bch_fs *c = trans->c; + struct bkey_s_c_snapshot_tree st; + struct bch_snapshot s; + struct bch_subvolume subvol; + struct printbuf buf = PRINTBUF; + u32 root_id; + int ret; + + if (k.k->type != KEY_TYPE_snapshot_tree) + return 0; + + st = bkey_s_c_to_snapshot_tree(k); + root_id = le32_to_cpu(st.v->root_snapshot); + + ret = bch2_snapshot_lookup(trans, root_id, &s); + if (ret && !bch2_err_matches(ret, ENOENT)) + goto err; + + if (fsck_err_on(ret || + root_id != bch2_snapshot_root(c, root_id) || + st.k->p.offset != le32_to_cpu(s.tree), + c, snapshot_tree_to_missing_snapshot, + "snapshot tree points to missing/incorrect snapshot:\n %s", + (bch2_bkey_val_to_text(&buf, c, st.s_c), buf.buf))) { + ret = bch2_btree_delete_at(trans, iter, 0); + goto err; + } + + ret = bch2_subvolume_get(trans, le32_to_cpu(st.v->master_subvol), + false, 0, &subvol); + if (ret && !bch2_err_matches(ret, ENOENT)) + goto err; + + if (fsck_err_on(ret, + c, snapshot_tree_to_missing_subvol, + "snapshot tree points to missing subvolume:\n %s", + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, st.s_c), buf.buf)) || + fsck_err_on(!bch2_snapshot_is_ancestor_early(c, + le32_to_cpu(subvol.snapshot), + root_id), + c, snapshot_tree_to_wrong_subvol, + "snapshot tree points to subvolume that does not point to snapshot in this tree:\n %s", + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, st.s_c), buf.buf)) || + fsck_err_on(BCH_SUBVOLUME_SNAP(&subvol), + c, snapshot_tree_to_snapshot_subvol, + "snapshot tree points to snapshot subvolume:\n %s", + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, st.s_c), buf.buf))) { + struct bkey_i_snapshot_tree *u; + u32 subvol_id; + + ret = bch2_snapshot_tree_master_subvol(trans, root_id, &subvol_id); + if (ret) + goto err; + + u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot_tree); + ret = PTR_ERR_OR_ZERO(u); + if (ret) + goto err; + + u->v.master_subvol = cpu_to_le32(subvol_id); + st = snapshot_tree_i_to_s_c(u); + } +err: +fsck_err: + printbuf_exit(&buf); + return ret; +} + +/* + * For each snapshot_tree, make sure it points to the root of a snapshot tree + * and that snapshot entry points back to it, or delete it. + * + * And, make sure it points to a subvolume within that snapshot tree, or correct + * it to point to the oldest subvolume within that snapshot tree. + */ +int bch2_check_snapshot_trees(struct bch_fs *c) +{ + struct btree_iter iter; + struct bkey_s_c k; + int ret; + + ret = bch2_trans_run(c, + for_each_btree_key_commit(trans, iter, + BTREE_ID_snapshot_trees, POS_MIN, + BTREE_ITER_PREFETCH, k, + NULL, NULL, BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL, + check_snapshot_tree(trans, &iter, k))); + + if (ret) + bch_err(c, "error %i checking snapshot trees", ret); + return ret; +} + +/* + * Look up snapshot tree for @tree_id and find root, + * make sure @snap_id is a descendent: + */ +static int snapshot_tree_ptr_good(struct btree_trans *trans, + u32 snap_id, u32 tree_id) +{ + struct bch_snapshot_tree s_t; + int ret = bch2_snapshot_tree_lookup(trans, tree_id, &s_t); + + if (bch2_err_matches(ret, ENOENT)) + return 0; + if (ret) + return ret; + + return bch2_snapshot_is_ancestor_early(trans->c, snap_id, le32_to_cpu(s_t.root_snapshot)); +} + +u32 bch2_snapshot_skiplist_get(struct bch_fs *c, u32 id) +{ + const struct snapshot_t *s; + + if (!id) + return 0; + + rcu_read_lock(); + s = snapshot_t(c, id); + if (s->parent) + id = bch2_snapshot_nth_parent(c, id, get_random_u32_below(s->depth)); + rcu_read_unlock(); + + return id; +} + +static int snapshot_skiplist_good(struct btree_trans *trans, u32 id, struct bch_snapshot s) +{ + unsigned i; + + for (i = 0; i < 3; i++) + if (!s.parent) { + if (s.skip[i]) + return false; + } else { + if (!bch2_snapshot_is_ancestor_early(trans->c, id, le32_to_cpu(s.skip[i]))) + return false; + } + + return true; +} + +/* + * snapshot_tree pointer was incorrect: look up root snapshot node, make sure + * its snapshot_tree pointer is correct (allocate new one if necessary), then + * update this node's pointer to root node's pointer: + */ +static int snapshot_tree_ptr_repair(struct btree_trans *trans, + struct btree_iter *iter, + struct bkey_s_c k, + struct bch_snapshot *s) +{ + struct bch_fs *c = trans->c; + struct btree_iter root_iter; + struct bch_snapshot_tree s_t; + struct bkey_s_c_snapshot root; + struct bkey_i_snapshot *u; + u32 root_id = bch2_snapshot_root(c, k.k->p.offset), tree_id; + int ret; + + root = bch2_bkey_get_iter_typed(trans, &root_iter, + BTREE_ID_snapshots, POS(0, root_id), + BTREE_ITER_WITH_UPDATES, snapshot); + ret = bkey_err(root); + if (ret) + goto err; + + tree_id = le32_to_cpu(root.v->tree); + + ret = bch2_snapshot_tree_lookup(trans, tree_id, &s_t); + if (ret && !bch2_err_matches(ret, ENOENT)) + return ret; + + if (ret || le32_to_cpu(s_t.root_snapshot) != root_id) { + u = bch2_bkey_make_mut_typed(trans, &root_iter, &root.s_c, 0, snapshot); + ret = PTR_ERR_OR_ZERO(u) ?: + bch2_snapshot_tree_create(trans, root_id, + bch2_snapshot_tree_oldest_subvol(c, root_id), + &tree_id); + if (ret) + goto err; + + u->v.tree = cpu_to_le32(tree_id); + if (k.k->p.offset == root_id) + *s = u->v; + } + + if (k.k->p.offset != root_id) { + u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot); + ret = PTR_ERR_OR_ZERO(u); + if (ret) + goto err; + + u->v.tree = cpu_to_le32(tree_id); + *s = u->v; + } +err: + bch2_trans_iter_exit(trans, &root_iter); + return ret; +} + +static int check_snapshot(struct btree_trans *trans, + struct btree_iter *iter, + struct bkey_s_c k) +{ + struct bch_fs *c = trans->c; + struct bch_snapshot s; + struct bch_subvolume subvol; + struct bch_snapshot v; + struct bkey_i_snapshot *u; + u32 parent_id = bch2_snapshot_parent_early(c, k.k->p.offset); + u32 real_depth; + struct printbuf buf = PRINTBUF; + bool should_have_subvol; + u32 i, id; + int ret = 0; + + if (k.k->type != KEY_TYPE_snapshot) + return 0; + + memset(&s, 0, sizeof(s)); + memcpy(&s, k.v, bkey_val_bytes(k.k)); + + id = le32_to_cpu(s.parent); + if (id) { + ret = bch2_snapshot_lookup(trans, id, &v); + if (bch2_err_matches(ret, ENOENT)) + bch_err(c, "snapshot with nonexistent parent:\n %s", + (bch2_bkey_val_to_text(&buf, c, k), buf.buf)); + if (ret) + goto err; + + if (le32_to_cpu(v.children[0]) != k.k->p.offset && + le32_to_cpu(v.children[1]) != k.k->p.offset) { + bch_err(c, "snapshot parent %u missing pointer to child %llu", + id, k.k->p.offset); + ret = -EINVAL; + goto err; + } + } + + for (i = 0; i < 2 && s.children[i]; i++) { + id = le32_to_cpu(s.children[i]); + + ret = bch2_snapshot_lookup(trans, id, &v); + if (bch2_err_matches(ret, ENOENT)) + bch_err(c, "snapshot node %llu has nonexistent child %u", + k.k->p.offset, id); + if (ret) + goto err; + + if (le32_to_cpu(v.parent) != k.k->p.offset) { + bch_err(c, "snapshot child %u has wrong parent (got %u should be %llu)", + id, le32_to_cpu(v.parent), k.k->p.offset); + ret = -EINVAL; + goto err; + } + } + + should_have_subvol = BCH_SNAPSHOT_SUBVOL(&s) && + !BCH_SNAPSHOT_DELETED(&s); + + if (should_have_subvol) { + id = le32_to_cpu(s.subvol); + ret = bch2_subvolume_get(trans, id, 0, false, &subvol); + if (bch2_err_matches(ret, ENOENT)) + bch_err(c, "snapshot points to nonexistent subvolume:\n %s", + (bch2_bkey_val_to_text(&buf, c, k), buf.buf)); + if (ret) + goto err; + + if (BCH_SNAPSHOT_SUBVOL(&s) != (le32_to_cpu(subvol.snapshot) == k.k->p.offset)) { + bch_err(c, "snapshot node %llu has wrong BCH_SNAPSHOT_SUBVOL", + k.k->p.offset); + ret = -EINVAL; + goto err; + } + } else { + if (fsck_err_on(s.subvol, + c, snapshot_should_not_have_subvol, + "snapshot should not point to subvol:\n %s", + (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { + u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot); + ret = PTR_ERR_OR_ZERO(u); + if (ret) + goto err; + + u->v.subvol = 0; + s = u->v; + } + } + + ret = snapshot_tree_ptr_good(trans, k.k->p.offset, le32_to_cpu(s.tree)); + if (ret < 0) + goto err; + + if (fsck_err_on(!ret, c, snapshot_to_bad_snapshot_tree, + "snapshot points to missing/incorrect tree:\n %s", + (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { + ret = snapshot_tree_ptr_repair(trans, iter, k, &s); + if (ret) + goto err; + } + ret = 0; + + real_depth = bch2_snapshot_depth(c, parent_id); + + if (le32_to_cpu(s.depth) != real_depth && + (c->sb.version_upgrade_complete < bcachefs_metadata_version_snapshot_skiplists || + fsck_err(c, snapshot_bad_depth, + "snapshot with incorrect depth field, should be %u:\n %s", + real_depth, (bch2_bkey_val_to_text(&buf, c, k), buf.buf)))) { + u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot); + ret = PTR_ERR_OR_ZERO(u); + if (ret) + goto err; + + u->v.depth = cpu_to_le32(real_depth); + s = u->v; + } + + ret = snapshot_skiplist_good(trans, k.k->p.offset, s); + if (ret < 0) + goto err; + + if (!ret && + (c->sb.version_upgrade_complete < bcachefs_metadata_version_snapshot_skiplists || + fsck_err(c, snapshot_bad_skiplist, + "snapshot with bad skiplist field:\n %s", + (bch2_bkey_val_to_text(&buf, c, k), buf.buf)))) { + u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot); + ret = PTR_ERR_OR_ZERO(u); + if (ret) + goto err; + + for (i = 0; i < ARRAY_SIZE(u->v.skip); i++) + u->v.skip[i] = cpu_to_le32(bch2_snapshot_skiplist_get(c, parent_id)); + + bubble_sort(u->v.skip, ARRAY_SIZE(u->v.skip), cmp_le32); + s = u->v; + } + ret = 0; +err: +fsck_err: + printbuf_exit(&buf); + return ret; +} + +int bch2_check_snapshots(struct bch_fs *c) +{ + struct btree_iter iter; + struct bkey_s_c k; + int ret; + + /* + * We iterate backwards as checking/fixing the depth field requires that + * the parent's depth already be correct: + */ + ret = bch2_trans_run(c, + for_each_btree_key_reverse_commit(trans, iter, + BTREE_ID_snapshots, POS_MAX, + BTREE_ITER_PREFETCH, k, + NULL, NULL, BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL, + check_snapshot(trans, &iter, k))); + if (ret) + bch_err_fn(c, ret); + return ret; +} + +/* + * Mark a snapshot as deleted, for future cleanup: + */ +int bch2_snapshot_node_set_deleted(struct btree_trans *trans, u32 id) +{ + struct btree_iter iter; + struct bkey_i_snapshot *s; + int ret = 0; + + s = bch2_bkey_get_mut_typed(trans, &iter, + BTREE_ID_snapshots, POS(0, id), + 0, snapshot); + ret = PTR_ERR_OR_ZERO(s); + if (unlikely(ret)) { + bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), + trans->c, "missing snapshot %u", id); + return ret; + } + + /* already deleted? */ + if (BCH_SNAPSHOT_DELETED(&s->v)) + goto err; + + SET_BCH_SNAPSHOT_DELETED(&s->v, true); + SET_BCH_SNAPSHOT_SUBVOL(&s->v, false); + s->v.subvol = 0; +err: + bch2_trans_iter_exit(trans, &iter); + return ret; +} + +static inline void normalize_snapshot_child_pointers(struct bch_snapshot *s) +{ + if (le32_to_cpu(s->children[0]) < le32_to_cpu(s->children[1])) + swap(s->children[0], s->children[1]); +} + +static int bch2_snapshot_node_delete(struct btree_trans *trans, u32 id) +{ + struct bch_fs *c = trans->c; + struct btree_iter iter, p_iter = (struct btree_iter) { NULL }; + struct btree_iter c_iter = (struct btree_iter) { NULL }; + struct btree_iter tree_iter = (struct btree_iter) { NULL }; + struct bkey_s_c_snapshot s; + u32 parent_id, child_id; + unsigned i; + int ret = 0; + + s = bch2_bkey_get_iter_typed(trans, &iter, BTREE_ID_snapshots, POS(0, id), + BTREE_ITER_INTENT, snapshot); + ret = bkey_err(s); + bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), c, + "missing snapshot %u", id); + + if (ret) + goto err; + + BUG_ON(s.v->children[1]); + + parent_id = le32_to_cpu(s.v->parent); + child_id = le32_to_cpu(s.v->children[0]); + + if (parent_id) { + struct bkey_i_snapshot *parent; + + parent = bch2_bkey_get_mut_typed(trans, &p_iter, + BTREE_ID_snapshots, POS(0, parent_id), + 0, snapshot); + ret = PTR_ERR_OR_ZERO(parent); + bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), c, + "missing snapshot %u", parent_id); + if (unlikely(ret)) + goto err; + + /* find entry in parent->children for node being deleted */ + for (i = 0; i < 2; i++) + if (le32_to_cpu(parent->v.children[i]) == id) + break; + + if (bch2_fs_inconsistent_on(i == 2, c, + "snapshot %u missing child pointer to %u", + parent_id, id)) + goto err; + + parent->v.children[i] = cpu_to_le32(child_id); + + normalize_snapshot_child_pointers(&parent->v); + } + + if (child_id) { + struct bkey_i_snapshot *child; + + child = bch2_bkey_get_mut_typed(trans, &c_iter, + BTREE_ID_snapshots, POS(0, child_id), + 0, snapshot); + ret = PTR_ERR_OR_ZERO(child); + bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), c, + "missing snapshot %u", child_id); + if (unlikely(ret)) + goto err; + + child->v.parent = cpu_to_le32(parent_id); + + if (!child->v.parent) { + child->v.skip[0] = 0; + child->v.skip[1] = 0; + child->v.skip[2] = 0; + } + } + + if (!parent_id) { + /* + * We're deleting the root of a snapshot tree: update the + * snapshot_tree entry to point to the new root, or delete it if + * this is the last snapshot ID in this tree: + */ + struct bkey_i_snapshot_tree *s_t; + + BUG_ON(s.v->children[1]); + + s_t = bch2_bkey_get_mut_typed(trans, &tree_iter, + BTREE_ID_snapshot_trees, POS(0, le32_to_cpu(s.v->tree)), + 0, snapshot_tree); + ret = PTR_ERR_OR_ZERO(s_t); + if (ret) + goto err; + + if (s.v->children[0]) { + s_t->v.root_snapshot = s.v->children[0]; + } else { + s_t->k.type = KEY_TYPE_deleted; + set_bkey_val_u64s(&s_t->k, 0); + } + } + + ret = bch2_btree_delete_at(trans, &iter, 0); +err: + bch2_trans_iter_exit(trans, &tree_iter); + bch2_trans_iter_exit(trans, &p_iter); + bch2_trans_iter_exit(trans, &c_iter); + bch2_trans_iter_exit(trans, &iter); + return ret; +} + +static int create_snapids(struct btree_trans *trans, u32 parent, u32 tree, + u32 *new_snapids, + u32 *snapshot_subvols, + unsigned nr_snapids) +{ + struct bch_fs *c = trans->c; + struct btree_iter iter; + struct bkey_i_snapshot *n; + struct bkey_s_c k; + unsigned i, j; + u32 depth = bch2_snapshot_depth(c, parent); + int ret; + + bch2_trans_iter_init(trans, &iter, BTREE_ID_snapshots, + POS_MIN, BTREE_ITER_INTENT); + k = bch2_btree_iter_peek(&iter); + ret = bkey_err(k); + if (ret) + goto err; + + for (i = 0; i < nr_snapids; i++) { + k = bch2_btree_iter_prev_slot(&iter); + ret = bkey_err(k); + if (ret) + goto err; + + if (!k.k || !k.k->p.offset) { + ret = -BCH_ERR_ENOSPC_snapshot_create; + goto err; + } + + n = bch2_bkey_alloc(trans, &iter, 0, snapshot); + ret = PTR_ERR_OR_ZERO(n); + if (ret) + goto err; + + n->v.flags = 0; + n->v.parent = cpu_to_le32(parent); + n->v.subvol = cpu_to_le32(snapshot_subvols[i]); + n->v.tree = cpu_to_le32(tree); + n->v.depth = cpu_to_le32(depth); + + for (j = 0; j < ARRAY_SIZE(n->v.skip); j++) + n->v.skip[j] = cpu_to_le32(bch2_snapshot_skiplist_get(c, parent)); + + bubble_sort(n->v.skip, ARRAY_SIZE(n->v.skip), cmp_le32); + SET_BCH_SNAPSHOT_SUBVOL(&n->v, true); + + ret = bch2_mark_snapshot(trans, BTREE_ID_snapshots, 0, + bkey_s_c_null, bkey_i_to_s_c(&n->k_i), 0); + if (ret) + goto err; + + new_snapids[i] = iter.pos.offset; + + mutex_lock(&c->snapshot_table_lock); + snapshot_t_mut(c, new_snapids[i])->equiv = new_snapids[i]; + mutex_unlock(&c->snapshot_table_lock); + } +err: + bch2_trans_iter_exit(trans, &iter); + return ret; +} + +/* + * Create new snapshot IDs as children of an existing snapshot ID: + */ +static int bch2_snapshot_node_create_children(struct btree_trans *trans, u32 parent, + u32 *new_snapids, + u32 *snapshot_subvols, + unsigned nr_snapids) +{ + struct btree_iter iter; + struct bkey_i_snapshot *n_parent; + int ret = 0; + + n_parent = bch2_bkey_get_mut_typed(trans, &iter, + BTREE_ID_snapshots, POS(0, parent), + 0, snapshot); + ret = PTR_ERR_OR_ZERO(n_parent); + if (unlikely(ret)) { + if (bch2_err_matches(ret, ENOENT)) + bch_err(trans->c, "snapshot %u not found", parent); + return ret; + } + + if (n_parent->v.children[0] || n_parent->v.children[1]) { + bch_err(trans->c, "Trying to add child snapshot nodes to parent that already has children"); + ret = -EINVAL; + goto err; + } + + ret = create_snapids(trans, parent, le32_to_cpu(n_parent->v.tree), + new_snapids, snapshot_subvols, nr_snapids); + if (ret) + goto err; + + n_parent->v.children[0] = cpu_to_le32(new_snapids[0]); + n_parent->v.children[1] = cpu_to_le32(new_snapids[1]); + n_parent->v.subvol = 0; + SET_BCH_SNAPSHOT_SUBVOL(&n_parent->v, false); +err: + bch2_trans_iter_exit(trans, &iter); + return ret; +} + +/* + * Create a snapshot node that is the root of a new tree: + */ +static int bch2_snapshot_node_create_tree(struct btree_trans *trans, + u32 *new_snapids, + u32 *snapshot_subvols, + unsigned nr_snapids) +{ + struct bkey_i_snapshot_tree *n_tree; + int ret; + + n_tree = __bch2_snapshot_tree_create(trans); + ret = PTR_ERR_OR_ZERO(n_tree) ?: + create_snapids(trans, 0, n_tree->k.p.offset, + new_snapids, snapshot_subvols, nr_snapids); + if (ret) + return ret; + + n_tree->v.master_subvol = cpu_to_le32(snapshot_subvols[0]); + n_tree->v.root_snapshot = cpu_to_le32(new_snapids[0]); + return 0; +} + +int bch2_snapshot_node_create(struct btree_trans *trans, u32 parent, + u32 *new_snapids, + u32 *snapshot_subvols, + unsigned nr_snapids) +{ + BUG_ON((parent == 0) != (nr_snapids == 1)); + BUG_ON((parent != 0) != (nr_snapids == 2)); + + return parent + ? bch2_snapshot_node_create_children(trans, parent, + new_snapids, snapshot_subvols, nr_snapids) + : bch2_snapshot_node_create_tree(trans, + new_snapids, snapshot_subvols, nr_snapids); + +} + +/* + * If we have an unlinked inode in an internal snapshot node, and the inode + * really has been deleted in all child snapshots, how does this get cleaned up? + * + * first there is the problem of how keys that have been overwritten in all + * child snapshots get deleted (unimplemented?), but inodes may perhaps be + * special? + * + * also: unlinked inode in internal snapshot appears to not be getting deleted + * correctly if inode doesn't exist in leaf snapshots + * + * solution: + * + * for a key in an interior snapshot node that needs work to be done that + * requires it to be mutated: iterate over all descendent leaf nodes and copy + * that key to snapshot leaf nodes, where we can mutate it + */ + +static int snapshot_delete_key(struct btree_trans *trans, + struct btree_iter *iter, + struct bkey_s_c k, + snapshot_id_list *deleted, + snapshot_id_list *equiv_seen, + struct bpos *last_pos) +{ + struct bch_fs *c = trans->c; + u32 equiv = bch2_snapshot_equiv(c, k.k->p.snapshot); + + if (!bkey_eq(k.k->p, *last_pos)) + equiv_seen->nr = 0; + *last_pos = k.k->p; + + if (snapshot_list_has_id(deleted, k.k->p.snapshot) || + snapshot_list_has_id(equiv_seen, equiv)) { + return bch2_btree_delete_at(trans, iter, + BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); + } else { + return snapshot_list_add(c, equiv_seen, equiv); + } +} + +static int move_key_to_correct_snapshot(struct btree_trans *trans, + struct btree_iter *iter, + struct bkey_s_c k) +{ + struct bch_fs *c = trans->c; + u32 equiv = bch2_snapshot_equiv(c, k.k->p.snapshot); + + /* + * When we have a linear chain of snapshot nodes, we consider + * those to form an equivalence class: we're going to collapse + * them all down to a single node, and keep the leaf-most node - + * which has the same id as the equivalence class id. + * + * If there are multiple keys in different snapshots at the same + * position, we're only going to keep the one in the newest + * snapshot - the rest have been overwritten and are redundant, + * and for the key we're going to keep we need to move it to the + * equivalance class ID if it's not there already. + */ + if (equiv != k.k->p.snapshot) { + struct bkey_i *new = bch2_bkey_make_mut_noupdate(trans, k); + struct btree_iter new_iter; + int ret; + + ret = PTR_ERR_OR_ZERO(new); + if (ret) + return ret; + + new->k.p.snapshot = equiv; + + bch2_trans_iter_init(trans, &new_iter, iter->btree_id, new->k.p, + BTREE_ITER_ALL_SNAPSHOTS| + BTREE_ITER_CACHED| + BTREE_ITER_INTENT); + + ret = bch2_btree_iter_traverse(&new_iter) ?: + bch2_trans_update(trans, &new_iter, new, + BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?: + bch2_btree_delete_at(trans, iter, + BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); + bch2_trans_iter_exit(trans, &new_iter); + if (ret) + return ret; + } + + return 0; +} + +static int bch2_snapshot_needs_delete(struct btree_trans *trans, struct bkey_s_c k) +{ + struct bkey_s_c_snapshot snap; + u32 children[2]; + int ret; + + if (k.k->type != KEY_TYPE_snapshot) + return 0; + + snap = bkey_s_c_to_snapshot(k); + if (BCH_SNAPSHOT_DELETED(snap.v) || + BCH_SNAPSHOT_SUBVOL(snap.v)) + return 0; + + children[0] = le32_to_cpu(snap.v->children[0]); + children[1] = le32_to_cpu(snap.v->children[1]); + + ret = bch2_snapshot_live(trans, children[0]) ?: + bch2_snapshot_live(trans, children[1]); + if (ret < 0) + return ret; + return !ret; +} + +/* + * For a given snapshot, if it doesn't have a subvolume that points to it, and + * it doesn't have child snapshot nodes - it's now redundant and we can mark it + * as deleted. + */ +static int bch2_delete_redundant_snapshot(struct btree_trans *trans, struct bkey_s_c k) +{ + int ret = bch2_snapshot_needs_delete(trans, k); + + return ret <= 0 + ? ret + : bch2_snapshot_node_set_deleted(trans, k.k->p.offset); +} + +static inline u32 bch2_snapshot_nth_parent_skip(struct bch_fs *c, u32 id, u32 n, + snapshot_id_list *skip) +{ + rcu_read_lock(); + while (snapshot_list_has_id(skip, id)) + id = __bch2_snapshot_parent(c, id); + + while (n--) { + do { + id = __bch2_snapshot_parent(c, id); + } while (snapshot_list_has_id(skip, id)); + } + rcu_read_unlock(); + + return id; +} + +static int bch2_fix_child_of_deleted_snapshot(struct btree_trans *trans, + struct btree_iter *iter, struct bkey_s_c k, + snapshot_id_list *deleted) +{ + struct bch_fs *c = trans->c; + u32 nr_deleted_ancestors = 0; + struct bkey_i_snapshot *s; + u32 *i; + int ret; + + if (k.k->type != KEY_TYPE_snapshot) + return 0; + + if (snapshot_list_has_id(deleted, k.k->p.offset)) + return 0; + + s = bch2_bkey_make_mut_noupdate_typed(trans, k, snapshot); + ret = PTR_ERR_OR_ZERO(s); + if (ret) + return ret; + + darray_for_each(*deleted, i) + nr_deleted_ancestors += bch2_snapshot_is_ancestor(c, s->k.p.offset, *i); + + if (!nr_deleted_ancestors) + return 0; + + le32_add_cpu(&s->v.depth, -nr_deleted_ancestors); + + if (!s->v.depth) { + s->v.skip[0] = 0; + s->v.skip[1] = 0; + s->v.skip[2] = 0; + } else { + u32 depth = le32_to_cpu(s->v.depth); + u32 parent = bch2_snapshot_parent(c, s->k.p.offset); + + for (unsigned j = 0; j < ARRAY_SIZE(s->v.skip); j++) { + u32 id = le32_to_cpu(s->v.skip[j]); + + if (snapshot_list_has_id(deleted, id)) { + id = bch2_snapshot_nth_parent_skip(c, + parent, + depth > 1 + ? get_random_u32_below(depth - 1) + : 0, + deleted); + s->v.skip[j] = cpu_to_le32(id); + } + } + + bubble_sort(s->v.skip, ARRAY_SIZE(s->v.skip), cmp_le32); + } + + return bch2_trans_update(trans, iter, &s->k_i, 0); +} + +int bch2_delete_dead_snapshots(struct bch_fs *c) +{ + struct btree_trans *trans; + struct btree_iter iter; + struct bkey_s_c k; + struct bkey_s_c_snapshot snap; + snapshot_id_list deleted = { 0 }; + snapshot_id_list deleted_interior = { 0 }; + u32 *i, id; + int ret = 0; + + if (!test_and_clear_bit(BCH_FS_NEED_DELETE_DEAD_SNAPSHOTS, &c->flags)) + return 0; + + if (!test_bit(BCH_FS_STARTED, &c->flags)) { + ret = bch2_fs_read_write_early(c); + if (ret) { + bch_err_msg(c, ret, "deleting dead snapshots: error going rw"); + return ret; + } + } + + trans = bch2_trans_get(c); + + /* + * For every snapshot node: If we have no live children and it's not + * pointed to by a subvolume, delete it: + */ + ret = for_each_btree_key_commit(trans, iter, BTREE_ID_snapshots, + POS_MIN, 0, k, + NULL, NULL, 0, + bch2_delete_redundant_snapshot(trans, k)); + if (ret) { + bch_err_msg(c, ret, "deleting redundant snapshots"); + goto err; + } + + ret = for_each_btree_key2(trans, iter, BTREE_ID_snapshots, + POS_MIN, 0, k, + bch2_snapshot_set_equiv(trans, k)); + if (ret) { + bch_err_msg(c, ret, "in bch2_snapshots_set_equiv"); + goto err; + } + + for_each_btree_key(trans, iter, BTREE_ID_snapshots, + POS_MIN, 0, k, ret) { + if (k.k->type != KEY_TYPE_snapshot) + continue; + + snap = bkey_s_c_to_snapshot(k); + if (BCH_SNAPSHOT_DELETED(snap.v)) { + ret = snapshot_list_add(c, &deleted, k.k->p.offset); + if (ret) + break; + } + } + bch2_trans_iter_exit(trans, &iter); + + if (ret) { + bch_err_msg(c, ret, "walking snapshots"); + goto err; + } + + for (id = 0; id < BTREE_ID_NR; id++) { + struct bpos last_pos = POS_MIN; + snapshot_id_list equiv_seen = { 0 }; + struct disk_reservation res = { 0 }; + + if (!btree_type_has_snapshots(id)) + continue; + + /* + * deleted inodes btree is maintained by a trigger on the inodes + * btree - no work for us to do here, and it's not safe to scan + * it because we'll see out of date keys due to the btree write + * buffer: + */ + if (id == BTREE_ID_deleted_inodes) + continue; + + ret = for_each_btree_key_commit(trans, iter, + id, POS_MIN, + BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k, + &res, NULL, BTREE_INSERT_NOFAIL, + snapshot_delete_key(trans, &iter, k, &deleted, &equiv_seen, &last_pos)) ?: + for_each_btree_key_commit(trans, iter, + id, POS_MIN, + BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k, + &res, NULL, BTREE_INSERT_NOFAIL, + move_key_to_correct_snapshot(trans, &iter, k)); + + bch2_disk_reservation_put(c, &res); + darray_exit(&equiv_seen); + + if (ret) { + bch_err_msg(c, ret, "deleting keys from dying snapshots"); + goto err; + } + } + + bch2_trans_unlock(trans); + down_write(&c->snapshot_create_lock); + + for_each_btree_key(trans, iter, BTREE_ID_snapshots, + POS_MIN, 0, k, ret) { + u32 snapshot = k.k->p.offset; + u32 equiv = bch2_snapshot_equiv(c, snapshot); + + if (equiv != snapshot) + snapshot_list_add(c, &deleted_interior, snapshot); + } + bch2_trans_iter_exit(trans, &iter); + + if (ret) + goto err_create_lock; + + /* + * Fixing children of deleted snapshots can't be done completely + * atomically, if we crash between here and when we delete the interior + * nodes some depth fields will be off: + */ + ret = for_each_btree_key_commit(trans, iter, BTREE_ID_snapshots, POS_MIN, + BTREE_ITER_INTENT, k, + NULL, NULL, BTREE_INSERT_NOFAIL, + bch2_fix_child_of_deleted_snapshot(trans, &iter, k, &deleted_interior)); + if (ret) + goto err_create_lock; + + darray_for_each(deleted, i) { + ret = commit_do(trans, NULL, NULL, 0, + bch2_snapshot_node_delete(trans, *i)); + if (ret) { + bch_err_msg(c, ret, "deleting snapshot %u", *i); + goto err_create_lock; + } + } + + darray_for_each(deleted_interior, i) { + ret = commit_do(trans, NULL, NULL, 0, + bch2_snapshot_node_delete(trans, *i)); + if (ret) { + bch_err_msg(c, ret, "deleting snapshot %u", *i); + goto err_create_lock; + } + } +err_create_lock: + up_write(&c->snapshot_create_lock); +err: + darray_exit(&deleted_interior); + darray_exit(&deleted); + bch2_trans_put(trans); + if (ret) + bch_err_fn(c, ret); + return ret; +} + +void bch2_delete_dead_snapshots_work(struct work_struct *work) +{ + struct bch_fs *c = container_of(work, struct bch_fs, snapshot_delete_work); + + bch2_delete_dead_snapshots(c); + bch2_write_ref_put(c, BCH_WRITE_REF_delete_dead_snapshots); +} + +void bch2_delete_dead_snapshots_async(struct bch_fs *c) +{ + if (bch2_write_ref_tryget(c, BCH_WRITE_REF_delete_dead_snapshots) && + !queue_work(c->write_ref_wq, &c->snapshot_delete_work)) + bch2_write_ref_put(c, BCH_WRITE_REF_delete_dead_snapshots); +} + +int __bch2_key_has_snapshot_overwrites(struct btree_trans *trans, + enum btree_id id, + struct bpos pos) +{ + struct bch_fs *c = trans->c; + struct btree_iter iter; + struct bkey_s_c k; + int ret; + + bch2_trans_iter_init(trans, &iter, id, pos, + BTREE_ITER_NOT_EXTENTS| + BTREE_ITER_ALL_SNAPSHOTS); + while (1) { + k = bch2_btree_iter_prev(&iter); + ret = bkey_err(k); + if (ret) + break; + + if (!k.k) + break; + + if (!bkey_eq(pos, k.k->p)) + break; + + if (bch2_snapshot_is_ancestor(c, k.k->p.snapshot, pos.snapshot)) { + ret = 1; + break; + } + } + bch2_trans_iter_exit(trans, &iter); + + return ret; +} + +static u32 bch2_snapshot_smallest_child(struct bch_fs *c, u32 id) +{ + const struct snapshot_t *s = snapshot_t(c, id); + + return s->children[1] ?: s->children[0]; +} + +static u32 bch2_snapshot_smallest_descendent(struct bch_fs *c, u32 id) +{ + u32 child; + + while ((child = bch2_snapshot_smallest_child(c, id))) + id = child; + return id; +} + +static int bch2_propagate_key_to_snapshot_leaf(struct btree_trans *trans, + enum btree_id btree, + struct bkey_s_c interior_k, + u32 leaf_id, struct bpos *new_min_pos) +{ + struct btree_iter iter; + struct bpos pos = interior_k.k->p; + struct bkey_s_c k; + struct bkey_i *new; + int ret; + + pos.snapshot = leaf_id; + + bch2_trans_iter_init(trans, &iter, btree, pos, BTREE_ITER_INTENT); + k = bch2_btree_iter_peek_slot(&iter); + ret = bkey_err(k); + if (ret) + goto out; + + /* key already overwritten in this snapshot? */ + if (k.k->p.snapshot != interior_k.k->p.snapshot) + goto out; + + if (bpos_eq(*new_min_pos, POS_MIN)) { + *new_min_pos = k.k->p; + new_min_pos->snapshot = leaf_id; + } + + new = bch2_bkey_make_mut_noupdate(trans, interior_k); + ret = PTR_ERR_OR_ZERO(new); + if (ret) + goto out; + + new->k.p.snapshot = leaf_id; + ret = bch2_trans_update(trans, &iter, new, 0); +out: + bch2_trans_iter_exit(trans, &iter); + return ret; +} + +int bch2_propagate_key_to_snapshot_leaves(struct btree_trans *trans, + enum btree_id btree, + struct bkey_s_c k, + struct bpos *new_min_pos) +{ + struct bch_fs *c = trans->c; + struct bkey_buf sk; + u32 restart_count = trans->restart_count; + int ret = 0; + + bch2_bkey_buf_init(&sk); + bch2_bkey_buf_reassemble(&sk, c, k); + k = bkey_i_to_s_c(sk.k); + + *new_min_pos = POS_MIN; + + for (u32 id = bch2_snapshot_smallest_descendent(c, k.k->p.snapshot); + id < k.k->p.snapshot; + id++) { + if (!bch2_snapshot_is_ancestor(c, id, k.k->p.snapshot) || + !bch2_snapshot_is_leaf(c, id)) + continue; +again: + ret = btree_trans_too_many_iters(trans) ?: + bch2_propagate_key_to_snapshot_leaf(trans, btree, k, id, new_min_pos) ?: + bch2_trans_commit(trans, NULL, NULL, 0); + if (ret && bch2_err_matches(ret, BCH_ERR_transaction_restart)) { + bch2_trans_begin(trans); + goto again; + } + + if (ret) + break; + } + + bch2_bkey_buf_exit(&sk, c); + + return ret ?: trans_was_restarted(trans, restart_count); +} + +static int bch2_check_snapshot_needs_deletion(struct btree_trans *trans, struct bkey_s_c k) +{ + struct bch_fs *c = trans->c; + struct bkey_s_c_snapshot snap; + int ret = 0; + + if (k.k->type != KEY_TYPE_snapshot) + return 0; + + snap = bkey_s_c_to_snapshot(k); + if (BCH_SNAPSHOT_DELETED(snap.v) || + bch2_snapshot_equiv(c, k.k->p.offset) != k.k->p.offset || + (ret = bch2_snapshot_needs_delete(trans, k)) > 0) { + set_bit(BCH_FS_NEED_DELETE_DEAD_SNAPSHOTS, &c->flags); + return 0; + } + + return ret; +} + +int bch2_snapshots_read(struct bch_fs *c) +{ + struct btree_iter iter; + struct bkey_s_c k; + int ret = 0; + + ret = bch2_trans_run(c, + for_each_btree_key2(trans, iter, BTREE_ID_snapshots, + POS_MIN, 0, k, + bch2_mark_snapshot(trans, BTREE_ID_snapshots, 0, bkey_s_c_null, k, 0) ?: + bch2_snapshot_set_equiv(trans, k) ?: + bch2_check_snapshot_needs_deletion(trans, k)) ?: + for_each_btree_key2(trans, iter, BTREE_ID_snapshots, + POS_MIN, 0, k, + (set_is_ancestor_bitmap(c, k.k->p.offset), 0))); + if (ret) + bch_err_fn(c, ret); + return ret; +} + +void bch2_fs_snapshots_exit(struct bch_fs *c) +{ + kfree(rcu_dereference_protected(c->snapshots, true)); +} diff --git a/fs/bcachefs/snapshot.h b/fs/bcachefs/snapshot.h new file mode 100644 index 000000000000..f09a22f44239 --- /dev/null +++ b/fs/bcachefs/snapshot.h @@ -0,0 +1,268 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_SNAPSHOT_H +#define _BCACHEFS_SNAPSHOT_H + +enum bkey_invalid_flags; + +void bch2_snapshot_tree_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); +int bch2_snapshot_tree_invalid(struct bch_fs *, struct bkey_s_c, + enum bkey_invalid_flags, struct printbuf *); + +#define bch2_bkey_ops_snapshot_tree ((struct bkey_ops) { \ + .key_invalid = bch2_snapshot_tree_invalid, \ + .val_to_text = bch2_snapshot_tree_to_text, \ + .min_val_size = 8, \ +}) + +struct bkey_i_snapshot_tree *__bch2_snapshot_tree_create(struct btree_trans *); + +int bch2_snapshot_tree_lookup(struct btree_trans *, u32, struct bch_snapshot_tree *); + +void bch2_snapshot_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); +int bch2_snapshot_invalid(struct bch_fs *, struct bkey_s_c, + enum bkey_invalid_flags, struct printbuf *); +int bch2_mark_snapshot(struct btree_trans *, enum btree_id, unsigned, + struct bkey_s_c, struct bkey_s_c, unsigned); + +#define bch2_bkey_ops_snapshot ((struct bkey_ops) { \ + .key_invalid = bch2_snapshot_invalid, \ + .val_to_text = bch2_snapshot_to_text, \ + .atomic_trigger = bch2_mark_snapshot, \ + .min_val_size = 24, \ +}) + +static inline struct snapshot_t *__snapshot_t(struct snapshot_table *t, u32 id) +{ + return &t->s[U32_MAX - id]; +} + +static inline const struct snapshot_t *snapshot_t(struct bch_fs *c, u32 id) +{ + return __snapshot_t(rcu_dereference(c->snapshots), id); +} + +static inline u32 bch2_snapshot_tree(struct bch_fs *c, u32 id) +{ + rcu_read_lock(); + id = snapshot_t(c, id)->tree; + rcu_read_unlock(); + + return id; +} + +static inline u32 __bch2_snapshot_parent_early(struct bch_fs *c, u32 id) +{ + return snapshot_t(c, id)->parent; +} + +static inline u32 bch2_snapshot_parent_early(struct bch_fs *c, u32 id) +{ + rcu_read_lock(); + id = __bch2_snapshot_parent_early(c, id); + rcu_read_unlock(); + + return id; +} + +static inline u32 __bch2_snapshot_parent(struct bch_fs *c, u32 id) +{ +#ifdef CONFIG_BCACHEFS_DEBUG + u32 parent = snapshot_t(c, id)->parent; + + if (parent && + snapshot_t(c, id)->depth != snapshot_t(c, parent)->depth + 1) + panic("id %u depth=%u parent %u depth=%u\n", + id, snapshot_t(c, id)->depth, + parent, snapshot_t(c, parent)->depth); + + return parent; +#else + return snapshot_t(c, id)->parent; +#endif +} + +static inline u32 bch2_snapshot_parent(struct bch_fs *c, u32 id) +{ + rcu_read_lock(); + id = __bch2_snapshot_parent(c, id); + rcu_read_unlock(); + + return id; +} + +static inline u32 bch2_snapshot_nth_parent(struct bch_fs *c, u32 id, u32 n) +{ + rcu_read_lock(); + while (n--) + id = __bch2_snapshot_parent(c, id); + rcu_read_unlock(); + + return id; +} + +u32 bch2_snapshot_skiplist_get(struct bch_fs *, u32); + +static inline u32 bch2_snapshot_root(struct bch_fs *c, u32 id) +{ + u32 parent; + + rcu_read_lock(); + while ((parent = __bch2_snapshot_parent(c, id))) + id = parent; + rcu_read_unlock(); + + return id; +} + +static inline u32 __bch2_snapshot_equiv(struct bch_fs *c, u32 id) +{ + return snapshot_t(c, id)->equiv; +} + +static inline u32 bch2_snapshot_equiv(struct bch_fs *c, u32 id) +{ + rcu_read_lock(); + id = __bch2_snapshot_equiv(c, id); + rcu_read_unlock(); + + return id; +} + +static inline bool bch2_snapshot_is_equiv(struct bch_fs *c, u32 id) +{ + return id == bch2_snapshot_equiv(c, id); +} + +static inline bool bch2_snapshot_is_internal_node(struct bch_fs *c, u32 id) +{ + const struct snapshot_t *s; + bool ret; + + rcu_read_lock(); + s = snapshot_t(c, id); + ret = s->children[0]; + rcu_read_unlock(); + + return ret; +} + +static inline u32 bch2_snapshot_is_leaf(struct bch_fs *c, u32 id) +{ + return !bch2_snapshot_is_internal_node(c, id); +} + +static inline u32 bch2_snapshot_sibling(struct bch_fs *c, u32 id) +{ + const struct snapshot_t *s; + u32 parent = __bch2_snapshot_parent(c, id); + + if (!parent) + return 0; + + s = snapshot_t(c, __bch2_snapshot_parent(c, id)); + if (id == s->children[0]) + return s->children[1]; + if (id == s->children[1]) + return s->children[0]; + return 0; +} + +static inline u32 bch2_snapshot_depth(struct bch_fs *c, u32 parent) +{ + u32 depth; + + rcu_read_lock(); + depth = parent ? snapshot_t(c, parent)->depth + 1 : 0; + rcu_read_unlock(); + + return depth; +} + +bool __bch2_snapshot_is_ancestor(struct bch_fs *, u32, u32); + +static inline bool bch2_snapshot_is_ancestor(struct bch_fs *c, u32 id, u32 ancestor) +{ + return id == ancestor + ? true + : __bch2_snapshot_is_ancestor(c, id, ancestor); +} + +static inline bool bch2_snapshot_has_children(struct bch_fs *c, u32 id) +{ + const struct snapshot_t *t; + bool ret; + + rcu_read_lock(); + t = snapshot_t(c, id); + ret = (t->children[0]|t->children[1]) != 0; + rcu_read_unlock(); + + return ret; +} + +static inline bool snapshot_list_has_id(snapshot_id_list *s, u32 id) +{ + u32 *i; + + darray_for_each(*s, i) + if (*i == id) + return true; + return false; +} + +static inline bool snapshot_list_has_ancestor(struct bch_fs *c, snapshot_id_list *s, u32 id) +{ + u32 *i; + + darray_for_each(*s, i) + if (bch2_snapshot_is_ancestor(c, id, *i)) + return true; + return false; +} + +static inline int snapshot_list_add(struct bch_fs *c, snapshot_id_list *s, u32 id) +{ + int ret; + + BUG_ON(snapshot_list_has_id(s, id)); + ret = darray_push(s, id); + if (ret) + bch_err(c, "error reallocating snapshot_id_list (size %zu)", s->size); + return ret; +} + +int bch2_snapshot_lookup(struct btree_trans *trans, u32 id, + struct bch_snapshot *s); +int bch2_snapshot_get_subvol(struct btree_trans *, u32, + struct bch_subvolume *); + +/* only exported for tests: */ +int bch2_snapshot_node_create(struct btree_trans *, u32, + u32 *, u32 *, unsigned); + +int bch2_check_snapshot_trees(struct bch_fs *); +int bch2_check_snapshots(struct bch_fs *); + +int bch2_snapshot_node_set_deleted(struct btree_trans *, u32); +void bch2_delete_dead_snapshots_work(struct work_struct *); + +int __bch2_key_has_snapshot_overwrites(struct btree_trans *, enum btree_id, struct bpos); + +static inline int bch2_key_has_snapshot_overwrites(struct btree_trans *trans, + enum btree_id id, + struct bpos pos) +{ + if (!btree_type_has_snapshots(id) || + bch2_snapshot_is_leaf(trans->c, pos.snapshot)) + return 0; + + return __bch2_key_has_snapshot_overwrites(trans, id, pos); +} + +int bch2_propagate_key_to_snapshot_leaves(struct btree_trans *, enum btree_id, + struct bkey_s_c, struct bpos *); + +int bch2_snapshots_read(struct bch_fs *); +void bch2_fs_snapshots_exit(struct bch_fs *); + +#endif /* _BCACHEFS_SNAPSHOT_H */ diff --git a/fs/bcachefs/str_hash.h b/fs/bcachefs/str_hash.h new file mode 100644 index 000000000000..ae21a8cca1b4 --- /dev/null +++ b/fs/bcachefs/str_hash.h @@ -0,0 +1,370 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_STR_HASH_H +#define _BCACHEFS_STR_HASH_H + +#include "btree_iter.h" +#include "btree_update.h" +#include "checksum.h" +#include "error.h" +#include "inode.h" +#include "siphash.h" +#include "subvolume.h" +#include "super.h" + +#include <linux/crc32c.h> +#include <crypto/hash.h> +#include <crypto/sha2.h> + +static inline enum bch_str_hash_type +bch2_str_hash_opt_to_type(struct bch_fs *c, enum bch_str_hash_opts opt) +{ + switch (opt) { + case BCH_STR_HASH_OPT_crc32c: + return BCH_STR_HASH_crc32c; + case BCH_STR_HASH_OPT_crc64: + return BCH_STR_HASH_crc64; + case BCH_STR_HASH_OPT_siphash: + return c->sb.features & (1ULL << BCH_FEATURE_new_siphash) + ? BCH_STR_HASH_siphash + : BCH_STR_HASH_siphash_old; + default: + BUG(); + } +} + +struct bch_hash_info { + u8 type; + /* + * For crc32 or crc64 string hashes the first key value of + * the siphash_key (k0) is used as the key. + */ + SIPHASH_KEY siphash_key; +}; + +static inline struct bch_hash_info +bch2_hash_info_init(struct bch_fs *c, const struct bch_inode_unpacked *bi) +{ + /* XXX ick */ + struct bch_hash_info info = { + .type = (bi->bi_flags >> INODE_STR_HASH_OFFSET) & + ~(~0U << INODE_STR_HASH_BITS), + .siphash_key = { .k0 = bi->bi_hash_seed } + }; + + if (unlikely(info.type == BCH_STR_HASH_siphash_old)) { + SHASH_DESC_ON_STACK(desc, c->sha256); + u8 digest[SHA256_DIGEST_SIZE]; + + desc->tfm = c->sha256; + + crypto_shash_digest(desc, (void *) &bi->bi_hash_seed, + sizeof(bi->bi_hash_seed), digest); + memcpy(&info.siphash_key, digest, sizeof(info.siphash_key)); + } + + return info; +} + +struct bch_str_hash_ctx { + union { + u32 crc32c; + u64 crc64; + SIPHASH_CTX siphash; + }; +}; + +static inline void bch2_str_hash_init(struct bch_str_hash_ctx *ctx, + const struct bch_hash_info *info) +{ + switch (info->type) { + case BCH_STR_HASH_crc32c: + ctx->crc32c = crc32c(~0, &info->siphash_key.k0, + sizeof(info->siphash_key.k0)); + break; + case BCH_STR_HASH_crc64: + ctx->crc64 = crc64_be(~0, &info->siphash_key.k0, + sizeof(info->siphash_key.k0)); + break; + case BCH_STR_HASH_siphash_old: + case BCH_STR_HASH_siphash: + SipHash24_Init(&ctx->siphash, &info->siphash_key); + break; + default: + BUG(); + } +} + +static inline void bch2_str_hash_update(struct bch_str_hash_ctx *ctx, + const struct bch_hash_info *info, + const void *data, size_t len) +{ + switch (info->type) { + case BCH_STR_HASH_crc32c: + ctx->crc32c = crc32c(ctx->crc32c, data, len); + break; + case BCH_STR_HASH_crc64: + ctx->crc64 = crc64_be(ctx->crc64, data, len); + break; + case BCH_STR_HASH_siphash_old: + case BCH_STR_HASH_siphash: + SipHash24_Update(&ctx->siphash, data, len); + break; + default: + BUG(); + } +} + +static inline u64 bch2_str_hash_end(struct bch_str_hash_ctx *ctx, + const struct bch_hash_info *info) +{ + switch (info->type) { + case BCH_STR_HASH_crc32c: + return ctx->crc32c; + case BCH_STR_HASH_crc64: + return ctx->crc64 >> 1; + case BCH_STR_HASH_siphash_old: + case BCH_STR_HASH_siphash: + return SipHash24_End(&ctx->siphash) >> 1; + default: + BUG(); + } +} + +struct bch_hash_desc { + enum btree_id btree_id; + u8 key_type; + + u64 (*hash_key)(const struct bch_hash_info *, const void *); + u64 (*hash_bkey)(const struct bch_hash_info *, struct bkey_s_c); + bool (*cmp_key)(struct bkey_s_c, const void *); + bool (*cmp_bkey)(struct bkey_s_c, struct bkey_s_c); + bool (*is_visible)(subvol_inum inum, struct bkey_s_c); +}; + +static inline bool is_visible_key(struct bch_hash_desc desc, subvol_inum inum, struct bkey_s_c k) +{ + return k.k->type == desc.key_type && + (!desc.is_visible || + !inum.inum || + desc.is_visible(inum, k)); +} + +static __always_inline int +bch2_hash_lookup(struct btree_trans *trans, + struct btree_iter *iter, + const struct bch_hash_desc desc, + const struct bch_hash_info *info, + subvol_inum inum, const void *key, + unsigned flags) +{ + struct bkey_s_c k; + u32 snapshot; + int ret; + + ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot); + if (ret) + return ret; + + for_each_btree_key_upto_norestart(trans, *iter, desc.btree_id, + SPOS(inum.inum, desc.hash_key(info, key), snapshot), + POS(inum.inum, U64_MAX), + BTREE_ITER_SLOTS|flags, k, ret) { + if (is_visible_key(desc, inum, k)) { + if (!desc.cmp_key(k, key)) + return 0; + } else if (k.k->type == KEY_TYPE_hash_whiteout) { + ; + } else { + /* hole, not found */ + break; + } + } + bch2_trans_iter_exit(trans, iter); + + return ret ?: -BCH_ERR_ENOENT_str_hash_lookup; +} + +static __always_inline int +bch2_hash_hole(struct btree_trans *trans, + struct btree_iter *iter, + const struct bch_hash_desc desc, + const struct bch_hash_info *info, + subvol_inum inum, const void *key) +{ + struct bkey_s_c k; + u32 snapshot; + int ret; + + ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot); + if (ret) + return ret; + + for_each_btree_key_upto_norestart(trans, *iter, desc.btree_id, + SPOS(inum.inum, desc.hash_key(info, key), snapshot), + POS(inum.inum, U64_MAX), + BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) + if (!is_visible_key(desc, inum, k)) + return 0; + bch2_trans_iter_exit(trans, iter); + + return ret ?: -BCH_ERR_ENOSPC_str_hash_create; +} + +static __always_inline +int bch2_hash_needs_whiteout(struct btree_trans *trans, + const struct bch_hash_desc desc, + const struct bch_hash_info *info, + struct btree_iter *start) +{ + struct btree_iter iter; + struct bkey_s_c k; + int ret; + + bch2_trans_copy_iter(&iter, start); + + bch2_btree_iter_advance(&iter); + + for_each_btree_key_continue_norestart(iter, BTREE_ITER_SLOTS, k, ret) { + if (k.k->type != desc.key_type && + k.k->type != KEY_TYPE_hash_whiteout) + break; + + if (k.k->type == desc.key_type && + desc.hash_bkey(info, k) <= start->pos.offset) { + ret = 1; + break; + } + } + + bch2_trans_iter_exit(trans, &iter); + return ret; +} + +static __always_inline +int bch2_hash_set_snapshot(struct btree_trans *trans, + const struct bch_hash_desc desc, + const struct bch_hash_info *info, + subvol_inum inum, u32 snapshot, + struct bkey_i *insert, + int flags, + int update_flags) +{ + struct btree_iter iter, slot = { NULL }; + struct bkey_s_c k; + bool found = false; + int ret; + + for_each_btree_key_upto_norestart(trans, iter, desc.btree_id, + SPOS(insert->k.p.inode, + desc.hash_bkey(info, bkey_i_to_s_c(insert)), + snapshot), + POS(insert->k.p.inode, U64_MAX), + BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) { + if (is_visible_key(desc, inum, k)) { + if (!desc.cmp_bkey(k, bkey_i_to_s_c(insert))) + goto found; + + /* hash collision: */ + continue; + } + + if (!slot.path && + !(flags & BCH_HASH_SET_MUST_REPLACE)) + bch2_trans_copy_iter(&slot, &iter); + + if (k.k->type != KEY_TYPE_hash_whiteout) + goto not_found; + } + + if (!ret) + ret = -BCH_ERR_ENOSPC_str_hash_create; +out: + bch2_trans_iter_exit(trans, &slot); + bch2_trans_iter_exit(trans, &iter); + + return ret; +found: + found = true; +not_found: + + if (!found && (flags & BCH_HASH_SET_MUST_REPLACE)) { + ret = -BCH_ERR_ENOENT_str_hash_set_must_replace; + } else if (found && (flags & BCH_HASH_SET_MUST_CREATE)) { + ret = -EEXIST; + } else { + if (!found && slot.path) + swap(iter, slot); + + insert->k.p = iter.pos; + ret = bch2_trans_update(trans, &iter, insert, 0); + } + + goto out; +} + +static __always_inline +int bch2_hash_set(struct btree_trans *trans, + const struct bch_hash_desc desc, + const struct bch_hash_info *info, + subvol_inum inum, + struct bkey_i *insert, int flags) +{ + u32 snapshot; + int ret; + + ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot); + if (ret) + return ret; + + insert->k.p.inode = inum.inum; + + return bch2_hash_set_snapshot(trans, desc, info, inum, + snapshot, insert, flags, 0); +} + +static __always_inline +int bch2_hash_delete_at(struct btree_trans *trans, + const struct bch_hash_desc desc, + const struct bch_hash_info *info, + struct btree_iter *iter, + unsigned update_flags) +{ + struct bkey_i *delete; + int ret; + + delete = bch2_trans_kmalloc(trans, sizeof(*delete)); + ret = PTR_ERR_OR_ZERO(delete); + if (ret) + return ret; + + ret = bch2_hash_needs_whiteout(trans, desc, info, iter); + if (ret < 0) + return ret; + + bkey_init(&delete->k); + delete->k.p = iter->pos; + delete->k.type = ret ? KEY_TYPE_hash_whiteout : KEY_TYPE_deleted; + + return bch2_trans_update(trans, iter, delete, update_flags); +} + +static __always_inline +int bch2_hash_delete(struct btree_trans *trans, + const struct bch_hash_desc desc, + const struct bch_hash_info *info, + subvol_inum inum, const void *key) +{ + struct btree_iter iter; + int ret; + + ret = bch2_hash_lookup(trans, &iter, desc, info, inum, key, + BTREE_ITER_INTENT); + if (ret) + return ret; + + ret = bch2_hash_delete_at(trans, desc, info, &iter, 0); + bch2_trans_iter_exit(trans, &iter); + return ret; +} + +#endif /* _BCACHEFS_STR_HASH_H */ diff --git a/fs/bcachefs/subvolume.c b/fs/bcachefs/subvolume.c new file mode 100644 index 000000000000..fccd25aa3242 --- /dev/null +++ b/fs/bcachefs/subvolume.c @@ -0,0 +1,437 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" +#include "btree_key_cache.h" +#include "btree_update.h" +#include "errcode.h" +#include "error.h" +#include "fs.h" +#include "snapshot.h" +#include "subvolume.h" + +#include <linux/random.h> + +static int bch2_subvolume_delete(struct btree_trans *, u32); + +static int check_subvol(struct btree_trans *trans, + struct btree_iter *iter, + struct bkey_s_c k) +{ + struct bch_fs *c = trans->c; + struct bkey_s_c_subvolume subvol; + struct bch_snapshot snapshot; + unsigned snapid; + int ret = 0; + + if (k.k->type != KEY_TYPE_subvolume) + return 0; + + subvol = bkey_s_c_to_subvolume(k); + snapid = le32_to_cpu(subvol.v->snapshot); + ret = bch2_snapshot_lookup(trans, snapid, &snapshot); + + if (bch2_err_matches(ret, ENOENT)) + bch_err(c, "subvolume %llu points to nonexistent snapshot %u", + k.k->p.offset, snapid); + if (ret) + return ret; + + if (BCH_SUBVOLUME_UNLINKED(subvol.v)) { + bch2_fs_lazy_rw(c); + + ret = bch2_subvolume_delete(trans, iter->pos.offset); + if (ret) + bch_err_msg(c, ret, "deleting subvolume %llu", iter->pos.offset); + return ret ?: -BCH_ERR_transaction_restart_nested; + } + + if (!BCH_SUBVOLUME_SNAP(subvol.v)) { + u32 snapshot_root = bch2_snapshot_root(c, le32_to_cpu(subvol.v->snapshot)); + u32 snapshot_tree; + struct bch_snapshot_tree st; + + rcu_read_lock(); + snapshot_tree = snapshot_t(c, snapshot_root)->tree; + rcu_read_unlock(); + + ret = bch2_snapshot_tree_lookup(trans, snapshot_tree, &st); + + bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), c, + "%s: snapshot tree %u not found", __func__, snapshot_tree); + + if (ret) + return ret; + + if (fsck_err_on(le32_to_cpu(st.master_subvol) != subvol.k->p.offset, + c, subvol_not_master_and_not_snapshot, + "subvolume %llu is not set as snapshot but is not master subvolume", + k.k->p.offset)) { + struct bkey_i_subvolume *s = + bch2_bkey_make_mut_typed(trans, iter, &subvol.s_c, 0, subvolume); + ret = PTR_ERR_OR_ZERO(s); + if (ret) + return ret; + + SET_BCH_SUBVOLUME_SNAP(&s->v, true); + } + } + +fsck_err: + return ret; +} + +int bch2_check_subvols(struct bch_fs *c) +{ + struct btree_iter iter; + struct bkey_s_c k; + int ret; + + ret = bch2_trans_run(c, + for_each_btree_key_commit(trans, iter, + BTREE_ID_subvolumes, POS_MIN, BTREE_ITER_PREFETCH, k, + NULL, NULL, BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL, + check_subvol(trans, &iter, k))); + if (ret) + bch_err_fn(c, ret); + return ret; +} + +/* Subvolumes: */ + +int bch2_subvolume_invalid(struct bch_fs *c, struct bkey_s_c k, + enum bkey_invalid_flags flags, struct printbuf *err) +{ + int ret = 0; + + bkey_fsck_err_on(bkey_lt(k.k->p, SUBVOL_POS_MIN) || + bkey_gt(k.k->p, SUBVOL_POS_MAX), c, err, + subvol_pos_bad, + "invalid pos"); +fsck_err: + return ret; +} + +void bch2_subvolume_to_text(struct printbuf *out, struct bch_fs *c, + struct bkey_s_c k) +{ + struct bkey_s_c_subvolume s = bkey_s_c_to_subvolume(k); + + prt_printf(out, "root %llu snapshot id %u", + le64_to_cpu(s.v->inode), + le32_to_cpu(s.v->snapshot)); + + if (bkey_val_bytes(s.k) > offsetof(struct bch_subvolume, parent)) + prt_printf(out, " parent %u", le32_to_cpu(s.v->parent)); +} + +static __always_inline int +bch2_subvolume_get_inlined(struct btree_trans *trans, unsigned subvol, + bool inconsistent_if_not_found, + int iter_flags, + struct bch_subvolume *s) +{ + int ret = bch2_bkey_get_val_typed(trans, BTREE_ID_subvolumes, POS(0, subvol), + iter_flags, subvolume, s); + bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT) && + inconsistent_if_not_found, + trans->c, "missing subvolume %u", subvol); + return ret; +} + +int bch2_subvolume_get(struct btree_trans *trans, unsigned subvol, + bool inconsistent_if_not_found, + int iter_flags, + struct bch_subvolume *s) +{ + return bch2_subvolume_get_inlined(trans, subvol, inconsistent_if_not_found, iter_flags, s); +} + +int bch2_snapshot_get_subvol(struct btree_trans *trans, u32 snapshot, + struct bch_subvolume *subvol) +{ + struct bch_snapshot snap; + + return bch2_snapshot_lookup(trans, snapshot, &snap) ?: + bch2_subvolume_get(trans, le32_to_cpu(snap.subvol), true, 0, subvol); +} + +int bch2_subvolume_get_snapshot(struct btree_trans *trans, u32 subvolid, + u32 *snapid) +{ + struct btree_iter iter; + struct bkey_s_c_subvolume subvol; + int ret; + + subvol = bch2_bkey_get_iter_typed(trans, &iter, + BTREE_ID_subvolumes, POS(0, subvolid), + BTREE_ITER_CACHED|BTREE_ITER_WITH_UPDATES, + subvolume); + ret = bkey_err(subvol); + bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), trans->c, + "missing subvolume %u", subvolid); + + if (likely(!ret)) + *snapid = le32_to_cpu(subvol.v->snapshot); + bch2_trans_iter_exit(trans, &iter); + return ret; +} + +static int bch2_subvolume_reparent(struct btree_trans *trans, + struct btree_iter *iter, + struct bkey_s_c k, + u32 old_parent, u32 new_parent) +{ + struct bkey_i_subvolume *s; + int ret; + + if (k.k->type != KEY_TYPE_subvolume) + return 0; + + if (bkey_val_bytes(k.k) > offsetof(struct bch_subvolume, parent) && + le32_to_cpu(bkey_s_c_to_subvolume(k).v->parent) != old_parent) + return 0; + + s = bch2_bkey_make_mut_typed(trans, iter, &k, 0, subvolume); + ret = PTR_ERR_OR_ZERO(s); + if (ret) + return ret; + + s->v.parent = cpu_to_le32(new_parent); + return 0; +} + +/* + * Separate from the snapshot tree in the snapshots btree, we record the tree + * structure of how snapshot subvolumes were created - the parent subvolume of + * each snapshot subvolume. + * + * When a subvolume is deleted, we scan for child subvolumes and reparant them, + * to avoid dangling references: + */ +static int bch2_subvolumes_reparent(struct btree_trans *trans, u32 subvolid_to_delete) +{ + struct btree_iter iter; + struct bkey_s_c k; + struct bch_subvolume s; + + return lockrestart_do(trans, + bch2_subvolume_get(trans, subvolid_to_delete, true, + BTREE_ITER_CACHED, &s)) ?: + for_each_btree_key_commit(trans, iter, + BTREE_ID_subvolumes, POS_MIN, BTREE_ITER_PREFETCH, k, + NULL, NULL, BTREE_INSERT_NOFAIL, + bch2_subvolume_reparent(trans, &iter, k, + subvolid_to_delete, le32_to_cpu(s.parent))); +} + +/* + * Delete subvolume, mark snapshot ID as deleted, queue up snapshot + * deletion/cleanup: + */ +static int __bch2_subvolume_delete(struct btree_trans *trans, u32 subvolid) +{ + struct btree_iter iter; + struct bkey_s_c_subvolume subvol; + u32 snapid; + int ret = 0; + + subvol = bch2_bkey_get_iter_typed(trans, &iter, + BTREE_ID_subvolumes, POS(0, subvolid), + BTREE_ITER_CACHED|BTREE_ITER_INTENT, + subvolume); + ret = bkey_err(subvol); + bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), trans->c, + "missing subvolume %u", subvolid); + if (ret) + return ret; + + snapid = le32_to_cpu(subvol.v->snapshot); + + ret = bch2_btree_delete_at(trans, &iter, 0) ?: + bch2_snapshot_node_set_deleted(trans, snapid); + bch2_trans_iter_exit(trans, &iter); + return ret; +} + +static int bch2_subvolume_delete(struct btree_trans *trans, u32 subvolid) +{ + return bch2_subvolumes_reparent(trans, subvolid) ?: + commit_do(trans, NULL, NULL, BTREE_INSERT_NOFAIL, + __bch2_subvolume_delete(trans, subvolid)); +} + +static void bch2_subvolume_wait_for_pagecache_and_delete(struct work_struct *work) +{ + struct bch_fs *c = container_of(work, struct bch_fs, + snapshot_wait_for_pagecache_and_delete_work); + snapshot_id_list s; + u32 *id; + int ret = 0; + + while (!ret) { + mutex_lock(&c->snapshots_unlinked_lock); + s = c->snapshots_unlinked; + darray_init(&c->snapshots_unlinked); + mutex_unlock(&c->snapshots_unlinked_lock); + + if (!s.nr) + break; + + bch2_evict_subvolume_inodes(c, &s); + + for (id = s.data; id < s.data + s.nr; id++) { + ret = bch2_trans_run(c, bch2_subvolume_delete(trans, *id)); + if (ret) { + bch_err_msg(c, ret, "deleting subvolume %u", *id); + break; + } + } + + darray_exit(&s); + } + + bch2_write_ref_put(c, BCH_WRITE_REF_snapshot_delete_pagecache); +} + +struct subvolume_unlink_hook { + struct btree_trans_commit_hook h; + u32 subvol; +}; + +static int bch2_subvolume_wait_for_pagecache_and_delete_hook(struct btree_trans *trans, + struct btree_trans_commit_hook *_h) +{ + struct subvolume_unlink_hook *h = container_of(_h, struct subvolume_unlink_hook, h); + struct bch_fs *c = trans->c; + int ret = 0; + + mutex_lock(&c->snapshots_unlinked_lock); + if (!snapshot_list_has_id(&c->snapshots_unlinked, h->subvol)) + ret = snapshot_list_add(c, &c->snapshots_unlinked, h->subvol); + mutex_unlock(&c->snapshots_unlinked_lock); + + if (ret) + return ret; + + if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_snapshot_delete_pagecache)) + return -EROFS; + + if (!queue_work(c->write_ref_wq, &c->snapshot_wait_for_pagecache_and_delete_work)) + bch2_write_ref_put(c, BCH_WRITE_REF_snapshot_delete_pagecache); + return 0; +} + +int bch2_subvolume_unlink(struct btree_trans *trans, u32 subvolid) +{ + struct btree_iter iter; + struct bkey_i_subvolume *n; + struct subvolume_unlink_hook *h; + int ret = 0; + + h = bch2_trans_kmalloc(trans, sizeof(*h)); + ret = PTR_ERR_OR_ZERO(h); + if (ret) + return ret; + + h->h.fn = bch2_subvolume_wait_for_pagecache_and_delete_hook; + h->subvol = subvolid; + bch2_trans_commit_hook(trans, &h->h); + + n = bch2_bkey_get_mut_typed(trans, &iter, + BTREE_ID_subvolumes, POS(0, subvolid), + BTREE_ITER_CACHED, subvolume); + ret = PTR_ERR_OR_ZERO(n); + if (unlikely(ret)) { + bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), trans->c, + "missing subvolume %u", subvolid); + return ret; + } + + SET_BCH_SUBVOLUME_UNLINKED(&n->v, true); + bch2_trans_iter_exit(trans, &iter); + return ret; +} + +int bch2_subvolume_create(struct btree_trans *trans, u64 inode, + u32 src_subvolid, + u32 *new_subvolid, + u32 *new_snapshotid, + bool ro) +{ + struct bch_fs *c = trans->c; + struct btree_iter dst_iter, src_iter = (struct btree_iter) { NULL }; + struct bkey_i_subvolume *new_subvol = NULL; + struct bkey_i_subvolume *src_subvol = NULL; + u32 parent = 0, new_nodes[2], snapshot_subvols[2]; + int ret = 0; + + ret = bch2_bkey_get_empty_slot(trans, &dst_iter, + BTREE_ID_subvolumes, POS(0, U32_MAX)); + if (ret == -BCH_ERR_ENOSPC_btree_slot) + ret = -BCH_ERR_ENOSPC_subvolume_create; + if (ret) + return ret; + + snapshot_subvols[0] = dst_iter.pos.offset; + snapshot_subvols[1] = src_subvolid; + + if (src_subvolid) { + /* Creating a snapshot: */ + + src_subvol = bch2_bkey_get_mut_typed(trans, &src_iter, + BTREE_ID_subvolumes, POS(0, src_subvolid), + BTREE_ITER_CACHED, subvolume); + ret = PTR_ERR_OR_ZERO(src_subvol); + if (unlikely(ret)) { + bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), c, + "subvolume %u not found", src_subvolid); + goto err; + } + + parent = le32_to_cpu(src_subvol->v.snapshot); + } + + ret = bch2_snapshot_node_create(trans, parent, new_nodes, + snapshot_subvols, + src_subvolid ? 2 : 1); + if (ret) + goto err; + + if (src_subvolid) { + src_subvol->v.snapshot = cpu_to_le32(new_nodes[1]); + ret = bch2_trans_update(trans, &src_iter, &src_subvol->k_i, 0); + if (ret) + goto err; + } + + new_subvol = bch2_bkey_alloc(trans, &dst_iter, 0, subvolume); + ret = PTR_ERR_OR_ZERO(new_subvol); + if (ret) + goto err; + + new_subvol->v.flags = 0; + new_subvol->v.snapshot = cpu_to_le32(new_nodes[0]); + new_subvol->v.inode = cpu_to_le64(inode); + new_subvol->v.parent = cpu_to_le32(src_subvolid); + new_subvol->v.otime.lo = cpu_to_le64(bch2_current_time(c)); + new_subvol->v.otime.hi = 0; + + SET_BCH_SUBVOLUME_RO(&new_subvol->v, ro); + SET_BCH_SUBVOLUME_SNAP(&new_subvol->v, src_subvolid != 0); + + *new_subvolid = new_subvol->k.p.offset; + *new_snapshotid = new_nodes[0]; +err: + bch2_trans_iter_exit(trans, &src_iter); + bch2_trans_iter_exit(trans, &dst_iter); + return ret; +} + +int bch2_fs_subvolumes_init(struct bch_fs *c) +{ + INIT_WORK(&c->snapshot_delete_work, bch2_delete_dead_snapshots_work); + INIT_WORK(&c->snapshot_wait_for_pagecache_and_delete_work, + bch2_subvolume_wait_for_pagecache_and_delete); + mutex_init(&c->snapshots_unlinked_lock); + return 0; +} diff --git a/fs/bcachefs/subvolume.h b/fs/bcachefs/subvolume.h new file mode 100644 index 000000000000..a1003d30ab0a --- /dev/null +++ b/fs/bcachefs/subvolume.h @@ -0,0 +1,35 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_SUBVOLUME_H +#define _BCACHEFS_SUBVOLUME_H + +#include "darray.h" +#include "subvolume_types.h" + +enum bkey_invalid_flags; + +int bch2_check_subvols(struct bch_fs *); + +int bch2_subvolume_invalid(struct bch_fs *, struct bkey_s_c, + enum bkey_invalid_flags, struct printbuf *); +void bch2_subvolume_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); + +#define bch2_bkey_ops_subvolume ((struct bkey_ops) { \ + .key_invalid = bch2_subvolume_invalid, \ + .val_to_text = bch2_subvolume_to_text, \ + .min_val_size = 16, \ +}) + +int bch2_subvolume_get(struct btree_trans *, unsigned, + bool, int, struct bch_subvolume *); +int bch2_subvolume_get_snapshot(struct btree_trans *, u32, u32 *); + +int bch2_delete_dead_snapshots(struct bch_fs *); +void bch2_delete_dead_snapshots_async(struct bch_fs *); + +int bch2_subvolume_unlink(struct btree_trans *, u32); +int bch2_subvolume_create(struct btree_trans *, u64, u32, + u32 *, u32 *, bool); + +int bch2_fs_subvolumes_init(struct bch_fs *); + +#endif /* _BCACHEFS_SUBVOLUME_H */ diff --git a/fs/bcachefs/subvolume_types.h b/fs/bcachefs/subvolume_types.h new file mode 100644 index 000000000000..2d2e66a4e468 --- /dev/null +++ b/fs/bcachefs/subvolume_types.h @@ -0,0 +1,31 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_SUBVOLUME_TYPES_H +#define _BCACHEFS_SUBVOLUME_TYPES_H + +#include "darray.h" + +typedef DARRAY(u32) snapshot_id_list; + +#define IS_ANCESTOR_BITMAP 128 + +struct snapshot_t { + u32 parent; + u32 skip[3]; + u32 depth; + u32 children[2]; + u32 subvol; /* Nonzero only if a subvolume points to this node: */ + u32 tree; + u32 equiv; + unsigned long is_ancestor[BITS_TO_LONGS(IS_ANCESTOR_BITMAP)]; +}; + +struct snapshot_table { + DECLARE_FLEX_ARRAY(struct snapshot_t, s); +}; + +typedef struct { + u32 subvol; + u64 inum; +} subvol_inum; + +#endif /* _BCACHEFS_SUBVOLUME_TYPES_H */ diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c new file mode 100644 index 000000000000..f3e12f7979d5 --- /dev/null +++ b/fs/bcachefs/super-io.c @@ -0,0 +1,1271 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" +#include "checksum.h" +#include "counters.h" +#include "disk_groups.h" +#include "ec.h" +#include "error.h" +#include "journal.h" +#include "journal_sb.h" +#include "journal_seq_blacklist.h" +#include "recovery.h" +#include "replicas.h" +#include "quota.h" +#include "sb-clean.h" +#include "sb-errors.h" +#include "sb-members.h" +#include "super-io.h" +#include "super.h" +#include "trace.h" +#include "vstructs.h" + +#include <linux/backing-dev.h> +#include <linux/sort.h> + +static const struct blk_holder_ops bch2_sb_handle_bdev_ops = { +}; + +struct bch2_metadata_version { + u16 version; + const char *name; + u64 recovery_passes; +}; + +static const struct bch2_metadata_version bch2_metadata_versions[] = { +#define x(n, v, _recovery_passes) { \ + .version = v, \ + .name = #n, \ + .recovery_passes = _recovery_passes, \ +}, + BCH_METADATA_VERSIONS() +#undef x +}; + +void bch2_version_to_text(struct printbuf *out, unsigned v) +{ + const char *str = "(unknown version)"; + + for (unsigned i = 0; i < ARRAY_SIZE(bch2_metadata_versions); i++) + if (bch2_metadata_versions[i].version == v) { + str = bch2_metadata_versions[i].name; + break; + } + + prt_printf(out, "%u.%u: %s", BCH_VERSION_MAJOR(v), BCH_VERSION_MINOR(v), str); +} + +unsigned bch2_latest_compatible_version(unsigned v) +{ + if (!BCH_VERSION_MAJOR(v)) + return v; + + for (unsigned i = 0; i < ARRAY_SIZE(bch2_metadata_versions); i++) + if (bch2_metadata_versions[i].version > v && + BCH_VERSION_MAJOR(bch2_metadata_versions[i].version) == + BCH_VERSION_MAJOR(v)) + v = bch2_metadata_versions[i].version; + + return v; +} + +u64 bch2_upgrade_recovery_passes(struct bch_fs *c, + unsigned old_version, + unsigned new_version) +{ + u64 ret = 0; + + for (const struct bch2_metadata_version *i = bch2_metadata_versions; + i < bch2_metadata_versions + ARRAY_SIZE(bch2_metadata_versions); + i++) + if (i->version > old_version && i->version <= new_version) { + if (i->recovery_passes & RECOVERY_PASS_ALL_FSCK) + ret |= bch2_fsck_recovery_passes(); + ret |= i->recovery_passes; + } + + return ret &= ~RECOVERY_PASS_ALL_FSCK; +} + +const char * const bch2_sb_fields[] = { +#define x(name, nr) #name, + BCH_SB_FIELDS() +#undef x + NULL +}; + +static int bch2_sb_field_validate(struct bch_sb *, struct bch_sb_field *, + struct printbuf *); + +struct bch_sb_field *bch2_sb_field_get_id(struct bch_sb *sb, + enum bch_sb_field_type type) +{ + struct bch_sb_field *f; + + /* XXX: need locking around superblock to access optional fields */ + + vstruct_for_each(sb, f) + if (le32_to_cpu(f->type) == type) + return f; + return NULL; +} + +static struct bch_sb_field *__bch2_sb_field_resize(struct bch_sb_handle *sb, + struct bch_sb_field *f, + unsigned u64s) +{ + unsigned old_u64s = f ? le32_to_cpu(f->u64s) : 0; + unsigned sb_u64s = le32_to_cpu(sb->sb->u64s) + u64s - old_u64s; + + BUG_ON(__vstruct_bytes(struct bch_sb, sb_u64s) > sb->buffer_size); + + if (!f && !u64s) { + /* nothing to do: */ + } else if (!f) { + f = vstruct_last(sb->sb); + memset(f, 0, sizeof(u64) * u64s); + f->u64s = cpu_to_le32(u64s); + f->type = 0; + } else { + void *src, *dst; + + src = vstruct_end(f); + + if (u64s) { + f->u64s = cpu_to_le32(u64s); + dst = vstruct_end(f); + } else { + dst = f; + } + + memmove(dst, src, vstruct_end(sb->sb) - src); + + if (dst > src) + memset(src, 0, dst - src); + } + + sb->sb->u64s = cpu_to_le32(sb_u64s); + + return u64s ? f : NULL; +} + +void bch2_sb_field_delete(struct bch_sb_handle *sb, + enum bch_sb_field_type type) +{ + struct bch_sb_field *f = bch2_sb_field_get_id(sb->sb, type); + + if (f) + __bch2_sb_field_resize(sb, f, 0); +} + +/* Superblock realloc/free: */ + +void bch2_free_super(struct bch_sb_handle *sb) +{ + kfree(sb->bio); + if (!IS_ERR_OR_NULL(sb->bdev)) + blkdev_put(sb->bdev, sb->holder); + kfree(sb->holder); + kfree(sb->sb_name); + + kfree(sb->sb); + memset(sb, 0, sizeof(*sb)); +} + +int bch2_sb_realloc(struct bch_sb_handle *sb, unsigned u64s) +{ + size_t new_bytes = __vstruct_bytes(struct bch_sb, u64s); + size_t new_buffer_size; + struct bch_sb *new_sb; + struct bio *bio; + + if (sb->bdev) + new_bytes = max_t(size_t, new_bytes, bdev_logical_block_size(sb->bdev)); + + new_buffer_size = roundup_pow_of_two(new_bytes); + + if (sb->sb && sb->buffer_size >= new_buffer_size) + return 0; + + if (sb->sb && sb->have_layout) { + u64 max_bytes = 512 << sb->sb->layout.sb_max_size_bits; + + if (new_bytes > max_bytes) { + pr_err("%pg: superblock too big: want %zu but have %llu", + sb->bdev, new_bytes, max_bytes); + return -BCH_ERR_ENOSPC_sb; + } + } + + if (sb->buffer_size >= new_buffer_size && sb->sb) + return 0; + + if (dynamic_fault("bcachefs:add:super_realloc")) + return -BCH_ERR_ENOMEM_sb_realloc_injected; + + new_sb = krealloc(sb->sb, new_buffer_size, GFP_NOFS|__GFP_ZERO); + if (!new_sb) + return -BCH_ERR_ENOMEM_sb_buf_realloc; + + sb->sb = new_sb; + + if (sb->have_bio) { + unsigned nr_bvecs = buf_pages(sb->sb, new_buffer_size); + + bio = bio_kmalloc(nr_bvecs, GFP_KERNEL); + if (!bio) + return -BCH_ERR_ENOMEM_sb_bio_realloc; + + bio_init(bio, NULL, bio->bi_inline_vecs, nr_bvecs, 0); + + kfree(sb->bio); + sb->bio = bio; + } + + sb->buffer_size = new_buffer_size; + + return 0; +} + +struct bch_sb_field *bch2_sb_field_resize_id(struct bch_sb_handle *sb, + enum bch_sb_field_type type, + unsigned u64s) +{ + struct bch_sb_field *f = bch2_sb_field_get_id(sb->sb, type); + ssize_t old_u64s = f ? le32_to_cpu(f->u64s) : 0; + ssize_t d = -old_u64s + u64s; + + if (bch2_sb_realloc(sb, le32_to_cpu(sb->sb->u64s) + d)) + return NULL; + + if (sb->fs_sb) { + struct bch_fs *c = container_of(sb, struct bch_fs, disk_sb); + struct bch_dev *ca; + unsigned i; + + lockdep_assert_held(&c->sb_lock); + + /* XXX: we're not checking that offline device have enough space */ + + for_each_online_member(ca, c, i) { + struct bch_sb_handle *dev_sb = &ca->disk_sb; + + if (bch2_sb_realloc(dev_sb, le32_to_cpu(dev_sb->sb->u64s) + d)) { + percpu_ref_put(&ca->ref); + return NULL; + } + } + } + + f = bch2_sb_field_get_id(sb->sb, type); + f = __bch2_sb_field_resize(sb, f, u64s); + if (f) + f->type = cpu_to_le32(type); + return f; +} + +/* Superblock validate: */ + +static int validate_sb_layout(struct bch_sb_layout *layout, struct printbuf *out) +{ + u64 offset, prev_offset, max_sectors; + unsigned i; + + BUILD_BUG_ON(sizeof(struct bch_sb_layout) != 512); + + if (!uuid_equal(&layout->magic, &BCACHE_MAGIC) && + !uuid_equal(&layout->magic, &BCHFS_MAGIC)) { + prt_printf(out, "Not a bcachefs superblock layout"); + return -BCH_ERR_invalid_sb_layout; + } + + if (layout->layout_type != 0) { + prt_printf(out, "Invalid superblock layout type %u", + layout->layout_type); + return -BCH_ERR_invalid_sb_layout_type; + } + + if (!layout->nr_superblocks) { + prt_printf(out, "Invalid superblock layout: no superblocks"); + return -BCH_ERR_invalid_sb_layout_nr_superblocks; + } + + if (layout->nr_superblocks > ARRAY_SIZE(layout->sb_offset)) { + prt_printf(out, "Invalid superblock layout: too many superblocks"); + return -BCH_ERR_invalid_sb_layout_nr_superblocks; + } + + max_sectors = 1 << layout->sb_max_size_bits; + + prev_offset = le64_to_cpu(layout->sb_offset[0]); + + for (i = 1; i < layout->nr_superblocks; i++) { + offset = le64_to_cpu(layout->sb_offset[i]); + + if (offset < prev_offset + max_sectors) { + prt_printf(out, "Invalid superblock layout: superblocks overlap\n" + " (sb %u ends at %llu next starts at %llu", + i - 1, prev_offset + max_sectors, offset); + return -BCH_ERR_invalid_sb_layout_superblocks_overlap; + } + prev_offset = offset; + } + + return 0; +} + +static int bch2_sb_compatible(struct bch_sb *sb, struct printbuf *out) +{ + u16 version = le16_to_cpu(sb->version); + u16 version_min = le16_to_cpu(sb->version_min); + + if (!bch2_version_compatible(version)) { + prt_str(out, "Unsupported superblock version "); + bch2_version_to_text(out, version); + prt_str(out, " (min "); + bch2_version_to_text(out, bcachefs_metadata_version_min); + prt_str(out, ", max "); + bch2_version_to_text(out, bcachefs_metadata_version_current); + prt_str(out, ")"); + return -BCH_ERR_invalid_sb_version; + } + + if (!bch2_version_compatible(version_min)) { + prt_str(out, "Unsupported superblock version_min "); + bch2_version_to_text(out, version_min); + prt_str(out, " (min "); + bch2_version_to_text(out, bcachefs_metadata_version_min); + prt_str(out, ", max "); + bch2_version_to_text(out, bcachefs_metadata_version_current); + prt_str(out, ")"); + return -BCH_ERR_invalid_sb_version; + } + + if (version_min > version) { + prt_str(out, "Bad minimum version "); + bch2_version_to_text(out, version_min); + prt_str(out, ", greater than version field "); + bch2_version_to_text(out, version); + return -BCH_ERR_invalid_sb_version; + } + + return 0; +} + +static int bch2_sb_validate(struct bch_sb_handle *disk_sb, struct printbuf *out, + int rw) +{ + struct bch_sb *sb = disk_sb->sb; + struct bch_sb_field *f; + struct bch_sb_field_members_v1 *mi; + enum bch_opt_id opt_id; + u16 block_size; + int ret; + + ret = bch2_sb_compatible(sb, out); + if (ret) + return ret; + + if (sb->features[1] || + (le64_to_cpu(sb->features[0]) & (~0ULL << BCH_FEATURE_NR))) { + prt_printf(out, "Filesystem has incompatible features"); + return -BCH_ERR_invalid_sb_features; + } + + block_size = le16_to_cpu(sb->block_size); + + if (block_size > PAGE_SECTORS) { + prt_printf(out, "Block size too big (got %u, max %u)", + block_size, PAGE_SECTORS); + return -BCH_ERR_invalid_sb_block_size; + } + + if (bch2_is_zero(sb->user_uuid.b, sizeof(sb->user_uuid))) { + prt_printf(out, "Bad user UUID (got zeroes)"); + return -BCH_ERR_invalid_sb_uuid; + } + + if (bch2_is_zero(sb->uuid.b, sizeof(sb->uuid))) { + prt_printf(out, "Bad internal UUID (got zeroes)"); + return -BCH_ERR_invalid_sb_uuid; + } + + if (!sb->nr_devices || + sb->nr_devices > BCH_SB_MEMBERS_MAX) { + prt_printf(out, "Bad number of member devices %u (max %u)", + sb->nr_devices, BCH_SB_MEMBERS_MAX); + return -BCH_ERR_invalid_sb_too_many_members; + } + + if (sb->dev_idx >= sb->nr_devices) { + prt_printf(out, "Bad dev_idx (got %u, nr_devices %u)", + sb->dev_idx, sb->nr_devices); + return -BCH_ERR_invalid_sb_dev_idx; + } + + if (!sb->time_precision || + le32_to_cpu(sb->time_precision) > NSEC_PER_SEC) { + prt_printf(out, "Invalid time precision: %u (min 1, max %lu)", + le32_to_cpu(sb->time_precision), NSEC_PER_SEC); + return -BCH_ERR_invalid_sb_time_precision; + } + + if (rw == READ) { + /* + * Been seeing a bug where these are getting inexplicably + * zeroed, so we're now validating them, but we have to be + * careful not to preven people's filesystems from mounting: + */ + if (!BCH_SB_JOURNAL_FLUSH_DELAY(sb)) + SET_BCH_SB_JOURNAL_FLUSH_DELAY(sb, 1000); + if (!BCH_SB_JOURNAL_RECLAIM_DELAY(sb)) + SET_BCH_SB_JOURNAL_RECLAIM_DELAY(sb, 1000); + + if (!BCH_SB_VERSION_UPGRADE_COMPLETE(sb)) + SET_BCH_SB_VERSION_UPGRADE_COMPLETE(sb, le16_to_cpu(sb->version)); + } + + for (opt_id = 0; opt_id < bch2_opts_nr; opt_id++) { + const struct bch_option *opt = bch2_opt_table + opt_id; + + if (opt->get_sb != BCH2_NO_SB_OPT) { + u64 v = bch2_opt_from_sb(sb, opt_id); + + prt_printf(out, "Invalid option "); + ret = bch2_opt_validate(opt, v, out); + if (ret) + return ret; + + printbuf_reset(out); + } + } + + /* validate layout */ + ret = validate_sb_layout(&sb->layout, out); + if (ret) + return ret; + + vstruct_for_each(sb, f) { + if (!f->u64s) { + prt_printf(out, "Invalid superblock: optional field with size 0 (type %u)", + le32_to_cpu(f->type)); + return -BCH_ERR_invalid_sb_field_size; + } + + if (vstruct_next(f) > vstruct_last(sb)) { + prt_printf(out, "Invalid superblock: optional field extends past end of superblock (type %u)", + le32_to_cpu(f->type)); + return -BCH_ERR_invalid_sb_field_size; + } + } + + /* members must be validated first: */ + mi = bch2_sb_field_get(sb, members_v1); + if (!mi) { + prt_printf(out, "Invalid superblock: member info area missing"); + return -BCH_ERR_invalid_sb_members_missing; + } + + ret = bch2_sb_field_validate(sb, &mi->field, out); + if (ret) + return ret; + + vstruct_for_each(sb, f) { + if (le32_to_cpu(f->type) == BCH_SB_FIELD_members_v1) + continue; + + ret = bch2_sb_field_validate(sb, f, out); + if (ret) + return ret; + } + + return 0; +} + +/* device open: */ + +static void bch2_sb_update(struct bch_fs *c) +{ + struct bch_sb *src = c->disk_sb.sb; + struct bch_dev *ca; + unsigned i; + + lockdep_assert_held(&c->sb_lock); + + c->sb.uuid = src->uuid; + c->sb.user_uuid = src->user_uuid; + c->sb.version = le16_to_cpu(src->version); + c->sb.version_min = le16_to_cpu(src->version_min); + c->sb.version_upgrade_complete = BCH_SB_VERSION_UPGRADE_COMPLETE(src); + c->sb.nr_devices = src->nr_devices; + c->sb.clean = BCH_SB_CLEAN(src); + c->sb.encryption_type = BCH_SB_ENCRYPTION_TYPE(src); + + c->sb.nsec_per_time_unit = le32_to_cpu(src->time_precision); + c->sb.time_units_per_sec = NSEC_PER_SEC / c->sb.nsec_per_time_unit; + + /* XXX this is wrong, we need a 96 or 128 bit integer type */ + c->sb.time_base_lo = div_u64(le64_to_cpu(src->time_base_lo), + c->sb.nsec_per_time_unit); + c->sb.time_base_hi = le32_to_cpu(src->time_base_hi); + + c->sb.features = le64_to_cpu(src->features[0]); + c->sb.compat = le64_to_cpu(src->compat[0]); + + for_each_member_device(ca, c, i) { + struct bch_member m = bch2_sb_member_get(src, i); + ca->mi = bch2_mi_to_cpu(&m); + } +} + +static int __copy_super(struct bch_sb_handle *dst_handle, struct bch_sb *src) +{ + struct bch_sb_field *src_f, *dst_f; + struct bch_sb *dst = dst_handle->sb; + unsigned i; + + dst->version = src->version; + dst->version_min = src->version_min; + dst->seq = src->seq; + dst->uuid = src->uuid; + dst->user_uuid = src->user_uuid; + memcpy(dst->label, src->label, sizeof(dst->label)); + + dst->block_size = src->block_size; + dst->nr_devices = src->nr_devices; + + dst->time_base_lo = src->time_base_lo; + dst->time_base_hi = src->time_base_hi; + dst->time_precision = src->time_precision; + + memcpy(dst->flags, src->flags, sizeof(dst->flags)); + memcpy(dst->features, src->features, sizeof(dst->features)); + memcpy(dst->compat, src->compat, sizeof(dst->compat)); + + for (i = 0; i < BCH_SB_FIELD_NR; i++) { + int d; + + if ((1U << i) & BCH_SINGLE_DEVICE_SB_FIELDS) + continue; + + src_f = bch2_sb_field_get_id(src, i); + dst_f = bch2_sb_field_get_id(dst, i); + + d = (src_f ? le32_to_cpu(src_f->u64s) : 0) - + (dst_f ? le32_to_cpu(dst_f->u64s) : 0); + if (d > 0) { + int ret = bch2_sb_realloc(dst_handle, + le32_to_cpu(dst_handle->sb->u64s) + d); + + if (ret) + return ret; + + dst = dst_handle->sb; + dst_f = bch2_sb_field_get_id(dst, i); + } + + dst_f = __bch2_sb_field_resize(dst_handle, dst_f, + src_f ? le32_to_cpu(src_f->u64s) : 0); + + if (src_f) + memcpy(dst_f, src_f, vstruct_bytes(src_f)); + } + + return 0; +} + +int bch2_sb_to_fs(struct bch_fs *c, struct bch_sb *src) +{ + int ret; + + lockdep_assert_held(&c->sb_lock); + + ret = bch2_sb_realloc(&c->disk_sb, 0) ?: + __copy_super(&c->disk_sb, src) ?: + bch2_sb_replicas_to_cpu_replicas(c) ?: + bch2_sb_disk_groups_to_cpu(c); + if (ret) + return ret; + + bch2_sb_update(c); + return 0; +} + +int bch2_sb_from_fs(struct bch_fs *c, struct bch_dev *ca) +{ + return __copy_super(&ca->disk_sb, c->disk_sb.sb); +} + +/* read superblock: */ + +static int read_one_super(struct bch_sb_handle *sb, u64 offset, struct printbuf *err) +{ + struct bch_csum csum; + size_t bytes; + int ret; +reread: + bio_reset(sb->bio, sb->bdev, REQ_OP_READ|REQ_SYNC|REQ_META); + sb->bio->bi_iter.bi_sector = offset; + bch2_bio_map(sb->bio, sb->sb, sb->buffer_size); + + ret = submit_bio_wait(sb->bio); + if (ret) { + prt_printf(err, "IO error: %i", ret); + return ret; + } + + if (!uuid_equal(&sb->sb->magic, &BCACHE_MAGIC) && + !uuid_equal(&sb->sb->magic, &BCHFS_MAGIC)) { + prt_printf(err, "Not a bcachefs superblock"); + return -BCH_ERR_invalid_sb_magic; + } + + ret = bch2_sb_compatible(sb->sb, err); + if (ret) + return ret; + + bytes = vstruct_bytes(sb->sb); + + if (bytes > 512 << sb->sb->layout.sb_max_size_bits) { + prt_printf(err, "Invalid superblock: too big (got %zu bytes, layout max %lu)", + bytes, 512UL << sb->sb->layout.sb_max_size_bits); + return -BCH_ERR_invalid_sb_too_big; + } + + if (bytes > sb->buffer_size) { + ret = bch2_sb_realloc(sb, le32_to_cpu(sb->sb->u64s)); + if (ret) + return ret; + goto reread; + } + + if (BCH_SB_CSUM_TYPE(sb->sb) >= BCH_CSUM_NR) { + prt_printf(err, "unknown checksum type %llu", BCH_SB_CSUM_TYPE(sb->sb)); + return -BCH_ERR_invalid_sb_csum_type; + } + + /* XXX: verify MACs */ + csum = csum_vstruct(NULL, BCH_SB_CSUM_TYPE(sb->sb), + null_nonce(), sb->sb); + + if (bch2_crc_cmp(csum, sb->sb->csum)) { + prt_printf(err, "bad checksum"); + return -BCH_ERR_invalid_sb_csum; + } + + sb->seq = le64_to_cpu(sb->sb->seq); + + return 0; +} + +int bch2_read_super(const char *path, struct bch_opts *opts, + struct bch_sb_handle *sb) +{ + u64 offset = opt_get(*opts, sb); + struct bch_sb_layout layout; + struct printbuf err = PRINTBUF; + __le64 *i; + int ret; +#ifndef __KERNEL__ +retry: +#endif + memset(sb, 0, sizeof(*sb)); + sb->mode = BLK_OPEN_READ; + sb->have_bio = true; + sb->holder = kmalloc(1, GFP_KERNEL); + if (!sb->holder) + return -ENOMEM; + + sb->sb_name = kstrdup(path, GFP_KERNEL); + if (!sb->sb_name) + return -ENOMEM; + +#ifndef __KERNEL__ + if (opt_get(*opts, direct_io) == false) + sb->mode |= BLK_OPEN_BUFFERED; +#endif + + if (!opt_get(*opts, noexcl)) + sb->mode |= BLK_OPEN_EXCL; + + if (!opt_get(*opts, nochanges)) + sb->mode |= BLK_OPEN_WRITE; + + sb->bdev = blkdev_get_by_path(path, sb->mode, sb->holder, &bch2_sb_handle_bdev_ops); + if (IS_ERR(sb->bdev) && + PTR_ERR(sb->bdev) == -EACCES && + opt_get(*opts, read_only)) { + sb->mode &= ~BLK_OPEN_WRITE; + + sb->bdev = blkdev_get_by_path(path, sb->mode, sb->holder, &bch2_sb_handle_bdev_ops); + if (!IS_ERR(sb->bdev)) + opt_set(*opts, nochanges, true); + } + + if (IS_ERR(sb->bdev)) { + ret = PTR_ERR(sb->bdev); + goto out; + } + + ret = bch2_sb_realloc(sb, 0); + if (ret) { + prt_printf(&err, "error allocating memory for superblock"); + goto err; + } + + if (bch2_fs_init_fault("read_super")) { + prt_printf(&err, "dynamic fault"); + ret = -EFAULT; + goto err; + } + + ret = read_one_super(sb, offset, &err); + if (!ret) + goto got_super; + + if (opt_defined(*opts, sb)) + goto err; + + printk(KERN_ERR "bcachefs (%s): error reading default superblock: %s\n", + path, err.buf); + printbuf_reset(&err); + + /* + * Error reading primary superblock - read location of backup + * superblocks: + */ + bio_reset(sb->bio, sb->bdev, REQ_OP_READ|REQ_SYNC|REQ_META); + sb->bio->bi_iter.bi_sector = BCH_SB_LAYOUT_SECTOR; + /* + * use sb buffer to read layout, since sb buffer is page aligned but + * layout won't be: + */ + bch2_bio_map(sb->bio, sb->sb, sizeof(struct bch_sb_layout)); + + ret = submit_bio_wait(sb->bio); + if (ret) { + prt_printf(&err, "IO error: %i", ret); + goto err; + } + + memcpy(&layout, sb->sb, sizeof(layout)); + ret = validate_sb_layout(&layout, &err); + if (ret) + goto err; + + for (i = layout.sb_offset; + i < layout.sb_offset + layout.nr_superblocks; i++) { + offset = le64_to_cpu(*i); + + if (offset == opt_get(*opts, sb)) + continue; + + ret = read_one_super(sb, offset, &err); + if (!ret) + goto got_super; + } + + goto err; + +got_super: + if (le16_to_cpu(sb->sb->block_size) << 9 < + bdev_logical_block_size(sb->bdev) && + opt_get(*opts, direct_io)) { +#ifndef __KERNEL__ + opt_set(*opts, direct_io, false); + bch2_free_super(sb); + goto retry; +#endif + prt_printf(&err, "block size (%u) smaller than device block size (%u)", + le16_to_cpu(sb->sb->block_size) << 9, + bdev_logical_block_size(sb->bdev)); + ret = -BCH_ERR_block_size_too_small; + goto err; + } + + ret = 0; + sb->have_layout = true; + + ret = bch2_sb_validate(sb, &err, READ); + if (ret) { + printk(KERN_ERR "bcachefs (%s): error validating superblock: %s\n", + path, err.buf); + goto err_no_print; + } +out: + printbuf_exit(&err); + return ret; +err: + printk(KERN_ERR "bcachefs (%s): error reading superblock: %s\n", + path, err.buf); +err_no_print: + bch2_free_super(sb); + goto out; +} + +/* write superblock: */ + +static void write_super_endio(struct bio *bio) +{ + struct bch_dev *ca = bio->bi_private; + + /* XXX: return errors directly */ + + if (bch2_dev_io_err_on(bio->bi_status, ca, + bio_data_dir(bio) + ? BCH_MEMBER_ERROR_write + : BCH_MEMBER_ERROR_read, + "superblock %s error: %s", + bio_data_dir(bio) ? "write" : "read", + bch2_blk_status_to_str(bio->bi_status))) + ca->sb_write_error = 1; + + closure_put(&ca->fs->sb_write); + percpu_ref_put(&ca->io_ref); +} + +static void read_back_super(struct bch_fs *c, struct bch_dev *ca) +{ + struct bch_sb *sb = ca->disk_sb.sb; + struct bio *bio = ca->disk_sb.bio; + + bio_reset(bio, ca->disk_sb.bdev, REQ_OP_READ|REQ_SYNC|REQ_META); + bio->bi_iter.bi_sector = le64_to_cpu(sb->layout.sb_offset[0]); + bio->bi_end_io = write_super_endio; + bio->bi_private = ca; + bch2_bio_map(bio, ca->sb_read_scratch, PAGE_SIZE); + + this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_sb], + bio_sectors(bio)); + + percpu_ref_get(&ca->io_ref); + closure_bio_submit(bio, &c->sb_write); +} + +static void write_one_super(struct bch_fs *c, struct bch_dev *ca, unsigned idx) +{ + struct bch_sb *sb = ca->disk_sb.sb; + struct bio *bio = ca->disk_sb.bio; + + sb->offset = sb->layout.sb_offset[idx]; + + SET_BCH_SB_CSUM_TYPE(sb, bch2_csum_opt_to_type(c->opts.metadata_checksum, false)); + sb->csum = csum_vstruct(c, BCH_SB_CSUM_TYPE(sb), + null_nonce(), sb); + + bio_reset(bio, ca->disk_sb.bdev, REQ_OP_WRITE|REQ_SYNC|REQ_META); + bio->bi_iter.bi_sector = le64_to_cpu(sb->offset); + bio->bi_end_io = write_super_endio; + bio->bi_private = ca; + bch2_bio_map(bio, sb, + roundup((size_t) vstruct_bytes(sb), + bdev_logical_block_size(ca->disk_sb.bdev))); + + this_cpu_add(ca->io_done->sectors[WRITE][BCH_DATA_sb], + bio_sectors(bio)); + + percpu_ref_get(&ca->io_ref); + closure_bio_submit(bio, &c->sb_write); +} + +int bch2_write_super(struct bch_fs *c) +{ + struct closure *cl = &c->sb_write; + struct bch_dev *ca; + struct printbuf err = PRINTBUF; + unsigned i, sb = 0, nr_wrote; + struct bch_devs_mask sb_written; + bool wrote, can_mount_without_written, can_mount_with_written; + unsigned degraded_flags = BCH_FORCE_IF_DEGRADED; + int ret = 0; + + trace_and_count(c, write_super, c, _RET_IP_); + + if (c->opts.very_degraded) + degraded_flags |= BCH_FORCE_IF_LOST; + + lockdep_assert_held(&c->sb_lock); + + closure_init_stack(cl); + memset(&sb_written, 0, sizeof(sb_written)); + + /* Make sure we're using the new magic numbers: */ + c->disk_sb.sb->magic = BCHFS_MAGIC; + c->disk_sb.sb->layout.magic = BCHFS_MAGIC; + + le64_add_cpu(&c->disk_sb.sb->seq, 1); + + if (test_bit(BCH_FS_ERROR, &c->flags)) + SET_BCH_SB_HAS_ERRORS(c->disk_sb.sb, 1); + if (test_bit(BCH_FS_TOPOLOGY_ERROR, &c->flags)) + SET_BCH_SB_HAS_TOPOLOGY_ERRORS(c->disk_sb.sb, 1); + + SET_BCH_SB_BIG_ENDIAN(c->disk_sb.sb, CPU_BIG_ENDIAN); + + bch2_sb_counters_from_cpu(c); + bch2_sb_members_from_cpu(c); + bch2_sb_members_cpy_v2_v1(&c->disk_sb); + bch2_sb_errors_from_cpu(c); + + for_each_online_member(ca, c, i) + bch2_sb_from_fs(c, ca); + + for_each_online_member(ca, c, i) { + printbuf_reset(&err); + + ret = bch2_sb_validate(&ca->disk_sb, &err, WRITE); + if (ret) { + bch2_fs_inconsistent(c, "sb invalid before write: %s", err.buf); + percpu_ref_put(&ca->io_ref); + goto out; + } + } + + if (c->opts.nochanges) + goto out; + + /* + * Defer writing the superblock until filesystem initialization is + * complete - don't write out a partly initialized superblock: + */ + if (!BCH_SB_INITIALIZED(c->disk_sb.sb)) + goto out; + + for_each_online_member(ca, c, i) { + __set_bit(ca->dev_idx, sb_written.d); + ca->sb_write_error = 0; + } + + for_each_online_member(ca, c, i) + read_back_super(c, ca); + closure_sync(cl); + + for_each_online_member(ca, c, i) { + if (ca->sb_write_error) + continue; + + if (le64_to_cpu(ca->sb_read_scratch->seq) < ca->disk_sb.seq) { + bch2_fs_fatal_error(c, + "Superblock write was silently dropped! (seq %llu expected %llu)", + le64_to_cpu(ca->sb_read_scratch->seq), + ca->disk_sb.seq); + percpu_ref_put(&ca->io_ref); + ret = -BCH_ERR_erofs_sb_err; + goto out; + } + + if (le64_to_cpu(ca->sb_read_scratch->seq) > ca->disk_sb.seq) { + bch2_fs_fatal_error(c, + "Superblock modified by another process (seq %llu expected %llu)", + le64_to_cpu(ca->sb_read_scratch->seq), + ca->disk_sb.seq); + percpu_ref_put(&ca->io_ref); + ret = -BCH_ERR_erofs_sb_err; + goto out; + } + } + + do { + wrote = false; + for_each_online_member(ca, c, i) + if (!ca->sb_write_error && + sb < ca->disk_sb.sb->layout.nr_superblocks) { + write_one_super(c, ca, sb); + wrote = true; + } + closure_sync(cl); + sb++; + } while (wrote); + + for_each_online_member(ca, c, i) { + if (ca->sb_write_error) + __clear_bit(ca->dev_idx, sb_written.d); + else + ca->disk_sb.seq = le64_to_cpu(ca->disk_sb.sb->seq); + } + + nr_wrote = dev_mask_nr(&sb_written); + + can_mount_with_written = + bch2_have_enough_devs(c, sb_written, degraded_flags, false); + + for (i = 0; i < ARRAY_SIZE(sb_written.d); i++) + sb_written.d[i] = ~sb_written.d[i]; + + can_mount_without_written = + bch2_have_enough_devs(c, sb_written, degraded_flags, false); + + /* + * If we would be able to mount _without_ the devices we successfully + * wrote superblocks to, we weren't able to write to enough devices: + * + * Exception: if we can mount without the successes because we haven't + * written anything (new filesystem), we continue if we'd be able to + * mount with the devices we did successfully write to: + */ + if (bch2_fs_fatal_err_on(!nr_wrote || + !can_mount_with_written || + (can_mount_without_written && + !can_mount_with_written), c, + "Unable to write superblock to sufficient devices (from %ps)", + (void *) _RET_IP_)) + ret = -1; +out: + /* Make new options visible after they're persistent: */ + bch2_sb_update(c); + printbuf_exit(&err); + return ret; +} + +void __bch2_check_set_feature(struct bch_fs *c, unsigned feat) +{ + mutex_lock(&c->sb_lock); + if (!(c->sb.features & (1ULL << feat))) { + c->disk_sb.sb->features[0] |= cpu_to_le64(1ULL << feat); + + bch2_write_super(c); + } + mutex_unlock(&c->sb_lock); +} + +/* Downgrade if superblock is at a higher version than currently supported: */ +void bch2_sb_maybe_downgrade(struct bch_fs *c) +{ + lockdep_assert_held(&c->sb_lock); + + /* + * Downgrade, if superblock is at a higher version than currently + * supported: + */ + if (BCH_SB_VERSION_UPGRADE_COMPLETE(c->disk_sb.sb) > bcachefs_metadata_version_current) + SET_BCH_SB_VERSION_UPGRADE_COMPLETE(c->disk_sb.sb, bcachefs_metadata_version_current); + if (c->sb.version > bcachefs_metadata_version_current) + c->disk_sb.sb->version = cpu_to_le16(bcachefs_metadata_version_current); + if (c->sb.version_min > bcachefs_metadata_version_current) + c->disk_sb.sb->version_min = cpu_to_le16(bcachefs_metadata_version_current); + c->disk_sb.sb->compat[0] &= cpu_to_le64((1ULL << BCH_COMPAT_NR) - 1); +} + +void bch2_sb_upgrade(struct bch_fs *c, unsigned new_version) +{ + lockdep_assert_held(&c->sb_lock); + + c->disk_sb.sb->version = cpu_to_le16(new_version); + c->disk_sb.sb->features[0] |= cpu_to_le64(BCH_SB_FEATURES_ALL); +} + +static const struct bch_sb_field_ops *bch2_sb_field_ops[] = { +#define x(f, nr) \ + [BCH_SB_FIELD_##f] = &bch_sb_field_ops_##f, + BCH_SB_FIELDS() +#undef x +}; + +static const struct bch_sb_field_ops bch2_sb_field_null_ops; + +static const struct bch_sb_field_ops *bch2_sb_field_type_ops(unsigned type) +{ + return likely(type < ARRAY_SIZE(bch2_sb_field_ops)) + ? bch2_sb_field_ops[type] + : &bch2_sb_field_null_ops; +} + +static int bch2_sb_field_validate(struct bch_sb *sb, struct bch_sb_field *f, + struct printbuf *err) +{ + unsigned type = le32_to_cpu(f->type); + struct printbuf field_err = PRINTBUF; + const struct bch_sb_field_ops *ops = bch2_sb_field_type_ops(type); + int ret; + + ret = ops->validate ? ops->validate(sb, f, &field_err) : 0; + if (ret) { + prt_printf(err, "Invalid superblock section %s: %s", + bch2_sb_fields[type], field_err.buf); + prt_newline(err); + bch2_sb_field_to_text(err, sb, f); + } + + printbuf_exit(&field_err); + return ret; +} + +void bch2_sb_field_to_text(struct printbuf *out, struct bch_sb *sb, + struct bch_sb_field *f) +{ + unsigned type = le32_to_cpu(f->type); + const struct bch_sb_field_ops *ops = bch2_sb_field_type_ops(type); + + if (!out->nr_tabstops) + printbuf_tabstop_push(out, 32); + + if (type < BCH_SB_FIELD_NR) + prt_printf(out, "%s", bch2_sb_fields[type]); + else + prt_printf(out, "(unknown field %u)", type); + + prt_printf(out, " (size %zu):", vstruct_bytes(f)); + prt_newline(out); + + if (ops->to_text) { + printbuf_indent_add(out, 2); + ops->to_text(out, sb, f); + printbuf_indent_sub(out, 2); + } +} + +void bch2_sb_layout_to_text(struct printbuf *out, struct bch_sb_layout *l) +{ + unsigned i; + + prt_printf(out, "Type: %u", l->layout_type); + prt_newline(out); + + prt_str(out, "Superblock max size: "); + prt_units_u64(out, 512 << l->sb_max_size_bits); + prt_newline(out); + + prt_printf(out, "Nr superblocks: %u", l->nr_superblocks); + prt_newline(out); + + prt_str(out, "Offsets: "); + for (i = 0; i < l->nr_superblocks; i++) { + if (i) + prt_str(out, ", "); + prt_printf(out, "%llu", le64_to_cpu(l->sb_offset[i])); + } + prt_newline(out); +} + +void bch2_sb_to_text(struct printbuf *out, struct bch_sb *sb, + bool print_layout, unsigned fields) +{ + struct bch_sb_field *f; + u64 fields_have = 0; + unsigned nr_devices = 0; + + if (!out->nr_tabstops) + printbuf_tabstop_push(out, 44); + + for (int i = 0; i < sb->nr_devices; i++) + nr_devices += bch2_dev_exists(sb, i); + + prt_printf(out, "External UUID:"); + prt_tab(out); + pr_uuid(out, sb->user_uuid.b); + prt_newline(out); + + prt_printf(out, "Internal UUID:"); + prt_tab(out); + pr_uuid(out, sb->uuid.b); + prt_newline(out); + + prt_str(out, "Device index:"); + prt_tab(out); + prt_printf(out, "%u", sb->dev_idx); + prt_newline(out); + + prt_str(out, "Label:"); + prt_tab(out); + prt_printf(out, "%.*s", (int) sizeof(sb->label), sb->label); + prt_newline(out); + + prt_str(out, "Version:"); + prt_tab(out); + bch2_version_to_text(out, le16_to_cpu(sb->version)); + prt_newline(out); + + prt_str(out, "Version upgrade complete:"); + prt_tab(out); + bch2_version_to_text(out, BCH_SB_VERSION_UPGRADE_COMPLETE(sb)); + prt_newline(out); + + prt_printf(out, "Oldest version on disk:"); + prt_tab(out); + bch2_version_to_text(out, le16_to_cpu(sb->version_min)); + prt_newline(out); + + prt_printf(out, "Created:"); + prt_tab(out); + if (sb->time_base_lo) + bch2_prt_datetime(out, div_u64(le64_to_cpu(sb->time_base_lo), NSEC_PER_SEC)); + else + prt_printf(out, "(not set)"); + prt_newline(out); + + prt_printf(out, "Sequence number:"); + prt_tab(out); + prt_printf(out, "%llu", le64_to_cpu(sb->seq)); + prt_newline(out); + + prt_printf(out, "Superblock size:"); + prt_tab(out); + prt_printf(out, "%zu", vstruct_bytes(sb)); + prt_newline(out); + + prt_printf(out, "Clean:"); + prt_tab(out); + prt_printf(out, "%llu", BCH_SB_CLEAN(sb)); + prt_newline(out); + + prt_printf(out, "Devices:"); + prt_tab(out); + prt_printf(out, "%u", nr_devices); + prt_newline(out); + + prt_printf(out, "Sections:"); + vstruct_for_each(sb, f) + fields_have |= 1 << le32_to_cpu(f->type); + prt_tab(out); + prt_bitflags(out, bch2_sb_fields, fields_have); + prt_newline(out); + + prt_printf(out, "Features:"); + prt_tab(out); + prt_bitflags(out, bch2_sb_features, le64_to_cpu(sb->features[0])); + prt_newline(out); + + prt_printf(out, "Compat features:"); + prt_tab(out); + prt_bitflags(out, bch2_sb_compat, le64_to_cpu(sb->compat[0])); + prt_newline(out); + + prt_newline(out); + prt_printf(out, "Options:"); + prt_newline(out); + printbuf_indent_add(out, 2); + { + enum bch_opt_id id; + + for (id = 0; id < bch2_opts_nr; id++) { + const struct bch_option *opt = bch2_opt_table + id; + + if (opt->get_sb != BCH2_NO_SB_OPT) { + u64 v = bch2_opt_from_sb(sb, id); + + prt_printf(out, "%s:", opt->attr.name); + prt_tab(out); + bch2_opt_to_text(out, NULL, sb, opt, v, + OPT_HUMAN_READABLE|OPT_SHOW_FULL_LIST); + prt_newline(out); + } + } + } + + printbuf_indent_sub(out, 2); + + if (print_layout) { + prt_newline(out); + prt_printf(out, "layout:"); + prt_newline(out); + printbuf_indent_add(out, 2); + bch2_sb_layout_to_text(out, &sb->layout); + printbuf_indent_sub(out, 2); + } + + vstruct_for_each(sb, f) + if (fields & (1 << le32_to_cpu(f->type))) { + prt_newline(out); + bch2_sb_field_to_text(out, sb, f); + } +} diff --git a/fs/bcachefs/super-io.h b/fs/bcachefs/super-io.h new file mode 100644 index 000000000000..f5abd102bff7 --- /dev/null +++ b/fs/bcachefs/super-io.h @@ -0,0 +1,94 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_SUPER_IO_H +#define _BCACHEFS_SUPER_IO_H + +#include "extents.h" +#include "eytzinger.h" +#include "super_types.h" +#include "super.h" +#include "sb-members.h" + +#include <asm/byteorder.h> + +static inline bool bch2_version_compatible(u16 version) +{ + return BCH_VERSION_MAJOR(version) <= BCH_VERSION_MAJOR(bcachefs_metadata_version_current) && + version >= bcachefs_metadata_version_min; +} + +void bch2_version_to_text(struct printbuf *, unsigned); +unsigned bch2_latest_compatible_version(unsigned); + +u64 bch2_upgrade_recovery_passes(struct bch_fs *c, + unsigned, + unsigned); + +static inline size_t bch2_sb_field_bytes(struct bch_sb_field *f) +{ + return le32_to_cpu(f->u64s) * sizeof(u64); +} + +#define field_to_type(_f, _name) \ + container_of_or_null(_f, struct bch_sb_field_##_name, field) + +struct bch_sb_field *bch2_sb_field_get_id(struct bch_sb *, enum bch_sb_field_type); +#define bch2_sb_field_get(_sb, _name) \ + field_to_type(bch2_sb_field_get_id(_sb, BCH_SB_FIELD_##_name), _name) + +struct bch_sb_field *bch2_sb_field_resize_id(struct bch_sb_handle *, + enum bch_sb_field_type, unsigned); +#define bch2_sb_field_resize(_sb, _name, _u64s) \ + field_to_type(bch2_sb_field_resize_id(_sb, BCH_SB_FIELD_##_name, _u64s), _name) + +void bch2_sb_field_delete(struct bch_sb_handle *, enum bch_sb_field_type); + +extern const char * const bch2_sb_fields[]; + +struct bch_sb_field_ops { + int (*validate)(struct bch_sb *, struct bch_sb_field *, struct printbuf *); + void (*to_text)(struct printbuf *, struct bch_sb *, struct bch_sb_field *); +}; + +static inline __le64 bch2_sb_magic(struct bch_fs *c) +{ + __le64 ret; + + memcpy(&ret, &c->sb.uuid, sizeof(ret)); + return ret; +} + +static inline __u64 jset_magic(struct bch_fs *c) +{ + return __le64_to_cpu(bch2_sb_magic(c) ^ JSET_MAGIC); +} + +static inline __u64 bset_magic(struct bch_fs *c) +{ + return __le64_to_cpu(bch2_sb_magic(c) ^ BSET_MAGIC); +} + +int bch2_sb_to_fs(struct bch_fs *, struct bch_sb *); +int bch2_sb_from_fs(struct bch_fs *, struct bch_dev *); + +void bch2_free_super(struct bch_sb_handle *); +int bch2_sb_realloc(struct bch_sb_handle *, unsigned); + +int bch2_read_super(const char *, struct bch_opts *, struct bch_sb_handle *); +int bch2_write_super(struct bch_fs *); +void __bch2_check_set_feature(struct bch_fs *, unsigned); + +static inline void bch2_check_set_feature(struct bch_fs *c, unsigned feat) +{ + if (!(c->sb.features & (1ULL << feat))) + __bch2_check_set_feature(c, feat); +} + +void bch2_sb_maybe_downgrade(struct bch_fs *); +void bch2_sb_upgrade(struct bch_fs *, unsigned); + +void bch2_sb_field_to_text(struct printbuf *, struct bch_sb *, + struct bch_sb_field *); +void bch2_sb_layout_to_text(struct printbuf *, struct bch_sb_layout *); +void bch2_sb_to_text(struct printbuf *, struct bch_sb *, bool, unsigned); + +#endif /* _BCACHEFS_SUPER_IO_H */ diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c new file mode 100644 index 000000000000..818ec467a06b --- /dev/null +++ b/fs/bcachefs/super.c @@ -0,0 +1,2030 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * bcachefs setup/teardown code, and some metadata io - read a superblock and + * figure out what to do with it. + * + * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com> + * Copyright 2012 Google, Inc. + */ + +#include "bcachefs.h" +#include "alloc_background.h" +#include "alloc_foreground.h" +#include "bkey_sort.h" +#include "btree_cache.h" +#include "btree_gc.h" +#include "btree_journal_iter.h" +#include "btree_key_cache.h" +#include "btree_update_interior.h" +#include "btree_io.h" +#include "btree_write_buffer.h" +#include "buckets_waiting_for_journal.h" +#include "chardev.h" +#include "checksum.h" +#include "clock.h" +#include "compress.h" +#include "counters.h" +#include "debug.h" +#include "disk_groups.h" +#include "ec.h" +#include "errcode.h" +#include "error.h" +#include "fs.h" +#include "fs-io.h" +#include "fs-io-buffered.h" +#include "fs-io-direct.h" +#include "fsck.h" +#include "inode.h" +#include "io_read.h" +#include "io_write.h" +#include "journal.h" +#include "journal_reclaim.h" +#include "journal_seq_blacklist.h" +#include "move.h" +#include "migrate.h" +#include "movinggc.h" +#include "nocow_locking.h" +#include "quota.h" +#include "rebalance.h" +#include "recovery.h" +#include "replicas.h" +#include "sb-clean.h" +#include "sb-errors.h" +#include "sb-members.h" +#include "snapshot.h" +#include "subvolume.h" +#include "super.h" +#include "super-io.h" +#include "sysfs.h" +#include "trace.h" + +#include <linux/backing-dev.h> +#include <linux/blkdev.h> +#include <linux/debugfs.h> +#include <linux/device.h> +#include <linux/idr.h> +#include <linux/module.h> +#include <linux/percpu.h> +#include <linux/random.h> +#include <linux/sysfs.h> +#include <crypto/hash.h> + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Kent Overstreet <kent.overstreet@gmail.com>"); +MODULE_DESCRIPTION("bcachefs filesystem"); +MODULE_SOFTDEP("pre: crc32c"); +MODULE_SOFTDEP("pre: crc64"); +MODULE_SOFTDEP("pre: sha256"); +MODULE_SOFTDEP("pre: chacha20"); +MODULE_SOFTDEP("pre: poly1305"); +MODULE_SOFTDEP("pre: xxhash"); + +#define KTYPE(type) \ +static const struct attribute_group type ## _group = { \ + .attrs = type ## _files \ +}; \ + \ +static const struct attribute_group *type ## _groups[] = { \ + &type ## _group, \ + NULL \ +}; \ + \ +static const struct kobj_type type ## _ktype = { \ + .release = type ## _release, \ + .sysfs_ops = &type ## _sysfs_ops, \ + .default_groups = type ## _groups \ +} + +static void bch2_fs_release(struct kobject *); +static void bch2_dev_release(struct kobject *); +static void bch2_fs_counters_release(struct kobject *k) +{ +} + +static void bch2_fs_internal_release(struct kobject *k) +{ +} + +static void bch2_fs_opts_dir_release(struct kobject *k) +{ +} + +static void bch2_fs_time_stats_release(struct kobject *k) +{ +} + +KTYPE(bch2_fs); +KTYPE(bch2_fs_counters); +KTYPE(bch2_fs_internal); +KTYPE(bch2_fs_opts_dir); +KTYPE(bch2_fs_time_stats); +KTYPE(bch2_dev); + +static struct kset *bcachefs_kset; +static LIST_HEAD(bch_fs_list); +static DEFINE_MUTEX(bch_fs_list_lock); + +DECLARE_WAIT_QUEUE_HEAD(bch2_read_only_wait); + +static void bch2_dev_free(struct bch_dev *); +static int bch2_dev_alloc(struct bch_fs *, unsigned); +static int bch2_dev_sysfs_online(struct bch_fs *, struct bch_dev *); +static void __bch2_dev_read_only(struct bch_fs *, struct bch_dev *); + +struct bch_fs *bch2_dev_to_fs(dev_t dev) +{ + struct bch_fs *c; + struct bch_dev *ca; + unsigned i; + + mutex_lock(&bch_fs_list_lock); + rcu_read_lock(); + + list_for_each_entry(c, &bch_fs_list, list) + for_each_member_device_rcu(ca, c, i, NULL) + if (ca->disk_sb.bdev && ca->disk_sb.bdev->bd_dev == dev) { + closure_get(&c->cl); + goto found; + } + c = NULL; +found: + rcu_read_unlock(); + mutex_unlock(&bch_fs_list_lock); + + return c; +} + +static struct bch_fs *__bch2_uuid_to_fs(__uuid_t uuid) +{ + struct bch_fs *c; + + lockdep_assert_held(&bch_fs_list_lock); + + list_for_each_entry(c, &bch_fs_list, list) + if (!memcmp(&c->disk_sb.sb->uuid, &uuid, sizeof(uuid))) + return c; + + return NULL; +} + +struct bch_fs *bch2_uuid_to_fs(__uuid_t uuid) +{ + struct bch_fs *c; + + mutex_lock(&bch_fs_list_lock); + c = __bch2_uuid_to_fs(uuid); + if (c) + closure_get(&c->cl); + mutex_unlock(&bch_fs_list_lock); + + return c; +} + +static void bch2_dev_usage_journal_reserve(struct bch_fs *c) +{ + struct bch_dev *ca; + unsigned i, nr = 0, u64s = + ((sizeof(struct jset_entry_dev_usage) + + sizeof(struct jset_entry_dev_usage_type) * BCH_DATA_NR)) / + sizeof(u64); + + rcu_read_lock(); + for_each_member_device_rcu(ca, c, i, NULL) + nr++; + rcu_read_unlock(); + + bch2_journal_entry_res_resize(&c->journal, + &c->dev_usage_journal_res, u64s * nr); +} + +/* Filesystem RO/RW: */ + +/* + * For startup/shutdown of RW stuff, the dependencies are: + * + * - foreground writes depend on copygc and rebalance (to free up space) + * + * - copygc and rebalance depend on mark and sweep gc (they actually probably + * don't because they either reserve ahead of time or don't block if + * allocations fail, but allocations can require mark and sweep gc to run + * because of generation number wraparound) + * + * - all of the above depends on the allocator threads + * + * - allocator depends on the journal (when it rewrites prios and gens) + */ + +static void __bch2_fs_read_only(struct bch_fs *c) +{ + struct bch_dev *ca; + unsigned i, clean_passes = 0; + u64 seq = 0; + + bch2_fs_ec_stop(c); + bch2_open_buckets_stop(c, NULL, true); + bch2_rebalance_stop(c); + bch2_copygc_stop(c); + bch2_gc_thread_stop(c); + bch2_fs_ec_flush(c); + + bch_verbose(c, "flushing journal and stopping allocators, journal seq %llu", + journal_cur_seq(&c->journal)); + + do { + clean_passes++; + + if (bch2_btree_interior_updates_flush(c) || + bch2_journal_flush_all_pins(&c->journal) || + bch2_btree_flush_all_writes(c) || + seq != atomic64_read(&c->journal.seq)) { + seq = atomic64_read(&c->journal.seq); + clean_passes = 0; + } + } while (clean_passes < 2); + + bch_verbose(c, "flushing journal and stopping allocators complete, journal seq %llu", + journal_cur_seq(&c->journal)); + + if (test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags) && + !test_bit(BCH_FS_EMERGENCY_RO, &c->flags)) + set_bit(BCH_FS_CLEAN_SHUTDOWN, &c->flags); + bch2_fs_journal_stop(&c->journal); + + /* + * After stopping journal: + */ + for_each_member_device(ca, c, i) + bch2_dev_allocator_remove(c, ca); +} + +#ifndef BCH_WRITE_REF_DEBUG +static void bch2_writes_disabled(struct percpu_ref *writes) +{ + struct bch_fs *c = container_of(writes, struct bch_fs, writes); + + set_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags); + wake_up(&bch2_read_only_wait); +} +#endif + +void bch2_fs_read_only(struct bch_fs *c) +{ + if (!test_bit(BCH_FS_RW, &c->flags)) { + bch2_journal_reclaim_stop(&c->journal); + return; + } + + BUG_ON(test_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags)); + + /* + * Block new foreground-end write operations from starting - any new + * writes will return -EROFS: + */ + set_bit(BCH_FS_GOING_RO, &c->flags); +#ifndef BCH_WRITE_REF_DEBUG + percpu_ref_kill(&c->writes); +#else + for (unsigned i = 0; i < BCH_WRITE_REF_NR; i++) + bch2_write_ref_put(c, i); +#endif + + /* + * If we're not doing an emergency shutdown, we want to wait on + * outstanding writes to complete so they don't see spurious errors due + * to shutting down the allocator: + * + * If we are doing an emergency shutdown outstanding writes may + * hang until we shutdown the allocator so we don't want to wait + * on outstanding writes before shutting everything down - but + * we do need to wait on them before returning and signalling + * that going RO is complete: + */ + wait_event(bch2_read_only_wait, + test_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags) || + test_bit(BCH_FS_EMERGENCY_RO, &c->flags)); + + __bch2_fs_read_only(c); + + wait_event(bch2_read_only_wait, + test_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags)); + + clear_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags); + clear_bit(BCH_FS_GOING_RO, &c->flags); + + if (!bch2_journal_error(&c->journal) && + !test_bit(BCH_FS_ERROR, &c->flags) && + !test_bit(BCH_FS_EMERGENCY_RO, &c->flags) && + test_bit(BCH_FS_STARTED, &c->flags) && + test_bit(BCH_FS_CLEAN_SHUTDOWN, &c->flags) && + !c->opts.norecovery) { + BUG_ON(c->journal.last_empty_seq != journal_cur_seq(&c->journal)); + BUG_ON(atomic_read(&c->btree_cache.dirty)); + BUG_ON(atomic_long_read(&c->btree_key_cache.nr_dirty)); + BUG_ON(c->btree_write_buffer.state.nr); + + bch_verbose(c, "marking filesystem clean"); + bch2_fs_mark_clean(c); + } + + clear_bit(BCH_FS_RW, &c->flags); +} + +static void bch2_fs_read_only_work(struct work_struct *work) +{ + struct bch_fs *c = + container_of(work, struct bch_fs, read_only_work); + + down_write(&c->state_lock); + bch2_fs_read_only(c); + up_write(&c->state_lock); +} + +static void bch2_fs_read_only_async(struct bch_fs *c) +{ + queue_work(system_long_wq, &c->read_only_work); +} + +bool bch2_fs_emergency_read_only(struct bch_fs *c) +{ + bool ret = !test_and_set_bit(BCH_FS_EMERGENCY_RO, &c->flags); + + bch2_journal_halt(&c->journal); + bch2_fs_read_only_async(c); + + wake_up(&bch2_read_only_wait); + return ret; +} + +static int bch2_fs_read_write_late(struct bch_fs *c) +{ + int ret; + + /* + * Data move operations can't run until after check_snapshots has + * completed, and bch2_snapshot_is_ancestor() is available. + * + * Ideally we'd start copygc/rebalance earlier instead of waiting for + * all of recovery/fsck to complete: + */ + ret = bch2_copygc_start(c); + if (ret) { + bch_err(c, "error starting copygc thread"); + return ret; + } + + ret = bch2_rebalance_start(c); + if (ret) { + bch_err(c, "error starting rebalance thread"); + return ret; + } + + return 0; +} + +static int __bch2_fs_read_write(struct bch_fs *c, bool early) +{ + struct bch_dev *ca; + unsigned i; + int ret; + + if (test_bit(BCH_FS_INITIAL_GC_UNFIXED, &c->flags)) { + bch_err(c, "cannot go rw, unfixed btree errors"); + return -BCH_ERR_erofs_unfixed_errors; + } + + if (test_bit(BCH_FS_RW, &c->flags)) + return 0; + + if (c->opts.norecovery) + return -BCH_ERR_erofs_norecovery; + + /* + * nochanges is used for fsck -n mode - we have to allow going rw + * during recovery for that to work: + */ + if (c->opts.nochanges && (!early || c->opts.read_only)) + return -BCH_ERR_erofs_nochanges; + + bch_info(c, "going read-write"); + + ret = bch2_sb_members_v2_init(c); + if (ret) + goto err; + + ret = bch2_fs_mark_dirty(c); + if (ret) + goto err; + + clear_bit(BCH_FS_CLEAN_SHUTDOWN, &c->flags); + + /* + * First journal write must be a flush write: after a clean shutdown we + * don't read the journal, so the first journal write may end up + * overwriting whatever was there previously, and there must always be + * at least one non-flush write in the journal or recovery will fail: + */ + set_bit(JOURNAL_NEED_FLUSH_WRITE, &c->journal.flags); + + for_each_rw_member(ca, c, i) + bch2_dev_allocator_add(c, ca); + bch2_recalc_capacity(c); + + set_bit(BCH_FS_RW, &c->flags); + set_bit(BCH_FS_WAS_RW, &c->flags); + +#ifndef BCH_WRITE_REF_DEBUG + percpu_ref_reinit(&c->writes); +#else + for (i = 0; i < BCH_WRITE_REF_NR; i++) { + BUG_ON(atomic_long_read(&c->writes[i])); + atomic_long_inc(&c->writes[i]); + } +#endif + + ret = bch2_gc_thread_start(c); + if (ret) { + bch_err(c, "error starting gc thread"); + return ret; + } + + ret = bch2_journal_reclaim_start(&c->journal); + if (ret) + goto err; + + if (!early) { + ret = bch2_fs_read_write_late(c); + if (ret) + goto err; + } + + bch2_do_discards(c); + bch2_do_invalidates(c); + bch2_do_stripe_deletes(c); + bch2_do_pending_node_rewrites(c); + return 0; +err: + if (test_bit(BCH_FS_RW, &c->flags)) + bch2_fs_read_only(c); + else + __bch2_fs_read_only(c); + return ret; +} + +int bch2_fs_read_write(struct bch_fs *c) +{ + return __bch2_fs_read_write(c, false); +} + +int bch2_fs_read_write_early(struct bch_fs *c) +{ + lockdep_assert_held(&c->state_lock); + + return __bch2_fs_read_write(c, true); +} + +/* Filesystem startup/shutdown: */ + +static void __bch2_fs_free(struct bch_fs *c) +{ + unsigned i; + + for (i = 0; i < BCH_TIME_STAT_NR; i++) + bch2_time_stats_exit(&c->times[i]); + + bch2_free_pending_node_rewrites(c); + bch2_fs_sb_errors_exit(c); + bch2_fs_counters_exit(c); + bch2_fs_snapshots_exit(c); + bch2_fs_quota_exit(c); + bch2_fs_fs_io_direct_exit(c); + bch2_fs_fs_io_buffered_exit(c); + bch2_fs_fsio_exit(c); + bch2_fs_ec_exit(c); + bch2_fs_encryption_exit(c); + bch2_fs_nocow_locking_exit(c); + bch2_fs_io_write_exit(c); + bch2_fs_io_read_exit(c); + bch2_fs_buckets_waiting_for_journal_exit(c); + bch2_fs_btree_interior_update_exit(c); + bch2_fs_btree_iter_exit(c); + bch2_fs_btree_key_cache_exit(&c->btree_key_cache); + bch2_fs_btree_cache_exit(c); + bch2_fs_replicas_exit(c); + bch2_fs_journal_exit(&c->journal); + bch2_io_clock_exit(&c->io_clock[WRITE]); + bch2_io_clock_exit(&c->io_clock[READ]); + bch2_fs_compress_exit(c); + bch2_journal_keys_put_initial(c); + BUG_ON(atomic_read(&c->journal_keys.ref)); + bch2_fs_btree_write_buffer_exit(c); + percpu_free_rwsem(&c->mark_lock); + free_percpu(c->online_reserved); + + darray_exit(&c->btree_roots_extra); + free_percpu(c->pcpu); + mempool_exit(&c->large_bkey_pool); + mempool_exit(&c->btree_bounce_pool); + bioset_exit(&c->btree_bio); + mempool_exit(&c->fill_iter); +#ifndef BCH_WRITE_REF_DEBUG + percpu_ref_exit(&c->writes); +#endif + kfree(rcu_dereference_protected(c->disk_groups, 1)); + kfree(c->journal_seq_blacklist_table); + kfree(c->unused_inode_hints); + + if (c->write_ref_wq) + destroy_workqueue(c->write_ref_wq); + if (c->io_complete_wq) + destroy_workqueue(c->io_complete_wq); + if (c->copygc_wq) + destroy_workqueue(c->copygc_wq); + if (c->btree_io_complete_wq) + destroy_workqueue(c->btree_io_complete_wq); + if (c->btree_update_wq) + destroy_workqueue(c->btree_update_wq); + + bch2_free_super(&c->disk_sb); + kvpfree(c, sizeof(*c)); + module_put(THIS_MODULE); +} + +static void bch2_fs_release(struct kobject *kobj) +{ + struct bch_fs *c = container_of(kobj, struct bch_fs, kobj); + + __bch2_fs_free(c); +} + +void __bch2_fs_stop(struct bch_fs *c) +{ + struct bch_dev *ca; + unsigned i; + + bch_verbose(c, "shutting down"); + + set_bit(BCH_FS_STOPPING, &c->flags); + + cancel_work_sync(&c->journal_seq_blacklist_gc_work); + + down_write(&c->state_lock); + bch2_fs_read_only(c); + up_write(&c->state_lock); + + for_each_member_device(ca, c, i) + if (ca->kobj.state_in_sysfs && + ca->disk_sb.bdev) + sysfs_remove_link(bdev_kobj(ca->disk_sb.bdev), "bcachefs"); + + if (c->kobj.state_in_sysfs) + kobject_del(&c->kobj); + + bch2_fs_debug_exit(c); + bch2_fs_chardev_exit(c); + + kobject_put(&c->counters_kobj); + kobject_put(&c->time_stats); + kobject_put(&c->opts_dir); + kobject_put(&c->internal); + + /* btree prefetch might have kicked off reads in the background: */ + bch2_btree_flush_all_reads(c); + + for_each_member_device(ca, c, i) + cancel_work_sync(&ca->io_error_work); + + cancel_work_sync(&c->read_only_work); +} + +void bch2_fs_free(struct bch_fs *c) +{ + unsigned i; + + mutex_lock(&bch_fs_list_lock); + list_del(&c->list); + mutex_unlock(&bch_fs_list_lock); + + closure_sync(&c->cl); + closure_debug_destroy(&c->cl); + + for (i = 0; i < c->sb.nr_devices; i++) { + struct bch_dev *ca = rcu_dereference_protected(c->devs[i], true); + + if (ca) { + bch2_free_super(&ca->disk_sb); + bch2_dev_free(ca); + } + } + + bch_verbose(c, "shutdown complete"); + + kobject_put(&c->kobj); +} + +void bch2_fs_stop(struct bch_fs *c) +{ + __bch2_fs_stop(c); + bch2_fs_free(c); +} + +static int bch2_fs_online(struct bch_fs *c) +{ + struct bch_dev *ca; + unsigned i; + int ret = 0; + + lockdep_assert_held(&bch_fs_list_lock); + + if (__bch2_uuid_to_fs(c->sb.uuid)) { + bch_err(c, "filesystem UUID already open"); + return -EINVAL; + } + + ret = bch2_fs_chardev_init(c); + if (ret) { + bch_err(c, "error creating character device"); + return ret; + } + + bch2_fs_debug_init(c); + + ret = kobject_add(&c->kobj, NULL, "%pU", c->sb.user_uuid.b) ?: + kobject_add(&c->internal, &c->kobj, "internal") ?: + kobject_add(&c->opts_dir, &c->kobj, "options") ?: + kobject_add(&c->time_stats, &c->kobj, "time_stats") ?: + kobject_add(&c->counters_kobj, &c->kobj, "counters") ?: + bch2_opts_create_sysfs_files(&c->opts_dir); + if (ret) { + bch_err(c, "error creating sysfs objects"); + return ret; + } + + down_write(&c->state_lock); + + for_each_member_device(ca, c, i) { + ret = bch2_dev_sysfs_online(c, ca); + if (ret) { + bch_err(c, "error creating sysfs objects"); + percpu_ref_put(&ca->ref); + goto err; + } + } + + BUG_ON(!list_empty(&c->list)); + list_add(&c->list, &bch_fs_list); +err: + up_write(&c->state_lock); + return ret; +} + +static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) +{ + struct bch_fs *c; + struct printbuf name = PRINTBUF; + unsigned i, iter_size; + int ret = 0; + + c = kvpmalloc(sizeof(struct bch_fs), GFP_KERNEL|__GFP_ZERO); + if (!c) { + c = ERR_PTR(-BCH_ERR_ENOMEM_fs_alloc); + goto out; + } + + __module_get(THIS_MODULE); + + closure_init(&c->cl, NULL); + + c->kobj.kset = bcachefs_kset; + kobject_init(&c->kobj, &bch2_fs_ktype); + kobject_init(&c->internal, &bch2_fs_internal_ktype); + kobject_init(&c->opts_dir, &bch2_fs_opts_dir_ktype); + kobject_init(&c->time_stats, &bch2_fs_time_stats_ktype); + kobject_init(&c->counters_kobj, &bch2_fs_counters_ktype); + + c->minor = -1; + c->disk_sb.fs_sb = true; + + init_rwsem(&c->state_lock); + mutex_init(&c->sb_lock); + mutex_init(&c->replicas_gc_lock); + mutex_init(&c->btree_root_lock); + INIT_WORK(&c->read_only_work, bch2_fs_read_only_work); + + init_rwsem(&c->gc_lock); + mutex_init(&c->gc_gens_lock); + atomic_set(&c->journal_keys.ref, 1); + c->journal_keys.initial_ref_held = true; + + for (i = 0; i < BCH_TIME_STAT_NR; i++) + bch2_time_stats_init(&c->times[i]); + + bch2_fs_copygc_init(c); + bch2_fs_btree_key_cache_init_early(&c->btree_key_cache); + bch2_fs_btree_iter_init_early(c); + bch2_fs_btree_interior_update_init_early(c); + bch2_fs_allocator_background_init(c); + bch2_fs_allocator_foreground_init(c); + bch2_fs_rebalance_init(c); + bch2_fs_quota_init(c); + bch2_fs_ec_init_early(c); + bch2_fs_move_init(c); + bch2_fs_sb_errors_init_early(c); + + INIT_LIST_HEAD(&c->list); + + mutex_init(&c->usage_scratch_lock); + + mutex_init(&c->bio_bounce_pages_lock); + mutex_init(&c->snapshot_table_lock); + init_rwsem(&c->snapshot_create_lock); + + spin_lock_init(&c->btree_write_error_lock); + + INIT_WORK(&c->journal_seq_blacklist_gc_work, + bch2_blacklist_entries_gc); + + INIT_LIST_HEAD(&c->journal_iters); + + INIT_LIST_HEAD(&c->fsck_error_msgs); + mutex_init(&c->fsck_error_msgs_lock); + + seqcount_init(&c->gc_pos_lock); + + seqcount_init(&c->usage_lock); + + sema_init(&c->io_in_flight, 128); + + INIT_LIST_HEAD(&c->vfs_inodes_list); + mutex_init(&c->vfs_inodes_lock); + + c->copy_gc_enabled = 1; + c->rebalance.enabled = 1; + c->promote_whole_extents = true; + + c->journal.flush_write_time = &c->times[BCH_TIME_journal_flush_write]; + c->journal.noflush_write_time = &c->times[BCH_TIME_journal_noflush_write]; + c->journal.blocked_time = &c->times[BCH_TIME_blocked_journal]; + c->journal.flush_seq_time = &c->times[BCH_TIME_journal_flush_seq]; + + bch2_fs_btree_cache_init_early(&c->btree_cache); + + mutex_init(&c->sectors_available_lock); + + ret = percpu_init_rwsem(&c->mark_lock); + if (ret) + goto err; + + mutex_lock(&c->sb_lock); + ret = bch2_sb_to_fs(c, sb); + mutex_unlock(&c->sb_lock); + + if (ret) + goto err; + + pr_uuid(&name, c->sb.user_uuid.b); + strscpy(c->name, name.buf, sizeof(c->name)); + printbuf_exit(&name); + + ret = name.allocation_failure ? -BCH_ERR_ENOMEM_fs_name_alloc : 0; + if (ret) + goto err; + + /* Compat: */ + if (le16_to_cpu(sb->version) <= bcachefs_metadata_version_inode_v2 && + !BCH_SB_JOURNAL_FLUSH_DELAY(sb)) + SET_BCH_SB_JOURNAL_FLUSH_DELAY(sb, 1000); + + if (le16_to_cpu(sb->version) <= bcachefs_metadata_version_inode_v2 && + !BCH_SB_JOURNAL_RECLAIM_DELAY(sb)) + SET_BCH_SB_JOURNAL_RECLAIM_DELAY(sb, 100); + + c->opts = bch2_opts_default; + ret = bch2_opts_from_sb(&c->opts, sb); + if (ret) + goto err; + + bch2_opts_apply(&c->opts, opts); + + c->btree_key_cache_btrees |= 1U << BTREE_ID_alloc; + if (c->opts.inodes_use_key_cache) + c->btree_key_cache_btrees |= 1U << BTREE_ID_inodes; + c->btree_key_cache_btrees |= 1U << BTREE_ID_logged_ops; + + c->block_bits = ilog2(block_sectors(c)); + c->btree_foreground_merge_threshold = BTREE_FOREGROUND_MERGE_THRESHOLD(c); + + if (bch2_fs_init_fault("fs_alloc")) { + bch_err(c, "fs_alloc fault injected"); + ret = -EFAULT; + goto err; + } + + iter_size = sizeof(struct sort_iter) + + (btree_blocks(c) + 1) * 2 * + sizeof(struct sort_iter_set); + + c->inode_shard_bits = ilog2(roundup_pow_of_two(num_possible_cpus())); + + if (!(c->btree_update_wq = alloc_workqueue("bcachefs", + WQ_FREEZABLE|WQ_UNBOUND|WQ_MEM_RECLAIM, 512)) || + !(c->btree_io_complete_wq = alloc_workqueue("bcachefs_btree_io", + WQ_FREEZABLE|WQ_MEM_RECLAIM, 1)) || + !(c->copygc_wq = alloc_workqueue("bcachefs_copygc", + WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE, 1)) || + !(c->io_complete_wq = alloc_workqueue("bcachefs_io", + WQ_FREEZABLE|WQ_HIGHPRI|WQ_MEM_RECLAIM, 1)) || + !(c->write_ref_wq = alloc_workqueue("bcachefs_write_ref", + WQ_FREEZABLE, 0)) || +#ifndef BCH_WRITE_REF_DEBUG + percpu_ref_init(&c->writes, bch2_writes_disabled, + PERCPU_REF_INIT_DEAD, GFP_KERNEL) || +#endif + mempool_init_kmalloc_pool(&c->fill_iter, 1, iter_size) || + bioset_init(&c->btree_bio, 1, + max(offsetof(struct btree_read_bio, bio), + offsetof(struct btree_write_bio, wbio.bio)), + BIOSET_NEED_BVECS) || + !(c->pcpu = alloc_percpu(struct bch_fs_pcpu)) || + !(c->online_reserved = alloc_percpu(u64)) || + mempool_init_kvpmalloc_pool(&c->btree_bounce_pool, 1, + btree_bytes(c)) || + mempool_init_kmalloc_pool(&c->large_bkey_pool, 1, 2048) || + !(c->unused_inode_hints = kcalloc(1U << c->inode_shard_bits, + sizeof(u64), GFP_KERNEL))) { + ret = -BCH_ERR_ENOMEM_fs_other_alloc; + goto err; + } + + ret = bch2_fs_counters_init(c) ?: + bch2_fs_sb_errors_init(c) ?: + bch2_io_clock_init(&c->io_clock[READ]) ?: + bch2_io_clock_init(&c->io_clock[WRITE]) ?: + bch2_fs_journal_init(&c->journal) ?: + bch2_fs_replicas_init(c) ?: + bch2_fs_btree_cache_init(c) ?: + bch2_fs_btree_key_cache_init(&c->btree_key_cache) ?: + bch2_fs_btree_iter_init(c) ?: + bch2_fs_btree_interior_update_init(c) ?: + bch2_fs_buckets_waiting_for_journal_init(c) ?: + bch2_fs_btree_write_buffer_init(c) ?: + bch2_fs_subvolumes_init(c) ?: + bch2_fs_io_read_init(c) ?: + bch2_fs_io_write_init(c) ?: + bch2_fs_nocow_locking_init(c) ?: + bch2_fs_encryption_init(c) ?: + bch2_fs_compress_init(c) ?: + bch2_fs_ec_init(c) ?: + bch2_fs_fsio_init(c) ?: + bch2_fs_fs_io_buffered_init(c) ?: + bch2_fs_fs_io_direct_init(c); + if (ret) + goto err; + + for (i = 0; i < c->sb.nr_devices; i++) + if (bch2_dev_exists(c->disk_sb.sb, i) && + bch2_dev_alloc(c, i)) { + ret = -EEXIST; + goto err; + } + + bch2_journal_entry_res_resize(&c->journal, + &c->btree_root_journal_res, + BTREE_ID_NR * (JSET_KEYS_U64s + BKEY_BTREE_PTR_U64s_MAX)); + bch2_dev_usage_journal_reserve(c); + bch2_journal_entry_res_resize(&c->journal, + &c->clock_journal_res, + (sizeof(struct jset_entry_clock) / sizeof(u64)) * 2); + + mutex_lock(&bch_fs_list_lock); + ret = bch2_fs_online(c); + mutex_unlock(&bch_fs_list_lock); + + if (ret) + goto err; +out: + return c; +err: + bch2_fs_free(c); + c = ERR_PTR(ret); + goto out; +} + +noinline_for_stack +static void print_mount_opts(struct bch_fs *c) +{ + enum bch_opt_id i; + struct printbuf p = PRINTBUF; + bool first = true; + + prt_str(&p, "mounting version "); + bch2_version_to_text(&p, c->sb.version); + + if (c->opts.read_only) { + prt_str(&p, " opts="); + first = false; + prt_printf(&p, "ro"); + } + + for (i = 0; i < bch2_opts_nr; i++) { + const struct bch_option *opt = &bch2_opt_table[i]; + u64 v = bch2_opt_get_by_id(&c->opts, i); + + if (!(opt->flags & OPT_MOUNT)) + continue; + + if (v == bch2_opt_get_by_id(&bch2_opts_default, i)) + continue; + + prt_str(&p, first ? " opts=" : ","); + first = false; + bch2_opt_to_text(&p, c, c->disk_sb.sb, opt, v, OPT_SHOW_MOUNT_STYLE); + } + + bch_info(c, "%s", p.buf); + printbuf_exit(&p); +} + +int bch2_fs_start(struct bch_fs *c) +{ + struct bch_dev *ca; + time64_t now = ktime_get_real_seconds(); + unsigned i; + int ret; + + print_mount_opts(c); + + down_write(&c->state_lock); + + BUG_ON(test_bit(BCH_FS_STARTED, &c->flags)); + + mutex_lock(&c->sb_lock); + + ret = bch2_sb_members_v2_init(c); + if (ret) { + mutex_unlock(&c->sb_lock); + goto err; + } + + for_each_online_member(ca, c, i) + bch2_members_v2_get_mut(c->disk_sb.sb, i)->last_mount = cpu_to_le64(now); + + mutex_unlock(&c->sb_lock); + + for_each_rw_member(ca, c, i) + bch2_dev_allocator_add(c, ca); + bch2_recalc_capacity(c); + + ret = BCH_SB_INITIALIZED(c->disk_sb.sb) + ? bch2_fs_recovery(c) + : bch2_fs_initialize(c); + if (ret) + goto err; + + ret = bch2_opts_check_may_set(c); + if (ret) + goto err; + + if (bch2_fs_init_fault("fs_start")) { + bch_err(c, "fs_start fault injected"); + ret = -EINVAL; + goto err; + } + + set_bit(BCH_FS_STARTED, &c->flags); + + if (c->opts.read_only || c->opts.nochanges) { + bch2_fs_read_only(c); + } else { + ret = !test_bit(BCH_FS_RW, &c->flags) + ? bch2_fs_read_write(c) + : bch2_fs_read_write_late(c); + if (ret) + goto err; + } + + ret = 0; +out: + up_write(&c->state_lock); + return ret; +err: + bch_err_msg(c, ret, "starting filesystem"); + goto out; +} + +static int bch2_dev_may_add(struct bch_sb *sb, struct bch_fs *c) +{ + struct bch_member m = bch2_sb_member_get(sb, sb->dev_idx); + + if (le16_to_cpu(sb->block_size) != block_sectors(c)) + return -BCH_ERR_mismatched_block_size; + + if (le16_to_cpu(m.bucket_size) < + BCH_SB_BTREE_NODE_SIZE(c->disk_sb.sb)) + return -BCH_ERR_bucket_size_too_small; + + return 0; +} + +static int bch2_dev_in_fs(struct bch_sb *fs, struct bch_sb *sb) +{ + struct bch_sb *newest = + le64_to_cpu(fs->seq) > le64_to_cpu(sb->seq) ? fs : sb; + + if (!uuid_equal(&fs->uuid, &sb->uuid)) + return -BCH_ERR_device_not_a_member_of_filesystem; + + if (!bch2_dev_exists(newest, sb->dev_idx)) + return -BCH_ERR_device_has_been_removed; + + if (fs->block_size != sb->block_size) + return -BCH_ERR_mismatched_block_size; + + return 0; +} + +/* Device startup/shutdown: */ + +static void bch2_dev_release(struct kobject *kobj) +{ + struct bch_dev *ca = container_of(kobj, struct bch_dev, kobj); + + kfree(ca); +} + +static void bch2_dev_free(struct bch_dev *ca) +{ + cancel_work_sync(&ca->io_error_work); + + if (ca->kobj.state_in_sysfs && + ca->disk_sb.bdev) + sysfs_remove_link(bdev_kobj(ca->disk_sb.bdev), "bcachefs"); + + if (ca->kobj.state_in_sysfs) + kobject_del(&ca->kobj); + + bch2_free_super(&ca->disk_sb); + bch2_dev_journal_exit(ca); + + free_percpu(ca->io_done); + bioset_exit(&ca->replica_set); + bch2_dev_buckets_free(ca); + free_page((unsigned long) ca->sb_read_scratch); + + bch2_time_stats_exit(&ca->io_latency[WRITE]); + bch2_time_stats_exit(&ca->io_latency[READ]); + + percpu_ref_exit(&ca->io_ref); + percpu_ref_exit(&ca->ref); + kobject_put(&ca->kobj); +} + +static void __bch2_dev_offline(struct bch_fs *c, struct bch_dev *ca) +{ + + lockdep_assert_held(&c->state_lock); + + if (percpu_ref_is_zero(&ca->io_ref)) + return; + + __bch2_dev_read_only(c, ca); + + reinit_completion(&ca->io_ref_completion); + percpu_ref_kill(&ca->io_ref); + wait_for_completion(&ca->io_ref_completion); + + if (ca->kobj.state_in_sysfs) { + sysfs_remove_link(bdev_kobj(ca->disk_sb.bdev), "bcachefs"); + sysfs_remove_link(&ca->kobj, "block"); + } + + bch2_free_super(&ca->disk_sb); + bch2_dev_journal_exit(ca); +} + +static void bch2_dev_ref_complete(struct percpu_ref *ref) +{ + struct bch_dev *ca = container_of(ref, struct bch_dev, ref); + + complete(&ca->ref_completion); +} + +static void bch2_dev_io_ref_complete(struct percpu_ref *ref) +{ + struct bch_dev *ca = container_of(ref, struct bch_dev, io_ref); + + complete(&ca->io_ref_completion); +} + +static int bch2_dev_sysfs_online(struct bch_fs *c, struct bch_dev *ca) +{ + int ret; + + if (!c->kobj.state_in_sysfs) + return 0; + + if (!ca->kobj.state_in_sysfs) { + ret = kobject_add(&ca->kobj, &c->kobj, + "dev-%u", ca->dev_idx); + if (ret) + return ret; + } + + if (ca->disk_sb.bdev) { + struct kobject *block = bdev_kobj(ca->disk_sb.bdev); + + ret = sysfs_create_link(block, &ca->kobj, "bcachefs"); + if (ret) + return ret; + + ret = sysfs_create_link(&ca->kobj, block, "block"); + if (ret) + return ret; + } + + return 0; +} + +static struct bch_dev *__bch2_dev_alloc(struct bch_fs *c, + struct bch_member *member) +{ + struct bch_dev *ca; + unsigned i; + + ca = kzalloc(sizeof(*ca), GFP_KERNEL); + if (!ca) + return NULL; + + kobject_init(&ca->kobj, &bch2_dev_ktype); + init_completion(&ca->ref_completion); + init_completion(&ca->io_ref_completion); + + init_rwsem(&ca->bucket_lock); + + INIT_WORK(&ca->io_error_work, bch2_io_error_work); + + bch2_time_stats_init(&ca->io_latency[READ]); + bch2_time_stats_init(&ca->io_latency[WRITE]); + + ca->mi = bch2_mi_to_cpu(member); + + for (i = 0; i < ARRAY_SIZE(member->errors); i++) + atomic64_set(&ca->errors[i], le64_to_cpu(member->errors[i])); + + ca->uuid = member->uuid; + + ca->nr_btree_reserve = DIV_ROUND_UP(BTREE_NODE_RESERVE, + ca->mi.bucket_size / btree_sectors(c)); + + if (percpu_ref_init(&ca->ref, bch2_dev_ref_complete, + 0, GFP_KERNEL) || + percpu_ref_init(&ca->io_ref, bch2_dev_io_ref_complete, + PERCPU_REF_INIT_DEAD, GFP_KERNEL) || + !(ca->sb_read_scratch = (void *) __get_free_page(GFP_KERNEL)) || + bch2_dev_buckets_alloc(c, ca) || + bioset_init(&ca->replica_set, 4, + offsetof(struct bch_write_bio, bio), 0) || + !(ca->io_done = alloc_percpu(*ca->io_done))) + goto err; + + return ca; +err: + bch2_dev_free(ca); + return NULL; +} + +static void bch2_dev_attach(struct bch_fs *c, struct bch_dev *ca, + unsigned dev_idx) +{ + ca->dev_idx = dev_idx; + __set_bit(ca->dev_idx, ca->self.d); + scnprintf(ca->name, sizeof(ca->name), "dev-%u", dev_idx); + + ca->fs = c; + rcu_assign_pointer(c->devs[ca->dev_idx], ca); + + if (bch2_dev_sysfs_online(c, ca)) + pr_warn("error creating sysfs objects"); +} + +static int bch2_dev_alloc(struct bch_fs *c, unsigned dev_idx) +{ + struct bch_member member = bch2_sb_member_get(c->disk_sb.sb, dev_idx); + struct bch_dev *ca = NULL; + int ret = 0; + + if (bch2_fs_init_fault("dev_alloc")) + goto err; + + ca = __bch2_dev_alloc(c, &member); + if (!ca) + goto err; + + ca->fs = c; + + bch2_dev_attach(c, ca, dev_idx); + return ret; +err: + if (ca) + bch2_dev_free(ca); + return -BCH_ERR_ENOMEM_dev_alloc; +} + +static int __bch2_dev_attach_bdev(struct bch_dev *ca, struct bch_sb_handle *sb) +{ + unsigned ret; + + if (bch2_dev_is_online(ca)) { + bch_err(ca, "already have device online in slot %u", + sb->sb->dev_idx); + return -BCH_ERR_device_already_online; + } + + if (get_capacity(sb->bdev->bd_disk) < + ca->mi.bucket_size * ca->mi.nbuckets) { + bch_err(ca, "cannot online: device too small"); + return -BCH_ERR_device_size_too_small; + } + + BUG_ON(!percpu_ref_is_zero(&ca->io_ref)); + + ret = bch2_dev_journal_init(ca, sb->sb); + if (ret) + return ret; + + /* Commit: */ + ca->disk_sb = *sb; + memset(sb, 0, sizeof(*sb)); + + ca->dev = ca->disk_sb.bdev->bd_dev; + + percpu_ref_reinit(&ca->io_ref); + + return 0; +} + +static int bch2_dev_attach_bdev(struct bch_fs *c, struct bch_sb_handle *sb) +{ + struct bch_dev *ca; + int ret; + + lockdep_assert_held(&c->state_lock); + + if (le64_to_cpu(sb->sb->seq) > + le64_to_cpu(c->disk_sb.sb->seq)) + bch2_sb_to_fs(c, sb->sb); + + BUG_ON(sb->sb->dev_idx >= c->sb.nr_devices || + !c->devs[sb->sb->dev_idx]); + + ca = bch_dev_locked(c, sb->sb->dev_idx); + + ret = __bch2_dev_attach_bdev(ca, sb); + if (ret) + return ret; + + bch2_dev_sysfs_online(c, ca); + + if (c->sb.nr_devices == 1) + snprintf(c->name, sizeof(c->name), "%pg", ca->disk_sb.bdev); + snprintf(ca->name, sizeof(ca->name), "%pg", ca->disk_sb.bdev); + + rebalance_wakeup(c); + return 0; +} + +/* Device management: */ + +/* + * Note: this function is also used by the error paths - when a particular + * device sees an error, we call it to determine whether we can just set the + * device RO, or - if this function returns false - we'll set the whole + * filesystem RO: + * + * XXX: maybe we should be more explicit about whether we're changing state + * because we got an error or what have you? + */ +bool bch2_dev_state_allowed(struct bch_fs *c, struct bch_dev *ca, + enum bch_member_state new_state, int flags) +{ + struct bch_devs_mask new_online_devs; + struct bch_dev *ca2; + int i, nr_rw = 0, required; + + lockdep_assert_held(&c->state_lock); + + switch (new_state) { + case BCH_MEMBER_STATE_rw: + return true; + case BCH_MEMBER_STATE_ro: + if (ca->mi.state != BCH_MEMBER_STATE_rw) + return true; + + /* do we have enough devices to write to? */ + for_each_member_device(ca2, c, i) + if (ca2 != ca) + nr_rw += ca2->mi.state == BCH_MEMBER_STATE_rw; + + required = max(!(flags & BCH_FORCE_IF_METADATA_DEGRADED) + ? c->opts.metadata_replicas + : c->opts.metadata_replicas_required, + !(flags & BCH_FORCE_IF_DATA_DEGRADED) + ? c->opts.data_replicas + : c->opts.data_replicas_required); + + return nr_rw >= required; + case BCH_MEMBER_STATE_failed: + case BCH_MEMBER_STATE_spare: + if (ca->mi.state != BCH_MEMBER_STATE_rw && + ca->mi.state != BCH_MEMBER_STATE_ro) + return true; + + /* do we have enough devices to read from? */ + new_online_devs = bch2_online_devs(c); + __clear_bit(ca->dev_idx, new_online_devs.d); + + return bch2_have_enough_devs(c, new_online_devs, flags, false); + default: + BUG(); + } +} + +static bool bch2_fs_may_start(struct bch_fs *c) +{ + struct bch_dev *ca; + unsigned i, flags = 0; + + if (c->opts.very_degraded) + flags |= BCH_FORCE_IF_DEGRADED|BCH_FORCE_IF_LOST; + + if (c->opts.degraded) + flags |= BCH_FORCE_IF_DEGRADED; + + if (!c->opts.degraded && + !c->opts.very_degraded) { + mutex_lock(&c->sb_lock); + + for (i = 0; i < c->disk_sb.sb->nr_devices; i++) { + if (!bch2_dev_exists(c->disk_sb.sb, i)) + continue; + + ca = bch_dev_locked(c, i); + + if (!bch2_dev_is_online(ca) && + (ca->mi.state == BCH_MEMBER_STATE_rw || + ca->mi.state == BCH_MEMBER_STATE_ro)) { + mutex_unlock(&c->sb_lock); + return false; + } + } + mutex_unlock(&c->sb_lock); + } + + return bch2_have_enough_devs(c, bch2_online_devs(c), flags, true); +} + +static void __bch2_dev_read_only(struct bch_fs *c, struct bch_dev *ca) +{ + /* + * The allocator thread itself allocates btree nodes, so stop it first: + */ + bch2_dev_allocator_remove(c, ca); + bch2_dev_journal_stop(&c->journal, ca); +} + +static void __bch2_dev_read_write(struct bch_fs *c, struct bch_dev *ca) +{ + lockdep_assert_held(&c->state_lock); + + BUG_ON(ca->mi.state != BCH_MEMBER_STATE_rw); + + bch2_dev_allocator_add(c, ca); + bch2_recalc_capacity(c); +} + +int __bch2_dev_set_state(struct bch_fs *c, struct bch_dev *ca, + enum bch_member_state new_state, int flags) +{ + struct bch_member *m; + int ret = 0; + + if (ca->mi.state == new_state) + return 0; + + if (!bch2_dev_state_allowed(c, ca, new_state, flags)) + return -BCH_ERR_device_state_not_allowed; + + if (new_state != BCH_MEMBER_STATE_rw) + __bch2_dev_read_only(c, ca); + + bch_notice(ca, "%s", bch2_member_states[new_state]); + + mutex_lock(&c->sb_lock); + m = bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx); + SET_BCH_MEMBER_STATE(m, new_state); + bch2_write_super(c); + mutex_unlock(&c->sb_lock); + + if (new_state == BCH_MEMBER_STATE_rw) + __bch2_dev_read_write(c, ca); + + rebalance_wakeup(c); + + return ret; +} + +int bch2_dev_set_state(struct bch_fs *c, struct bch_dev *ca, + enum bch_member_state new_state, int flags) +{ + int ret; + + down_write(&c->state_lock); + ret = __bch2_dev_set_state(c, ca, new_state, flags); + up_write(&c->state_lock); + + return ret; +} + +/* Device add/removal: */ + +static int bch2_dev_remove_alloc(struct bch_fs *c, struct bch_dev *ca) +{ + struct bpos start = POS(ca->dev_idx, 0); + struct bpos end = POS(ca->dev_idx, U64_MAX); + int ret; + + /* + * We clear the LRU and need_discard btrees first so that we don't race + * with bch2_do_invalidates() and bch2_do_discards() + */ + ret = bch2_btree_delete_range(c, BTREE_ID_lru, start, end, + BTREE_TRIGGER_NORUN, NULL) ?: + bch2_btree_delete_range(c, BTREE_ID_need_discard, start, end, + BTREE_TRIGGER_NORUN, NULL) ?: + bch2_btree_delete_range(c, BTREE_ID_freespace, start, end, + BTREE_TRIGGER_NORUN, NULL) ?: + bch2_btree_delete_range(c, BTREE_ID_backpointers, start, end, + BTREE_TRIGGER_NORUN, NULL) ?: + bch2_btree_delete_range(c, BTREE_ID_alloc, start, end, + BTREE_TRIGGER_NORUN, NULL) ?: + bch2_btree_delete_range(c, BTREE_ID_bucket_gens, start, end, + BTREE_TRIGGER_NORUN, NULL); + if (ret) + bch_err_msg(c, ret, "removing dev alloc info"); + + return ret; +} + +int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags) +{ + struct bch_member *m; + unsigned dev_idx = ca->dev_idx, data; + int ret; + + down_write(&c->state_lock); + + /* + * We consume a reference to ca->ref, regardless of whether we succeed + * or fail: + */ + percpu_ref_put(&ca->ref); + + if (!bch2_dev_state_allowed(c, ca, BCH_MEMBER_STATE_failed, flags)) { + bch_err(ca, "Cannot remove without losing data"); + ret = -BCH_ERR_device_state_not_allowed; + goto err; + } + + __bch2_dev_read_only(c, ca); + + ret = bch2_dev_data_drop(c, ca->dev_idx, flags); + if (ret) { + bch_err_msg(ca, ret, "dropping data"); + goto err; + } + + ret = bch2_dev_remove_alloc(c, ca); + if (ret) { + bch_err_msg(ca, ret, "deleting alloc info"); + goto err; + } + + ret = bch2_journal_flush_device_pins(&c->journal, ca->dev_idx); + if (ret) { + bch_err_msg(ca, ret, "flushing journal"); + goto err; + } + + ret = bch2_journal_flush(&c->journal); + if (ret) { + bch_err(ca, "journal error"); + goto err; + } + + ret = bch2_replicas_gc2(c); + if (ret) { + bch_err_msg(ca, ret, "in replicas_gc2()"); + goto err; + } + + data = bch2_dev_has_data(c, ca); + if (data) { + struct printbuf data_has = PRINTBUF; + + prt_bitflags(&data_has, bch2_data_types, data); + bch_err(ca, "Remove failed, still has data (%s)", data_has.buf); + printbuf_exit(&data_has); + ret = -EBUSY; + goto err; + } + + __bch2_dev_offline(c, ca); + + mutex_lock(&c->sb_lock); + rcu_assign_pointer(c->devs[ca->dev_idx], NULL); + mutex_unlock(&c->sb_lock); + + percpu_ref_kill(&ca->ref); + wait_for_completion(&ca->ref_completion); + + bch2_dev_free(ca); + + /* + * At this point the device object has been removed in-core, but the + * on-disk journal might still refer to the device index via sb device + * usage entries. Recovery fails if it sees usage information for an + * invalid device. Flush journal pins to push the back of the journal + * past now invalid device index references before we update the + * superblock, but after the device object has been removed so any + * further journal writes elide usage info for the device. + */ + bch2_journal_flush_all_pins(&c->journal); + + /* + * Free this device's slot in the bch_member array - all pointers to + * this device must be gone: + */ + mutex_lock(&c->sb_lock); + m = bch2_members_v2_get_mut(c->disk_sb.sb, dev_idx); + memset(&m->uuid, 0, sizeof(m->uuid)); + + bch2_write_super(c); + + mutex_unlock(&c->sb_lock); + up_write(&c->state_lock); + + bch2_dev_usage_journal_reserve(c); + return 0; +err: + if (ca->mi.state == BCH_MEMBER_STATE_rw && + !percpu_ref_is_zero(&ca->io_ref)) + __bch2_dev_read_write(c, ca); + up_write(&c->state_lock); + return ret; +} + +/* Add new device to running filesystem: */ +int bch2_dev_add(struct bch_fs *c, const char *path) +{ + struct bch_opts opts = bch2_opts_empty(); + struct bch_sb_handle sb; + struct bch_dev *ca = NULL; + struct bch_sb_field_members_v2 *mi; + struct bch_member dev_mi; + unsigned dev_idx, nr_devices, u64s; + struct printbuf errbuf = PRINTBUF; + struct printbuf label = PRINTBUF; + int ret; + + ret = bch2_read_super(path, &opts, &sb); + if (ret) { + bch_err_msg(c, ret, "reading super"); + goto err; + } + + dev_mi = bch2_sb_member_get(sb.sb, sb.sb->dev_idx); + + if (BCH_MEMBER_GROUP(&dev_mi)) { + bch2_disk_path_to_text_sb(&label, sb.sb, BCH_MEMBER_GROUP(&dev_mi) - 1); + if (label.allocation_failure) { + ret = -ENOMEM; + goto err; + } + } + + ret = bch2_dev_may_add(sb.sb, c); + if (ret) { + bch_err_fn(c, ret); + goto err; + } + + ca = __bch2_dev_alloc(c, &dev_mi); + if (!ca) { + ret = -ENOMEM; + goto err; + } + + bch2_dev_usage_init(ca); + + ret = __bch2_dev_attach_bdev(ca, &sb); + if (ret) + goto err; + + ret = bch2_dev_journal_alloc(ca); + if (ret) { + bch_err_msg(c, ret, "allocating journal"); + goto err; + } + + down_write(&c->state_lock); + mutex_lock(&c->sb_lock); + + ret = bch2_sb_from_fs(c, ca); + if (ret) { + bch_err_msg(c, ret, "setting up new superblock"); + goto err_unlock; + } + + if (dynamic_fault("bcachefs:add:no_slot")) + goto no_slot; + + for (dev_idx = 0; dev_idx < BCH_SB_MEMBERS_MAX; dev_idx++) + if (!bch2_dev_exists(c->disk_sb.sb, dev_idx)) + goto have_slot; +no_slot: + ret = -BCH_ERR_ENOSPC_sb_members; + bch_err_msg(c, ret, "setting up new superblock"); + goto err_unlock; + +have_slot: + nr_devices = max_t(unsigned, dev_idx + 1, c->sb.nr_devices); + + mi = bch2_sb_field_get(c->disk_sb.sb, members_v2); + u64s = DIV_ROUND_UP(sizeof(struct bch_sb_field_members_v2) + + le16_to_cpu(mi->member_bytes) * nr_devices, sizeof(u64)); + + mi = bch2_sb_field_resize(&c->disk_sb, members_v2, u64s); + if (!mi) { + ret = -BCH_ERR_ENOSPC_sb_members; + bch_err_msg(c, ret, "setting up new superblock"); + goto err_unlock; + } + struct bch_member *m = bch2_members_v2_get_mut(c->disk_sb.sb, dev_idx); + + /* success: */ + + *m = dev_mi; + m->last_mount = cpu_to_le64(ktime_get_real_seconds()); + c->disk_sb.sb->nr_devices = nr_devices; + + ca->disk_sb.sb->dev_idx = dev_idx; + bch2_dev_attach(c, ca, dev_idx); + + if (BCH_MEMBER_GROUP(&dev_mi)) { + ret = __bch2_dev_group_set(c, ca, label.buf); + if (ret) { + bch_err_msg(c, ret, "creating new label"); + goto err_unlock; + } + } + + bch2_write_super(c); + mutex_unlock(&c->sb_lock); + + bch2_dev_usage_journal_reserve(c); + + ret = bch2_trans_mark_dev_sb(c, ca); + if (ret) { + bch_err_msg(ca, ret, "marking new superblock"); + goto err_late; + } + + ret = bch2_fs_freespace_init(c); + if (ret) { + bch_err_msg(ca, ret, "initializing free space"); + goto err_late; + } + + ca->new_fs_bucket_idx = 0; + + if (ca->mi.state == BCH_MEMBER_STATE_rw) + __bch2_dev_read_write(c, ca); + + up_write(&c->state_lock); + return 0; + +err_unlock: + mutex_unlock(&c->sb_lock); + up_write(&c->state_lock); +err: + if (ca) + bch2_dev_free(ca); + bch2_free_super(&sb); + printbuf_exit(&label); + printbuf_exit(&errbuf); + return ret; +err_late: + up_write(&c->state_lock); + ca = NULL; + goto err; +} + +/* Hot add existing device to running filesystem: */ +int bch2_dev_online(struct bch_fs *c, const char *path) +{ + struct bch_opts opts = bch2_opts_empty(); + struct bch_sb_handle sb = { NULL }; + struct bch_dev *ca; + unsigned dev_idx; + int ret; + + down_write(&c->state_lock); + + ret = bch2_read_super(path, &opts, &sb); + if (ret) { + up_write(&c->state_lock); + return ret; + } + + dev_idx = sb.sb->dev_idx; + + ret = bch2_dev_in_fs(c->disk_sb.sb, sb.sb); + if (ret) { + bch_err_msg(c, ret, "bringing %s online", path); + goto err; + } + + ret = bch2_dev_attach_bdev(c, &sb); + if (ret) + goto err; + + ca = bch_dev_locked(c, dev_idx); + + ret = bch2_trans_mark_dev_sb(c, ca); + if (ret) { + bch_err_msg(c, ret, "bringing %s online: error from bch2_trans_mark_dev_sb", path); + goto err; + } + + if (ca->mi.state == BCH_MEMBER_STATE_rw) + __bch2_dev_read_write(c, ca); + + if (!ca->mi.freespace_initialized) { + ret = bch2_dev_freespace_init(c, ca, 0, ca->mi.nbuckets); + bch_err_msg(ca, ret, "initializing free space"); + if (ret) + goto err; + } + + if (!ca->journal.nr) { + ret = bch2_dev_journal_alloc(ca); + bch_err_msg(ca, ret, "allocating journal"); + if (ret) + goto err; + } + + mutex_lock(&c->sb_lock); + bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx)->last_mount = + cpu_to_le64(ktime_get_real_seconds()); + bch2_write_super(c); + mutex_unlock(&c->sb_lock); + + up_write(&c->state_lock); + return 0; +err: + up_write(&c->state_lock); + bch2_free_super(&sb); + return ret; +} + +int bch2_dev_offline(struct bch_fs *c, struct bch_dev *ca, int flags) +{ + down_write(&c->state_lock); + + if (!bch2_dev_is_online(ca)) { + bch_err(ca, "Already offline"); + up_write(&c->state_lock); + return 0; + } + + if (!bch2_dev_state_allowed(c, ca, BCH_MEMBER_STATE_failed, flags)) { + bch_err(ca, "Cannot offline required disk"); + up_write(&c->state_lock); + return -BCH_ERR_device_state_not_allowed; + } + + __bch2_dev_offline(c, ca); + + up_write(&c->state_lock); + return 0; +} + +int bch2_dev_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) +{ + struct bch_member *m; + u64 old_nbuckets; + int ret = 0; + + down_write(&c->state_lock); + old_nbuckets = ca->mi.nbuckets; + + if (nbuckets < ca->mi.nbuckets) { + bch_err(ca, "Cannot shrink yet"); + ret = -EINVAL; + goto err; + } + + if (bch2_dev_is_online(ca) && + get_capacity(ca->disk_sb.bdev->bd_disk) < + ca->mi.bucket_size * nbuckets) { + bch_err(ca, "New size larger than device"); + ret = -BCH_ERR_device_size_too_small; + goto err; + } + + ret = bch2_dev_buckets_resize(c, ca, nbuckets); + if (ret) { + bch_err_msg(ca, ret, "resizing buckets"); + goto err; + } + + ret = bch2_trans_mark_dev_sb(c, ca); + if (ret) + goto err; + + mutex_lock(&c->sb_lock); + m = bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx); + m->nbuckets = cpu_to_le64(nbuckets); + + bch2_write_super(c); + mutex_unlock(&c->sb_lock); + + if (ca->mi.freespace_initialized) { + ret = bch2_dev_freespace_init(c, ca, old_nbuckets, nbuckets); + if (ret) + goto err; + + /* + * XXX: this is all wrong transactionally - we'll be able to do + * this correctly after the disk space accounting rewrite + */ + ca->usage_base->d[BCH_DATA_free].buckets += nbuckets - old_nbuckets; + } + + bch2_recalc_capacity(c); +err: + up_write(&c->state_lock); + return ret; +} + +/* return with ref on ca->ref: */ +struct bch_dev *bch2_dev_lookup(struct bch_fs *c, const char *name) +{ + struct bch_dev *ca; + unsigned i; + + rcu_read_lock(); + for_each_member_device_rcu(ca, c, i, NULL) + if (!strcmp(name, ca->name)) + goto found; + ca = ERR_PTR(-BCH_ERR_ENOENT_dev_not_found); +found: + rcu_read_unlock(); + + return ca; +} + +/* Filesystem open: */ + +struct bch_fs *bch2_fs_open(char * const *devices, unsigned nr_devices, + struct bch_opts opts) +{ + DARRAY(struct bch_sb_handle) sbs = { 0 }; + struct bch_fs *c = NULL; + struct bch_sb_handle *sb, *best = NULL; + struct printbuf errbuf = PRINTBUF; + int ret = 0; + + if (!try_module_get(THIS_MODULE)) + return ERR_PTR(-ENODEV); + + if (!nr_devices) { + ret = -EINVAL; + goto err; + } + + ret = darray_make_room(&sbs, nr_devices); + if (ret) + goto err; + + for (unsigned i = 0; i < nr_devices; i++) { + struct bch_sb_handle sb = { NULL }; + + ret = bch2_read_super(devices[i], &opts, &sb); + if (ret) + goto err; + + BUG_ON(darray_push(&sbs, sb)); + } + + darray_for_each(sbs, sb) + if (!best || le64_to_cpu(sb->sb->seq) > le64_to_cpu(best->sb->seq)) + best = sb; + + darray_for_each_reverse(sbs, sb) { + if (sb != best && !bch2_dev_exists(best->sb, sb->sb->dev_idx)) { + pr_info("%pg has been removed, skipping", sb->bdev); + bch2_free_super(sb); + darray_remove_item(&sbs, sb); + best -= best > sb; + continue; + } + + ret = bch2_dev_in_fs(best->sb, sb->sb); + if (ret) + goto err_print; + } + + c = bch2_fs_alloc(best->sb, opts); + ret = PTR_ERR_OR_ZERO(c); + if (ret) + goto err; + + down_write(&c->state_lock); + darray_for_each(sbs, sb) { + ret = bch2_dev_attach_bdev(c, sb); + if (ret) { + up_write(&c->state_lock); + goto err; + } + } + up_write(&c->state_lock); + + if (!bch2_fs_may_start(c)) { + ret = -BCH_ERR_insufficient_devices_to_start; + goto err_print; + } + + if (!c->opts.nostart) { + ret = bch2_fs_start(c); + if (ret) + goto err; + } +out: + darray_for_each(sbs, sb) + bch2_free_super(sb); + darray_exit(&sbs); + printbuf_exit(&errbuf); + module_put(THIS_MODULE); + return c; +err_print: + pr_err("bch_fs_open err opening %s: %s", + devices[0], bch2_err_str(ret)); +err: + if (!IS_ERR_OR_NULL(c)) + bch2_fs_stop(c); + c = ERR_PTR(ret); + goto out; +} + +/* Global interfaces/init */ + +static void bcachefs_exit(void) +{ + bch2_debug_exit(); + bch2_vfs_exit(); + bch2_chardev_exit(); + bch2_btree_key_cache_exit(); + if (bcachefs_kset) + kset_unregister(bcachefs_kset); +} + +static int __init bcachefs_init(void) +{ + bch2_bkey_pack_test(); + + if (!(bcachefs_kset = kset_create_and_add("bcachefs", NULL, fs_kobj)) || + bch2_btree_key_cache_init() || + bch2_chardev_init() || + bch2_vfs_init() || + bch2_debug_init()) + goto err; + + return 0; +err: + bcachefs_exit(); + return -ENOMEM; +} + +#define BCH_DEBUG_PARAM(name, description) \ + bool bch2_##name; \ + module_param_named(name, bch2_##name, bool, 0644); \ + MODULE_PARM_DESC(name, description); +BCH_DEBUG_PARAMS() +#undef BCH_DEBUG_PARAM + +__maybe_unused +static unsigned bch2_metadata_version = bcachefs_metadata_version_current; +module_param_named(version, bch2_metadata_version, uint, 0400); + +module_exit(bcachefs_exit); +module_init(bcachefs_init); diff --git a/fs/bcachefs/super.h b/fs/bcachefs/super.h new file mode 100644 index 000000000000..bf762df18012 --- /dev/null +++ b/fs/bcachefs/super.h @@ -0,0 +1,52 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_SUPER_H +#define _BCACHEFS_SUPER_H + +#include "extents.h" + +#include "bcachefs_ioctl.h" + +#include <linux/math64.h> + +struct bch_fs *bch2_dev_to_fs(dev_t); +struct bch_fs *bch2_uuid_to_fs(__uuid_t); + +bool bch2_dev_state_allowed(struct bch_fs *, struct bch_dev *, + enum bch_member_state, int); +int __bch2_dev_set_state(struct bch_fs *, struct bch_dev *, + enum bch_member_state, int); +int bch2_dev_set_state(struct bch_fs *, struct bch_dev *, + enum bch_member_state, int); + +int bch2_dev_fail(struct bch_dev *, int); +int bch2_dev_remove(struct bch_fs *, struct bch_dev *, int); +int bch2_dev_add(struct bch_fs *, const char *); +int bch2_dev_online(struct bch_fs *, const char *); +int bch2_dev_offline(struct bch_fs *, struct bch_dev *, int); +int bch2_dev_resize(struct bch_fs *, struct bch_dev *, u64); +struct bch_dev *bch2_dev_lookup(struct bch_fs *, const char *); + +bool bch2_fs_emergency_read_only(struct bch_fs *); +void bch2_fs_read_only(struct bch_fs *); + +int bch2_fs_read_write(struct bch_fs *); +int bch2_fs_read_write_early(struct bch_fs *); + +/* + * Only for use in the recovery/fsck path: + */ +static inline void bch2_fs_lazy_rw(struct bch_fs *c) +{ + if (!test_bit(BCH_FS_RW, &c->flags) && + !test_bit(BCH_FS_WAS_RW, &c->flags)) + bch2_fs_read_write_early(c); +} + +void __bch2_fs_stop(struct bch_fs *); +void bch2_fs_free(struct bch_fs *); +void bch2_fs_stop(struct bch_fs *); + +int bch2_fs_start(struct bch_fs *); +struct bch_fs *bch2_fs_open(char * const *, unsigned, struct bch_opts); + +#endif /* _BCACHEFS_SUPER_H */ diff --git a/fs/bcachefs/super_types.h b/fs/bcachefs/super_types.h new file mode 100644 index 000000000000..9c1fd4ca2b10 --- /dev/null +++ b/fs/bcachefs/super_types.h @@ -0,0 +1,41 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_SUPER_TYPES_H +#define _BCACHEFS_SUPER_TYPES_H + +struct bch_sb_handle { + struct bch_sb *sb; + struct block_device *bdev; + char *sb_name; + struct bio *bio; + void *holder; + size_t buffer_size; + blk_mode_t mode; + unsigned have_layout:1; + unsigned have_bio:1; + unsigned fs_sb:1; + u64 seq; +}; + +struct bch_devs_mask { + unsigned long d[BITS_TO_LONGS(BCH_SB_MEMBERS_MAX)]; +}; + +struct bch_devs_list { + u8 nr; + u8 devs[BCH_BKEY_PTRS_MAX]; +}; + +struct bch_member_cpu { + u64 nbuckets; /* device size */ + u16 first_bucket; /* index of first bucket used */ + u16 bucket_size; /* sectors */ + u16 group; + u8 state; + u8 discard; + u8 data_allowed; + u8 durability; + u8 freespace_initialized; + u8 valid; +}; + +#endif /* _BCACHEFS_SUPER_TYPES_H */ diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c new file mode 100644 index 000000000000..f3cb7115b530 --- /dev/null +++ b/fs/bcachefs/sysfs.c @@ -0,0 +1,1034 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * bcache sysfs interfaces + * + * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com> + * Copyright 2012 Google, Inc. + */ + +#ifndef NO_BCACHEFS_SYSFS + +#include "bcachefs.h" +#include "alloc_background.h" +#include "alloc_foreground.h" +#include "sysfs.h" +#include "btree_cache.h" +#include "btree_io.h" +#include "btree_iter.h" +#include "btree_key_cache.h" +#include "btree_update.h" +#include "btree_update_interior.h" +#include "btree_gc.h" +#include "buckets.h" +#include "clock.h" +#include "disk_groups.h" +#include "ec.h" +#include "inode.h" +#include "journal.h" +#include "keylist.h" +#include "move.h" +#include "movinggc.h" +#include "nocow_locking.h" +#include "opts.h" +#include "rebalance.h" +#include "replicas.h" +#include "super-io.h" +#include "tests.h" + +#include <linux/blkdev.h> +#include <linux/sort.h> +#include <linux/sched/clock.h> + +#include "util.h" + +#define SYSFS_OPS(type) \ +const struct sysfs_ops type ## _sysfs_ops = { \ + .show = type ## _show, \ + .store = type ## _store \ +} + +#define SHOW(fn) \ +static ssize_t fn ## _to_text(struct printbuf *, \ + struct kobject *, struct attribute *); \ + \ +static ssize_t fn ## _show(struct kobject *kobj, struct attribute *attr,\ + char *buf) \ +{ \ + struct printbuf out = PRINTBUF; \ + ssize_t ret = fn ## _to_text(&out, kobj, attr); \ + \ + if (out.pos && out.buf[out.pos - 1] != '\n') \ + prt_newline(&out); \ + \ + if (!ret && out.allocation_failure) \ + ret = -ENOMEM; \ + \ + if (!ret) { \ + ret = min_t(size_t, out.pos, PAGE_SIZE - 1); \ + memcpy(buf, out.buf, ret); \ + } \ + printbuf_exit(&out); \ + return bch2_err_class(ret); \ +} \ + \ +static ssize_t fn ## _to_text(struct printbuf *out, struct kobject *kobj,\ + struct attribute *attr) + +#define STORE(fn) \ +static ssize_t fn ## _store_inner(struct kobject *, struct attribute *,\ + const char *, size_t); \ + \ +static ssize_t fn ## _store(struct kobject *kobj, struct attribute *attr,\ + const char *buf, size_t size) \ +{ \ + return bch2_err_class(fn##_store_inner(kobj, attr, buf, size)); \ +} \ + \ +static ssize_t fn ## _store_inner(struct kobject *kobj, struct attribute *attr,\ + const char *buf, size_t size) + +#define __sysfs_attribute(_name, _mode) \ + static struct attribute sysfs_##_name = \ + { .name = #_name, .mode = _mode } + +#define write_attribute(n) __sysfs_attribute(n, 0200) +#define read_attribute(n) __sysfs_attribute(n, 0444) +#define rw_attribute(n) __sysfs_attribute(n, 0644) + +#define sysfs_printf(file, fmt, ...) \ +do { \ + if (attr == &sysfs_ ## file) \ + prt_printf(out, fmt "\n", __VA_ARGS__); \ +} while (0) + +#define sysfs_print(file, var) \ +do { \ + if (attr == &sysfs_ ## file) \ + snprint(out, var); \ +} while (0) + +#define sysfs_hprint(file, val) \ +do { \ + if (attr == &sysfs_ ## file) \ + prt_human_readable_s64(out, val); \ +} while (0) + +#define sysfs_strtoul(file, var) \ +do { \ + if (attr == &sysfs_ ## file) \ + return strtoul_safe(buf, var) ?: (ssize_t) size; \ +} while (0) + +#define sysfs_strtoul_clamp(file, var, min, max) \ +do { \ + if (attr == &sysfs_ ## file) \ + return strtoul_safe_clamp(buf, var, min, max) \ + ?: (ssize_t) size; \ +} while (0) + +#define strtoul_or_return(cp) \ +({ \ + unsigned long _v; \ + int _r = kstrtoul(cp, 10, &_v); \ + if (_r) \ + return _r; \ + _v; \ +}) + +write_attribute(trigger_gc); +write_attribute(trigger_discards); +write_attribute(trigger_invalidates); +write_attribute(prune_cache); +write_attribute(btree_wakeup); +rw_attribute(btree_gc_periodic); +rw_attribute(gc_gens_pos); + +read_attribute(uuid); +read_attribute(minor); +read_attribute(bucket_size); +read_attribute(first_bucket); +read_attribute(nbuckets); +rw_attribute(durability); +read_attribute(io_done); +read_attribute(io_errors); +write_attribute(io_errors_reset); + +read_attribute(io_latency_read); +read_attribute(io_latency_write); +read_attribute(io_latency_stats_read); +read_attribute(io_latency_stats_write); +read_attribute(congested); + +read_attribute(btree_write_stats); + +read_attribute(btree_cache_size); +read_attribute(compression_stats); +read_attribute(journal_debug); +read_attribute(btree_updates); +read_attribute(btree_cache); +read_attribute(btree_key_cache); +read_attribute(stripes_heap); +read_attribute(open_buckets); +read_attribute(open_buckets_partial); +read_attribute(write_points); +read_attribute(nocow_lock_table); + +#ifdef BCH_WRITE_REF_DEBUG +read_attribute(write_refs); + +static const char * const bch2_write_refs[] = { +#define x(n) #n, + BCH_WRITE_REFS() +#undef x + NULL +}; + +static void bch2_write_refs_to_text(struct printbuf *out, struct bch_fs *c) +{ + bch2_printbuf_tabstop_push(out, 24); + + for (unsigned i = 0; i < ARRAY_SIZE(c->writes); i++) { + prt_str(out, bch2_write_refs[i]); + prt_tab(out); + prt_printf(out, "%li", atomic_long_read(&c->writes[i])); + prt_newline(out); + } +} +#endif + +read_attribute(internal_uuid); +read_attribute(disk_groups); + +read_attribute(has_data); +read_attribute(alloc_debug); + +#define x(t, n, ...) read_attribute(t); +BCH_PERSISTENT_COUNTERS() +#undef x + +rw_attribute(discard); +rw_attribute(label); + +rw_attribute(copy_gc_enabled); +read_attribute(copy_gc_wait); + +rw_attribute(rebalance_enabled); +sysfs_pd_controller_attribute(rebalance); +read_attribute(rebalance_status); +rw_attribute(promote_whole_extents); + +read_attribute(new_stripes); + +read_attribute(io_timers_read); +read_attribute(io_timers_write); + +read_attribute(moving_ctxts); + +#ifdef CONFIG_BCACHEFS_TESTS +write_attribute(perf_test); +#endif /* CONFIG_BCACHEFS_TESTS */ + +#define x(_name) \ + static struct attribute sysfs_time_stat_##_name = \ + { .name = #_name, .mode = 0444 }; + BCH_TIME_STATS() +#undef x + +static struct attribute sysfs_state_rw = { + .name = "state", + .mode = 0444, +}; + +static size_t bch2_btree_cache_size(struct bch_fs *c) +{ + size_t ret = 0; + struct btree *b; + + mutex_lock(&c->btree_cache.lock); + list_for_each_entry(b, &c->btree_cache.live, list) + ret += btree_bytes(c); + + mutex_unlock(&c->btree_cache.lock); + return ret; +} + +static int bch2_compression_stats_to_text(struct printbuf *out, struct bch_fs *c) +{ + struct btree_trans *trans; + struct btree_iter iter; + struct bkey_s_c k; + enum btree_id id; + u64 nr_uncompressed_extents = 0, + nr_compressed_extents = 0, + nr_incompressible_extents = 0, + uncompressed_sectors = 0, + incompressible_sectors = 0, + compressed_sectors_compressed = 0, + compressed_sectors_uncompressed = 0; + int ret = 0; + + if (!test_bit(BCH_FS_STARTED, &c->flags)) + return -EPERM; + + trans = bch2_trans_get(c); + + for (id = 0; id < BTREE_ID_NR; id++) { + if (!btree_type_has_ptrs(id)) + continue; + + ret = for_each_btree_key2(trans, iter, id, POS_MIN, + BTREE_ITER_ALL_SNAPSHOTS, k, ({ + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + const union bch_extent_entry *entry; + struct extent_ptr_decoded p; + bool compressed = false, uncompressed = false, incompressible = false; + + bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { + switch (p.crc.compression_type) { + case BCH_COMPRESSION_TYPE_none: + uncompressed = true; + uncompressed_sectors += k.k->size; + break; + case BCH_COMPRESSION_TYPE_incompressible: + incompressible = true; + incompressible_sectors += k.k->size; + break; + default: + compressed_sectors_compressed += + p.crc.compressed_size; + compressed_sectors_uncompressed += + p.crc.uncompressed_size; + compressed = true; + break; + } + } + + if (incompressible) + nr_incompressible_extents++; + else if (uncompressed) + nr_uncompressed_extents++; + else if (compressed) + nr_compressed_extents++; + 0; + })); + } + + bch2_trans_put(trans); + + if (ret) + return ret; + + prt_printf(out, "uncompressed:\n"); + prt_printf(out, " nr extents: %llu\n", nr_uncompressed_extents); + prt_printf(out, " size: "); + prt_human_readable_u64(out, uncompressed_sectors << 9); + prt_printf(out, "\n"); + + prt_printf(out, "compressed:\n"); + prt_printf(out, " nr extents: %llu\n", nr_compressed_extents); + prt_printf(out, " compressed size: "); + prt_human_readable_u64(out, compressed_sectors_compressed << 9); + prt_printf(out, "\n"); + prt_printf(out, " uncompressed size: "); + prt_human_readable_u64(out, compressed_sectors_uncompressed << 9); + prt_printf(out, "\n"); + + prt_printf(out, "incompressible:\n"); + prt_printf(out, " nr extents: %llu\n", nr_incompressible_extents); + prt_printf(out, " size: "); + prt_human_readable_u64(out, incompressible_sectors << 9); + prt_printf(out, "\n"); + return 0; +} + +static void bch2_gc_gens_pos_to_text(struct printbuf *out, struct bch_fs *c) +{ + prt_printf(out, "%s: ", bch2_btree_id_str(c->gc_gens_btree)); + bch2_bpos_to_text(out, c->gc_gens_pos); + prt_printf(out, "\n"); +} + +static void bch2_btree_wakeup_all(struct bch_fs *c) +{ + struct btree_trans *trans; + + seqmutex_lock(&c->btree_trans_lock); + list_for_each_entry(trans, &c->btree_trans_list, list) { + struct btree_bkey_cached_common *b = READ_ONCE(trans->locking); + + if (b) + six_lock_wakeup_all(&b->lock); + + } + seqmutex_unlock(&c->btree_trans_lock); +} + +SHOW(bch2_fs) +{ + struct bch_fs *c = container_of(kobj, struct bch_fs, kobj); + + sysfs_print(minor, c->minor); + sysfs_printf(internal_uuid, "%pU", c->sb.uuid.b); + + sysfs_hprint(btree_cache_size, bch2_btree_cache_size(c)); + + if (attr == &sysfs_btree_write_stats) + bch2_btree_write_stats_to_text(out, c); + + sysfs_printf(btree_gc_periodic, "%u", (int) c->btree_gc_periodic); + + if (attr == &sysfs_gc_gens_pos) + bch2_gc_gens_pos_to_text(out, c); + + sysfs_printf(copy_gc_enabled, "%i", c->copy_gc_enabled); + + sysfs_printf(rebalance_enabled, "%i", c->rebalance.enabled); + sysfs_pd_controller_show(rebalance, &c->rebalance.pd); /* XXX */ + + if (attr == &sysfs_copy_gc_wait) + bch2_copygc_wait_to_text(out, c); + + if (attr == &sysfs_rebalance_status) + bch2_rebalance_status_to_text(out, c); + + sysfs_print(promote_whole_extents, c->promote_whole_extents); + + /* Debugging: */ + + if (attr == &sysfs_journal_debug) + bch2_journal_debug_to_text(out, &c->journal); + + if (attr == &sysfs_btree_updates) + bch2_btree_updates_to_text(out, c); + + if (attr == &sysfs_btree_cache) + bch2_btree_cache_to_text(out, c); + + if (attr == &sysfs_btree_key_cache) + bch2_btree_key_cache_to_text(out, &c->btree_key_cache); + + if (attr == &sysfs_stripes_heap) + bch2_stripes_heap_to_text(out, c); + + if (attr == &sysfs_open_buckets) + bch2_open_buckets_to_text(out, c); + + if (attr == &sysfs_open_buckets_partial) + bch2_open_buckets_partial_to_text(out, c); + + if (attr == &sysfs_write_points) + bch2_write_points_to_text(out, c); + + if (attr == &sysfs_compression_stats) + bch2_compression_stats_to_text(out, c); + + if (attr == &sysfs_new_stripes) + bch2_new_stripes_to_text(out, c); + + if (attr == &sysfs_io_timers_read) + bch2_io_timers_to_text(out, &c->io_clock[READ]); + + if (attr == &sysfs_io_timers_write) + bch2_io_timers_to_text(out, &c->io_clock[WRITE]); + + if (attr == &sysfs_moving_ctxts) + bch2_fs_moving_ctxts_to_text(out, c); + +#ifdef BCH_WRITE_REF_DEBUG + if (attr == &sysfs_write_refs) + bch2_write_refs_to_text(out, c); +#endif + + if (attr == &sysfs_nocow_lock_table) + bch2_nocow_locks_to_text(out, &c->nocow_locks); + + if (attr == &sysfs_disk_groups) + bch2_disk_groups_to_text(out, c); + + return 0; +} + +STORE(bch2_fs) +{ + struct bch_fs *c = container_of(kobj, struct bch_fs, kobj); + + if (attr == &sysfs_btree_gc_periodic) { + ssize_t ret = strtoul_safe(buf, c->btree_gc_periodic) + ?: (ssize_t) size; + + wake_up_process(c->gc_thread); + return ret; + } + + if (attr == &sysfs_copy_gc_enabled) { + ssize_t ret = strtoul_safe(buf, c->copy_gc_enabled) + ?: (ssize_t) size; + + if (c->copygc_thread) + wake_up_process(c->copygc_thread); + return ret; + } + + if (attr == &sysfs_rebalance_enabled) { + ssize_t ret = strtoul_safe(buf, c->rebalance.enabled) + ?: (ssize_t) size; + + rebalance_wakeup(c); + return ret; + } + + sysfs_pd_controller_store(rebalance, &c->rebalance.pd); + + sysfs_strtoul(promote_whole_extents, c->promote_whole_extents); + + /* Debugging: */ + + if (!test_bit(BCH_FS_STARTED, &c->flags)) + return -EPERM; + + /* Debugging: */ + + if (!test_bit(BCH_FS_RW, &c->flags)) + return -EROFS; + + if (attr == &sysfs_prune_cache) { + struct shrink_control sc; + + sc.gfp_mask = GFP_KERNEL; + sc.nr_to_scan = strtoul_or_return(buf); + c->btree_cache.shrink->scan_objects(c->btree_cache.shrink, &sc); + } + + if (attr == &sysfs_btree_wakeup) + bch2_btree_wakeup_all(c); + + if (attr == &sysfs_trigger_gc) { + /* + * Full gc is currently incompatible with btree key cache: + */ +#if 0 + down_read(&c->state_lock); + bch2_gc(c, false, false); + up_read(&c->state_lock); +#else + bch2_gc_gens(c); +#endif + } + + if (attr == &sysfs_trigger_discards) + bch2_do_discards(c); + + if (attr == &sysfs_trigger_invalidates) + bch2_do_invalidates(c); + +#ifdef CONFIG_BCACHEFS_TESTS + if (attr == &sysfs_perf_test) { + char *tmp = kstrdup(buf, GFP_KERNEL), *p = tmp; + char *test = strsep(&p, " \t\n"); + char *nr_str = strsep(&p, " \t\n"); + char *threads_str = strsep(&p, " \t\n"); + unsigned threads; + u64 nr; + int ret = -EINVAL; + + if (threads_str && + !(ret = kstrtouint(threads_str, 10, &threads)) && + !(ret = bch2_strtoull_h(nr_str, &nr))) + ret = bch2_btree_perf_test(c, test, nr, threads); + kfree(tmp); + + if (ret) + size = ret; + } +#endif + return size; +} +SYSFS_OPS(bch2_fs); + +struct attribute *bch2_fs_files[] = { + &sysfs_minor, + &sysfs_btree_cache_size, + &sysfs_btree_write_stats, + + &sysfs_promote_whole_extents, + + &sysfs_compression_stats, + +#ifdef CONFIG_BCACHEFS_TESTS + &sysfs_perf_test, +#endif + NULL +}; + +/* counters dir */ + +SHOW(bch2_fs_counters) +{ + struct bch_fs *c = container_of(kobj, struct bch_fs, counters_kobj); + u64 counter = 0; + u64 counter_since_mount = 0; + + printbuf_tabstop_push(out, 32); + + #define x(t, ...) \ + if (attr == &sysfs_##t) { \ + counter = percpu_u64_get(&c->counters[BCH_COUNTER_##t]);\ + counter_since_mount = counter - c->counters_on_mount[BCH_COUNTER_##t];\ + prt_printf(out, "since mount:"); \ + prt_tab(out); \ + prt_human_readable_u64(out, counter_since_mount); \ + prt_newline(out); \ + \ + prt_printf(out, "since filesystem creation:"); \ + prt_tab(out); \ + prt_human_readable_u64(out, counter); \ + prt_newline(out); \ + } + BCH_PERSISTENT_COUNTERS() + #undef x + return 0; +} + +STORE(bch2_fs_counters) { + return 0; +} + +SYSFS_OPS(bch2_fs_counters); + +struct attribute *bch2_fs_counters_files[] = { +#define x(t, ...) \ + &sysfs_##t, + BCH_PERSISTENT_COUNTERS() +#undef x + NULL +}; +/* internal dir - just a wrapper */ + +SHOW(bch2_fs_internal) +{ + struct bch_fs *c = container_of(kobj, struct bch_fs, internal); + + return bch2_fs_to_text(out, &c->kobj, attr); +} + +STORE(bch2_fs_internal) +{ + struct bch_fs *c = container_of(kobj, struct bch_fs, internal); + + return bch2_fs_store(&c->kobj, attr, buf, size); +} +SYSFS_OPS(bch2_fs_internal); + +struct attribute *bch2_fs_internal_files[] = { + &sysfs_journal_debug, + &sysfs_btree_updates, + &sysfs_btree_cache, + &sysfs_btree_key_cache, + &sysfs_new_stripes, + &sysfs_stripes_heap, + &sysfs_open_buckets, + &sysfs_open_buckets_partial, + &sysfs_write_points, +#ifdef BCH_WRITE_REF_DEBUG + &sysfs_write_refs, +#endif + &sysfs_nocow_lock_table, + &sysfs_io_timers_read, + &sysfs_io_timers_write, + + &sysfs_trigger_gc, + &sysfs_trigger_discards, + &sysfs_trigger_invalidates, + &sysfs_prune_cache, + &sysfs_btree_wakeup, + + &sysfs_gc_gens_pos, + + &sysfs_copy_gc_enabled, + &sysfs_copy_gc_wait, + + &sysfs_rebalance_enabled, + &sysfs_rebalance_status, + sysfs_pd_controller_files(rebalance), + + &sysfs_moving_ctxts, + + &sysfs_internal_uuid, + + &sysfs_disk_groups, + NULL +}; + +/* options */ + +SHOW(bch2_fs_opts_dir) +{ + struct bch_fs *c = container_of(kobj, struct bch_fs, opts_dir); + const struct bch_option *opt = container_of(attr, struct bch_option, attr); + int id = opt - bch2_opt_table; + u64 v = bch2_opt_get_by_id(&c->opts, id); + + bch2_opt_to_text(out, c, c->disk_sb.sb, opt, v, OPT_SHOW_FULL_LIST); + prt_char(out, '\n'); + + return 0; +} + +STORE(bch2_fs_opts_dir) +{ + struct bch_fs *c = container_of(kobj, struct bch_fs, opts_dir); + const struct bch_option *opt = container_of(attr, struct bch_option, attr); + int ret, id = opt - bch2_opt_table; + char *tmp; + u64 v; + + /* + * We don't need to take c->writes for correctness, but it eliminates an + * unsightly error message in the dmesg log when we're RO: + */ + if (unlikely(!bch2_write_ref_tryget(c, BCH_WRITE_REF_sysfs))) + return -EROFS; + + tmp = kstrdup(buf, GFP_KERNEL); + if (!tmp) { + ret = -ENOMEM; + goto err; + } + + ret = bch2_opt_parse(c, opt, strim(tmp), &v, NULL); + kfree(tmp); + + if (ret < 0) + goto err; + + ret = bch2_opt_check_may_set(c, id, v); + if (ret < 0) + goto err; + + bch2_opt_set_sb(c, opt, v); + bch2_opt_set_by_id(&c->opts, id, v); + + if ((id == Opt_background_target || + id == Opt_background_compression) && v) + bch2_set_rebalance_needs_scan(c, 0); + + ret = size; +err: + bch2_write_ref_put(c, BCH_WRITE_REF_sysfs); + return ret; +} +SYSFS_OPS(bch2_fs_opts_dir); + +struct attribute *bch2_fs_opts_dir_files[] = { NULL }; + +int bch2_opts_create_sysfs_files(struct kobject *kobj) +{ + const struct bch_option *i; + int ret; + + for (i = bch2_opt_table; + i < bch2_opt_table + bch2_opts_nr; + i++) { + if (!(i->flags & OPT_FS)) + continue; + + ret = sysfs_create_file(kobj, &i->attr); + if (ret) + return ret; + } + + return 0; +} + +/* time stats */ + +SHOW(bch2_fs_time_stats) +{ + struct bch_fs *c = container_of(kobj, struct bch_fs, time_stats); + +#define x(name) \ + if (attr == &sysfs_time_stat_##name) \ + bch2_time_stats_to_text(out, &c->times[BCH_TIME_##name]); + BCH_TIME_STATS() +#undef x + + return 0; +} + +STORE(bch2_fs_time_stats) +{ + return size; +} +SYSFS_OPS(bch2_fs_time_stats); + +struct attribute *bch2_fs_time_stats_files[] = { +#define x(name) \ + &sysfs_time_stat_##name, + BCH_TIME_STATS() +#undef x + NULL +}; + +static void dev_alloc_debug_to_text(struct printbuf *out, struct bch_dev *ca) +{ + struct bch_fs *c = ca->fs; + struct bch_dev_usage stats = bch2_dev_usage_read(ca); + unsigned i, nr[BCH_DATA_NR]; + + memset(nr, 0, sizeof(nr)); + + for (i = 0; i < ARRAY_SIZE(c->open_buckets); i++) + nr[c->open_buckets[i].data_type]++; + + printbuf_tabstop_push(out, 8); + printbuf_tabstop_push(out, 16); + printbuf_tabstop_push(out, 16); + printbuf_tabstop_push(out, 16); + printbuf_tabstop_push(out, 16); + + prt_tab(out); + prt_str(out, "buckets"); + prt_tab_rjust(out); + prt_str(out, "sectors"); + prt_tab_rjust(out); + prt_str(out, "fragmented"); + prt_tab_rjust(out); + prt_newline(out); + + for (i = 0; i < BCH_DATA_NR; i++) { + prt_str(out, bch2_data_types[i]); + prt_tab(out); + prt_u64(out, stats.d[i].buckets); + prt_tab_rjust(out); + prt_u64(out, stats.d[i].sectors); + prt_tab_rjust(out); + prt_u64(out, stats.d[i].fragmented); + prt_tab_rjust(out); + prt_newline(out); + } + + prt_str(out, "ec"); + prt_tab(out); + prt_u64(out, stats.buckets_ec); + prt_tab_rjust(out); + prt_newline(out); + + prt_newline(out); + + prt_printf(out, "reserves:"); + prt_newline(out); + for (i = 0; i < BCH_WATERMARK_NR; i++) { + prt_str(out, bch2_watermarks[i]); + prt_tab(out); + prt_u64(out, bch2_dev_buckets_reserved(ca, i)); + prt_tab_rjust(out); + prt_newline(out); + } + + prt_newline(out); + + printbuf_tabstops_reset(out); + printbuf_tabstop_push(out, 24); + + prt_str(out, "freelist_wait"); + prt_tab(out); + prt_str(out, c->freelist_wait.list.first ? "waiting" : "empty"); + prt_newline(out); + + prt_str(out, "open buckets allocated"); + prt_tab(out); + prt_u64(out, OPEN_BUCKETS_COUNT - c->open_buckets_nr_free); + prt_newline(out); + + prt_str(out, "open buckets this dev"); + prt_tab(out); + prt_u64(out, ca->nr_open_buckets); + prt_newline(out); + + prt_str(out, "open buckets total"); + prt_tab(out); + prt_u64(out, OPEN_BUCKETS_COUNT); + prt_newline(out); + + prt_str(out, "open_buckets_wait"); + prt_tab(out); + prt_str(out, c->open_buckets_wait.list.first ? "waiting" : "empty"); + prt_newline(out); + + prt_str(out, "open_buckets_btree"); + prt_tab(out); + prt_u64(out, nr[BCH_DATA_btree]); + prt_newline(out); + + prt_str(out, "open_buckets_user"); + prt_tab(out); + prt_u64(out, nr[BCH_DATA_user]); + prt_newline(out); + + prt_str(out, "buckets_to_invalidate"); + prt_tab(out); + prt_u64(out, should_invalidate_buckets(ca, stats)); + prt_newline(out); + + prt_str(out, "btree reserve cache"); + prt_tab(out); + prt_u64(out, c->btree_reserve_cache_nr); + prt_newline(out); +} + +static const char * const bch2_rw[] = { + "read", + "write", + NULL +}; + +static void dev_io_done_to_text(struct printbuf *out, struct bch_dev *ca) +{ + int rw, i; + + for (rw = 0; rw < 2; rw++) { + prt_printf(out, "%s:\n", bch2_rw[rw]); + + for (i = 1; i < BCH_DATA_NR; i++) + prt_printf(out, "%-12s:%12llu\n", + bch2_data_types[i], + percpu_u64_get(&ca->io_done->sectors[rw][i]) << 9); + } +} + +SHOW(bch2_dev) +{ + struct bch_dev *ca = container_of(kobj, struct bch_dev, kobj); + struct bch_fs *c = ca->fs; + + sysfs_printf(uuid, "%pU\n", ca->uuid.b); + + sysfs_print(bucket_size, bucket_bytes(ca)); + sysfs_print(first_bucket, ca->mi.first_bucket); + sysfs_print(nbuckets, ca->mi.nbuckets); + sysfs_print(durability, ca->mi.durability); + sysfs_print(discard, ca->mi.discard); + + if (attr == &sysfs_label) { + if (ca->mi.group) + bch2_disk_path_to_text(out, c, ca->mi.group - 1); + prt_char(out, '\n'); + } + + if (attr == &sysfs_has_data) { + prt_bitflags(out, bch2_data_types, bch2_dev_has_data(c, ca)); + prt_char(out, '\n'); + } + + if (attr == &sysfs_state_rw) { + prt_string_option(out, bch2_member_states, ca->mi.state); + prt_char(out, '\n'); + } + + if (attr == &sysfs_io_done) + dev_io_done_to_text(out, ca); + + if (attr == &sysfs_io_errors) + bch2_dev_io_errors_to_text(out, ca); + + sysfs_print(io_latency_read, atomic64_read(&ca->cur_latency[READ])); + sysfs_print(io_latency_write, atomic64_read(&ca->cur_latency[WRITE])); + + if (attr == &sysfs_io_latency_stats_read) + bch2_time_stats_to_text(out, &ca->io_latency[READ]); + + if (attr == &sysfs_io_latency_stats_write) + bch2_time_stats_to_text(out, &ca->io_latency[WRITE]); + + sysfs_printf(congested, "%u%%", + clamp(atomic_read(&ca->congested), 0, CONGESTED_MAX) + * 100 / CONGESTED_MAX); + + if (attr == &sysfs_alloc_debug) + dev_alloc_debug_to_text(out, ca); + + return 0; +} + +STORE(bch2_dev) +{ + struct bch_dev *ca = container_of(kobj, struct bch_dev, kobj); + struct bch_fs *c = ca->fs; + struct bch_member *mi; + + if (attr == &sysfs_discard) { + bool v = strtoul_or_return(buf); + + mutex_lock(&c->sb_lock); + mi = bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx); + + if (v != BCH_MEMBER_DISCARD(mi)) { + SET_BCH_MEMBER_DISCARD(mi, v); + bch2_write_super(c); + } + mutex_unlock(&c->sb_lock); + } + + if (attr == &sysfs_durability) { + u64 v = strtoul_or_return(buf); + + mutex_lock(&c->sb_lock); + mi = bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx); + + if (v + 1 != BCH_MEMBER_DURABILITY(mi)) { + SET_BCH_MEMBER_DURABILITY(mi, v + 1); + bch2_write_super(c); + } + mutex_unlock(&c->sb_lock); + } + + if (attr == &sysfs_label) { + char *tmp; + int ret; + + tmp = kstrdup(buf, GFP_KERNEL); + if (!tmp) + return -ENOMEM; + + ret = bch2_dev_group_set(c, ca, strim(tmp)); + kfree(tmp); + if (ret) + return ret; + } + + if (attr == &sysfs_io_errors_reset) + bch2_dev_errors_reset(ca); + + return size; +} +SYSFS_OPS(bch2_dev); + +struct attribute *bch2_dev_files[] = { + &sysfs_uuid, + &sysfs_bucket_size, + &sysfs_first_bucket, + &sysfs_nbuckets, + &sysfs_durability, + + /* settings: */ + &sysfs_discard, + &sysfs_state_rw, + &sysfs_label, + + &sysfs_has_data, + &sysfs_io_done, + &sysfs_io_errors, + &sysfs_io_errors_reset, + + &sysfs_io_latency_read, + &sysfs_io_latency_write, + &sysfs_io_latency_stats_read, + &sysfs_io_latency_stats_write, + &sysfs_congested, + + /* debug: */ + &sysfs_alloc_debug, + NULL +}; + +#endif /* _BCACHEFS_SYSFS_H_ */ diff --git a/fs/bcachefs/sysfs.h b/fs/bcachefs/sysfs.h new file mode 100644 index 000000000000..222cd5062702 --- /dev/null +++ b/fs/bcachefs/sysfs.h @@ -0,0 +1,48 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_SYSFS_H_ +#define _BCACHEFS_SYSFS_H_ + +#include <linux/sysfs.h> + +#ifndef NO_BCACHEFS_SYSFS + +struct attribute; +struct sysfs_ops; + +extern struct attribute *bch2_fs_files[]; +extern struct attribute *bch2_fs_counters_files[]; +extern struct attribute *bch2_fs_internal_files[]; +extern struct attribute *bch2_fs_opts_dir_files[]; +extern struct attribute *bch2_fs_time_stats_files[]; +extern struct attribute *bch2_dev_files[]; + +extern const struct sysfs_ops bch2_fs_sysfs_ops; +extern const struct sysfs_ops bch2_fs_counters_sysfs_ops; +extern const struct sysfs_ops bch2_fs_internal_sysfs_ops; +extern const struct sysfs_ops bch2_fs_opts_dir_sysfs_ops; +extern const struct sysfs_ops bch2_fs_time_stats_sysfs_ops; +extern const struct sysfs_ops bch2_dev_sysfs_ops; + +int bch2_opts_create_sysfs_files(struct kobject *); + +#else + +static struct attribute *bch2_fs_files[] = {}; +static struct attribute *bch2_fs_counters_files[] = {}; +static struct attribute *bch2_fs_internal_files[] = {}; +static struct attribute *bch2_fs_opts_dir_files[] = {}; +static struct attribute *bch2_fs_time_stats_files[] = {}; +static struct attribute *bch2_dev_files[] = {}; + +static const struct sysfs_ops bch2_fs_sysfs_ops; +static const struct sysfs_ops bch2_fs_counters_sysfs_ops; +static const struct sysfs_ops bch2_fs_internal_sysfs_ops; +static const struct sysfs_ops bch2_fs_opts_dir_sysfs_ops; +static const struct sysfs_ops bch2_fs_time_stats_sysfs_ops; +static const struct sysfs_ops bch2_dev_sysfs_ops; + +static inline int bch2_opts_create_sysfs_files(struct kobject *kobj) { return 0; } + +#endif /* NO_BCACHEFS_SYSFS */ + +#endif /* _BCACHEFS_SYSFS_H_ */ diff --git a/fs/bcachefs/tests.c b/fs/bcachefs/tests.c new file mode 100644 index 000000000000..2fc9e60c754b --- /dev/null +++ b/fs/bcachefs/tests.c @@ -0,0 +1,919 @@ +// SPDX-License-Identifier: GPL-2.0 +#ifdef CONFIG_BCACHEFS_TESTS + +#include "bcachefs.h" +#include "btree_update.h" +#include "journal_reclaim.h" +#include "snapshot.h" +#include "tests.h" + +#include "linux/kthread.h" +#include "linux/random.h" + +static void delete_test_keys(struct bch_fs *c) +{ + int ret; + + ret = bch2_btree_delete_range(c, BTREE_ID_extents, + SPOS(0, 0, U32_MAX), + POS(0, U64_MAX), + 0, NULL); + BUG_ON(ret); + + ret = bch2_btree_delete_range(c, BTREE_ID_xattrs, + SPOS(0, 0, U32_MAX), + POS(0, U64_MAX), + 0, NULL); + BUG_ON(ret); +} + +/* unit tests */ + +static int test_delete(struct bch_fs *c, u64 nr) +{ + struct btree_trans *trans = bch2_trans_get(c); + struct btree_iter iter; + struct bkey_i_cookie k; + int ret; + + bkey_cookie_init(&k.k_i); + k.k.p.snapshot = U32_MAX; + + bch2_trans_iter_init(trans, &iter, BTREE_ID_xattrs, k.k.p, + BTREE_ITER_INTENT); + + ret = commit_do(trans, NULL, NULL, 0, + bch2_btree_iter_traverse(&iter) ?: + bch2_trans_update(trans, &iter, &k.k_i, 0)); + bch_err_msg(c, ret, "update error"); + if (ret) + goto err; + + pr_info("deleting once"); + ret = commit_do(trans, NULL, NULL, 0, + bch2_btree_iter_traverse(&iter) ?: + bch2_btree_delete_at(trans, &iter, 0)); + bch_err_msg(c, ret, "delete error (first)"); + if (ret) + goto err; + + pr_info("deleting twice"); + ret = commit_do(trans, NULL, NULL, 0, + bch2_btree_iter_traverse(&iter) ?: + bch2_btree_delete_at(trans, &iter, 0)); + bch_err_msg(c, ret, "delete error (second)"); + if (ret) + goto err; +err: + bch2_trans_iter_exit(trans, &iter); + bch2_trans_put(trans); + return ret; +} + +static int test_delete_written(struct bch_fs *c, u64 nr) +{ + struct btree_trans *trans = bch2_trans_get(c); + struct btree_iter iter; + struct bkey_i_cookie k; + int ret; + + bkey_cookie_init(&k.k_i); + k.k.p.snapshot = U32_MAX; + + bch2_trans_iter_init(trans, &iter, BTREE_ID_xattrs, k.k.p, + BTREE_ITER_INTENT); + + ret = commit_do(trans, NULL, NULL, 0, + bch2_btree_iter_traverse(&iter) ?: + bch2_trans_update(trans, &iter, &k.k_i, 0)); + bch_err_msg(c, ret, "update error"); + if (ret) + goto err; + + bch2_trans_unlock(trans); + bch2_journal_flush_all_pins(&c->journal); + + ret = commit_do(trans, NULL, NULL, 0, + bch2_btree_iter_traverse(&iter) ?: + bch2_btree_delete_at(trans, &iter, 0)); + bch_err_msg(c, ret, "delete error"); + if (ret) + goto err; +err: + bch2_trans_iter_exit(trans, &iter); + bch2_trans_put(trans); + return ret; +} + +static int test_iterate(struct bch_fs *c, u64 nr) +{ + struct btree_trans *trans = bch2_trans_get(c); + struct btree_iter iter = { NULL }; + struct bkey_s_c k; + u64 i; + int ret = 0; + + delete_test_keys(c); + + pr_info("inserting test keys"); + + for (i = 0; i < nr; i++) { + struct bkey_i_cookie ck; + + bkey_cookie_init(&ck.k_i); + ck.k.p.offset = i; + ck.k.p.snapshot = U32_MAX; + + ret = bch2_btree_insert(c, BTREE_ID_xattrs, &ck.k_i, NULL, 0); + bch_err_msg(c, ret, "insert error"); + if (ret) + goto err; + } + + pr_info("iterating forwards"); + + i = 0; + + ret = for_each_btree_key2_upto(trans, iter, BTREE_ID_xattrs, + SPOS(0, 0, U32_MAX), POS(0, U64_MAX), + 0, k, ({ + BUG_ON(k.k->p.offset != i++); + 0; + })); + bch_err_msg(c, ret, "error iterating forwards"); + if (ret) + goto err; + + BUG_ON(i != nr); + + pr_info("iterating backwards"); + + ret = for_each_btree_key_reverse(trans, iter, BTREE_ID_xattrs, + SPOS(0, U64_MAX, U32_MAX), 0, k, + ({ + BUG_ON(k.k->p.offset != --i); + 0; + })); + bch_err_msg(c, ret, "error iterating backwards"); + if (ret) + goto err; + + BUG_ON(i); +err: + bch2_trans_iter_exit(trans, &iter); + bch2_trans_put(trans); + return ret; +} + +static int test_iterate_extents(struct bch_fs *c, u64 nr) +{ + struct btree_trans *trans = bch2_trans_get(c); + struct btree_iter iter = { NULL }; + struct bkey_s_c k; + u64 i; + int ret = 0; + + delete_test_keys(c); + + pr_info("inserting test extents"); + + for (i = 0; i < nr; i += 8) { + struct bkey_i_cookie ck; + + bkey_cookie_init(&ck.k_i); + ck.k.p.offset = i + 8; + ck.k.p.snapshot = U32_MAX; + ck.k.size = 8; + + ret = bch2_btree_insert(c, BTREE_ID_extents, &ck.k_i, NULL, 0); + bch_err_msg(c, ret, "insert error"); + if (ret) + goto err; + } + + pr_info("iterating forwards"); + + i = 0; + + ret = for_each_btree_key2_upto(trans, iter, BTREE_ID_extents, + SPOS(0, 0, U32_MAX), POS(0, U64_MAX), + 0, k, ({ + BUG_ON(bkey_start_offset(k.k) != i); + i = k.k->p.offset; + 0; + })); + bch_err_msg(c, ret, "error iterating forwards"); + if (ret) + goto err; + + BUG_ON(i != nr); + + pr_info("iterating backwards"); + + ret = for_each_btree_key_reverse(trans, iter, BTREE_ID_extents, + SPOS(0, U64_MAX, U32_MAX), 0, k, + ({ + BUG_ON(k.k->p.offset != i); + i = bkey_start_offset(k.k); + 0; + })); + bch_err_msg(c, ret, "error iterating backwards"); + if (ret) + goto err; + + BUG_ON(i); +err: + bch2_trans_iter_exit(trans, &iter); + bch2_trans_put(trans); + return ret; +} + +static int test_iterate_slots(struct bch_fs *c, u64 nr) +{ + struct btree_trans *trans = bch2_trans_get(c); + struct btree_iter iter = { NULL }; + struct bkey_s_c k; + u64 i; + int ret = 0; + + delete_test_keys(c); + + pr_info("inserting test keys"); + + for (i = 0; i < nr; i++) { + struct bkey_i_cookie ck; + + bkey_cookie_init(&ck.k_i); + ck.k.p.offset = i * 2; + ck.k.p.snapshot = U32_MAX; + + ret = bch2_btree_insert(c, BTREE_ID_xattrs, &ck.k_i, NULL, 0); + bch_err_msg(c, ret, "insert error"); + if (ret) + goto err; + } + + pr_info("iterating forwards"); + + i = 0; + + ret = for_each_btree_key2_upto(trans, iter, BTREE_ID_xattrs, + SPOS(0, 0, U32_MAX), POS(0, U64_MAX), + 0, k, ({ + BUG_ON(k.k->p.offset != i); + i += 2; + 0; + })); + bch_err_msg(c, ret, "error iterating forwards"); + if (ret) + goto err; + + BUG_ON(i != nr * 2); + + pr_info("iterating forwards by slots"); + + i = 0; + + ret = for_each_btree_key2_upto(trans, iter, BTREE_ID_xattrs, + SPOS(0, 0, U32_MAX), POS(0, U64_MAX), + BTREE_ITER_SLOTS, k, ({ + if (i >= nr * 2) + break; + + BUG_ON(k.k->p.offset != i); + BUG_ON(bkey_deleted(k.k) != (i & 1)); + + i++; + 0; + })); + if (ret < 0) { + bch_err_msg(c, ret, "error iterating forwards by slots"); + goto err; + } + ret = 0; +err: + bch2_trans_put(trans); + return ret; +} + +static int test_iterate_slots_extents(struct bch_fs *c, u64 nr) +{ + struct btree_trans *trans = bch2_trans_get(c); + struct btree_iter iter = { NULL }; + struct bkey_s_c k; + u64 i; + int ret = 0; + + delete_test_keys(c); + + pr_info("inserting test keys"); + + for (i = 0; i < nr; i += 16) { + struct bkey_i_cookie ck; + + bkey_cookie_init(&ck.k_i); + ck.k.p.offset = i + 16; + ck.k.p.snapshot = U32_MAX; + ck.k.size = 8; + + ret = bch2_btree_insert(c, BTREE_ID_extents, &ck.k_i, NULL, 0); + bch_err_msg(c, ret, "insert error"); + if (ret) + goto err; + } + + pr_info("iterating forwards"); + + i = 0; + + ret = for_each_btree_key2_upto(trans, iter, BTREE_ID_extents, + SPOS(0, 0, U32_MAX), POS(0, U64_MAX), + 0, k, ({ + BUG_ON(bkey_start_offset(k.k) != i + 8); + BUG_ON(k.k->size != 8); + i += 16; + 0; + })); + bch_err_msg(c, ret, "error iterating forwards"); + if (ret) + goto err; + + BUG_ON(i != nr); + + pr_info("iterating forwards by slots"); + + i = 0; + + ret = for_each_btree_key2_upto(trans, iter, BTREE_ID_extents, + SPOS(0, 0, U32_MAX), POS(0, U64_MAX), + BTREE_ITER_SLOTS, k, ({ + if (i == nr) + break; + BUG_ON(bkey_deleted(k.k) != !(i % 16)); + + BUG_ON(bkey_start_offset(k.k) != i); + BUG_ON(k.k->size != 8); + i = k.k->p.offset; + 0; + })); + bch_err_msg(c, ret, "error iterating forwards by slots"); + if (ret) + goto err; + ret = 0; +err: + bch2_trans_put(trans); + return 0; +} + +/* + * XXX: we really want to make sure we've got a btree with depth > 0 for these + * tests + */ +static int test_peek_end(struct bch_fs *c, u64 nr) +{ + struct btree_trans *trans = bch2_trans_get(c); + struct btree_iter iter; + struct bkey_s_c k; + + bch2_trans_iter_init(trans, &iter, BTREE_ID_xattrs, + SPOS(0, 0, U32_MAX), 0); + + lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_upto(&iter, POS(0, U64_MAX)))); + BUG_ON(k.k); + + lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_upto(&iter, POS(0, U64_MAX)))); + BUG_ON(k.k); + + bch2_trans_iter_exit(trans, &iter); + bch2_trans_put(trans); + return 0; +} + +static int test_peek_end_extents(struct bch_fs *c, u64 nr) +{ + struct btree_trans *trans = bch2_trans_get(c); + struct btree_iter iter; + struct bkey_s_c k; + + bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, + SPOS(0, 0, U32_MAX), 0); + + lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_upto(&iter, POS(0, U64_MAX)))); + BUG_ON(k.k); + + lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_upto(&iter, POS(0, U64_MAX)))); + BUG_ON(k.k); + + bch2_trans_iter_exit(trans, &iter); + bch2_trans_put(trans); + return 0; +} + +/* extent unit tests */ + +static u64 test_version; + +static int insert_test_extent(struct bch_fs *c, + u64 start, u64 end) +{ + struct bkey_i_cookie k; + int ret; + + bkey_cookie_init(&k.k_i); + k.k_i.k.p.offset = end; + k.k_i.k.p.snapshot = U32_MAX; + k.k_i.k.size = end - start; + k.k_i.k.version.lo = test_version++; + + ret = bch2_btree_insert(c, BTREE_ID_extents, &k.k_i, NULL, 0); + bch_err_fn(c, ret); + return ret; +} + +static int __test_extent_overwrite(struct bch_fs *c, + u64 e1_start, u64 e1_end, + u64 e2_start, u64 e2_end) +{ + int ret; + + ret = insert_test_extent(c, e1_start, e1_end) ?: + insert_test_extent(c, e2_start, e2_end); + + delete_test_keys(c); + return ret; +} + +static int test_extent_overwrite_front(struct bch_fs *c, u64 nr) +{ + return __test_extent_overwrite(c, 0, 64, 0, 32) ?: + __test_extent_overwrite(c, 8, 64, 0, 32); +} + +static int test_extent_overwrite_back(struct bch_fs *c, u64 nr) +{ + return __test_extent_overwrite(c, 0, 64, 32, 64) ?: + __test_extent_overwrite(c, 0, 64, 32, 72); +} + +static int test_extent_overwrite_middle(struct bch_fs *c, u64 nr) +{ + return __test_extent_overwrite(c, 0, 64, 32, 40); +} + +static int test_extent_overwrite_all(struct bch_fs *c, u64 nr) +{ + return __test_extent_overwrite(c, 32, 64, 0, 64) ?: + __test_extent_overwrite(c, 32, 64, 0, 128) ?: + __test_extent_overwrite(c, 32, 64, 32, 64) ?: + __test_extent_overwrite(c, 32, 64, 32, 128); +} + +static int insert_test_overlapping_extent(struct bch_fs *c, u64 inum, u64 start, u32 len, u32 snapid) +{ + struct bkey_i_cookie k; + int ret; + + bkey_cookie_init(&k.k_i); + k.k_i.k.p.inode = inum; + k.k_i.k.p.offset = start + len; + k.k_i.k.p.snapshot = snapid; + k.k_i.k.size = len; + + ret = bch2_trans_do(c, NULL, NULL, 0, + bch2_btree_insert_nonextent(trans, BTREE_ID_extents, &k.k_i, + BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE)); + bch_err_fn(c, ret); + return ret; +} + +static int test_extent_create_overlapping(struct bch_fs *c, u64 inum) +{ + return insert_test_overlapping_extent(c, inum, 0, 16, U32_MAX - 2) ?: /* overwrite entire */ + insert_test_overlapping_extent(c, inum, 2, 8, U32_MAX - 2) ?: + insert_test_overlapping_extent(c, inum, 4, 4, U32_MAX) ?: + insert_test_overlapping_extent(c, inum, 32, 8, U32_MAX - 2) ?: /* overwrite front/back */ + insert_test_overlapping_extent(c, inum, 36, 8, U32_MAX) ?: + insert_test_overlapping_extent(c, inum, 60, 8, U32_MAX - 2) ?: + insert_test_overlapping_extent(c, inum, 64, 8, U32_MAX); +} + +/* snapshot unit tests */ + +/* Test skipping over keys in unrelated snapshots: */ +static int test_snapshot_filter(struct bch_fs *c, u32 snapid_lo, u32 snapid_hi) +{ + struct btree_trans *trans; + struct btree_iter iter; + struct bkey_s_c k; + struct bkey_i_cookie cookie; + int ret; + + bkey_cookie_init(&cookie.k_i); + cookie.k.p.snapshot = snapid_hi; + ret = bch2_btree_insert(c, BTREE_ID_xattrs, &cookie.k_i, NULL, 0); + if (ret) + return ret; + + trans = bch2_trans_get(c); + bch2_trans_iter_init(trans, &iter, BTREE_ID_xattrs, + SPOS(0, 0, snapid_lo), 0); + lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_upto(&iter, POS(0, U64_MAX)))); + + BUG_ON(k.k->p.snapshot != U32_MAX); + + bch2_trans_iter_exit(trans, &iter); + bch2_trans_put(trans); + return ret; +} + +static int test_snapshots(struct bch_fs *c, u64 nr) +{ + struct bkey_i_cookie cookie; + u32 snapids[2]; + u32 snapid_subvols[2] = { 1, 1 }; + int ret; + + bkey_cookie_init(&cookie.k_i); + cookie.k.p.snapshot = U32_MAX; + ret = bch2_btree_insert(c, BTREE_ID_xattrs, &cookie.k_i, NULL, 0); + if (ret) + return ret; + + ret = bch2_trans_do(c, NULL, NULL, 0, + bch2_snapshot_node_create(trans, U32_MAX, + snapids, + snapid_subvols, + 2)); + if (ret) + return ret; + + if (snapids[0] > snapids[1]) + swap(snapids[0], snapids[1]); + + ret = test_snapshot_filter(c, snapids[0], snapids[1]); + bch_err_msg(c, ret, "from test_snapshot_filter"); + return ret; +} + +/* perf tests */ + +static u64 test_rand(void) +{ + u64 v; + + get_random_bytes(&v, sizeof(v)); + return v; +} + +static int rand_insert(struct bch_fs *c, u64 nr) +{ + struct btree_trans *trans = bch2_trans_get(c); + struct bkey_i_cookie k; + int ret = 0; + u64 i; + + for (i = 0; i < nr; i++) { + bkey_cookie_init(&k.k_i); + k.k.p.offset = test_rand(); + k.k.p.snapshot = U32_MAX; + + ret = commit_do(trans, NULL, NULL, 0, + bch2_btree_insert_trans(trans, BTREE_ID_xattrs, &k.k_i, 0)); + if (ret) + break; + } + + bch2_trans_put(trans); + return ret; +} + +static int rand_insert_multi(struct bch_fs *c, u64 nr) +{ + struct btree_trans *trans = bch2_trans_get(c); + struct bkey_i_cookie k[8]; + int ret = 0; + unsigned j; + u64 i; + + for (i = 0; i < nr; i += ARRAY_SIZE(k)) { + for (j = 0; j < ARRAY_SIZE(k); j++) { + bkey_cookie_init(&k[j].k_i); + k[j].k.p.offset = test_rand(); + k[j].k.p.snapshot = U32_MAX; + } + + ret = commit_do(trans, NULL, NULL, 0, + bch2_btree_insert_trans(trans, BTREE_ID_xattrs, &k[0].k_i, 0) ?: + bch2_btree_insert_trans(trans, BTREE_ID_xattrs, &k[1].k_i, 0) ?: + bch2_btree_insert_trans(trans, BTREE_ID_xattrs, &k[2].k_i, 0) ?: + bch2_btree_insert_trans(trans, BTREE_ID_xattrs, &k[3].k_i, 0) ?: + bch2_btree_insert_trans(trans, BTREE_ID_xattrs, &k[4].k_i, 0) ?: + bch2_btree_insert_trans(trans, BTREE_ID_xattrs, &k[5].k_i, 0) ?: + bch2_btree_insert_trans(trans, BTREE_ID_xattrs, &k[6].k_i, 0) ?: + bch2_btree_insert_trans(trans, BTREE_ID_xattrs, &k[7].k_i, 0)); + if (ret) + break; + } + + bch2_trans_put(trans); + return ret; +} + +static int rand_lookup(struct bch_fs *c, u64 nr) +{ + struct btree_trans *trans = bch2_trans_get(c); + struct btree_iter iter; + struct bkey_s_c k; + int ret = 0; + u64 i; + + bch2_trans_iter_init(trans, &iter, BTREE_ID_xattrs, + SPOS(0, 0, U32_MAX), 0); + + for (i = 0; i < nr; i++) { + bch2_btree_iter_set_pos(&iter, SPOS(0, test_rand(), U32_MAX)); + + lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek(&iter))); + ret = bkey_err(k); + if (ret) + break; + } + + bch2_trans_iter_exit(trans, &iter); + bch2_trans_put(trans); + return ret; +} + +static int rand_mixed_trans(struct btree_trans *trans, + struct btree_iter *iter, + struct bkey_i_cookie *cookie, + u64 i, u64 pos) +{ + struct bkey_s_c k; + int ret; + + bch2_btree_iter_set_pos(iter, SPOS(0, pos, U32_MAX)); + + k = bch2_btree_iter_peek(iter); + ret = bkey_err(k); + bch_err_msg(trans->c, ret, "lookup error"); + if (ret) + return ret; + + if (!(i & 3) && k.k) { + bkey_cookie_init(&cookie->k_i); + cookie->k.p = iter->pos; + ret = bch2_trans_update(trans, iter, &cookie->k_i, 0); + } + + return ret; +} + +static int rand_mixed(struct bch_fs *c, u64 nr) +{ + struct btree_trans *trans = bch2_trans_get(c); + struct btree_iter iter; + struct bkey_i_cookie cookie; + int ret = 0; + u64 i, rand; + + bch2_trans_iter_init(trans, &iter, BTREE_ID_xattrs, + SPOS(0, 0, U32_MAX), 0); + + for (i = 0; i < nr; i++) { + rand = test_rand(); + ret = commit_do(trans, NULL, NULL, 0, + rand_mixed_trans(trans, &iter, &cookie, i, rand)); + if (ret) + break; + } + + bch2_trans_iter_exit(trans, &iter); + bch2_trans_put(trans); + return ret; +} + +static int __do_delete(struct btree_trans *trans, struct bpos pos) +{ + struct btree_iter iter; + struct bkey_s_c k; + int ret = 0; + + bch2_trans_iter_init(trans, &iter, BTREE_ID_xattrs, pos, + BTREE_ITER_INTENT); + k = bch2_btree_iter_peek(&iter); + ret = bkey_err(k); + if (ret) + goto err; + + if (!k.k) + goto err; + + ret = bch2_btree_delete_at(trans, &iter, 0); +err: + bch2_trans_iter_exit(trans, &iter); + return ret; +} + +static int rand_delete(struct bch_fs *c, u64 nr) +{ + struct btree_trans *trans = bch2_trans_get(c); + int ret = 0; + u64 i; + + for (i = 0; i < nr; i++) { + struct bpos pos = SPOS(0, test_rand(), U32_MAX); + + ret = commit_do(trans, NULL, NULL, 0, + __do_delete(trans, pos)); + if (ret) + break; + } + + bch2_trans_put(trans); + return ret; +} + +static int seq_insert(struct bch_fs *c, u64 nr) +{ + struct btree_iter iter; + struct bkey_s_c k; + struct bkey_i_cookie insert; + + bkey_cookie_init(&insert.k_i); + + return bch2_trans_run(c, + for_each_btree_key_commit(trans, iter, BTREE_ID_xattrs, + SPOS(0, 0, U32_MAX), + BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, + NULL, NULL, 0, ({ + if (iter.pos.offset >= nr) + break; + insert.k.p = iter.pos; + bch2_trans_update(trans, &iter, &insert.k_i, 0); + }))); +} + +static int seq_lookup(struct bch_fs *c, u64 nr) +{ + struct btree_iter iter; + struct bkey_s_c k; + + return bch2_trans_run(c, + for_each_btree_key2_upto(trans, iter, BTREE_ID_xattrs, + SPOS(0, 0, U32_MAX), POS(0, U64_MAX), + 0, k, + 0)); +} + +static int seq_overwrite(struct bch_fs *c, u64 nr) +{ + struct btree_iter iter; + struct bkey_s_c k; + + return bch2_trans_run(c, + for_each_btree_key_commit(trans, iter, BTREE_ID_xattrs, + SPOS(0, 0, U32_MAX), + BTREE_ITER_INTENT, k, + NULL, NULL, 0, ({ + struct bkey_i_cookie u; + + bkey_reassemble(&u.k_i, k); + bch2_trans_update(trans, &iter, &u.k_i, 0); + }))); +} + +static int seq_delete(struct bch_fs *c, u64 nr) +{ + return bch2_btree_delete_range(c, BTREE_ID_xattrs, + SPOS(0, 0, U32_MAX), + POS(0, U64_MAX), + 0, NULL); +} + +typedef int (*perf_test_fn)(struct bch_fs *, u64); + +struct test_job { + struct bch_fs *c; + u64 nr; + unsigned nr_threads; + perf_test_fn fn; + + atomic_t ready; + wait_queue_head_t ready_wait; + + atomic_t done; + struct completion done_completion; + + u64 start; + u64 finish; + int ret; +}; + +static int btree_perf_test_thread(void *data) +{ + struct test_job *j = data; + int ret; + + if (atomic_dec_and_test(&j->ready)) { + wake_up(&j->ready_wait); + j->start = sched_clock(); + } else { + wait_event(j->ready_wait, !atomic_read(&j->ready)); + } + + ret = j->fn(j->c, div64_u64(j->nr, j->nr_threads)); + if (ret) { + bch_err(j->c, "%ps: error %s", j->fn, bch2_err_str(ret)); + j->ret = ret; + } + + if (atomic_dec_and_test(&j->done)) { + j->finish = sched_clock(); + complete(&j->done_completion); + } + + return 0; +} + +int bch2_btree_perf_test(struct bch_fs *c, const char *testname, + u64 nr, unsigned nr_threads) +{ + struct test_job j = { .c = c, .nr = nr, .nr_threads = nr_threads }; + char name_buf[20]; + struct printbuf nr_buf = PRINTBUF; + struct printbuf per_sec_buf = PRINTBUF; + unsigned i; + u64 time; + + atomic_set(&j.ready, nr_threads); + init_waitqueue_head(&j.ready_wait); + + atomic_set(&j.done, nr_threads); + init_completion(&j.done_completion); + +#define perf_test(_test) \ + if (!strcmp(testname, #_test)) j.fn = _test + + perf_test(rand_insert); + perf_test(rand_insert_multi); + perf_test(rand_lookup); + perf_test(rand_mixed); + perf_test(rand_delete); + + perf_test(seq_insert); + perf_test(seq_lookup); + perf_test(seq_overwrite); + perf_test(seq_delete); + + /* a unit test, not a perf test: */ + perf_test(test_delete); + perf_test(test_delete_written); + perf_test(test_iterate); + perf_test(test_iterate_extents); + perf_test(test_iterate_slots); + perf_test(test_iterate_slots_extents); + perf_test(test_peek_end); + perf_test(test_peek_end_extents); + + perf_test(test_extent_overwrite_front); + perf_test(test_extent_overwrite_back); + perf_test(test_extent_overwrite_middle); + perf_test(test_extent_overwrite_all); + perf_test(test_extent_create_overlapping); + + perf_test(test_snapshots); + + if (!j.fn) { + pr_err("unknown test %s", testname); + return -EINVAL; + } + + //pr_info("running test %s:", testname); + + if (nr_threads == 1) + btree_perf_test_thread(&j); + else + for (i = 0; i < nr_threads; i++) + kthread_run(btree_perf_test_thread, &j, + "bcachefs perf test[%u]", i); + + while (wait_for_completion_interruptible(&j.done_completion)) + ; + + time = j.finish - j.start; + + scnprintf(name_buf, sizeof(name_buf), "%s:", testname); + prt_human_readable_u64(&nr_buf, nr); + prt_human_readable_u64(&per_sec_buf, div64_u64(nr * NSEC_PER_SEC, time)); + printk(KERN_INFO "%-12s %s with %u threads in %5llu sec, %5llu nsec per iter, %5s per sec\n", + name_buf, nr_buf.buf, nr_threads, + div_u64(time, NSEC_PER_SEC), + div_u64(time * nr_threads, nr), + per_sec_buf.buf); + printbuf_exit(&per_sec_buf); + printbuf_exit(&nr_buf); + return j.ret; +} + +#endif /* CONFIG_BCACHEFS_TESTS */ diff --git a/fs/bcachefs/tests.h b/fs/bcachefs/tests.h new file mode 100644 index 000000000000..c73b18aea7e0 --- /dev/null +++ b/fs/bcachefs/tests.h @@ -0,0 +1,15 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_TEST_H +#define _BCACHEFS_TEST_H + +struct bch_fs; + +#ifdef CONFIG_BCACHEFS_TESTS + +int bch2_btree_perf_test(struct bch_fs *, const char *, u64, unsigned); + +#else + +#endif /* CONFIG_BCACHEFS_TESTS */ + +#endif /* _BCACHEFS_TEST_H */ diff --git a/fs/bcachefs/trace.c b/fs/bcachefs/trace.c new file mode 100644 index 000000000000..dc48b52b01b4 --- /dev/null +++ b/fs/bcachefs/trace.c @@ -0,0 +1,17 @@ +// SPDX-License-Identifier: GPL-2.0 +#include "bcachefs.h" +#include "alloc_types.h" +#include "buckets.h" +#include "btree_cache.h" +#include "btree_iter.h" +#include "btree_locking.h" +#include "btree_update_interior.h" +#include "keylist.h" +#include "move_types.h" +#include "opts.h" +#include "six.h" + +#include <linux/blktrace_api.h> + +#define CREATE_TRACE_POINTS +#include "trace.h" diff --git a/fs/bcachefs/trace.h b/fs/bcachefs/trace.h new file mode 100644 index 000000000000..fd49b63562c3 --- /dev/null +++ b/fs/bcachefs/trace.h @@ -0,0 +1,1327 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM bcachefs + +#if !defined(_TRACE_BCACHEFS_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_BCACHEFS_H + +#include <linux/tracepoint.h> + +#define TRACE_BPOS_entries(name) \ + __field(u64, name##_inode ) \ + __field(u64, name##_offset ) \ + __field(u32, name##_snapshot ) + +#define TRACE_BPOS_assign(dst, src) \ + __entry->dst##_inode = (src).inode; \ + __entry->dst##_offset = (src).offset; \ + __entry->dst##_snapshot = (src).snapshot + +DECLARE_EVENT_CLASS(bpos, + TP_PROTO(const struct bpos *p), + TP_ARGS(p), + + TP_STRUCT__entry( + TRACE_BPOS_entries(p) + ), + + TP_fast_assign( + TRACE_BPOS_assign(p, *p); + ), + + TP_printk("%llu:%llu:%u", __entry->p_inode, __entry->p_offset, __entry->p_snapshot) +); + +DECLARE_EVENT_CLASS(bkey, + TP_PROTO(struct bch_fs *c, const char *k), + TP_ARGS(c, k), + + TP_STRUCT__entry( + __string(k, k ) + ), + + TP_fast_assign( + __assign_str(k, k); + ), + + TP_printk("%s", __get_str(k)) +); + +DECLARE_EVENT_CLASS(btree_node, + TP_PROTO(struct bch_fs *c, struct btree *b), + TP_ARGS(c, b), + + TP_STRUCT__entry( + __field(dev_t, dev ) + __field(u8, level ) + __field(u8, btree_id ) + TRACE_BPOS_entries(pos) + ), + + TP_fast_assign( + __entry->dev = c->dev; + __entry->level = b->c.level; + __entry->btree_id = b->c.btree_id; + TRACE_BPOS_assign(pos, b->key.k.p); + ), + + TP_printk("%d,%d %u %s %llu:%llu:%u", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->level, + bch2_btree_id_str(__entry->btree_id), + __entry->pos_inode, __entry->pos_offset, __entry->pos_snapshot) +); + +DECLARE_EVENT_CLASS(bch_fs, + TP_PROTO(struct bch_fs *c), + TP_ARGS(c), + + TP_STRUCT__entry( + __field(dev_t, dev ) + ), + + TP_fast_assign( + __entry->dev = c->dev; + ), + + TP_printk("%d,%d", MAJOR(__entry->dev), MINOR(__entry->dev)) +); + +DECLARE_EVENT_CLASS(bio, + TP_PROTO(struct bio *bio), + TP_ARGS(bio), + + TP_STRUCT__entry( + __field(dev_t, dev ) + __field(sector_t, sector ) + __field(unsigned int, nr_sector ) + __array(char, rwbs, 6 ) + ), + + TP_fast_assign( + __entry->dev = bio->bi_bdev ? bio_dev(bio) : 0; + __entry->sector = bio->bi_iter.bi_sector; + __entry->nr_sector = bio->bi_iter.bi_size >> 9; + blk_fill_rwbs(__entry->rwbs, bio->bi_opf); + ), + + TP_printk("%d,%d %s %llu + %u", + MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs, + (unsigned long long)__entry->sector, __entry->nr_sector) +); + +/* super-io.c: */ +TRACE_EVENT(write_super, + TP_PROTO(struct bch_fs *c, unsigned long ip), + TP_ARGS(c, ip), + + TP_STRUCT__entry( + __field(dev_t, dev ) + __field(unsigned long, ip ) + ), + + TP_fast_assign( + __entry->dev = c->dev; + __entry->ip = ip; + ), + + TP_printk("%d,%d for %pS", + MAJOR(__entry->dev), MINOR(__entry->dev), + (void *) __entry->ip) +); + +/* io.c: */ + +DEFINE_EVENT(bio, read_promote, + TP_PROTO(struct bio *bio), + TP_ARGS(bio) +); + +TRACE_EVENT(read_nopromote, + TP_PROTO(struct bch_fs *c, int ret), + TP_ARGS(c, ret), + + TP_STRUCT__entry( + __field(dev_t, dev ) + __array(char, ret, 32 ) + ), + + TP_fast_assign( + __entry->dev = c->dev; + strscpy(__entry->ret, bch2_err_str(ret), sizeof(__entry->ret)); + ), + + TP_printk("%d,%d ret %s", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ret) +); + +DEFINE_EVENT(bio, read_bounce, + TP_PROTO(struct bio *bio), + TP_ARGS(bio) +); + +DEFINE_EVENT(bio, read_split, + TP_PROTO(struct bio *bio), + TP_ARGS(bio) +); + +DEFINE_EVENT(bio, read_retry, + TP_PROTO(struct bio *bio), + TP_ARGS(bio) +); + +DEFINE_EVENT(bio, read_reuse_race, + TP_PROTO(struct bio *bio), + TP_ARGS(bio) +); + +/* Journal */ + +DEFINE_EVENT(bch_fs, journal_full, + TP_PROTO(struct bch_fs *c), + TP_ARGS(c) +); + +DEFINE_EVENT(bch_fs, journal_entry_full, + TP_PROTO(struct bch_fs *c), + TP_ARGS(c) +); + +DEFINE_EVENT(bio, journal_write, + TP_PROTO(struct bio *bio), + TP_ARGS(bio) +); + +TRACE_EVENT(journal_reclaim_start, + TP_PROTO(struct bch_fs *c, bool direct, bool kicked, + u64 min_nr, u64 min_key_cache, + u64 btree_cache_dirty, u64 btree_cache_total, + u64 btree_key_cache_dirty, u64 btree_key_cache_total), + TP_ARGS(c, direct, kicked, min_nr, min_key_cache, + btree_cache_dirty, btree_cache_total, + btree_key_cache_dirty, btree_key_cache_total), + + TP_STRUCT__entry( + __field(dev_t, dev ) + __field(bool, direct ) + __field(bool, kicked ) + __field(u64, min_nr ) + __field(u64, min_key_cache ) + __field(u64, btree_cache_dirty ) + __field(u64, btree_cache_total ) + __field(u64, btree_key_cache_dirty ) + __field(u64, btree_key_cache_total ) + ), + + TP_fast_assign( + __entry->dev = c->dev; + __entry->direct = direct; + __entry->kicked = kicked; + __entry->min_nr = min_nr; + __entry->min_key_cache = min_key_cache; + __entry->btree_cache_dirty = btree_cache_dirty; + __entry->btree_cache_total = btree_cache_total; + __entry->btree_key_cache_dirty = btree_key_cache_dirty; + __entry->btree_key_cache_total = btree_key_cache_total; + ), + + TP_printk("%d,%d direct %u kicked %u min %llu key cache %llu btree cache %llu/%llu key cache %llu/%llu", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->direct, + __entry->kicked, + __entry->min_nr, + __entry->min_key_cache, + __entry->btree_cache_dirty, + __entry->btree_cache_total, + __entry->btree_key_cache_dirty, + __entry->btree_key_cache_total) +); + +TRACE_EVENT(journal_reclaim_finish, + TP_PROTO(struct bch_fs *c, u64 nr_flushed), + TP_ARGS(c, nr_flushed), + + TP_STRUCT__entry( + __field(dev_t, dev ) + __field(u64, nr_flushed ) + ), + + TP_fast_assign( + __entry->dev = c->dev; + __entry->nr_flushed = nr_flushed; + ), + + TP_printk("%d,%d flushed %llu", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->nr_flushed) +); + +/* bset.c: */ + +DEFINE_EVENT(bpos, bkey_pack_pos_fail, + TP_PROTO(const struct bpos *p), + TP_ARGS(p) +); + +/* Btree cache: */ + +TRACE_EVENT(btree_cache_scan, + TP_PROTO(long nr_to_scan, long can_free, long ret), + TP_ARGS(nr_to_scan, can_free, ret), + + TP_STRUCT__entry( + __field(long, nr_to_scan ) + __field(long, can_free ) + __field(long, ret ) + ), + + TP_fast_assign( + __entry->nr_to_scan = nr_to_scan; + __entry->can_free = can_free; + __entry->ret = ret; + ), + + TP_printk("scanned for %li nodes, can free %li, ret %li", + __entry->nr_to_scan, __entry->can_free, __entry->ret) +); + +DEFINE_EVENT(btree_node, btree_cache_reap, + TP_PROTO(struct bch_fs *c, struct btree *b), + TP_ARGS(c, b) +); + +DEFINE_EVENT(bch_fs, btree_cache_cannibalize_lock_fail, + TP_PROTO(struct bch_fs *c), + TP_ARGS(c) +); + +DEFINE_EVENT(bch_fs, btree_cache_cannibalize_lock, + TP_PROTO(struct bch_fs *c), + TP_ARGS(c) +); + +DEFINE_EVENT(bch_fs, btree_cache_cannibalize, + TP_PROTO(struct bch_fs *c), + TP_ARGS(c) +); + +DEFINE_EVENT(bch_fs, btree_cache_cannibalize_unlock, + TP_PROTO(struct bch_fs *c), + TP_ARGS(c) +); + +/* Btree */ + +DEFINE_EVENT(btree_node, btree_node_read, + TP_PROTO(struct bch_fs *c, struct btree *b), + TP_ARGS(c, b) +); + +TRACE_EVENT(btree_node_write, + TP_PROTO(struct btree *b, unsigned bytes, unsigned sectors), + TP_ARGS(b, bytes, sectors), + + TP_STRUCT__entry( + __field(enum btree_node_type, type) + __field(unsigned, bytes ) + __field(unsigned, sectors ) + ), + + TP_fast_assign( + __entry->type = btree_node_type(b); + __entry->bytes = bytes; + __entry->sectors = sectors; + ), + + TP_printk("bkey type %u bytes %u sectors %u", + __entry->type , __entry->bytes, __entry->sectors) +); + +DEFINE_EVENT(btree_node, btree_node_alloc, + TP_PROTO(struct bch_fs *c, struct btree *b), + TP_ARGS(c, b) +); + +DEFINE_EVENT(btree_node, btree_node_free, + TP_PROTO(struct bch_fs *c, struct btree *b), + TP_ARGS(c, b) +); + +TRACE_EVENT(btree_reserve_get_fail, + TP_PROTO(const char *trans_fn, + unsigned long caller_ip, + size_t required, + int ret), + TP_ARGS(trans_fn, caller_ip, required, ret), + + TP_STRUCT__entry( + __array(char, trans_fn, 32 ) + __field(unsigned long, caller_ip ) + __field(size_t, required ) + __array(char, ret, 32 ) + ), + + TP_fast_assign( + strscpy(__entry->trans_fn, trans_fn, sizeof(__entry->trans_fn)); + __entry->caller_ip = caller_ip; + __entry->required = required; + strscpy(__entry->ret, bch2_err_str(ret), sizeof(__entry->ret)); + ), + + TP_printk("%s %pS required %zu ret %s", + __entry->trans_fn, + (void *) __entry->caller_ip, + __entry->required, + __entry->ret) +); + +DEFINE_EVENT(btree_node, btree_node_compact, + TP_PROTO(struct bch_fs *c, struct btree *b), + TP_ARGS(c, b) +); + +DEFINE_EVENT(btree_node, btree_node_merge, + TP_PROTO(struct bch_fs *c, struct btree *b), + TP_ARGS(c, b) +); + +DEFINE_EVENT(btree_node, btree_node_split, + TP_PROTO(struct bch_fs *c, struct btree *b), + TP_ARGS(c, b) +); + +DEFINE_EVENT(btree_node, btree_node_rewrite, + TP_PROTO(struct bch_fs *c, struct btree *b), + TP_ARGS(c, b) +); + +DEFINE_EVENT(btree_node, btree_node_set_root, + TP_PROTO(struct bch_fs *c, struct btree *b), + TP_ARGS(c, b) +); + +TRACE_EVENT(btree_path_relock_fail, + TP_PROTO(struct btree_trans *trans, + unsigned long caller_ip, + struct btree_path *path, + unsigned level), + TP_ARGS(trans, caller_ip, path, level), + + TP_STRUCT__entry( + __array(char, trans_fn, 32 ) + __field(unsigned long, caller_ip ) + __field(u8, btree_id ) + __field(u8, level ) + TRACE_BPOS_entries(pos) + __array(char, node, 24 ) + __field(u8, self_read_count ) + __field(u8, self_intent_count) + __field(u8, read_count ) + __field(u8, intent_count ) + __field(u32, iter_lock_seq ) + __field(u32, node_lock_seq ) + ), + + TP_fast_assign( + struct btree *b = btree_path_node(path, level); + struct six_lock_count c; + + strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn)); + __entry->caller_ip = caller_ip; + __entry->btree_id = path->btree_id; + __entry->level = path->level; + TRACE_BPOS_assign(pos, path->pos); + + c = bch2_btree_node_lock_counts(trans, NULL, &path->l[level].b->c, level), + __entry->self_read_count = c.n[SIX_LOCK_read]; + __entry->self_intent_count = c.n[SIX_LOCK_intent]; + + if (IS_ERR(b)) { + strscpy(__entry->node, bch2_err_str(PTR_ERR(b)), sizeof(__entry->node)); + } else { + c = six_lock_counts(&path->l[level].b->c.lock); + __entry->read_count = c.n[SIX_LOCK_read]; + __entry->intent_count = c.n[SIX_LOCK_intent]; + scnprintf(__entry->node, sizeof(__entry->node), "%px", b); + } + __entry->iter_lock_seq = path->l[level].lock_seq; + __entry->node_lock_seq = is_btree_node(path, level) + ? six_lock_seq(&path->l[level].b->c.lock) + : 0; + ), + + TP_printk("%s %pS btree %s pos %llu:%llu:%u level %u node %s held %u:%u lock count %u:%u iter seq %u lock seq %u", + __entry->trans_fn, + (void *) __entry->caller_ip, + bch2_btree_id_str(__entry->btree_id), + __entry->pos_inode, + __entry->pos_offset, + __entry->pos_snapshot, + __entry->level, + __entry->node, + __entry->self_read_count, + __entry->self_intent_count, + __entry->read_count, + __entry->intent_count, + __entry->iter_lock_seq, + __entry->node_lock_seq) +); + +TRACE_EVENT(btree_path_upgrade_fail, + TP_PROTO(struct btree_trans *trans, + unsigned long caller_ip, + struct btree_path *path, + unsigned level), + TP_ARGS(trans, caller_ip, path, level), + + TP_STRUCT__entry( + __array(char, trans_fn, 32 ) + __field(unsigned long, caller_ip ) + __field(u8, btree_id ) + __field(u8, level ) + TRACE_BPOS_entries(pos) + __field(u8, locked ) + __field(u8, self_read_count ) + __field(u8, self_intent_count) + __field(u8, read_count ) + __field(u8, intent_count ) + __field(u32, iter_lock_seq ) + __field(u32, node_lock_seq ) + ), + + TP_fast_assign( + struct six_lock_count c; + + strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn)); + __entry->caller_ip = caller_ip; + __entry->btree_id = path->btree_id; + __entry->level = level; + TRACE_BPOS_assign(pos, path->pos); + __entry->locked = btree_node_locked(path, level); + + c = bch2_btree_node_lock_counts(trans, NULL, &path->l[level].b->c, level), + __entry->self_read_count = c.n[SIX_LOCK_read]; + __entry->self_intent_count = c.n[SIX_LOCK_intent]; + c = six_lock_counts(&path->l[level].b->c.lock); + __entry->read_count = c.n[SIX_LOCK_read]; + __entry->intent_count = c.n[SIX_LOCK_intent]; + __entry->iter_lock_seq = path->l[level].lock_seq; + __entry->node_lock_seq = is_btree_node(path, level) + ? six_lock_seq(&path->l[level].b->c.lock) + : 0; + ), + + TP_printk("%s %pS btree %s pos %llu:%llu:%u level %u locked %u held %u:%u lock count %u:%u iter seq %u lock seq %u", + __entry->trans_fn, + (void *) __entry->caller_ip, + bch2_btree_id_str(__entry->btree_id), + __entry->pos_inode, + __entry->pos_offset, + __entry->pos_snapshot, + __entry->level, + __entry->locked, + __entry->self_read_count, + __entry->self_intent_count, + __entry->read_count, + __entry->intent_count, + __entry->iter_lock_seq, + __entry->node_lock_seq) +); + +/* Garbage collection */ + +DEFINE_EVENT(bch_fs, gc_gens_start, + TP_PROTO(struct bch_fs *c), + TP_ARGS(c) +); + +DEFINE_EVENT(bch_fs, gc_gens_end, + TP_PROTO(struct bch_fs *c), + TP_ARGS(c) +); + +/* Allocator */ + +DECLARE_EVENT_CLASS(bucket_alloc, + TP_PROTO(struct bch_dev *ca, const char *alloc_reserve, + u64 bucket, + u64 free, + u64 avail, + u64 copygc_wait_amount, + s64 copygc_waiting_for, + struct bucket_alloc_state *s, + bool nonblocking, + const char *err), + TP_ARGS(ca, alloc_reserve, bucket, free, avail, + copygc_wait_amount, copygc_waiting_for, + s, nonblocking, err), + + TP_STRUCT__entry( + __field(u8, dev ) + __array(char, reserve, 16 ) + __field(u64, bucket ) + __field(u64, free ) + __field(u64, avail ) + __field(u64, copygc_wait_amount ) + __field(s64, copygc_waiting_for ) + __field(u64, seen ) + __field(u64, open ) + __field(u64, need_journal_commit ) + __field(u64, nouse ) + __field(bool, nonblocking ) + __field(u64, nocow ) + __array(char, err, 32 ) + ), + + TP_fast_assign( + __entry->dev = ca->dev_idx; + strscpy(__entry->reserve, alloc_reserve, sizeof(__entry->reserve)); + __entry->bucket = bucket; + __entry->free = free; + __entry->avail = avail; + __entry->copygc_wait_amount = copygc_wait_amount; + __entry->copygc_waiting_for = copygc_waiting_for; + __entry->seen = s->buckets_seen; + __entry->open = s->skipped_open; + __entry->need_journal_commit = s->skipped_need_journal_commit; + __entry->nouse = s->skipped_nouse; + __entry->nonblocking = nonblocking; + __entry->nocow = s->skipped_nocow; + strscpy(__entry->err, err, sizeof(__entry->err)); + ), + + TP_printk("reserve %s bucket %u:%llu free %llu avail %llu copygc_wait %llu/%lli seen %llu open %llu need_journal_commit %llu nouse %llu nocow %llu nonblocking %u err %s", + __entry->reserve, + __entry->dev, + __entry->bucket, + __entry->free, + __entry->avail, + __entry->copygc_wait_amount, + __entry->copygc_waiting_for, + __entry->seen, + __entry->open, + __entry->need_journal_commit, + __entry->nouse, + __entry->nocow, + __entry->nonblocking, + __entry->err) +); + +DEFINE_EVENT(bucket_alloc, bucket_alloc, + TP_PROTO(struct bch_dev *ca, const char *alloc_reserve, + u64 bucket, + u64 free, + u64 avail, + u64 copygc_wait_amount, + s64 copygc_waiting_for, + struct bucket_alloc_state *s, + bool nonblocking, + const char *err), + TP_ARGS(ca, alloc_reserve, bucket, free, avail, + copygc_wait_amount, copygc_waiting_for, + s, nonblocking, err) +); + +DEFINE_EVENT(bucket_alloc, bucket_alloc_fail, + TP_PROTO(struct bch_dev *ca, const char *alloc_reserve, + u64 bucket, + u64 free, + u64 avail, + u64 copygc_wait_amount, + s64 copygc_waiting_for, + struct bucket_alloc_state *s, + bool nonblocking, + const char *err), + TP_ARGS(ca, alloc_reserve, bucket, free, avail, + copygc_wait_amount, copygc_waiting_for, + s, nonblocking, err) +); + +TRACE_EVENT(discard_buckets, + TP_PROTO(struct bch_fs *c, u64 seen, u64 open, + u64 need_journal_commit, u64 discarded, const char *err), + TP_ARGS(c, seen, open, need_journal_commit, discarded, err), + + TP_STRUCT__entry( + __field(dev_t, dev ) + __field(u64, seen ) + __field(u64, open ) + __field(u64, need_journal_commit ) + __field(u64, discarded ) + __array(char, err, 16 ) + ), + + TP_fast_assign( + __entry->dev = c->dev; + __entry->seen = seen; + __entry->open = open; + __entry->need_journal_commit = need_journal_commit; + __entry->discarded = discarded; + strscpy(__entry->err, err, sizeof(__entry->err)); + ), + + TP_printk("%d%d seen %llu open %llu need_journal_commit %llu discarded %llu err %s", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->seen, + __entry->open, + __entry->need_journal_commit, + __entry->discarded, + __entry->err) +); + +TRACE_EVENT(bucket_invalidate, + TP_PROTO(struct bch_fs *c, unsigned dev, u64 bucket, u32 sectors), + TP_ARGS(c, dev, bucket, sectors), + + TP_STRUCT__entry( + __field(dev_t, dev ) + __field(u32, dev_idx ) + __field(u32, sectors ) + __field(u64, bucket ) + ), + + TP_fast_assign( + __entry->dev = c->dev; + __entry->dev_idx = dev; + __entry->sectors = sectors; + __entry->bucket = bucket; + ), + + TP_printk("%d:%d invalidated %u:%llu cached sectors %u", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->dev_idx, __entry->bucket, + __entry->sectors) +); + +/* Moving IO */ + +TRACE_EVENT(bucket_evacuate, + TP_PROTO(struct bch_fs *c, struct bpos *bucket), + TP_ARGS(c, bucket), + + TP_STRUCT__entry( + __field(dev_t, dev ) + __field(u32, dev_idx ) + __field(u64, bucket ) + ), + + TP_fast_assign( + __entry->dev = c->dev; + __entry->dev_idx = bucket->inode; + __entry->bucket = bucket->offset; + ), + + TP_printk("%d:%d %u:%llu", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->dev_idx, __entry->bucket) +); + +DEFINE_EVENT(bkey, move_extent, + TP_PROTO(struct bch_fs *c, const char *k), + TP_ARGS(c, k) +); + +DEFINE_EVENT(bkey, move_extent_read, + TP_PROTO(struct bch_fs *c, const char *k), + TP_ARGS(c, k) +); + +DEFINE_EVENT(bkey, move_extent_write, + TP_PROTO(struct bch_fs *c, const char *k), + TP_ARGS(c, k) +); + +DEFINE_EVENT(bkey, move_extent_finish, + TP_PROTO(struct bch_fs *c, const char *k), + TP_ARGS(c, k) +); + +TRACE_EVENT(move_extent_fail, + TP_PROTO(struct bch_fs *c, const char *msg), + TP_ARGS(c, msg), + + TP_STRUCT__entry( + __field(dev_t, dev ) + __string(msg, msg ) + ), + + TP_fast_assign( + __entry->dev = c->dev; + __assign_str(msg, msg); + ), + + TP_printk("%d:%d %s", MAJOR(__entry->dev), MINOR(__entry->dev), __get_str(msg)) +); + +DEFINE_EVENT(bkey, move_extent_start_fail, + TP_PROTO(struct bch_fs *c, const char *str), + TP_ARGS(c, str) +); + +TRACE_EVENT(move_data, + TP_PROTO(struct bch_fs *c, + struct bch_move_stats *stats), + TP_ARGS(c, stats), + + TP_STRUCT__entry( + __field(dev_t, dev ) + __field(u64, keys_moved ) + __field(u64, keys_raced ) + __field(u64, sectors_seen ) + __field(u64, sectors_moved ) + __field(u64, sectors_raced ) + ), + + TP_fast_assign( + __entry->dev = c->dev; + __entry->keys_moved = atomic64_read(&stats->keys_moved); + __entry->keys_raced = atomic64_read(&stats->keys_raced); + __entry->sectors_seen = atomic64_read(&stats->sectors_seen); + __entry->sectors_moved = atomic64_read(&stats->sectors_moved); + __entry->sectors_raced = atomic64_read(&stats->sectors_raced); + ), + + TP_printk("%d,%d keys moved %llu raced %llu" + "sectors seen %llu moved %llu raced %llu", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->keys_moved, + __entry->keys_raced, + __entry->sectors_seen, + __entry->sectors_moved, + __entry->sectors_raced) +); + +TRACE_EVENT(evacuate_bucket, + TP_PROTO(struct bch_fs *c, struct bpos *bucket, + unsigned sectors, unsigned bucket_size, + u64 fragmentation, int ret), + TP_ARGS(c, bucket, sectors, bucket_size, fragmentation, ret), + + TP_STRUCT__entry( + __field(dev_t, dev ) + __field(u64, member ) + __field(u64, bucket ) + __field(u32, sectors ) + __field(u32, bucket_size ) + __field(u64, fragmentation ) + __field(int, ret ) + ), + + TP_fast_assign( + __entry->dev = c->dev; + __entry->member = bucket->inode; + __entry->bucket = bucket->offset; + __entry->sectors = sectors; + __entry->bucket_size = bucket_size; + __entry->fragmentation = fragmentation; + __entry->ret = ret; + ), + + TP_printk("%d,%d %llu:%llu sectors %u/%u fragmentation %llu ret %i", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->member, __entry->bucket, + __entry->sectors, __entry->bucket_size, + __entry->fragmentation, __entry->ret) +); + +TRACE_EVENT(copygc, + TP_PROTO(struct bch_fs *c, + u64 sectors_moved, u64 sectors_not_moved, + u64 buckets_moved, u64 buckets_not_moved), + TP_ARGS(c, + sectors_moved, sectors_not_moved, + buckets_moved, buckets_not_moved), + + TP_STRUCT__entry( + __field(dev_t, dev ) + __field(u64, sectors_moved ) + __field(u64, sectors_not_moved ) + __field(u64, buckets_moved ) + __field(u64, buckets_not_moved ) + ), + + TP_fast_assign( + __entry->dev = c->dev; + __entry->sectors_moved = sectors_moved; + __entry->sectors_not_moved = sectors_not_moved; + __entry->buckets_moved = buckets_moved; + __entry->buckets_not_moved = buckets_moved; + ), + + TP_printk("%d,%d sectors moved %llu remain %llu buckets moved %llu remain %llu", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->sectors_moved, __entry->sectors_not_moved, + __entry->buckets_moved, __entry->buckets_not_moved) +); + +TRACE_EVENT(copygc_wait, + TP_PROTO(struct bch_fs *c, + u64 wait_amount, u64 until), + TP_ARGS(c, wait_amount, until), + + TP_STRUCT__entry( + __field(dev_t, dev ) + __field(u64, wait_amount ) + __field(u64, until ) + ), + + TP_fast_assign( + __entry->dev = c->dev; + __entry->wait_amount = wait_amount; + __entry->until = until; + ), + + TP_printk("%d,%u waiting for %llu sectors until %llu", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->wait_amount, __entry->until) +); + +/* btree transactions: */ + +DECLARE_EVENT_CLASS(transaction_event, + TP_PROTO(struct btree_trans *trans, + unsigned long caller_ip), + TP_ARGS(trans, caller_ip), + + TP_STRUCT__entry( + __array(char, trans_fn, 32 ) + __field(unsigned long, caller_ip ) + ), + + TP_fast_assign( + strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn)); + __entry->caller_ip = caller_ip; + ), + + TP_printk("%s %pS", __entry->trans_fn, (void *) __entry->caller_ip) +); + +DEFINE_EVENT(transaction_event, transaction_commit, + TP_PROTO(struct btree_trans *trans, + unsigned long caller_ip), + TP_ARGS(trans, caller_ip) +); + +DEFINE_EVENT(transaction_event, trans_restart_injected, + TP_PROTO(struct btree_trans *trans, + unsigned long caller_ip), + TP_ARGS(trans, caller_ip) +); + +TRACE_EVENT(trans_restart_split_race, + TP_PROTO(struct btree_trans *trans, + unsigned long caller_ip, + struct btree *b), + TP_ARGS(trans, caller_ip, b), + + TP_STRUCT__entry( + __array(char, trans_fn, 32 ) + __field(unsigned long, caller_ip ) + __field(u8, level ) + __field(u16, written ) + __field(u16, blocks ) + __field(u16, u64s_remaining ) + ), + + TP_fast_assign( + strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn)); + __entry->caller_ip = caller_ip; + __entry->level = b->c.level; + __entry->written = b->written; + __entry->blocks = btree_blocks(trans->c); + __entry->u64s_remaining = bch_btree_keys_u64s_remaining(trans->c, b); + ), + + TP_printk("%s %pS l=%u written %u/%u u64s remaining %u", + __entry->trans_fn, (void *) __entry->caller_ip, + __entry->level, + __entry->written, __entry->blocks, + __entry->u64s_remaining) +); + +DEFINE_EVENT(transaction_event, trans_blocked_journal_reclaim, + TP_PROTO(struct btree_trans *trans, + unsigned long caller_ip), + TP_ARGS(trans, caller_ip) +); + +TRACE_EVENT(trans_restart_journal_preres_get, + TP_PROTO(struct btree_trans *trans, + unsigned long caller_ip, + unsigned flags), + TP_ARGS(trans, caller_ip, flags), + + TP_STRUCT__entry( + __array(char, trans_fn, 32 ) + __field(unsigned long, caller_ip ) + __field(unsigned, flags ) + ), + + TP_fast_assign( + strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn)); + __entry->caller_ip = caller_ip; + __entry->flags = flags; + ), + + TP_printk("%s %pS %x", __entry->trans_fn, + (void *) __entry->caller_ip, + __entry->flags) +); + +DEFINE_EVENT(transaction_event, trans_restart_fault_inject, + TP_PROTO(struct btree_trans *trans, + unsigned long caller_ip), + TP_ARGS(trans, caller_ip) +); + +DEFINE_EVENT(transaction_event, trans_traverse_all, + TP_PROTO(struct btree_trans *trans, + unsigned long caller_ip), + TP_ARGS(trans, caller_ip) +); + +DEFINE_EVENT(transaction_event, trans_restart_key_cache_raced, + TP_PROTO(struct btree_trans *trans, + unsigned long caller_ip), + TP_ARGS(trans, caller_ip) +); + +DEFINE_EVENT(transaction_event, trans_restart_too_many_iters, + TP_PROTO(struct btree_trans *trans, + unsigned long caller_ip), + TP_ARGS(trans, caller_ip) +); + +DECLARE_EVENT_CLASS(transaction_restart_iter, + TP_PROTO(struct btree_trans *trans, + unsigned long caller_ip, + struct btree_path *path), + TP_ARGS(trans, caller_ip, path), + + TP_STRUCT__entry( + __array(char, trans_fn, 32 ) + __field(unsigned long, caller_ip ) + __field(u8, btree_id ) + TRACE_BPOS_entries(pos) + ), + + TP_fast_assign( + strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn)); + __entry->caller_ip = caller_ip; + __entry->btree_id = path->btree_id; + TRACE_BPOS_assign(pos, path->pos) + ), + + TP_printk("%s %pS btree %s pos %llu:%llu:%u", + __entry->trans_fn, + (void *) __entry->caller_ip, + bch2_btree_id_str(__entry->btree_id), + __entry->pos_inode, + __entry->pos_offset, + __entry->pos_snapshot) +); + +DEFINE_EVENT(transaction_restart_iter, trans_restart_btree_node_reused, + TP_PROTO(struct btree_trans *trans, + unsigned long caller_ip, + struct btree_path *path), + TP_ARGS(trans, caller_ip, path) +); + +DEFINE_EVENT(transaction_restart_iter, trans_restart_btree_node_split, + TP_PROTO(struct btree_trans *trans, + unsigned long caller_ip, + struct btree_path *path), + TP_ARGS(trans, caller_ip, path) +); + +struct get_locks_fail; + +TRACE_EVENT(trans_restart_upgrade, + TP_PROTO(struct btree_trans *trans, + unsigned long caller_ip, + struct btree_path *path, + unsigned old_locks_want, + unsigned new_locks_want, + struct get_locks_fail *f), + TP_ARGS(trans, caller_ip, path, old_locks_want, new_locks_want, f), + + TP_STRUCT__entry( + __array(char, trans_fn, 32 ) + __field(unsigned long, caller_ip ) + __field(u8, btree_id ) + __field(u8, old_locks_want ) + __field(u8, new_locks_want ) + __field(u8, level ) + __field(u32, path_seq ) + __field(u32, node_seq ) + __field(u32, path_alloc_seq ) + __field(u32, downgrade_seq) + TRACE_BPOS_entries(pos) + ), + + TP_fast_assign( + strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn)); + __entry->caller_ip = caller_ip; + __entry->btree_id = path->btree_id; + __entry->old_locks_want = old_locks_want; + __entry->new_locks_want = new_locks_want; + __entry->level = f->l; + __entry->path_seq = path->l[f->l].lock_seq; + __entry->node_seq = IS_ERR_OR_NULL(f->b) ? 0 : f->b->c.lock.seq; + __entry->path_alloc_seq = path->alloc_seq; + __entry->downgrade_seq = path->downgrade_seq; + TRACE_BPOS_assign(pos, path->pos) + ), + + TP_printk("%s %pS btree %s pos %llu:%llu:%u locks_want %u -> %u level %u path seq %u node seq %u alloc_seq %u downgrade_seq %u", + __entry->trans_fn, + (void *) __entry->caller_ip, + bch2_btree_id_str(__entry->btree_id), + __entry->pos_inode, + __entry->pos_offset, + __entry->pos_snapshot, + __entry->old_locks_want, + __entry->new_locks_want, + __entry->level, + __entry->path_seq, + __entry->node_seq, + __entry->path_alloc_seq, + __entry->downgrade_seq) +); + +DEFINE_EVENT(transaction_restart_iter, trans_restart_relock, + TP_PROTO(struct btree_trans *trans, + unsigned long caller_ip, + struct btree_path *path), + TP_ARGS(trans, caller_ip, path) +); + +DEFINE_EVENT(transaction_restart_iter, trans_restart_relock_next_node, + TP_PROTO(struct btree_trans *trans, + unsigned long caller_ip, + struct btree_path *path), + TP_ARGS(trans, caller_ip, path) +); + +DEFINE_EVENT(transaction_restart_iter, trans_restart_relock_parent_for_fill, + TP_PROTO(struct btree_trans *trans, + unsigned long caller_ip, + struct btree_path *path), + TP_ARGS(trans, caller_ip, path) +); + +DEFINE_EVENT(transaction_restart_iter, trans_restart_relock_after_fill, + TP_PROTO(struct btree_trans *trans, + unsigned long caller_ip, + struct btree_path *path), + TP_ARGS(trans, caller_ip, path) +); + +DEFINE_EVENT(transaction_event, trans_restart_key_cache_upgrade, + TP_PROTO(struct btree_trans *trans, + unsigned long caller_ip), + TP_ARGS(trans, caller_ip) +); + +DEFINE_EVENT(transaction_restart_iter, trans_restart_relock_key_cache_fill, + TP_PROTO(struct btree_trans *trans, + unsigned long caller_ip, + struct btree_path *path), + TP_ARGS(trans, caller_ip, path) +); + +DEFINE_EVENT(transaction_restart_iter, trans_restart_relock_path, + TP_PROTO(struct btree_trans *trans, + unsigned long caller_ip, + struct btree_path *path), + TP_ARGS(trans, caller_ip, path) +); + +DEFINE_EVENT(transaction_restart_iter, trans_restart_relock_path_intent, + TP_PROTO(struct btree_trans *trans, + unsigned long caller_ip, + struct btree_path *path), + TP_ARGS(trans, caller_ip, path) +); + +DEFINE_EVENT(transaction_restart_iter, trans_restart_traverse, + TP_PROTO(struct btree_trans *trans, + unsigned long caller_ip, + struct btree_path *path), + TP_ARGS(trans, caller_ip, path) +); + +DEFINE_EVENT(transaction_restart_iter, trans_restart_memory_allocation_failure, + TP_PROTO(struct btree_trans *trans, + unsigned long caller_ip, + struct btree_path *path), + TP_ARGS(trans, caller_ip, path) +); + +DEFINE_EVENT(transaction_event, trans_restart_would_deadlock, + TP_PROTO(struct btree_trans *trans, + unsigned long caller_ip), + TP_ARGS(trans, caller_ip) +); + +DEFINE_EVENT(transaction_event, trans_restart_would_deadlock_recursion_limit, + TP_PROTO(struct btree_trans *trans, + unsigned long caller_ip), + TP_ARGS(trans, caller_ip) +); + +TRACE_EVENT(trans_restart_would_deadlock_write, + TP_PROTO(struct btree_trans *trans), + TP_ARGS(trans), + + TP_STRUCT__entry( + __array(char, trans_fn, 32 ) + ), + + TP_fast_assign( + strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn)); + ), + + TP_printk("%s", __entry->trans_fn) +); + +TRACE_EVENT(trans_restart_mem_realloced, + TP_PROTO(struct btree_trans *trans, + unsigned long caller_ip, + unsigned long bytes), + TP_ARGS(trans, caller_ip, bytes), + + TP_STRUCT__entry( + __array(char, trans_fn, 32 ) + __field(unsigned long, caller_ip ) + __field(unsigned long, bytes ) + ), + + TP_fast_assign( + strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn)); + __entry->caller_ip = caller_ip; + __entry->bytes = bytes; + ), + + TP_printk("%s %pS bytes %lu", + __entry->trans_fn, + (void *) __entry->caller_ip, + __entry->bytes) +); + +TRACE_EVENT(trans_restart_key_cache_key_realloced, + TP_PROTO(struct btree_trans *trans, + unsigned long caller_ip, + struct btree_path *path, + unsigned old_u64s, + unsigned new_u64s), + TP_ARGS(trans, caller_ip, path, old_u64s, new_u64s), + + TP_STRUCT__entry( + __array(char, trans_fn, 32 ) + __field(unsigned long, caller_ip ) + __field(enum btree_id, btree_id ) + TRACE_BPOS_entries(pos) + __field(u32, old_u64s ) + __field(u32, new_u64s ) + ), + + TP_fast_assign( + strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn)); + __entry->caller_ip = caller_ip; + + __entry->btree_id = path->btree_id; + TRACE_BPOS_assign(pos, path->pos); + __entry->old_u64s = old_u64s; + __entry->new_u64s = new_u64s; + ), + + TP_printk("%s %pS btree %s pos %llu:%llu:%u old_u64s %u new_u64s %u", + __entry->trans_fn, + (void *) __entry->caller_ip, + bch2_btree_id_str(__entry->btree_id), + __entry->pos_inode, + __entry->pos_offset, + __entry->pos_snapshot, + __entry->old_u64s, + __entry->new_u64s) +); + +TRACE_EVENT(path_downgrade, + TP_PROTO(struct btree_trans *trans, + unsigned long caller_ip, + struct btree_path *path), + TP_ARGS(trans, caller_ip, path), + + TP_STRUCT__entry( + __array(char, trans_fn, 32 ) + __field(unsigned long, caller_ip ) + ), + + TP_fast_assign( + strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn)); + __entry->caller_ip = caller_ip; + ), + + TP_printk("%s %pS", + __entry->trans_fn, + (void *) __entry->caller_ip) +); + +DEFINE_EVENT(transaction_event, trans_restart_write_buffer_flush, + TP_PROTO(struct btree_trans *trans, + unsigned long caller_ip), + TP_ARGS(trans, caller_ip) +); + +TRACE_EVENT(write_buffer_flush, + TP_PROTO(struct btree_trans *trans, size_t nr, size_t skipped, size_t fast, size_t size), + TP_ARGS(trans, nr, skipped, fast, size), + + TP_STRUCT__entry( + __field(size_t, nr ) + __field(size_t, skipped ) + __field(size_t, fast ) + __field(size_t, size ) + ), + + TP_fast_assign( + __entry->nr = nr; + __entry->skipped = skipped; + __entry->fast = fast; + __entry->size = size; + ), + + TP_printk("%zu/%zu skipped %zu fast %zu", + __entry->nr, __entry->size, __entry->skipped, __entry->fast) +); + +TRACE_EVENT(write_buffer_flush_slowpath, + TP_PROTO(struct btree_trans *trans, size_t nr, size_t size), + TP_ARGS(trans, nr, size), + + TP_STRUCT__entry( + __field(size_t, nr ) + __field(size_t, size ) + ), + + TP_fast_assign( + __entry->nr = nr; + __entry->size = size; + ), + + TP_printk("%zu/%zu", __entry->nr, __entry->size) +); + +#endif /* _TRACE_BCACHEFS_H */ + +/* This part must be outside protection */ +#undef TRACE_INCLUDE_PATH +#define TRACE_INCLUDE_PATH ../../fs/bcachefs + +#undef TRACE_INCLUDE_FILE +#define TRACE_INCLUDE_FILE trace + +#include <trace/define_trace.h> diff --git a/fs/bcachefs/two_state_shared_lock.c b/fs/bcachefs/two_state_shared_lock.c new file mode 100644 index 000000000000..9764c2e6a910 --- /dev/null +++ b/fs/bcachefs/two_state_shared_lock.c @@ -0,0 +1,8 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "two_state_shared_lock.h" + +void __bch2_two_state_lock(two_state_lock_t *lock, int s) +{ + __wait_event(lock->wait, bch2_two_state_trylock(lock, s)); +} diff --git a/fs/bcachefs/two_state_shared_lock.h b/fs/bcachefs/two_state_shared_lock.h new file mode 100644 index 000000000000..905801772002 --- /dev/null +++ b/fs/bcachefs/two_state_shared_lock.h @@ -0,0 +1,59 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_TWO_STATE_LOCK_H +#define _BCACHEFS_TWO_STATE_LOCK_H + +#include <linux/atomic.h> +#include <linux/sched.h> +#include <linux/wait.h> + +#include "util.h" + +/* + * Two-state lock - can be taken for add or block - both states are shared, + * like read side of rwsem, but conflict with other state: + */ +typedef struct { + atomic_long_t v; + wait_queue_head_t wait; +} two_state_lock_t; + +static inline void two_state_lock_init(two_state_lock_t *lock) +{ + atomic_long_set(&lock->v, 0); + init_waitqueue_head(&lock->wait); +} + +static inline void bch2_two_state_unlock(two_state_lock_t *lock, int s) +{ + long i = s ? 1 : -1; + + EBUG_ON(atomic_long_read(&lock->v) == 0); + + if (atomic_long_sub_return_release(i, &lock->v) == 0) + wake_up_all(&lock->wait); +} + +static inline bool bch2_two_state_trylock(two_state_lock_t *lock, int s) +{ + long i = s ? 1 : -1; + long v = atomic_long_read(&lock->v), old; + + do { + old = v; + + if (i > 0 ? v < 0 : v > 0) + return false; + } while ((v = atomic_long_cmpxchg_acquire(&lock->v, + old, old + i)) != old); + return true; +} + +void __bch2_two_state_lock(two_state_lock_t *, int); + +static inline void bch2_two_state_lock(two_state_lock_t *lock, int s) +{ + if (!bch2_two_state_trylock(lock, s)) + __bch2_two_state_lock(lock, s); +} + +#endif /* _BCACHEFS_TWO_STATE_LOCK_H */ diff --git a/fs/bcachefs/util.c b/fs/bcachefs/util.c new file mode 100644 index 000000000000..84b142fcc3df --- /dev/null +++ b/fs/bcachefs/util.c @@ -0,0 +1,1159 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * random utiility code, for bcache but in theory not specific to bcache + * + * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com> + * Copyright 2012 Google, Inc. + */ + +#include <linux/bio.h> +#include <linux/blkdev.h> +#include <linux/console.h> +#include <linux/ctype.h> +#include <linux/debugfs.h> +#include <linux/freezer.h> +#include <linux/kthread.h> +#include <linux/log2.h> +#include <linux/math64.h> +#include <linux/percpu.h> +#include <linux/preempt.h> +#include <linux/random.h> +#include <linux/seq_file.h> +#include <linux/string.h> +#include <linux/types.h> +#include <linux/sched/clock.h> + +#include "eytzinger.h" +#include "mean_and_variance.h" +#include "util.h" + +static const char si_units[] = "?kMGTPEZY"; + +/* string_get_size units: */ +static const char *const units_2[] = { + "B", "KiB", "MiB", "GiB", "TiB", "PiB", "EiB", "ZiB", "YiB" +}; +static const char *const units_10[] = { + "B", "kB", "MB", "GB", "TB", "PB", "EB", "ZB", "YB" +}; + +static int parse_u64(const char *cp, u64 *res) +{ + const char *start = cp; + u64 v = 0; + + if (!isdigit(*cp)) + return -EINVAL; + + do { + if (v > U64_MAX / 10) + return -ERANGE; + v *= 10; + if (v > U64_MAX - (*cp - '0')) + return -ERANGE; + v += *cp - '0'; + cp++; + } while (isdigit(*cp)); + + *res = v; + return cp - start; +} + +static int bch2_pow(u64 n, u64 p, u64 *res) +{ + *res = 1; + + while (p--) { + if (*res > div_u64(U64_MAX, n)) + return -ERANGE; + *res *= n; + } + return 0; +} + +static int parse_unit_suffix(const char *cp, u64 *res) +{ + const char *start = cp; + u64 base = 1024; + unsigned u; + int ret; + + if (*cp == ' ') + cp++; + + for (u = 1; u < strlen(si_units); u++) + if (*cp == si_units[u]) { + cp++; + goto got_unit; + } + + for (u = 0; u < ARRAY_SIZE(units_2); u++) + if (!strncmp(cp, units_2[u], strlen(units_2[u]))) { + cp += strlen(units_2[u]); + goto got_unit; + } + + for (u = 0; u < ARRAY_SIZE(units_10); u++) + if (!strncmp(cp, units_10[u], strlen(units_10[u]))) { + cp += strlen(units_10[u]); + base = 1000; + goto got_unit; + } + + *res = 1; + return 0; +got_unit: + ret = bch2_pow(base, u, res); + if (ret) + return ret; + + return cp - start; +} + +#define parse_or_ret(cp, _f) \ +do { \ + int _ret = _f; \ + if (_ret < 0) \ + return _ret; \ + cp += _ret; \ +} while (0) + +static int __bch2_strtou64_h(const char *cp, u64 *res) +{ + const char *start = cp; + u64 v = 0, b, f_n = 0, f_d = 1; + int ret; + + parse_or_ret(cp, parse_u64(cp, &v)); + + if (*cp == '.') { + cp++; + ret = parse_u64(cp, &f_n); + if (ret < 0) + return ret; + cp += ret; + + ret = bch2_pow(10, ret, &f_d); + if (ret) + return ret; + } + + parse_or_ret(cp, parse_unit_suffix(cp, &b)); + + if (v > div_u64(U64_MAX, b)) + return -ERANGE; + v *= b; + + if (f_n > div_u64(U64_MAX, b)) + return -ERANGE; + + f_n = div_u64(f_n * b, f_d); + if (v + f_n < v) + return -ERANGE; + v += f_n; + + *res = v; + return cp - start; +} + +static int __bch2_strtoh(const char *cp, u64 *res, + u64 t_max, bool t_signed) +{ + bool positive = *cp != '-'; + u64 v = 0; + + if (*cp == '+' || *cp == '-') + cp++; + + parse_or_ret(cp, __bch2_strtou64_h(cp, &v)); + + if (*cp == '\n') + cp++; + if (*cp) + return -EINVAL; + + if (positive) { + if (v > t_max) + return -ERANGE; + } else { + if (v && !t_signed) + return -ERANGE; + + if (v > t_max + 1) + return -ERANGE; + v = -v; + } + + *res = v; + return 0; +} + +#define STRTO_H(name, type) \ +int bch2_ ## name ## _h(const char *cp, type *res) \ +{ \ + u64 v = 0; \ + int ret = __bch2_strtoh(cp, &v, ANYSINT_MAX(type), \ + ANYSINT_MAX(type) != ((type) ~0ULL)); \ + *res = v; \ + return ret; \ +} + +STRTO_H(strtoint, int) +STRTO_H(strtouint, unsigned int) +STRTO_H(strtoll, long long) +STRTO_H(strtoull, unsigned long long) +STRTO_H(strtou64, u64) + +u64 bch2_read_flag_list(char *opt, const char * const list[]) +{ + u64 ret = 0; + char *p, *s, *d = kstrdup(opt, GFP_KERNEL); + + if (!d) + return -ENOMEM; + + s = strim(d); + + while ((p = strsep(&s, ","))) { + int flag = match_string(list, -1, p); + + if (flag < 0) { + ret = -1; + break; + } + + ret |= 1 << flag; + } + + kfree(d); + + return ret; +} + +bool bch2_is_zero(const void *_p, size_t n) +{ + const char *p = _p; + size_t i; + + for (i = 0; i < n; i++) + if (p[i]) + return false; + return true; +} + +void bch2_prt_u64_binary(struct printbuf *out, u64 v, unsigned nr_bits) +{ + while (nr_bits) + prt_char(out, '0' + ((v >> --nr_bits) & 1)); +} + +void bch2_print_string_as_lines(const char *prefix, const char *lines) +{ + const char *p; + + if (!lines) { + printk("%s (null)\n", prefix); + return; + } + + console_lock(); + while (1) { + p = strchrnul(lines, '\n'); + printk("%s%.*s\n", prefix, (int) (p - lines), lines); + if (!*p) + break; + lines = p + 1; + } + console_unlock(); +} + +int bch2_save_backtrace(bch_stacktrace *stack, struct task_struct *task) +{ +#ifdef CONFIG_STACKTRACE + unsigned nr_entries = 0; + int ret = 0; + + stack->nr = 0; + ret = darray_make_room(stack, 32); + if (ret) + return ret; + + if (!down_read_trylock(&task->signal->exec_update_lock)) + return -1; + + do { + nr_entries = stack_trace_save_tsk(task, stack->data, stack->size, 0); + } while (nr_entries == stack->size && + !(ret = darray_make_room(stack, stack->size * 2))); + + stack->nr = nr_entries; + up_read(&task->signal->exec_update_lock); + + return ret; +#else + return 0; +#endif +} + +void bch2_prt_backtrace(struct printbuf *out, bch_stacktrace *stack) +{ + unsigned long *i; + + darray_for_each(*stack, i) { + prt_printf(out, "[<0>] %pB", (void *) *i); + prt_newline(out); + } +} + +int bch2_prt_task_backtrace(struct printbuf *out, struct task_struct *task) +{ + bch_stacktrace stack = { 0 }; + int ret = bch2_save_backtrace(&stack, task); + + bch2_prt_backtrace(out, &stack); + darray_exit(&stack); + return ret; +} + +/* time stats: */ + +#ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT +static void bch2_quantiles_update(struct bch2_quantiles *q, u64 v) +{ + unsigned i = 0; + + while (i < ARRAY_SIZE(q->entries)) { + struct bch2_quantile_entry *e = q->entries + i; + + if (unlikely(!e->step)) { + e->m = v; + e->step = max_t(unsigned, v / 2, 1024); + } else if (e->m > v) { + e->m = e->m >= e->step + ? e->m - e->step + : 0; + } else if (e->m < v) { + e->m = e->m + e->step > e->m + ? e->m + e->step + : U32_MAX; + } + + if ((e->m > v ? e->m - v : v - e->m) < e->step) + e->step = max_t(unsigned, e->step / 2, 1); + + if (v >= e->m) + break; + + i = eytzinger0_child(i, v > e->m); + } +} + +static inline void bch2_time_stats_update_one(struct bch2_time_stats *stats, + u64 start, u64 end) +{ + u64 duration, freq; + + if (time_after64(end, start)) { + duration = end - start; + mean_and_variance_update(&stats->duration_stats, duration); + mean_and_variance_weighted_update(&stats->duration_stats_weighted, duration); + stats->max_duration = max(stats->max_duration, duration); + stats->min_duration = min(stats->min_duration, duration); + bch2_quantiles_update(&stats->quantiles, duration); + } + + if (time_after64(end, stats->last_event)) { + freq = end - stats->last_event; + mean_and_variance_update(&stats->freq_stats, freq); + mean_and_variance_weighted_update(&stats->freq_stats_weighted, freq); + stats->max_freq = max(stats->max_freq, freq); + stats->min_freq = min(stats->min_freq, freq); + stats->last_event = end; + } +} + +static noinline void bch2_time_stats_clear_buffer(struct bch2_time_stats *stats, + struct bch2_time_stat_buffer *b) +{ + struct bch2_time_stat_buffer_entry *i; + unsigned long flags; + + spin_lock_irqsave(&stats->lock, flags); + for (i = b->entries; + i < b->entries + ARRAY_SIZE(b->entries); + i++) + bch2_time_stats_update_one(stats, i->start, i->end); + spin_unlock_irqrestore(&stats->lock, flags); + + b->nr = 0; +} + +void __bch2_time_stats_update(struct bch2_time_stats *stats, u64 start, u64 end) +{ + unsigned long flags; + + WARN_RATELIMIT(!stats->min_duration || !stats->min_freq, + "time_stats: min_duration = %llu, min_freq = %llu", + stats->min_duration, stats->min_freq); + + if (!stats->buffer) { + spin_lock_irqsave(&stats->lock, flags); + bch2_time_stats_update_one(stats, start, end); + + if (mean_and_variance_weighted_get_mean(stats->freq_stats_weighted) < 32 && + stats->duration_stats.n > 1024) + stats->buffer = + alloc_percpu_gfp(struct bch2_time_stat_buffer, + GFP_ATOMIC); + spin_unlock_irqrestore(&stats->lock, flags); + } else { + struct bch2_time_stat_buffer *b; + + preempt_disable(); + b = this_cpu_ptr(stats->buffer); + + BUG_ON(b->nr >= ARRAY_SIZE(b->entries)); + b->entries[b->nr++] = (struct bch2_time_stat_buffer_entry) { + .start = start, + .end = end + }; + + if (unlikely(b->nr == ARRAY_SIZE(b->entries))) + bch2_time_stats_clear_buffer(stats, b); + preempt_enable(); + } +} +#endif + +static const struct time_unit { + const char *name; + u64 nsecs; +} time_units[] = { + { "ns", 1 }, + { "us", NSEC_PER_USEC }, + { "ms", NSEC_PER_MSEC }, + { "s", NSEC_PER_SEC }, + { "m", (u64) NSEC_PER_SEC * 60}, + { "h", (u64) NSEC_PER_SEC * 3600}, + { "eon", U64_MAX }, +}; + +static const struct time_unit *pick_time_units(u64 ns) +{ + const struct time_unit *u; + + for (u = time_units; + u + 1 < time_units + ARRAY_SIZE(time_units) && + ns >= u[1].nsecs << 1; + u++) + ; + + return u; +} + +void bch2_pr_time_units(struct printbuf *out, u64 ns) +{ + const struct time_unit *u = pick_time_units(ns); + + prt_printf(out, "%llu %s", div_u64(ns, u->nsecs), u->name); +} + +static void bch2_pr_time_units_aligned(struct printbuf *out, u64 ns) +{ + const struct time_unit *u = pick_time_units(ns); + + prt_printf(out, "%llu ", div64_u64(ns, u->nsecs)); + prt_tab_rjust(out); + prt_printf(out, "%s", u->name); +} + +#ifndef __KERNEL__ +#include <time.h> +void bch2_prt_datetime(struct printbuf *out, time64_t sec) +{ + time_t t = sec; + char buf[64]; + ctime_r(&t, buf); + prt_str(out, buf); +} +#else +void bch2_prt_datetime(struct printbuf *out, time64_t sec) +{ + char buf[64]; + snprintf(buf, sizeof(buf), "%ptT", &sec); + prt_u64(out, sec); +} +#endif + +#define TABSTOP_SIZE 12 + +static inline void pr_name_and_units(struct printbuf *out, const char *name, u64 ns) +{ + prt_str(out, name); + prt_tab(out); + bch2_pr_time_units_aligned(out, ns); + prt_newline(out); +} + +void bch2_time_stats_to_text(struct printbuf *out, struct bch2_time_stats *stats) +{ + const struct time_unit *u; + s64 f_mean = 0, d_mean = 0; + u64 q, last_q = 0, f_stddev = 0, d_stddev = 0; + int i; + /* + * avoid divide by zero + */ + if (stats->freq_stats.n) { + f_mean = mean_and_variance_get_mean(stats->freq_stats); + f_stddev = mean_and_variance_get_stddev(stats->freq_stats); + d_mean = mean_and_variance_get_mean(stats->duration_stats); + d_stddev = mean_and_variance_get_stddev(stats->duration_stats); + } + + printbuf_tabstop_push(out, out->indent + TABSTOP_SIZE); + prt_printf(out, "count:"); + prt_tab(out); + prt_printf(out, "%llu ", + stats->duration_stats.n); + printbuf_tabstop_pop(out); + prt_newline(out); + + printbuf_tabstops_reset(out); + + printbuf_tabstop_push(out, out->indent + 20); + printbuf_tabstop_push(out, TABSTOP_SIZE + 2); + printbuf_tabstop_push(out, 0); + printbuf_tabstop_push(out, TABSTOP_SIZE + 2); + + prt_tab(out); + prt_printf(out, "since mount"); + prt_tab_rjust(out); + prt_tab(out); + prt_printf(out, "recent"); + prt_tab_rjust(out); + prt_newline(out); + + printbuf_tabstops_reset(out); + printbuf_tabstop_push(out, out->indent + 20); + printbuf_tabstop_push(out, TABSTOP_SIZE); + printbuf_tabstop_push(out, 2); + printbuf_tabstop_push(out, TABSTOP_SIZE); + + prt_printf(out, "duration of events"); + prt_newline(out); + printbuf_indent_add(out, 2); + + pr_name_and_units(out, "min:", stats->min_duration); + pr_name_and_units(out, "max:", stats->max_duration); + + prt_printf(out, "mean:"); + prt_tab(out); + bch2_pr_time_units_aligned(out, d_mean); + prt_tab(out); + bch2_pr_time_units_aligned(out, mean_and_variance_weighted_get_mean(stats->duration_stats_weighted)); + prt_newline(out); + + prt_printf(out, "stddev:"); + prt_tab(out); + bch2_pr_time_units_aligned(out, d_stddev); + prt_tab(out); + bch2_pr_time_units_aligned(out, mean_and_variance_weighted_get_stddev(stats->duration_stats_weighted)); + + printbuf_indent_sub(out, 2); + prt_newline(out); + + prt_printf(out, "time between events"); + prt_newline(out); + printbuf_indent_add(out, 2); + + pr_name_and_units(out, "min:", stats->min_freq); + pr_name_and_units(out, "max:", stats->max_freq); + + prt_printf(out, "mean:"); + prt_tab(out); + bch2_pr_time_units_aligned(out, f_mean); + prt_tab(out); + bch2_pr_time_units_aligned(out, mean_and_variance_weighted_get_mean(stats->freq_stats_weighted)); + prt_newline(out); + + prt_printf(out, "stddev:"); + prt_tab(out); + bch2_pr_time_units_aligned(out, f_stddev); + prt_tab(out); + bch2_pr_time_units_aligned(out, mean_and_variance_weighted_get_stddev(stats->freq_stats_weighted)); + + printbuf_indent_sub(out, 2); + prt_newline(out); + + printbuf_tabstops_reset(out); + + i = eytzinger0_first(NR_QUANTILES); + u = pick_time_units(stats->quantiles.entries[i].m); + + prt_printf(out, "quantiles (%s):\t", u->name); + eytzinger0_for_each(i, NR_QUANTILES) { + bool is_last = eytzinger0_next(i, NR_QUANTILES) == -1; + + q = max(stats->quantiles.entries[i].m, last_q); + prt_printf(out, "%llu ", + div_u64(q, u->nsecs)); + if (is_last) + prt_newline(out); + last_q = q; + } +} + +void bch2_time_stats_exit(struct bch2_time_stats *stats) +{ + free_percpu(stats->buffer); +} + +void bch2_time_stats_init(struct bch2_time_stats *stats) +{ + memset(stats, 0, sizeof(*stats)); + stats->duration_stats_weighted.weight = 8; + stats->freq_stats_weighted.weight = 8; + stats->min_duration = U64_MAX; + stats->min_freq = U64_MAX; + spin_lock_init(&stats->lock); +} + +/* ratelimit: */ + +/** + * bch2_ratelimit_delay() - return how long to delay until the next time to do + * some work + * @d: the struct bch_ratelimit to update + * Returns: the amount of time to delay by, in jiffies + */ +u64 bch2_ratelimit_delay(struct bch_ratelimit *d) +{ + u64 now = local_clock(); + + return time_after64(d->next, now) + ? nsecs_to_jiffies(d->next - now) + : 0; +} + +/** + * bch2_ratelimit_increment() - increment @d by the amount of work done + * @d: the struct bch_ratelimit to update + * @done: the amount of work done, in arbitrary units + */ +void bch2_ratelimit_increment(struct bch_ratelimit *d, u64 done) +{ + u64 now = local_clock(); + + d->next += div_u64(done * NSEC_PER_SEC, d->rate); + + if (time_before64(now + NSEC_PER_SEC, d->next)) + d->next = now + NSEC_PER_SEC; + + if (time_after64(now - NSEC_PER_SEC * 2, d->next)) + d->next = now - NSEC_PER_SEC * 2; +} + +/* pd controller: */ + +/* + * Updates pd_controller. Attempts to scale inputed values to units per second. + * @target: desired value + * @actual: current value + * + * @sign: 1 or -1; 1 if increasing the rate makes actual go up, -1 if increasing + * it makes actual go down. + */ +void bch2_pd_controller_update(struct bch_pd_controller *pd, + s64 target, s64 actual, int sign) +{ + s64 proportional, derivative, change; + + unsigned long seconds_since_update = (jiffies - pd->last_update) / HZ; + + if (seconds_since_update == 0) + return; + + pd->last_update = jiffies; + + proportional = actual - target; + proportional *= seconds_since_update; + proportional = div_s64(proportional, pd->p_term_inverse); + + derivative = actual - pd->last_actual; + derivative = div_s64(derivative, seconds_since_update); + derivative = ewma_add(pd->smoothed_derivative, derivative, + (pd->d_term / seconds_since_update) ?: 1); + derivative = derivative * pd->d_term; + derivative = div_s64(derivative, pd->p_term_inverse); + + change = proportional + derivative; + + /* Don't increase rate if not keeping up */ + if (change > 0 && + pd->backpressure && + time_after64(local_clock(), + pd->rate.next + NSEC_PER_MSEC)) + change = 0; + + change *= (sign * -1); + + pd->rate.rate = clamp_t(s64, (s64) pd->rate.rate + change, + 1, UINT_MAX); + + pd->last_actual = actual; + pd->last_derivative = derivative; + pd->last_proportional = proportional; + pd->last_change = change; + pd->last_target = target; +} + +void bch2_pd_controller_init(struct bch_pd_controller *pd) +{ + pd->rate.rate = 1024; + pd->last_update = jiffies; + pd->p_term_inverse = 6000; + pd->d_term = 30; + pd->d_smooth = pd->d_term; + pd->backpressure = 1; +} + +void bch2_pd_controller_debug_to_text(struct printbuf *out, struct bch_pd_controller *pd) +{ + if (!out->nr_tabstops) + printbuf_tabstop_push(out, 20); + + prt_printf(out, "rate:"); + prt_tab(out); + prt_human_readable_s64(out, pd->rate.rate); + prt_newline(out); + + prt_printf(out, "target:"); + prt_tab(out); + prt_human_readable_u64(out, pd->last_target); + prt_newline(out); + + prt_printf(out, "actual:"); + prt_tab(out); + prt_human_readable_u64(out, pd->last_actual); + prt_newline(out); + + prt_printf(out, "proportional:"); + prt_tab(out); + prt_human_readable_s64(out, pd->last_proportional); + prt_newline(out); + + prt_printf(out, "derivative:"); + prt_tab(out); + prt_human_readable_s64(out, pd->last_derivative); + prt_newline(out); + + prt_printf(out, "change:"); + prt_tab(out); + prt_human_readable_s64(out, pd->last_change); + prt_newline(out); + + prt_printf(out, "next io:"); + prt_tab(out); + prt_printf(out, "%llims", div64_s64(pd->rate.next - local_clock(), NSEC_PER_MSEC)); + prt_newline(out); +} + +/* misc: */ + +void bch2_bio_map(struct bio *bio, void *base, size_t size) +{ + while (size) { + struct page *page = is_vmalloc_addr(base) + ? vmalloc_to_page(base) + : virt_to_page(base); + unsigned offset = offset_in_page(base); + unsigned len = min_t(size_t, PAGE_SIZE - offset, size); + + BUG_ON(!bio_add_page(bio, page, len, offset)); + size -= len; + base += len; + } +} + +int bch2_bio_alloc_pages(struct bio *bio, size_t size, gfp_t gfp_mask) +{ + while (size) { + struct page *page = alloc_pages(gfp_mask, 0); + unsigned len = min_t(size_t, PAGE_SIZE, size); + + if (!page) + return -ENOMEM; + + if (unlikely(!bio_add_page(bio, page, len, 0))) { + __free_page(page); + break; + } + + size -= len; + } + + return 0; +} + +size_t bch2_rand_range(size_t max) +{ + size_t rand; + + if (!max) + return 0; + + do { + rand = get_random_long(); + rand &= roundup_pow_of_two(max) - 1; + } while (rand >= max); + + return rand; +} + +void memcpy_to_bio(struct bio *dst, struct bvec_iter dst_iter, const void *src) +{ + struct bio_vec bv; + struct bvec_iter iter; + + __bio_for_each_segment(bv, dst, iter, dst_iter) { + void *dstp = kmap_local_page(bv.bv_page); + + memcpy(dstp + bv.bv_offset, src, bv.bv_len); + kunmap_local(dstp); + + src += bv.bv_len; + } +} + +void memcpy_from_bio(void *dst, struct bio *src, struct bvec_iter src_iter) +{ + struct bio_vec bv; + struct bvec_iter iter; + + __bio_for_each_segment(bv, src, iter, src_iter) { + void *srcp = kmap_local_page(bv.bv_page); + + memcpy(dst, srcp + bv.bv_offset, bv.bv_len); + kunmap_local(srcp); + + dst += bv.bv_len; + } +} + +static int alignment_ok(const void *base, size_t align) +{ + return IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) || + ((unsigned long)base & (align - 1)) == 0; +} + +static void u32_swap(void *a, void *b, size_t size) +{ + u32 t = *(u32 *)a; + *(u32 *)a = *(u32 *)b; + *(u32 *)b = t; +} + +static void u64_swap(void *a, void *b, size_t size) +{ + u64 t = *(u64 *)a; + *(u64 *)a = *(u64 *)b; + *(u64 *)b = t; +} + +static void generic_swap(void *a, void *b, size_t size) +{ + char t; + + do { + t = *(char *)a; + *(char *)a++ = *(char *)b; + *(char *)b++ = t; + } while (--size > 0); +} + +static inline int do_cmp(void *base, size_t n, size_t size, + int (*cmp_func)(const void *, const void *, size_t), + size_t l, size_t r) +{ + return cmp_func(base + inorder_to_eytzinger0(l, n) * size, + base + inorder_to_eytzinger0(r, n) * size, + size); +} + +static inline void do_swap(void *base, size_t n, size_t size, + void (*swap_func)(void *, void *, size_t), + size_t l, size_t r) +{ + swap_func(base + inorder_to_eytzinger0(l, n) * size, + base + inorder_to_eytzinger0(r, n) * size, + size); +} + +void eytzinger0_sort(void *base, size_t n, size_t size, + int (*cmp_func)(const void *, const void *, size_t), + void (*swap_func)(void *, void *, size_t)) +{ + int i, c, r; + + if (!swap_func) { + if (size == 4 && alignment_ok(base, 4)) + swap_func = u32_swap; + else if (size == 8 && alignment_ok(base, 8)) + swap_func = u64_swap; + else + swap_func = generic_swap; + } + + /* heapify */ + for (i = n / 2 - 1; i >= 0; --i) { + for (r = i; r * 2 + 1 < n; r = c) { + c = r * 2 + 1; + + if (c + 1 < n && + do_cmp(base, n, size, cmp_func, c, c + 1) < 0) + c++; + + if (do_cmp(base, n, size, cmp_func, r, c) >= 0) + break; + + do_swap(base, n, size, swap_func, r, c); + } + } + + /* sort */ + for (i = n - 1; i > 0; --i) { + do_swap(base, n, size, swap_func, 0, i); + + for (r = 0; r * 2 + 1 < i; r = c) { + c = r * 2 + 1; + + if (c + 1 < i && + do_cmp(base, n, size, cmp_func, c, c + 1) < 0) + c++; + + if (do_cmp(base, n, size, cmp_func, r, c) >= 0) + break; + + do_swap(base, n, size, swap_func, r, c); + } + } +} + +void sort_cmp_size(void *base, size_t num, size_t size, + int (*cmp_func)(const void *, const void *, size_t), + void (*swap_func)(void *, void *, size_t size)) +{ + /* pre-scale counters for performance */ + int i = (num/2 - 1) * size, n = num * size, c, r; + + if (!swap_func) { + if (size == 4 && alignment_ok(base, 4)) + swap_func = u32_swap; + else if (size == 8 && alignment_ok(base, 8)) + swap_func = u64_swap; + else + swap_func = generic_swap; + } + + /* heapify */ + for ( ; i >= 0; i -= size) { + for (r = i; r * 2 + size < n; r = c) { + c = r * 2 + size; + if (c < n - size && + cmp_func(base + c, base + c + size, size) < 0) + c += size; + if (cmp_func(base + r, base + c, size) >= 0) + break; + swap_func(base + r, base + c, size); + } + } + + /* sort */ + for (i = n - size; i > 0; i -= size) { + swap_func(base, base + i, size); + for (r = 0; r * 2 + size < i; r = c) { + c = r * 2 + size; + if (c < i - size && + cmp_func(base + c, base + c + size, size) < 0) + c += size; + if (cmp_func(base + r, base + c, size) >= 0) + break; + swap_func(base + r, base + c, size); + } + } +} + +static void mempool_free_vp(void *element, void *pool_data) +{ + size_t size = (size_t) pool_data; + + vpfree(element, size); +} + +static void *mempool_alloc_vp(gfp_t gfp_mask, void *pool_data) +{ + size_t size = (size_t) pool_data; + + return vpmalloc(size, gfp_mask); +} + +int mempool_init_kvpmalloc_pool(mempool_t *pool, int min_nr, size_t size) +{ + return size < PAGE_SIZE + ? mempool_init_kmalloc_pool(pool, min_nr, size) + : mempool_init(pool, min_nr, mempool_alloc_vp, + mempool_free_vp, (void *) size); +} + +#if 0 +void eytzinger1_test(void) +{ + unsigned inorder, eytz, size; + + pr_info("1 based eytzinger test:"); + + for (size = 2; + size < 65536; + size++) { + unsigned extra = eytzinger1_extra(size); + + if (!(size % 4096)) + pr_info("tree size %u", size); + + BUG_ON(eytzinger1_prev(0, size) != eytzinger1_last(size)); + BUG_ON(eytzinger1_next(0, size) != eytzinger1_first(size)); + + BUG_ON(eytzinger1_prev(eytzinger1_first(size), size) != 0); + BUG_ON(eytzinger1_next(eytzinger1_last(size), size) != 0); + + inorder = 1; + eytzinger1_for_each(eytz, size) { + BUG_ON(__inorder_to_eytzinger1(inorder, size, extra) != eytz); + BUG_ON(__eytzinger1_to_inorder(eytz, size, extra) != inorder); + BUG_ON(eytz != eytzinger1_last(size) && + eytzinger1_prev(eytzinger1_next(eytz, size), size) != eytz); + + inorder++; + } + } +} + +void eytzinger0_test(void) +{ + + unsigned inorder, eytz, size; + + pr_info("0 based eytzinger test:"); + + for (size = 1; + size < 65536; + size++) { + unsigned extra = eytzinger0_extra(size); + + if (!(size % 4096)) + pr_info("tree size %u", size); + + BUG_ON(eytzinger0_prev(-1, size) != eytzinger0_last(size)); + BUG_ON(eytzinger0_next(-1, size) != eytzinger0_first(size)); + + BUG_ON(eytzinger0_prev(eytzinger0_first(size), size) != -1); + BUG_ON(eytzinger0_next(eytzinger0_last(size), size) != -1); + + inorder = 0; + eytzinger0_for_each(eytz, size) { + BUG_ON(__inorder_to_eytzinger0(inorder, size, extra) != eytz); + BUG_ON(__eytzinger0_to_inorder(eytz, size, extra) != inorder); + BUG_ON(eytz != eytzinger0_last(size) && + eytzinger0_prev(eytzinger0_next(eytz, size), size) != eytz); + + inorder++; + } + } +} + +static inline int cmp_u16(const void *_l, const void *_r, size_t size) +{ + const u16 *l = _l, *r = _r; + + return (*l > *r) - (*r - *l); +} + +static void eytzinger0_find_test_val(u16 *test_array, unsigned nr, u16 search) +{ + int i, c1 = -1, c2 = -1; + ssize_t r; + + r = eytzinger0_find_le(test_array, nr, + sizeof(test_array[0]), + cmp_u16, &search); + if (r >= 0) + c1 = test_array[r]; + + for (i = 0; i < nr; i++) + if (test_array[i] <= search && test_array[i] > c2) + c2 = test_array[i]; + + if (c1 != c2) { + eytzinger0_for_each(i, nr) + pr_info("[%3u] = %12u", i, test_array[i]); + pr_info("find_le(%2u) -> [%2zi] = %2i should be %2i", + i, r, c1, c2); + } +} + +void eytzinger0_find_test(void) +{ + unsigned i, nr, allocated = 1 << 12; + u16 *test_array = kmalloc_array(allocated, sizeof(test_array[0]), GFP_KERNEL); + + for (nr = 1; nr < allocated; nr++) { + pr_info("testing %u elems", nr); + + get_random_bytes(test_array, nr * sizeof(test_array[0])); + eytzinger0_sort(test_array, nr, sizeof(test_array[0]), cmp_u16, NULL); + + /* verify array is sorted correctly: */ + eytzinger0_for_each(i, nr) + BUG_ON(i != eytzinger0_last(nr) && + test_array[i] > test_array[eytzinger0_next(i, nr)]); + + for (i = 0; i < U16_MAX; i += 1 << 12) + eytzinger0_find_test_val(test_array, nr, i); + + for (i = 0; i < nr; i++) { + eytzinger0_find_test_val(test_array, nr, test_array[i] - 1); + eytzinger0_find_test_val(test_array, nr, test_array[i]); + eytzinger0_find_test_val(test_array, nr, test_array[i] + 1); + } + } + + kfree(test_array); +} +#endif + +/* + * Accumulate percpu counters onto one cpu's copy - only valid when access + * against any percpu counter is guarded against + */ +u64 *bch2_acc_percpu_u64s(u64 __percpu *p, unsigned nr) +{ + u64 *ret; + int cpu; + + /* access to pcpu vars has to be blocked by other locking */ + preempt_disable(); + ret = this_cpu_ptr(p); + preempt_enable(); + + for_each_possible_cpu(cpu) { + u64 *i = per_cpu_ptr(p, cpu); + + if (i != ret) { + acc_u64s(ret, i, nr); + memset(i, 0, nr * sizeof(u64)); + } + } + + return ret; +} diff --git a/fs/bcachefs/util.h b/fs/bcachefs/util.h new file mode 100644 index 000000000000..2984b57b2958 --- /dev/null +++ b/fs/bcachefs/util.h @@ -0,0 +1,833 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_UTIL_H +#define _BCACHEFS_UTIL_H + +#include <linux/bio.h> +#include <linux/blkdev.h> +#include <linux/closure.h> +#include <linux/errno.h> +#include <linux/freezer.h> +#include <linux/kernel.h> +#include <linux/sched/clock.h> +#include <linux/llist.h> +#include <linux/log2.h> +#include <linux/percpu.h> +#include <linux/preempt.h> +#include <linux/ratelimit.h> +#include <linux/slab.h> +#include <linux/vmalloc.h> +#include <linux/workqueue.h> + +#include "mean_and_variance.h" + +#include "darray.h" + +struct closure; + +#ifdef CONFIG_BCACHEFS_DEBUG +#define EBUG_ON(cond) BUG_ON(cond) +#else +#define EBUG_ON(cond) +#endif + +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ +#define CPU_BIG_ENDIAN 0 +#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ +#define CPU_BIG_ENDIAN 1 +#endif + +/* type hackery */ + +#define type_is_exact(_val, _type) \ + __builtin_types_compatible_p(typeof(_val), _type) + +#define type_is(_val, _type) \ + (__builtin_types_compatible_p(typeof(_val), _type) || \ + __builtin_types_compatible_p(typeof(_val), const _type)) + +/* Userspace doesn't align allocations as nicely as the kernel allocators: */ +static inline size_t buf_pages(void *p, size_t len) +{ + return DIV_ROUND_UP(len + + ((unsigned long) p & (PAGE_SIZE - 1)), + PAGE_SIZE); +} + +static inline void vpfree(void *p, size_t size) +{ + if (is_vmalloc_addr(p)) + vfree(p); + else + free_pages((unsigned long) p, get_order(size)); +} + +static inline void *vpmalloc(size_t size, gfp_t gfp_mask) +{ + return (void *) __get_free_pages(gfp_mask|__GFP_NOWARN, + get_order(size)) ?: + __vmalloc(size, gfp_mask); +} + +static inline void kvpfree(void *p, size_t size) +{ + if (size < PAGE_SIZE) + kfree(p); + else + vpfree(p, size); +} + +static inline void *kvpmalloc(size_t size, gfp_t gfp_mask) +{ + return size < PAGE_SIZE + ? kmalloc(size, gfp_mask) + : vpmalloc(size, gfp_mask); +} + +int mempool_init_kvpmalloc_pool(mempool_t *, int, size_t); + +#define HEAP(type) \ +struct { \ + size_t size, used; \ + type *data; \ +} + +#define DECLARE_HEAP(type, name) HEAP(type) name + +#define init_heap(heap, _size, gfp) \ +({ \ + (heap)->used = 0; \ + (heap)->size = (_size); \ + (heap)->data = kvpmalloc((heap)->size * sizeof((heap)->data[0]),\ + (gfp)); \ +}) + +#define free_heap(heap) \ +do { \ + kvpfree((heap)->data, (heap)->size * sizeof((heap)->data[0])); \ + (heap)->data = NULL; \ +} while (0) + +#define heap_set_backpointer(h, i, _fn) \ +do { \ + void (*fn)(typeof(h), size_t) = _fn; \ + if (fn) \ + fn(h, i); \ +} while (0) + +#define heap_swap(h, i, j, set_backpointer) \ +do { \ + swap((h)->data[i], (h)->data[j]); \ + heap_set_backpointer(h, i, set_backpointer); \ + heap_set_backpointer(h, j, set_backpointer); \ +} while (0) + +#define heap_peek(h) \ +({ \ + EBUG_ON(!(h)->used); \ + (h)->data[0]; \ +}) + +#define heap_full(h) ((h)->used == (h)->size) + +#define heap_sift_down(h, i, cmp, set_backpointer) \ +do { \ + size_t _c, _j = i; \ + \ + for (; _j * 2 + 1 < (h)->used; _j = _c) { \ + _c = _j * 2 + 1; \ + if (_c + 1 < (h)->used && \ + cmp(h, (h)->data[_c], (h)->data[_c + 1]) >= 0) \ + _c++; \ + \ + if (cmp(h, (h)->data[_c], (h)->data[_j]) >= 0) \ + break; \ + heap_swap(h, _c, _j, set_backpointer); \ + } \ +} while (0) + +#define heap_sift_up(h, i, cmp, set_backpointer) \ +do { \ + while (i) { \ + size_t p = (i - 1) / 2; \ + if (cmp(h, (h)->data[i], (h)->data[p]) >= 0) \ + break; \ + heap_swap(h, i, p, set_backpointer); \ + i = p; \ + } \ +} while (0) + +#define __heap_add(h, d, cmp, set_backpointer) \ +({ \ + size_t _i = (h)->used++; \ + (h)->data[_i] = d; \ + heap_set_backpointer(h, _i, set_backpointer); \ + \ + heap_sift_up(h, _i, cmp, set_backpointer); \ + _i; \ +}) + +#define heap_add(h, d, cmp, set_backpointer) \ +({ \ + bool _r = !heap_full(h); \ + if (_r) \ + __heap_add(h, d, cmp, set_backpointer); \ + _r; \ +}) + +#define heap_add_or_replace(h, new, cmp, set_backpointer) \ +do { \ + if (!heap_add(h, new, cmp, set_backpointer) && \ + cmp(h, new, heap_peek(h)) >= 0) { \ + (h)->data[0] = new; \ + heap_set_backpointer(h, 0, set_backpointer); \ + heap_sift_down(h, 0, cmp, set_backpointer); \ + } \ +} while (0) + +#define heap_del(h, i, cmp, set_backpointer) \ +do { \ + size_t _i = (i); \ + \ + BUG_ON(_i >= (h)->used); \ + (h)->used--; \ + if ((_i) < (h)->used) { \ + heap_swap(h, _i, (h)->used, set_backpointer); \ + heap_sift_up(h, _i, cmp, set_backpointer); \ + heap_sift_down(h, _i, cmp, set_backpointer); \ + } \ +} while (0) + +#define heap_pop(h, d, cmp, set_backpointer) \ +({ \ + bool _r = (h)->used; \ + if (_r) { \ + (d) = (h)->data[0]; \ + heap_del(h, 0, cmp, set_backpointer); \ + } \ + _r; \ +}) + +#define heap_resort(heap, cmp, set_backpointer) \ +do { \ + ssize_t _i; \ + for (_i = (ssize_t) (heap)->used / 2 - 1; _i >= 0; --_i) \ + heap_sift_down(heap, _i, cmp, set_backpointer); \ +} while (0) + +#define ANYSINT_MAX(t) \ + ((((t) 1 << (sizeof(t) * 8 - 2)) - (t) 1) * (t) 2 + (t) 1) + +#include "printbuf.h" + +#define prt_vprintf(_out, ...) bch2_prt_vprintf(_out, __VA_ARGS__) +#define prt_printf(_out, ...) bch2_prt_printf(_out, __VA_ARGS__) +#define printbuf_str(_buf) bch2_printbuf_str(_buf) +#define printbuf_exit(_buf) bch2_printbuf_exit(_buf) + +#define printbuf_tabstops_reset(_buf) bch2_printbuf_tabstops_reset(_buf) +#define printbuf_tabstop_pop(_buf) bch2_printbuf_tabstop_pop(_buf) +#define printbuf_tabstop_push(_buf, _n) bch2_printbuf_tabstop_push(_buf, _n) + +#define printbuf_indent_add(_out, _n) bch2_printbuf_indent_add(_out, _n) +#define printbuf_indent_sub(_out, _n) bch2_printbuf_indent_sub(_out, _n) + +#define prt_newline(_out) bch2_prt_newline(_out) +#define prt_tab(_out) bch2_prt_tab(_out) +#define prt_tab_rjust(_out) bch2_prt_tab_rjust(_out) + +#define prt_bytes_indented(...) bch2_prt_bytes_indented(__VA_ARGS__) +#define prt_u64(_out, _v) prt_printf(_out, "%llu", (u64) (_v)) +#define prt_human_readable_u64(...) bch2_prt_human_readable_u64(__VA_ARGS__) +#define prt_human_readable_s64(...) bch2_prt_human_readable_s64(__VA_ARGS__) +#define prt_units_u64(...) bch2_prt_units_u64(__VA_ARGS__) +#define prt_units_s64(...) bch2_prt_units_s64(__VA_ARGS__) +#define prt_string_option(...) bch2_prt_string_option(__VA_ARGS__) +#define prt_bitflags(...) bch2_prt_bitflags(__VA_ARGS__) + +void bch2_pr_time_units(struct printbuf *, u64); +void bch2_prt_datetime(struct printbuf *, time64_t); + +#ifdef __KERNEL__ +static inline void uuid_unparse_lower(u8 *uuid, char *out) +{ + sprintf(out, "%pUb", uuid); +} +#else +#include <uuid/uuid.h> +#endif + +static inline void pr_uuid(struct printbuf *out, u8 *uuid) +{ + char uuid_str[40]; + + uuid_unparse_lower(uuid, uuid_str); + prt_printf(out, "%s", uuid_str); +} + +int bch2_strtoint_h(const char *, int *); +int bch2_strtouint_h(const char *, unsigned int *); +int bch2_strtoll_h(const char *, long long *); +int bch2_strtoull_h(const char *, unsigned long long *); +int bch2_strtou64_h(const char *, u64 *); + +static inline int bch2_strtol_h(const char *cp, long *res) +{ +#if BITS_PER_LONG == 32 + return bch2_strtoint_h(cp, (int *) res); +#else + return bch2_strtoll_h(cp, (long long *) res); +#endif +} + +static inline int bch2_strtoul_h(const char *cp, long *res) +{ +#if BITS_PER_LONG == 32 + return bch2_strtouint_h(cp, (unsigned int *) res); +#else + return bch2_strtoull_h(cp, (unsigned long long *) res); +#endif +} + +#define strtoi_h(cp, res) \ + ( type_is(*res, int) ? bch2_strtoint_h(cp, (void *) res)\ + : type_is(*res, long) ? bch2_strtol_h(cp, (void *) res)\ + : type_is(*res, long long) ? bch2_strtoll_h(cp, (void *) res)\ + : type_is(*res, unsigned) ? bch2_strtouint_h(cp, (void *) res)\ + : type_is(*res, unsigned long) ? bch2_strtoul_h(cp, (void *) res)\ + : type_is(*res, unsigned long long) ? bch2_strtoull_h(cp, (void *) res)\ + : -EINVAL) + +#define strtoul_safe(cp, var) \ +({ \ + unsigned long _v; \ + int _r = kstrtoul(cp, 10, &_v); \ + if (!_r) \ + var = _v; \ + _r; \ +}) + +#define strtoul_safe_clamp(cp, var, min, max) \ +({ \ + unsigned long _v; \ + int _r = kstrtoul(cp, 10, &_v); \ + if (!_r) \ + var = clamp_t(typeof(var), _v, min, max); \ + _r; \ +}) + +#define strtoul_safe_restrict(cp, var, min, max) \ +({ \ + unsigned long _v; \ + int _r = kstrtoul(cp, 10, &_v); \ + if (!_r && _v >= min && _v <= max) \ + var = _v; \ + else \ + _r = -EINVAL; \ + _r; \ +}) + +#define snprint(out, var) \ + prt_printf(out, \ + type_is(var, int) ? "%i\n" \ + : type_is(var, unsigned) ? "%u\n" \ + : type_is(var, long) ? "%li\n" \ + : type_is(var, unsigned long) ? "%lu\n" \ + : type_is(var, s64) ? "%lli\n" \ + : type_is(var, u64) ? "%llu\n" \ + : type_is(var, char *) ? "%s\n" \ + : "%i\n", var) + +bool bch2_is_zero(const void *, size_t); + +u64 bch2_read_flag_list(char *, const char * const[]); + +void bch2_prt_u64_binary(struct printbuf *, u64, unsigned); + +void bch2_print_string_as_lines(const char *prefix, const char *lines); + +typedef DARRAY(unsigned long) bch_stacktrace; +int bch2_save_backtrace(bch_stacktrace *stack, struct task_struct *); +void bch2_prt_backtrace(struct printbuf *, bch_stacktrace *); +int bch2_prt_task_backtrace(struct printbuf *, struct task_struct *); + +#define NR_QUANTILES 15 +#define QUANTILE_IDX(i) inorder_to_eytzinger0(i, NR_QUANTILES) +#define QUANTILE_FIRST eytzinger0_first(NR_QUANTILES) +#define QUANTILE_LAST eytzinger0_last(NR_QUANTILES) + +struct bch2_quantiles { + struct bch2_quantile_entry { + u64 m; + u64 step; + } entries[NR_QUANTILES]; +}; + +struct bch2_time_stat_buffer { + unsigned nr; + struct bch2_time_stat_buffer_entry { + u64 start; + u64 end; + } entries[32]; +}; + +struct bch2_time_stats { + spinlock_t lock; + /* all fields are in nanoseconds */ + u64 max_duration; + u64 min_duration; + u64 max_freq; + u64 min_freq; + u64 last_event; + struct bch2_quantiles quantiles; + + struct mean_and_variance duration_stats; + struct mean_and_variance_weighted duration_stats_weighted; + struct mean_and_variance freq_stats; + struct mean_and_variance_weighted freq_stats_weighted; + struct bch2_time_stat_buffer __percpu *buffer; +}; + +#ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT +void __bch2_time_stats_update(struct bch2_time_stats *stats, u64, u64); +#else +static inline void __bch2_time_stats_update(struct bch2_time_stats *stats, u64 start, u64 end) {} +#endif + +static inline void bch2_time_stats_update(struct bch2_time_stats *stats, u64 start) +{ + __bch2_time_stats_update(stats, start, local_clock()); +} + +void bch2_time_stats_to_text(struct printbuf *, struct bch2_time_stats *); + +void bch2_time_stats_exit(struct bch2_time_stats *); +void bch2_time_stats_init(struct bch2_time_stats *); + +#define ewma_add(ewma, val, weight) \ +({ \ + typeof(ewma) _ewma = (ewma); \ + typeof(weight) _weight = (weight); \ + \ + (((_ewma << _weight) - _ewma) + (val)) >> _weight; \ +}) + +struct bch_ratelimit { + /* Next time we want to do some work, in nanoseconds */ + u64 next; + + /* + * Rate at which we want to do work, in units per nanosecond + * The units here correspond to the units passed to + * bch2_ratelimit_increment() + */ + unsigned rate; +}; + +static inline void bch2_ratelimit_reset(struct bch_ratelimit *d) +{ + d->next = local_clock(); +} + +u64 bch2_ratelimit_delay(struct bch_ratelimit *); +void bch2_ratelimit_increment(struct bch_ratelimit *, u64); + +struct bch_pd_controller { + struct bch_ratelimit rate; + unsigned long last_update; + + s64 last_actual; + s64 smoothed_derivative; + + unsigned p_term_inverse; + unsigned d_smooth; + unsigned d_term; + + /* for exporting to sysfs (no effect on behavior) */ + s64 last_derivative; + s64 last_proportional; + s64 last_change; + s64 last_target; + + /* + * If true, the rate will not increase if bch2_ratelimit_delay() + * is not being called often enough. + */ + bool backpressure; +}; + +void bch2_pd_controller_update(struct bch_pd_controller *, s64, s64, int); +void bch2_pd_controller_init(struct bch_pd_controller *); +void bch2_pd_controller_debug_to_text(struct printbuf *, struct bch_pd_controller *); + +#define sysfs_pd_controller_attribute(name) \ + rw_attribute(name##_rate); \ + rw_attribute(name##_rate_bytes); \ + rw_attribute(name##_rate_d_term); \ + rw_attribute(name##_rate_p_term_inverse); \ + read_attribute(name##_rate_debug) + +#define sysfs_pd_controller_files(name) \ + &sysfs_##name##_rate, \ + &sysfs_##name##_rate_bytes, \ + &sysfs_##name##_rate_d_term, \ + &sysfs_##name##_rate_p_term_inverse, \ + &sysfs_##name##_rate_debug + +#define sysfs_pd_controller_show(name, var) \ +do { \ + sysfs_hprint(name##_rate, (var)->rate.rate); \ + sysfs_print(name##_rate_bytes, (var)->rate.rate); \ + sysfs_print(name##_rate_d_term, (var)->d_term); \ + sysfs_print(name##_rate_p_term_inverse, (var)->p_term_inverse); \ + \ + if (attr == &sysfs_##name##_rate_debug) \ + bch2_pd_controller_debug_to_text(out, var); \ +} while (0) + +#define sysfs_pd_controller_store(name, var) \ +do { \ + sysfs_strtoul_clamp(name##_rate, \ + (var)->rate.rate, 1, UINT_MAX); \ + sysfs_strtoul_clamp(name##_rate_bytes, \ + (var)->rate.rate, 1, UINT_MAX); \ + sysfs_strtoul(name##_rate_d_term, (var)->d_term); \ + sysfs_strtoul_clamp(name##_rate_p_term_inverse, \ + (var)->p_term_inverse, 1, INT_MAX); \ +} while (0) + +#define container_of_or_null(ptr, type, member) \ +({ \ + typeof(ptr) _ptr = ptr; \ + _ptr ? container_of(_ptr, type, member) : NULL; \ +}) + +/* Does linear interpolation between powers of two */ +static inline unsigned fract_exp_two(unsigned x, unsigned fract_bits) +{ + unsigned fract = x & ~(~0 << fract_bits); + + x >>= fract_bits; + x = 1 << x; + x += (x * fract) >> fract_bits; + + return x; +} + +void bch2_bio_map(struct bio *bio, void *base, size_t); +int bch2_bio_alloc_pages(struct bio *, size_t, gfp_t); + +static inline sector_t bdev_sectors(struct block_device *bdev) +{ + return bdev->bd_inode->i_size >> 9; +} + +#define closure_bio_submit(bio, cl) \ +do { \ + closure_get(cl); \ + submit_bio(bio); \ +} while (0) + +#define kthread_wait(cond) \ +({ \ + int _ret = 0; \ + \ + while (1) { \ + set_current_state(TASK_INTERRUPTIBLE); \ + if (kthread_should_stop()) { \ + _ret = -1; \ + break; \ + } \ + \ + if (cond) \ + break; \ + \ + schedule(); \ + } \ + set_current_state(TASK_RUNNING); \ + _ret; \ +}) + +#define kthread_wait_freezable(cond) \ +({ \ + int _ret = 0; \ + while (1) { \ + set_current_state(TASK_INTERRUPTIBLE); \ + if (kthread_should_stop()) { \ + _ret = -1; \ + break; \ + } \ + \ + if (cond) \ + break; \ + \ + schedule(); \ + try_to_freeze(); \ + } \ + set_current_state(TASK_RUNNING); \ + _ret; \ +}) + +size_t bch2_rand_range(size_t); + +void memcpy_to_bio(struct bio *, struct bvec_iter, const void *); +void memcpy_from_bio(void *, struct bio *, struct bvec_iter); + +static inline void memcpy_u64s_small(void *dst, const void *src, + unsigned u64s) +{ + u64 *d = dst; + const u64 *s = src; + + while (u64s--) + *d++ = *s++; +} + +static inline void __memcpy_u64s(void *dst, const void *src, + unsigned u64s) +{ +#ifdef CONFIG_X86_64 + long d0, d1, d2; + + asm volatile("rep ; movsq" + : "=&c" (d0), "=&D" (d1), "=&S" (d2) + : "0" (u64s), "1" (dst), "2" (src) + : "memory"); +#else + u64 *d = dst; + const u64 *s = src; + + while (u64s--) + *d++ = *s++; +#endif +} + +static inline void memcpy_u64s(void *dst, const void *src, + unsigned u64s) +{ + EBUG_ON(!(dst >= src + u64s * sizeof(u64) || + dst + u64s * sizeof(u64) <= src)); + + __memcpy_u64s(dst, src, u64s); +} + +static inline void __memmove_u64s_down(void *dst, const void *src, + unsigned u64s) +{ + __memcpy_u64s(dst, src, u64s); +} + +static inline void memmove_u64s_down(void *dst, const void *src, + unsigned u64s) +{ + EBUG_ON(dst > src); + + __memmove_u64s_down(dst, src, u64s); +} + +static inline void __memmove_u64s_down_small(void *dst, const void *src, + unsigned u64s) +{ + memcpy_u64s_small(dst, src, u64s); +} + +static inline void memmove_u64s_down_small(void *dst, const void *src, + unsigned u64s) +{ + EBUG_ON(dst > src); + + __memmove_u64s_down_small(dst, src, u64s); +} + +static inline void __memmove_u64s_up_small(void *_dst, const void *_src, + unsigned u64s) +{ + u64 *dst = (u64 *) _dst + u64s; + u64 *src = (u64 *) _src + u64s; + + while (u64s--) + *--dst = *--src; +} + +static inline void memmove_u64s_up_small(void *dst, const void *src, + unsigned u64s) +{ + EBUG_ON(dst < src); + + __memmove_u64s_up_small(dst, src, u64s); +} + +static inline void __memmove_u64s_up(void *_dst, const void *_src, + unsigned u64s) +{ + u64 *dst = (u64 *) _dst + u64s - 1; + u64 *src = (u64 *) _src + u64s - 1; + +#ifdef CONFIG_X86_64 + long d0, d1, d2; + + asm volatile("std ;\n" + "rep ; movsq\n" + "cld ;\n" + : "=&c" (d0), "=&D" (d1), "=&S" (d2) + : "0" (u64s), "1" (dst), "2" (src) + : "memory"); +#else + while (u64s--) + *dst-- = *src--; +#endif +} + +static inline void memmove_u64s_up(void *dst, const void *src, + unsigned u64s) +{ + EBUG_ON(dst < src); + + __memmove_u64s_up(dst, src, u64s); +} + +static inline void memmove_u64s(void *dst, const void *src, + unsigned u64s) +{ + if (dst < src) + __memmove_u64s_down(dst, src, u64s); + else + __memmove_u64s_up(dst, src, u64s); +} + +/* Set the last few bytes up to a u64 boundary given an offset into a buffer. */ +static inline void memset_u64s_tail(void *s, int c, unsigned bytes) +{ + unsigned rem = round_up(bytes, sizeof(u64)) - bytes; + + memset(s + bytes, c, rem); +} + +void sort_cmp_size(void *base, size_t num, size_t size, + int (*cmp_func)(const void *, const void *, size_t), + void (*swap_func)(void *, void *, size_t)); + +/* just the memmove, doesn't update @_nr */ +#define __array_insert_item(_array, _nr, _pos) \ + memmove(&(_array)[(_pos) + 1], \ + &(_array)[(_pos)], \ + sizeof((_array)[0]) * ((_nr) - (_pos))) + +#define array_insert_item(_array, _nr, _pos, _new_item) \ +do { \ + __array_insert_item(_array, _nr, _pos); \ + (_nr)++; \ + (_array)[(_pos)] = (_new_item); \ +} while (0) + +#define array_remove_items(_array, _nr, _pos, _nr_to_remove) \ +do { \ + (_nr) -= (_nr_to_remove); \ + memmove(&(_array)[(_pos)], \ + &(_array)[(_pos) + (_nr_to_remove)], \ + sizeof((_array)[0]) * ((_nr) - (_pos))); \ +} while (0) + +#define array_remove_item(_array, _nr, _pos) \ + array_remove_items(_array, _nr, _pos, 1) + +static inline void __move_gap(void *array, size_t element_size, + size_t nr, size_t size, + size_t old_gap, size_t new_gap) +{ + size_t gap_end = old_gap + size - nr; + + if (new_gap < old_gap) { + size_t move = old_gap - new_gap; + + memmove(array + element_size * (gap_end - move), + array + element_size * (old_gap - move), + element_size * move); + } else if (new_gap > old_gap) { + size_t move = new_gap - old_gap; + + memmove(array + element_size * old_gap, + array + element_size * gap_end, + element_size * move); + } +} + +/* Move the gap in a gap buffer: */ +#define move_gap(_array, _nr, _size, _old_gap, _new_gap) \ + __move_gap(_array, sizeof(_array[0]), _nr, _size, _old_gap, _new_gap) + +#define bubble_sort(_base, _nr, _cmp) \ +do { \ + ssize_t _i, _last; \ + bool _swapped = true; \ + \ + for (_last= (ssize_t) (_nr) - 1; _last > 0 && _swapped; --_last) {\ + _swapped = false; \ + for (_i = 0; _i < _last; _i++) \ + if (_cmp((_base)[_i], (_base)[_i + 1]) > 0) { \ + swap((_base)[_i], (_base)[_i + 1]); \ + _swapped = true; \ + } \ + } \ +} while (0) + +static inline u64 percpu_u64_get(u64 __percpu *src) +{ + u64 ret = 0; + int cpu; + + for_each_possible_cpu(cpu) + ret += *per_cpu_ptr(src, cpu); + return ret; +} + +static inline void percpu_u64_set(u64 __percpu *dst, u64 src) +{ + int cpu; + + for_each_possible_cpu(cpu) + *per_cpu_ptr(dst, cpu) = 0; + this_cpu_write(*dst, src); +} + +static inline void acc_u64s(u64 *acc, const u64 *src, unsigned nr) +{ + unsigned i; + + for (i = 0; i < nr; i++) + acc[i] += src[i]; +} + +static inline void acc_u64s_percpu(u64 *acc, const u64 __percpu *src, + unsigned nr) +{ + int cpu; + + for_each_possible_cpu(cpu) + acc_u64s(acc, per_cpu_ptr(src, cpu), nr); +} + +static inline void percpu_memset(void __percpu *p, int c, size_t bytes) +{ + int cpu; + + for_each_possible_cpu(cpu) + memset(per_cpu_ptr(p, cpu), c, bytes); +} + +u64 *bch2_acc_percpu_u64s(u64 __percpu *, unsigned); + +#define cmp_int(l, r) ((l > r) - (l < r)) + +static inline int u8_cmp(u8 l, u8 r) +{ + return cmp_int(l, r); +} + +static inline int cmp_le32(__le32 l, __le32 r) +{ + return cmp_int(le32_to_cpu(l), le32_to_cpu(r)); +} + +#include <linux/uuid.h> + +#endif /* _BCACHEFS_UTIL_H */ diff --git a/fs/bcachefs/varint.c b/fs/bcachefs/varint.c new file mode 100644 index 000000000000..cb4f33ed9ab3 --- /dev/null +++ b/fs/bcachefs/varint.c @@ -0,0 +1,129 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <linux/bitops.h> +#include <linux/math.h> +#include <linux/string.h> +#include <asm/unaligned.h> + +#ifdef CONFIG_VALGRIND +#include <valgrind/memcheck.h> +#endif + +#include "varint.h" + +/** + * bch2_varint_encode - encode a variable length integer + * @out: destination to encode to + * @v: unsigned integer to encode + * Returns: size in bytes of the encoded integer - at most 9 bytes + */ +int bch2_varint_encode(u8 *out, u64 v) +{ + unsigned bits = fls64(v|1); + unsigned bytes = DIV_ROUND_UP(bits, 7); + __le64 v_le; + + if (likely(bytes < 9)) { + v <<= bytes; + v |= ~(~0 << (bytes - 1)); + v_le = cpu_to_le64(v); + memcpy(out, &v_le, bytes); + } else { + *out++ = 255; + bytes = 9; + put_unaligned_le64(v, out); + } + + return bytes; +} + +/** + * bch2_varint_decode - encode a variable length integer + * @in: varint to decode + * @end: end of buffer to decode from + * @out: on success, decoded integer + * Returns: size in bytes of the decoded integer - or -1 on failure (would + * have read past the end of the buffer) + */ +int bch2_varint_decode(const u8 *in, const u8 *end, u64 *out) +{ + unsigned bytes = likely(in < end) + ? ffz(*in & 255) + 1 + : 1; + u64 v; + + if (unlikely(in + bytes > end)) + return -1; + + if (likely(bytes < 9)) { + __le64 v_le = 0; + + memcpy(&v_le, in, bytes); + v = le64_to_cpu(v_le); + v >>= bytes; + } else { + v = get_unaligned_le64(++in); + } + + *out = v; + return bytes; +} + +/** + * bch2_varint_encode_fast - fast version of bch2_varint_encode + * @out: destination to encode to + * @v: unsigned integer to encode + * Returns: size in bytes of the encoded integer - at most 9 bytes + * + * This version assumes it's always safe to write 8 bytes to @out, even if the + * encoded integer would be smaller. + */ +int bch2_varint_encode_fast(u8 *out, u64 v) +{ + unsigned bits = fls64(v|1); + unsigned bytes = DIV_ROUND_UP(bits, 7); + + if (likely(bytes < 9)) { + v <<= bytes; + v |= ~(~0 << (bytes - 1)); + } else { + *out++ = 255; + bytes = 9; + } + + put_unaligned_le64(v, out); + return bytes; +} + +/** + * bch2_varint_decode_fast - fast version of bch2_varint_decode + * @in: varint to decode + * @end: end of buffer to decode from + * @out: on success, decoded integer + * Returns: size in bytes of the decoded integer - or -1 on failure (would + * have read past the end of the buffer) + * + * This version assumes that it is safe to read at most 8 bytes past the end of + * @end (we still return an error if the varint extends past @end). + */ +int bch2_varint_decode_fast(const u8 *in, const u8 *end, u64 *out) +{ +#ifdef CONFIG_VALGRIND + VALGRIND_MAKE_MEM_DEFINED(in, 8); +#endif + u64 v = get_unaligned_le64(in); + unsigned bytes = ffz(*in) + 1; + + if (unlikely(in + bytes > end)) + return -1; + + if (likely(bytes < 9)) { + v >>= bytes; + v &= ~(~0ULL << (7 * bytes)); + } else { + v = get_unaligned_le64(++in); + } + + *out = v; + return bytes; +} diff --git a/fs/bcachefs/varint.h b/fs/bcachefs/varint.h new file mode 100644 index 000000000000..92a182fb3d7a --- /dev/null +++ b/fs/bcachefs/varint.h @@ -0,0 +1,11 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_VARINT_H +#define _BCACHEFS_VARINT_H + +int bch2_varint_encode(u8 *, u64); +int bch2_varint_decode(const u8 *, const u8 *, u64 *); + +int bch2_varint_encode_fast(u8 *, u64); +int bch2_varint_decode_fast(const u8 *, const u8 *, u64 *); + +#endif /* _BCACHEFS_VARINT_H */ diff --git a/fs/bcachefs/vstructs.h b/fs/bcachefs/vstructs.h new file mode 100644 index 000000000000..a6561b4b36a6 --- /dev/null +++ b/fs/bcachefs/vstructs.h @@ -0,0 +1,63 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _VSTRUCTS_H +#define _VSTRUCTS_H + +#include "util.h" + +/* + * NOTE: we can't differentiate between __le64 and u64 with type_is - this + * assumes u64 is little endian: + */ +#define __vstruct_u64s(_s) \ +({ \ + ( type_is((_s)->u64s, u64) ? le64_to_cpu((__force __le64) (_s)->u64s) \ + : type_is((_s)->u64s, u32) ? le32_to_cpu((__force __le32) (_s)->u64s) \ + : type_is((_s)->u64s, u16) ? le16_to_cpu((__force __le16) (_s)->u64s) \ + : ((__force u8) ((_s)->u64s))); \ +}) + +#define __vstruct_bytes(_type, _u64s) \ +({ \ + BUILD_BUG_ON(offsetof(_type, _data) % sizeof(u64)); \ + \ + (size_t) (offsetof(_type, _data) + (_u64s) * sizeof(u64)); \ +}) + +#define vstruct_bytes(_s) \ + __vstruct_bytes(typeof(*(_s)), __vstruct_u64s(_s)) + +#define __vstruct_blocks(_type, _sector_block_bits, _u64s) \ + (round_up(__vstruct_bytes(_type, _u64s), \ + 512 << (_sector_block_bits)) >> (9 + (_sector_block_bits))) + +#define vstruct_blocks(_s, _sector_block_bits) \ + __vstruct_blocks(typeof(*(_s)), _sector_block_bits, __vstruct_u64s(_s)) + +#define vstruct_blocks_plus(_s, _sector_block_bits, _u64s) \ + __vstruct_blocks(typeof(*(_s)), _sector_block_bits, \ + __vstruct_u64s(_s) + (_u64s)) + +#define vstruct_sectors(_s, _sector_block_bits) \ + (round_up(vstruct_bytes(_s), 512 << (_sector_block_bits)) >> 9) + +#define vstruct_next(_s) \ + ((typeof(_s)) ((u64 *) (_s)->_data + __vstruct_u64s(_s))) +#define vstruct_last(_s) \ + ((typeof(&(_s)->start[0])) ((u64 *) (_s)->_data + __vstruct_u64s(_s))) +#define vstruct_end(_s) \ + ((void *) ((u64 *) (_s)->_data + __vstruct_u64s(_s))) + +#define vstruct_for_each(_s, _i) \ + for (_i = (_s)->start; \ + _i < vstruct_last(_s); \ + _i = vstruct_next(_i)) + +#define vstruct_for_each_safe(_s, _i, _t) \ + for (_i = (_s)->start; \ + _i < vstruct_last(_s) && (_t = vstruct_next(_i), true); \ + _i = _t) + +#define vstruct_idx(_s, _idx) \ + ((typeof(&(_s)->start[0])) ((_s)->_data + (_idx))) + +#endif /* _VSTRUCTS_H */ diff --git a/fs/bcachefs/xattr.c b/fs/bcachefs/xattr.c new file mode 100644 index 000000000000..79d982674c18 --- /dev/null +++ b/fs/bcachefs/xattr.c @@ -0,0 +1,652 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" +#include "acl.h" +#include "bkey_methods.h" +#include "btree_update.h" +#include "extents.h" +#include "fs.h" +#include "rebalance.h" +#include "str_hash.h" +#include "xattr.h" + +#include <linux/dcache.h> +#include <linux/posix_acl_xattr.h> +#include <linux/xattr.h> + +static const struct xattr_handler *bch2_xattr_type_to_handler(unsigned); + +static u64 bch2_xattr_hash(const struct bch_hash_info *info, + const struct xattr_search_key *key) +{ + struct bch_str_hash_ctx ctx; + + bch2_str_hash_init(&ctx, info); + bch2_str_hash_update(&ctx, info, &key->type, sizeof(key->type)); + bch2_str_hash_update(&ctx, info, key->name.name, key->name.len); + + return bch2_str_hash_end(&ctx, info); +} + +static u64 xattr_hash_key(const struct bch_hash_info *info, const void *key) +{ + return bch2_xattr_hash(info, key); +} + +static u64 xattr_hash_bkey(const struct bch_hash_info *info, struct bkey_s_c k) +{ + struct bkey_s_c_xattr x = bkey_s_c_to_xattr(k); + + return bch2_xattr_hash(info, + &X_SEARCH(x.v->x_type, x.v->x_name, x.v->x_name_len)); +} + +static bool xattr_cmp_key(struct bkey_s_c _l, const void *_r) +{ + struct bkey_s_c_xattr l = bkey_s_c_to_xattr(_l); + const struct xattr_search_key *r = _r; + + return l.v->x_type != r->type || + l.v->x_name_len != r->name.len || + memcmp(l.v->x_name, r->name.name, r->name.len); +} + +static bool xattr_cmp_bkey(struct bkey_s_c _l, struct bkey_s_c _r) +{ + struct bkey_s_c_xattr l = bkey_s_c_to_xattr(_l); + struct bkey_s_c_xattr r = bkey_s_c_to_xattr(_r); + + return l.v->x_type != r.v->x_type || + l.v->x_name_len != r.v->x_name_len || + memcmp(l.v->x_name, r.v->x_name, r.v->x_name_len); +} + +const struct bch_hash_desc bch2_xattr_hash_desc = { + .btree_id = BTREE_ID_xattrs, + .key_type = KEY_TYPE_xattr, + .hash_key = xattr_hash_key, + .hash_bkey = xattr_hash_bkey, + .cmp_key = xattr_cmp_key, + .cmp_bkey = xattr_cmp_bkey, +}; + +int bch2_xattr_invalid(struct bch_fs *c, struct bkey_s_c k, + enum bkey_invalid_flags flags, + struct printbuf *err) +{ + struct bkey_s_c_xattr xattr = bkey_s_c_to_xattr(k); + unsigned val_u64s = xattr_val_u64s(xattr.v->x_name_len, + le16_to_cpu(xattr.v->x_val_len)); + int ret = 0; + + bkey_fsck_err_on(bkey_val_u64s(k.k) < val_u64s, c, err, + xattr_val_size_too_small, + "value too small (%zu < %u)", + bkey_val_u64s(k.k), val_u64s); + + /* XXX why +4 ? */ + val_u64s = xattr_val_u64s(xattr.v->x_name_len, + le16_to_cpu(xattr.v->x_val_len) + 4); + + bkey_fsck_err_on(bkey_val_u64s(k.k) > val_u64s, c, err, + xattr_val_size_too_big, + "value too big (%zu > %u)", + bkey_val_u64s(k.k), val_u64s); + + bkey_fsck_err_on(!bch2_xattr_type_to_handler(xattr.v->x_type), c, err, + xattr_invalid_type, + "invalid type (%u)", xattr.v->x_type); + + bkey_fsck_err_on(memchr(xattr.v->x_name, '\0', xattr.v->x_name_len), c, err, + xattr_name_invalid_chars, + "xattr name has invalid characters"); +fsck_err: + return ret; +} + +void bch2_xattr_to_text(struct printbuf *out, struct bch_fs *c, + struct bkey_s_c k) +{ + const struct xattr_handler *handler; + struct bkey_s_c_xattr xattr = bkey_s_c_to_xattr(k); + + handler = bch2_xattr_type_to_handler(xattr.v->x_type); + if (handler && handler->prefix) + prt_printf(out, "%s", handler->prefix); + else if (handler) + prt_printf(out, "(type %u)", xattr.v->x_type); + else + prt_printf(out, "(unknown type %u)", xattr.v->x_type); + + prt_printf(out, "%.*s:%.*s", + xattr.v->x_name_len, + xattr.v->x_name, + le16_to_cpu(xattr.v->x_val_len), + (char *) xattr_val(xattr.v)); + + if (xattr.v->x_type == KEY_TYPE_XATTR_INDEX_POSIX_ACL_ACCESS || + xattr.v->x_type == KEY_TYPE_XATTR_INDEX_POSIX_ACL_DEFAULT) { + prt_char(out, ' '); + bch2_acl_to_text(out, xattr_val(xattr.v), + le16_to_cpu(xattr.v->x_val_len)); + } +} + +static int bch2_xattr_get_trans(struct btree_trans *trans, struct bch_inode_info *inode, + const char *name, void *buffer, size_t size, int type) +{ + struct bch_hash_info hash = bch2_hash_info_init(trans->c, &inode->ei_inode); + struct xattr_search_key search = X_SEARCH(type, name, strlen(name)); + struct btree_iter iter; + struct bkey_s_c_xattr xattr; + struct bkey_s_c k; + int ret; + + ret = bch2_hash_lookup(trans, &iter, bch2_xattr_hash_desc, &hash, + inode_inum(inode), &search, 0); + if (ret) + goto err1; + + k = bch2_btree_iter_peek_slot(&iter); + ret = bkey_err(k); + if (ret) + goto err2; + + xattr = bkey_s_c_to_xattr(k); + ret = le16_to_cpu(xattr.v->x_val_len); + if (buffer) { + if (ret > size) + ret = -ERANGE; + else + memcpy(buffer, xattr_val(xattr.v), ret); + } +err2: + bch2_trans_iter_exit(trans, &iter); +err1: + return ret < 0 && bch2_err_matches(ret, ENOENT) ? -ENODATA : ret; +} + +int bch2_xattr_set(struct btree_trans *trans, subvol_inum inum, + struct bch_inode_unpacked *inode_u, + const struct bch_hash_info *hash_info, + const char *name, const void *value, size_t size, + int type, int flags) +{ + struct bch_fs *c = trans->c; + struct btree_iter inode_iter = { NULL }; + int ret; + + ret = bch2_inode_peek(trans, &inode_iter, inode_u, inum, BTREE_ITER_INTENT); + if (ret) + return ret; + + inode_u->bi_ctime = bch2_current_time(c); + + ret = bch2_inode_write(trans, &inode_iter, inode_u); + bch2_trans_iter_exit(trans, &inode_iter); + + if (ret) + return ret; + + if (value) { + struct bkey_i_xattr *xattr; + unsigned namelen = strlen(name); + unsigned u64s = BKEY_U64s + + xattr_val_u64s(namelen, size); + + if (u64s > U8_MAX) + return -ERANGE; + + xattr = bch2_trans_kmalloc(trans, u64s * sizeof(u64)); + if (IS_ERR(xattr)) + return PTR_ERR(xattr); + + bkey_xattr_init(&xattr->k_i); + xattr->k.u64s = u64s; + xattr->v.x_type = type; + xattr->v.x_name_len = namelen; + xattr->v.x_val_len = cpu_to_le16(size); + memcpy(xattr->v.x_name, name, namelen); + memcpy(xattr_val(&xattr->v), value, size); + + ret = bch2_hash_set(trans, bch2_xattr_hash_desc, hash_info, + inum, &xattr->k_i, + (flags & XATTR_CREATE ? BCH_HASH_SET_MUST_CREATE : 0)| + (flags & XATTR_REPLACE ? BCH_HASH_SET_MUST_REPLACE : 0)); + } else { + struct xattr_search_key search = + X_SEARCH(type, name, strlen(name)); + + ret = bch2_hash_delete(trans, bch2_xattr_hash_desc, + hash_info, inum, &search); + } + + if (bch2_err_matches(ret, ENOENT)) + ret = flags & XATTR_REPLACE ? -ENODATA : 0; + + return ret; +} + +struct xattr_buf { + char *buf; + size_t len; + size_t used; +}; + +static int __bch2_xattr_emit(const char *prefix, + const char *name, size_t name_len, + struct xattr_buf *buf) +{ + const size_t prefix_len = strlen(prefix); + const size_t total_len = prefix_len + name_len + 1; + + if (buf->buf) { + if (buf->used + total_len > buf->len) + return -ERANGE; + + memcpy(buf->buf + buf->used, prefix, prefix_len); + memcpy(buf->buf + buf->used + prefix_len, + name, name_len); + buf->buf[buf->used + prefix_len + name_len] = '\0'; + } + + buf->used += total_len; + return 0; +} + +static int bch2_xattr_emit(struct dentry *dentry, + const struct bch_xattr *xattr, + struct xattr_buf *buf) +{ + const struct xattr_handler *handler = + bch2_xattr_type_to_handler(xattr->x_type); + + return handler && (!handler->list || handler->list(dentry)) + ? __bch2_xattr_emit(handler->prefix ?: handler->name, + xattr->x_name, xattr->x_name_len, buf) + : 0; +} + +static int bch2_xattr_list_bcachefs(struct bch_fs *c, + struct bch_inode_unpacked *inode, + struct xattr_buf *buf, + bool all) +{ + const char *prefix = all ? "bcachefs_effective." : "bcachefs."; + unsigned id; + int ret = 0; + u64 v; + + for (id = 0; id < Inode_opt_nr; id++) { + v = bch2_inode_opt_get(inode, id); + if (!v) + continue; + + if (!all && + !(inode->bi_fields_set & (1 << id))) + continue; + + ret = __bch2_xattr_emit(prefix, bch2_inode_opts[id], + strlen(bch2_inode_opts[id]), buf); + if (ret) + break; + } + + return ret; +} + +ssize_t bch2_xattr_list(struct dentry *dentry, char *buffer, size_t buffer_size) +{ + struct bch_fs *c = dentry->d_sb->s_fs_info; + struct bch_inode_info *inode = to_bch_ei(dentry->d_inode); + struct btree_trans *trans = bch2_trans_get(c); + struct btree_iter iter; + struct bkey_s_c k; + struct xattr_buf buf = { .buf = buffer, .len = buffer_size }; + u64 offset = 0, inum = inode->ei_inode.bi_inum; + u32 snapshot; + int ret; +retry: + bch2_trans_begin(trans); + iter = (struct btree_iter) { NULL }; + + ret = bch2_subvolume_get_snapshot(trans, inode->ei_subvol, &snapshot); + if (ret) + goto err; + + for_each_btree_key_upto_norestart(trans, iter, BTREE_ID_xattrs, + SPOS(inum, offset, snapshot), + POS(inum, U64_MAX), 0, k, ret) { + if (k.k->type != KEY_TYPE_xattr) + continue; + + ret = bch2_xattr_emit(dentry, bkey_s_c_to_xattr(k).v, &buf); + if (ret) + break; + } + + offset = iter.pos.offset; + bch2_trans_iter_exit(trans, &iter); +err: + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + goto retry; + + bch2_trans_put(trans); + + if (ret) + goto out; + + ret = bch2_xattr_list_bcachefs(c, &inode->ei_inode, &buf, false); + if (ret) + goto out; + + ret = bch2_xattr_list_bcachefs(c, &inode->ei_inode, &buf, true); + if (ret) + goto out; + + return buf.used; +out: + return bch2_err_class(ret); +} + +static int bch2_xattr_get_handler(const struct xattr_handler *handler, + struct dentry *dentry, struct inode *vinode, + const char *name, void *buffer, size_t size) +{ + struct bch_inode_info *inode = to_bch_ei(vinode); + struct bch_fs *c = inode->v.i_sb->s_fs_info; + int ret = bch2_trans_do(c, NULL, NULL, 0, + bch2_xattr_get_trans(trans, inode, name, buffer, size, handler->flags)); + + return bch2_err_class(ret); +} + +static int bch2_xattr_set_handler(const struct xattr_handler *handler, + struct mnt_idmap *idmap, + struct dentry *dentry, struct inode *vinode, + const char *name, const void *value, + size_t size, int flags) +{ + struct bch_inode_info *inode = to_bch_ei(vinode); + struct bch_fs *c = inode->v.i_sb->s_fs_info; + struct bch_hash_info hash = bch2_hash_info_init(c, &inode->ei_inode); + struct bch_inode_unpacked inode_u; + int ret; + + ret = bch2_trans_run(c, + commit_do(trans, NULL, NULL, 0, + bch2_xattr_set(trans, inode_inum(inode), &inode_u, + &hash, name, value, size, + handler->flags, flags)) ?: + (bch2_inode_update_after_write(trans, inode, &inode_u, ATTR_CTIME), 0)); + + return bch2_err_class(ret); +} + +static const struct xattr_handler bch_xattr_user_handler = { + .prefix = XATTR_USER_PREFIX, + .get = bch2_xattr_get_handler, + .set = bch2_xattr_set_handler, + .flags = KEY_TYPE_XATTR_INDEX_USER, +}; + +static bool bch2_xattr_trusted_list(struct dentry *dentry) +{ + return capable(CAP_SYS_ADMIN); +} + +static const struct xattr_handler bch_xattr_trusted_handler = { + .prefix = XATTR_TRUSTED_PREFIX, + .list = bch2_xattr_trusted_list, + .get = bch2_xattr_get_handler, + .set = bch2_xattr_set_handler, + .flags = KEY_TYPE_XATTR_INDEX_TRUSTED, +}; + +static const struct xattr_handler bch_xattr_security_handler = { + .prefix = XATTR_SECURITY_PREFIX, + .get = bch2_xattr_get_handler, + .set = bch2_xattr_set_handler, + .flags = KEY_TYPE_XATTR_INDEX_SECURITY, +}; + +#ifndef NO_BCACHEFS_FS + +static int opt_to_inode_opt(int id) +{ + switch (id) { +#define x(name, ...) \ + case Opt_##name: return Inode_opt_##name; + BCH_INODE_OPTS() +#undef x + default: + return -1; + } +} + +static int __bch2_xattr_bcachefs_get(const struct xattr_handler *handler, + struct dentry *dentry, struct inode *vinode, + const char *name, void *buffer, size_t size, + bool all) +{ + struct bch_inode_info *inode = to_bch_ei(vinode); + struct bch_fs *c = inode->v.i_sb->s_fs_info; + struct bch_opts opts = + bch2_inode_opts_to_opts(&inode->ei_inode); + const struct bch_option *opt; + int id, inode_opt_id; + struct printbuf out = PRINTBUF; + int ret; + u64 v; + + id = bch2_opt_lookup(name); + if (id < 0 || !bch2_opt_is_inode_opt(id)) + return -EINVAL; + + inode_opt_id = opt_to_inode_opt(id); + if (inode_opt_id < 0) + return -EINVAL; + + opt = bch2_opt_table + id; + + if (!bch2_opt_defined_by_id(&opts, id)) + return -ENODATA; + + if (!all && + !(inode->ei_inode.bi_fields_set & (1 << inode_opt_id))) + return -ENODATA; + + v = bch2_opt_get_by_id(&opts, id); + bch2_opt_to_text(&out, c, c->disk_sb.sb, opt, v, 0); + + ret = out.pos; + + if (out.allocation_failure) { + ret = -ENOMEM; + } else if (buffer) { + if (out.pos > size) + ret = -ERANGE; + else + memcpy(buffer, out.buf, out.pos); + } + + printbuf_exit(&out); + return ret; +} + +static int bch2_xattr_bcachefs_get(const struct xattr_handler *handler, + struct dentry *dentry, struct inode *vinode, + const char *name, void *buffer, size_t size) +{ + return __bch2_xattr_bcachefs_get(handler, dentry, vinode, + name, buffer, size, false); +} + +struct inode_opt_set { + int id; + u64 v; + bool defined; +}; + +static int inode_opt_set_fn(struct btree_trans *trans, + struct bch_inode_info *inode, + struct bch_inode_unpacked *bi, + void *p) +{ + struct inode_opt_set *s = p; + + if (s->defined) + bi->bi_fields_set |= 1U << s->id; + else + bi->bi_fields_set &= ~(1U << s->id); + + bch2_inode_opt_set(bi, s->id, s->v); + + return 0; +} + +static int bch2_xattr_bcachefs_set(const struct xattr_handler *handler, + struct mnt_idmap *idmap, + struct dentry *dentry, struct inode *vinode, + const char *name, const void *value, + size_t size, int flags) +{ + struct bch_inode_info *inode = to_bch_ei(vinode); + struct bch_fs *c = inode->v.i_sb->s_fs_info; + const struct bch_option *opt; + char *buf; + struct inode_opt_set s; + int opt_id, inode_opt_id, ret; + + opt_id = bch2_opt_lookup(name); + if (opt_id < 0) + return -EINVAL; + + opt = bch2_opt_table + opt_id; + + inode_opt_id = opt_to_inode_opt(opt_id); + if (inode_opt_id < 0) + return -EINVAL; + + s.id = inode_opt_id; + + if (value) { + u64 v = 0; + + buf = kmalloc(size + 1, GFP_KERNEL); + if (!buf) + return -ENOMEM; + memcpy(buf, value, size); + buf[size] = '\0'; + + ret = bch2_opt_parse(c, opt, buf, &v, NULL); + kfree(buf); + + if (ret < 0) + return ret; + + ret = bch2_opt_check_may_set(c, opt_id, v); + if (ret < 0) + return ret; + + s.v = v + 1; + s.defined = true; + } else { + /* + * Check if this option was set on the parent - if so, switched + * back to inheriting from the parent: + * + * rename() also has to deal with keeping inherited options up + * to date - see bch2_reinherit_attrs() + */ + spin_lock(&dentry->d_lock); + if (!IS_ROOT(dentry)) { + struct bch_inode_info *dir = + to_bch_ei(d_inode(dentry->d_parent)); + + s.v = bch2_inode_opt_get(&dir->ei_inode, inode_opt_id); + } else { + s.v = 0; + } + spin_unlock(&dentry->d_lock); + + s.defined = false; + } + + mutex_lock(&inode->ei_update_lock); + if (inode_opt_id == Inode_opt_project) { + /* + * inode fields accessible via the xattr interface are stored + * with a +1 bias, so that 0 means unset: + */ + ret = bch2_set_projid(c, inode, s.v ? s.v - 1 : 0); + if (ret) + goto err; + } + + ret = bch2_write_inode(c, inode, inode_opt_set_fn, &s, 0); +err: + mutex_unlock(&inode->ei_update_lock); + + if (value && + (opt_id == Opt_background_compression || + opt_id == Opt_background_target)) + bch2_set_rebalance_needs_scan(c, inode->ei_inode.bi_inum); + + return bch2_err_class(ret); +} + +static const struct xattr_handler bch_xattr_bcachefs_handler = { + .prefix = "bcachefs.", + .get = bch2_xattr_bcachefs_get, + .set = bch2_xattr_bcachefs_set, +}; + +static int bch2_xattr_bcachefs_get_effective( + const struct xattr_handler *handler, + struct dentry *dentry, struct inode *vinode, + const char *name, void *buffer, size_t size) +{ + return __bch2_xattr_bcachefs_get(handler, dentry, vinode, + name, buffer, size, true); +} + +static const struct xattr_handler bch_xattr_bcachefs_effective_handler = { + .prefix = "bcachefs_effective.", + .get = bch2_xattr_bcachefs_get_effective, + .set = bch2_xattr_bcachefs_set, +}; + +#endif /* NO_BCACHEFS_FS */ + +const struct xattr_handler *bch2_xattr_handlers[] = { + &bch_xattr_user_handler, +#ifdef CONFIG_BCACHEFS_POSIX_ACL + &nop_posix_acl_access, + &nop_posix_acl_default, +#endif + &bch_xattr_trusted_handler, + &bch_xattr_security_handler, +#ifndef NO_BCACHEFS_FS + &bch_xattr_bcachefs_handler, + &bch_xattr_bcachefs_effective_handler, +#endif + NULL +}; + +static const struct xattr_handler *bch_xattr_handler_map[] = { + [KEY_TYPE_XATTR_INDEX_USER] = &bch_xattr_user_handler, + [KEY_TYPE_XATTR_INDEX_POSIX_ACL_ACCESS] = + &nop_posix_acl_access, + [KEY_TYPE_XATTR_INDEX_POSIX_ACL_DEFAULT] = + &nop_posix_acl_default, + [KEY_TYPE_XATTR_INDEX_TRUSTED] = &bch_xattr_trusted_handler, + [KEY_TYPE_XATTR_INDEX_SECURITY] = &bch_xattr_security_handler, +}; + +static const struct xattr_handler *bch2_xattr_type_to_handler(unsigned type) +{ + return type < ARRAY_SIZE(bch_xattr_handler_map) + ? bch_xattr_handler_map[type] + : NULL; +} diff --git a/fs/bcachefs/xattr.h b/fs/bcachefs/xattr.h new file mode 100644 index 000000000000..1337f31a5c49 --- /dev/null +++ b/fs/bcachefs/xattr.h @@ -0,0 +1,50 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_XATTR_H +#define _BCACHEFS_XATTR_H + +#include "str_hash.h" + +extern const struct bch_hash_desc bch2_xattr_hash_desc; + +int bch2_xattr_invalid(struct bch_fs *, struct bkey_s_c, + enum bkey_invalid_flags, struct printbuf *); +void bch2_xattr_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); + +#define bch2_bkey_ops_xattr ((struct bkey_ops) { \ + .key_invalid = bch2_xattr_invalid, \ + .val_to_text = bch2_xattr_to_text, \ + .min_val_size = 8, \ +}) + +static inline unsigned xattr_val_u64s(unsigned name_len, unsigned val_len) +{ + return DIV_ROUND_UP(offsetof(struct bch_xattr, x_name) + + name_len + val_len, sizeof(u64)); +} + +#define xattr_val(_xattr) \ + ((void *) (_xattr)->x_name + (_xattr)->x_name_len) + +struct xattr_search_key { + u8 type; + struct qstr name; +}; + +#define X_SEARCH(_type, _name, _len) ((struct xattr_search_key) \ + { .type = _type, .name = QSTR_INIT(_name, _len) }) + +struct dentry; +struct xattr_handler; +struct bch_hash_info; +struct bch_inode_info; + +/* Exported for cmd_migrate.c in tools: */ +int bch2_xattr_set(struct btree_trans *, subvol_inum, + struct bch_inode_unpacked *, const struct bch_hash_info *, + const char *, const void *, size_t, int, int); + +ssize_t bch2_xattr_list(struct dentry *, char *, size_t); + +extern const struct xattr_handler *bch2_xattr_handlers[]; + +#endif /* _BCACHEFS_XATTR_H */ diff --git a/fs/befs/Kconfig b/fs/befs/Kconfig index 9550b6462b81..5fcfc4024ffe 100644 --- a/fs/befs/Kconfig +++ b/fs/befs/Kconfig @@ -2,6 +2,7 @@ config BEFS_FS tristate "BeOS file system (BeFS) support (read only)" depends on BLOCK + select BUFFER_HEAD select NLS help The BeOS File System (BeFS) is the native file system of Be, Inc's diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c index eee9237386e2..a93d76df8ed8 100644 --- a/fs/befs/linuxvfs.c +++ b/fs/befs/linuxvfs.c @@ -96,6 +96,7 @@ static const struct address_space_operations befs_symlink_aops = { }; static const struct export_operations befs_export_operations = { + .encode_fh = generic_encode_ino32_fh, .fh_to_dentry = befs_fh_to_dentry, .fh_to_parent = befs_fh_to_parent, .get_parent = befs_get_parent, @@ -360,11 +361,11 @@ static struct inode *befs_iget(struct super_block *sb, unsigned long ino) * for indexing purposes. (PFD, page 54) */ - inode->i_mtime.tv_sec = - fs64_to_cpu(sb, raw_inode->last_modified_time) >> 16; - inode->i_mtime.tv_nsec = 0; /* lower 16 bits are not a time */ - inode->i_ctime = inode->i_mtime; - inode->i_atime = inode->i_mtime; + inode_set_mtime(inode, + fs64_to_cpu(sb, raw_inode->last_modified_time) >> 16, + 0);/* lower 16 bits are not a time */ + inode_set_ctime_to_ts(inode, inode_get_mtime(inode)); + inode_set_atime_to_ts(inode, inode_get_mtime(inode)); befs_ino->i_inode_num = fsrun_to_cpu(sb, raw_inode->inode_num); befs_ino->i_parent = fsrun_to_cpu(sb, raw_inode->parent); diff --git a/fs/bfs/Kconfig b/fs/bfs/Kconfig index 3a757805b585..8e7ef866b62a 100644 --- a/fs/bfs/Kconfig +++ b/fs/bfs/Kconfig @@ -2,6 +2,7 @@ config BFS_FS tristate "BFS file system support" depends on BLOCK + select BUFFER_HEAD help Boot File System (BFS) is a file system used under SCO UnixWare to allow the bootloader access to the kernel image and other important diff --git a/fs/bfs/dir.c b/fs/bfs/dir.c index 040d5140e426..fbc4ae80a4b2 100644 --- a/fs/bfs/dir.c +++ b/fs/bfs/dir.c @@ -97,7 +97,7 @@ static int bfs_create(struct mnt_idmap *idmap, struct inode *dir, set_bit(ino, info->si_imap); info->si_freei--; inode_init_owner(&nop_mnt_idmap, inode, dir, mode); - inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode); + simple_inode_init_ts(inode); inode->i_blocks = 0; inode->i_op = &bfs_file_inops; inode->i_fop = &bfs_file_operations; @@ -158,7 +158,7 @@ static int bfs_link(struct dentry *old, struct inode *dir, return err; } inc_nlink(inode); - inode->i_ctime = current_time(inode); + inode_set_ctime_current(inode); mark_inode_dirty(inode); ihold(inode); d_instantiate(new, inode); @@ -187,9 +187,9 @@ static int bfs_unlink(struct inode *dir, struct dentry *dentry) } de->ino = 0; mark_buffer_dirty_inode(bh, dir); - dir->i_ctime = dir->i_mtime = current_time(dir); + inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir)); mark_inode_dirty(dir); - inode->i_ctime = dir->i_ctime; + inode_set_ctime_to_ts(inode, inode_get_ctime(dir)); inode_dec_link_count(inode); error = 0; @@ -240,10 +240,10 @@ static int bfs_rename(struct mnt_idmap *idmap, struct inode *old_dir, goto end_rename; } old_de->ino = 0; - old_dir->i_ctime = old_dir->i_mtime = current_time(old_dir); + inode_set_mtime_to_ts(old_dir, inode_set_ctime_current(old_dir)); mark_inode_dirty(old_dir); if (new_inode) { - new_inode->i_ctime = current_time(new_inode); + inode_set_ctime_current(new_inode); inode_dec_link_count(new_inode); } mark_buffer_dirty_inode(old_bh, old_dir); @@ -292,9 +292,10 @@ static int bfs_add_entry(struct inode *dir, const struct qstr *child, int ino) pos = (block - sblock) * BFS_BSIZE + off; if (pos >= dir->i_size) { dir->i_size += BFS_DIRENT_SIZE; - dir->i_ctime = current_time(dir); + inode_set_ctime_current(dir); } - dir->i_mtime = current_time(dir); + inode_set_mtime_to_ts(dir, + inode_set_ctime_current(dir)); mark_inode_dirty(dir); de->ino = cpu_to_le16((u16)ino); for (i = 0; i < BFS_NAMELEN; i++) diff --git a/fs/bfs/inode.c b/fs/bfs/inode.c index 1926bec2c850..355957dbce39 100644 --- a/fs/bfs/inode.c +++ b/fs/bfs/inode.c @@ -80,12 +80,9 @@ struct inode *bfs_iget(struct super_block *sb, unsigned long ino) set_nlink(inode, le32_to_cpu(di->i_nlink)); inode->i_size = BFS_FILESIZE(di); inode->i_blocks = BFS_FILEBLOCKS(di); - inode->i_atime.tv_sec = le32_to_cpu(di->i_atime); - inode->i_mtime.tv_sec = le32_to_cpu(di->i_mtime); - inode->i_ctime.tv_sec = le32_to_cpu(di->i_ctime); - inode->i_atime.tv_nsec = 0; - inode->i_mtime.tv_nsec = 0; - inode->i_ctime.tv_nsec = 0; + inode_set_atime(inode, le32_to_cpu(di->i_atime), 0); + inode_set_mtime(inode, le32_to_cpu(di->i_mtime), 0); + inode_set_ctime(inode, le32_to_cpu(di->i_ctime), 0); brelse(bh); unlock_new_inode(inode); @@ -141,9 +138,9 @@ static int bfs_write_inode(struct inode *inode, struct writeback_control *wbc) di->i_uid = cpu_to_le32(i_uid_read(inode)); di->i_gid = cpu_to_le32(i_gid_read(inode)); di->i_nlink = cpu_to_le32(inode->i_nlink); - di->i_atime = cpu_to_le32(inode->i_atime.tv_sec); - di->i_mtime = cpu_to_le32(inode->i_mtime.tv_sec); - di->i_ctime = cpu_to_le32(inode->i_ctime.tv_sec); + di->i_atime = cpu_to_le32(inode_get_atime_sec(inode)); + di->i_mtime = cpu_to_le32(inode_get_mtime_sec(inode)); + di->i_ctime = cpu_to_le32(inode_get_ctime_sec(inode)); i_sblock = BFS_I(inode)->i_sblock; di->i_sblock = cpu_to_le32(i_sblock); di->i_eblock = cpu_to_le32(BFS_I(inode)->i_eblock); diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 7b3d2d491407..5397b552fbeb 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -110,38 +110,19 @@ static struct linux_binfmt elf_format = { #define BAD_ADDR(x) (unlikely((unsigned long)(x) >= TASK_SIZE)) -static int set_brk(unsigned long start, unsigned long end, int prot) -{ - start = ELF_PAGEALIGN(start); - end = ELF_PAGEALIGN(end); - if (end > start) { - /* - * Map the last of the bss segment. - * If the header is requesting these pages to be - * executable, honour that (ppc32 needs this). - */ - int error = vm_brk_flags(start, end - start, - prot & PROT_EXEC ? VM_EXEC : 0); - if (error) - return error; - } - current->mm->start_brk = current->mm->brk = end; - return 0; -} - -/* We need to explicitly zero any fractional pages - after the data section (i.e. bss). This would - contain the junk from the file that should not - be in memory +/* + * We need to explicitly zero any trailing portion of the page that follows + * p_filesz when it ends before the page ends (e.g. bss), otherwise this + * memory will contain the junk from the file that should not be present. */ -static int padzero(unsigned long elf_bss) +static int padzero(unsigned long address) { unsigned long nbyte; - nbyte = ELF_PAGEOFFSET(elf_bss); + nbyte = ELF_PAGEOFFSET(address); if (nbyte) { nbyte = ELF_MIN_ALIGN - nbyte; - if (clear_user((void __user *) elf_bss, nbyte)) + if (clear_user((void __user *)address, nbyte)) return -EFAULT; } return 0; @@ -367,6 +348,11 @@ create_elf_tables(struct linux_binprm *bprm, const struct elfhdr *exec, return 0; } +/* + * Map "eppnt->p_filesz" bytes from "filep" offset "eppnt->p_offset" + * into memory at "addr". (Note that p_filesz is rounded up to the + * next page, so any extra bytes from the file must be wiped.) + */ static unsigned long elf_map(struct file *filep, unsigned long addr, const struct elf_phdr *eppnt, int prot, int type, unsigned long total_size) @@ -406,6 +392,60 @@ static unsigned long elf_map(struct file *filep, unsigned long addr, return(map_addr); } +/* + * Map "eppnt->p_filesz" bytes from "filep" offset "eppnt->p_offset" + * into memory at "addr". Memory from "p_filesz" through "p_memsz" + * rounded up to the next page is zeroed. + */ +static unsigned long elf_load(struct file *filep, unsigned long addr, + const struct elf_phdr *eppnt, int prot, int type, + unsigned long total_size) +{ + unsigned long zero_start, zero_end; + unsigned long map_addr; + + if (eppnt->p_filesz) { + map_addr = elf_map(filep, addr, eppnt, prot, type, total_size); + if (BAD_ADDR(map_addr)) + return map_addr; + if (eppnt->p_memsz > eppnt->p_filesz) { + zero_start = map_addr + ELF_PAGEOFFSET(eppnt->p_vaddr) + + eppnt->p_filesz; + zero_end = map_addr + ELF_PAGEOFFSET(eppnt->p_vaddr) + + eppnt->p_memsz; + + /* + * Zero the end of the last mapped page but ignore + * any errors if the segment isn't writable. + */ + if (padzero(zero_start) && (prot & PROT_WRITE)) + return -EFAULT; + } + } else { + map_addr = zero_start = ELF_PAGESTART(addr); + zero_end = zero_start + ELF_PAGEOFFSET(eppnt->p_vaddr) + + eppnt->p_memsz; + } + if (eppnt->p_memsz > eppnt->p_filesz) { + /* + * Map the last of the segment. + * If the header is requesting these pages to be + * executable, honour that (ppc32 needs this). + */ + int error; + + zero_start = ELF_PAGEALIGN(zero_start); + zero_end = ELF_PAGEALIGN(zero_end); + + error = vm_brk_flags(zero_start, zero_end - zero_start, + prot & PROT_EXEC ? VM_EXEC : 0); + if (error) + map_addr = error; + } + return map_addr; +} + + static unsigned long total_mapping_size(const struct elf_phdr *phdr, int nr) { elf_addr_t min_addr = -1; @@ -596,8 +636,6 @@ static unsigned long load_elf_interp(struct elfhdr *interp_elf_ex, struct elf_phdr *eppnt; unsigned long load_addr = 0; int load_addr_set = 0; - unsigned long last_bss = 0, elf_bss = 0; - int bss_prot = 0; unsigned long error = ~0UL; unsigned long total_size; int i; @@ -634,7 +672,7 @@ static unsigned long load_elf_interp(struct elfhdr *interp_elf_ex, else if (no_base && interp_elf_ex->e_type == ET_DYN) load_addr = -vaddr; - map_addr = elf_map(interpreter, load_addr + vaddr, + map_addr = elf_load(interpreter, load_addr + vaddr, eppnt, elf_prot, elf_type, total_size); total_size = 0; error = map_addr; @@ -660,51 +698,9 @@ static unsigned long load_elf_interp(struct elfhdr *interp_elf_ex, error = -ENOMEM; goto out; } - - /* - * Find the end of the file mapping for this phdr, and - * keep track of the largest address we see for this. - */ - k = load_addr + eppnt->p_vaddr + eppnt->p_filesz; - if (k > elf_bss) - elf_bss = k; - - /* - * Do the same thing for the memory mapping - between - * elf_bss and last_bss is the bss section. - */ - k = load_addr + eppnt->p_vaddr + eppnt->p_memsz; - if (k > last_bss) { - last_bss = k; - bss_prot = elf_prot; - } } } - /* - * Now fill out the bss section: first pad the last page from - * the file up to the page boundary, and zero it from elf_bss - * up to the end of the page. - */ - if (padzero(elf_bss)) { - error = -EFAULT; - goto out; - } - /* - * Next, align both the file and mem bss up to the page size, - * since this is where elf_bss was just zeroed up to, and where - * last_bss will end after the vm_brk_flags() below. - */ - elf_bss = ELF_PAGEALIGN(elf_bss); - last_bss = ELF_PAGEALIGN(last_bss); - /* Finally, if there is still more bss to allocate, do it. */ - if (last_bss > elf_bss) { - error = vm_brk_flags(elf_bss, last_bss - elf_bss, - bss_prot & PROT_EXEC ? VM_EXEC : 0); - if (error) - goto out; - } - error = load_addr; out: return error; @@ -828,8 +824,7 @@ static int load_elf_binary(struct linux_binprm *bprm) unsigned long error; struct elf_phdr *elf_ppnt, *elf_phdata, *interp_elf_phdata = NULL; struct elf_phdr *elf_property_phdata = NULL; - unsigned long elf_bss, elf_brk; - int bss_prot = 0; + unsigned long elf_brk; int retval, i; unsigned long elf_entry; unsigned long e_entry; @@ -1020,7 +1015,6 @@ out_free_interp: if (retval < 0) goto out_free_dentry; - elf_bss = 0; elf_brk = 0; start_code = ~0UL; @@ -1040,33 +1034,6 @@ out_free_interp: if (elf_ppnt->p_type != PT_LOAD) continue; - if (unlikely (elf_brk > elf_bss)) { - unsigned long nbyte; - - /* There was a PT_LOAD segment with p_memsz > p_filesz - before this one. Map anonymous pages, if needed, - and clear the area. */ - retval = set_brk(elf_bss + load_bias, - elf_brk + load_bias, - bss_prot); - if (retval) - goto out_free_dentry; - nbyte = ELF_PAGEOFFSET(elf_bss); - if (nbyte) { - nbyte = ELF_MIN_ALIGN - nbyte; - if (nbyte > elf_brk - elf_bss) - nbyte = elf_brk - elf_bss; - if (clear_user((void __user *)elf_bss + - load_bias, nbyte)) { - /* - * This bss-zeroing can fail if the ELF - * file specifies odd protections. So - * we don't check the return value - */ - } - } - } - elf_prot = make_prot(elf_ppnt->p_flags, &arch_state, !!interpreter, false); @@ -1162,7 +1129,7 @@ out_free_interp: } } - error = elf_map(bprm->file, load_bias + vaddr, elf_ppnt, + error = elf_load(bprm->file, load_bias + vaddr, elf_ppnt, elf_prot, elf_flags, total_size); if (BAD_ADDR(error)) { retval = IS_ERR_VALUE(error) ? @@ -1210,40 +1177,24 @@ out_free_interp: k = elf_ppnt->p_vaddr + elf_ppnt->p_filesz; - if (k > elf_bss) - elf_bss = k; if ((elf_ppnt->p_flags & PF_X) && end_code < k) end_code = k; if (end_data < k) end_data = k; k = elf_ppnt->p_vaddr + elf_ppnt->p_memsz; - if (k > elf_brk) { - bss_prot = elf_prot; + if (k > elf_brk) elf_brk = k; - } } e_entry = elf_ex->e_entry + load_bias; phdr_addr += load_bias; - elf_bss += load_bias; elf_brk += load_bias; start_code += load_bias; end_code += load_bias; start_data += load_bias; end_data += load_bias; - /* Calling set_brk effectively mmaps the pages that we need - * for the bss and break sections. We must do this before - * mapping in the interpreter, to make sure it doesn't wind - * up getting placed where the bss needs to go. - */ - retval = set_brk(elf_bss, elf_brk, bss_prot); - if (retval) - goto out_free_dentry; - if (likely(elf_bss != elf_brk) && unlikely(padzero(elf_bss))) { - retval = -EFAULT; /* Nobody gets to see this, but.. */ - goto out_free_dentry; - } + current->mm->start_brk = current->mm->brk = ELF_PAGEALIGN(elf_brk); if (interpreter) { elf_entry = load_elf_interp(interp_elf_ex, @@ -1369,7 +1320,6 @@ static int load_elf_library(struct file *file) { struct elf_phdr *elf_phdata; struct elf_phdr *eppnt; - unsigned long elf_bss, bss, len; int retval, error, i, j; struct elfhdr elf_ex; @@ -1414,30 +1364,15 @@ static int load_elf_library(struct file *file) eppnt++; /* Now use mmap to map the library into memory. */ - error = vm_mmap(file, - ELF_PAGESTART(eppnt->p_vaddr), - (eppnt->p_filesz + - ELF_PAGEOFFSET(eppnt->p_vaddr)), + error = elf_load(file, ELF_PAGESTART(eppnt->p_vaddr), + eppnt, PROT_READ | PROT_WRITE | PROT_EXEC, MAP_FIXED_NOREPLACE | MAP_PRIVATE, - (eppnt->p_offset - - ELF_PAGEOFFSET(eppnt->p_vaddr))); + 0); + if (error != ELF_PAGESTART(eppnt->p_vaddr)) goto out_free_ph; - elf_bss = eppnt->p_vaddr + eppnt->p_filesz; - if (padzero(elf_bss)) { - error = -EFAULT; - goto out_free_ph; - } - - len = ELF_PAGEALIGN(eppnt->p_filesz + eppnt->p_vaddr); - bss = ELF_PAGEALIGN(eppnt->p_memsz + eppnt->p_vaddr); - if (bss > len) { - error = vm_brk(len, bss - len); - if (error) - goto out_free_ph; - } error = 0; out_free_ph: diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c index 1c6c5832af86..fefc642541cb 100644 --- a/fs/binfmt_elf_fdpic.c +++ b/fs/binfmt_elf_fdpic.c @@ -138,7 +138,7 @@ static int is_constdisp(struct elfhdr *hdr) static int elf_fdpic_fetch_phdrs(struct elf_fdpic_params *params, struct file *file) { - struct elf32_phdr *phdr; + struct elf_phdr *phdr; unsigned long size; int retval, loop; loff_t pos = params->hdr.e_phoff; @@ -345,10 +345,9 @@ static int load_elf_fdpic_binary(struct linux_binprm *bprm) /* there's now no turning back... the old userspace image is dead, * defunct, deceased, etc. */ + SET_PERSONALITY(exec_params.hdr); if (elf_check_fdpic(&exec_params.hdr)) - set_personality(PER_LINUX_FDPIC); - else - set_personality(PER_LINUX); + current->personality |= PER_LINUX_FDPIC; if (elf_read_implies_exec(&exec_params.hdr, executable_stack)) current->personality |= READ_IMPLIES_EXEC; @@ -560,8 +559,8 @@ static int create_elf_fdpic_tables(struct linux_binprm *bprm, sp &= ~7UL; /* stack the load map(s) */ - len = sizeof(struct elf32_fdpic_loadmap); - len += sizeof(struct elf32_fdpic_loadseg) * exec_params->loadmap->nsegs; + len = sizeof(struct elf_fdpic_loadmap); + len += sizeof(struct elf_fdpic_loadseg) * exec_params->loadmap->nsegs; sp = (sp - len) & ~7UL; exec_params->map_addr = sp; @@ -571,8 +570,8 @@ static int create_elf_fdpic_tables(struct linux_binprm *bprm, current->mm->context.exec_fdpic_loadmap = (unsigned long) sp; if (interp_params->loadmap) { - len = sizeof(struct elf32_fdpic_loadmap); - len += sizeof(struct elf32_fdpic_loadseg) * + len = sizeof(struct elf_fdpic_loadmap); + len += sizeof(struct elf_fdpic_loadseg) * interp_params->loadmap->nsegs; sp = (sp - len) & ~7UL; interp_params->map_addr = sp; @@ -740,13 +739,13 @@ static int elf_fdpic_map_file(struct elf_fdpic_params *params, struct mm_struct *mm, const char *what) { - struct elf32_fdpic_loadmap *loadmap; + struct elf_fdpic_loadmap *loadmap; #ifdef CONFIG_MMU - struct elf32_fdpic_loadseg *mseg; + struct elf_fdpic_loadseg *mseg; unsigned long load_addr; #endif - struct elf32_fdpic_loadseg *seg; - struct elf32_phdr *phdr; + struct elf_fdpic_loadseg *seg; + struct elf_phdr *phdr; unsigned nloads, tmp; unsigned long stop; int loop, ret; @@ -766,7 +765,7 @@ static int elf_fdpic_map_file(struct elf_fdpic_params *params, params->loadmap = loadmap; - loadmap->version = ELF32_FDPIC_LOADMAP_VERSION; + loadmap->version = ELF_FDPIC_LOADMAP_VERSION; loadmap->nsegs = nloads; /* map the requested LOADs into the memory space */ @@ -839,8 +838,8 @@ static int elf_fdpic_map_file(struct elf_fdpic_params *params, if (phdr->p_vaddr >= seg->p_vaddr && phdr->p_vaddr + phdr->p_memsz <= seg->p_vaddr + seg->p_memsz) { - Elf32_Dyn __user *dyn; - Elf32_Sword d_tag; + Elf_Dyn __user *dyn; + Elf_Sword d_tag; params->dynamic_addr = (phdr->p_vaddr - seg->p_vaddr) + @@ -850,11 +849,11 @@ static int elf_fdpic_map_file(struct elf_fdpic_params *params, * one item, and that the last item is a NULL * entry */ if (phdr->p_memsz == 0 || - phdr->p_memsz % sizeof(Elf32_Dyn) != 0) + phdr->p_memsz % sizeof(Elf_Dyn) != 0) goto dynamic_error; - tmp = phdr->p_memsz / sizeof(Elf32_Dyn); - dyn = (Elf32_Dyn __user *)params->dynamic_addr; + tmp = phdr->p_memsz / sizeof(Elf_Dyn); + dyn = (Elf_Dyn __user *)params->dynamic_addr; if (get_user(d_tag, &dyn[tmp - 1].d_tag) || d_tag != 0) goto dynamic_error; @@ -900,10 +899,12 @@ static int elf_fdpic_map_file(struct elf_fdpic_params *params, kdebug("- DYNAMIC[]: %lx", params->dynamic_addr); seg = loadmap->segs; for (loop = 0; loop < loadmap->nsegs; loop++, seg++) - kdebug("- LOAD[%d] : %08x-%08x [va=%x ms=%x]", + kdebug("- LOAD[%d] : %08llx-%08llx [va=%llx ms=%llx]", loop, - seg->addr, seg->addr + seg->p_memsz - 1, - seg->p_vaddr, seg->p_memsz); + (unsigned long long) seg->addr, + (unsigned long long) seg->addr + seg->p_memsz - 1, + (unsigned long long) seg->p_vaddr, + (unsigned long long) seg->p_memsz); return 0; @@ -923,8 +924,8 @@ static int elf_fdpic_map_file_constdisp_on_uclinux( struct file *file, struct mm_struct *mm) { - struct elf32_fdpic_loadseg *seg; - struct elf32_phdr *phdr; + struct elf_fdpic_loadseg *seg; + struct elf_phdr *phdr; unsigned long load_addr, base = ULONG_MAX, top = 0, maddr = 0; int loop, ret; @@ -1007,8 +1008,8 @@ static int elf_fdpic_map_file_by_direct_mmap(struct elf_fdpic_params *params, struct file *file, struct mm_struct *mm) { - struct elf32_fdpic_loadseg *seg; - struct elf32_phdr *phdr; + struct elf_fdpic_loadseg *seg; + struct elf_phdr *phdr; unsigned long load_addr, delta_vaddr; int loop, dvset; @@ -1082,9 +1083,10 @@ static int elf_fdpic_map_file_by_direct_mmap(struct elf_fdpic_params *params, maddr = vm_mmap(file, maddr, phdr->p_memsz + disp, prot, flags, phdr->p_offset - disp); - kdebug("mmap[%d] <file> sz=%lx pr=%x fl=%x of=%lx --> %08lx", - loop, phdr->p_memsz + disp, prot, flags, - phdr->p_offset - disp, maddr); + kdebug("mmap[%d] <file> sz=%llx pr=%x fl=%x of=%llx --> %08lx", + loop, (unsigned long long) phdr->p_memsz + disp, + prot, flags, (unsigned long long) phdr->p_offset - disp, + maddr); if (IS_ERR_VALUE(maddr)) return (int) maddr; @@ -1146,8 +1148,9 @@ static int elf_fdpic_map_file_by_direct_mmap(struct elf_fdpic_params *params, #else if (excess > 0) { - kdebug("clear[%d] ad=%lx sz=%lx", - loop, maddr + phdr->p_filesz, excess); + kdebug("clear[%d] ad=%llx sz=%lx", loop, + (unsigned long long) maddr + phdr->p_filesz, + excess); if (clear_user((void *) maddr + phdr->p_filesz, excess)) return -EFAULT; } diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c index bb202ad369d5..68fa225f89e5 100644 --- a/fs/binfmt_misc.c +++ b/fs/binfmt_misc.c @@ -40,9 +40,6 @@ enum { VERBOSE_STATUS = 1 /* make it zero to save 400 bytes kernel memory */ }; -static LIST_HEAD(entries); -static int enabled = 1; - enum {Enabled, Magic}; #define MISC_FMT_PRESERVE_ARGV0 (1UL << 31) #define MISC_FMT_OPEN_BINARY (1UL << 30) @@ -60,12 +57,10 @@ typedef struct { char *name; struct dentry *dentry; struct file *interp_file; + refcount_t users; /* sync removal with load_misc_binary() */ } Node; -static DEFINE_RWLOCK(entries_lock); static struct file_system_type bm_fs_type; -static struct vfsmount *bm_mnt; -static int entry_count; /* * Max length of the register string. Determined by: @@ -82,19 +77,24 @@ static int entry_count; */ #define MAX_REGISTER_LENGTH 1920 -/* - * Check if we support the binfmt - * if we do, return the node, else NULL - * locking is done in load_misc_binary +/** + * search_binfmt_handler - search for a binary handler for @bprm + * @misc: handle to binfmt_misc instance + * @bprm: binary for which we are looking for a handler + * + * Search for a binary type handler for @bprm in the list of registered binary + * type handlers. + * + * Return: binary type list entry on success, NULL on failure */ -static Node *check_file(struct linux_binprm *bprm) +static Node *search_binfmt_handler(struct binfmt_misc *misc, + struct linux_binprm *bprm) { char *p = strrchr(bprm->interp, '.'); - struct list_head *l; + Node *e; /* Walk all the registered handlers. */ - list_for_each(l, &entries) { - Node *e = list_entry(l, Node, list); + list_for_each_entry(e, &misc->entries, list) { char *s; int j; @@ -123,9 +123,79 @@ static Node *check_file(struct linux_binprm *bprm) if (j == e->size) return e; } + return NULL; } +/** + * get_binfmt_handler - try to find a binary type handler + * @misc: handle to binfmt_misc instance + * @bprm: binary for which we are looking for a handler + * + * Try to find a binfmt handler for the binary type. If one is found take a + * reference to protect against removal via bm_{entry,status}_write(). + * + * Return: binary type list entry on success, NULL on failure + */ +static Node *get_binfmt_handler(struct binfmt_misc *misc, + struct linux_binprm *bprm) +{ + Node *e; + + read_lock(&misc->entries_lock); + e = search_binfmt_handler(misc, bprm); + if (e) + refcount_inc(&e->users); + read_unlock(&misc->entries_lock); + return e; +} + +/** + * put_binfmt_handler - put binary handler node + * @e: node to put + * + * Free node syncing with load_misc_binary() and defer final free to + * load_misc_binary() in case it is using the binary type handler we were + * requested to remove. + */ +static void put_binfmt_handler(Node *e) +{ + if (refcount_dec_and_test(&e->users)) { + if (e->flags & MISC_FMT_OPEN_FILE) + filp_close(e->interp_file, NULL); + kfree(e); + } +} + +/** + * load_binfmt_misc - load the binfmt_misc of the caller's user namespace + * + * To be called in load_misc_binary() to load the relevant struct binfmt_misc. + * If a user namespace doesn't have its own binfmt_misc mount it can make use + * of its ancestor's binfmt_misc handlers. This mimicks the behavior of + * pre-namespaced binfmt_misc where all registered binfmt_misc handlers where + * available to all user and user namespaces on the system. + * + * Return: the binfmt_misc instance of the caller's user namespace + */ +static struct binfmt_misc *load_binfmt_misc(void) +{ + const struct user_namespace *user_ns; + struct binfmt_misc *misc; + + user_ns = current_user_ns(); + while (user_ns) { + /* Pairs with smp_store_release() in bm_fill_super(). */ + misc = smp_load_acquire(&user_ns->binfmt_misc); + if (misc) + return misc; + + user_ns = user_ns->parent; + } + + return &init_binfmt_misc; +} + /* * the loader itself */ @@ -133,18 +203,14 @@ static int load_misc_binary(struct linux_binprm *bprm) { Node *fmt; struct file *interp_file = NULL; - int retval; + int retval = -ENOEXEC; + struct binfmt_misc *misc; - retval = -ENOEXEC; - if (!enabled) + misc = load_binfmt_misc(); + if (!misc->enabled) return retval; - /* to keep locking time low, we copy the interpreter string */ - read_lock(&entries_lock); - fmt = check_file(bprm); - if (fmt) - dget(fmt->dentry); - read_unlock(&entries_lock); + fmt = get_binfmt_handler(misc, bprm); if (!fmt) return retval; @@ -198,7 +264,16 @@ static int load_misc_binary(struct linux_binprm *bprm) retval = 0; ret: - dput(fmt->dentry); + + /* + * If we actually put the node here all concurrent calls to + * load_misc_binary() will have finished. We also know + * that for the refcount to be zero someone must have concurently + * removed the binary type handler from the list and it's our job to + * free it. + */ + put_binfmt_handler(fmt); + return retval; } @@ -287,7 +362,7 @@ static Node *create_entry(const char __user *buffer, size_t count) err = -ENOMEM; memsize = sizeof(Node) + count + 8; - e = kmalloc(memsize, GFP_KERNEL); + e = kmalloc(memsize, GFP_KERNEL_ACCOUNT); if (!e) goto out; @@ -399,7 +474,7 @@ static Node *create_entry(const char __user *buffer, size_t count) if (e->mask) { int i; - char *masked = kmalloc(e->size, GFP_KERNEL); + char *masked = kmalloc(e->size, GFP_KERNEL_ACCOUNT); print_hex_dump_bytes( KBUILD_MODNAME ": register: mask[decoded]: ", @@ -547,36 +622,114 @@ static struct inode *bm_get_inode(struct super_block *sb, int mode) if (inode) { inode->i_ino = get_next_ino(); inode->i_mode = mode; - inode->i_atime = inode->i_mtime = inode->i_ctime = - current_time(inode); + simple_inode_init_ts(inode); } return inode; } +/** + * i_binfmt_misc - retrieve struct binfmt_misc from a binfmt_misc inode + * @inode: inode of the relevant binfmt_misc instance + * + * This helper retrieves struct binfmt_misc from a binfmt_misc inode. This can + * be done without any memory barriers because we are guaranteed that + * user_ns->binfmt_misc is fully initialized. It was fully initialized when the + * binfmt_misc mount was first created. + * + * Return: struct binfmt_misc of the relevant binfmt_misc instance + */ +static struct binfmt_misc *i_binfmt_misc(struct inode *inode) +{ + return inode->i_sb->s_user_ns->binfmt_misc; +} + +/** + * bm_evict_inode - cleanup data associated with @inode + * @inode: inode to which the data is attached + * + * Cleanup the binary type handler data associated with @inode if a binary type + * entry is removed or the filesystem is unmounted and the super block is + * shutdown. + * + * If the ->evict call was not caused by a super block shutdown but by a write + * to remove the entry or all entries via bm_{entry,status}_write() the entry + * will have already been removed from the list. We keep the list_empty() check + * to make that explicit. +*/ static void bm_evict_inode(struct inode *inode) { Node *e = inode->i_private; - if (e && e->flags & MISC_FMT_OPEN_FILE) - filp_close(e->interp_file, NULL); - clear_inode(inode); - kfree(e); + + if (e) { + struct binfmt_misc *misc; + + misc = i_binfmt_misc(inode); + write_lock(&misc->entries_lock); + if (!list_empty(&e->list)) + list_del_init(&e->list); + write_unlock(&misc->entries_lock); + put_binfmt_handler(e); + } } -static void kill_node(Node *e) +/** + * unlink_binfmt_dentry - remove the dentry for the binary type handler + * @dentry: dentry associated with the binary type handler + * + * Do the actual filesystem work to remove a dentry for a registered binary + * type handler. Since binfmt_misc only allows simple files to be created + * directly under the root dentry of the filesystem we ensure that we are + * indeed passed a dentry directly beneath the root dentry, that the inode + * associated with the root dentry is locked, and that it is a regular file we + * are asked to remove. + */ +static void unlink_binfmt_dentry(struct dentry *dentry) { - struct dentry *dentry; + struct dentry *parent = dentry->d_parent; + struct inode *inode, *parent_inode; - write_lock(&entries_lock); - list_del_init(&e->list); - write_unlock(&entries_lock); + /* All entries are immediate descendants of the root dentry. */ + if (WARN_ON_ONCE(dentry->d_sb->s_root != parent)) + return; - dentry = e->dentry; - drop_nlink(d_inode(dentry)); - d_drop(dentry); - dput(dentry); - simple_release_fs(&bm_mnt, &entry_count); + /* We only expect to be called on regular files. */ + inode = d_inode(dentry); + if (WARN_ON_ONCE(!S_ISREG(inode->i_mode))) + return; + + /* The parent inode must be locked. */ + parent_inode = d_inode(parent); + if (WARN_ON_ONCE(!inode_is_locked(parent_inode))) + return; + + if (simple_positive(dentry)) { + dget(dentry); + simple_unlink(parent_inode, dentry); + d_delete(dentry); + dput(dentry); + } +} + +/** + * remove_binfmt_handler - remove a binary type handler + * @misc: handle to binfmt_misc instance + * @e: binary type handler to remove + * + * Remove a binary type handler from the list of binary type handlers and + * remove its associated dentry. This is called from + * binfmt_{entry,status}_write(). In the future, we might want to think about + * adding a proper ->unlink() method to binfmt_misc instead of forcing caller's + * to use writes to files in order to delete binary type handlers. But it has + * worked for so long that it's not a pressing issue. + */ +static void remove_binfmt_handler(struct binfmt_misc *misc, Node *e) +{ + write_lock(&misc->entries_lock); + list_del_init(&e->list); + write_unlock(&misc->entries_lock); + unlink_binfmt_dentry(e->dentry); } /* /<entry> */ @@ -603,8 +756,8 @@ bm_entry_read(struct file *file, char __user *buf, size_t nbytes, loff_t *ppos) static ssize_t bm_entry_write(struct file *file, const char __user *buffer, size_t count, loff_t *ppos) { - struct dentry *root; - Node *e = file_inode(file)->i_private; + struct inode *inode = file_inode(file); + Node *e = inode->i_private; int res = parse_command(buffer, count); switch (res) { @@ -618,13 +771,22 @@ static ssize_t bm_entry_write(struct file *file, const char __user *buffer, break; case 3: /* Delete this handler. */ - root = file_inode(file)->i_sb->s_root; - inode_lock(d_inode(root)); + inode = d_inode(inode->i_sb->s_root); + inode_lock(inode); + /* + * In order to add new element or remove elements from the list + * via bm_{entry,register,status}_write() inode_lock() on the + * root inode must be held. + * The lock is exclusive ensuring that the list can't be + * modified. Only load_misc_binary() can access but does so + * read-only. So we only need to take the write lock when we + * actually remove the entry from the list. + */ if (!list_empty(&e->list)) - kill_node(e); + remove_binfmt_handler(i_binfmt_misc(inode), e); - inode_unlock(d_inode(root)); + inode_unlock(inode); break; default: return res; @@ -648,6 +810,7 @@ static ssize_t bm_register_write(struct file *file, const char __user *buffer, struct inode *inode; struct super_block *sb = file_inode(file)->i_sb; struct dentry *root = sb->s_root, *dentry; + struct binfmt_misc *misc; int err = 0; struct file *f = NULL; @@ -657,7 +820,18 @@ static ssize_t bm_register_write(struct file *file, const char __user *buffer, return PTR_ERR(e); if (e->flags & MISC_FMT_OPEN_FILE) { + const struct cred *old_cred; + + /* + * Now that we support unprivileged binfmt_misc mounts make + * sure we use the credentials that the register @file was + * opened with to also open the interpreter. Before that this + * didn't matter much as only a privileged process could open + * the register file. + */ + old_cred = override_creds(file->f_cred); f = open_exec(e->interpreter); + revert_creds(old_cred); if (IS_ERR(f)) { pr_notice("register: failed to install interpreter file %s\n", e->interpreter); @@ -683,21 +857,16 @@ static ssize_t bm_register_write(struct file *file, const char __user *buffer, if (!inode) goto out2; - err = simple_pin_fs(&bm_fs_type, &bm_mnt, &entry_count); - if (err) { - iput(inode); - inode = NULL; - goto out2; - } - + refcount_set(&e->users, 1); e->dentry = dget(dentry); inode->i_private = e; inode->i_fop = &bm_entry_operations; d_instantiate(dentry, inode); - write_lock(&entries_lock); - list_add(&e->list, &entries); - write_unlock(&entries_lock); + misc = i_binfmt_misc(inode); + write_lock(&misc->entries_lock); + list_add(&e->list, &misc->entries); + write_unlock(&misc->entries_lock); err = 0; out2: @@ -724,35 +893,50 @@ static const struct file_operations bm_register_operations = { static ssize_t bm_status_read(struct file *file, char __user *buf, size_t nbytes, loff_t *ppos) { - char *s = enabled ? "enabled\n" : "disabled\n"; + struct binfmt_misc *misc; + char *s; + misc = i_binfmt_misc(file_inode(file)); + s = misc->enabled ? "enabled\n" : "disabled\n"; return simple_read_from_buffer(buf, nbytes, ppos, s, strlen(s)); } static ssize_t bm_status_write(struct file *file, const char __user *buffer, size_t count, loff_t *ppos) { + struct binfmt_misc *misc; int res = parse_command(buffer, count); - struct dentry *root; + Node *e, *next; + struct inode *inode; + misc = i_binfmt_misc(file_inode(file)); switch (res) { case 1: /* Disable all handlers. */ - enabled = 0; + misc->enabled = false; break; case 2: /* Enable all handlers. */ - enabled = 1; + misc->enabled = true; break; case 3: /* Delete all handlers. */ - root = file_inode(file)->i_sb->s_root; - inode_lock(d_inode(root)); + inode = d_inode(file_inode(file)->i_sb->s_root); + inode_lock(inode); - while (!list_empty(&entries)) - kill_node(list_first_entry(&entries, Node, list)); + /* + * In order to add new element or remove elements from the list + * via bm_{entry,register,status}_write() inode_lock() on the + * root inode must be held. + * The lock is exclusive ensuring that the list can't be + * modified. Only load_misc_binary() can access but does so + * read-only. So we only need to take the write lock when we + * actually remove the entry from the list. + */ + list_for_each_entry_safe(e, next, &misc->entries, list) + remove_binfmt_handler(misc, e); - inode_unlock(d_inode(root)); + inode_unlock(inode); break; default: return res; @@ -769,32 +953,100 @@ static const struct file_operations bm_status_operations = { /* Superblock handling */ +static void bm_put_super(struct super_block *sb) +{ + struct user_namespace *user_ns = sb->s_fs_info; + + sb->s_fs_info = NULL; + put_user_ns(user_ns); +} + static const struct super_operations s_ops = { .statfs = simple_statfs, .evict_inode = bm_evict_inode, + .put_super = bm_put_super, }; static int bm_fill_super(struct super_block *sb, struct fs_context *fc) { int err; + struct user_namespace *user_ns = sb->s_user_ns; + struct binfmt_misc *misc; static const struct tree_descr bm_files[] = { [2] = {"status", &bm_status_operations, S_IWUSR|S_IRUGO}, [3] = {"register", &bm_register_operations, S_IWUSR}, /* last one */ {""} }; + if (WARN_ON(user_ns != current_user_ns())) + return -EINVAL; + + /* + * Lazily allocate a new binfmt_misc instance for this namespace, i.e. + * do it here during the first mount of binfmt_misc. We don't need to + * waste memory for every user namespace allocation. It's likely much + * more common to not mount a separate binfmt_misc instance than it is + * to mount one. + * + * While multiple superblocks can exist they are keyed by userns in + * s_fs_info for binfmt_misc. Hence, the vfs guarantees that + * bm_fill_super() is called exactly once whenever a binfmt_misc + * superblock for a userns is created. This in turn lets us conclude + * that when a binfmt_misc superblock is created for the first time for + * a userns there's no one racing us. Therefore we don't need any + * barriers when we dereference binfmt_misc. + */ + misc = user_ns->binfmt_misc; + if (!misc) { + /* + * If it turns out that most user namespaces actually want to + * register their own binary type handler and therefore all + * create their own separate binfm_misc mounts we should + * consider turning this into a kmem cache. + */ + misc = kzalloc(sizeof(struct binfmt_misc), GFP_KERNEL); + if (!misc) + return -ENOMEM; + + INIT_LIST_HEAD(&misc->entries); + rwlock_init(&misc->entries_lock); + + /* Pairs with smp_load_acquire() in load_binfmt_misc(). */ + smp_store_release(&user_ns->binfmt_misc, misc); + } + + /* + * When the binfmt_misc superblock for this userns is shutdown + * ->enabled might have been set to false and we don't reinitialize + * ->enabled again in put_super() as someone might already be mounting + * binfmt_misc again. It also would be pointless since by the time + * ->put_super() is called we know that the binary type list for this + * bintfmt_misc mount is empty making load_misc_binary() return + * -ENOEXEC independent of whether ->enabled is true. Instead, if + * someone mounts binfmt_misc for the first time or again we simply + * reset ->enabled to true. + */ + misc->enabled = true; + err = simple_fill_super(sb, BINFMTFS_MAGIC, bm_files); if (!err) sb->s_op = &s_ops; return err; } +static void bm_free(struct fs_context *fc) +{ + if (fc->s_fs_info) + put_user_ns(fc->s_fs_info); +} + static int bm_get_tree(struct fs_context *fc) { - return get_tree_single(fc, bm_fill_super); + return get_tree_keyed(fc, bm_fill_super, get_user_ns(fc->user_ns)); } static const struct fs_context_operations bm_context_ops = { + .free = bm_free, .get_tree = bm_get_tree, }; @@ -813,6 +1065,7 @@ static struct file_system_type bm_fs_type = { .owner = THIS_MODULE, .name = "binfmt_misc", .init_fs_context = bm_init_fs_context, + .fs_flags = FS_USERNS_MOUNT, .kill_sb = kill_litter_super, }; MODULE_ALIAS_FS("binfmt_misc"); diff --git a/fs/btrfs/Kconfig b/fs/btrfs/Kconfig index 66fa9ab2c046..4fb925e8c981 100644 --- a/fs/btrfs/Kconfig +++ b/fs/btrfs/Kconfig @@ -31,7 +31,7 @@ config BTRFS_FS continue to be mountable and usable by newer kernels. For more information, please see the web pages at - http://btrfs.wiki.kernel.org. + https://btrfs.readthedocs.io To compile this file system support as a module, choose M here. The module will be called btrfs. @@ -48,25 +48,6 @@ config BTRFS_FS_POSIX_ACL If you don't know what Access Control Lists are, say N -config BTRFS_FS_CHECK_INTEGRITY - bool "Btrfs with integrity check tool compiled in (DANGEROUS)" - depends on BTRFS_FS - help - Adds code that examines all block write requests (including - writes of the super block). The goal is to verify that the - state of the filesystem on disk is always consistent, i.e., - after a power-loss or kernel panic event the filesystem is - in a consistent state. - - If the integrity check tool is included and activated in - the mount options, plenty of kernel memory is used, and - plenty of additional CPU cycles are spent. Enabling this - functionality is not intended for normal use. - - In most cases, unless you are a btrfs developer who needs - to verify the integrity of (super)-block write requests - during the run of a regression test, say N - config BTRFS_FS_RUN_SANITY_TESTS bool "Btrfs will run sanity tests upon loading" depends on BTRFS_FS diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile index 90d53209755b..525af975f61c 100644 --- a/fs/btrfs/Makefile +++ b/fs/btrfs/Makefile @@ -33,10 +33,9 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \ uuid-tree.o props.o free-space-tree.o tree-checker.o space-info.o \ block-rsv.o delalloc-space.o block-group.o discard.o reflink.o \ subpage.o tree-mod-log.o extent-io-tree.o fs.o messages.o bio.o \ - lru_cache.o + lru_cache.o raid-stripe-tree.o btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o -btrfs-$(CONFIG_BTRFS_FS_CHECK_INTEGRITY) += check-integrity.o btrfs-$(CONFIG_BTRFS_FS_REF_VERIFY) += ref-verify.o btrfs-$(CONFIG_BLK_DEV_ZONED) += zoned.o btrfs-$(CONFIG_FS_VERITY) += verity.o diff --git a/fs/btrfs/accessors.h b/fs/btrfs/accessors.h index ceadfc5d6c66..aa0844535644 100644 --- a/fs/btrfs/accessors.h +++ b/fs/btrfs/accessors.h @@ -3,6 +3,9 @@ #ifndef BTRFS_ACCESSORS_H #define BTRFS_ACCESSORS_H +#include <linux/stddef.h> +#include <asm/unaligned.h> + struct btrfs_map_token { struct extent_buffer *eb; char *kaddr; @@ -34,13 +37,13 @@ static inline void put_unaligned_le8(u8 val, void *p) read_extent_buffer(eb, (char *)(result), \ ((unsigned long)(ptr)) + \ offsetof(type, member), \ - sizeof(((type *)0)->member))) + sizeof_field(type, member))) #define write_eb_member(eb, ptr, type, member, result) (\ write_extent_buffer(eb, (char *)(result), \ ((unsigned long)(ptr)) + \ offsetof(type, member), \ - sizeof(((type *)0)->member))) + sizeof_field(type, member))) #define DECLARE_BTRFS_SETGET_BITS(bits) \ u##bits btrfs_get_token_##bits(struct btrfs_map_token *token, \ @@ -62,25 +65,25 @@ DECLARE_BTRFS_SETGET_BITS(64) static inline u##bits btrfs_##name(const struct extent_buffer *eb, \ const type *s) \ { \ - static_assert(sizeof(u##bits) == sizeof(((type *)0))->member); \ + static_assert(sizeof(u##bits) == sizeof_field(type, member)); \ return btrfs_get_##bits(eb, s, offsetof(type, member)); \ } \ static inline void btrfs_set_##name(const struct extent_buffer *eb, type *s, \ u##bits val) \ { \ - static_assert(sizeof(u##bits) == sizeof(((type *)0))->member); \ + static_assert(sizeof(u##bits) == sizeof_field(type, member)); \ btrfs_set_##bits(eb, s, offsetof(type, member), val); \ } \ static inline u##bits btrfs_token_##name(struct btrfs_map_token *token, \ const type *s) \ { \ - static_assert(sizeof(u##bits) == sizeof(((type *)0))->member); \ + static_assert(sizeof(u##bits) == sizeof_field(type, member)); \ return btrfs_get_token_##bits(token, s, offsetof(type, member));\ } \ static inline void btrfs_set_token_##name(struct btrfs_map_token *token,\ type *s, u##bits val) \ { \ - static_assert(sizeof(u##bits) == sizeof(((type *)0))->member); \ + static_assert(sizeof(u##bits) == sizeof_field(type, member)); \ btrfs_set_token_##bits(token, s, offsetof(type, member), val); \ } @@ -111,17 +114,14 @@ static inline void btrfs_set_##name(type *s, u##bits val) \ static inline u64 btrfs_device_total_bytes(const struct extent_buffer *eb, struct btrfs_dev_item *s) { - static_assert(sizeof(u64) == - sizeof(((struct btrfs_dev_item *)0))->total_bytes); - return btrfs_get_64(eb, s, offsetof(struct btrfs_dev_item, - total_bytes)); + static_assert(sizeof(u64) == sizeof_field(struct btrfs_dev_item, total_bytes)); + return btrfs_get_64(eb, s, offsetof(struct btrfs_dev_item, total_bytes)); } static inline void btrfs_set_device_total_bytes(const struct extent_buffer *eb, struct btrfs_dev_item *s, u64 val) { - static_assert(sizeof(u64) == - sizeof(((struct btrfs_dev_item *)0))->total_bytes); + static_assert(sizeof(u64) == sizeof_field(struct btrfs_dev_item, total_bytes)); WARN_ON(!IS_ALIGNED(val, eb->fs_info->sectorsize)); btrfs_set_64(eb, s, offsetof(struct btrfs_dev_item, total_bytes), val); } @@ -306,6 +306,14 @@ BTRFS_SETGET_FUNCS(timespec_nsec, struct btrfs_timespec, nsec, 32); BTRFS_SETGET_STACK_FUNCS(stack_timespec_sec, struct btrfs_timespec, sec, 64); BTRFS_SETGET_STACK_FUNCS(stack_timespec_nsec, struct btrfs_timespec, nsec, 32); +BTRFS_SETGET_FUNCS(stripe_extent_encoding, struct btrfs_stripe_extent, encoding, 8); +BTRFS_SETGET_FUNCS(raid_stride_devid, struct btrfs_raid_stride, devid, 64); +BTRFS_SETGET_FUNCS(raid_stride_physical, struct btrfs_raid_stride, physical, 64); +BTRFS_SETGET_STACK_FUNCS(stack_stripe_extent_encoding, + struct btrfs_stripe_extent, encoding, 8); +BTRFS_SETGET_STACK_FUNCS(stack_raid_stride_devid, struct btrfs_raid_stride, devid, 64); +BTRFS_SETGET_STACK_FUNCS(stack_raid_stride_physical, struct btrfs_raid_stride, physical, 64); + /* struct btrfs_dev_extent */ BTRFS_SETGET_FUNCS(dev_extent_chunk_tree, struct btrfs_dev_extent, chunk_tree, 64); BTRFS_SETGET_FUNCS(dev_extent_chunk_objectid, struct btrfs_dev_extent, @@ -350,6 +358,9 @@ BTRFS_SETGET_FUNCS(extent_data_ref_count, struct btrfs_extent_data_ref, count, 3 BTRFS_SETGET_FUNCS(shared_data_ref_count, struct btrfs_shared_data_ref, count, 32); +BTRFS_SETGET_FUNCS(extent_owner_ref_root_id, struct btrfs_extent_owner_ref, + root_id, 64); + BTRFS_SETGET_FUNCS(extent_inline_ref_type, struct btrfs_extent_inline_ref, type, 8); BTRFS_SETGET_FUNCS(extent_inline_ref_offset, struct btrfs_extent_inline_ref, @@ -366,6 +377,8 @@ static inline u32 btrfs_extent_inline_ref_size(int type) if (type == BTRFS_EXTENT_DATA_REF_KEY) return sizeof(struct btrfs_extent_data_ref) + offsetof(struct btrfs_extent_inline_ref, offset); + if (type == BTRFS_EXTENT_OWNER_REF_KEY) + return sizeof(struct btrfs_extent_inline_ref); return 0; } @@ -967,6 +980,8 @@ BTRFS_SETGET_FUNCS(qgroup_status_flags, struct btrfs_qgroup_status_item, flags, 64); BTRFS_SETGET_FUNCS(qgroup_status_rescan, struct btrfs_qgroup_status_item, rescan, 64); +BTRFS_SETGET_FUNCS(qgroup_status_enable_gen, struct btrfs_qgroup_status_item, + enable_gen, 64); /* btrfs_qgroup_info_item */ BTRFS_SETGET_FUNCS(qgroup_info_generation, struct btrfs_qgroup_info_item, diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c index ce083e99ef68..9e261aac671e 100644 --- a/fs/btrfs/async-thread.c +++ b/fs/btrfs/async-thread.c @@ -9,6 +9,7 @@ #include <linux/list.h> #include <linux/spinlock.h> #include <linux/freezer.h> +#include <trace/events/btrfs.h> #include "async-thread.h" #include "ctree.h" @@ -242,7 +243,7 @@ static void run_ordered_work(struct btrfs_workqueue *wq, break; trace_btrfs_ordered_sched(work); spin_unlock_irqrestore(lock, flags); - work->ordered_func(work); + work->ordered_func(work, false); /* now take the lock again and drop our item from the list */ spin_lock_irqsave(lock, flags); @@ -277,7 +278,7 @@ static void run_ordered_work(struct btrfs_workqueue *wq, * We don't want to call the ordered free functions with * the lock held. */ - work->ordered_free(work); + work->ordered_func(work, true); /* NB: work must not be dereferenced past this point. */ trace_btrfs_all_work_done(wq->fs_info, work); } @@ -285,7 +286,7 @@ static void run_ordered_work(struct btrfs_workqueue *wq, spin_unlock_irqrestore(lock, flags); if (free_self) { - self->ordered_free(self); + self->ordered_func(self, true); /* NB: self must not be dereferenced past this point. */ trace_btrfs_all_work_done(wq->fs_info, self); } @@ -300,7 +301,7 @@ static void btrfs_work_helper(struct work_struct *normal_work) /* * We should not touch things inside work in the following cases: - * 1) after work->func() if it has no ordered_free + * 1) after work->func() if it has no ordered_func(..., true) to free * Since the struct is freed in work->func(). * 2) after setting WORK_DONE_BIT * The work may be freed in other threads almost instantly. @@ -329,11 +330,10 @@ static void btrfs_work_helper(struct work_struct *normal_work) } void btrfs_init_work(struct btrfs_work *work, btrfs_func_t func, - btrfs_func_t ordered_func, btrfs_func_t ordered_free) + btrfs_ordered_func_t ordered_func) { work->func = func; work->ordered_func = ordered_func; - work->ordered_free = ordered_free; INIT_WORK(&work->normal_work, btrfs_work_helper); INIT_LIST_HEAD(&work->ordered_list); work->flags = 0; diff --git a/fs/btrfs/async-thread.h b/fs/btrfs/async-thread.h index 30f66c5e2e6e..62b8a0d57898 100644 --- a/fs/btrfs/async-thread.h +++ b/fs/btrfs/async-thread.h @@ -13,11 +13,11 @@ struct btrfs_fs_info; struct btrfs_workqueue; struct btrfs_work; typedef void (*btrfs_func_t)(struct btrfs_work *arg); +typedef void (*btrfs_ordered_func_t)(struct btrfs_work *arg, bool); struct btrfs_work { btrfs_func_t func; - btrfs_func_t ordered_func; - btrfs_func_t ordered_free; + btrfs_ordered_func_t ordered_func; /* Don't touch things below */ struct work_struct normal_work; @@ -35,7 +35,7 @@ struct btrfs_workqueue *btrfs_alloc_ordered_workqueue( struct btrfs_fs_info *fs_info, const char *name, unsigned int flags); void btrfs_init_work(struct btrfs_work *work, btrfs_func_t func, - btrfs_func_t ordered_func, btrfs_func_t ordered_free); + btrfs_ordered_func_t ordered_func); void btrfs_queue_work(struct btrfs_workqueue *wq, struct btrfs_work *work); void btrfs_destroy_workqueue(struct btrfs_workqueue *wq); diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index 79336fa853db..beed7e459dab 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -1129,6 +1129,9 @@ static int add_inline_refs(struct btrfs_backref_walk_ctx *ctx, count, sc, GFP_NOFS); break; } + case BTRFS_EXTENT_OWNER_REF_KEY: + ASSERT(btrfs_fs_incompat(ctx->fs_info, SIMPLE_QUOTA)); + break; default: WARN_ON(1); } @@ -2998,7 +3001,7 @@ int btrfs_backref_iter_next(struct btrfs_backref_iter *iter) } void btrfs_backref_init_cache(struct btrfs_fs_info *fs_info, - struct btrfs_backref_cache *cache, int is_reloc) + struct btrfs_backref_cache *cache, bool is_reloc) { int i; @@ -3196,12 +3199,14 @@ static int handle_direct_tree_backref(struct btrfs_backref_cache *cache, * We still need to do a tree search to find out the parents. This is for * TREE_BLOCK_REF backref (keyed or inlined). * + * @trans: Transaction handle. * @ref_key: The same as @ref_key in handle_direct_tree_backref() * @tree_key: The first key of this tree block. * @path: A clean (released) path, to avoid allocating path every time * the function get called. */ -static int handle_indirect_tree_backref(struct btrfs_backref_cache *cache, +static int handle_indirect_tree_backref(struct btrfs_trans_handle *trans, + struct btrfs_backref_cache *cache, struct btrfs_path *path, struct btrfs_key *ref_key, struct btrfs_key *tree_key, @@ -3315,7 +3320,7 @@ static int handle_indirect_tree_backref(struct btrfs_backref_cache *cache, * If we know the block isn't shared we can avoid * checking its backrefs. */ - if (btrfs_block_can_be_shared(root, eb)) + if (btrfs_block_can_be_shared(trans, root, eb)) upper->checked = 0; else upper->checked = 1; @@ -3363,17 +3368,18 @@ out: * links aren't yet bi-directional. Needs to finish such links. * Use btrfs_backref_finish_upper_links() to finish such linkage. * + * @trans: Transaction handle. * @path: Released path for indirect tree backref lookup * @iter: Released backref iter for extent tree search * @node_key: The first key of the tree block */ -int btrfs_backref_add_tree_node(struct btrfs_backref_cache *cache, +int btrfs_backref_add_tree_node(struct btrfs_trans_handle *trans, + struct btrfs_backref_cache *cache, struct btrfs_path *path, struct btrfs_backref_iter *iter, struct btrfs_key *node_key, struct btrfs_backref_node *cur) { - struct btrfs_fs_info *fs_info = cache->fs_info; struct btrfs_backref_edge *edge; struct btrfs_backref_node *exist; int ret; @@ -3462,25 +3468,21 @@ int btrfs_backref_add_tree_node(struct btrfs_backref_cache *cache, ret = handle_direct_tree_backref(cache, &key, cur); if (ret < 0) goto out; - continue; - } else if (unlikely(key.type == BTRFS_EXTENT_REF_V0_KEY)) { - ret = -EINVAL; - btrfs_print_v0_err(fs_info); - btrfs_handle_fs_error(fs_info, ret, NULL); - goto out; - } else if (key.type != BTRFS_TREE_BLOCK_REF_KEY) { - continue; + } else if (key.type == BTRFS_TREE_BLOCK_REF_KEY) { + /* + * key.type == BTRFS_TREE_BLOCK_REF_KEY, inline ref + * offset means the root objectid. We need to search + * the tree to get its parent bytenr. + */ + ret = handle_indirect_tree_backref(trans, cache, path, + &key, node_key, cur); + if (ret < 0) + goto out; } - /* - * key.type == BTRFS_TREE_BLOCK_REF_KEY, inline ref offset - * means the root objectid. We need to search the tree to get - * its parent bytenr. + * Unrecognized tree backref items (if it can pass tree-checker) + * would be ignored. */ - ret = handle_indirect_tree_backref(cache, path, &key, node_key, - cur); - if (ret < 0) - goto out; } ret = 0; cur->checked = 1; diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h index 1616e3e3f1e4..ab4ca0eda605 100644 --- a/fs/btrfs/backref.h +++ b/fs/btrfs/backref.h @@ -247,7 +247,7 @@ struct prelim_ref { struct rb_node rbnode; u64 root_id; struct btrfs_key key_for_search; - int level; + u8 level; int count; struct extent_inode_elem *inode_list; u64 parent; @@ -440,11 +440,11 @@ struct btrfs_backref_cache { * Reloction backref cache require more info for reloc root compared * to generic backref cache. */ - unsigned int is_reloc; + bool is_reloc; }; void btrfs_backref_init_cache(struct btrfs_fs_info *fs_info, - struct btrfs_backref_cache *cache, int is_reloc); + struct btrfs_backref_cache *cache, bool is_reloc); struct btrfs_backref_node *btrfs_backref_alloc_node( struct btrfs_backref_cache *cache, u64 bytenr, int level); struct btrfs_backref_edge *btrfs_backref_alloc_edge( @@ -533,14 +533,15 @@ void btrfs_backref_cleanup_node(struct btrfs_backref_cache *cache, void btrfs_backref_release_cache(struct btrfs_backref_cache *cache); static inline void btrfs_backref_panic(struct btrfs_fs_info *fs_info, - u64 bytenr, int errno) + u64 bytenr, int error) { - btrfs_panic(fs_info, errno, + btrfs_panic(fs_info, error, "Inconsistency in backref cache found at offset %llu", bytenr); } -int btrfs_backref_add_tree_node(struct btrfs_backref_cache *cache, +int btrfs_backref_add_tree_node(struct btrfs_trans_handle *trans, + struct btrfs_backref_cache *cache, struct btrfs_path *path, struct btrfs_backref_iter *iter, struct btrfs_key *node_key, diff --git a/fs/btrfs/bio.c b/fs/btrfs/bio.c index 12b12443efaa..4f3b693a16b1 100644 --- a/fs/btrfs/bio.c +++ b/fs/btrfs/bio.c @@ -10,11 +10,11 @@ #include "volumes.h" #include "raid56.h" #include "async-thread.h" -#include "check-integrity.h" #include "dev-replace.h" #include "rcu-string.h" #include "zoned.h" #include "file-item.h" +#include "raid-stripe-tree.h" static struct bio_set btrfs_bioset; static struct bio_set btrfs_clone_bioset; @@ -416,6 +416,9 @@ static void btrfs_orig_write_end_io(struct bio *bio) else bio->bi_status = BLK_STS_OK; + if (bio_op(bio) == REQ_OP_ZONE_APPEND && !bio->bi_status) + stripe->physical = bio->bi_iter.bi_sector << SECTOR_SHIFT; + btrfs_orig_bbio_end_io(bbio); btrfs_put_bioc(bioc); } @@ -427,6 +430,8 @@ static void btrfs_clone_write_end_io(struct bio *bio) if (bio->bi_status) { atomic_inc(&stripe->bioc->error); btrfs_log_dev_io_error(bio, stripe->dev); + } else if (bio_op(bio) == REQ_OP_ZONE_APPEND) { + stripe->physical = bio->bi_iter.bi_sector << SECTOR_SHIFT; } /* Pass on control to the original bio this one was cloned from */ @@ -463,8 +468,6 @@ static void btrfs_submit_dev_bio(struct btrfs_device *dev, struct bio *bio) (unsigned long)dev->bdev->bd_dev, btrfs_dev_name(dev), dev->devid, bio->bi_iter.bi_size); - btrfsic_check_bio(bio); - if (bio->bi_opf & REQ_BTRFS_CGROUP_PUNT) blkcg_punt_bio_submit(bio); else @@ -490,6 +493,7 @@ static void btrfs_submit_mirrored_bio(struct btrfs_io_context *bioc, int dev_nr) bio->bi_private = &bioc->stripes[dev_nr]; bio->bi_iter.bi_sector = bioc->stripes[dev_nr].physical >> SECTOR_SHIFT; bioc->stripes[dev_nr].bioc = bioc; + bioc->size = bio->bi_iter.bi_size; btrfs_submit_dev_bio(bioc->stripes[dev_nr].dev, bio); } @@ -499,6 +503,8 @@ static void __btrfs_submit_bio(struct bio *bio, struct btrfs_io_context *bioc, if (!bioc) { /* Single mirror read/write fast path. */ btrfs_bio(bio)->mirror_num = mirror_num; + if (bio_op(bio) != REQ_OP_READ) + btrfs_bio(bio)->orig_physical = smap->physical; bio->bi_iter.bi_sector = smap->physical >> SECTOR_SHIFT; if (bio_op(bio) != REQ_OP_READ) btrfs_bio(bio)->orig_physical = smap->physical; @@ -568,13 +574,20 @@ static void run_one_async_start(struct btrfs_work *work) * * At IO completion time the csums attached on the ordered extent record are * inserted into the tree. + * + * If called with @do_free == true, then it will free the work struct. */ -static void run_one_async_done(struct btrfs_work *work) +static void run_one_async_done(struct btrfs_work *work, bool do_free) { struct async_submit_bio *async = container_of(work, struct async_submit_bio, work); struct bio *bio = &async->bbio->bio; + if (do_free) { + kfree(container_of(work, struct async_submit_bio, work)); + return; + } + /* If an error occurred we just want to clean up the bio and move on. */ if (bio->bi_status) { btrfs_orig_bbio_end_io(async->bbio); @@ -590,11 +603,6 @@ static void run_one_async_done(struct btrfs_work *work) __btrfs_submit_bio(bio, async->bioc, &async->smap, async->mirror_num); } -static void run_one_async_free(struct btrfs_work *work) -{ - kfree(container_of(work, struct async_submit_bio, work)); -} - static bool should_async_write(struct btrfs_bio *bbio) { /* Submit synchronously if the checksum implementation is fast. */ @@ -636,8 +644,7 @@ static bool btrfs_wq_submit_bio(struct btrfs_bio *bbio, async->smap = *smap; async->mirror_num = mirror_num; - btrfs_init_work(&async->work, run_one_async_start, run_one_async_done, - run_one_async_free); + btrfs_init_work(&async->work, run_one_async_start, run_one_async_done); btrfs_queue_work(fs_info->workers, &async->work); return true; } @@ -657,9 +664,11 @@ static bool btrfs_submit_chunk(struct btrfs_bio *bbio, int mirror_num) blk_status_t ret; int error; + smap.is_scrub = !bbio->inode; + btrfs_bio_counter_inc_blocked(fs_info); error = btrfs_map_block(fs_info, btrfs_op(bio), logical, &map_length, - &bioc, &smap, &mirror_num, 1); + &bioc, &smap, &mirror_num); if (error) { ret = errno_to_blk_status(error); goto fail; @@ -691,6 +700,18 @@ static bool btrfs_submit_chunk(struct btrfs_bio *bbio, int mirror_num) bio->bi_opf |= REQ_OP_ZONE_APPEND; } + if (is_data_bbio(bbio) && bioc && + btrfs_need_stripe_tree_update(bioc->fs_info, bioc->map_type)) { + /* + * No locking for the list update, as we only add to + * the list in the I/O submission path, and list + * iteration only happens in the completion path, which + * can't happen until after the last submission. + */ + btrfs_get_bioc(bioc); + list_add_tail(&bioc->rst_ordered_entry, &bbio->ordered->bioc_list); + } + /* * Csum items for reloc roots have already been cloned at this * point, so they are handled as part of the no-checksum case. @@ -779,8 +800,6 @@ int btrfs_repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start, bio_init(&bio, smap.dev->bdev, &bvec, 1, REQ_OP_WRITE | REQ_SYNC); bio.bi_iter.bi_sector = smap.physical >> SECTOR_SHIFT; __bio_add_page(&bio, page, length, pg_offset); - - btrfsic_check_bio(&bio); ret = submit_bio_wait(&bio); if (ret) { /* try to remap that extent elsewhere? */ diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index 82324c327a50..6e5dc68ff661 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -504,13 +504,20 @@ static void fragment_free_space(struct btrfs_block_group *block_group) #endif /* - * This is only called by btrfs_cache_block_group, since we could have freed - * extents we need to check the pinned_extents for any extents that can't be - * used yet since their free space will be released as soon as the transaction - * commits. + * Add a free space range to the in memory free space cache of a block group. + * This checks if the range contains super block locations and any such + * locations are not added to the free space cache. + * + * @block_group: The target block group. + * @start: Start offset of the range. + * @end: End offset of the range (exclusive). + * @total_added_ret: Optional pointer to return the total amount of space + * added to the block group's free space cache. + * + * Returns 0 on success or < 0 on error. */ -int add_new_free_space(struct btrfs_block_group *block_group, u64 start, u64 end, - u64 *total_added_ret) +int btrfs_add_new_free_space(struct btrfs_block_group *block_group, u64 start, + u64 end, u64 *total_added_ret) { struct btrfs_fs_info *info = block_group->fs_info; u64 extent_start, extent_end, size; @@ -520,11 +527,10 @@ int add_new_free_space(struct btrfs_block_group *block_group, u64 start, u64 end *total_added_ret = 0; while (start < end) { - ret = find_first_extent_bit(&info->excluded_extents, start, - &extent_start, &extent_end, - EXTENT_DIRTY | EXTENT_UPTODATE, - NULL); - if (ret) + if (!find_first_extent_bit(&info->excluded_extents, start, + &extent_start, &extent_end, + EXTENT_DIRTY | EXTENT_UPTODATE, + NULL)) break; if (extent_start <= start) { @@ -799,8 +805,8 @@ next: key.type == BTRFS_METADATA_ITEM_KEY) { u64 space_added; - ret = add_new_free_space(block_group, last, key.objectid, - &space_added); + ret = btrfs_add_new_free_space(block_group, last, + key.objectid, &space_added); if (ret) goto out; total_found += space_added; @@ -821,14 +827,20 @@ next: path->slots[0]++; } - ret = add_new_free_space(block_group, last, - block_group->start + block_group->length, - NULL); + ret = btrfs_add_new_free_space(block_group, last, + block_group->start + block_group->length, + NULL); out: btrfs_free_path(path); return ret; } +static inline void btrfs_free_excluded_extents(const struct btrfs_block_group *bg) +{ + clear_extent_bits(&bg->fs_info->excluded_extents, bg->start, + bg->start + bg->length - 1, EXTENT_UPTODATE); +} + static noinline void caching_thread(struct btrfs_work *work) { struct btrfs_block_group *block_group; @@ -923,7 +935,7 @@ int btrfs_cache_block_group(struct btrfs_block_group *cache, bool wait) caching_ctl->block_group = cache; refcount_set(&caching_ctl->count, 2); atomic_set(&caching_ctl->progress, 0); - btrfs_init_work(&caching_ctl->work, caching_thread, NULL, NULL); + btrfs_init_work(&caching_ctl->work, caching_thread, NULL); spin_lock(&cache->lock); if (cache->cached != BTRFS_CACHE_NO) { @@ -1274,7 +1286,7 @@ out: /* Once for the lookup reference */ btrfs_put_block_group(block_group); if (remove_rsv) - btrfs_delayed_refs_rsv_release(fs_info, 1); + btrfs_dec_delayed_refs_rsv_bg_updates(fs_info); btrfs_free_path(path); return ret; } @@ -2098,8 +2110,9 @@ static int exclude_super_stripes(struct btrfs_block_group *cache) if (cache->start < BTRFS_SUPER_INFO_OFFSET) { stripe_len = BTRFS_SUPER_INFO_OFFSET - cache->start; cache->bytes_super += stripe_len; - ret = btrfs_add_excluded_extent(fs_info, cache->start, - stripe_len); + ret = set_extent_bit(&fs_info->excluded_extents, cache->start, + cache->start + stripe_len - 1, + EXTENT_UPTODATE, NULL); if (ret) return ret; } @@ -2125,8 +2138,9 @@ static int exclude_super_stripes(struct btrfs_block_group *cache) cache->start + cache->length - logical[nr]); cache->bytes_super += len; - ret = btrfs_add_excluded_extent(fs_info, logical[nr], - len); + ret = set_extent_bit(&fs_info->excluded_extents, logical[nr], + logical[nr] + len - 1, + EXTENT_UPTODATE, NULL); if (ret) { kfree(logical); return ret; @@ -2319,8 +2333,8 @@ static int read_one_block_group(struct btrfs_fs_info *info, btrfs_free_excluded_extents(cache); } else if (cache->used == 0) { cache->cached = BTRFS_CACHE_FINISHED; - ret = add_new_free_space(cache, cache->start, - cache->start + cache->length, NULL); + ret = btrfs_add_new_free_space(cache, cache->start, + cache->start + cache->length, NULL); btrfs_free_excluded_extents(cache); if (ret) goto error; @@ -2587,7 +2601,7 @@ static int insert_dev_extent(struct btrfs_trans_handle *trans, btrfs_set_dev_extent_chunk_offset(leaf, extent, chunk_offset); btrfs_set_dev_extent_length(leaf, extent, num_bytes); - btrfs_mark_buffer_dirty(leaf); + btrfs_mark_buffer_dirty(trans, leaf); out: btrfs_free_path(path); return ret; @@ -2695,7 +2709,7 @@ void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans) /* Already aborted the transaction if it failed. */ next: - btrfs_delayed_refs_rsv_release(fs_info, 1); + btrfs_dec_delayed_refs_rsv_bg_inserts(fs_info); list_del_init(&block_group->bg_list); clear_bit(BLOCK_GROUP_FLAG_NEW, &block_group->runtime_flags); } @@ -2767,7 +2781,7 @@ struct btrfs_block_group *btrfs_make_block_group(struct btrfs_trans_handle *tran return ERR_PTR(ret); } - ret = add_new_free_space(cache, chunk_offset, chunk_offset + size, NULL); + ret = btrfs_add_new_free_space(cache, chunk_offset, chunk_offset + size, NULL); btrfs_free_excluded_extents(cache); if (ret) { btrfs_put_block_group(cache); @@ -2805,8 +2819,7 @@ struct btrfs_block_group *btrfs_make_block_group(struct btrfs_trans_handle *tran #endif list_add_tail(&cache->bg_list, &trans->new_bgs); - trans->delayed_ref_updates++; - btrfs_update_delayed_refs_rsv(trans); + btrfs_inc_delayed_refs_rsv_bg_inserts(fs_info); set_avail_alloc_bits(fs_info, type); return cache; @@ -3011,11 +3024,19 @@ static int update_block_group_item(struct btrfs_trans_handle *trans, cache->global_root_id); btrfs_set_stack_block_group_flags(&bgi, cache->flags); write_extent_buffer(leaf, &bgi, bi, sizeof(bgi)); - btrfs_mark_buffer_dirty(leaf); + btrfs_mark_buffer_dirty(trans, leaf); fail: btrfs_release_path(path); - /* We didn't update the block group item, need to revert @commit_used. */ - if (ret < 0) { + /* + * We didn't update the block group item, need to revert commit_used + * unless the block group item didn't exist yet - this is to prevent a + * race with a concurrent insertion of the block group item, with + * insert_block_group_item(), that happened just after we attempted to + * update. In that case we would reset commit_used to 0 just after the + * insertion set it to a value greater than 0 - if the block group later + * becomes with 0 used bytes, we would incorrectly skip its update. + */ + if (ret < 0 && ret != -ENOENT) { spin_lock(&cache->lock); cache->commit_used = old_commit_used; spin_unlock(&cache->lock); @@ -3029,7 +3050,6 @@ static int cache_save_setup(struct btrfs_block_group *block_group, struct btrfs_path *path) { struct btrfs_fs_info *fs_info = block_group->fs_info; - struct btrfs_root *root = fs_info->tree_root; struct inode *inode = NULL; struct extent_changeset *data_reserved = NULL; u64 alloc_hint = 0; @@ -3081,7 +3101,7 @@ again: * time. */ BTRFS_I(inode)->generation = 0; - ret = btrfs_update_inode(trans, root, BTRFS_I(inode)); + ret = btrfs_update_inode(trans, BTRFS_I(inode)); if (ret) { /* * So theoretically we could recover from this, simply set the @@ -3348,7 +3368,7 @@ again: if (should_put) btrfs_put_block_group(cache); if (drop_reserve) - btrfs_delayed_refs_rsv_release(fs_info, 1); + btrfs_dec_delayed_refs_rsv_bg_updates(fs_info); /* * Avoid blocking other tasks for too long. It might even save * us from writing caches for block groups that are going to be @@ -3452,8 +3472,7 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans) cache_save_setup(cache, trans, path); if (!ret) - ret = btrfs_run_delayed_refs(trans, - (unsigned long) -1); + ret = btrfs_run_delayed_refs(trans, U64_MAX); if (!ret && cache->disk_cache_state == BTRFS_DC_SETUP) { cache->io_ctl.inode = NULL; @@ -3496,7 +3515,7 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans) /* If its not on the io list, we need to put the block group */ if (should_put) btrfs_put_block_group(cache); - btrfs_delayed_refs_rsv_release(fs_info, 1); + btrfs_dec_delayed_refs_rsv_bg_updates(fs_info); spin_lock(&cur_trans->dirty_bgs_lock); } spin_unlock(&cur_trans->dirty_bgs_lock); @@ -3521,12 +3540,12 @@ int btrfs_update_block_group(struct btrfs_trans_handle *trans, u64 bytenr, u64 num_bytes, bool alloc) { struct btrfs_fs_info *info = trans->fs_info; - struct btrfs_block_group *cache = NULL; - u64 total = num_bytes; + struct btrfs_space_info *space_info; + struct btrfs_block_group *cache; u64 old_val; - u64 byte_in_group; + bool reclaim = false; + bool bg_already_dirty = true; int factor; - int ret = 0; /* Block accounting for super block */ spin_lock(&info->delalloc_root_lock); @@ -3538,97 +3557,86 @@ int btrfs_update_block_group(struct btrfs_trans_handle *trans, btrfs_set_super_bytes_used(info->super_copy, old_val); spin_unlock(&info->delalloc_root_lock); - while (total) { - struct btrfs_space_info *space_info; - bool reclaim = false; - - cache = btrfs_lookup_block_group(info, bytenr); - if (!cache) { - ret = -ENOENT; - break; - } - space_info = cache->space_info; - factor = btrfs_bg_type_to_factor(cache->flags); + cache = btrfs_lookup_block_group(info, bytenr); + if (!cache) + return -ENOENT; - /* - * If this block group has free space cache written out, we - * need to make sure to load it if we are removing space. This - * is because we need the unpinning stage to actually add the - * space back to the block group, otherwise we will leak space. - */ - if (!alloc && !btrfs_block_group_done(cache)) - btrfs_cache_block_group(cache, true); + /* An extent can not span multiple block groups. */ + ASSERT(bytenr + num_bytes <= cache->start + cache->length); - byte_in_group = bytenr - cache->start; - WARN_ON(byte_in_group > cache->length); + space_info = cache->space_info; + factor = btrfs_bg_type_to_factor(cache->flags); - spin_lock(&space_info->lock); - spin_lock(&cache->lock); + /* + * If this block group has free space cache written out, we need to make + * sure to load it if we are removing space. This is because we need + * the unpinning stage to actually add the space back to the block group, + * otherwise we will leak space. + */ + if (!alloc && !btrfs_block_group_done(cache)) + btrfs_cache_block_group(cache, true); - if (btrfs_test_opt(info, SPACE_CACHE) && - cache->disk_cache_state < BTRFS_DC_CLEAR) - cache->disk_cache_state = BTRFS_DC_CLEAR; + spin_lock(&space_info->lock); + spin_lock(&cache->lock); - old_val = cache->used; - num_bytes = min(total, cache->length - byte_in_group); - if (alloc) { - old_val += num_bytes; - cache->used = old_val; - cache->reserved -= num_bytes; - space_info->bytes_reserved -= num_bytes; - space_info->bytes_used += num_bytes; - space_info->disk_used += num_bytes * factor; - spin_unlock(&cache->lock); - spin_unlock(&space_info->lock); - } else { - old_val -= num_bytes; - cache->used = old_val; - cache->pinned += num_bytes; - btrfs_space_info_update_bytes_pinned(info, space_info, - num_bytes); - space_info->bytes_used -= num_bytes; - space_info->disk_used -= num_bytes * factor; + if (btrfs_test_opt(info, SPACE_CACHE) && + cache->disk_cache_state < BTRFS_DC_CLEAR) + cache->disk_cache_state = BTRFS_DC_CLEAR; - reclaim = should_reclaim_block_group(cache, num_bytes); + old_val = cache->used; + if (alloc) { + old_val += num_bytes; + cache->used = old_val; + cache->reserved -= num_bytes; + space_info->bytes_reserved -= num_bytes; + space_info->bytes_used += num_bytes; + space_info->disk_used += num_bytes * factor; + spin_unlock(&cache->lock); + spin_unlock(&space_info->lock); + } else { + old_val -= num_bytes; + cache->used = old_val; + cache->pinned += num_bytes; + btrfs_space_info_update_bytes_pinned(info, space_info, num_bytes); + space_info->bytes_used -= num_bytes; + space_info->disk_used -= num_bytes * factor; - spin_unlock(&cache->lock); - spin_unlock(&space_info->lock); + reclaim = should_reclaim_block_group(cache, num_bytes); - set_extent_bit(&trans->transaction->pinned_extents, - bytenr, bytenr + num_bytes - 1, - EXTENT_DIRTY, NULL); - } + spin_unlock(&cache->lock); + spin_unlock(&space_info->lock); - spin_lock(&trans->transaction->dirty_bgs_lock); - if (list_empty(&cache->dirty_list)) { - list_add_tail(&cache->dirty_list, - &trans->transaction->dirty_bgs); - trans->delayed_ref_updates++; - btrfs_get_block_group(cache); - } - spin_unlock(&trans->transaction->dirty_bgs_lock); + set_extent_bit(&trans->transaction->pinned_extents, bytenr, + bytenr + num_bytes - 1, EXTENT_DIRTY, NULL); + } - /* - * No longer have used bytes in this block group, queue it for - * deletion. We do this after adding the block group to the - * dirty list to avoid races between cleaner kthread and space - * cache writeout. - */ - if (!alloc && old_val == 0) { - if (!btrfs_test_opt(info, DISCARD_ASYNC)) - btrfs_mark_bg_unused(cache); - } else if (!alloc && reclaim) { - btrfs_mark_bg_to_reclaim(cache); - } + spin_lock(&trans->transaction->dirty_bgs_lock); + if (list_empty(&cache->dirty_list)) { + list_add_tail(&cache->dirty_list, &trans->transaction->dirty_bgs); + bg_already_dirty = false; + btrfs_get_block_group(cache); + } + spin_unlock(&trans->transaction->dirty_bgs_lock); - btrfs_put_block_group(cache); - total -= num_bytes; - bytenr += num_bytes; + /* + * No longer have used bytes in this block group, queue it for deletion. + * We do this after adding the block group to the dirty list to avoid + * races between cleaner kthread and space cache writeout. + */ + if (!alloc && old_val == 0) { + if (!btrfs_test_opt(info, DISCARD_ASYNC)) + btrfs_mark_bg_unused(cache); + } else if (!alloc && reclaim) { + btrfs_mark_bg_to_reclaim(cache); } + btrfs_put_block_group(cache); + /* Modified block groups are accounted for in the delayed_refs_rsv. */ - btrfs_update_delayed_refs_rsv(trans); - return ret; + if (!bg_already_dirty) + btrfs_inc_delayed_refs_rsv_bg_updates(info); + + return 0; } /* @@ -4075,7 +4083,7 @@ int btrfs_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags, if (IS_ERR(ret_bg)) { ret = PTR_ERR(ret_bg); - } else if (from_extent_allocation) { + } else if (from_extent_allocation && (flags & BTRFS_BLOCK_GROUP_DATA)) { /* * New block group is likely to be used soon. Try to activate * it now. Failure is OK for now. @@ -4273,6 +4281,17 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info) struct btrfs_caching_control *caching_ctl; struct rb_node *n; + if (btrfs_is_zoned(info)) { + if (info->active_meta_bg) { + btrfs_put_block_group(info->active_meta_bg); + info->active_meta_bg = NULL; + } + if (info->active_system_bg) { + btrfs_put_block_group(info->active_system_bg); + info->active_system_bg = NULL; + } + } + write_lock(&info->block_group_cache_lock); while (!list_empty(&info->caching_block_groups)) { caching_ctl = list_entry(info->caching_block_groups.next, diff --git a/fs/btrfs/block-group.h b/fs/btrfs/block-group.h index 74b61e663028..2bdbcb834f95 100644 --- a/fs/btrfs/block-group.h +++ b/fs/btrfs/block-group.h @@ -291,8 +291,8 @@ int btrfs_cache_block_group(struct btrfs_block_group *cache, bool wait); void btrfs_put_caching_control(struct btrfs_caching_control *ctl); struct btrfs_caching_control *btrfs_get_caching_control( struct btrfs_block_group *cache); -int add_new_free_space(struct btrfs_block_group *block_group, - u64 start, u64 end, u64 *total_added_ret); +int btrfs_add_new_free_space(struct btrfs_block_group *block_group, + u64 start, u64 end, u64 *total_added_ret); struct btrfs_trans_handle *btrfs_start_trans_remove_block_group( struct btrfs_fs_info *fs_info, const u64 chunk_offset); diff --git a/fs/btrfs/block-rsv.c b/fs/btrfs/block-rsv.c index 77684c5e0c8b..ceb5f586a2d5 100644 --- a/fs/btrfs/block-rsv.c +++ b/fs/btrfs/block-rsv.c @@ -221,7 +221,8 @@ int btrfs_block_rsv_add(struct btrfs_fs_info *fs_info, if (num_bytes == 0) return 0; - ret = btrfs_reserve_metadata_bytes(fs_info, block_rsv, num_bytes, flush); + ret = btrfs_reserve_metadata_bytes(fs_info, block_rsv->space_info, + num_bytes, flush); if (!ret) btrfs_block_rsv_add_bytes(block_rsv, num_bytes, true); @@ -261,7 +262,8 @@ int btrfs_block_rsv_refill(struct btrfs_fs_info *fs_info, if (!ret) return 0; - ret = btrfs_reserve_metadata_bytes(fs_info, block_rsv, num_bytes, flush); + ret = btrfs_reserve_metadata_bytes(fs_info, block_rsv->space_info, + num_bytes, flush); if (!ret) { btrfs_block_rsv_add_bytes(block_rsv, num_bytes, false); return 0; @@ -279,10 +281,10 @@ u64 btrfs_block_rsv_release(struct btrfs_fs_info *fs_info, struct btrfs_block_rsv *target = NULL; /* - * If we are the delayed_rsv then push to the global rsv, otherwise dump - * into the delayed rsv if it is not full. + * If we are a delayed block reserve then push to the global rsv, + * otherwise dump into the global delayed reserve if it is not full. */ - if (block_rsv == delayed_rsv) + if (block_rsv->type == BTRFS_BLOCK_RSV_DELOPS) target = global_rsv; else if (block_rsv != global_rsv && !btrfs_block_rsv_full(delayed_rsv)) target = delayed_rsv; @@ -354,6 +356,11 @@ void btrfs_update_global_block_rsv(struct btrfs_fs_info *fs_info) min_items++; } + if (btrfs_fs_incompat(fs_info, RAID_STRIPE_TREE)) { + num_bytes += btrfs_root_used(&fs_info->stripe_root->root_item); + min_items++; + } + /* * But we also want to reserve enough space so we can do the fallback * global reserve for an unlink, which is an additional @@ -405,6 +412,7 @@ void btrfs_init_root_block_rsv(struct btrfs_root *root) case BTRFS_EXTENT_TREE_OBJECTID: case BTRFS_FREE_SPACE_TREE_OBJECTID: case BTRFS_BLOCK_GROUP_TREE_OBJECTID: + case BTRFS_RAID_STRIPE_TREE_OBJECTID: root->block_rsv = &fs_info->delayed_refs_rsv; break; case BTRFS_ROOT_TREE_OBJECTID: @@ -517,8 +525,8 @@ again: block_rsv->type, ret); } try_reserve: - ret = btrfs_reserve_metadata_bytes(fs_info, block_rsv, blocksize, - BTRFS_RESERVE_NO_FLUSH); + ret = btrfs_reserve_metadata_bytes(fs_info, block_rsv->space_info, + blocksize, BTRFS_RESERVE_NO_FLUSH); if (!ret) return block_rsv; /* @@ -539,7 +547,7 @@ try_reserve: * one last time to force a reservation if there's enough actual space * on disk to make the reservation. */ - ret = btrfs_reserve_metadata_bytes(fs_info, block_rsv, blocksize, + ret = btrfs_reserve_metadata_bytes(fs_info, block_rsv->space_info, blocksize, BTRFS_RESERVE_FLUSH_EMERGENCY); if (!ret) return block_rsv; diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index d47a927b3504..5572ae52444e 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -8,6 +8,8 @@ #include <linux/hash.h> #include <linux/refcount.h> +#include <linux/fscrypt.h> +#include <trace/events/btrfs.h> #include "extent_map.h" #include "extent_io.h" #include "ordered-data.h" @@ -79,11 +81,21 @@ struct btrfs_inode { */ struct btrfs_key location; + /* Cached value of inode property 'compression'. */ + u8 prop_compress; + + /* + * Force compression on the file using the defrag ioctl, could be + * different from prop_compress and takes precedence if set. + */ + u8 defrag_compress; + /* * Lock for counters and all fields used to determine if the inode is in * the log or not (last_trans, last_sub_trans, last_log_commit, - * logged_trans), to access/update new_delalloc_bytes and to update the - * VFS' inode number of bytes used. + * logged_trans), to access/update delalloc_bytes, new_delalloc_bytes, + * defrag_bytes, disk_i_size, outstanding_extents, csum_bytes and to + * update the VFS' inode number of bytes used. */ spinlock_t lock; @@ -102,8 +114,18 @@ struct btrfs_inode { /* held while logging the inode in tree-log.c */ struct mutex log_mutex; + /* + * Counters to keep track of the number of extent item's we may use due + * to delalloc and such. outstanding_extents is the number of extent + * items we think we'll end up using, and reserved_extents is the number + * of extent items we've reserved metadata for. Protected by 'lock'. + */ + unsigned outstanding_extents; + /* used to order data wrt metadata */ - struct btrfs_ordered_inode_tree ordered_tree; + spinlock_t ordered_tree_lock; + struct rb_root ordered_tree; + struct rb_node *ordered_tree_last; /* list of all the delalloc inodes in the FS. There are times we need * to write all the delalloc pages to disk, and this list is used @@ -122,28 +144,31 @@ struct btrfs_inode { u64 generation; /* - * transid of the trans_handle that last modified this inode + * ID of the transaction handle that last modified this inode. + * Protected by 'lock'. */ u64 last_trans; /* - * transid that last logged this inode + * ID of the transaction that last logged this inode. + * Protected by 'lock'. */ u64 logged_trans; /* - * log transid when this inode was last modified + * Log transaction ID when this inode was last modified. + * Protected by 'lock'. */ int last_sub_trans; - /* a local copy of root's last_log_commit */ + /* A local copy of root's last_log_commit. Protected by 'lock'. */ int last_log_commit; union { /* * Total number of bytes pending delalloc, used by stat to * calculate the real block usage of the file. This is used - * only for files. + * only for files. Protected by 'lock'. */ u64 delalloc_bytes; /* @@ -161,7 +186,7 @@ struct btrfs_inode { * Total number of bytes pending delalloc that fall within a file * range that is either a hole or beyond EOF (and no prealloc extent * exists in the range). This is always <= delalloc_bytes and this - * is used only for files. + * is used only for files. Protected by 'lock'. */ u64 new_delalloc_bytes; /* @@ -172,15 +197,15 @@ struct btrfs_inode { }; /* - * total number of bytes pending defrag, used by stat to check whether - * it needs COW. + * Total number of bytes pending defrag, used by stat to check whether + * it needs COW. Protected by 'lock'. */ u64 defrag_bytes; /* - * the size of the file stored in the metadata on disk. data=ordered + * The size of the file stored in the metadata on disk. data=ordered * means the in-memory i_size might be larger than the size on disk - * because not all the blocks are written yet. + * because not all the blocks are written yet. Protected by 'lock'. */ u64 disk_i_size; @@ -214,7 +239,7 @@ struct btrfs_inode { /* * Number of bytes outstanding that are going to need csums. This is - * used in ENOSPC accounting. + * used in ENOSPC accounting. Protected by 'lock'. */ u64 csum_bytes; @@ -223,30 +248,13 @@ struct btrfs_inode { /* Read-only compatibility flags, upper half of inode_item::flags */ u32 ro_flags; - /* - * Counters to keep track of the number of extent item's we may use due - * to delalloc and such. outstanding_extents is the number of extent - * items we think we'll end up using, and reserved_extents is the number - * of extent items we've reserved metadata for. - */ - unsigned outstanding_extents; - struct btrfs_block_rsv block_rsv; - /* - * Cached values of inode properties - */ - unsigned prop_compress; /* per-file compression algorithm */ - /* - * Force compression on the file using the defrag ioctl, could be - * different from prop_compress and takes precedence if set - */ - unsigned defrag_compress; - struct btrfs_delayed_node *delayed_node; /* File creation time. */ - struct timespec64 i_otime; + u64 i_otime_sec; + u32 i_otime_nsec; /* Hook into fs_info->delayed_iputs */ struct list_head delayed_iput; @@ -387,7 +395,7 @@ static inline bool btrfs_inode_in_log(struct btrfs_inode *inode, u64 generation) spin_lock(&inode->lock); if (inode->logged_trans == generation && inode->last_sub_trans <= inode->last_log_commit && - inode->last_sub_trans <= inode->root->last_log_commit) + inode->last_sub_trans <= btrfs_get_root_last_log_commit(inode->root)) ret = true; spin_unlock(&inode->lock); return ret; @@ -481,9 +489,9 @@ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode, struct page *page, size_t pg_offset, u64 start, u64 end); int btrfs_update_inode(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct btrfs_inode *inode); + struct btrfs_inode *inode); int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct btrfs_inode *inode); + struct btrfs_inode *inode); int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct btrfs_inode *inode); int btrfs_orphan_cleanup(struct btrfs_root *root); int btrfs_cont_expand(struct btrfs_inode *inode, loff_t oldsize, loff_t size); @@ -498,12 +506,8 @@ int btrfs_prealloc_file_range_trans(struct inode *inode, u64 start, u64 num_bytes, u64 min_size, loff_t actual_len, u64 *alloc_hint); int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct page *locked_page, - u64 start, u64 end, int *page_started, - unsigned long *nr_written, struct writeback_control *wbc); + u64 start, u64 end, struct writeback_control *wbc); int btrfs_writepage_cow_fixup(struct page *page); -void btrfs_writepage_endio_finish_ordered(struct btrfs_inode *inode, - struct page *page, u64 start, - u64 end, bool uptodate); int btrfs_encoded_io_compression_from_extent(struct btrfs_fs_info *fs_info, int compress_type); int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode, diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c deleted file mode 100644 index 3caf339c4bb3..000000000000 --- a/fs/btrfs/check-integrity.c +++ /dev/null @@ -1,2871 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Copyright (C) STRATO AG 2011. All rights reserved. - */ - -/* - * This module can be used to catch cases when the btrfs kernel - * code executes write requests to the disk that bring the file - * system in an inconsistent state. In such a state, a power-loss - * or kernel panic event would cause that the data on disk is - * lost or at least damaged. - * - * Code is added that examines all block write requests during - * runtime (including writes of the super block). Three rules - * are verified and an error is printed on violation of the - * rules: - * 1. It is not allowed to write a disk block which is - * currently referenced by the super block (either directly - * or indirectly). - * 2. When a super block is written, it is verified that all - * referenced (directly or indirectly) blocks fulfill the - * following requirements: - * 2a. All referenced blocks have either been present when - * the file system was mounted, (i.e., they have been - * referenced by the super block) or they have been - * written since then and the write completion callback - * was called and no write error was indicated and a - * FLUSH request to the device where these blocks are - * located was received and completed. - * 2b. All referenced blocks need to have a generation - * number which is equal to the parent's number. - * - * One issue that was found using this module was that the log - * tree on disk became temporarily corrupted because disk blocks - * that had been in use for the log tree had been freed and - * reused too early, while being referenced by the written super - * block. - * - * The search term in the kernel log that can be used to filter - * on the existence of detected integrity issues is - * "btrfs: attempt". - * - * The integrity check is enabled via mount options. These - * mount options are only supported if the integrity check - * tool is compiled by defining BTRFS_FS_CHECK_INTEGRITY. - * - * Example #1, apply integrity checks to all metadata: - * mount /dev/sdb1 /mnt -o check_int - * - * Example #2, apply integrity checks to all metadata and - * to data extents: - * mount /dev/sdb1 /mnt -o check_int_data - * - * Example #3, apply integrity checks to all metadata and dump - * the tree that the super block references to kernel messages - * each time after a super block was written: - * mount /dev/sdb1 /mnt -o check_int,check_int_print_mask=263 - * - * If the integrity check tool is included and activated in - * the mount options, plenty of kernel memory is used, and - * plenty of additional CPU cycles are spent. Enabling this - * functionality is not intended for normal use. In most - * cases, unless you are a btrfs developer who needs to verify - * the integrity of (super)-block write requests, do not - * enable the config option BTRFS_FS_CHECK_INTEGRITY to - * include and compile the integrity check tool. - * - * Expect millions of lines of information in the kernel log with an - * enabled check_int_print_mask. Therefore set LOG_BUF_SHIFT in the - * kernel config to at least 26 (which is 64MB). Usually the value is - * limited to 21 (which is 2MB) in init/Kconfig. The file needs to be - * changed like this before LOG_BUF_SHIFT can be set to a high value: - * config LOG_BUF_SHIFT - * int "Kernel log buffer size (16 => 64KB, 17 => 128KB)" - * range 12 30 - */ - -#include <linux/sched.h> -#include <linux/slab.h> -#include <linux/mutex.h> -#include <linux/blkdev.h> -#include <linux/mm.h> -#include <linux/string.h> -#include <crypto/hash.h> -#include "messages.h" -#include "ctree.h" -#include "disk-io.h" -#include "transaction.h" -#include "extent_io.h" -#include "volumes.h" -#include "print-tree.h" -#include "locking.h" -#include "check-integrity.h" -#include "rcu-string.h" -#include "compression.h" -#include "accessors.h" - -#define BTRFSIC_BLOCK_HASHTABLE_SIZE 0x10000 -#define BTRFSIC_BLOCK_LINK_HASHTABLE_SIZE 0x10000 -#define BTRFSIC_DEV2STATE_HASHTABLE_SIZE 0x100 -#define BTRFSIC_BLOCK_MAGIC_NUMBER 0x14491051 -#define BTRFSIC_BLOCK_LINK_MAGIC_NUMBER 0x11070807 -#define BTRFSIC_DEV2STATE_MAGIC_NUMBER 0x20111530 -#define BTRFSIC_BLOCK_STACK_FRAME_MAGIC_NUMBER 20111300 -#define BTRFSIC_TREE_DUMP_MAX_INDENT_LEVEL (200 - 6) /* in characters, - * excluding " [...]" */ -#define BTRFSIC_GENERATION_UNKNOWN ((u64)-1) - -/* - * The definition of the bitmask fields for the print_mask. - * They are specified with the mount option check_integrity_print_mask. - */ -#define BTRFSIC_PRINT_MASK_SUPERBLOCK_WRITE 0x00000001 -#define BTRFSIC_PRINT_MASK_ROOT_CHUNK_LOG_TREE_LOCATION 0x00000002 -#define BTRFSIC_PRINT_MASK_TREE_AFTER_SB_WRITE 0x00000004 -#define BTRFSIC_PRINT_MASK_TREE_BEFORE_SB_WRITE 0x00000008 -#define BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH 0x00000010 -#define BTRFSIC_PRINT_MASK_END_IO_BIO_BH 0x00000020 -#define BTRFSIC_PRINT_MASK_VERBOSE 0x00000040 -#define BTRFSIC_PRINT_MASK_VERY_VERBOSE 0x00000080 -#define BTRFSIC_PRINT_MASK_INITIAL_TREE 0x00000100 -#define BTRFSIC_PRINT_MASK_INITIAL_ALL_TREES 0x00000200 -#define BTRFSIC_PRINT_MASK_INITIAL_DATABASE 0x00000400 -#define BTRFSIC_PRINT_MASK_NUM_COPIES 0x00000800 -#define BTRFSIC_PRINT_MASK_TREE_WITH_ALL_MIRRORS 0x00001000 -#define BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH_VERBOSE 0x00002000 - -struct btrfsic_dev_state; -struct btrfsic_state; - -struct btrfsic_block { - u32 magic_num; /* only used for debug purposes */ - unsigned int is_metadata:1; /* if it is meta-data, not data-data */ - unsigned int is_superblock:1; /* if it is one of the superblocks */ - unsigned int is_iodone:1; /* if is done by lower subsystem */ - unsigned int iodone_w_error:1; /* error was indicated to endio */ - unsigned int never_written:1; /* block was added because it was - * referenced, not because it was - * written */ - unsigned int mirror_num; /* large enough to hold - * BTRFS_SUPER_MIRROR_MAX */ - struct btrfsic_dev_state *dev_state; - u64 dev_bytenr; /* key, physical byte num on disk */ - u64 logical_bytenr; /* logical byte num on disk */ - u64 generation; - struct btrfs_disk_key disk_key; /* extra info to print in case of - * issues, will not always be correct */ - struct list_head collision_resolving_node; /* list node */ - struct list_head all_blocks_node; /* list node */ - - /* the following two lists contain block_link items */ - struct list_head ref_to_list; /* list */ - struct list_head ref_from_list; /* list */ - struct btrfsic_block *next_in_same_bio; - void *orig_bio_private; - bio_end_io_t *orig_bio_end_io; - blk_opf_t submit_bio_bh_rw; - u64 flush_gen; /* only valid if !never_written */ -}; - -/* - * Elements of this type are allocated dynamically and required because - * each block object can refer to and can be ref from multiple blocks. - * The key to lookup them in the hashtable is the dev_bytenr of - * the block ref to plus the one from the block referred from. - * The fact that they are searchable via a hashtable and that a - * ref_cnt is maintained is not required for the btrfs integrity - * check algorithm itself, it is only used to make the output more - * beautiful in case that an error is detected (an error is defined - * as a write operation to a block while that block is still referenced). - */ -struct btrfsic_block_link { - u32 magic_num; /* only used for debug purposes */ - u32 ref_cnt; - struct list_head node_ref_to; /* list node */ - struct list_head node_ref_from; /* list node */ - struct list_head collision_resolving_node; /* list node */ - struct btrfsic_block *block_ref_to; - struct btrfsic_block *block_ref_from; - u64 parent_generation; -}; - -struct btrfsic_dev_state { - u32 magic_num; /* only used for debug purposes */ - struct block_device *bdev; - struct btrfsic_state *state; - struct list_head collision_resolving_node; /* list node */ - struct btrfsic_block dummy_block_for_bio_bh_flush; - u64 last_flush_gen; -}; - -struct btrfsic_block_hashtable { - struct list_head table[BTRFSIC_BLOCK_HASHTABLE_SIZE]; -}; - -struct btrfsic_block_link_hashtable { - struct list_head table[BTRFSIC_BLOCK_LINK_HASHTABLE_SIZE]; -}; - -struct btrfsic_dev_state_hashtable { - struct list_head table[BTRFSIC_DEV2STATE_HASHTABLE_SIZE]; -}; - -struct btrfsic_block_data_ctx { - u64 start; /* virtual bytenr */ - u64 dev_bytenr; /* physical bytenr on device */ - u32 len; - struct btrfsic_dev_state *dev; - char **datav; - struct page **pagev; - void *mem_to_free; -}; - -/* This structure is used to implement recursion without occupying - * any stack space, refer to btrfsic_process_metablock() */ -struct btrfsic_stack_frame { - u32 magic; - u32 nr; - int error; - int i; - int limit_nesting; - int num_copies; - int mirror_num; - struct btrfsic_block *block; - struct btrfsic_block_data_ctx *block_ctx; - struct btrfsic_block *next_block; - struct btrfsic_block_data_ctx next_block_ctx; - struct btrfs_header *hdr; - struct btrfsic_stack_frame *prev; -}; - -/* Some state per mounted filesystem */ -struct btrfsic_state { - u32 print_mask; - int include_extent_data; - struct list_head all_blocks_list; - struct btrfsic_block_hashtable block_hashtable; - struct btrfsic_block_link_hashtable block_link_hashtable; - struct btrfs_fs_info *fs_info; - u64 max_superblock_generation; - struct btrfsic_block *latest_superblock; - u32 metablock_size; - u32 datablock_size; -}; - -static int btrfsic_process_metablock(struct btrfsic_state *state, - struct btrfsic_block *block, - struct btrfsic_block_data_ctx *block_ctx, - int limit_nesting, int force_iodone_flag); -static void btrfsic_read_from_block_data( - struct btrfsic_block_data_ctx *block_ctx, - void *dst, u32 offset, size_t len); -static int btrfsic_create_link_to_next_block( - struct btrfsic_state *state, - struct btrfsic_block *block, - struct btrfsic_block_data_ctx - *block_ctx, u64 next_bytenr, - int limit_nesting, - struct btrfsic_block_data_ctx *next_block_ctx, - struct btrfsic_block **next_blockp, - int force_iodone_flag, - int *num_copiesp, int *mirror_nump, - struct btrfs_disk_key *disk_key, - u64 parent_generation); -static int btrfsic_handle_extent_data(struct btrfsic_state *state, - struct btrfsic_block *block, - struct btrfsic_block_data_ctx *block_ctx, - u32 item_offset, int force_iodone_flag); -static int btrfsic_map_block(struct btrfsic_state *state, u64 bytenr, u32 len, - struct btrfsic_block_data_ctx *block_ctx_out, - int mirror_num); -static void btrfsic_release_block_ctx(struct btrfsic_block_data_ctx *block_ctx); -static int btrfsic_read_block(struct btrfsic_state *state, - struct btrfsic_block_data_ctx *block_ctx); -static int btrfsic_process_written_superblock( - struct btrfsic_state *state, - struct btrfsic_block *const block, - struct btrfs_super_block *const super_hdr); -static void btrfsic_bio_end_io(struct bio *bp); -static int btrfsic_is_block_ref_by_superblock(const struct btrfsic_state *state, - const struct btrfsic_block *block, - int recursion_level); -static int btrfsic_check_all_ref_blocks(struct btrfsic_state *state, - struct btrfsic_block *const block, - int recursion_level); -static void btrfsic_print_add_link(const struct btrfsic_state *state, - const struct btrfsic_block_link *l); -static void btrfsic_print_rem_link(const struct btrfsic_state *state, - const struct btrfsic_block_link *l); -static char btrfsic_get_block_type(const struct btrfsic_state *state, - const struct btrfsic_block *block); -static void btrfsic_dump_tree(const struct btrfsic_state *state); -static void btrfsic_dump_tree_sub(const struct btrfsic_state *state, - const struct btrfsic_block *block, - int indent_level); -static struct btrfsic_block_link *btrfsic_block_link_lookup_or_add( - struct btrfsic_state *state, - struct btrfsic_block_data_ctx *next_block_ctx, - struct btrfsic_block *next_block, - struct btrfsic_block *from_block, - u64 parent_generation); -static struct btrfsic_block *btrfsic_block_lookup_or_add( - struct btrfsic_state *state, - struct btrfsic_block_data_ctx *block_ctx, - const char *additional_string, - int is_metadata, - int is_iodone, - int never_written, - int mirror_num, - int *was_created); -static int btrfsic_process_superblock_dev_mirror( - struct btrfsic_state *state, - struct btrfsic_dev_state *dev_state, - struct btrfs_device *device, - int superblock_mirror_num, - struct btrfsic_dev_state **selected_dev_state, - struct btrfs_super_block *selected_super); -static struct btrfsic_dev_state *btrfsic_dev_state_lookup(dev_t dev); -static void btrfsic_cmp_log_and_dev_bytenr(struct btrfsic_state *state, - u64 bytenr, - struct btrfsic_dev_state *dev_state, - u64 dev_bytenr); - -static struct mutex btrfsic_mutex; -static int btrfsic_is_initialized; -static struct btrfsic_dev_state_hashtable btrfsic_dev_state_hashtable; - - -static void btrfsic_block_init(struct btrfsic_block *b) -{ - b->magic_num = BTRFSIC_BLOCK_MAGIC_NUMBER; - b->dev_state = NULL; - b->dev_bytenr = 0; - b->logical_bytenr = 0; - b->generation = BTRFSIC_GENERATION_UNKNOWN; - b->disk_key.objectid = 0; - b->disk_key.type = 0; - b->disk_key.offset = 0; - b->is_metadata = 0; - b->is_superblock = 0; - b->is_iodone = 0; - b->iodone_w_error = 0; - b->never_written = 0; - b->mirror_num = 0; - b->next_in_same_bio = NULL; - b->orig_bio_private = NULL; - b->orig_bio_end_io = NULL; - INIT_LIST_HEAD(&b->collision_resolving_node); - INIT_LIST_HEAD(&b->all_blocks_node); - INIT_LIST_HEAD(&b->ref_to_list); - INIT_LIST_HEAD(&b->ref_from_list); - b->submit_bio_bh_rw = 0; - b->flush_gen = 0; -} - -static struct btrfsic_block *btrfsic_block_alloc(void) -{ - struct btrfsic_block *b; - - b = kzalloc(sizeof(*b), GFP_NOFS); - if (NULL != b) - btrfsic_block_init(b); - - return b; -} - -static void btrfsic_block_free(struct btrfsic_block *b) -{ - BUG_ON(!(NULL == b || BTRFSIC_BLOCK_MAGIC_NUMBER == b->magic_num)); - kfree(b); -} - -static void btrfsic_block_link_init(struct btrfsic_block_link *l) -{ - l->magic_num = BTRFSIC_BLOCK_LINK_MAGIC_NUMBER; - l->ref_cnt = 1; - INIT_LIST_HEAD(&l->node_ref_to); - INIT_LIST_HEAD(&l->node_ref_from); - INIT_LIST_HEAD(&l->collision_resolving_node); - l->block_ref_to = NULL; - l->block_ref_from = NULL; -} - -static struct btrfsic_block_link *btrfsic_block_link_alloc(void) -{ - struct btrfsic_block_link *l; - - l = kzalloc(sizeof(*l), GFP_NOFS); - if (NULL != l) - btrfsic_block_link_init(l); - - return l; -} - -static void btrfsic_block_link_free(struct btrfsic_block_link *l) -{ - BUG_ON(!(NULL == l || BTRFSIC_BLOCK_LINK_MAGIC_NUMBER == l->magic_num)); - kfree(l); -} - -static void btrfsic_dev_state_init(struct btrfsic_dev_state *ds) -{ - ds->magic_num = BTRFSIC_DEV2STATE_MAGIC_NUMBER; - ds->bdev = NULL; - ds->state = NULL; - INIT_LIST_HEAD(&ds->collision_resolving_node); - ds->last_flush_gen = 0; - btrfsic_block_init(&ds->dummy_block_for_bio_bh_flush); - ds->dummy_block_for_bio_bh_flush.is_iodone = 1; - ds->dummy_block_for_bio_bh_flush.dev_state = ds; -} - -static struct btrfsic_dev_state *btrfsic_dev_state_alloc(void) -{ - struct btrfsic_dev_state *ds; - - ds = kzalloc(sizeof(*ds), GFP_NOFS); - if (NULL != ds) - btrfsic_dev_state_init(ds); - - return ds; -} - -static void btrfsic_dev_state_free(struct btrfsic_dev_state *ds) -{ - BUG_ON(!(NULL == ds || - BTRFSIC_DEV2STATE_MAGIC_NUMBER == ds->magic_num)); - kfree(ds); -} - -static void btrfsic_block_hashtable_init(struct btrfsic_block_hashtable *h) -{ - int i; - - for (i = 0; i < BTRFSIC_BLOCK_HASHTABLE_SIZE; i++) - INIT_LIST_HEAD(h->table + i); -} - -static void btrfsic_block_hashtable_add(struct btrfsic_block *b, - struct btrfsic_block_hashtable *h) -{ - const unsigned int hashval = - (((unsigned int)(b->dev_bytenr >> 16)) ^ - ((unsigned int)((uintptr_t)b->dev_state->bdev))) & - (BTRFSIC_BLOCK_HASHTABLE_SIZE - 1); - - list_add(&b->collision_resolving_node, h->table + hashval); -} - -static void btrfsic_block_hashtable_remove(struct btrfsic_block *b) -{ - list_del(&b->collision_resolving_node); -} - -static struct btrfsic_block *btrfsic_block_hashtable_lookup( - struct block_device *bdev, - u64 dev_bytenr, - struct btrfsic_block_hashtable *h) -{ - const unsigned int hashval = - (((unsigned int)(dev_bytenr >> 16)) ^ - ((unsigned int)((uintptr_t)bdev))) & - (BTRFSIC_BLOCK_HASHTABLE_SIZE - 1); - struct btrfsic_block *b; - - list_for_each_entry(b, h->table + hashval, collision_resolving_node) { - if (b->dev_state->bdev == bdev && b->dev_bytenr == dev_bytenr) - return b; - } - - return NULL; -} - -static void btrfsic_block_link_hashtable_init( - struct btrfsic_block_link_hashtable *h) -{ - int i; - - for (i = 0; i < BTRFSIC_BLOCK_LINK_HASHTABLE_SIZE; i++) - INIT_LIST_HEAD(h->table + i); -} - -static void btrfsic_block_link_hashtable_add( - struct btrfsic_block_link *l, - struct btrfsic_block_link_hashtable *h) -{ - const unsigned int hashval = - (((unsigned int)(l->block_ref_to->dev_bytenr >> 16)) ^ - ((unsigned int)(l->block_ref_from->dev_bytenr >> 16)) ^ - ((unsigned int)((uintptr_t)l->block_ref_to->dev_state->bdev)) ^ - ((unsigned int)((uintptr_t)l->block_ref_from->dev_state->bdev))) - & (BTRFSIC_BLOCK_LINK_HASHTABLE_SIZE - 1); - - BUG_ON(NULL == l->block_ref_to); - BUG_ON(NULL == l->block_ref_from); - list_add(&l->collision_resolving_node, h->table + hashval); -} - -static void btrfsic_block_link_hashtable_remove(struct btrfsic_block_link *l) -{ - list_del(&l->collision_resolving_node); -} - -static struct btrfsic_block_link *btrfsic_block_link_hashtable_lookup( - struct block_device *bdev_ref_to, - u64 dev_bytenr_ref_to, - struct block_device *bdev_ref_from, - u64 dev_bytenr_ref_from, - struct btrfsic_block_link_hashtable *h) -{ - const unsigned int hashval = - (((unsigned int)(dev_bytenr_ref_to >> 16)) ^ - ((unsigned int)(dev_bytenr_ref_from >> 16)) ^ - ((unsigned int)((uintptr_t)bdev_ref_to)) ^ - ((unsigned int)((uintptr_t)bdev_ref_from))) & - (BTRFSIC_BLOCK_LINK_HASHTABLE_SIZE - 1); - struct btrfsic_block_link *l; - - list_for_each_entry(l, h->table + hashval, collision_resolving_node) { - BUG_ON(NULL == l->block_ref_to); - BUG_ON(NULL == l->block_ref_from); - if (l->block_ref_to->dev_state->bdev == bdev_ref_to && - l->block_ref_to->dev_bytenr == dev_bytenr_ref_to && - l->block_ref_from->dev_state->bdev == bdev_ref_from && - l->block_ref_from->dev_bytenr == dev_bytenr_ref_from) - return l; - } - - return NULL; -} - -static void btrfsic_dev_state_hashtable_init( - struct btrfsic_dev_state_hashtable *h) -{ - int i; - - for (i = 0; i < BTRFSIC_DEV2STATE_HASHTABLE_SIZE; i++) - INIT_LIST_HEAD(h->table + i); -} - -static void btrfsic_dev_state_hashtable_add( - struct btrfsic_dev_state *ds, - struct btrfsic_dev_state_hashtable *h) -{ - const unsigned int hashval = - (((unsigned int)((uintptr_t)ds->bdev->bd_dev)) & - (BTRFSIC_DEV2STATE_HASHTABLE_SIZE - 1)); - - list_add(&ds->collision_resolving_node, h->table + hashval); -} - -static void btrfsic_dev_state_hashtable_remove(struct btrfsic_dev_state *ds) -{ - list_del(&ds->collision_resolving_node); -} - -static struct btrfsic_dev_state *btrfsic_dev_state_hashtable_lookup(dev_t dev, - struct btrfsic_dev_state_hashtable *h) -{ - const unsigned int hashval = - dev & (BTRFSIC_DEV2STATE_HASHTABLE_SIZE - 1); - struct btrfsic_dev_state *ds; - - list_for_each_entry(ds, h->table + hashval, collision_resolving_node) { - if (ds->bdev->bd_dev == dev) - return ds; - } - - return NULL; -} - -static int btrfsic_process_superblock(struct btrfsic_state *state, - struct btrfs_fs_devices *fs_devices) -{ - struct btrfs_super_block *selected_super; - struct list_head *dev_head = &fs_devices->devices; - struct btrfs_device *device; - struct btrfsic_dev_state *selected_dev_state = NULL; - int ret = 0; - int pass; - - selected_super = kzalloc(sizeof(*selected_super), GFP_NOFS); - if (!selected_super) - return -ENOMEM; - - list_for_each_entry(device, dev_head, dev_list) { - int i; - struct btrfsic_dev_state *dev_state; - - if (!device->bdev || !device->name) - continue; - - dev_state = btrfsic_dev_state_lookup(device->bdev->bd_dev); - BUG_ON(NULL == dev_state); - for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) { - ret = btrfsic_process_superblock_dev_mirror( - state, dev_state, device, i, - &selected_dev_state, selected_super); - if (0 != ret && 0 == i) { - kfree(selected_super); - return ret; - } - } - } - - if (NULL == state->latest_superblock) { - pr_info("btrfsic: no superblock found!\n"); - kfree(selected_super); - return -1; - } - - for (pass = 0; pass < 3; pass++) { - int num_copies; - int mirror_num; - u64 next_bytenr; - - switch (pass) { - case 0: - next_bytenr = btrfs_super_root(selected_super); - if (state->print_mask & - BTRFSIC_PRINT_MASK_ROOT_CHUNK_LOG_TREE_LOCATION) - pr_info("root@%llu\n", next_bytenr); - break; - case 1: - next_bytenr = btrfs_super_chunk_root(selected_super); - if (state->print_mask & - BTRFSIC_PRINT_MASK_ROOT_CHUNK_LOG_TREE_LOCATION) - pr_info("chunk@%llu\n", next_bytenr); - break; - case 2: - next_bytenr = btrfs_super_log_root(selected_super); - if (0 == next_bytenr) - continue; - if (state->print_mask & - BTRFSIC_PRINT_MASK_ROOT_CHUNK_LOG_TREE_LOCATION) - pr_info("log@%llu\n", next_bytenr); - break; - } - - num_copies = btrfs_num_copies(state->fs_info, next_bytenr, - state->metablock_size); - if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES) - pr_info("num_copies(log_bytenr=%llu) = %d\n", - next_bytenr, num_copies); - - for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) { - struct btrfsic_block *next_block; - struct btrfsic_block_data_ctx tmp_next_block_ctx; - struct btrfsic_block_link *l; - - ret = btrfsic_map_block(state, next_bytenr, - state->metablock_size, - &tmp_next_block_ctx, - mirror_num); - if (ret) { - pr_info("btrfsic: btrfsic_map_block(root @%llu, mirror %d) failed!\n", - next_bytenr, mirror_num); - kfree(selected_super); - return -1; - } - - next_block = btrfsic_block_hashtable_lookup( - tmp_next_block_ctx.dev->bdev, - tmp_next_block_ctx.dev_bytenr, - &state->block_hashtable); - BUG_ON(NULL == next_block); - - l = btrfsic_block_link_hashtable_lookup( - tmp_next_block_ctx.dev->bdev, - tmp_next_block_ctx.dev_bytenr, - state->latest_superblock->dev_state-> - bdev, - state->latest_superblock->dev_bytenr, - &state->block_link_hashtable); - BUG_ON(NULL == l); - - ret = btrfsic_read_block(state, &tmp_next_block_ctx); - if (ret < (int)PAGE_SIZE) { - pr_info("btrfsic: read @logical %llu failed!\n", - tmp_next_block_ctx.start); - btrfsic_release_block_ctx(&tmp_next_block_ctx); - kfree(selected_super); - return -1; - } - - ret = btrfsic_process_metablock(state, - next_block, - &tmp_next_block_ctx, - BTRFS_MAX_LEVEL + 3, 1); - btrfsic_release_block_ctx(&tmp_next_block_ctx); - } - } - - kfree(selected_super); - return ret; -} - -static int btrfsic_process_superblock_dev_mirror( - struct btrfsic_state *state, - struct btrfsic_dev_state *dev_state, - struct btrfs_device *device, - int superblock_mirror_num, - struct btrfsic_dev_state **selected_dev_state, - struct btrfs_super_block *selected_super) -{ - struct btrfs_fs_info *fs_info = state->fs_info; - struct btrfs_super_block *super_tmp; - u64 dev_bytenr; - struct btrfsic_block *superblock_tmp; - int pass; - struct block_device *const superblock_bdev = device->bdev; - struct page *page; - struct address_space *mapping = superblock_bdev->bd_inode->i_mapping; - int ret = 0; - - /* super block bytenr is always the unmapped device bytenr */ - dev_bytenr = btrfs_sb_offset(superblock_mirror_num); - if (dev_bytenr + BTRFS_SUPER_INFO_SIZE > device->commit_total_bytes) - return -1; - - page = read_cache_page_gfp(mapping, dev_bytenr >> PAGE_SHIFT, GFP_NOFS); - if (IS_ERR(page)) - return -1; - - super_tmp = page_address(page); - - if (btrfs_super_bytenr(super_tmp) != dev_bytenr || - btrfs_super_magic(super_tmp) != BTRFS_MAGIC || - memcmp(device->uuid, super_tmp->dev_item.uuid, BTRFS_UUID_SIZE) || - btrfs_super_nodesize(super_tmp) != state->metablock_size || - btrfs_super_sectorsize(super_tmp) != state->datablock_size) { - ret = 0; - goto out; - } - - superblock_tmp = - btrfsic_block_hashtable_lookup(superblock_bdev, - dev_bytenr, - &state->block_hashtable); - if (NULL == superblock_tmp) { - superblock_tmp = btrfsic_block_alloc(); - if (NULL == superblock_tmp) { - ret = -1; - goto out; - } - /* for superblock, only the dev_bytenr makes sense */ - superblock_tmp->dev_bytenr = dev_bytenr; - superblock_tmp->dev_state = dev_state; - superblock_tmp->logical_bytenr = dev_bytenr; - superblock_tmp->generation = btrfs_super_generation(super_tmp); - superblock_tmp->is_metadata = 1; - superblock_tmp->is_superblock = 1; - superblock_tmp->is_iodone = 1; - superblock_tmp->never_written = 0; - superblock_tmp->mirror_num = 1 + superblock_mirror_num; - if (state->print_mask & BTRFSIC_PRINT_MASK_SUPERBLOCK_WRITE) - btrfs_info_in_rcu(fs_info, - "new initial S-block (bdev %p, %s) @%llu (%pg/%llu/%d)", - superblock_bdev, - btrfs_dev_name(device), dev_bytenr, - dev_state->bdev, dev_bytenr, - superblock_mirror_num); - list_add(&superblock_tmp->all_blocks_node, - &state->all_blocks_list); - btrfsic_block_hashtable_add(superblock_tmp, - &state->block_hashtable); - } - - /* select the one with the highest generation field */ - if (btrfs_super_generation(super_tmp) > - state->max_superblock_generation || - 0 == state->max_superblock_generation) { - memcpy(selected_super, super_tmp, sizeof(*selected_super)); - *selected_dev_state = dev_state; - state->max_superblock_generation = - btrfs_super_generation(super_tmp); - state->latest_superblock = superblock_tmp; - } - - for (pass = 0; pass < 3; pass++) { - u64 next_bytenr; - int num_copies; - int mirror_num; - const char *additional_string = NULL; - struct btrfs_disk_key tmp_disk_key; - - tmp_disk_key.type = BTRFS_ROOT_ITEM_KEY; - tmp_disk_key.offset = 0; - switch (pass) { - case 0: - btrfs_set_disk_key_objectid(&tmp_disk_key, - BTRFS_ROOT_TREE_OBJECTID); - additional_string = "initial root "; - next_bytenr = btrfs_super_root(super_tmp); - break; - case 1: - btrfs_set_disk_key_objectid(&tmp_disk_key, - BTRFS_CHUNK_TREE_OBJECTID); - additional_string = "initial chunk "; - next_bytenr = btrfs_super_chunk_root(super_tmp); - break; - case 2: - btrfs_set_disk_key_objectid(&tmp_disk_key, - BTRFS_TREE_LOG_OBJECTID); - additional_string = "initial log "; - next_bytenr = btrfs_super_log_root(super_tmp); - if (0 == next_bytenr) - continue; - break; - } - - num_copies = btrfs_num_copies(fs_info, next_bytenr, - state->metablock_size); - if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES) - pr_info("num_copies(log_bytenr=%llu) = %d\n", - next_bytenr, num_copies); - for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) { - struct btrfsic_block *next_block; - struct btrfsic_block_data_ctx tmp_next_block_ctx; - struct btrfsic_block_link *l; - - if (btrfsic_map_block(state, next_bytenr, - state->metablock_size, - &tmp_next_block_ctx, - mirror_num)) { - pr_info("btrfsic: btrfsic_map_block(bytenr @%llu, mirror %d) failed!\n", - next_bytenr, mirror_num); - ret = -1; - goto out; - } - - next_block = btrfsic_block_lookup_or_add( - state, &tmp_next_block_ctx, - additional_string, 1, 1, 0, - mirror_num, NULL); - if (NULL == next_block) { - btrfsic_release_block_ctx(&tmp_next_block_ctx); - ret = -1; - goto out; - } - - next_block->disk_key = tmp_disk_key; - next_block->generation = BTRFSIC_GENERATION_UNKNOWN; - l = btrfsic_block_link_lookup_or_add( - state, &tmp_next_block_ctx, - next_block, superblock_tmp, - BTRFSIC_GENERATION_UNKNOWN); - btrfsic_release_block_ctx(&tmp_next_block_ctx); - if (NULL == l) { - ret = -1; - goto out; - } - } - } - if (state->print_mask & BTRFSIC_PRINT_MASK_INITIAL_ALL_TREES) - btrfsic_dump_tree_sub(state, superblock_tmp, 0); - -out: - put_page(page); - return ret; -} - -static struct btrfsic_stack_frame *btrfsic_stack_frame_alloc(void) -{ - struct btrfsic_stack_frame *sf; - - sf = kzalloc(sizeof(*sf), GFP_NOFS); - if (sf) - sf->magic = BTRFSIC_BLOCK_STACK_FRAME_MAGIC_NUMBER; - return sf; -} - -static void btrfsic_stack_frame_free(struct btrfsic_stack_frame *sf) -{ - BUG_ON(!(NULL == sf || - BTRFSIC_BLOCK_STACK_FRAME_MAGIC_NUMBER == sf->magic)); - kfree(sf); -} - -static noinline_for_stack int btrfsic_process_metablock( - struct btrfsic_state *state, - struct btrfsic_block *const first_block, - struct btrfsic_block_data_ctx *const first_block_ctx, - int first_limit_nesting, int force_iodone_flag) -{ - struct btrfsic_stack_frame initial_stack_frame = { 0 }; - struct btrfsic_stack_frame *sf; - struct btrfsic_stack_frame *next_stack; - struct btrfs_header *const first_hdr = - (struct btrfs_header *)first_block_ctx->datav[0]; - - BUG_ON(!first_hdr); - sf = &initial_stack_frame; - sf->error = 0; - sf->i = -1; - sf->limit_nesting = first_limit_nesting; - sf->block = first_block; - sf->block_ctx = first_block_ctx; - sf->next_block = NULL; - sf->hdr = first_hdr; - sf->prev = NULL; - -continue_with_new_stack_frame: - sf->block->generation = btrfs_stack_header_generation(sf->hdr); - if (0 == sf->hdr->level) { - struct btrfs_leaf *const leafhdr = - (struct btrfs_leaf *)sf->hdr; - - if (-1 == sf->i) { - sf->nr = btrfs_stack_header_nritems(&leafhdr->header); - - if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) - pr_info("leaf %llu items %d generation %llu owner %llu\n", - sf->block_ctx->start, sf->nr, - btrfs_stack_header_generation( - &leafhdr->header), - btrfs_stack_header_owner( - &leafhdr->header)); - } - -continue_with_current_leaf_stack_frame: - if (0 == sf->num_copies || sf->mirror_num > sf->num_copies) { - sf->i++; - sf->num_copies = 0; - } - - if (sf->i < sf->nr) { - struct btrfs_item disk_item; - u32 disk_item_offset = - (uintptr_t)(leafhdr->items + sf->i) - - (uintptr_t)leafhdr; - struct btrfs_disk_key *disk_key; - u8 type; - u32 item_offset; - u32 item_size; - - if (disk_item_offset + sizeof(struct btrfs_item) > - sf->block_ctx->len) { -leaf_item_out_of_bounce_error: - pr_info( - "btrfsic: leaf item out of bounce at logical %llu, dev %pg\n", - sf->block_ctx->start, - sf->block_ctx->dev->bdev); - goto one_stack_frame_backwards; - } - btrfsic_read_from_block_data(sf->block_ctx, - &disk_item, - disk_item_offset, - sizeof(struct btrfs_item)); - item_offset = btrfs_stack_item_offset(&disk_item); - item_size = btrfs_stack_item_size(&disk_item); - disk_key = &disk_item.key; - type = btrfs_disk_key_type(disk_key); - - if (BTRFS_ROOT_ITEM_KEY == type) { - struct btrfs_root_item root_item; - u32 root_item_offset; - u64 next_bytenr; - - root_item_offset = item_offset + - offsetof(struct btrfs_leaf, items); - if (root_item_offset + item_size > - sf->block_ctx->len) - goto leaf_item_out_of_bounce_error; - btrfsic_read_from_block_data( - sf->block_ctx, &root_item, - root_item_offset, - item_size); - next_bytenr = btrfs_root_bytenr(&root_item); - - sf->error = - btrfsic_create_link_to_next_block( - state, - sf->block, - sf->block_ctx, - next_bytenr, - sf->limit_nesting, - &sf->next_block_ctx, - &sf->next_block, - force_iodone_flag, - &sf->num_copies, - &sf->mirror_num, - disk_key, - btrfs_root_generation( - &root_item)); - if (sf->error) - goto one_stack_frame_backwards; - - if (NULL != sf->next_block) { - struct btrfs_header *const next_hdr = - (struct btrfs_header *) - sf->next_block_ctx.datav[0]; - - next_stack = - btrfsic_stack_frame_alloc(); - if (NULL == next_stack) { - sf->error = -1; - btrfsic_release_block_ctx( - &sf-> - next_block_ctx); - goto one_stack_frame_backwards; - } - - next_stack->i = -1; - next_stack->block = sf->next_block; - next_stack->block_ctx = - &sf->next_block_ctx; - next_stack->next_block = NULL; - next_stack->hdr = next_hdr; - next_stack->limit_nesting = - sf->limit_nesting - 1; - next_stack->prev = sf; - sf = next_stack; - goto continue_with_new_stack_frame; - } - } else if (BTRFS_EXTENT_DATA_KEY == type && - state->include_extent_data) { - sf->error = btrfsic_handle_extent_data( - state, - sf->block, - sf->block_ctx, - item_offset, - force_iodone_flag); - if (sf->error) - goto one_stack_frame_backwards; - } - - goto continue_with_current_leaf_stack_frame; - } - } else { - struct btrfs_node *const nodehdr = (struct btrfs_node *)sf->hdr; - - if (-1 == sf->i) { - sf->nr = btrfs_stack_header_nritems(&nodehdr->header); - - if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) - pr_info("node %llu level %d items %d generation %llu owner %llu\n", - sf->block_ctx->start, - nodehdr->header.level, sf->nr, - btrfs_stack_header_generation( - &nodehdr->header), - btrfs_stack_header_owner( - &nodehdr->header)); - } - -continue_with_current_node_stack_frame: - if (0 == sf->num_copies || sf->mirror_num > sf->num_copies) { - sf->i++; - sf->num_copies = 0; - } - - if (sf->i < sf->nr) { - struct btrfs_key_ptr key_ptr; - u32 key_ptr_offset; - u64 next_bytenr; - - key_ptr_offset = (uintptr_t)(nodehdr->ptrs + sf->i) - - (uintptr_t)nodehdr; - if (key_ptr_offset + sizeof(struct btrfs_key_ptr) > - sf->block_ctx->len) { - pr_info( - "btrfsic: node item out of bounce at logical %llu, dev %pg\n", - sf->block_ctx->start, - sf->block_ctx->dev->bdev); - goto one_stack_frame_backwards; - } - btrfsic_read_from_block_data( - sf->block_ctx, &key_ptr, key_ptr_offset, - sizeof(struct btrfs_key_ptr)); - next_bytenr = btrfs_stack_key_blockptr(&key_ptr); - - sf->error = btrfsic_create_link_to_next_block( - state, - sf->block, - sf->block_ctx, - next_bytenr, - sf->limit_nesting, - &sf->next_block_ctx, - &sf->next_block, - force_iodone_flag, - &sf->num_copies, - &sf->mirror_num, - &key_ptr.key, - btrfs_stack_key_generation(&key_ptr)); - if (sf->error) - goto one_stack_frame_backwards; - - if (NULL != sf->next_block) { - struct btrfs_header *const next_hdr = - (struct btrfs_header *) - sf->next_block_ctx.datav[0]; - - next_stack = btrfsic_stack_frame_alloc(); - if (NULL == next_stack) { - sf->error = -1; - goto one_stack_frame_backwards; - } - - next_stack->i = -1; - next_stack->block = sf->next_block; - next_stack->block_ctx = &sf->next_block_ctx; - next_stack->next_block = NULL; - next_stack->hdr = next_hdr; - next_stack->limit_nesting = - sf->limit_nesting - 1; - next_stack->prev = sf; - sf = next_stack; - goto continue_with_new_stack_frame; - } - - goto continue_with_current_node_stack_frame; - } - } - -one_stack_frame_backwards: - if (NULL != sf->prev) { - struct btrfsic_stack_frame *const prev = sf->prev; - - /* the one for the initial block is freed in the caller */ - btrfsic_release_block_ctx(sf->block_ctx); - - if (sf->error) { - prev->error = sf->error; - btrfsic_stack_frame_free(sf); - sf = prev; - goto one_stack_frame_backwards; - } - - btrfsic_stack_frame_free(sf); - sf = prev; - goto continue_with_new_stack_frame; - } else { - BUG_ON(&initial_stack_frame != sf); - } - - return sf->error; -} - -static void btrfsic_read_from_block_data( - struct btrfsic_block_data_ctx *block_ctx, - void *dstv, u32 offset, size_t len) -{ - size_t cur; - size_t pgoff; - char *kaddr; - char *dst = (char *)dstv; - size_t start_offset = offset_in_page(block_ctx->start); - unsigned long i = (start_offset + offset) >> PAGE_SHIFT; - - WARN_ON(offset + len > block_ctx->len); - pgoff = offset_in_page(start_offset + offset); - - while (len > 0) { - cur = min(len, ((size_t)PAGE_SIZE - pgoff)); - BUG_ON(i >= DIV_ROUND_UP(block_ctx->len, PAGE_SIZE)); - kaddr = block_ctx->datav[i]; - memcpy(dst, kaddr + pgoff, cur); - - dst += cur; - len -= cur; - pgoff = 0; - i++; - } -} - -static int btrfsic_create_link_to_next_block( - struct btrfsic_state *state, - struct btrfsic_block *block, - struct btrfsic_block_data_ctx *block_ctx, - u64 next_bytenr, - int limit_nesting, - struct btrfsic_block_data_ctx *next_block_ctx, - struct btrfsic_block **next_blockp, - int force_iodone_flag, - int *num_copiesp, int *mirror_nump, - struct btrfs_disk_key *disk_key, - u64 parent_generation) -{ - struct btrfs_fs_info *fs_info = state->fs_info; - struct btrfsic_block *next_block = NULL; - int ret; - struct btrfsic_block_link *l; - int did_alloc_block_link; - int block_was_created; - - *next_blockp = NULL; - if (0 == *num_copiesp) { - *num_copiesp = btrfs_num_copies(fs_info, next_bytenr, - state->metablock_size); - if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES) - pr_info("num_copies(log_bytenr=%llu) = %d\n", - next_bytenr, *num_copiesp); - *mirror_nump = 1; - } - - if (*mirror_nump > *num_copiesp) - return 0; - - if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) - pr_info("btrfsic_create_link_to_next_block(mirror_num=%d)\n", - *mirror_nump); - ret = btrfsic_map_block(state, next_bytenr, - state->metablock_size, - next_block_ctx, *mirror_nump); - if (ret) { - pr_info("btrfsic: btrfsic_map_block(@%llu, mirror=%d) failed!\n", - next_bytenr, *mirror_nump); - btrfsic_release_block_ctx(next_block_ctx); - *next_blockp = NULL; - return -1; - } - - next_block = btrfsic_block_lookup_or_add(state, - next_block_ctx, "referenced ", - 1, force_iodone_flag, - !force_iodone_flag, - *mirror_nump, - &block_was_created); - if (NULL == next_block) { - btrfsic_release_block_ctx(next_block_ctx); - *next_blockp = NULL; - return -1; - } - if (block_was_created) { - l = NULL; - next_block->generation = BTRFSIC_GENERATION_UNKNOWN; - } else { - if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) { - if (next_block->logical_bytenr != next_bytenr && - !(!next_block->is_metadata && - 0 == next_block->logical_bytenr)) - pr_info( -"referenced block @%llu (%pg/%llu/%d) found in hash table, %c, bytenr mismatch (!= stored %llu)\n", - next_bytenr, next_block_ctx->dev->bdev, - next_block_ctx->dev_bytenr, *mirror_nump, - btrfsic_get_block_type(state, - next_block), - next_block->logical_bytenr); - else - pr_info( - "referenced block @%llu (%pg/%llu/%d) found in hash table, %c\n", - next_bytenr, next_block_ctx->dev->bdev, - next_block_ctx->dev_bytenr, *mirror_nump, - btrfsic_get_block_type(state, - next_block)); - } - next_block->logical_bytenr = next_bytenr; - - next_block->mirror_num = *mirror_nump; - l = btrfsic_block_link_hashtable_lookup( - next_block_ctx->dev->bdev, - next_block_ctx->dev_bytenr, - block_ctx->dev->bdev, - block_ctx->dev_bytenr, - &state->block_link_hashtable); - } - - next_block->disk_key = *disk_key; - if (NULL == l) { - l = btrfsic_block_link_alloc(); - if (NULL == l) { - btrfsic_release_block_ctx(next_block_ctx); - *next_blockp = NULL; - return -1; - } - - did_alloc_block_link = 1; - l->block_ref_to = next_block; - l->block_ref_from = block; - l->ref_cnt = 1; - l->parent_generation = parent_generation; - - if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) - btrfsic_print_add_link(state, l); - - list_add(&l->node_ref_to, &block->ref_to_list); - list_add(&l->node_ref_from, &next_block->ref_from_list); - - btrfsic_block_link_hashtable_add(l, - &state->block_link_hashtable); - } else { - did_alloc_block_link = 0; - if (0 == limit_nesting) { - l->ref_cnt++; - l->parent_generation = parent_generation; - if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) - btrfsic_print_add_link(state, l); - } - } - - if (limit_nesting > 0 && did_alloc_block_link) { - ret = btrfsic_read_block(state, next_block_ctx); - if (ret < (int)next_block_ctx->len) { - pr_info("btrfsic: read block @logical %llu failed!\n", - next_bytenr); - btrfsic_release_block_ctx(next_block_ctx); - *next_blockp = NULL; - return -1; - } - - *next_blockp = next_block; - } else { - *next_blockp = NULL; - } - (*mirror_nump)++; - - return 0; -} - -static int btrfsic_handle_extent_data( - struct btrfsic_state *state, - struct btrfsic_block *block, - struct btrfsic_block_data_ctx *block_ctx, - u32 item_offset, int force_iodone_flag) -{ - struct btrfs_fs_info *fs_info = state->fs_info; - struct btrfs_file_extent_item file_extent_item; - u64 file_extent_item_offset; - u64 next_bytenr; - u64 num_bytes; - u64 generation; - struct btrfsic_block_link *l; - int ret; - - file_extent_item_offset = offsetof(struct btrfs_leaf, items) + - item_offset; - if (file_extent_item_offset + - offsetof(struct btrfs_file_extent_item, disk_num_bytes) > - block_ctx->len) { - pr_info("btrfsic: file item out of bounce at logical %llu, dev %pg\n", - block_ctx->start, block_ctx->dev->bdev); - return -1; - } - - btrfsic_read_from_block_data(block_ctx, &file_extent_item, - file_extent_item_offset, - offsetof(struct btrfs_file_extent_item, disk_num_bytes)); - if (BTRFS_FILE_EXTENT_REG != file_extent_item.type || - btrfs_stack_file_extent_disk_bytenr(&file_extent_item) == 0) { - if (state->print_mask & BTRFSIC_PRINT_MASK_VERY_VERBOSE) - pr_info("extent_data: type %u, disk_bytenr = %llu\n", - file_extent_item.type, - btrfs_stack_file_extent_disk_bytenr( - &file_extent_item)); - return 0; - } - - if (file_extent_item_offset + sizeof(struct btrfs_file_extent_item) > - block_ctx->len) { - pr_info("btrfsic: file item out of bounce at logical %llu, dev %pg\n", - block_ctx->start, block_ctx->dev->bdev); - return -1; - } - btrfsic_read_from_block_data(block_ctx, &file_extent_item, - file_extent_item_offset, - sizeof(struct btrfs_file_extent_item)); - next_bytenr = btrfs_stack_file_extent_disk_bytenr(&file_extent_item); - if (btrfs_stack_file_extent_compression(&file_extent_item) == - BTRFS_COMPRESS_NONE) { - next_bytenr += btrfs_stack_file_extent_offset(&file_extent_item); - num_bytes = btrfs_stack_file_extent_num_bytes(&file_extent_item); - } else { - num_bytes = btrfs_stack_file_extent_disk_num_bytes(&file_extent_item); - } - generation = btrfs_stack_file_extent_generation(&file_extent_item); - - if (state->print_mask & BTRFSIC_PRINT_MASK_VERY_VERBOSE) - pr_info("extent_data: type %u, disk_bytenr = %llu, offset = %llu, num_bytes = %llu\n", - file_extent_item.type, - btrfs_stack_file_extent_disk_bytenr(&file_extent_item), - btrfs_stack_file_extent_offset(&file_extent_item), - num_bytes); - while (num_bytes > 0) { - u32 chunk_len; - int num_copies; - int mirror_num; - - if (num_bytes > state->datablock_size) - chunk_len = state->datablock_size; - else - chunk_len = num_bytes; - - num_copies = btrfs_num_copies(fs_info, next_bytenr, - state->datablock_size); - if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES) - pr_info("num_copies(log_bytenr=%llu) = %d\n", - next_bytenr, num_copies); - for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) { - struct btrfsic_block_data_ctx next_block_ctx; - struct btrfsic_block *next_block; - int block_was_created; - - if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) - pr_info("btrfsic_handle_extent_data(mirror_num=%d)\n", - mirror_num); - if (state->print_mask & BTRFSIC_PRINT_MASK_VERY_VERBOSE) - pr_info("\tdisk_bytenr = %llu, num_bytes %u\n", - next_bytenr, chunk_len); - ret = btrfsic_map_block(state, next_bytenr, - chunk_len, &next_block_ctx, - mirror_num); - if (ret) { - pr_info("btrfsic: btrfsic_map_block(@%llu, mirror=%d) failed!\n", - next_bytenr, mirror_num); - return -1; - } - - next_block = btrfsic_block_lookup_or_add( - state, - &next_block_ctx, - "referenced ", - 0, - force_iodone_flag, - !force_iodone_flag, - mirror_num, - &block_was_created); - if (NULL == next_block) { - btrfsic_release_block_ctx(&next_block_ctx); - return -1; - } - if (!block_was_created) { - if ((state->print_mask & - BTRFSIC_PRINT_MASK_VERBOSE) && - next_block->logical_bytenr != next_bytenr && - !(!next_block->is_metadata && - 0 == next_block->logical_bytenr)) { - pr_info( -"referenced block @%llu (%pg/%llu/%d) found in hash table, D, bytenr mismatch (!= stored %llu)\n", - next_bytenr, - next_block_ctx.dev->bdev, - next_block_ctx.dev_bytenr, - mirror_num, - next_block->logical_bytenr); - } - next_block->logical_bytenr = next_bytenr; - next_block->mirror_num = mirror_num; - } - - l = btrfsic_block_link_lookup_or_add(state, - &next_block_ctx, - next_block, block, - generation); - btrfsic_release_block_ctx(&next_block_ctx); - if (NULL == l) - return -1; - } - - next_bytenr += chunk_len; - num_bytes -= chunk_len; - } - - return 0; -} - -static int btrfsic_map_block(struct btrfsic_state *state, u64 bytenr, u32 len, - struct btrfsic_block_data_ctx *block_ctx_out, - int mirror_num) -{ - struct btrfs_fs_info *fs_info = state->fs_info; - int ret; - u64 length; - struct btrfs_io_context *bioc = NULL; - struct btrfs_io_stripe smap, *map; - struct btrfs_device *device; - - length = len; - ret = btrfs_map_block(fs_info, BTRFS_MAP_READ, bytenr, &length, &bioc, - NULL, &mirror_num, 0); - if (ret) { - block_ctx_out->start = 0; - block_ctx_out->dev_bytenr = 0; - block_ctx_out->len = 0; - block_ctx_out->dev = NULL; - block_ctx_out->datav = NULL; - block_ctx_out->pagev = NULL; - block_ctx_out->mem_to_free = NULL; - - return ret; - } - - if (bioc) - map = &bioc->stripes[0]; - else - map = &smap; - - device = map->dev; - if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state) || - !device->bdev || !device->name) - block_ctx_out->dev = NULL; - else - block_ctx_out->dev = btrfsic_dev_state_lookup( - device->bdev->bd_dev); - block_ctx_out->dev_bytenr = map->physical; - block_ctx_out->start = bytenr; - block_ctx_out->len = len; - block_ctx_out->datav = NULL; - block_ctx_out->pagev = NULL; - block_ctx_out->mem_to_free = NULL; - - kfree(bioc); - if (NULL == block_ctx_out->dev) { - ret = -ENXIO; - pr_info("btrfsic: error, cannot lookup dev (#1)!\n"); - } - - return ret; -} - -static void btrfsic_release_block_ctx(struct btrfsic_block_data_ctx *block_ctx) -{ - if (block_ctx->mem_to_free) { - unsigned int num_pages; - - BUG_ON(!block_ctx->datav); - BUG_ON(!block_ctx->pagev); - num_pages = (block_ctx->len + (u64)PAGE_SIZE - 1) >> - PAGE_SHIFT; - /* Pages must be unmapped in reverse order */ - while (num_pages > 0) { - num_pages--; - if (block_ctx->datav[num_pages]) - block_ctx->datav[num_pages] = NULL; - if (block_ctx->pagev[num_pages]) { - __free_page(block_ctx->pagev[num_pages]); - block_ctx->pagev[num_pages] = NULL; - } - } - - kfree(block_ctx->mem_to_free); - block_ctx->mem_to_free = NULL; - block_ctx->pagev = NULL; - block_ctx->datav = NULL; - } -} - -static int btrfsic_read_block(struct btrfsic_state *state, - struct btrfsic_block_data_ctx *block_ctx) -{ - unsigned int num_pages; - unsigned int i; - size_t size; - u64 dev_bytenr; - int ret; - - BUG_ON(block_ctx->datav); - BUG_ON(block_ctx->pagev); - BUG_ON(block_ctx->mem_to_free); - if (!PAGE_ALIGNED(block_ctx->dev_bytenr)) { - pr_info("btrfsic: read_block() with unaligned bytenr %llu\n", - block_ctx->dev_bytenr); - return -1; - } - - num_pages = (block_ctx->len + (u64)PAGE_SIZE - 1) >> - PAGE_SHIFT; - size = sizeof(*block_ctx->datav) + sizeof(*block_ctx->pagev); - block_ctx->mem_to_free = kcalloc(num_pages, size, GFP_NOFS); - if (!block_ctx->mem_to_free) - return -ENOMEM; - block_ctx->datav = block_ctx->mem_to_free; - block_ctx->pagev = (struct page **)(block_ctx->datav + num_pages); - ret = btrfs_alloc_page_array(num_pages, block_ctx->pagev); - if (ret) - return ret; - - dev_bytenr = block_ctx->dev_bytenr; - for (i = 0; i < num_pages;) { - struct bio *bio; - unsigned int j; - - bio = bio_alloc(block_ctx->dev->bdev, num_pages - i, - REQ_OP_READ, GFP_NOFS); - bio->bi_iter.bi_sector = dev_bytenr >> SECTOR_SHIFT; - - for (j = i; j < num_pages; j++) { - ret = bio_add_page(bio, block_ctx->pagev[j], - PAGE_SIZE, 0); - if (PAGE_SIZE != ret) - break; - } - if (j == i) { - pr_info("btrfsic: error, failed to add a single page!\n"); - return -1; - } - if (submit_bio_wait(bio)) { - pr_info("btrfsic: read error at logical %llu dev %pg!\n", - block_ctx->start, block_ctx->dev->bdev); - bio_put(bio); - return -1; - } - bio_put(bio); - dev_bytenr += (j - i) * PAGE_SIZE; - i = j; - } - for (i = 0; i < num_pages; i++) - block_ctx->datav[i] = page_address(block_ctx->pagev[i]); - - return block_ctx->len; -} - -static void btrfsic_dump_database(struct btrfsic_state *state) -{ - const struct btrfsic_block *b_all; - - BUG_ON(NULL == state); - - pr_info("all_blocks_list:\n"); - list_for_each_entry(b_all, &state->all_blocks_list, all_blocks_node) { - const struct btrfsic_block_link *l; - - pr_info("%c-block @%llu (%pg/%llu/%d)\n", - btrfsic_get_block_type(state, b_all), - b_all->logical_bytenr, b_all->dev_state->bdev, - b_all->dev_bytenr, b_all->mirror_num); - - list_for_each_entry(l, &b_all->ref_to_list, node_ref_to) { - pr_info( - " %c @%llu (%pg/%llu/%d) refers %u* to %c @%llu (%pg/%llu/%d)\n", - btrfsic_get_block_type(state, b_all), - b_all->logical_bytenr, b_all->dev_state->bdev, - b_all->dev_bytenr, b_all->mirror_num, - l->ref_cnt, - btrfsic_get_block_type(state, l->block_ref_to), - l->block_ref_to->logical_bytenr, - l->block_ref_to->dev_state->bdev, - l->block_ref_to->dev_bytenr, - l->block_ref_to->mirror_num); - } - - list_for_each_entry(l, &b_all->ref_from_list, node_ref_from) { - pr_info( - " %c @%llu (%pg/%llu/%d) is ref %u* from %c @%llu (%pg/%llu/%d)\n", - btrfsic_get_block_type(state, b_all), - b_all->logical_bytenr, b_all->dev_state->bdev, - b_all->dev_bytenr, b_all->mirror_num, - l->ref_cnt, - btrfsic_get_block_type(state, l->block_ref_from), - l->block_ref_from->logical_bytenr, - l->block_ref_from->dev_state->bdev, - l->block_ref_from->dev_bytenr, - l->block_ref_from->mirror_num); - } - - pr_info("\n"); - } -} - -/* - * Test whether the disk block contains a tree block (leaf or node) - * (note that this test fails for the super block) - */ -static noinline_for_stack int btrfsic_test_for_metadata( - struct btrfsic_state *state, - char **datav, unsigned int num_pages) -{ - struct btrfs_fs_info *fs_info = state->fs_info; - SHASH_DESC_ON_STACK(shash, fs_info->csum_shash); - struct btrfs_header *h; - u8 csum[BTRFS_CSUM_SIZE]; - unsigned int i; - - if (num_pages * PAGE_SIZE < state->metablock_size) - return 1; /* not metadata */ - num_pages = state->metablock_size >> PAGE_SHIFT; - h = (struct btrfs_header *)datav[0]; - - if (memcmp(h->fsid, fs_info->fs_devices->fsid, BTRFS_FSID_SIZE)) - return 1; - - shash->tfm = fs_info->csum_shash; - crypto_shash_init(shash); - - for (i = 0; i < num_pages; i++) { - u8 *data = i ? datav[i] : (datav[i] + BTRFS_CSUM_SIZE); - size_t sublen = i ? PAGE_SIZE : - (PAGE_SIZE - BTRFS_CSUM_SIZE); - - crypto_shash_update(shash, data, sublen); - } - crypto_shash_final(shash, csum); - if (memcmp(csum, h->csum, fs_info->csum_size)) - return 1; - - return 0; /* is metadata */ -} - -static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state, - u64 dev_bytenr, char **mapped_datav, - unsigned int num_pages, - struct bio *bio, int *bio_is_patched, - blk_opf_t submit_bio_bh_rw) -{ - int is_metadata; - struct btrfsic_block *block; - struct btrfsic_block_data_ctx block_ctx; - int ret; - struct btrfsic_state *state = dev_state->state; - struct block_device *bdev = dev_state->bdev; - unsigned int processed_len; - - if (NULL != bio_is_patched) - *bio_is_patched = 0; - -again: - if (num_pages == 0) - return; - - processed_len = 0; - is_metadata = (0 == btrfsic_test_for_metadata(state, mapped_datav, - num_pages)); - - block = btrfsic_block_hashtable_lookup(bdev, dev_bytenr, - &state->block_hashtable); - if (NULL != block) { - u64 bytenr = 0; - struct btrfsic_block_link *l, *tmp; - - if (block->is_superblock) { - bytenr = btrfs_super_bytenr((struct btrfs_super_block *) - mapped_datav[0]); - if (num_pages * PAGE_SIZE < - BTRFS_SUPER_INFO_SIZE) { - pr_info("btrfsic: cannot work with too short bios!\n"); - return; - } - is_metadata = 1; - BUG_ON(!PAGE_ALIGNED(BTRFS_SUPER_INFO_SIZE)); - processed_len = BTRFS_SUPER_INFO_SIZE; - if (state->print_mask & - BTRFSIC_PRINT_MASK_TREE_BEFORE_SB_WRITE) { - pr_info("[before new superblock is written]:\n"); - btrfsic_dump_tree_sub(state, block, 0); - } - } - if (is_metadata) { - if (!block->is_superblock) { - if (num_pages * PAGE_SIZE < - state->metablock_size) { - pr_info("btrfsic: cannot work with too short bios!\n"); - return; - } - processed_len = state->metablock_size; - bytenr = btrfs_stack_header_bytenr( - (struct btrfs_header *) - mapped_datav[0]); - btrfsic_cmp_log_and_dev_bytenr(state, bytenr, - dev_state, - dev_bytenr); - } - if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) { - if (block->logical_bytenr != bytenr && - !(!block->is_metadata && - block->logical_bytenr == 0)) - pr_info( -"written block @%llu (%pg/%llu/%d) found in hash table, %c, bytenr mismatch (!= stored %llu)\n", - bytenr, dev_state->bdev, - dev_bytenr, - block->mirror_num, - btrfsic_get_block_type(state, - block), - block->logical_bytenr); - else - pr_info( - "written block @%llu (%pg/%llu/%d) found in hash table, %c\n", - bytenr, dev_state->bdev, - dev_bytenr, block->mirror_num, - btrfsic_get_block_type(state, - block)); - } - block->logical_bytenr = bytenr; - } else { - if (num_pages * PAGE_SIZE < - state->datablock_size) { - pr_info("btrfsic: cannot work with too short bios!\n"); - return; - } - processed_len = state->datablock_size; - bytenr = block->logical_bytenr; - if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) - pr_info( - "written block @%llu (%pg/%llu/%d) found in hash table, %c\n", - bytenr, dev_state->bdev, dev_bytenr, - block->mirror_num, - btrfsic_get_block_type(state, block)); - } - - if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) - pr_info("ref_to_list: %cE, ref_from_list: %cE\n", - list_empty(&block->ref_to_list) ? ' ' : '!', - list_empty(&block->ref_from_list) ? ' ' : '!'); - if (btrfsic_is_block_ref_by_superblock(state, block, 0)) { - pr_info( -"btrfs: attempt to overwrite %c-block @%llu (%pg/%llu/%d), old(gen=%llu, objectid=%llu, type=%d, offset=%llu), new(gen=%llu), which is referenced by most recent superblock (superblockgen=%llu)!\n", - btrfsic_get_block_type(state, block), bytenr, - dev_state->bdev, dev_bytenr, block->mirror_num, - block->generation, - btrfs_disk_key_objectid(&block->disk_key), - block->disk_key.type, - btrfs_disk_key_offset(&block->disk_key), - btrfs_stack_header_generation( - (struct btrfs_header *) mapped_datav[0]), - state->max_superblock_generation); - btrfsic_dump_tree(state); - } - - if (!block->is_iodone && !block->never_written) { - pr_info( -"btrfs: attempt to overwrite %c-block @%llu (%pg/%llu/%d), oldgen=%llu, newgen=%llu, which is not yet iodone!\n", - btrfsic_get_block_type(state, block), bytenr, - dev_state->bdev, dev_bytenr, block->mirror_num, - block->generation, - btrfs_stack_header_generation( - (struct btrfs_header *) - mapped_datav[0])); - /* it would not be safe to go on */ - btrfsic_dump_tree(state); - goto continue_loop; - } - - /* - * Clear all references of this block. Do not free - * the block itself even if is not referenced anymore - * because it still carries valuable information - * like whether it was ever written and IO completed. - */ - list_for_each_entry_safe(l, tmp, &block->ref_to_list, - node_ref_to) { - if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) - btrfsic_print_rem_link(state, l); - l->ref_cnt--; - if (0 == l->ref_cnt) { - list_del(&l->node_ref_to); - list_del(&l->node_ref_from); - btrfsic_block_link_hashtable_remove(l); - btrfsic_block_link_free(l); - } - } - - block_ctx.dev = dev_state; - block_ctx.dev_bytenr = dev_bytenr; - block_ctx.start = bytenr; - block_ctx.len = processed_len; - block_ctx.pagev = NULL; - block_ctx.mem_to_free = NULL; - block_ctx.datav = mapped_datav; - - if (is_metadata || state->include_extent_data) { - block->never_written = 0; - block->iodone_w_error = 0; - if (NULL != bio) { - block->is_iodone = 0; - BUG_ON(NULL == bio_is_patched); - if (!*bio_is_patched) { - block->orig_bio_private = - bio->bi_private; - block->orig_bio_end_io = - bio->bi_end_io; - block->next_in_same_bio = NULL; - bio->bi_private = block; - bio->bi_end_io = btrfsic_bio_end_io; - *bio_is_patched = 1; - } else { - struct btrfsic_block *chained_block = - (struct btrfsic_block *) - bio->bi_private; - - BUG_ON(NULL == chained_block); - block->orig_bio_private = - chained_block->orig_bio_private; - block->orig_bio_end_io = - chained_block->orig_bio_end_io; - block->next_in_same_bio = chained_block; - bio->bi_private = block; - } - } else { - block->is_iodone = 1; - block->orig_bio_private = NULL; - block->orig_bio_end_io = NULL; - block->next_in_same_bio = NULL; - } - } - - block->flush_gen = dev_state->last_flush_gen + 1; - block->submit_bio_bh_rw = submit_bio_bh_rw; - if (is_metadata) { - block->logical_bytenr = bytenr; - block->is_metadata = 1; - if (block->is_superblock) { - BUG_ON(PAGE_SIZE != - BTRFS_SUPER_INFO_SIZE); - ret = btrfsic_process_written_superblock( - state, - block, - (struct btrfs_super_block *) - mapped_datav[0]); - if (state->print_mask & - BTRFSIC_PRINT_MASK_TREE_AFTER_SB_WRITE) { - pr_info("[after new superblock is written]:\n"); - btrfsic_dump_tree_sub(state, block, 0); - } - } else { - block->mirror_num = 0; /* unknown */ - ret = btrfsic_process_metablock( - state, - block, - &block_ctx, - 0, 0); - } - if (ret) - pr_info("btrfsic: btrfsic_process_metablock(root @%llu) failed!\n", - dev_bytenr); - } else { - block->is_metadata = 0; - block->mirror_num = 0; /* unknown */ - block->generation = BTRFSIC_GENERATION_UNKNOWN; - if (!state->include_extent_data - && list_empty(&block->ref_from_list)) { - /* - * disk block is overwritten with extent - * data (not meta data) and we are configured - * to not include extent data: take the - * chance and free the block's memory - */ - btrfsic_block_hashtable_remove(block); - list_del(&block->all_blocks_node); - btrfsic_block_free(block); - } - } - btrfsic_release_block_ctx(&block_ctx); - } else { - /* block has not been found in hash table */ - u64 bytenr; - - if (!is_metadata) { - processed_len = state->datablock_size; - if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) - pr_info( - "written block (%pg/%llu/?) !found in hash table, D\n", - dev_state->bdev, dev_bytenr); - if (!state->include_extent_data) { - /* ignore that written D block */ - goto continue_loop; - } - - /* this is getting ugly for the - * include_extent_data case... */ - bytenr = 0; /* unknown */ - } else { - processed_len = state->metablock_size; - bytenr = btrfs_stack_header_bytenr( - (struct btrfs_header *) - mapped_datav[0]); - btrfsic_cmp_log_and_dev_bytenr(state, bytenr, dev_state, - dev_bytenr); - if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) - pr_info( - "written block @%llu (%pg/%llu/?) !found in hash table, M\n", - bytenr, dev_state->bdev, dev_bytenr); - } - - block_ctx.dev = dev_state; - block_ctx.dev_bytenr = dev_bytenr; - block_ctx.start = bytenr; - block_ctx.len = processed_len; - block_ctx.pagev = NULL; - block_ctx.mem_to_free = NULL; - block_ctx.datav = mapped_datav; - - block = btrfsic_block_alloc(); - if (NULL == block) { - btrfsic_release_block_ctx(&block_ctx); - goto continue_loop; - } - block->dev_state = dev_state; - block->dev_bytenr = dev_bytenr; - block->logical_bytenr = bytenr; - block->is_metadata = is_metadata; - block->never_written = 0; - block->iodone_w_error = 0; - block->mirror_num = 0; /* unknown */ - block->flush_gen = dev_state->last_flush_gen + 1; - block->submit_bio_bh_rw = submit_bio_bh_rw; - if (NULL != bio) { - block->is_iodone = 0; - BUG_ON(NULL == bio_is_patched); - if (!*bio_is_patched) { - block->orig_bio_private = bio->bi_private; - block->orig_bio_end_io = bio->bi_end_io; - block->next_in_same_bio = NULL; - bio->bi_private = block; - bio->bi_end_io = btrfsic_bio_end_io; - *bio_is_patched = 1; - } else { - struct btrfsic_block *chained_block = - (struct btrfsic_block *) - bio->bi_private; - - BUG_ON(NULL == chained_block); - block->orig_bio_private = - chained_block->orig_bio_private; - block->orig_bio_end_io = - chained_block->orig_bio_end_io; - block->next_in_same_bio = chained_block; - bio->bi_private = block; - } - } else { - block->is_iodone = 1; - block->orig_bio_private = NULL; - block->orig_bio_end_io = NULL; - block->next_in_same_bio = NULL; - } - if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) - pr_info("new written %c-block @%llu (%pg/%llu/%d)\n", - is_metadata ? 'M' : 'D', - block->logical_bytenr, block->dev_state->bdev, - block->dev_bytenr, block->mirror_num); - list_add(&block->all_blocks_node, &state->all_blocks_list); - btrfsic_block_hashtable_add(block, &state->block_hashtable); - - if (is_metadata) { - ret = btrfsic_process_metablock(state, block, - &block_ctx, 0, 0); - if (ret) - pr_info("btrfsic: process_metablock(root @%llu) failed!\n", - dev_bytenr); - } - btrfsic_release_block_ctx(&block_ctx); - } - -continue_loop: - BUG_ON(!processed_len); - dev_bytenr += processed_len; - mapped_datav += processed_len >> PAGE_SHIFT; - num_pages -= processed_len >> PAGE_SHIFT; - goto again; -} - -static void btrfsic_bio_end_io(struct bio *bp) -{ - struct btrfsic_block *block = bp->bi_private; - int iodone_w_error; - - /* mutex is not held! This is not save if IO is not yet completed - * on umount */ - iodone_w_error = 0; - if (bp->bi_status) - iodone_w_error = 1; - - BUG_ON(NULL == block); - bp->bi_private = block->orig_bio_private; - bp->bi_end_io = block->orig_bio_end_io; - - do { - struct btrfsic_block *next_block; - struct btrfsic_dev_state *const dev_state = block->dev_state; - - if ((dev_state->state->print_mask & - BTRFSIC_PRINT_MASK_END_IO_BIO_BH)) - pr_info("bio_end_io(err=%d) for %c @%llu (%pg/%llu/%d)\n", - bp->bi_status, - btrfsic_get_block_type(dev_state->state, block), - block->logical_bytenr, dev_state->bdev, - block->dev_bytenr, block->mirror_num); - next_block = block->next_in_same_bio; - block->iodone_w_error = iodone_w_error; - if (block->submit_bio_bh_rw & REQ_PREFLUSH) { - dev_state->last_flush_gen++; - if ((dev_state->state->print_mask & - BTRFSIC_PRINT_MASK_END_IO_BIO_BH)) - pr_info("bio_end_io() new %pg flush_gen=%llu\n", - dev_state->bdev, - dev_state->last_flush_gen); - } - if (block->submit_bio_bh_rw & REQ_FUA) - block->flush_gen = 0; /* FUA completed means block is - * on disk */ - block->is_iodone = 1; /* for FLUSH, this releases the block */ - block = next_block; - } while (NULL != block); - - bp->bi_end_io(bp); -} - -static int btrfsic_process_written_superblock( - struct btrfsic_state *state, - struct btrfsic_block *const superblock, - struct btrfs_super_block *const super_hdr) -{ - struct btrfs_fs_info *fs_info = state->fs_info; - int pass; - - superblock->generation = btrfs_super_generation(super_hdr); - if (!(superblock->generation > state->max_superblock_generation || - 0 == state->max_superblock_generation)) { - if (state->print_mask & BTRFSIC_PRINT_MASK_SUPERBLOCK_WRITE) - pr_info( - "btrfsic: superblock @%llu (%pg/%llu/%d) with old gen %llu <= %llu\n", - superblock->logical_bytenr, - superblock->dev_state->bdev, - superblock->dev_bytenr, superblock->mirror_num, - btrfs_super_generation(super_hdr), - state->max_superblock_generation); - } else { - if (state->print_mask & BTRFSIC_PRINT_MASK_SUPERBLOCK_WRITE) - pr_info( - "btrfsic: got new superblock @%llu (%pg/%llu/%d) with new gen %llu > %llu\n", - superblock->logical_bytenr, - superblock->dev_state->bdev, - superblock->dev_bytenr, superblock->mirror_num, - btrfs_super_generation(super_hdr), - state->max_superblock_generation); - - state->max_superblock_generation = - btrfs_super_generation(super_hdr); - state->latest_superblock = superblock; - } - - for (pass = 0; pass < 3; pass++) { - int ret; - u64 next_bytenr; - struct btrfsic_block *next_block; - struct btrfsic_block_data_ctx tmp_next_block_ctx; - struct btrfsic_block_link *l; - int num_copies; - int mirror_num; - const char *additional_string = NULL; - struct btrfs_disk_key tmp_disk_key = {0}; - - btrfs_set_disk_key_objectid(&tmp_disk_key, - BTRFS_ROOT_ITEM_KEY); - btrfs_set_disk_key_objectid(&tmp_disk_key, 0); - - switch (pass) { - case 0: - btrfs_set_disk_key_objectid(&tmp_disk_key, - BTRFS_ROOT_TREE_OBJECTID); - additional_string = "root "; - next_bytenr = btrfs_super_root(super_hdr); - if (state->print_mask & - BTRFSIC_PRINT_MASK_ROOT_CHUNK_LOG_TREE_LOCATION) - pr_info("root@%llu\n", next_bytenr); - break; - case 1: - btrfs_set_disk_key_objectid(&tmp_disk_key, - BTRFS_CHUNK_TREE_OBJECTID); - additional_string = "chunk "; - next_bytenr = btrfs_super_chunk_root(super_hdr); - if (state->print_mask & - BTRFSIC_PRINT_MASK_ROOT_CHUNK_LOG_TREE_LOCATION) - pr_info("chunk@%llu\n", next_bytenr); - break; - case 2: - btrfs_set_disk_key_objectid(&tmp_disk_key, - BTRFS_TREE_LOG_OBJECTID); - additional_string = "log "; - next_bytenr = btrfs_super_log_root(super_hdr); - if (0 == next_bytenr) - continue; - if (state->print_mask & - BTRFSIC_PRINT_MASK_ROOT_CHUNK_LOG_TREE_LOCATION) - pr_info("log@%llu\n", next_bytenr); - break; - } - - num_copies = btrfs_num_copies(fs_info, next_bytenr, - BTRFS_SUPER_INFO_SIZE); - if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES) - pr_info("num_copies(log_bytenr=%llu) = %d\n", - next_bytenr, num_copies); - for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) { - int was_created; - - if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) - pr_info("btrfsic_process_written_superblock(mirror_num=%d)\n", mirror_num); - ret = btrfsic_map_block(state, next_bytenr, - BTRFS_SUPER_INFO_SIZE, - &tmp_next_block_ctx, - mirror_num); - if (ret) { - pr_info("btrfsic: btrfsic_map_block(@%llu, mirror=%d) failed!\n", - next_bytenr, mirror_num); - return -1; - } - - next_block = btrfsic_block_lookup_or_add( - state, - &tmp_next_block_ctx, - additional_string, - 1, 0, 1, - mirror_num, - &was_created); - if (NULL == next_block) { - btrfsic_release_block_ctx(&tmp_next_block_ctx); - return -1; - } - - next_block->disk_key = tmp_disk_key; - if (was_created) - next_block->generation = - BTRFSIC_GENERATION_UNKNOWN; - l = btrfsic_block_link_lookup_or_add( - state, - &tmp_next_block_ctx, - next_block, - superblock, - BTRFSIC_GENERATION_UNKNOWN); - btrfsic_release_block_ctx(&tmp_next_block_ctx); - if (NULL == l) - return -1; - } - } - - if (WARN_ON(-1 == btrfsic_check_all_ref_blocks(state, superblock, 0))) - btrfsic_dump_tree(state); - - return 0; -} - -static int btrfsic_check_all_ref_blocks(struct btrfsic_state *state, - struct btrfsic_block *const block, - int recursion_level) -{ - const struct btrfsic_block_link *l; - int ret = 0; - - if (recursion_level >= 3 + BTRFS_MAX_LEVEL) { - /* - * Note that this situation can happen and does not - * indicate an error in regular cases. It happens - * when disk blocks are freed and later reused. - * The check-integrity module is not aware of any - * block free operations, it just recognizes block - * write operations. Therefore it keeps the linkage - * information for a block until a block is - * rewritten. This can temporarily cause incorrect - * and even circular linkage information. This - * causes no harm unless such blocks are referenced - * by the most recent super block. - */ - if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) - pr_info("btrfsic: abort cyclic linkage (case 1).\n"); - - return ret; - } - - /* - * This algorithm is recursive because the amount of used stack - * space is very small and the max recursion depth is limited. - */ - list_for_each_entry(l, &block->ref_to_list, node_ref_to) { - if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) - pr_info( - "rl=%d, %c @%llu (%pg/%llu/%d) %u* refers to %c @%llu (%pg/%llu/%d)\n", - recursion_level, - btrfsic_get_block_type(state, block), - block->logical_bytenr, block->dev_state->bdev, - block->dev_bytenr, block->mirror_num, - l->ref_cnt, - btrfsic_get_block_type(state, l->block_ref_to), - l->block_ref_to->logical_bytenr, - l->block_ref_to->dev_state->bdev, - l->block_ref_to->dev_bytenr, - l->block_ref_to->mirror_num); - if (l->block_ref_to->never_written) { - pr_info( -"btrfs: attempt to write superblock which references block %c @%llu (%pg/%llu/%d) which is never written!\n", - btrfsic_get_block_type(state, l->block_ref_to), - l->block_ref_to->logical_bytenr, - l->block_ref_to->dev_state->bdev, - l->block_ref_to->dev_bytenr, - l->block_ref_to->mirror_num); - ret = -1; - } else if (!l->block_ref_to->is_iodone) { - pr_info( -"btrfs: attempt to write superblock which references block %c @%llu (%pg/%llu/%d) which is not yet iodone!\n", - btrfsic_get_block_type(state, l->block_ref_to), - l->block_ref_to->logical_bytenr, - l->block_ref_to->dev_state->bdev, - l->block_ref_to->dev_bytenr, - l->block_ref_to->mirror_num); - ret = -1; - } else if (l->block_ref_to->iodone_w_error) { - pr_info( -"btrfs: attempt to write superblock which references block %c @%llu (%pg/%llu/%d) which has write error!\n", - btrfsic_get_block_type(state, l->block_ref_to), - l->block_ref_to->logical_bytenr, - l->block_ref_to->dev_state->bdev, - l->block_ref_to->dev_bytenr, - l->block_ref_to->mirror_num); - ret = -1; - } else if (l->parent_generation != - l->block_ref_to->generation && - BTRFSIC_GENERATION_UNKNOWN != - l->parent_generation && - BTRFSIC_GENERATION_UNKNOWN != - l->block_ref_to->generation) { - pr_info( -"btrfs: attempt to write superblock which references block %c @%llu (%pg/%llu/%d) with generation %llu != parent generation %llu!\n", - btrfsic_get_block_type(state, l->block_ref_to), - l->block_ref_to->logical_bytenr, - l->block_ref_to->dev_state->bdev, - l->block_ref_to->dev_bytenr, - l->block_ref_to->mirror_num, - l->block_ref_to->generation, - l->parent_generation); - ret = -1; - } else if (l->block_ref_to->flush_gen > - l->block_ref_to->dev_state->last_flush_gen) { - pr_info( -"btrfs: attempt to write superblock which references block %c @%llu (%pg/%llu/%d) which is not flushed out of disk's write cache (block flush_gen=%llu, dev->flush_gen=%llu)!\n", - btrfsic_get_block_type(state, l->block_ref_to), - l->block_ref_to->logical_bytenr, - l->block_ref_to->dev_state->bdev, - l->block_ref_to->dev_bytenr, - l->block_ref_to->mirror_num, block->flush_gen, - l->block_ref_to->dev_state->last_flush_gen); - ret = -1; - } else if (-1 == btrfsic_check_all_ref_blocks(state, - l->block_ref_to, - recursion_level + - 1)) { - ret = -1; - } - } - - return ret; -} - -static int btrfsic_is_block_ref_by_superblock( - const struct btrfsic_state *state, - const struct btrfsic_block *block, - int recursion_level) -{ - const struct btrfsic_block_link *l; - - if (recursion_level >= 3 + BTRFS_MAX_LEVEL) { - /* refer to comment at "abort cyclic linkage (case 1)" */ - if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) - pr_info("btrfsic: abort cyclic linkage (case 2).\n"); - - return 0; - } - - /* - * This algorithm is recursive because the amount of used stack space - * is very small and the max recursion depth is limited. - */ - list_for_each_entry(l, &block->ref_from_list, node_ref_from) { - if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) - pr_info( - "rl=%d, %c @%llu (%pg/%llu/%d) is ref %u* from %c @%llu (%pg/%llu/%d)\n", - recursion_level, - btrfsic_get_block_type(state, block), - block->logical_bytenr, block->dev_state->bdev, - block->dev_bytenr, block->mirror_num, - l->ref_cnt, - btrfsic_get_block_type(state, l->block_ref_from), - l->block_ref_from->logical_bytenr, - l->block_ref_from->dev_state->bdev, - l->block_ref_from->dev_bytenr, - l->block_ref_from->mirror_num); - if (l->block_ref_from->is_superblock && - state->latest_superblock->dev_bytenr == - l->block_ref_from->dev_bytenr && - state->latest_superblock->dev_state->bdev == - l->block_ref_from->dev_state->bdev) - return 1; - else if (btrfsic_is_block_ref_by_superblock(state, - l->block_ref_from, - recursion_level + - 1)) - return 1; - } - - return 0; -} - -static void btrfsic_print_add_link(const struct btrfsic_state *state, - const struct btrfsic_block_link *l) -{ - pr_info("add %u* link from %c @%llu (%pg/%llu/%d) to %c @%llu (%pg/%llu/%d)\n", - l->ref_cnt, - btrfsic_get_block_type(state, l->block_ref_from), - l->block_ref_from->logical_bytenr, - l->block_ref_from->dev_state->bdev, - l->block_ref_from->dev_bytenr, l->block_ref_from->mirror_num, - btrfsic_get_block_type(state, l->block_ref_to), - l->block_ref_to->logical_bytenr, - l->block_ref_to->dev_state->bdev, l->block_ref_to->dev_bytenr, - l->block_ref_to->mirror_num); -} - -static void btrfsic_print_rem_link(const struct btrfsic_state *state, - const struct btrfsic_block_link *l) -{ - pr_info("rem %u* link from %c @%llu (%pg/%llu/%d) to %c @%llu (%pg/%llu/%d)\n", - l->ref_cnt, - btrfsic_get_block_type(state, l->block_ref_from), - l->block_ref_from->logical_bytenr, - l->block_ref_from->dev_state->bdev, - l->block_ref_from->dev_bytenr, l->block_ref_from->mirror_num, - btrfsic_get_block_type(state, l->block_ref_to), - l->block_ref_to->logical_bytenr, - l->block_ref_to->dev_state->bdev, l->block_ref_to->dev_bytenr, - l->block_ref_to->mirror_num); -} - -static char btrfsic_get_block_type(const struct btrfsic_state *state, - const struct btrfsic_block *block) -{ - if (block->is_superblock && - state->latest_superblock->dev_bytenr == block->dev_bytenr && - state->latest_superblock->dev_state->bdev == block->dev_state->bdev) - return 'S'; - else if (block->is_superblock) - return 's'; - else if (block->is_metadata) - return 'M'; - else - return 'D'; -} - -static void btrfsic_dump_tree(const struct btrfsic_state *state) -{ - btrfsic_dump_tree_sub(state, state->latest_superblock, 0); -} - -static void btrfsic_dump_tree_sub(const struct btrfsic_state *state, - const struct btrfsic_block *block, - int indent_level) -{ - const struct btrfsic_block_link *l; - int indent_add; - static char buf[80]; - int cursor_position; - - /* - * Should better fill an on-stack buffer with a complete line and - * dump it at once when it is time to print a newline character. - */ - - /* - * This algorithm is recursive because the amount of used stack space - * is very small and the max recursion depth is limited. - */ - indent_add = sprintf(buf, "%c-%llu(%pg/%llu/%u)", - btrfsic_get_block_type(state, block), - block->logical_bytenr, block->dev_state->bdev, - block->dev_bytenr, block->mirror_num); - if (indent_level + indent_add > BTRFSIC_TREE_DUMP_MAX_INDENT_LEVEL) { - printk("[...]\n"); - return; - } - printk(buf); - indent_level += indent_add; - if (list_empty(&block->ref_to_list)) { - printk("\n"); - return; - } - if (block->mirror_num > 1 && - !(state->print_mask & BTRFSIC_PRINT_MASK_TREE_WITH_ALL_MIRRORS)) { - printk(" [...]\n"); - return; - } - - cursor_position = indent_level; - list_for_each_entry(l, &block->ref_to_list, node_ref_to) { - while (cursor_position < indent_level) { - printk(" "); - cursor_position++; - } - if (l->ref_cnt > 1) - indent_add = sprintf(buf, " %d*--> ", l->ref_cnt); - else - indent_add = sprintf(buf, " --> "); - if (indent_level + indent_add > - BTRFSIC_TREE_DUMP_MAX_INDENT_LEVEL) { - printk("[...]\n"); - cursor_position = 0; - continue; - } - - printk(buf); - - btrfsic_dump_tree_sub(state, l->block_ref_to, - indent_level + indent_add); - cursor_position = 0; - } -} - -static struct btrfsic_block_link *btrfsic_block_link_lookup_or_add( - struct btrfsic_state *state, - struct btrfsic_block_data_ctx *next_block_ctx, - struct btrfsic_block *next_block, - struct btrfsic_block *from_block, - u64 parent_generation) -{ - struct btrfsic_block_link *l; - - l = btrfsic_block_link_hashtable_lookup(next_block_ctx->dev->bdev, - next_block_ctx->dev_bytenr, - from_block->dev_state->bdev, - from_block->dev_bytenr, - &state->block_link_hashtable); - if (NULL == l) { - l = btrfsic_block_link_alloc(); - if (!l) - return NULL; - - l->block_ref_to = next_block; - l->block_ref_from = from_block; - l->ref_cnt = 1; - l->parent_generation = parent_generation; - - if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) - btrfsic_print_add_link(state, l); - - list_add(&l->node_ref_to, &from_block->ref_to_list); - list_add(&l->node_ref_from, &next_block->ref_from_list); - - btrfsic_block_link_hashtable_add(l, - &state->block_link_hashtable); - } else { - l->ref_cnt++; - l->parent_generation = parent_generation; - if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) - btrfsic_print_add_link(state, l); - } - - return l; -} - -static struct btrfsic_block *btrfsic_block_lookup_or_add( - struct btrfsic_state *state, - struct btrfsic_block_data_ctx *block_ctx, - const char *additional_string, - int is_metadata, - int is_iodone, - int never_written, - int mirror_num, - int *was_created) -{ - struct btrfsic_block *block; - - block = btrfsic_block_hashtable_lookup(block_ctx->dev->bdev, - block_ctx->dev_bytenr, - &state->block_hashtable); - if (NULL == block) { - struct btrfsic_dev_state *dev_state; - - block = btrfsic_block_alloc(); - if (!block) - return NULL; - - dev_state = btrfsic_dev_state_lookup(block_ctx->dev->bdev->bd_dev); - if (NULL == dev_state) { - pr_info("btrfsic: error, lookup dev_state failed!\n"); - btrfsic_block_free(block); - return NULL; - } - block->dev_state = dev_state; - block->dev_bytenr = block_ctx->dev_bytenr; - block->logical_bytenr = block_ctx->start; - block->is_metadata = is_metadata; - block->is_iodone = is_iodone; - block->never_written = never_written; - block->mirror_num = mirror_num; - if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) - pr_info("New %s%c-block @%llu (%pg/%llu/%d)\n", - additional_string, - btrfsic_get_block_type(state, block), - block->logical_bytenr, dev_state->bdev, - block->dev_bytenr, mirror_num); - list_add(&block->all_blocks_node, &state->all_blocks_list); - btrfsic_block_hashtable_add(block, &state->block_hashtable); - if (NULL != was_created) - *was_created = 1; - } else { - if (NULL != was_created) - *was_created = 0; - } - - return block; -} - -static void btrfsic_cmp_log_and_dev_bytenr(struct btrfsic_state *state, - u64 bytenr, - struct btrfsic_dev_state *dev_state, - u64 dev_bytenr) -{ - struct btrfs_fs_info *fs_info = state->fs_info; - struct btrfsic_block_data_ctx block_ctx; - int num_copies; - int mirror_num; - int match = 0; - int ret; - - num_copies = btrfs_num_copies(fs_info, bytenr, state->metablock_size); - - for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) { - ret = btrfsic_map_block(state, bytenr, state->metablock_size, - &block_ctx, mirror_num); - if (ret) { - pr_info("btrfsic: btrfsic_map_block(logical @%llu, mirror %d) failed!\n", - bytenr, mirror_num); - continue; - } - - if (dev_state->bdev == block_ctx.dev->bdev && - dev_bytenr == block_ctx.dev_bytenr) { - match++; - btrfsic_release_block_ctx(&block_ctx); - break; - } - btrfsic_release_block_ctx(&block_ctx); - } - - if (WARN_ON(!match)) { - pr_info( -"btrfs: attempt to write M-block which contains logical bytenr that doesn't map to dev+physical bytenr of submit_bio, buffer->log_bytenr=%llu, submit_bio(bdev=%pg, phys_bytenr=%llu)!\n", - bytenr, dev_state->bdev, dev_bytenr); - for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) { - ret = btrfsic_map_block(state, bytenr, - state->metablock_size, - &block_ctx, mirror_num); - if (ret) - continue; - - pr_info("read logical bytenr @%llu maps to (%pg/%llu/%d)\n", - bytenr, block_ctx.dev->bdev, - block_ctx.dev_bytenr, mirror_num); - } - } -} - -static struct btrfsic_dev_state *btrfsic_dev_state_lookup(dev_t dev) -{ - return btrfsic_dev_state_hashtable_lookup(dev, - &btrfsic_dev_state_hashtable); -} - -static void btrfsic_check_write_bio(struct bio *bio, struct btrfsic_dev_state *dev_state) -{ - unsigned int segs = bio_segments(bio); - u64 dev_bytenr = 512 * bio->bi_iter.bi_sector; - u64 cur_bytenr = dev_bytenr; - struct bvec_iter iter; - struct bio_vec bvec; - char **mapped_datav; - int bio_is_patched = 0; - int i = 0; - - if (dev_state->state->print_mask & BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH) - pr_info( -"submit_bio(rw=%d,0x%x, bi_vcnt=%u, bi_sector=%llu (bytenr %llu), bi_bdev=%p)\n", - bio_op(bio), bio->bi_opf, segs, - bio->bi_iter.bi_sector, dev_bytenr, bio->bi_bdev); - - mapped_datav = kmalloc_array(segs, sizeof(*mapped_datav), GFP_NOFS); - if (!mapped_datav) - return; - - bio_for_each_segment(bvec, bio, iter) { - BUG_ON(bvec.bv_len != PAGE_SIZE); - mapped_datav[i] = page_address(bvec.bv_page); - i++; - - if (dev_state->state->print_mask & - BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH_VERBOSE) - pr_info("#%u: bytenr=%llu, len=%u, offset=%u\n", - i, cur_bytenr, bvec.bv_len, bvec.bv_offset); - cur_bytenr += bvec.bv_len; - } - - btrfsic_process_written_block(dev_state, dev_bytenr, mapped_datav, segs, - bio, &bio_is_patched, bio->bi_opf); - kfree(mapped_datav); -} - -static void btrfsic_check_flush_bio(struct bio *bio, struct btrfsic_dev_state *dev_state) -{ - if (dev_state->state->print_mask & BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH) - pr_info("submit_bio(rw=%d,0x%x FLUSH, bdev=%p)\n", - bio_op(bio), bio->bi_opf, bio->bi_bdev); - - if (dev_state->dummy_block_for_bio_bh_flush.is_iodone) { - struct btrfsic_block *const block = - &dev_state->dummy_block_for_bio_bh_flush; - - block->is_iodone = 0; - block->never_written = 0; - block->iodone_w_error = 0; - block->flush_gen = dev_state->last_flush_gen + 1; - block->submit_bio_bh_rw = bio->bi_opf; - block->orig_bio_private = bio->bi_private; - block->orig_bio_end_io = bio->bi_end_io; - block->next_in_same_bio = NULL; - bio->bi_private = block; - bio->bi_end_io = btrfsic_bio_end_io; - } else if ((dev_state->state->print_mask & - (BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH | - BTRFSIC_PRINT_MASK_VERBOSE))) { - pr_info( -"btrfsic_submit_bio(%pg) with FLUSH but dummy block already in use (ignored)!\n", - dev_state->bdev); - } -} - -void btrfsic_check_bio(struct bio *bio) -{ - struct btrfsic_dev_state *dev_state; - - if (!btrfsic_is_initialized) - return; - - /* - * We can be called before btrfsic_mount, so there might not be a - * dev_state. - */ - dev_state = btrfsic_dev_state_lookup(bio->bi_bdev->bd_dev); - mutex_lock(&btrfsic_mutex); - if (dev_state) { - if (bio_op(bio) == REQ_OP_WRITE && bio_has_data(bio)) - btrfsic_check_write_bio(bio, dev_state); - else if (bio->bi_opf & REQ_PREFLUSH) - btrfsic_check_flush_bio(bio, dev_state); - } - mutex_unlock(&btrfsic_mutex); -} - -int btrfsic_mount(struct btrfs_fs_info *fs_info, - struct btrfs_fs_devices *fs_devices, - int including_extent_data, u32 print_mask) -{ - int ret; - struct btrfsic_state *state; - struct list_head *dev_head = &fs_devices->devices; - struct btrfs_device *device; - - if (!PAGE_ALIGNED(fs_info->nodesize)) { - pr_info("btrfsic: cannot handle nodesize %d not being a multiple of PAGE_SIZE %ld!\n", - fs_info->nodesize, PAGE_SIZE); - return -1; - } - if (!PAGE_ALIGNED(fs_info->sectorsize)) { - pr_info("btrfsic: cannot handle sectorsize %d not being a multiple of PAGE_SIZE %ld!\n", - fs_info->sectorsize, PAGE_SIZE); - return -1; - } - state = kvzalloc(sizeof(*state), GFP_KERNEL); - if (!state) - return -ENOMEM; - - if (!btrfsic_is_initialized) { - mutex_init(&btrfsic_mutex); - btrfsic_dev_state_hashtable_init(&btrfsic_dev_state_hashtable); - btrfsic_is_initialized = 1; - } - mutex_lock(&btrfsic_mutex); - state->fs_info = fs_info; - state->print_mask = print_mask; - state->include_extent_data = including_extent_data; - state->metablock_size = fs_info->nodesize; - state->datablock_size = fs_info->sectorsize; - INIT_LIST_HEAD(&state->all_blocks_list); - btrfsic_block_hashtable_init(&state->block_hashtable); - btrfsic_block_link_hashtable_init(&state->block_link_hashtable); - state->max_superblock_generation = 0; - state->latest_superblock = NULL; - - list_for_each_entry(device, dev_head, dev_list) { - struct btrfsic_dev_state *ds; - - if (!device->bdev || !device->name) - continue; - - ds = btrfsic_dev_state_alloc(); - if (NULL == ds) { - mutex_unlock(&btrfsic_mutex); - return -ENOMEM; - } - ds->bdev = device->bdev; - ds->state = state; - btrfsic_dev_state_hashtable_add(ds, - &btrfsic_dev_state_hashtable); - } - - ret = btrfsic_process_superblock(state, fs_devices); - if (0 != ret) { - mutex_unlock(&btrfsic_mutex); - btrfsic_unmount(fs_devices); - return ret; - } - - if (state->print_mask & BTRFSIC_PRINT_MASK_INITIAL_DATABASE) - btrfsic_dump_database(state); - if (state->print_mask & BTRFSIC_PRINT_MASK_INITIAL_TREE) - btrfsic_dump_tree(state); - - mutex_unlock(&btrfsic_mutex); - return 0; -} - -void btrfsic_unmount(struct btrfs_fs_devices *fs_devices) -{ - struct btrfsic_block *b_all, *tmp_all; - struct btrfsic_state *state; - struct list_head *dev_head = &fs_devices->devices; - struct btrfs_device *device; - - if (!btrfsic_is_initialized) - return; - - mutex_lock(&btrfsic_mutex); - - state = NULL; - list_for_each_entry(device, dev_head, dev_list) { - struct btrfsic_dev_state *ds; - - if (!device->bdev || !device->name) - continue; - - ds = btrfsic_dev_state_hashtable_lookup( - device->bdev->bd_dev, - &btrfsic_dev_state_hashtable); - if (NULL != ds) { - state = ds->state; - btrfsic_dev_state_hashtable_remove(ds); - btrfsic_dev_state_free(ds); - } - } - - if (NULL == state) { - pr_info("btrfsic: error, cannot find state information on umount!\n"); - mutex_unlock(&btrfsic_mutex); - return; - } - - /* - * Don't care about keeping the lists' state up to date, - * just free all memory that was allocated dynamically. - * Free the blocks and the block_links. - */ - list_for_each_entry_safe(b_all, tmp_all, &state->all_blocks_list, - all_blocks_node) { - struct btrfsic_block_link *l, *tmp; - - list_for_each_entry_safe(l, tmp, &b_all->ref_to_list, - node_ref_to) { - if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) - btrfsic_print_rem_link(state, l); - - l->ref_cnt--; - if (0 == l->ref_cnt) - btrfsic_block_link_free(l); - } - - if (b_all->is_iodone || b_all->never_written) - btrfsic_block_free(b_all); - else - pr_info( -"btrfs: attempt to free %c-block @%llu (%pg/%llu/%d) on umount which is not yet iodone!\n", - btrfsic_get_block_type(state, b_all), - b_all->logical_bytenr, b_all->dev_state->bdev, - b_all->dev_bytenr, b_all->mirror_num); - } - - mutex_unlock(&btrfsic_mutex); - - kvfree(state); -} diff --git a/fs/btrfs/check-integrity.h b/fs/btrfs/check-integrity.h deleted file mode 100644 index e4c8aed7996f..000000000000 --- a/fs/btrfs/check-integrity.h +++ /dev/null @@ -1,20 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * Copyright (C) STRATO AG 2011. All rights reserved. - */ - -#ifndef BTRFS_CHECK_INTEGRITY_H -#define BTRFS_CHECK_INTEGRITY_H - -#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY -void btrfsic_check_bio(struct bio *bio); -#else -static inline void btrfsic_check_bio(struct bio *bio) { } -#endif - -int btrfsic_mount(struct btrfs_fs_info *fs_info, - struct btrfs_fs_devices *fs_devices, - int including_extent_data, u32 print_mask); -void btrfsic_unmount(struct btrfs_fs_devices *fs_devices); - -#endif diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index 8818ed5c390f..19b22b4653c8 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -193,12 +193,12 @@ static noinline void end_compressed_writeback(const struct compressed_bio *cb) unsigned long index = cb->start >> PAGE_SHIFT; unsigned long end_index = (cb->start + cb->len - 1) >> PAGE_SHIFT; struct folio_batch fbatch; - const int errno = blk_status_to_errno(cb->bbio.bio.bi_status); + const int error = blk_status_to_errno(cb->bbio.bio.bi_status); int i; int ret; - if (errno) - mapping_set_error(inode->i_mapping, errno); + if (error) + mapping_set_error(inode->i_mapping, error); folio_batch_init(&fbatch); while (index <= end_index) { diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index a4cb4b642987..35c1d24d4a78 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -230,9 +230,9 @@ noinline void btrfs_release_path(struct btrfs_path *p) * cause could be a bug, eg. due to ENOSPC, and not for common errors that are * caused by external factors. */ -bool __cold abort_should_print_stack(int errno) +bool __cold abort_should_print_stack(int error) { - switch (errno) { + switch (error) { case -EIO: case -EROFS: case -ENOMEM: @@ -316,6 +316,7 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans, int ret = 0; int level; struct btrfs_disk_key disk_key; + u64 reloc_src_root = 0; WARN_ON(test_bit(BTRFS_ROOT_SHAREABLE, &root->state) && trans->transid != fs_info->running_transaction->transid); @@ -328,9 +329,11 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans, else btrfs_node_key(buf, &disk_key, 0); + if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID) + reloc_src_root = btrfs_header_owner(buf); cow = btrfs_alloc_tree_block(trans, root, 0, new_root_objectid, &disk_key, level, buf->start, 0, - BTRFS_NESTING_NEW_ROOT); + reloc_src_root, BTRFS_NESTING_NEW_ROOT); if (IS_ERR(cow)) return PTR_ERR(cow); @@ -359,7 +362,7 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans, return ret; } - btrfs_mark_buffer_dirty(cow); + btrfs_mark_buffer_dirty(trans, cow); *cow_ret = cow; return 0; } @@ -367,7 +370,8 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans, /* * check if the tree block can be shared by multiple trees */ -int btrfs_block_can_be_shared(struct btrfs_root *root, +int btrfs_block_can_be_shared(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct extent_buffer *buf) { /* @@ -376,11 +380,21 @@ int btrfs_block_can_be_shared(struct btrfs_root *root, * not allocated by tree relocation, we know the block is not shared. */ if (test_bit(BTRFS_ROOT_SHAREABLE, &root->state) && - buf != root->node && buf != root->commit_root && + buf != root->node && (btrfs_header_generation(buf) <= btrfs_root_last_snapshot(&root->root_item) || - btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC))) - return 1; + btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC))) { + if (buf != root->commit_root) + return 1; + /* + * An extent buffer that used to be the commit root may still be + * shared because the tree height may have increased and it + * became a child of a higher level root. This can happen when + * snapshotting a subvolume created in the current transaction. + */ + if (btrfs_header_generation(buf) == trans->transid) + return 1; + } return 0; } @@ -415,10 +429,10 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans, * are only allowed for blocks use full backrefs. */ - if (btrfs_block_can_be_shared(root, buf)) { + if (btrfs_block_can_be_shared(trans, root, buf)) { ret = btrfs_lookup_extent_info(trans, fs_info, buf->start, btrfs_header_level(buf), 1, - &refs, &flags); + &refs, &flags, NULL); if (ret) return ret; if (unlikely(refs == 0)) { @@ -507,13 +521,13 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans, * bytes the allocator should try to find free next to the block it returns. * This is just a hint and may be ignored by the allocator. */ -static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct extent_buffer *buf, - struct extent_buffer *parent, int parent_slot, - struct extent_buffer **cow_ret, - u64 search_start, u64 empty_size, - enum btrfs_lock_nesting nest) +int btrfs_force_cow_block(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct extent_buffer *buf, + struct extent_buffer *parent, int parent_slot, + struct extent_buffer **cow_ret, + u64 search_start, u64 empty_size, + enum btrfs_lock_nesting nest) { struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_disk_key disk_key; @@ -522,6 +536,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, int last_ref = 0; int unlock_orig = 0; u64 parent_start = 0; + u64 reloc_src_root = 0; if (*cow_ret == buf) unlock_orig = 1; @@ -540,12 +555,14 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, else btrfs_node_key(buf, &disk_key, 0); - if ((root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) && parent) - parent_start = parent->start; - + if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) { + if (parent) + parent_start = parent->start; + reloc_src_root = btrfs_header_owner(buf); + } cow = btrfs_alloc_tree_block(trans, root, parent_start, root->root_key.objectid, &disk_key, level, - search_start, empty_size, nest); + search_start, empty_size, reloc_src_root, nest); if (IS_ERR(cow)) return PTR_ERR(cow); @@ -616,7 +633,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, cow->start); btrfs_set_node_ptr_generation(parent, parent_slot, trans->transid); - btrfs_mark_buffer_dirty(parent); + btrfs_mark_buffer_dirty(trans, parent); if (last_ref) { ret = btrfs_tree_mod_log_free_eb(buf); if (ret) { @@ -632,7 +649,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, if (unlock_orig) btrfs_tree_unlock(buf); free_extent_buffer_stale(buf); - btrfs_mark_buffer_dirty(cow); + btrfs_mark_buffer_dirty(trans, cow); *cow_ret = cow; return 0; } @@ -668,11 +685,11 @@ static inline int should_cow_block(struct btrfs_trans_handle *trans, } /* - * cows a single block, see __btrfs_cow_block for the real work. + * COWs a single block, see btrfs_force_cow_block() for the real work. * This version of it has extra checks so that a block isn't COWed more than * once per transaction, as long as it hasn't been written yet */ -noinline int btrfs_cow_block(struct btrfs_trans_handle *trans, +int btrfs_cow_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *buf, struct extent_buffer *parent, int parent_slot, struct extent_buffer **cow_ret, @@ -682,25 +699,37 @@ noinline int btrfs_cow_block(struct btrfs_trans_handle *trans, u64 search_start; int ret; - if (test_bit(BTRFS_ROOT_DELETING, &root->state)) - btrfs_err(fs_info, - "COW'ing blocks on a fs root that's being dropped"); - - if (trans->transaction != fs_info->running_transaction) - WARN(1, KERN_CRIT "trans %llu running %llu\n", - trans->transid, - fs_info->running_transaction->transid); + if (unlikely(test_bit(BTRFS_ROOT_DELETING, &root->state))) { + btrfs_abort_transaction(trans, -EUCLEAN); + btrfs_crit(fs_info, + "attempt to COW block %llu on root %llu that is being deleted", + buf->start, btrfs_root_id(root)); + return -EUCLEAN; + } - if (trans->transid != fs_info->generation) - WARN(1, KERN_CRIT "trans %llu running %llu\n", - trans->transid, fs_info->generation); + /* + * COWing must happen through a running transaction, which always + * matches the current fs generation (it's a transaction with a state + * less than TRANS_STATE_UNBLOCKED). If it doesn't, then turn the fs + * into error state to prevent the commit of any transaction. + */ + if (unlikely(trans->transaction != fs_info->running_transaction || + trans->transid != fs_info->generation)) { + btrfs_abort_transaction(trans, -EUCLEAN); + btrfs_crit(fs_info, +"unexpected transaction when attempting to COW block %llu on root %llu, transaction %llu running transaction %llu fs generation %llu", + buf->start, btrfs_root_id(root), trans->transid, + fs_info->running_transaction->transid, + fs_info->generation); + return -EUCLEAN; + } if (!should_cow_block(trans, root, buf)) { *cow_ret = buf; return 0; } - search_start = buf->start & ~((u64)SZ_1G - 1); + search_start = round_down(buf->start, SZ_1G); /* * Before CoWing this block for later modification, check if it's @@ -709,8 +738,8 @@ noinline int btrfs_cow_block(struct btrfs_trans_handle *trans, * Also We don't care about the error, as it's handled internally. */ btrfs_qgroup_trace_subtree_after_cow(trans, root, buf); - ret = __btrfs_cow_block(trans, root, buf, parent, - parent_slot, cow_ret, search_start, 0, nest); + ret = btrfs_force_cow_block(trans, root, buf, parent, parent_slot, + cow_ret, search_start, 0, nest); trace_btrfs_cow_block(root, buf, *cow_ret); @@ -719,49 +748,6 @@ noinline int btrfs_cow_block(struct btrfs_trans_handle *trans, ALLOW_ERROR_INJECTION(btrfs_cow_block, ERRNO); /* - * helper function for defrag to decide if two blocks pointed to by a - * node are actually close by - */ -static int close_blocks(u64 blocknr, u64 other, u32 blocksize) -{ - if (blocknr < other && other - (blocknr + blocksize) < 32768) - return 1; - if (blocknr > other && blocknr - (other + blocksize) < 32768) - return 1; - return 0; -} - -#ifdef __LITTLE_ENDIAN - -/* - * Compare two keys, on little-endian the disk order is same as CPU order and - * we can avoid the conversion. - */ -static int comp_keys(const struct btrfs_disk_key *disk_key, - const struct btrfs_key *k2) -{ - const struct btrfs_key *k1 = (const struct btrfs_key *)disk_key; - - return btrfs_comp_cpu_keys(k1, k2); -} - -#else - -/* - * compare two keys in a memcmp fashion - */ -static int comp_keys(const struct btrfs_disk_key *disk, - const struct btrfs_key *k2) -{ - struct btrfs_key k1; - - btrfs_disk_key_to_cpu(&k1, disk); - - return btrfs_comp_cpu_keys(&k1, k2); -} -#endif - -/* * same as comp_keys only with two btrfs_key's */ int __pure btrfs_comp_cpu_keys(const struct btrfs_key *k1, const struct btrfs_key *k2) @@ -782,91 +768,6 @@ int __pure btrfs_comp_cpu_keys(const struct btrfs_key *k1, const struct btrfs_ke } /* - * this is used by the defrag code to go through all the - * leaves pointed to by a node and reallocate them so that - * disk order is close to key order - */ -int btrfs_realloc_node(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct extent_buffer *parent, - int start_slot, u64 *last_ret, - struct btrfs_key *progress) -{ - struct btrfs_fs_info *fs_info = root->fs_info; - struct extent_buffer *cur; - u64 blocknr; - u64 search_start = *last_ret; - u64 last_block = 0; - u64 other; - u32 parent_nritems; - int end_slot; - int i; - int err = 0; - u32 blocksize; - int progress_passed = 0; - struct btrfs_disk_key disk_key; - - WARN_ON(trans->transaction != fs_info->running_transaction); - WARN_ON(trans->transid != fs_info->generation); - - parent_nritems = btrfs_header_nritems(parent); - blocksize = fs_info->nodesize; - end_slot = parent_nritems - 1; - - if (parent_nritems <= 1) - return 0; - - for (i = start_slot; i <= end_slot; i++) { - int close = 1; - - btrfs_node_key(parent, &disk_key, i); - if (!progress_passed && comp_keys(&disk_key, progress) < 0) - continue; - - progress_passed = 1; - blocknr = btrfs_node_blockptr(parent, i); - if (last_block == 0) - last_block = blocknr; - - if (i > 0) { - other = btrfs_node_blockptr(parent, i - 1); - close = close_blocks(blocknr, other, blocksize); - } - if (!close && i < end_slot) { - other = btrfs_node_blockptr(parent, i + 1); - close = close_blocks(blocknr, other, blocksize); - } - if (close) { - last_block = blocknr; - continue; - } - - cur = btrfs_read_node_slot(parent, i); - if (IS_ERR(cur)) - return PTR_ERR(cur); - if (search_start == 0) - search_start = last_block; - - btrfs_tree_lock(cur); - err = __btrfs_cow_block(trans, root, cur, parent, i, - &cur, search_start, - min(16 * blocksize, - (end_slot - i) * blocksize), - BTRFS_NESTING_COW); - if (err) { - btrfs_tree_unlock(cur); - free_extent_buffer(cur); - break; - } - search_start = cur->start; - last_block = cur->start; - *last_ret = search_start; - btrfs_tree_unlock(cur); - free_extent_buffer(cur); - } - return err; -} - -/* * Search for a key in the given extent_buffer. * * The lower boundary for the search is specified by the slot number @first_slot. @@ -932,7 +833,7 @@ int btrfs_bin_search(struct extent_buffer *eb, int first_slot, tmp = &unaligned; } - ret = comp_keys(tmp, key); + ret = btrfs_comp_keys(tmp, key); if (ret < 0) low = mid + 1; @@ -947,19 +848,19 @@ int btrfs_bin_search(struct extent_buffer *eb, int first_slot, return 1; } -static void root_add_used(struct btrfs_root *root, u32 size) +static void root_add_used_bytes(struct btrfs_root *root) { spin_lock(&root->accounting_lock); btrfs_set_root_used(&root->root_item, - btrfs_root_used(&root->root_item) + size); + btrfs_root_used(&root->root_item) + root->fs_info->nodesize); spin_unlock(&root->accounting_lock); } -static void root_sub_used(struct btrfs_root *root, u32 size) +static void root_sub_used_bytes(struct btrfs_root *root) { spin_lock(&root->accounting_lock); btrfs_set_root_used(&root->root_item, - btrfs_root_used(&root->root_item) - size); + btrfs_root_used(&root->root_item) - root->fs_info->nodesize); spin_unlock(&root->accounting_lock); } @@ -1075,7 +976,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, /* once for the path */ free_extent_buffer(mid); - root_sub_used(root, mid->len); + root_sub_used_bytes(root); btrfs_free_tree_block(trans, btrfs_root_id(root), mid, 0, 1); /* once for the root ptr */ free_extent_buffer_stale(mid); @@ -1145,7 +1046,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, right = NULL; goto out; } - root_sub_used(root, right->len); + root_sub_used_bytes(root); btrfs_free_tree_block(trans, btrfs_root_id(root), right, 0, 1); free_extent_buffer_stale(right); @@ -1160,7 +1061,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, goto out; } btrfs_set_node_key(parent, &right_key, pslot + 1); - btrfs_mark_buffer_dirty(parent); + btrfs_mark_buffer_dirty(trans, parent); } } if (btrfs_header_nritems(mid) == 1) { @@ -1203,7 +1104,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, mid = NULL; goto out; } - root_sub_used(root, mid->len); + root_sub_used_bytes(root); btrfs_free_tree_block(trans, btrfs_root_id(root), mid, 0, 1); free_extent_buffer_stale(mid); mid = NULL; @@ -1218,7 +1119,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, goto out; } btrfs_set_node_key(parent, &mid_key, pslot); - btrfs_mark_buffer_dirty(parent); + btrfs_mark_buffer_dirty(trans, parent); } /* update the path */ @@ -1325,7 +1226,7 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans, return ret; } btrfs_set_node_key(parent, &disk_key, pslot); - btrfs_mark_buffer_dirty(parent); + btrfs_mark_buffer_dirty(trans, parent); if (btrfs_header_nritems(left) > orig_slot) { path->nodes[level] = left; path->slots[level + 1] -= 1; @@ -1385,7 +1286,7 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans, return ret; } btrfs_set_node_key(parent, &disk_key, pslot + 1); - btrfs_mark_buffer_dirty(parent); + btrfs_mark_buffer_dirty(trans, parent); if (btrfs_header_nritems(mid) <= orig_slot) { path->nodes[level] = right; @@ -1969,7 +1870,7 @@ static int search_leaf(struct btrfs_trans_handle *trans, * the extent buffer's header and we have recently accessed * the header's level field. */ - ret = comp_keys(&first_key, key); + ret = btrfs_comp_keys(&first_key, key); if (ret < 0) { /* * The first key is smaller than the key we want @@ -2054,8 +1955,8 @@ static int search_leaf(struct btrfs_trans_handle *trans, } /* - * btrfs_search_slot - look for a key in a tree and perform necessary - * modifications to preserve tree invariants. + * Look for a key in a tree and perform necessary modifications to preserve + * tree invariants. * * @trans: Handle of transaction, used when modifying the tree * @p: Holds all btree nodes along the search path @@ -2478,7 +2379,7 @@ static int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path) */ if (path->slots[0] < btrfs_header_nritems(path->nodes[0])) { btrfs_item_key(path->nodes[0], &found_key, path->slots[0]); - ret = comp_keys(&found_key, &orig_key); + ret = btrfs_comp_keys(&found_key, &orig_key); if (ret == 0) { if (path->slots[0] > 0) { path->slots[0]--; @@ -2493,7 +2394,7 @@ static int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path) } btrfs_item_key(path->nodes[0], &found_key, 0); - ret = comp_keys(&found_key, &key); + ret = btrfs_comp_keys(&found_key, &key); /* * We might have had an item with the previous key in the tree right * before we released our path. And after we released our path, that @@ -2641,7 +2542,8 @@ int btrfs_get_next_valid_item(struct btrfs_root *root, struct btrfs_key *key, * higher levels * */ -static void fixup_low_keys(struct btrfs_path *path, +static void fixup_low_keys(struct btrfs_trans_handle *trans, + struct btrfs_path *path, struct btrfs_disk_key *key, int level) { int i; @@ -2658,7 +2560,7 @@ static void fixup_low_keys(struct btrfs_path *path, BTRFS_MOD_LOG_KEY_REPLACE); BUG_ON(ret < 0); btrfs_set_node_key(t, key, tslot); - btrfs_mark_buffer_dirty(path->nodes[i]); + btrfs_mark_buffer_dirty(trans, path->nodes[i]); if (tslot != 0) break; } @@ -2670,10 +2572,11 @@ static void fixup_low_keys(struct btrfs_path *path, * This function isn't completely safe. It's the caller's responsibility * that the new key won't break the order */ -void btrfs_set_item_key_safe(struct btrfs_fs_info *fs_info, +void btrfs_set_item_key_safe(struct btrfs_trans_handle *trans, struct btrfs_path *path, const struct btrfs_key *new_key) { + struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_disk_key disk_key; struct extent_buffer *eb; int slot; @@ -2682,7 +2585,7 @@ void btrfs_set_item_key_safe(struct btrfs_fs_info *fs_info, slot = path->slots[0]; if (slot > 0) { btrfs_item_key(eb, &disk_key, slot - 1); - if (unlikely(comp_keys(&disk_key, new_key) >= 0)) { + if (unlikely(btrfs_comp_keys(&disk_key, new_key) >= 0)) { btrfs_print_leaf(eb); btrfs_crit(fs_info, "slot %u key (%llu %u %llu) new key (%llu %u %llu)", @@ -2696,7 +2599,7 @@ void btrfs_set_item_key_safe(struct btrfs_fs_info *fs_info, } if (slot < btrfs_header_nritems(eb) - 1) { btrfs_item_key(eb, &disk_key, slot + 1); - if (unlikely(comp_keys(&disk_key, new_key) <= 0)) { + if (unlikely(btrfs_comp_keys(&disk_key, new_key) <= 0)) { btrfs_print_leaf(eb); btrfs_crit(fs_info, "slot %u key (%llu %u %llu) new key (%llu %u %llu)", @@ -2711,9 +2614,9 @@ void btrfs_set_item_key_safe(struct btrfs_fs_info *fs_info, btrfs_cpu_key_to_disk(&disk_key, new_key); btrfs_set_item_key(eb, &disk_key, slot); - btrfs_mark_buffer_dirty(eb); + btrfs_mark_buffer_dirty(trans, eb); if (slot == 0) - fixup_low_keys(path, &disk_key, 1); + fixup_low_keys(trans, path, &disk_key, 1); } /* @@ -2844,8 +2747,8 @@ static int push_node_left(struct btrfs_trans_handle *trans, } btrfs_set_header_nritems(src, src_nritems - push_items); btrfs_set_header_nritems(dst, dst_nritems + push_items); - btrfs_mark_buffer_dirty(src); - btrfs_mark_buffer_dirty(dst); + btrfs_mark_buffer_dirty(trans, src); + btrfs_mark_buffer_dirty(trans, dst); return ret; } @@ -2920,8 +2823,8 @@ static int balance_node_right(struct btrfs_trans_handle *trans, btrfs_set_header_nritems(src, src_nritems - push_items); btrfs_set_header_nritems(dst, dst_nritems + push_items); - btrfs_mark_buffer_dirty(src); - btrfs_mark_buffer_dirty(dst); + btrfs_mark_buffer_dirty(trans, src); + btrfs_mark_buffer_dirty(trans, dst); return ret; } @@ -2937,7 +2840,6 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, int level) { - struct btrfs_fs_info *fs_info = root->fs_info; u64 lower_gen; struct extent_buffer *lower; struct extent_buffer *c; @@ -2956,11 +2858,11 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans, c = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid, &lower_key, level, root->node->start, 0, - BTRFS_NESTING_NEW_ROOT); + 0, BTRFS_NESTING_NEW_ROOT); if (IS_ERR(c)) return PTR_ERR(c); - root_add_used(root, fs_info->nodesize); + root_add_used_bytes(root); btrfs_set_header_nritems(c, 1); btrfs_set_node_key(c, &lower_key, 0); @@ -2970,7 +2872,7 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans, btrfs_set_node_ptr_generation(c, 0, lower_gen); - btrfs_mark_buffer_dirty(c); + btrfs_mark_buffer_dirty(trans, c); old = root->node; ret = btrfs_tree_mod_log_insert_root(root->node, c, false); @@ -3042,7 +2944,7 @@ static int insert_ptr(struct btrfs_trans_handle *trans, WARN_ON(trans->transid == 0); btrfs_set_node_ptr_generation(lower, slot, trans->transid); btrfs_set_header_nritems(lower, nritems + 1); - btrfs_mark_buffer_dirty(lower); + btrfs_mark_buffer_dirty(trans, lower); return 0; } @@ -3100,11 +3002,11 @@ static noinline int split_node(struct btrfs_trans_handle *trans, split = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid, &disk_key, level, c->start, 0, - BTRFS_NESTING_SPLIT); + 0, BTRFS_NESTING_SPLIT); if (IS_ERR(split)) return PTR_ERR(split); - root_add_used(root, fs_info->nodesize); + root_add_used_bytes(root); ASSERT(btrfs_header_level(c) == level); ret = btrfs_tree_mod_log_eb_copy(split, c, 0, mid, c_nritems - mid); @@ -3121,8 +3023,8 @@ static noinline int split_node(struct btrfs_trans_handle *trans, btrfs_set_header_nritems(split, c_nritems - mid); btrfs_set_header_nritems(c, mid); - btrfs_mark_buffer_dirty(c); - btrfs_mark_buffer_dirty(split); + btrfs_mark_buffer_dirty(trans, c); + btrfs_mark_buffer_dirty(trans, split); ret = insert_ptr(trans, path, &disk_key, split->start, path->slots[level + 1] + 1, level + 1); @@ -3288,15 +3190,15 @@ static noinline int __push_leaf_right(struct btrfs_trans_handle *trans, btrfs_set_header_nritems(left, left_nritems); if (left_nritems) - btrfs_mark_buffer_dirty(left); + btrfs_mark_buffer_dirty(trans, left); else btrfs_clear_buffer_dirty(trans, left); - btrfs_mark_buffer_dirty(right); + btrfs_mark_buffer_dirty(trans, right); btrfs_item_key(right, &disk_key, 0); btrfs_set_node_key(upper, &disk_key, slot + 1); - btrfs_mark_buffer_dirty(upper); + btrfs_mark_buffer_dirty(trans, upper); /* then fixup the leaf pointer in the path */ if (path->slots[0] >= left_nritems) { @@ -3508,14 +3410,14 @@ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans, btrfs_set_token_item_offset(&token, i, push_space); } - btrfs_mark_buffer_dirty(left); + btrfs_mark_buffer_dirty(trans, left); if (right_nritems) - btrfs_mark_buffer_dirty(right); + btrfs_mark_buffer_dirty(trans, right); else btrfs_clear_buffer_dirty(trans, right); btrfs_item_key(right, &disk_key, 0); - fixup_low_keys(path, &disk_key, 1); + fixup_low_keys(trans, path, &disk_key, 1); /* then fixup the leaf pointer in the path */ if (path->slots[0] < push_items) { @@ -3646,8 +3548,8 @@ static noinline int copy_for_split(struct btrfs_trans_handle *trans, if (ret < 0) return ret; - btrfs_mark_buffer_dirty(right); - btrfs_mark_buffer_dirty(l); + btrfs_mark_buffer_dirty(trans, right); + btrfs_mark_buffer_dirty(trans, l); BUG_ON(path->slots[0] != slot); if (mid <= slot) { @@ -3851,13 +3753,13 @@ again: * use BTRFS_NESTING_NEW_ROOT. */ right = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid, - &disk_key, 0, l->start, 0, + &disk_key, 0, l->start, 0, 0, num_doubles ? BTRFS_NESTING_NEW_ROOT : BTRFS_NESTING_SPLIT); if (IS_ERR(right)) return PTR_ERR(right); - root_add_used(root, fs_info->nodesize); + root_add_used_bytes(root); if (split == 0) { if (mid <= slot) { @@ -3888,7 +3790,7 @@ again: path->nodes[0] = right; path->slots[0] = 0; if (path->slots[1] == 0) - fixup_low_keys(path, &disk_key, 1); + fixup_low_keys(trans, path, &disk_key, 1); } /* * We create a new leaf 'right' for the required ins_len and @@ -3987,7 +3889,8 @@ err: return ret; } -static noinline int split_item(struct btrfs_path *path, +static noinline int split_item(struct btrfs_trans_handle *trans, + struct btrfs_path *path, const struct btrfs_key *new_key, unsigned long split_offset) { @@ -4046,7 +3949,7 @@ static noinline int split_item(struct btrfs_path *path, write_extent_buffer(leaf, buf + split_offset, btrfs_item_ptr_offset(leaf, slot), item_size - split_offset); - btrfs_mark_buffer_dirty(leaf); + btrfs_mark_buffer_dirty(trans, leaf); BUG_ON(btrfs_leaf_free_space(leaf) < 0); kfree(buf); @@ -4080,7 +3983,7 @@ int btrfs_split_item(struct btrfs_trans_handle *trans, if (ret) return ret; - ret = split_item(path, new_key, split_offset); + ret = split_item(trans, path, new_key, split_offset); return ret; } @@ -4090,7 +3993,8 @@ int btrfs_split_item(struct btrfs_trans_handle *trans, * off the end of the item or if we shift the item to chop bytes off * the front. */ -void btrfs_truncate_item(struct btrfs_path *path, u32 new_size, int from_end) +void btrfs_truncate_item(struct btrfs_trans_handle *trans, + struct btrfs_path *path, u32 new_size, int from_end) { int slot; struct extent_buffer *leaf; @@ -4166,11 +4070,11 @@ void btrfs_truncate_item(struct btrfs_path *path, u32 new_size, int from_end) btrfs_set_disk_key_offset(&disk_key, offset + size_diff); btrfs_set_item_key(leaf, &disk_key, slot); if (slot == 0) - fixup_low_keys(path, &disk_key, 1); + fixup_low_keys(trans, path, &disk_key, 1); } btrfs_set_item_size(leaf, slot, new_size); - btrfs_mark_buffer_dirty(leaf); + btrfs_mark_buffer_dirty(trans, leaf); if (btrfs_leaf_free_space(leaf) < 0) { btrfs_print_leaf(leaf); @@ -4181,7 +4085,8 @@ void btrfs_truncate_item(struct btrfs_path *path, u32 new_size, int from_end) /* * make the item pointed to by the path bigger, data_size is the added size. */ -void btrfs_extend_item(struct btrfs_path *path, u32 data_size) +void btrfs_extend_item(struct btrfs_trans_handle *trans, + struct btrfs_path *path, u32 data_size) { int slot; struct extent_buffer *leaf; @@ -4231,7 +4136,7 @@ void btrfs_extend_item(struct btrfs_path *path, u32 data_size) data_end = old_data; old_size = btrfs_item_size(leaf, slot); btrfs_set_item_size(leaf, slot, old_size + data_size); - btrfs_mark_buffer_dirty(leaf); + btrfs_mark_buffer_dirty(trans, leaf); if (btrfs_leaf_free_space(leaf) < 0) { btrfs_print_leaf(leaf); @@ -4242,6 +4147,7 @@ void btrfs_extend_item(struct btrfs_path *path, u32 data_size) /* * Make space in the node before inserting one or more items. * + * @trans: transaction handle * @root: root we are inserting items to * @path: points to the leaf/slot where we are going to insert new items * @batch: information about the batch of items to insert @@ -4249,7 +4155,8 @@ void btrfs_extend_item(struct btrfs_path *path, u32 data_size) * Main purpose is to save stack depth by doing the bulk of the work in a * function that doesn't call btrfs_search_slot */ -static void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path, +static void setup_items_for_insert(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct btrfs_path *path, const struct btrfs_item_batch *batch) { struct btrfs_fs_info *fs_info = root->fs_info; @@ -4269,7 +4176,7 @@ static void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *p */ if (path->slots[0] == 0) { btrfs_cpu_key_to_disk(&disk_key, &batch->keys[0]); - fixup_low_keys(path, &disk_key, 1); + fixup_low_keys(trans, path, &disk_key, 1); } btrfs_unlock_up_safe(path, 1); @@ -4328,7 +4235,7 @@ static void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *p } btrfs_set_header_nritems(leaf, nritems + batch->nr); - btrfs_mark_buffer_dirty(leaf); + btrfs_mark_buffer_dirty(trans, leaf); if (btrfs_leaf_free_space(leaf) < 0) { btrfs_print_leaf(leaf); @@ -4339,12 +4246,14 @@ static void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *p /* * Insert a new item into a leaf. * + * @trans: Transaction handle. * @root: The root of the btree. * @path: A path pointing to the target leaf and slot. * @key: The key of the new item. * @data_size: The size of the data associated with the new key. */ -void btrfs_setup_item_for_insert(struct btrfs_root *root, +void btrfs_setup_item_for_insert(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct btrfs_path *path, const struct btrfs_key *key, u32 data_size) @@ -4356,7 +4265,7 @@ void btrfs_setup_item_for_insert(struct btrfs_root *root, batch.total_data_size = data_size; batch.nr = 1; - setup_items_for_insert(root, path, &batch); + setup_items_for_insert(trans, root, path, &batch); } /* @@ -4382,7 +4291,7 @@ int btrfs_insert_empty_items(struct btrfs_trans_handle *trans, slot = path->slots[0]; BUG_ON(slot < 0); - setup_items_for_insert(root, path, batch); + setup_items_for_insert(trans, root, path, batch); return 0; } @@ -4407,7 +4316,7 @@ int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, leaf = path->nodes[0]; ptr = btrfs_item_ptr_offset(leaf, path->slots[0]); write_extent_buffer(leaf, data, ptr, data_size); - btrfs_mark_buffer_dirty(leaf); + btrfs_mark_buffer_dirty(trans, leaf); } btrfs_free_path(path); return ret; @@ -4438,7 +4347,7 @@ int btrfs_duplicate_item(struct btrfs_trans_handle *trans, return ret; path->slots[0]++; - btrfs_setup_item_for_insert(root, path, new_key, item_size); + btrfs_setup_item_for_insert(trans, root, path, new_key, item_size); leaf = path->nodes[0]; memcpy_extent_buffer(leaf, btrfs_item_ptr_offset(leaf, path->slots[0]), @@ -4496,9 +4405,9 @@ int btrfs_del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_disk_key disk_key; btrfs_node_key(parent, &disk_key, 0); - fixup_low_keys(path, &disk_key, level + 1); + fixup_low_keys(trans, path, &disk_key, level + 1); } - btrfs_mark_buffer_dirty(parent); + btrfs_mark_buffer_dirty(trans, parent); return 0; } @@ -4530,7 +4439,7 @@ static noinline int btrfs_del_leaf(struct btrfs_trans_handle *trans, */ btrfs_unlock_up_safe(path, 0); - root_sub_used(root, leaf->len); + root_sub_used_bytes(root); atomic_inc(&leaf->refs); btrfs_free_tree_block(trans, btrfs_root_id(root), leaf, 0, 1); @@ -4595,7 +4504,7 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_disk_key disk_key; btrfs_item_key(leaf, &disk_key, 0); - fixup_low_keys(path, &disk_key, 1); + fixup_low_keys(trans, path, &disk_key, 1); } /* @@ -4660,11 +4569,11 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, * dirtied this buffer */ if (path->nodes[0] == leaf) - btrfs_mark_buffer_dirty(leaf); + btrfs_mark_buffer_dirty(trans, leaf); free_extent_buffer(leaf); } } else { - btrfs_mark_buffer_dirty(leaf); + btrfs_mark_buffer_dirty(trans, leaf); } } return ret; diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 9419f4e37a58..196c005c31f6 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -6,37 +6,10 @@ #ifndef BTRFS_CTREE_H #define BTRFS_CTREE_H -#include <linux/mm.h> -#include <linux/sched/signal.h> -#include <linux/highmem.h> -#include <linux/fs.h> -#include <linux/rwsem.h> -#include <linux/semaphore.h> -#include <linux/completion.h> -#include <linux/backing-dev.h> -#include <linux/wait.h> -#include <linux/slab.h> -#include <trace/events/btrfs.h> -#include <asm/unaligned.h> #include <linux/pagemap.h> -#include <linux/btrfs.h> -#include <linux/btrfs_tree.h> -#include <linux/workqueue.h> -#include <linux/security.h> -#include <linux/sizes.h> -#include <linux/dynamic_debug.h> -#include <linux/refcount.h> -#include <linux/crc32c.h> -#include <linux/iomap.h> -#include <linux/fscrypt.h> -#include "extent-io-tree.h" -#include "extent_io.h" -#include "extent_map.h" -#include "async-thread.h" -#include "block-rsv.h" #include "locking.h" -#include "misc.h" #include "fs.h" +#include "accessors.h" struct btrfs_trans_handle; struct btrfs_transaction; @@ -218,10 +191,22 @@ struct btrfs_root { atomic_t log_commit[2]; /* Used only for log trees of subvolumes, not for the log root tree */ atomic_t log_batch; + /* + * Protected by the 'log_mutex' lock but can be read without holding + * that lock to avoid unnecessary lock contention, in which case it + * should be read using btrfs_get_root_log_transid() except if it's a + * log tree in which case it can be directly accessed. Updates to this + * field should always use btrfs_set_root_log_transid(), except for log + * trees where the field can be updated directly. + */ int log_transid; /* No matter the commit succeeds or not*/ int log_transid_committed; - /* Just be updated when the commit succeeds. */ + /* + * Just be updated when the commit succeeds. Use + * btrfs_get_root_last_log_commit() and btrfs_set_root_last_log_commit() + * to access this field. + */ int last_log_commit; pid_t log_start_pid; @@ -326,6 +311,9 @@ struct btrfs_root { /* Used only by log trees, when logging csum items */ struct extent_io_tree log_csum_range; + /* Used in simple quotas, track root during relocation. */ + u64 relocation_src_root; + #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS u64 alloc_bytenr; #endif @@ -352,6 +340,26 @@ static inline u64 btrfs_root_id(const struct btrfs_root *root) return root->root_key.objectid; } +static inline int btrfs_get_root_log_transid(const struct btrfs_root *root) +{ + return READ_ONCE(root->log_transid); +} + +static inline void btrfs_set_root_log_transid(struct btrfs_root *root, int log_transid) +{ + WRITE_ONCE(root->log_transid, log_transid); +} + +static inline int btrfs_get_root_last_log_commit(const struct btrfs_root *root) +{ + return READ_ONCE(root->last_log_commit); +} + +static inline void btrfs_set_root_last_log_commit(struct btrfs_root *root, int commit_id) +{ + WRITE_ONCE(root->last_log_commit, commit_id); +} + /* * Structure that conveys information about an extent that is going to replace * all the extents in a file range. @@ -470,30 +478,6 @@ static inline u32 BTRFS_MAX_XATTR_SIZE(const struct btrfs_fs_info *info) #define BTRFS_BYTES_TO_BLKS(fs_info, bytes) \ ((bytes) >> (fs_info)->sectorsize_bits) -static inline u32 btrfs_crc32c(u32 crc, const void *address, unsigned length) -{ - return crc32c(crc, address, length); -} - -static inline void btrfs_crc32c_final(u32 crc, u8 *result) -{ - put_unaligned_le32(~crc, result); -} - -static inline u64 btrfs_name_hash(const char *name, int len) -{ - return crc32c((u32)~1, name, len); -} - -/* - * Figure the key offset of an extended inode ref - */ -static inline u64 btrfs_extref_hash(u64 parent_objectid, const char *name, - int len) -{ - return (u64) crc32c(parent_objectid, name, len); -} - static inline gfp_t btrfs_alloc_write_mask(struct address_space *mapping) { return mapping_gfp_constraint(mapping, ~__GFP_FS); @@ -513,12 +497,42 @@ int btrfs_bin_search(struct extent_buffer *eb, int first_slot, const struct btrfs_key *key, int *slot); int __pure btrfs_comp_cpu_keys(const struct btrfs_key *k1, const struct btrfs_key *k2); + +#ifdef __LITTLE_ENDIAN + +/* + * Compare two keys, on little-endian the disk order is same as CPU order and + * we can avoid the conversion. + */ +static inline int btrfs_comp_keys(const struct btrfs_disk_key *disk_key, + const struct btrfs_key *k2) +{ + const struct btrfs_key *k1 = (const struct btrfs_key *)disk_key; + + return btrfs_comp_cpu_keys(k1, k2); +} + +#else + +/* Compare two keys in a memcmp fashion. */ +static inline int btrfs_comp_keys(const struct btrfs_disk_key *disk, + const struct btrfs_key *k2) +{ + struct btrfs_key k1; + + btrfs_disk_key_to_cpu(&k1, disk); + + return btrfs_comp_cpu_keys(&k1, k2); +} + +#endif + int btrfs_previous_item(struct btrfs_root *root, struct btrfs_path *path, u64 min_objectid, int type); int btrfs_previous_extent_item(struct btrfs_root *root, struct btrfs_path *path, u64 min_objectid); -void btrfs_set_item_key_safe(struct btrfs_fs_info *fs_info, +void btrfs_set_item_key_safe(struct btrfs_trans_handle *trans, struct btrfs_path *path, const struct btrfs_key *new_key); struct extent_buffer *btrfs_root_node(struct btrfs_root *root); @@ -536,16 +550,26 @@ int btrfs_cow_block(struct btrfs_trans_handle *trans, struct extent_buffer *parent, int parent_slot, struct extent_buffer **cow_ret, enum btrfs_lock_nesting nest); +int btrfs_force_cow_block(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct extent_buffer *buf, + struct extent_buffer *parent, int parent_slot, + struct extent_buffer **cow_ret, + u64 search_start, u64 empty_size, + enum btrfs_lock_nesting nest); int btrfs_copy_root(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *buf, struct extent_buffer **cow_ret, u64 new_root_objectid); -int btrfs_block_can_be_shared(struct btrfs_root *root, +int btrfs_block_can_be_shared(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct extent_buffer *buf); int btrfs_del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, int level, int slot); -void btrfs_extend_item(struct btrfs_path *path, u32 data_size); -void btrfs_truncate_item(struct btrfs_path *path, u32 new_size, int from_end); +void btrfs_extend_item(struct btrfs_trans_handle *trans, + struct btrfs_path *path, u32 data_size); +void btrfs_truncate_item(struct btrfs_trans_handle *trans, + struct btrfs_path *path, u32 new_size, int from_end); int btrfs_split_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, @@ -566,10 +590,6 @@ int btrfs_search_slot_for_read(struct btrfs_root *root, const struct btrfs_key *key, struct btrfs_path *p, int find_higher, int return_any); -int btrfs_realloc_node(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct extent_buffer *parent, - int start_slot, u64 *last_ret, - struct btrfs_key *progress); void btrfs_release_path(struct btrfs_path *p); struct btrfs_path *btrfs_alloc_path(void); void btrfs_free_path(struct btrfs_path *p); @@ -609,7 +629,8 @@ struct btrfs_item_batch { int nr; }; -void btrfs_setup_item_for_insert(struct btrfs_root *root, +void btrfs_setup_item_for_insert(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct btrfs_path *path, const struct btrfs_key *key, u32 data_size); diff --git a/fs/btrfs/defrag.c b/fs/btrfs/defrag.c index f2ff4cbe8656..5244561e2016 100644 --- a/fs/btrfs/defrag.c +++ b/fs/btrfs/defrag.c @@ -338,13 +338,118 @@ int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info) } /* + * Check if two blocks addresses are close, used by defrag. + */ +static bool close_blocks(u64 blocknr, u64 other, u32 blocksize) +{ + if (blocknr < other && other - (blocknr + blocksize) < SZ_32K) + return true; + if (blocknr > other && blocknr - (other + blocksize) < SZ_32K) + return true; + return false; +} + +/* + * Go through all the leaves pointed to by a node and reallocate them so that + * disk order is close to key order. + */ +static int btrfs_realloc_node(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct extent_buffer *parent, + int start_slot, u64 *last_ret, + struct btrfs_key *progress) +{ + struct btrfs_fs_info *fs_info = root->fs_info; + const u32 blocksize = fs_info->nodesize; + const int end_slot = btrfs_header_nritems(parent) - 1; + u64 search_start = *last_ret; + u64 last_block = 0; + int ret = 0; + bool progress_passed = false; + + /* + * COWing must happen through a running transaction, which always + * matches the current fs generation (it's a transaction with a state + * less than TRANS_STATE_UNBLOCKED). If it doesn't, then turn the fs + * into error state to prevent the commit of any transaction. + */ + if (unlikely(trans->transaction != fs_info->running_transaction || + trans->transid != fs_info->generation)) { + btrfs_abort_transaction(trans, -EUCLEAN); + btrfs_crit(fs_info, +"unexpected transaction when attempting to reallocate parent %llu for root %llu, transaction %llu running transaction %llu fs generation %llu", + parent->start, btrfs_root_id(root), trans->transid, + fs_info->running_transaction->transid, + fs_info->generation); + return -EUCLEAN; + } + + if (btrfs_header_nritems(parent) <= 1) + return 0; + + for (int i = start_slot; i <= end_slot; i++) { + struct extent_buffer *cur; + struct btrfs_disk_key disk_key; + u64 blocknr; + u64 other; + bool close = true; + + btrfs_node_key(parent, &disk_key, i); + if (!progress_passed && btrfs_comp_keys(&disk_key, progress) < 0) + continue; + + progress_passed = true; + blocknr = btrfs_node_blockptr(parent, i); + if (last_block == 0) + last_block = blocknr; + + if (i > 0) { + other = btrfs_node_blockptr(parent, i - 1); + close = close_blocks(blocknr, other, blocksize); + } + if (!close && i < end_slot) { + other = btrfs_node_blockptr(parent, i + 1); + close = close_blocks(blocknr, other, blocksize); + } + if (close) { + last_block = blocknr; + continue; + } + + cur = btrfs_read_node_slot(parent, i); + if (IS_ERR(cur)) + return PTR_ERR(cur); + if (search_start == 0) + search_start = last_block; + + btrfs_tree_lock(cur); + ret = btrfs_force_cow_block(trans, root, cur, parent, i, + &cur, search_start, + min(16 * blocksize, + (end_slot - i) * blocksize), + BTRFS_NESTING_COW); + if (ret) { + btrfs_tree_unlock(cur); + free_extent_buffer(cur); + break; + } + search_start = cur->start; + last_block = cur->start; + *last_ret = search_start; + btrfs_tree_unlock(cur); + free_extent_buffer(cur); + } + return ret; +} + +/* * Defrag all the leaves in a given btree. * Read all the leaves and try to get key order to * better reflect disk order */ -int btrfs_defrag_leaves(struct btrfs_trans_handle *trans, - struct btrfs_root *root) +static int btrfs_defrag_leaves(struct btrfs_trans_handle *trans, + struct btrfs_root *root) { struct btrfs_path *path = NULL; struct btrfs_key key; @@ -461,6 +566,45 @@ done: } /* + * Defrag a given btree. Every leaf in the btree is read and defragmented. + */ +int btrfs_defrag_root(struct btrfs_root *root) +{ + struct btrfs_fs_info *fs_info = root->fs_info; + int ret; + + if (test_and_set_bit(BTRFS_ROOT_DEFRAG_RUNNING, &root->state)) + return 0; + + while (1) { + struct btrfs_trans_handle *trans; + + trans = btrfs_start_transaction(root, 0); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + break; + } + + ret = btrfs_defrag_leaves(trans, root); + + btrfs_end_transaction(trans); + btrfs_btree_balance_dirty(fs_info); + cond_resched(); + + if (btrfs_fs_closing(fs_info) || ret != -EAGAIN) + break; + + if (btrfs_defrag_cancelled(fs_info)) { + btrfs_debug(fs_info, "defrag_root cancelled"); + ret = -EAGAIN; + break; + } + } + clear_bit(BTRFS_ROOT_DEFRAG_RUNNING, &root->state); + return ret; +} + +/* * Defrag specific helper to get an extent map. * * Differences between this and btrfs_get_extent() are: @@ -891,8 +1035,8 @@ static int defrag_collect_targets(struct btrfs_inode *inode, * very likely resulting in a larger extent after writeback is * triggered (except in a case of free space fragmentation). */ - if (test_range_bit(&inode->io_tree, cur, cur + range_len - 1, - EXTENT_DELALLOC, 0, NULL)) + if (test_range_bit_exists(&inode->io_tree, cur, cur + range_len - 1, + EXTENT_DELALLOC)) goto next; /* diff --git a/fs/btrfs/defrag.h b/fs/btrfs/defrag.h index 5305f2283b5e..5a62763528d1 100644 --- a/fs/btrfs/defrag.h +++ b/fs/btrfs/defrag.h @@ -12,7 +12,7 @@ int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans, struct btrfs_inode *inode, u32 extent_thresh); int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info); void btrfs_cleanup_defrag_inodes(struct btrfs_fs_info *fs_info); -int btrfs_defrag_leaves(struct btrfs_trans_handle *trans, struct btrfs_root *root); +int btrfs_defrag_root(struct btrfs_root *root); static inline int btrfs_defrag_cancelled(struct btrfs_fs_info *fs_info) { diff --git a/fs/btrfs/delalloc-space.c b/fs/btrfs/delalloc-space.c index 427abaf608b8..2833e8ef4c09 100644 --- a/fs/btrfs/delalloc-space.c +++ b/fs/btrfs/delalloc-space.c @@ -199,7 +199,7 @@ void btrfs_free_reserved_data_space(struct btrfs_inode *inode, start = round_down(start, fs_info->sectorsize); btrfs_free_reserved_data_space_noquota(fs_info, len); - btrfs_qgroup_free_data(inode, reserved, start, len); + btrfs_qgroup_free_data(inode, reserved, start, len, NULL); } /* @@ -322,9 +322,6 @@ int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes, } else { if (current->journal_info) flush = BTRFS_RESERVE_FLUSH_LIMIT; - - if (btrfs_transaction_in_commit(fs_info)) - schedule_timeout(1); } num_bytes = ALIGN(num_bytes, fs_info->sectorsize); @@ -346,7 +343,8 @@ int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes, noflush); if (ret) return ret; - ret = btrfs_reserve_metadata_bytes(fs_info, block_rsv, meta_reserve, flush); + ret = btrfs_reserve_metadata_bytes(fs_info, block_rsv->space_info, + meta_reserve, flush); if (ret) { btrfs_qgroup_free_meta_prealloc(root, qgroup_reserve); return ret; diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index 6d51db066503..7381241334e8 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -313,7 +313,7 @@ static struct btrfs_delayed_item *btrfs_alloc_delayed_item(u16 data_len, { struct btrfs_delayed_item *item; - item = kmalloc(sizeof(*item) + data_len, GFP_NOFS); + item = kmalloc(struct_size(item, data, data_len), GFP_NOFS); if (item) { item->data_len = data_len; item->type = type; @@ -328,7 +328,8 @@ static struct btrfs_delayed_item *btrfs_alloc_delayed_item(u16 data_len, } /* - * __btrfs_lookup_delayed_item - look up the delayed item by key + * Look up the delayed item by key. + * * @delayed_node: pointer to the delayed node * @index: the dir index value to lookup (offset of a dir index key) * @@ -412,6 +413,7 @@ static void finish_one_item(struct btrfs_delayed_root *delayed_root) static void __btrfs_remove_delayed_item(struct btrfs_delayed_item *delayed_item) { + struct btrfs_delayed_node *delayed_node = delayed_item->delayed_node; struct rb_root_cached *root; struct btrfs_delayed_root *delayed_root; @@ -419,18 +421,21 @@ static void __btrfs_remove_delayed_item(struct btrfs_delayed_item *delayed_item) if (RB_EMPTY_NODE(&delayed_item->rb_node)) return; - delayed_root = delayed_item->delayed_node->root->fs_info->delayed_root; + /* If it's in a rbtree, then we need to have delayed node locked. */ + lockdep_assert_held(&delayed_node->mutex); + + delayed_root = delayed_node->root->fs_info->delayed_root; BUG_ON(!delayed_root); if (delayed_item->type == BTRFS_DELAYED_INSERTION_ITEM) - root = &delayed_item->delayed_node->ins_root; + root = &delayed_node->ins_root; else - root = &delayed_item->delayed_node->del_root; + root = &delayed_node->del_root; rb_erase_cached(&delayed_item->rb_node, root); RB_CLEAR_NODE(&delayed_item->rb_node); - delayed_item->delayed_node->count--; + delayed_node->count--; finish_one_item(delayed_root); } @@ -513,7 +518,7 @@ static int btrfs_delayed_item_reserve_metadata(struct btrfs_trans_handle *trans, /* * For insertions we track reserved metadata space by accounting * for the number of leaves that will be used, based on the delayed - * node's index_items_size field. + * node's curr_index_batch_size and index_item_leaves fields. */ if (item->type == BTRFS_DELAYED_DELETION_ITEM) item->bytes_reserved = num_bytes; @@ -1026,7 +1031,7 @@ static int __btrfs_update_delayed_inode(struct btrfs_trans_handle *trans, struct btrfs_inode_item); write_extent_buffer(leaf, &node->inode_item, (unsigned long)inode_item, sizeof(struct btrfs_inode_item)); - btrfs_mark_buffer_dirty(leaf); + btrfs_mark_buffer_dirty(trans, leaf); if (!test_bit(BTRFS_DELAYED_NODE_DEL_IREF, &node->flags)) goto out; @@ -1153,20 +1158,33 @@ static int __btrfs_run_delayed_items(struct btrfs_trans_handle *trans, int nr) ret = __btrfs_commit_inode_delayed_items(trans, path, curr_node); if (ret) { - btrfs_release_delayed_node(curr_node); - curr_node = NULL; btrfs_abort_transaction(trans, ret); break; } prev_node = curr_node; curr_node = btrfs_next_delayed_node(curr_node); + /* + * See the comment below about releasing path before releasing + * node. If the commit of delayed items was successful the path + * should always be released, but in case of an error, it may + * point to locked extent buffers (a leaf at the very least). + */ + ASSERT(path->nodes[0] == NULL); btrfs_release_delayed_node(prev_node); } + /* + * Release the path to avoid a potential deadlock and lockdep splat when + * releasing the delayed node, as that requires taking the delayed node's + * mutex. If another task starts running delayed items before we take + * the mutex, it will first lock the mutex and then it may try to lock + * the same btree path (leaf). + */ + btrfs_free_path(path); + if (curr_node) btrfs_release_delayed_node(curr_node); - btrfs_free_path(path); trans->block_rsv = block_rsv; return ret; @@ -1361,8 +1379,7 @@ static int btrfs_wq_run_delayed_node(struct btrfs_delayed_root *delayed_root, return -ENOMEM; async_work->delayed_root = delayed_root; - btrfs_init_work(&async_work->work, btrfs_async_run_delayed_root, NULL, - NULL); + btrfs_init_work(&async_work->work, btrfs_async_run_delayed_root, NULL); async_work->nr = nr; btrfs_queue_work(fs_info->delayed_workers, &async_work->work); @@ -1413,7 +1430,29 @@ void btrfs_balance_delayed_items(struct btrfs_fs_info *fs_info) btrfs_wq_run_delayed_node(delayed_root, fs_info, BTRFS_DELAYED_BATCH); } -/* Will return 0 or -ENOMEM */ +static void btrfs_release_dir_index_item_space(struct btrfs_trans_handle *trans) +{ + struct btrfs_fs_info *fs_info = trans->fs_info; + const u64 bytes = btrfs_calc_insert_metadata_size(fs_info, 1); + + if (test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags)) + return; + + /* + * Adding the new dir index item does not require touching another + * leaf, so we can release 1 unit of metadata that was previously + * reserved when starting the transaction. This applies only to + * the case where we had a transaction start and excludes the + * transaction join case (when replaying log trees). + */ + trace_btrfs_space_reservation(fs_info, "transaction", + trans->transid, bytes, 0); + btrfs_block_rsv_release(fs_info, trans->block_rsv, bytes, NULL); + ASSERT(trans->bytes_reserved >= bytes); + trans->bytes_reserved -= bytes; +} + +/* Will return 0, -ENOMEM or -EEXIST (index number collision, unexpected). */ int btrfs_insert_delayed_dir_index(struct btrfs_trans_handle *trans, const char *name, int name_len, struct btrfs_inode *dir, @@ -1455,6 +1494,27 @@ int btrfs_insert_delayed_dir_index(struct btrfs_trans_handle *trans, mutex_lock(&delayed_node->mutex); + /* + * First attempt to insert the delayed item. This is to make the error + * handling path simpler in case we fail (-EEXIST). There's no risk of + * any other task coming in and running the delayed item before we do + * the metadata space reservation below, because we are holding the + * delayed node's mutex and that mutex must also be locked before the + * node's delayed items can be run. + */ + ret = __btrfs_add_delayed_item(delayed_node, delayed_item); + if (unlikely(ret)) { + btrfs_err(trans->fs_info, +"error adding delayed dir index item, name: %.*s, index: %llu, root: %llu, dir: %llu, dir->index_cnt: %llu, delayed_node->index_cnt: %llu, error: %d", + name_len, name, index, btrfs_root_id(delayed_node->root), + delayed_node->inode_id, dir->index_cnt, + delayed_node->index_cnt, ret); + btrfs_release_delayed_item(delayed_item); + btrfs_release_dir_index_item_space(trans); + mutex_unlock(&delayed_node->mutex); + goto release_node; + } + if (delayed_node->index_item_leaves == 0 || delayed_node->curr_index_batch_size + data_len > leaf_data_size) { delayed_node->curr_index_batch_size = data_len; @@ -1472,36 +1532,14 @@ int btrfs_insert_delayed_dir_index(struct btrfs_trans_handle *trans, * impossible. */ if (WARN_ON(ret)) { - mutex_unlock(&delayed_node->mutex); btrfs_release_delayed_item(delayed_item); + mutex_unlock(&delayed_node->mutex); goto release_node; } delayed_node->index_item_leaves++; - } else if (!test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags)) { - const u64 bytes = btrfs_calc_insert_metadata_size(fs_info, 1); - - /* - * Adding the new dir index item does not require touching another - * leaf, so we can release 1 unit of metadata that was previously - * reserved when starting the transaction. This applies only to - * the case where we had a transaction start and excludes the - * transaction join case (when replaying log trees). - */ - trace_btrfs_space_reservation(fs_info, "transaction", - trans->transid, bytes, 0); - btrfs_block_rsv_release(fs_info, trans->block_rsv, bytes, NULL); - ASSERT(trans->bytes_reserved >= bytes); - trans->bytes_reserved -= bytes; - } - - ret = __btrfs_add_delayed_item(delayed_node, delayed_item); - if (unlikely(ret)) { - btrfs_err(trans->fs_info, - "err add delayed dir index item(name: %.*s) into the insertion tree of the delayed node(root id: %llu, inode id: %llu, errno: %d)", - name_len, name, delayed_node->root->root_key.objectid, - delayed_node->inode_id, ret); - BUG(); + } else { + btrfs_release_dir_index_item_space(trans); } mutex_unlock(&delayed_node->mutex); @@ -1722,8 +1760,7 @@ int btrfs_should_delete_dir_index(struct list_head *del_list, } /* - * btrfs_readdir_delayed_dir_index - read dir info stored in the delayed tree - * + * Read dir info stored in the delayed tree. */ int btrfs_readdir_delayed_dir_index(struct dir_context *ctx, struct list_head *ins_list) @@ -1736,9 +1773,6 @@ int btrfs_readdir_delayed_dir_index(struct dir_context *ctx, int over = 0; unsigned char d_type; - if (list_empty(ins_list)) - return 0; - /* * Changing the data of the delayed item is impossible. So * we needn't lock them. And we have held i_mutex of the @@ -1799,24 +1833,22 @@ static void fill_stack_inode_item(struct btrfs_trans_handle *trans, btrfs_set_stack_inode_block_group(inode_item, 0); btrfs_set_stack_timespec_sec(&inode_item->atime, - inode->i_atime.tv_sec); + inode_get_atime_sec(inode)); btrfs_set_stack_timespec_nsec(&inode_item->atime, - inode->i_atime.tv_nsec); + inode_get_atime_nsec(inode)); btrfs_set_stack_timespec_sec(&inode_item->mtime, - inode->i_mtime.tv_sec); + inode_get_mtime_sec(inode)); btrfs_set_stack_timespec_nsec(&inode_item->mtime, - inode->i_mtime.tv_nsec); + inode_get_mtime_nsec(inode)); btrfs_set_stack_timespec_sec(&inode_item->ctime, - inode->i_ctime.tv_sec); + inode_get_ctime_sec(inode)); btrfs_set_stack_timespec_nsec(&inode_item->ctime, - inode->i_ctime.tv_nsec); + inode_get_ctime_nsec(inode)); - btrfs_set_stack_timespec_sec(&inode_item->otime, - BTRFS_I(inode)->i_otime.tv_sec); - btrfs_set_stack_timespec_nsec(&inode_item->otime, - BTRFS_I(inode)->i_otime.tv_nsec); + btrfs_set_stack_timespec_sec(&inode_item->otime, BTRFS_I(inode)->i_otime_sec); + btrfs_set_stack_timespec_nsec(&inode_item->otime, BTRFS_I(inode)->i_otime_nsec); } int btrfs_fill_inode(struct inode *inode, u32 *rdev) @@ -1856,19 +1888,17 @@ int btrfs_fill_inode(struct inode *inode, u32 *rdev) btrfs_inode_split_flags(btrfs_stack_inode_flags(inode_item), &BTRFS_I(inode)->flags, &BTRFS_I(inode)->ro_flags); - inode->i_atime.tv_sec = btrfs_stack_timespec_sec(&inode_item->atime); - inode->i_atime.tv_nsec = btrfs_stack_timespec_nsec(&inode_item->atime); + inode_set_atime(inode, btrfs_stack_timespec_sec(&inode_item->atime), + btrfs_stack_timespec_nsec(&inode_item->atime)); - inode->i_mtime.tv_sec = btrfs_stack_timespec_sec(&inode_item->mtime); - inode->i_mtime.tv_nsec = btrfs_stack_timespec_nsec(&inode_item->mtime); + inode_set_mtime(inode, btrfs_stack_timespec_sec(&inode_item->mtime), + btrfs_stack_timespec_nsec(&inode_item->mtime)); - inode->i_ctime.tv_sec = btrfs_stack_timespec_sec(&inode_item->ctime); - inode->i_ctime.tv_nsec = btrfs_stack_timespec_nsec(&inode_item->ctime); + inode_set_ctime(inode, btrfs_stack_timespec_sec(&inode_item->ctime), + btrfs_stack_timespec_nsec(&inode_item->ctime)); - BTRFS_I(inode)->i_otime.tv_sec = - btrfs_stack_timespec_sec(&inode_item->otime); - BTRFS_I(inode)->i_otime.tv_nsec = - btrfs_stack_timespec_nsec(&inode_item->otime); + BTRFS_I(inode)->i_otime_sec = btrfs_stack_timespec_sec(&inode_item->otime); + BTRFS_I(inode)->i_otime_nsec = btrfs_stack_timespec_nsec(&inode_item->otime); inode->i_generation = BTRFS_I(inode)->generation; BTRFS_I(inode)->index_cnt = (u64)-1; @@ -1879,9 +1909,9 @@ int btrfs_fill_inode(struct inode *inode, u32 *rdev) } int btrfs_delayed_update_inode(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct btrfs_inode *inode) { + struct btrfs_root *root = inode->root; struct btrfs_delayed_node *delayed_node; int ret = 0; diff --git a/fs/btrfs/delayed-inode.h b/fs/btrfs/delayed-inode.h index dc1085b2a397..5cceb31bbd16 100644 --- a/fs/btrfs/delayed-inode.h +++ b/fs/btrfs/delayed-inode.h @@ -95,7 +95,7 @@ struct btrfs_delayed_item { bool logged; /* The maximum leaf size is 64K, so u16 is more than enough. */ u16 data_len; - char data[]; + char data[] __counted_by(data_len); }; static inline void btrfs_init_delayed_root( @@ -135,7 +135,6 @@ int btrfs_commit_inode_delayed_inode(struct btrfs_inode *inode); int btrfs_delayed_update_inode(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct btrfs_inode *inode); int btrfs_fill_inode(struct inode *inode, u32 *rdev); int btrfs_delayed_delete_inode_ref(struct btrfs_inode *inode); diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index 6a13cf00218b..891ea2fa263c 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -57,16 +57,20 @@ bool btrfs_check_space_for_delayed_refs(struct btrfs_fs_info *fs_info) * Release a ref head's reservation. * * @fs_info: the filesystem - * @nr: number of items to drop + * @nr_refs: number of delayed refs to drop + * @nr_csums: number of csum items to drop * * Drops the delayed ref head's count from the delayed refs rsv and free any * excess reservation we had. */ -void btrfs_delayed_refs_rsv_release(struct btrfs_fs_info *fs_info, int nr) +void btrfs_delayed_refs_rsv_release(struct btrfs_fs_info *fs_info, int nr_refs, int nr_csums) { struct btrfs_block_rsv *block_rsv = &fs_info->delayed_refs_rsv; - const u64 num_bytes = btrfs_calc_delayed_ref_bytes(fs_info, nr); - u64 released = 0; + u64 num_bytes; + u64 released; + + num_bytes = btrfs_calc_delayed_ref_bytes(fs_info, nr_refs); + num_bytes += btrfs_calc_delayed_ref_csum_bytes(fs_info, nr_csums); released = btrfs_block_rsv_release(fs_info, block_rsv, num_bytes, NULL); if (released) @@ -77,50 +81,135 @@ void btrfs_delayed_refs_rsv_release(struct btrfs_fs_info *fs_info, int nr) /* * Adjust the size of the delayed refs rsv. * - * This is to be called anytime we may have adjusted trans->delayed_ref_updates, - * it'll calculate the additional size and add it to the delayed_refs_rsv. + * This is to be called anytime we may have adjusted trans->delayed_ref_updates + * or trans->delayed_ref_csum_deletions, it'll calculate the additional size and + * add it to the delayed_refs_rsv. */ void btrfs_update_delayed_refs_rsv(struct btrfs_trans_handle *trans) { struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_refs_rsv; + struct btrfs_block_rsv *local_rsv = &trans->delayed_rsv; u64 num_bytes; + u64 reserved_bytes; + + num_bytes = btrfs_calc_delayed_ref_bytes(fs_info, trans->delayed_ref_updates); + num_bytes += btrfs_calc_delayed_ref_csum_bytes(fs_info, + trans->delayed_ref_csum_deletions); - if (!trans->delayed_ref_updates) + if (num_bytes == 0) return; - num_bytes = btrfs_calc_delayed_ref_bytes(fs_info, - trans->delayed_ref_updates); + /* + * Try to take num_bytes from the transaction's local delayed reserve. + * If not possible, try to take as much as it's available. If the local + * reserve doesn't have enough reserved space, the delayed refs reserve + * will be refilled next time btrfs_delayed_refs_rsv_refill() is called + * by someone or if a transaction commit is triggered before that, the + * global block reserve will be used. We want to minimize using the + * global block reserve for cases we can account for in advance, to + * avoid exhausting it and reach -ENOSPC during a transaction commit. + */ + spin_lock(&local_rsv->lock); + reserved_bytes = min(num_bytes, local_rsv->reserved); + local_rsv->reserved -= reserved_bytes; + local_rsv->full = (local_rsv->reserved >= local_rsv->size); + spin_unlock(&local_rsv->lock); spin_lock(&delayed_rsv->lock); delayed_rsv->size += num_bytes; - delayed_rsv->full = false; + delayed_rsv->reserved += reserved_bytes; + delayed_rsv->full = (delayed_rsv->reserved >= delayed_rsv->size); spin_unlock(&delayed_rsv->lock); trans->delayed_ref_updates = 0; + trans->delayed_ref_csum_deletions = 0; +} + +/* + * Adjust the size of the delayed refs block reserve for 1 block group item + * insertion, used after allocating a block group. + */ +void btrfs_inc_delayed_refs_rsv_bg_inserts(struct btrfs_fs_info *fs_info) +{ + struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_refs_rsv; + + spin_lock(&delayed_rsv->lock); + /* + * Inserting a block group item does not require changing the free space + * tree, only the extent tree or the block group tree, so this is all we + * need. + */ + delayed_rsv->size += btrfs_calc_insert_metadata_size(fs_info, 1); + delayed_rsv->full = false; + spin_unlock(&delayed_rsv->lock); +} + +/* + * Adjust the size of the delayed refs block reserve to release space for 1 + * block group item insertion. + */ +void btrfs_dec_delayed_refs_rsv_bg_inserts(struct btrfs_fs_info *fs_info) +{ + struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_refs_rsv; + const u64 num_bytes = btrfs_calc_insert_metadata_size(fs_info, 1); + u64 released; + + released = btrfs_block_rsv_release(fs_info, delayed_rsv, num_bytes, NULL); + if (released > 0) + trace_btrfs_space_reservation(fs_info, "delayed_refs_rsv", + 0, released, 0); +} + +/* + * Adjust the size of the delayed refs block reserve for 1 block group item + * update. + */ +void btrfs_inc_delayed_refs_rsv_bg_updates(struct btrfs_fs_info *fs_info) +{ + struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_refs_rsv; + + spin_lock(&delayed_rsv->lock); + /* + * Updating a block group item does not result in new nodes/leaves and + * does not require changing the free space tree, only the extent tree + * or the block group tree, so this is all we need. + */ + delayed_rsv->size += btrfs_calc_metadata_size(fs_info, 1); + delayed_rsv->full = false; + spin_unlock(&delayed_rsv->lock); +} + +/* + * Adjust the size of the delayed refs block reserve to release space for 1 + * block group item update. + */ +void btrfs_dec_delayed_refs_rsv_bg_updates(struct btrfs_fs_info *fs_info) +{ + struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_refs_rsv; + const u64 num_bytes = btrfs_calc_metadata_size(fs_info, 1); + u64 released; + + released = btrfs_block_rsv_release(fs_info, delayed_rsv, num_bytes, NULL); + if (released > 0) + trace_btrfs_space_reservation(fs_info, "delayed_refs_rsv", + 0, released, 0); } /* * Transfer bytes to our delayed refs rsv. * * @fs_info: the filesystem - * @src: source block rsv to transfer from * @num_bytes: number of bytes to transfer * - * This transfers up to the num_bytes amount from the src rsv to the + * This transfers up to the num_bytes amount, previously reserved, to the * delayed_refs_rsv. Any extra bytes are returned to the space info. */ void btrfs_migrate_to_delayed_refs_rsv(struct btrfs_fs_info *fs_info, - struct btrfs_block_rsv *src, u64 num_bytes) { struct btrfs_block_rsv *delayed_refs_rsv = &fs_info->delayed_refs_rsv; u64 to_free = 0; - spin_lock(&src->lock); - src->reserved -= num_bytes; - src->size -= num_bytes; - spin_unlock(&src->lock); - spin_lock(&delayed_refs_rsv->lock); if (delayed_refs_rsv->size > delayed_refs_rsv->reserved) { u64 delta = delayed_refs_rsv->size - @@ -161,8 +250,11 @@ int btrfs_delayed_refs_rsv_refill(struct btrfs_fs_info *fs_info, enum btrfs_reserve_flush_enum flush) { struct btrfs_block_rsv *block_rsv = &fs_info->delayed_refs_rsv; + struct btrfs_space_info *space_info = block_rsv->space_info; u64 limit = btrfs_calc_delayed_ref_bytes(fs_info, 1); u64 num_bytes = 0; + u64 refilled_bytes; + u64 to_free; int ret = -ENOSPC; spin_lock(&block_rsv->lock); @@ -175,12 +267,40 @@ int btrfs_delayed_refs_rsv_refill(struct btrfs_fs_info *fs_info, if (!num_bytes) return 0; - ret = btrfs_reserve_metadata_bytes(fs_info, block_rsv, num_bytes, flush); + ret = btrfs_reserve_metadata_bytes(fs_info, space_info, num_bytes, flush); if (ret) return ret; - btrfs_block_rsv_add_bytes(block_rsv, num_bytes, false); - trace_btrfs_space_reservation(fs_info, "delayed_refs_rsv", - 0, num_bytes, 1); + + /* + * We may have raced with someone else, so check again if we the block + * reserve is still not full and release any excess space. + */ + spin_lock(&block_rsv->lock); + if (block_rsv->reserved < block_rsv->size) { + u64 needed = block_rsv->size - block_rsv->reserved; + + if (num_bytes >= needed) { + block_rsv->reserved += needed; + block_rsv->full = true; + to_free = num_bytes - needed; + refilled_bytes = needed; + } else { + block_rsv->reserved += num_bytes; + to_free = 0; + refilled_bytes = num_bytes; + } + } else { + to_free = num_bytes; + refilled_bytes = 0; + } + spin_unlock(&block_rsv->lock); + + if (to_free > 0) + btrfs_space_info_free_bytes_may_use(fs_info, space_info, to_free); + + if (refilled_bytes > 0) + trace_btrfs_space_reservation(fs_info, "delayed_refs_rsv", 0, + refilled_bytes, 1); return 0; } @@ -398,7 +518,8 @@ int btrfs_delayed_ref_lock(struct btrfs_delayed_ref_root *delayed_refs, return 0; } -static inline void drop_delayed_ref(struct btrfs_delayed_ref_root *delayed_refs, +static inline void drop_delayed_ref(struct btrfs_fs_info *fs_info, + struct btrfs_delayed_ref_root *delayed_refs, struct btrfs_delayed_ref_head *head, struct btrfs_delayed_ref_node *ref) { @@ -409,9 +530,11 @@ static inline void drop_delayed_ref(struct btrfs_delayed_ref_root *delayed_refs, list_del(&ref->add_list); btrfs_put_delayed_ref(ref); atomic_dec(&delayed_refs->num_entries); + btrfs_delayed_refs_rsv_release(fs_info, 1, 0); } -static bool merge_ref(struct btrfs_delayed_ref_root *delayed_refs, +static bool merge_ref(struct btrfs_fs_info *fs_info, + struct btrfs_delayed_ref_root *delayed_refs, struct btrfs_delayed_ref_head *head, struct btrfs_delayed_ref_node *ref, u64 seq) @@ -440,10 +563,10 @@ static bool merge_ref(struct btrfs_delayed_ref_root *delayed_refs, mod = -next->ref_mod; } - drop_delayed_ref(delayed_refs, head, next); + drop_delayed_ref(fs_info, delayed_refs, head, next); ref->ref_mod += mod; if (ref->ref_mod == 0) { - drop_delayed_ref(delayed_refs, head, ref); + drop_delayed_ref(fs_info, delayed_refs, head, ref); done = true; } else { /* @@ -481,7 +604,7 @@ again: ref = rb_entry(node, struct btrfs_delayed_ref_node, ref_node); if (seq && ref->seq >= seq) continue; - if (merge_ref(delayed_refs, head, ref, seq)) + if (merge_ref(fs_info, delayed_refs, head, ref, seq)) goto again; } } @@ -560,10 +683,11 @@ void btrfs_delete_ref_head(struct btrfs_delayed_ref_root *delayed_refs, * Return true if the ref was merged into an existing one (and therefore can be * freed by the caller). */ -static bool insert_delayed_ref(struct btrfs_delayed_ref_root *root, +static bool insert_delayed_ref(struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_head *href, struct btrfs_delayed_ref_node *ref) { + struct btrfs_delayed_ref_root *root = &trans->transaction->delayed_refs; struct btrfs_delayed_ref_node *exist; int mod; @@ -574,6 +698,7 @@ static bool insert_delayed_ref(struct btrfs_delayed_ref_root *root, list_add_tail(&ref->add_list, &href->ref_add_list); atomic_inc(&root->num_entries); spin_unlock(&href->lock); + trans->delayed_ref_updates++; return false; } @@ -602,7 +727,7 @@ static bool insert_delayed_ref(struct btrfs_delayed_ref_root *root, /* remove existing tail if its ref_mod is zero */ if (exist->ref_mod == 0) - drop_delayed_ref(root, href, exist); + drop_delayed_ref(trans->fs_info, root, href, exist); spin_unlock(&href->lock); return true; } @@ -623,6 +748,15 @@ static noinline void update_existing_head_ref(struct btrfs_trans_handle *trans, BUG_ON(existing->is_data != update->is_data); spin_lock(&existing->lock); + + /* + * When freeing an extent, we may not know the owning root when we + * first create the head_ref. However, some deref before the last deref + * will know it, so we just need to update the head_ref accordingly. + */ + if (!existing->owning_root) + existing->owning_root = update->owning_root; + if (update->must_insert_reserved) { /* if the extent was freed and then * reallocated before the delayed ref @@ -632,6 +766,7 @@ static noinline void update_existing_head_ref(struct btrfs_trans_handle *trans, * Set it again here */ existing->must_insert_reserved = update->must_insert_reserved; + existing->owning_root = update->owning_root; /* * update the num_bytes so we make sure the accounting @@ -671,6 +806,8 @@ static noinline void update_existing_head_ref(struct btrfs_trans_handle *trans, /* * If we are going to from a positive ref mod to a negative or vice * versa we need to make sure to adjust pending_csums accordingly. + * We reserve bytes for csum deletion when adding or updating a ref head + * see add_delayed_ref_head() for more details. */ if (existing->is_data) { u64 csum_leaves = @@ -679,11 +816,11 @@ static noinline void update_existing_head_ref(struct btrfs_trans_handle *trans, if (existing->total_ref_mod >= 0 && old_ref_mod < 0) { delayed_refs->pending_csums -= existing->num_bytes; - btrfs_delayed_refs_rsv_release(fs_info, csum_leaves); + btrfs_delayed_refs_rsv_release(fs_info, 0, csum_leaves); } if (existing->total_ref_mod < 0 && old_ref_mod >= 0) { delayed_refs->pending_csums += existing->num_bytes; - trans->delayed_ref_updates += csum_leaves; + trans->delayed_ref_csum_deletions += csum_leaves; } } @@ -694,7 +831,7 @@ static void init_delayed_ref_head(struct btrfs_delayed_ref_head *head_ref, struct btrfs_qgroup_extent_record *qrecord, u64 bytenr, u64 num_bytes, u64 ref_root, u64 reserved, int action, bool is_data, - bool is_system) + bool is_system, u64 owning_root) { int count_mod = 1; bool must_insert_reserved = false; @@ -734,7 +871,9 @@ static void init_delayed_ref_head(struct btrfs_delayed_ref_head *head_ref, head_ref->bytenr = bytenr; head_ref->num_bytes = num_bytes; head_ref->ref_mod = count_mod; + head_ref->reserved_bytes = reserved; head_ref->must_insert_reserved = must_insert_reserved; + head_ref->owning_root = owning_root; head_ref->is_data = is_data; head_ref->is_system = is_system; head_ref->ref_tree = RB_ROOT_CACHED; @@ -795,16 +934,21 @@ add_delayed_ref_head(struct btrfs_trans_handle *trans, kmem_cache_free(btrfs_delayed_ref_head_cachep, head_ref); head_ref = existing; } else { + /* + * We reserve the amount of bytes needed to delete csums when + * adding the ref head and not when adding individual drop refs + * since the csum items are deleted only after running the last + * delayed drop ref (the data extent's ref count drops to 0). + */ if (head_ref->is_data && head_ref->ref_mod < 0) { delayed_refs->pending_csums += head_ref->num_bytes; - trans->delayed_ref_updates += + trans->delayed_ref_csum_deletions += btrfs_csum_bytes_to_leaves(trans->fs_info, head_ref->num_bytes); } delayed_refs->num_heads++; delayed_refs->num_heads_ready++; atomic_inc(&delayed_refs->num_entries); - trans->delayed_ref_updates++; } if (qrecord_inserted_ret) *qrecord_inserted_ret = qrecord_inserted; @@ -813,8 +957,7 @@ add_delayed_ref_head(struct btrfs_trans_handle *trans, } /* - * init_delayed_ref_common - Initialize the structure which represents a - * modification to a an extent. + * Initialize the structure which represents a modification to a an extent. * * @fs_info: Internal to the mounted filesystem mount structure. * @@ -885,7 +1028,7 @@ int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans, u64 parent = generic_ref->parent; u8 ref_type; - is_system = (generic_ref->tree_ref.owning_root == BTRFS_CHUNK_TREE_OBJECTID); + is_system = (generic_ref->tree_ref.ref_root == BTRFS_CHUNK_TREE_OBJECTID); ASSERT(generic_ref->type == BTRFS_REF_METADATA && generic_ref->action); ref = kmem_cache_alloc(btrfs_delayed_tree_ref_cachep, GFP_NOFS); @@ -898,8 +1041,7 @@ int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans, return -ENOMEM; } - if (test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) && - !generic_ref->skip_qgroup) { + if (btrfs_qgroup_full_accounting(fs_info) && !generic_ref->skip_qgroup) { record = kzalloc(sizeof(*record), GFP_NOFS); if (!record) { kmem_cache_free(btrfs_delayed_tree_ref_cachep, ref); @@ -914,15 +1056,15 @@ int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans, ref_type = BTRFS_TREE_BLOCK_REF_KEY; init_delayed_ref_common(fs_info, &ref->node, bytenr, num_bytes, - generic_ref->tree_ref.owning_root, action, + generic_ref->tree_ref.ref_root, action, ref_type); - ref->root = generic_ref->tree_ref.owning_root; + ref->root = generic_ref->tree_ref.ref_root; ref->parent = parent; ref->level = level; init_delayed_ref_head(head_ref, record, bytenr, num_bytes, - generic_ref->tree_ref.owning_root, 0, action, - false, is_system); + generic_ref->tree_ref.ref_root, 0, action, + false, is_system, generic_ref->owning_root); head_ref->extent_op = extent_op; delayed_refs = &trans->transaction->delayed_refs; @@ -935,7 +1077,7 @@ int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans, head_ref = add_delayed_ref_head(trans, head_ref, record, action, &qrecord_inserted); - merged = insert_delayed_ref(delayed_refs, head_ref, &ref->node); + merged = insert_delayed_ref(trans, head_ref, &ref->node); spin_unlock(&delayed_refs->lock); /* @@ -974,7 +1116,7 @@ int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans, u64 bytenr = generic_ref->bytenr; u64 num_bytes = generic_ref->len; u64 parent = generic_ref->parent; - u64 ref_root = generic_ref->data_ref.owning_root; + u64 ref_root = generic_ref->data_ref.ref_root; u64 owner = generic_ref->data_ref.ino; u64 offset = generic_ref->data_ref.offset; u8 ref_type; @@ -1002,8 +1144,7 @@ int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans, return -ENOMEM; } - if (test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) && - !generic_ref->skip_qgroup) { + if (btrfs_qgroup_full_accounting(fs_info) && !generic_ref->skip_qgroup) { record = kzalloc(sizeof(*record), GFP_NOFS); if (!record) { kmem_cache_free(btrfs_delayed_data_ref_cachep, ref); @@ -1014,7 +1155,7 @@ int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans, } init_delayed_ref_head(head_ref, record, bytenr, num_bytes, ref_root, - reserved, action, true, false); + reserved, action, true, false, generic_ref->owning_root); head_ref->extent_op = NULL; delayed_refs = &trans->transaction->delayed_refs; @@ -1027,7 +1168,7 @@ int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans, head_ref = add_delayed_ref_head(trans, head_ref, record, action, &qrecord_inserted); - merged = insert_delayed_ref(delayed_refs, head_ref, &ref->node); + merged = insert_delayed_ref(trans, head_ref, &ref->node); spin_unlock(&delayed_refs->lock); /* @@ -1060,7 +1201,7 @@ int btrfs_add_delayed_extent_op(struct btrfs_trans_handle *trans, return -ENOMEM; init_delayed_ref_head(head_ref, NULL, bytenr, num_bytes, 0, 0, - BTRFS_UPDATE_DELAYED_HEAD, false, false); + BTRFS_UPDATE_DELAYED_HEAD, false, false, 0); head_ref->extent_op = extent_op; delayed_refs = &trans->transaction->delayed_refs; diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h index b8e14b0ba5f1..62d679d40f4f 100644 --- a/fs/btrfs/delayed-ref.h +++ b/fs/btrfs/delayed-ref.h @@ -9,10 +9,16 @@ #include <linux/refcount.h> /* these are the possible values of struct btrfs_delayed_ref_node->action */ -#define BTRFS_ADD_DELAYED_REF 1 /* add one backref to the tree */ -#define BTRFS_DROP_DELAYED_REF 2 /* delete one backref from the tree */ -#define BTRFS_ADD_DELAYED_EXTENT 3 /* record a full extent allocation */ -#define BTRFS_UPDATE_DELAYED_HEAD 4 /* not changing ref count on head ref */ +enum btrfs_delayed_ref_action { + /* Add one backref to the tree */ + BTRFS_ADD_DELAYED_REF = 1, + /* Delete one backref from the tree */ + BTRFS_DROP_DELAYED_REF, + /* Record a full extent allocation */ + BTRFS_ADD_DELAYED_EXTENT, + /* Not changing ref count on head ref */ + BTRFS_UPDATE_DELAYED_HEAD, +} __packed; struct btrfs_delayed_ref_node { struct rb_node ref_node; @@ -105,6 +111,18 @@ struct btrfs_delayed_ref_head { int ref_mod; /* + * The root that triggered the allocation when must_insert_reserved is + * set to true. + */ + u64 owning_root; + + /* + * Track reserved bytes when setting must_insert_reserved. On success + * or cleanup, we will need to free the reservation. + */ + u64 reserved_bytes; + + /* * when a new extent is allocated, it is just reserved in memory * The actual extent isn't inserted into the extent allocation tree * until the delayed ref is processed. must_insert_reserved is @@ -117,6 +135,7 @@ struct btrfs_delayed_ref_head { * the free has happened. */ bool must_insert_reserved; + bool is_data; bool is_system; bool processing; @@ -183,13 +202,13 @@ enum btrfs_ref_type { BTRFS_REF_DATA, BTRFS_REF_METADATA, BTRFS_REF_LAST, -}; +} __packed; struct btrfs_data_ref { /* For EXTENT_DATA_REF */ - /* Original root this data extent belongs to */ - u64 owning_root; + /* Root which owns this data reference. */ + u64 ref_root; /* Inode which refers to this data extent */ u64 ino; @@ -212,18 +231,18 @@ struct btrfs_tree_ref { int level; /* - * Root which owns this tree block. + * Root which owns this tree block reference. * * For TREE_BLOCK_REF (skinny metadata, either inline or keyed) */ - u64 owning_root; + u64 ref_root; /* For non-skinny metadata, no special member needed */ }; struct btrfs_ref { enum btrfs_ref_type type; - int action; + enum btrfs_delayed_ref_action action; /* * Whether this extent should go through qgroup record. @@ -239,6 +258,7 @@ struct btrfs_ref { #endif u64 bytenr; u64 len; + u64 owning_root; /* Bytenr of the parent tree block */ u64 parent; @@ -277,24 +297,37 @@ static inline u64 btrfs_calc_delayed_ref_bytes(const struct btrfs_fs_info *fs_in return num_bytes; } +static inline u64 btrfs_calc_delayed_ref_csum_bytes(const struct btrfs_fs_info *fs_info, + int num_csum_items) +{ + /* + * Deleting csum items does not result in new nodes/leaves and does not + * require changing the free space tree, only the csum tree, so this is + * all we need. + */ + return btrfs_calc_metadata_size(fs_info, num_csum_items); +} + static inline void btrfs_init_generic_ref(struct btrfs_ref *generic_ref, - int action, u64 bytenr, u64 len, u64 parent) + int action, u64 bytenr, u64 len, + u64 parent, u64 owning_root) { generic_ref->action = action; generic_ref->bytenr = bytenr; generic_ref->len = len; generic_ref->parent = parent; + generic_ref->owning_root = owning_root; } -static inline void btrfs_init_tree_ref(struct btrfs_ref *generic_ref, - int level, u64 root, u64 mod_root, bool skip_qgroup) +static inline void btrfs_init_tree_ref(struct btrfs_ref *generic_ref, int level, + u64 root, u64 mod_root, bool skip_qgroup) { #ifdef CONFIG_BTRFS_FS_REF_VERIFY /* If @real_root not set, use @root as fallback */ generic_ref->real_root = mod_root ?: root; #endif generic_ref->tree_ref.level = level; - generic_ref->tree_ref.owning_root = root; + generic_ref->tree_ref.ref_root = root; generic_ref->type = BTRFS_REF_METADATA; if (skip_qgroup || !(is_fstree(root) && (!mod_root || is_fstree(mod_root)))) @@ -312,7 +345,7 @@ static inline void btrfs_init_data_ref(struct btrfs_ref *generic_ref, /* If @real_root not set, use @root as fallback */ generic_ref->real_root = mod_root ?: ref_root; #endif - generic_ref->data_ref.owning_root = ref_root; + generic_ref->data_ref.ref_root = ref_root; generic_ref->data_ref.ino = ino; generic_ref->data_ref.offset = offset; generic_ref->type = BTRFS_REF_DATA; @@ -338,7 +371,6 @@ btrfs_free_delayed_extent_op(struct btrfs_delayed_extent_op *op) static inline void btrfs_put_delayed_ref(struct btrfs_delayed_ref_node *ref) { - WARN_ON(refcount_read(&ref->refs) == 0); if (refcount_dec_and_test(&ref->refs)) { WARN_ON(!RB_EMPTY_NODE(&ref->ref_node)); switch (ref->type) { @@ -402,12 +434,15 @@ struct btrfs_delayed_ref_head *btrfs_select_ref_head( int btrfs_check_delayed_seq(struct btrfs_fs_info *fs_info, u64 seq); -void btrfs_delayed_refs_rsv_release(struct btrfs_fs_info *fs_info, int nr); +void btrfs_delayed_refs_rsv_release(struct btrfs_fs_info *fs_info, int nr_refs, int nr_csums); void btrfs_update_delayed_refs_rsv(struct btrfs_trans_handle *trans); +void btrfs_inc_delayed_refs_rsv_bg_inserts(struct btrfs_fs_info *fs_info); +void btrfs_dec_delayed_refs_rsv_bg_inserts(struct btrfs_fs_info *fs_info); +void btrfs_inc_delayed_refs_rsv_bg_updates(struct btrfs_fs_info *fs_info); +void btrfs_dec_delayed_refs_rsv_bg_updates(struct btrfs_fs_info *fs_info); int btrfs_delayed_refs_rsv_refill(struct btrfs_fs_info *fs_info, enum btrfs_reserve_flush_enum flush); void btrfs_migrate_to_delayed_refs_rsv(struct btrfs_fs_info *fs_info, - struct btrfs_block_rsv *src, u64 num_bytes); bool btrfs_check_space_for_delayed_refs(struct btrfs_fs_info *fs_info); diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index 5f10965fd72b..f9544fda38e9 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -17,7 +17,6 @@ #include "print-tree.h" #include "volumes.h" #include "async-thread.h" -#include "check-integrity.h" #include "dev-replace.h" #include "sysfs.h" #include "zoned.h" @@ -247,6 +246,7 @@ static int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info, { struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; struct btrfs_device *device; + struct bdev_handle *bdev_handle; struct block_device *bdev; u64 devid = BTRFS_DEV_REPLACE_DEVID; int ret = 0; @@ -257,12 +257,13 @@ static int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info, return -EINVAL; } - bdev = blkdev_get_by_path(device_path, BLK_OPEN_WRITE, - fs_info->bdev_holder, NULL); - if (IS_ERR(bdev)) { + bdev_handle = bdev_open_by_path(device_path, BLK_OPEN_WRITE, + fs_info->bdev_holder, NULL); + if (IS_ERR(bdev_handle)) { btrfs_err(fs_info, "target device %s is invalid!", device_path); - return PTR_ERR(bdev); + return PTR_ERR(bdev_handle); } + bdev = bdev_handle->bdev; if (!btrfs_check_device_zone_type(fs_info, bdev)) { btrfs_err(fs_info, @@ -313,9 +314,9 @@ static int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info, device->commit_bytes_used = device->bytes_used; device->fs_info = fs_info; device->bdev = bdev; + device->bdev_handle = bdev_handle; set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state); set_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state); - device->holder = fs_info->bdev_holder; device->dev_stats_valid = 1; set_blocksize(device->bdev, BTRFS_BDEV_BLOCKSIZE); device->fs_devices = fs_devices; @@ -334,7 +335,7 @@ static int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info, return 0; error: - blkdev_put(bdev, fs_info->bdev_holder); + bdev_release(bdev_handle); return ret; } @@ -442,7 +443,7 @@ int btrfs_run_dev_replace(struct btrfs_trans_handle *trans) dev_replace->item_needs_writeback = 0; up_write(&dev_replace->rwsem); - btrfs_mark_buffer_dirty(eb); + btrfs_mark_buffer_dirty(trans, eb); out: btrfs_free_path(path); @@ -792,9 +793,9 @@ static int btrfs_set_target_alloc_state(struct btrfs_device *srcdev, lockdep_assert_held(&srcdev->fs_info->chunk_mutex); - while (!find_first_extent_bit(&srcdev->alloc_state, start, - &found_start, &found_end, - CHUNK_ALLOCATED, &cached_state)) { + while (find_first_extent_bit(&srcdev->alloc_state, start, + &found_start, &found_end, + CHUNK_ALLOCATED, &cached_state)) { ret = set_extent_bit(&tgtdev->alloc_state, found_start, found_end, CHUNK_ALLOCATED, NULL); if (ret) diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c index 082eb0e19598..9c07d5c3e5ad 100644 --- a/fs/btrfs/dir-item.c +++ b/fs/btrfs/dir-item.c @@ -38,7 +38,7 @@ static struct btrfs_dir_item *insert_with_overflow(struct btrfs_trans_handle di = btrfs_match_dir_item_name(fs_info, path, name, name_len); if (di) return ERR_PTR(-EEXIST); - btrfs_extend_item(path, data_size); + btrfs_extend_item(trans, path, data_size); } else if (ret < 0) return ERR_PTR(ret); WARN_ON(ret > 0); @@ -93,7 +93,7 @@ int btrfs_insert_xattr_item(struct btrfs_trans_handle *trans, write_extent_buffer(leaf, name, name_ptr, name_len); write_extent_buffer(leaf, data, data_ptr, data_len); - btrfs_mark_buffer_dirty(path->nodes[0]); + btrfs_mark_buffer_dirty(trans, path->nodes[0]); return ret; } @@ -153,7 +153,7 @@ int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, name_ptr = (unsigned long)(dir_item + 1); write_extent_buffer(leaf, name->name, name_ptr, name->len); - btrfs_mark_buffer_dirty(leaf); + btrfs_mark_buffer_dirty(trans, leaf); second_insert: /* FIXME, use some real flag for selecting the extra index */ @@ -439,7 +439,7 @@ int btrfs_delete_one_dir_name(struct btrfs_trans_handle *trans, start = btrfs_item_ptr_offset(leaf, path->slots[0]); memmove_extent_buffer(leaf, ptr, ptr + sub_item_len, item_len - (ptr + sub_item_len - start)); - btrfs_truncate_item(path, item_len - sub_item_len, 1); + btrfs_truncate_item(trans, path, item_len - sub_item_len, 1); } return ret; } diff --git a/fs/btrfs/dir-item.h b/fs/btrfs/dir-item.h index aab4b7cc7fa0..e40a226373d7 100644 --- a/fs/btrfs/dir-item.h +++ b/fs/btrfs/dir-item.h @@ -3,6 +3,10 @@ #ifndef BTRFS_DIR_ITEM_H #define BTRFS_DIR_ITEM_H +#include <linux/crc32c.h> + +struct fscrypt_str; + int btrfs_check_dir_item_collision(struct btrfs_root *root, u64 dir, const struct fscrypt_str *name); int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, @@ -39,4 +43,9 @@ struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_fs_info *fs_info, const char *name, int name_len); +static inline u64 btrfs_name_hash(const char *name, int len) +{ + return crc32c((u32)~1, name, len); +} + #endif diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index a9a2c5446c18..62cb97f7c94f 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -29,7 +29,6 @@ #include "tree-log.h" #include "free-space-cache.h" #include "free-space-tree.h" -#include "check-integrity.h" #include "rcu-string.h" #include "dev-replace.h" #include "raid56.h" @@ -245,6 +244,7 @@ blk_status_t btree_csum_one_bio(struct btrfs_bio *bbio) struct extent_buffer *eb = bbio->private; struct btrfs_fs_info *fs_info = eb->fs_info; u64 found_start = btrfs_header_bytenr(eb); + u64 last_trans; u8 result[BTRFS_CSUM_SIZE]; int ret; @@ -282,12 +282,12 @@ blk_status_t btree_csum_one_bio(struct btrfs_bio *bbio) * Also check the generation, the eb reached here must be newer than * last committed. Or something seriously wrong happened. */ - if (unlikely(btrfs_header_generation(eb) <= fs_info->last_trans_committed)) { + last_trans = btrfs_get_last_trans_committed(fs_info); + if (unlikely(btrfs_header_generation(eb) <= last_trans)) { ret = -EUCLEAN; btrfs_err(fs_info, "block=%llu bad generation, have %llu expect > %llu", - eb->start, btrfs_header_generation(eb), - fs_info->last_trans_committed); + eb->start, btrfs_header_generation(eb), last_trans); goto error; } write_extent_buffer(eb, result, 0, fs_info->csum_size); @@ -313,21 +313,17 @@ static bool check_tree_block_fsid(struct extent_buffer *eb) struct btrfs_fs_info *fs_info = eb->fs_info; struct btrfs_fs_devices *fs_devices = fs_info->fs_devices, *seed_devs; u8 fsid[BTRFS_FSID_SIZE]; - u8 *metadata_uuid; read_extent_buffer(eb, fsid, offsetof(struct btrfs_header, fsid), BTRFS_FSID_SIZE); + /* - * Checking the incompat flag is only valid for the current fs. For - * seed devices it's forbidden to have their uuid changed so reading - * ->fsid in this case is fine + * alloc_fsid_devices() copies the fsid into fs_devices::metadata_uuid. + * This is then overwritten by metadata_uuid if it is present in the + * device_list_add(). The same true for a seed device as well. So use of + * fs_devices::metadata_uuid is appropriate here. */ - if (btrfs_fs_incompat(fs_info, METADATA_UUID)) - metadata_uuid = fs_devices->metadata_uuid; - else - metadata_uuid = fs_devices->fsid; - - if (!memcmp(fsid, metadata_uuid, BTRFS_FSID_SIZE)) + if (memcmp(fsid, fs_info->fs_devices->metadata_uuid, BTRFS_FSID_SIZE) == 0) return false; list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list) @@ -525,6 +521,7 @@ static bool btree_dirty_folio(struct address_space *mapping, struct folio *folio) { struct btrfs_fs_info *fs_info = btrfs_sb(mapping->host->i_sb); + struct btrfs_subpage_info *spi = fs_info->subpage_info; struct btrfs_subpage *subpage; struct extent_buffer *eb; int cur_bit = 0; @@ -538,18 +535,19 @@ static bool btree_dirty_folio(struct address_space *mapping, btrfs_assert_tree_write_locked(eb); return filemap_dirty_folio(mapping, folio); } + + ASSERT(spi); subpage = folio_get_private(folio); - ASSERT(subpage->dirty_bitmap); - while (cur_bit < BTRFS_SUBPAGE_BITMAP_SIZE) { + for (cur_bit = spi->dirty_offset; + cur_bit < spi->dirty_offset + spi->bitmap_nr_bits; + cur_bit++) { unsigned long flags; u64 cur; - u16 tmp = (1 << cur_bit); spin_lock_irqsave(&subpage->lock, flags); - if (!(tmp & subpage->dirty_bitmap)) { + if (!test_bit(cur_bit, subpage->bitmaps)) { spin_unlock_irqrestore(&subpage->lock, flags); - cur_bit++; continue; } spin_unlock_irqrestore(&subpage->lock, flags); @@ -562,7 +560,7 @@ static bool btree_dirty_folio(struct address_space *mapping, btrfs_assert_tree_write_locked(eb); free_extent_buffer(eb); - cur_bit += (fs_info->nodesize >> fs_info->sectorsize_bits); + cur_bit += (fs_info->nodesize >> fs_info->sectorsize_bits) - 1; } return filemap_dirty_folio(mapping, folio); } @@ -678,9 +676,9 @@ static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info, refcount_set(&root->refs, 1); atomic_set(&root->snapshot_force_cow, 0); atomic_set(&root->nr_swapfiles, 0); - root->log_transid = 0; + btrfs_set_root_log_transid(root, 0); root->log_transid_committed = -1; - root->last_log_commit = 0; + btrfs_set_root_last_log_commit(root, 0); root->anon_dev = 0; if (!dummy) { extent_io_tree_init(fs_info, &root->dirty_log_pages, @@ -862,7 +860,7 @@ struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans, root->root_key.offset = 0; leaf = btrfs_alloc_tree_block(trans, root, 0, objectid, NULL, 0, 0, 0, - BTRFS_NESTING_NORMAL); + 0, BTRFS_NESTING_NORMAL); if (IS_ERR(leaf)) { ret = PTR_ERR(leaf); leaf = NULL; @@ -870,7 +868,7 @@ struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans, } root->node = leaf; - btrfs_mark_buffer_dirty(leaf); + btrfs_mark_buffer_dirty(trans, leaf); root->commit_root = btrfs_root_node(root); set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state); @@ -939,13 +937,13 @@ int btrfs_alloc_log_tree_node(struct btrfs_trans_handle *trans, */ leaf = btrfs_alloc_tree_block(trans, root, 0, BTRFS_TREE_LOG_OBJECTID, - NULL, 0, 0, 0, BTRFS_NESTING_NORMAL); + NULL, 0, 0, 0, 0, BTRFS_NESTING_NORMAL); if (IS_ERR(leaf)) return PTR_ERR(leaf); root->node = leaf; - btrfs_mark_buffer_dirty(root->node); + btrfs_mark_buffer_dirty(trans, root->node); btrfs_tree_unlock(root->node); return 0; @@ -1007,9 +1005,9 @@ int btrfs_add_log_tree(struct btrfs_trans_handle *trans, WARN_ON(root->log_root); root->log_root = log_root; - root->log_transid = 0; + btrfs_set_root_log_transid(root, 0); root->log_transid_committed = -1; - root->last_log_commit = 0; + btrfs_set_root_last_log_commit(root, 0); return 0; } @@ -1182,6 +1180,8 @@ static struct btrfs_root *btrfs_get_global_root(struct btrfs_fs_info *fs_info, return btrfs_grab_root(fs_info->block_group_root); case BTRFS_FREE_SPACE_TREE_OBJECTID: return btrfs_grab_root(btrfs_global_root(fs_info, &key)); + case BTRFS_RAID_STRIPE_TREE_OBJECTID: + return btrfs_grab_root(fs_info->stripe_root); default: return NULL; } @@ -1262,6 +1262,7 @@ void btrfs_free_fs_info(struct btrfs_fs_info *fs_info) btrfs_put_root(fs_info->fs_root); btrfs_put_root(fs_info->data_reloc_root); btrfs_put_root(fs_info->block_group_root); + btrfs_put_root(fs_info->stripe_root); btrfs_check_leaked_roots(fs_info); btrfs_extent_buffer_leak_debug_check(fs_info); kfree(fs_info->super_copy); @@ -1405,7 +1406,8 @@ struct btrfs_root *btrfs_get_new_fs_root(struct btrfs_fs_info *fs_info, } /* - * btrfs_get_fs_root_commit_root - return a root for the given objectid + * Return a root for the given objectid. + * * @fs_info: the fs_info * @objectid: the objectid we need to lookup * @@ -1552,7 +1554,7 @@ static int transaction_kthread(void *arg) delta = ktime_get_seconds() - cur->start_time; if (!test_and_clear_bit(BTRFS_FS_COMMIT_TRANS, &fs_info->flags) && - cur->state < TRANS_STATE_COMMIT_START && + cur->state < TRANS_STATE_COMMIT_PREP && delta < fs_info->commit_interval) { spin_unlock(&fs_info->trans_lock); delay -= msecs_to_jiffies((delta - 1) * 1000); @@ -1702,11 +1704,11 @@ static void backup_super_roots(struct btrfs_fs_info *info) } /* - * read_backup_root - Reads a backup root based on the passed priority. Prio 0 - * is the newest, prio 1/2/3 are 2nd newest/3rd newest/4th (oldest) backup roots + * Reads a backup root based on the passed priority. Prio 0 is the newest, prio + * 1/2/3 are 2nd newest/3rd newest/4th (oldest) backup roots * - * fs_info - filesystem whose backup roots need to be read - * priority - priority of backup root required + * @fs_info: filesystem whose backup roots need to be read + * @priority: priority of backup root required * * Returns backup root index on success and -EINVAL otherwise. */ @@ -1806,6 +1808,7 @@ static void free_root_pointers(struct btrfs_fs_info *info, bool free_chunk_root) free_root_extent_buffers(info->fs_root); free_root_extent_buffers(info->data_reloc_root); free_root_extent_buffers(info->block_group_root); + free_root_extent_buffers(info->stripe_root); if (free_chunk_root) free_root_extent_buffers(info->chunk_root); } @@ -2265,7 +2268,6 @@ static int btrfs_read_roots(struct btrfs_fs_info *fs_info) root = btrfs_read_tree_root(tree_root, &location); if (!IS_ERR(root)) { set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state); - set_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags); fs_info->quota_root = root; } @@ -2282,6 +2284,20 @@ static int btrfs_read_roots(struct btrfs_fs_info *fs_info) fs_info->uuid_root = root; } + if (btrfs_fs_incompat(fs_info, RAID_STRIPE_TREE)) { + location.objectid = BTRFS_RAID_STRIPE_TREE_OBJECTID; + root = btrfs_read_tree_root(tree_root, &location); + if (IS_ERR(root)) { + if (!btrfs_test_opt(fs_info, IGNOREBADROOTS)) { + ret = PTR_ERR(root); + goto out; + } + } else { + set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state); + fs_info->stripe_root = root; + } + } + return 0; out: btrfs_warn(fs_info, "failed to read root (objectid=%llu): %d", @@ -2384,21 +2400,19 @@ int btrfs_validate_super(struct btrfs_fs_info *fs_info, ret = -EINVAL; } - if (memcmp(fs_info->fs_devices->fsid, fs_info->super_copy->fsid, - BTRFS_FSID_SIZE)) { + if (!fs_info->fs_devices->temp_fsid && + memcmp(fs_info->fs_devices->fsid, sb->fsid, BTRFS_FSID_SIZE) != 0) { btrfs_err(fs_info, "superblock fsid doesn't match fsid of fs_devices: %pU != %pU", - fs_info->super_copy->fsid, fs_info->fs_devices->fsid); + sb->fsid, fs_info->fs_devices->fsid); ret = -EINVAL; } - if (btrfs_fs_incompat(fs_info, METADATA_UUID) && - memcmp(fs_info->fs_devices->metadata_uuid, - fs_info->super_copy->metadata_uuid, BTRFS_FSID_SIZE)) { + if (memcmp(fs_info->fs_devices->metadata_uuid, btrfs_sb_fsid_ptr(sb), + BTRFS_FSID_SIZE) != 0) { btrfs_err(fs_info, "superblock metadata_uuid doesn't match metadata uuid of fs_devices: %pU != %pU", - fs_info->super_copy->metadata_uuid, - fs_info->fs_devices->metadata_uuid); + btrfs_sb_fsid_ptr(sb), fs_info->fs_devices->metadata_uuid); ret = -EINVAL; } @@ -2640,7 +2654,7 @@ static int __cold init_tree_roots(struct btrfs_fs_info *fs_info) /* All successful */ fs_info->generation = btrfs_header_generation(tree_root->node); - fs_info->last_trans_committed = fs_info->generation; + btrfs_set_last_trans_committed(fs_info, fs_info->generation); fs_info->last_reloc_trans = 0; /* Always begin writing backup roots after the one being used */ @@ -2690,8 +2704,8 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info) btrfs_lockdep_init_map(fs_info, btrfs_trans_num_extwriters); btrfs_lockdep_init_map(fs_info, btrfs_trans_pending_ordered); btrfs_lockdep_init_map(fs_info, btrfs_ordered_extent); - btrfs_state_lockdep_init_map(fs_info, btrfs_trans_commit_start, - BTRFS_LOCKDEP_TRANS_COMMIT_START); + btrfs_state_lockdep_init_map(fs_info, btrfs_trans_commit_prep, + BTRFS_LOCKDEP_TRANS_COMMIT_PREP); btrfs_state_lockdep_init_map(fs_info, btrfs_trans_unblocked, BTRFS_LOCKDEP_TRANS_UNBLOCKED); btrfs_state_lockdep_init_map(fs_info, btrfs_trans_super_committed, @@ -2741,9 +2755,6 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info) spin_lock_init(&fs_info->ordered_root_lock); btrfs_init_scrub(fs_info); -#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY - fs_info->check_integrity_print_mask = 0; -#endif btrfs_init_balance(fs_info); btrfs_init_async_reclaim_work(fs_info); @@ -2869,6 +2880,56 @@ static int btrfs_check_uuid_tree(struct btrfs_fs_info *fs_info) return 0; } +static int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info) +{ + u64 root_objectid = 0; + struct btrfs_root *gang[8]; + int i = 0; + int err = 0; + unsigned int ret = 0; + + while (1) { + spin_lock(&fs_info->fs_roots_radix_lock); + ret = radix_tree_gang_lookup(&fs_info->fs_roots_radix, + (void **)gang, root_objectid, + ARRAY_SIZE(gang)); + if (!ret) { + spin_unlock(&fs_info->fs_roots_radix_lock); + break; + } + root_objectid = gang[ret - 1]->root_key.objectid + 1; + + for (i = 0; i < ret; i++) { + /* Avoid to grab roots in dead_roots. */ + if (btrfs_root_refs(&gang[i]->root_item) == 0) { + gang[i] = NULL; + continue; + } + /* Grab all the search result for later use. */ + gang[i] = btrfs_grab_root(gang[i]); + } + spin_unlock(&fs_info->fs_roots_radix_lock); + + for (i = 0; i < ret; i++) { + if (!gang[i]) + continue; + root_objectid = gang[i]->root_key.objectid; + err = btrfs_orphan_cleanup(gang[i]); + if (err) + goto out; + btrfs_put_root(gang[i]); + } + root_objectid++; + } +out: + /* Release the uncleaned roots due to error. */ + for (; i < ret; i++) { + if (gang[i]) + btrfs_put_root(gang[i]); + } + return err; +} + /* * Some options only have meaning at mount time and shouldn't persist across * remounts, or be displayed. Clear these at the end of mount and remount @@ -3113,7 +3174,6 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device u32 nodesize; u32 stripesize; u64 generation; - u64 features; u16 csum_type; struct btrfs_super_block *disk_super; struct btrfs_fs_info *fs_info = btrfs_sb(sb); @@ -3153,6 +3213,7 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device goto fail_alloc; } + btrfs_info(fs_info, "first mount of filesystem %pU", disk_super->fsid); /* * Verify the type first, if that or the checksum value are * corrupted, we'll find out @@ -3195,15 +3256,6 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device disk_super = fs_info->super_copy; - - features = btrfs_super_flags(disk_super); - if (features & BTRFS_SUPER_FLAG_CHANGING_FSID_V2) { - features &= ~BTRFS_SUPER_FLAG_CHANGING_FSID_V2; - btrfs_set_super_flags(disk_super, features); - btrfs_info(fs_info, - "found metadata UUID change in progress flag, clearing"); - } - memcpy(fs_info->super_for_commit, fs_info->super_copy, sizeof(*fs_info->super_for_commit)); @@ -3222,7 +3274,7 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device /* check FS state, whether FS is broken. */ if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_ERROR) - set_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state); + WRITE_ONCE(fs_info->fs_error, -EUCLEAN); /* * In the long term, we'll store the compression type in the super @@ -3417,6 +3469,8 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device btrfs_free_zone_cache(fs_info); + btrfs_check_active_zone_reservation(fs_info); + if (!sb_rdonly(sb) && fs_info->fs_devices->missing_devices && !btrfs_check_rw_degradable(fs_info, NULL)) { btrfs_warn(fs_info, @@ -3463,18 +3517,6 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device "auto enabling async discard"); } -#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY - if (btrfs_test_opt(fs_info, CHECK_INTEGRITY)) { - ret = btrfsic_mount(fs_info, fs_devices, - btrfs_test_opt(fs_info, - CHECK_INTEGRITY_DATA) ? 1 : 0, - fs_info->check_integrity_print_mask); - if (ret) - btrfs_warn(fs_info, - "failed to initialize integrity check module: %d", - ret); - } -#endif ret = btrfs_read_qgroup_config(fs_info); if (ret) goto fail_trans_kthread; @@ -3774,8 +3816,6 @@ static int write_dev_supers(struct btrfs_device *device, */ if (i == 0 && !btrfs_test_opt(device->fs_info, NOBARRIER)) bio->bi_opf |= REQ_FUA; - - btrfsic_check_bio(bio); submit_bio(bio); if (btrfs_advance_sb_log(device, i)) @@ -3871,28 +3911,11 @@ static void write_dev_flush(struct btrfs_device *device) device->last_flush_error = BLK_STS_OK; -#ifndef CONFIG_BTRFS_FS_CHECK_INTEGRITY - /* - * When a disk has write caching disabled, we skip submission of a bio - * with flush and sync requests before writing the superblock, since - * it's not needed. However when the integrity checker is enabled, this - * results in reports that there are metadata blocks referred by a - * superblock that were not properly flushed. So don't skip the bio - * submission only when the integrity checker is enabled for the sake - * of simplicity, since this is a debug tool and not meant for use in - * non-debug builds. - */ - if (!bdev_write_cache(device->bdev)) - return; -#endif - bio_init(bio, device->bdev, NULL, 0, REQ_OP_WRITE | REQ_SYNC | REQ_PREFLUSH); bio->bi_end_io = btrfs_end_empty_barrier; init_completion(&device->flush_wait); bio->bi_private = &device->flush_wait; - - btrfsic_check_bio(bio); submit_bio(bio); set_bit(BTRFS_DEV_STATE_FLUSH_SENT, &device->dev_state); } @@ -4136,56 +4159,6 @@ void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info, btrfs_put_root(root); } -int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info) -{ - u64 root_objectid = 0; - struct btrfs_root *gang[8]; - int i = 0; - int err = 0; - unsigned int ret = 0; - - while (1) { - spin_lock(&fs_info->fs_roots_radix_lock); - ret = radix_tree_gang_lookup(&fs_info->fs_roots_radix, - (void **)gang, root_objectid, - ARRAY_SIZE(gang)); - if (!ret) { - spin_unlock(&fs_info->fs_roots_radix_lock); - break; - } - root_objectid = gang[ret - 1]->root_key.objectid + 1; - - for (i = 0; i < ret; i++) { - /* Avoid to grab roots in dead_roots */ - if (btrfs_root_refs(&gang[i]->root_item) == 0) { - gang[i] = NULL; - continue; - } - /* grab all the search result for later use */ - gang[i] = btrfs_grab_root(gang[i]); - } - spin_unlock(&fs_info->fs_roots_radix_lock); - - for (i = 0; i < ret; i++) { - if (!gang[i]) - continue; - root_objectid = gang[i]->root_key.objectid; - err = btrfs_orphan_cleanup(gang[i]); - if (err) - goto out; - btrfs_put_root(gang[i]); - } - root_objectid++; - } -out: - /* release the uncleaned roots due to error */ - for (; i < ret; i++) { - if (gang[i]) - btrfs_put_root(gang[i]); - } - return err; -} - int btrfs_commit_super(struct btrfs_fs_info *fs_info) { struct btrfs_root *root = fs_info->tree_root; @@ -4228,7 +4201,7 @@ static void warn_about_uncommitted_trans(struct btrfs_fs_info *fs_info) u64 found_end; found = true; - while (!find_first_extent_bit(&trans->dirty_pages, cur, + while (find_first_extent_bit(&trans->dirty_pages, cur, &found_start, &found_end, EXTENT_DIRTY, &cached)) { dirty_bytes += found_end + 1 - found_start; cur = found_end + 1; @@ -4418,16 +4391,12 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info) iput(fs_info->btree_inode); -#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY - if (btrfs_test_opt(fs_info, CHECK_INTEGRITY)) - btrfsic_unmount(fs_info->fs_devices); -#endif - btrfs_mapping_tree_free(&fs_info->mapping_tree); btrfs_close_devices(fs_info->fs_devices); } -void btrfs_mark_buffer_dirty(struct extent_buffer *buf) +void btrfs_mark_buffer_dirty(struct btrfs_trans_handle *trans, + struct extent_buffer *buf) { struct btrfs_fs_info *fs_info = buf->fs_info; u64 transid = btrfs_header_generation(buf); @@ -4441,21 +4410,16 @@ void btrfs_mark_buffer_dirty(struct extent_buffer *buf) if (unlikely(test_bit(EXTENT_BUFFER_UNMAPPED, &buf->bflags))) return; #endif + /* This is an active transaction (its state < TRANS_STATE_UNBLOCKED). */ + ASSERT(trans->transid == fs_info->generation); btrfs_assert_tree_write_locked(buf); - if (transid != fs_info->generation) - WARN(1, KERN_CRIT "btrfs transid mismatch buffer %llu, found %llu running %llu\n", - buf->start, transid, fs_info->generation); - set_extent_buffer_dirty(buf); -#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY - /* - * btrfs_check_leaf() won't check item data if we don't have WRITTEN - * set, so this will only validate the basic structure of the items. - */ - if (btrfs_header_level(buf) == 0 && btrfs_check_leaf(buf)) { - btrfs_print_leaf(buf); - ASSERT(0); + if (unlikely(transid != fs_info->generation)) { + btrfs_abort_transaction(trans, -EUCLEAN); + btrfs_crit(fs_info, +"dirty buffer transid mismatch, logical %llu found transid %llu running transid %llu", + buf->start, transid, fs_info->generation); } -#endif + set_extent_buffer_dirty(buf); } static void __btrfs_btree_balance_dirty(struct btrfs_fs_info *fs_info, @@ -4552,9 +4516,7 @@ static void btrfs_destroy_ordered_extents(struct btrfs_root *root) static void btrfs_destroy_all_ordered_extents(struct btrfs_fs_info *fs_info) { struct btrfs_root *root; - struct list_head splice; - - INIT_LIST_HEAD(&splice); + LIST_HEAD(splice); spin_lock(&fs_info->ordered_root_lock); list_splice_init(&fs_info->ordered_roots, &splice); @@ -4617,6 +4579,7 @@ static void btrfs_destroy_delayed_refs(struct btrfs_transaction *trans, list_del(&ref->add_list); atomic_dec(&delayed_refs->num_entries); btrfs_put_delayed_ref(ref); + btrfs_delayed_refs_rsv_release(fs_info, 1, 0); } if (head->must_insert_reserved) pin_bytes = true; @@ -4660,9 +4623,7 @@ static void btrfs_destroy_delayed_refs(struct btrfs_transaction *trans, static void btrfs_destroy_delalloc_inodes(struct btrfs_root *root) { struct btrfs_inode *btrfs_inode; - struct list_head splice; - - INIT_LIST_HEAD(&splice); + LIST_HEAD(splice); spin_lock(&root->delalloc_lock); list_splice_init(&root->delalloc_inodes, &splice); @@ -4695,9 +4656,7 @@ static void btrfs_destroy_delalloc_inodes(struct btrfs_root *root) static void btrfs_destroy_all_delalloc_inodes(struct btrfs_fs_info *fs_info) { struct btrfs_root *root; - struct list_head splice; - - INIT_LIST_HEAD(&splice); + LIST_HEAD(splice); spin_lock(&fs_info->delalloc_root_lock); list_splice_init(&fs_info->delalloc_roots, &splice); @@ -4716,21 +4675,16 @@ static void btrfs_destroy_all_delalloc_inodes(struct btrfs_fs_info *fs_info) spin_unlock(&fs_info->delalloc_root_lock); } -static int btrfs_destroy_marked_extents(struct btrfs_fs_info *fs_info, - struct extent_io_tree *dirty_pages, - int mark) +static void btrfs_destroy_marked_extents(struct btrfs_fs_info *fs_info, + struct extent_io_tree *dirty_pages, + int mark) { - int ret; struct extent_buffer *eb; u64 start = 0; u64 end; - while (1) { - ret = find_first_extent_bit(dirty_pages, start, &start, &end, - mark, NULL); - if (ret) - break; - + while (find_first_extent_bit(dirty_pages, start, &start, &end, + mark, NULL)) { clear_extent_bits(dirty_pages, start, end, mark); while (start <= end) { eb = find_extent_buffer(fs_info, start); @@ -4746,16 +4700,13 @@ static int btrfs_destroy_marked_extents(struct btrfs_fs_info *fs_info, free_extent_buffer_stale(eb); } } - - return ret; } -static int btrfs_destroy_pinned_extent(struct btrfs_fs_info *fs_info, - struct extent_io_tree *unpin) +static void btrfs_destroy_pinned_extent(struct btrfs_fs_info *fs_info, + struct extent_io_tree *unpin) { u64 start; u64 end; - int ret; while (1) { struct extent_state *cached_state = NULL; @@ -4767,9 +4718,8 @@ static int btrfs_destroy_pinned_extent(struct btrfs_fs_info *fs_info, * the same extent range. */ mutex_lock(&fs_info->unused_bg_unpin_mutex); - ret = find_first_extent_bit(unpin, 0, &start, &end, - EXTENT_DIRTY, &cached_state); - if (ret) { + if (!find_first_extent_bit(unpin, 0, &start, &end, + EXTENT_DIRTY, &cached_state)) { mutex_unlock(&fs_info->unused_bg_unpin_mutex); break; } @@ -4780,8 +4730,6 @@ static int btrfs_destroy_pinned_extent(struct btrfs_fs_info *fs_info, mutex_unlock(&fs_info->unused_bg_unpin_mutex); cond_resched(); } - - return 0; } static void btrfs_cleanup_bg_io(struct btrfs_block_group *cache) @@ -4829,7 +4777,7 @@ void btrfs_cleanup_dirty_bgs(struct btrfs_transaction *cur_trans, spin_unlock(&cur_trans->dirty_bgs_lock); btrfs_put_block_group(cache); - btrfs_delayed_refs_rsv_release(fs_info, 1); + btrfs_dec_delayed_refs_rsv_bg_updates(fs_info); spin_lock(&cur_trans->dirty_bgs_lock); } spin_unlock(&cur_trans->dirty_bgs_lock); @@ -4851,6 +4799,32 @@ void btrfs_cleanup_dirty_bgs(struct btrfs_transaction *cur_trans, } } +static void btrfs_free_all_qgroup_pertrans(struct btrfs_fs_info *fs_info) +{ + struct btrfs_root *gang[8]; + int i; + int ret; + + spin_lock(&fs_info->fs_roots_radix_lock); + while (1) { + ret = radix_tree_gang_lookup_tag(&fs_info->fs_roots_radix, + (void **)gang, 0, + ARRAY_SIZE(gang), + BTRFS_ROOT_TRANS_TAG); + if (ret == 0) + break; + for (i = 0; i < ret; i++) { + struct btrfs_root *root = gang[i]; + + btrfs_qgroup_free_meta_all_pertrans(root); + radix_tree_tag_clear(&fs_info->fs_roots_radix, + (unsigned long)root->root_key.objectid, + BTRFS_ROOT_TRANS_TAG); + } + } + spin_unlock(&fs_info->fs_roots_radix_lock); +} + void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans, struct btrfs_fs_info *fs_info) { @@ -4879,6 +4853,8 @@ void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans, EXTENT_DIRTY); btrfs_destroy_pinned_extent(fs_info, &cur_trans->pinned_extents); + btrfs_free_all_qgroup_pertrans(fs_info); + cur_trans->state =TRANS_STATE_COMPLETED; wake_up(&cur_trans->commit_wait); } @@ -4893,7 +4869,7 @@ static int btrfs_cleanup_transaction(struct btrfs_fs_info *fs_info) while (!list_empty(&fs_info->trans_list)) { t = list_first_entry(&fs_info->trans_list, struct btrfs_transaction, list); - if (t->state >= TRANS_STATE_COMMIT_START) { + if (t->state >= TRANS_STATE_COMMIT_PREP) { refcount_inc(&t->use_count); spin_unlock(&fs_info->trans_lock); btrfs_wait_for_commit(fs_info, t->transid); diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index b03767f4d7ed..50dab8f639dc 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -77,7 +77,6 @@ struct btrfs_root *btrfs_extent_root(struct btrfs_fs_info *fs_info, u64 bytenr); struct btrfs_root *btrfs_block_group_root(struct btrfs_fs_info *fs_info); void btrfs_free_fs_info(struct btrfs_fs_info *fs_info); -int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info); void btrfs_btree_balance_dirty(struct btrfs_fs_info *fs_info); void btrfs_btree_balance_dirty_nodelay(struct btrfs_fs_info *fs_info); void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info, @@ -105,7 +104,8 @@ static inline struct btrfs_root *btrfs_grab_root(struct btrfs_root *root) } void btrfs_put_root(struct btrfs_root *root); -void btrfs_mark_buffer_dirty(struct extent_buffer *buf); +void btrfs_mark_buffer_dirty(struct btrfs_trans_handle *trans, + struct extent_buffer *buf); int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid, int atomic); int btrfs_read_extent_buffer(struct extent_buffer *buf, diff --git a/fs/btrfs/extent-io-tree.c b/fs/btrfs/extent-io-tree.c index a2315a4b8b75..ea149be28dff 100644 --- a/fs/btrfs/extent-io-tree.c +++ b/fs/btrfs/extent-io-tree.c @@ -105,32 +105,40 @@ void extent_io_tree_init(struct btrfs_fs_info *fs_info, lockdep_set_class(&tree->lock, &file_extent_tree_class); } +/* + * Empty an io tree, removing and freeing every extent state record from the + * tree. This should be called once we are sure no other task can access the + * tree anymore, so no tree updates happen after we empty the tree and there + * aren't any waiters on any extent state record (EXTENT_LOCKED bit is never + * set on any extent state when calling this function). + */ void extent_io_tree_release(struct extent_io_tree *tree) { + struct rb_root root; + struct extent_state *state; + struct extent_state *tmp; + spin_lock(&tree->lock); - /* - * Do a single barrier for the waitqueue_active check here, the state - * of the waitqueue should not change once extent_io_tree_release is - * called. - */ - smp_mb(); - while (!RB_EMPTY_ROOT(&tree->state)) { - struct rb_node *node; - struct extent_state *state; - - node = rb_first(&tree->state); - state = rb_entry(node, struct extent_state, rb_node); - rb_erase(&state->rb_node, &tree->state); + root = tree->state; + tree->state = RB_ROOT; + rbtree_postorder_for_each_entry_safe(state, tmp, &root, rb_node) { + /* Clear node to keep free_extent_state() happy. */ RB_CLEAR_NODE(&state->rb_node); + ASSERT(!(state->state & EXTENT_LOCKED)); /* - * btree io trees aren't supposed to have tasks waiting for - * changes in the flags of extent states ever. + * No need for a memory barrier here, as we are holding the tree + * lock and we only change the waitqueue while holding that lock + * (see wait_extent_bit()). */ ASSERT(!waitqueue_active(&state->wq)); free_extent_state(state); - cond_resched_lock(&tree->lock); } + /* + * Should still be empty even after a reschedule, no other task should + * be accessing the tree anymore. + */ + ASSERT(RB_EMPTY_ROOT(&tree->state)); spin_unlock(&tree->lock); } @@ -327,6 +335,36 @@ static void extent_io_tree_panic(struct extent_io_tree *tree, int err) "locking error: extent tree was modified by another thread while locked"); } +static void merge_prev_state(struct extent_io_tree *tree, struct extent_state *state) +{ + struct extent_state *prev; + + prev = prev_state(state); + if (prev && prev->end == state->start - 1 && prev->state == state->state) { + if (tree->inode) + btrfs_merge_delalloc_extent(tree->inode, state, prev); + state->start = prev->start; + rb_erase(&prev->rb_node, &tree->state); + RB_CLEAR_NODE(&prev->rb_node); + free_extent_state(prev); + } +} + +static void merge_next_state(struct extent_io_tree *tree, struct extent_state *state) +{ + struct extent_state *next; + + next = next_state(state); + if (next && next->start == state->end + 1 && next->state == state->state) { + if (tree->inode) + btrfs_merge_delalloc_extent(tree->inode, state, next); + state->end = next->end; + rb_erase(&next->rb_node, &tree->state); + RB_CLEAR_NODE(&next->rb_node); + free_extent_state(next); + } +} + /* * Utility function to look for merge candidates inside a given range. Any * extents with matching state are merged together into a single extent in the @@ -338,31 +376,11 @@ static void extent_io_tree_panic(struct extent_io_tree *tree, int err) */ static void merge_state(struct extent_io_tree *tree, struct extent_state *state) { - struct extent_state *other; - if (state->state & (EXTENT_LOCKED | EXTENT_BOUNDARY)) return; - other = prev_state(state); - if (other && other->end == state->start - 1 && - other->state == state->state) { - if (tree->inode) - btrfs_merge_delalloc_extent(tree->inode, state, other); - state->start = other->start; - rb_erase(&other->rb_node, &tree->state); - RB_CLEAR_NODE(&other->rb_node); - free_extent_state(other); - } - other = next_state(state); - if (other && other->start == state->end + 1 && - other->state == state->state) { - if (tree->inode) - btrfs_merge_delalloc_extent(tree->inode, state, other); - state->end = other->end; - rb_erase(&other->rb_node, &tree->state); - RB_CLEAR_NODE(&other->rb_node); - free_extent_state(other); - } + merge_prev_state(tree, state); + merge_next_state(tree, state); } static void set_state_bits(struct extent_io_tree *tree, @@ -384,19 +402,27 @@ static void set_state_bits(struct extent_io_tree *tree, * Insert an extent_state struct into the tree. 'bits' are set on the * struct before it is inserted. * - * This may return -EEXIST if the extent is already there, in which case the - * state struct is freed. + * Returns a pointer to the struct extent_state record containing the range + * requested for insertion, which may be the same as the given struct or it + * may be an existing record in the tree that was expanded to accommodate the + * requested range. In case of an extent_state different from the one that was + * given, the later can be freed or reused by the caller. + * + * On error it returns an error pointer. * * The tree lock is not taken internally. This is a utility function and * probably isn't what you want to call (see set/clear_extent_bit). */ -static int insert_state(struct extent_io_tree *tree, - struct extent_state *state, - u32 bits, struct extent_changeset *changeset) +static struct extent_state *insert_state(struct extent_io_tree *tree, + struct extent_state *state, + u32 bits, + struct extent_changeset *changeset) { struct rb_node **node; struct rb_node *parent = NULL; - const u64 end = state->end; + const u64 start = state->start - 1; + const u64 end = state->end + 1; + const bool try_merge = !(bits & (EXTENT_LOCKED | EXTENT_BOUNDARY)); set_state_bits(tree, state, bits, changeset); @@ -407,23 +433,42 @@ static int insert_state(struct extent_io_tree *tree, parent = *node; entry = rb_entry(parent, struct extent_state, rb_node); - if (end < entry->start) { + if (state->end < entry->start) { + if (try_merge && end == entry->start && + state->state == entry->state) { + if (tree->inode) + btrfs_merge_delalloc_extent(tree->inode, + state, entry); + entry->start = state->start; + merge_prev_state(tree, entry); + state->state = 0; + return entry; + } node = &(*node)->rb_left; - } else if (end > entry->end) { + } else if (state->end > entry->end) { + if (try_merge && entry->end == start && + state->state == entry->state) { + if (tree->inode) + btrfs_merge_delalloc_extent(tree->inode, + state, entry); + entry->end = state->end; + merge_next_state(tree, entry); + state->state = 0; + return entry; + } node = &(*node)->rb_right; } else { btrfs_err(tree->fs_info, "found node %llu %llu on insert of %llu %llu", - entry->start, entry->end, state->start, end); - return -EEXIST; + entry->start, entry->end, state->start, state->end); + return ERR_PTR(-EEXIST); } } rb_link_node(&state->rb_node, parent, node); rb_insert_color(&state->rb_node, &tree->state); - merge_state(tree, state); - return 0; + return state; } /* @@ -708,26 +753,13 @@ out: } -static void wait_on_state(struct extent_io_tree *tree, - struct extent_state *state) - __releases(tree->lock) - __acquires(tree->lock) -{ - DEFINE_WAIT(wait); - prepare_to_wait(&state->wq, &wait, TASK_UNINTERRUPTIBLE); - spin_unlock(&tree->lock); - schedule(); - spin_lock(&tree->lock); - finish_wait(&state->wq, &wait); -} - /* * Wait for one or more bits to clear on a range in the state tree. * The range [start, end] is inclusive. * The tree lock is taken by this function */ -void wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, u32 bits, - struct extent_state **cached_state) +static void wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, + u32 bits, struct extent_state **cached_state) { struct extent_state *state; @@ -758,9 +790,15 @@ process_node: goto out; if (state->state & bits) { + DEFINE_WAIT(wait); + start = state->start; refcount_inc(&state->refs); - wait_on_state(tree, state); + prepare_to_wait(&state->wq, &wait, TASK_UNINTERRUPTIBLE); + spin_unlock(&tree->lock); + schedule(); + spin_lock(&tree->lock); + finish_wait(&state->wq, &wait); free_extent_state(state); goto again; } @@ -831,15 +869,15 @@ static struct extent_state *find_first_extent_bit_state(struct extent_io_tree *t * * Note: If there are multiple bits set in @bits, any of them will match. * - * Return 0 if we find something, and update @start_ret and @end_ret. - * Return 1 if we found nothing. + * Return true if we find something, and update @start_ret and @end_ret. + * Return false if we found nothing. */ -int find_first_extent_bit(struct extent_io_tree *tree, u64 start, - u64 *start_ret, u64 *end_ret, u32 bits, - struct extent_state **cached_state) +bool find_first_extent_bit(struct extent_io_tree *tree, u64 start, + u64 *start_ret, u64 *end_ret, u32 bits, + struct extent_state **cached_state) { struct extent_state *state; - int ret = 1; + bool ret = false; spin_lock(&tree->lock); if (cached_state && *cached_state) { @@ -847,10 +885,19 @@ int find_first_extent_bit(struct extent_io_tree *tree, u64 start, if (state->end == start - 1 && extent_state_in_tree(state)) { while ((state = next_state(state)) != NULL) { if (state->state & bits) - goto got_it; + break; } + /* + * If we found the next extent state, clear cached_state + * so that we can cache the next extent state below and + * avoid future calls going over the same extent state + * again. If we haven't found any, clear as well since + * it's now useless. + */ free_extent_state(*cached_state); *cached_state = NULL; + if (state) + goto got_it; goto out; } free_extent_state(*cached_state); @@ -863,7 +910,7 @@ got_it: cache_state_if_flags(state, cached_state, 0); *start_ret = state->start; *end_ret = state->end; - ret = 0; + ret = true; } out: spin_unlock(&tree->lock); @@ -1133,6 +1180,8 @@ hit_next: */ if (state->start > start) { u64 this_end; + struct extent_state *inserted_state; + if (end < last_start) this_end = end; else @@ -1148,12 +1197,15 @@ hit_next: */ prealloc->start = start; prealloc->end = this_end; - err = insert_state(tree, prealloc, bits, changeset); - if (err) + inserted_state = insert_state(tree, prealloc, bits, changeset); + if (IS_ERR(inserted_state)) { + err = PTR_ERR(inserted_state); extent_io_tree_panic(tree, err); + } - cache_state(prealloc, cached_state); - prealloc = NULL; + cache_state(inserted_state, cached_state); + if (inserted_state == prealloc) + prealloc = NULL; start = this_end + 1; goto search_again; } @@ -1356,6 +1408,8 @@ hit_next: */ if (state->start > start) { u64 this_end; + struct extent_state *inserted_state; + if (end < last_start) this_end = end; else @@ -1373,11 +1427,14 @@ hit_next: */ prealloc->start = start; prealloc->end = this_end; - err = insert_state(tree, prealloc, bits, NULL); - if (err) + inserted_state = insert_state(tree, prealloc, bits, NULL); + if (IS_ERR(inserted_state)) { + err = PTR_ERR(inserted_state); extent_io_tree_panic(tree, err); - cache_state(prealloc, cached_state); - prealloc = NULL; + } + cache_state(inserted_state, cached_state); + if (inserted_state == prealloc) + prealloc = NULL; start = this_end + 1; goto search_again; } @@ -1640,15 +1697,46 @@ search: } /* - * Search a range in the state tree for a given mask. If 'filled' == 1, this - * returns 1 only if every extent in the tree has the bits set. Otherwise, 1 - * is returned if any bit in the range is found set. + * Check if the single @bit exists in the given range. */ -int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end, - u32 bits, int filled, struct extent_state *cached) +bool test_range_bit_exists(struct extent_io_tree *tree, u64 start, u64 end, u32 bit) { struct extent_state *state = NULL; - int bitset = 0; + bool bitset = false; + + ASSERT(is_power_of_2(bit)); + + spin_lock(&tree->lock); + state = tree_search(tree, start); + while (state && start <= end) { + if (state->start > end) + break; + + if (state->state & bit) { + bitset = true; + break; + } + + /* If state->end is (u64)-1, start will overflow to 0 */ + start = state->end + 1; + if (start > end || start == 0) + break; + state = next_state(state); + } + spin_unlock(&tree->lock); + return bitset; +} + +/* + * Check if the whole range [@start,@end) contains the single @bit set. + */ +bool test_range_bit(struct extent_io_tree *tree, u64 start, u64 end, u32 bit, + struct extent_state *cached) +{ + struct extent_state *state = NULL; + bool bitset = true; + + ASSERT(is_power_of_2(bit)); spin_lock(&tree->lock); if (cached && extent_state_in_tree(cached) && cached->start <= start && @@ -1657,35 +1745,35 @@ int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end, else state = tree_search(tree, start); while (state && start <= end) { - if (filled && state->start > start) { - bitset = 0; + if (state->start > start) { + bitset = false; break; } if (state->start > end) break; - if (state->state & bits) { - bitset = 1; - if (!filled) - break; - } else if (filled) { - bitset = 0; + if ((state->state & bit) == 0) { + bitset = false; break; } if (state->end == (u64)-1) break; + /* + * Last entry (if state->end is (u64)-1 and overflow happens), + * or next entry starts after the range. + */ start = state->end + 1; - if (start > end) + if (start > end || start == 0) break; state = next_state(state); } /* We ran out of states and were still inside of our range. */ - if (filled && !state) - bitset = 0; + if (!state) + bitset = false; spin_unlock(&tree->lock); return bitset; } diff --git a/fs/btrfs/extent-io-tree.h b/fs/btrfs/extent-io-tree.h index fbd3b275ab1c..5602b0137fcd 100644 --- a/fs/btrfs/extent-io-tree.h +++ b/fs/btrfs/extent-io-tree.h @@ -131,8 +131,9 @@ u64 count_range_bits(struct extent_io_tree *tree, struct extent_state **cached_state); void free_extent_state(struct extent_state *state); -int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end, - u32 bits, int filled, struct extent_state *cached_state); +bool test_range_bit(struct extent_io_tree *tree, u64 start, u64 end, u32 bit, + struct extent_state *cached_state); +bool test_range_bit_exists(struct extent_io_tree *tree, u64 start, u64 end, u32 bit); int clear_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, u32 bits, struct extent_changeset *changeset); int __clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, @@ -182,9 +183,9 @@ int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, u32 bits, u32 clear_bits, struct extent_state **cached_state); -int find_first_extent_bit(struct extent_io_tree *tree, u64 start, - u64 *start_ret, u64 *end_ret, u32 bits, - struct extent_state **cached_state); +bool find_first_extent_bit(struct extent_io_tree *tree, u64 start, + u64 *start_ret, u64 *end_ret, u32 bits, + struct extent_state **cached_state); void find_first_clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 *start_ret, u64 *end_ret, u32 bits); int find_contiguous_extent_bit(struct extent_io_tree *tree, u64 start, @@ -192,7 +193,5 @@ int find_contiguous_extent_bit(struct extent_io_tree *tree, u64 start, bool btrfs_find_delalloc_range(struct extent_io_tree *tree, u64 *start, u64 *end, u64 max_bytes, struct extent_state **cached_state); -void wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, u32 bits, - struct extent_state **cached_state); #endif /* BTRFS_EXTENT_IO_TREE_H */ diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index f396a9afa403..01423670bc8a 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -42,14 +42,16 @@ #include "file-item.h" #include "orphan.h" #include "tree-checker.h" +#include "raid-stripe-tree.h" #undef SCRAMBLE_DELAYED_REFS static int __btrfs_free_extent(struct btrfs_trans_handle *trans, + struct btrfs_delayed_ref_head *href, struct btrfs_delayed_ref_node *node, u64 parent, u64 root_objectid, u64 owner_objectid, - u64 owner_offset, int refs_to_drop, + u64 owner_offset, struct btrfs_delayed_extent_op *extra_op); static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op, struct extent_buffer *leaf, @@ -57,7 +59,7 @@ static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op, static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans, u64 parent, u64 root_objectid, u64 flags, u64 owner, u64 offset, - struct btrfs_key *ins, int ref_mod); + struct btrfs_key *ins, int ref_mod, u64 oref_root); static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_node *node, struct btrfs_delayed_extent_op *extent_op); @@ -69,27 +71,6 @@ static int block_group_bits(struct btrfs_block_group *cache, u64 bits) return (cache->flags & bits) == bits; } -int btrfs_add_excluded_extent(struct btrfs_fs_info *fs_info, - u64 start, u64 num_bytes) -{ - u64 end = start + num_bytes - 1; - set_extent_bit(&fs_info->excluded_extents, start, end, - EXTENT_UPTODATE, NULL); - return 0; -} - -void btrfs_free_excluded_extents(struct btrfs_block_group *cache) -{ - struct btrfs_fs_info *fs_info = cache->fs_info; - u64 start, end; - - start = cache->start; - end = start + cache->length - 1; - - clear_extent_bits(&fs_info->excluded_extents, start, end, - EXTENT_UPTODATE); -} - /* simple helper to search for an existing data extent at a given offset */ int btrfs_lookup_data_extent(struct btrfs_fs_info *fs_info, u64 start, u64 len) { @@ -121,7 +102,8 @@ int btrfs_lookup_data_extent(struct btrfs_fs_info *fs_info, u64 start, u64 len) */ int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, u64 bytenr, - u64 offset, int metadata, u64 *refs, u64 *flags) + u64 offset, int metadata, u64 *refs, u64 *flags, + u64 *owning_root) { struct btrfs_root *extent_root; struct btrfs_delayed_ref_head *head; @@ -133,6 +115,7 @@ int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans, u32 item_size; u64 num_refs; u64 extent_flags; + u64 owner = 0; int ret; /* @@ -186,9 +169,13 @@ search_again: struct btrfs_extent_item); num_refs = btrfs_extent_refs(leaf, ei); extent_flags = btrfs_extent_flags(leaf, ei); + owner = btrfs_get_extent_owner_root(fs_info, leaf, + path->slots[0]); } else { - ret = -EINVAL; - btrfs_print_v0_err(fs_info); + ret = -EUCLEAN; + btrfs_err(fs_info, + "unexpected extent item size, has %u expect >= %zu", + item_size, sizeof(*ei)); if (trans) btrfs_abort_transaction(trans, ret); else @@ -243,6 +230,8 @@ out: *refs = num_refs; if (flags) *flags = extent_flags; + if (owning_root) + *owning_root = owner; out_free: btrfs_free_path(path); return ret; @@ -363,9 +352,15 @@ int btrfs_get_extent_inline_ref_type(const struct extent_buffer *eb, struct btrfs_extent_inline_ref *iref, enum btrfs_inline_ref_type is_data) { + struct btrfs_fs_info *fs_info = eb->fs_info; int type = btrfs_extent_inline_ref_type(eb, iref); u64 offset = btrfs_extent_inline_ref_offset(eb, iref); + if (type == BTRFS_EXTENT_OWNER_REF_KEY) { + ASSERT(btrfs_fs_incompat(fs_info, SIMPLE_QUOTA)); + return type; + } + if (type == BTRFS_TREE_BLOCK_REF_KEY || type == BTRFS_SHARED_BLOCK_REF_KEY || type == BTRFS_SHARED_DATA_REF_KEY || @@ -374,26 +369,25 @@ int btrfs_get_extent_inline_ref_type(const struct extent_buffer *eb, if (type == BTRFS_TREE_BLOCK_REF_KEY) return type; if (type == BTRFS_SHARED_BLOCK_REF_KEY) { - ASSERT(eb->fs_info); + ASSERT(fs_info); /* * Every shared one has parent tree block, * which must be aligned to sector size. */ - if (offset && - IS_ALIGNED(offset, eb->fs_info->sectorsize)) + if (offset && IS_ALIGNED(offset, fs_info->sectorsize)) return type; } } else if (is_data == BTRFS_REF_TYPE_DATA) { if (type == BTRFS_EXTENT_DATA_REF_KEY) return type; if (type == BTRFS_SHARED_DATA_REF_KEY) { - ASSERT(eb->fs_info); + ASSERT(fs_info); /* * Every shared one has parent tree block, * which must be aligned to sector size. */ if (offset && - IS_ALIGNED(offset, eb->fs_info->sectorsize)) + IS_ALIGNED(offset, fs_info->sectorsize)) return type; } } else { @@ -402,11 +396,11 @@ int btrfs_get_extent_inline_ref_type(const struct extent_buffer *eb, } } + WARN_ON(1); btrfs_print_leaf(eb); - btrfs_err(eb->fs_info, + btrfs_err(fs_info, "eb %llu iref 0x%lx invalid extent inline ref type %d", eb->start, (unsigned long)iref, type); - WARN_ON(1); return BTRFS_REF_TYPE_INVALID; } @@ -418,11 +412,11 @@ u64 hash_extent_data_ref(u64 root_objectid, u64 owner, u64 offset) __le64 lenum; lenum = cpu_to_le64(root_objectid); - high_crc = btrfs_crc32c(high_crc, &lenum, sizeof(lenum)); + high_crc = crc32c(high_crc, &lenum, sizeof(lenum)); lenum = cpu_to_le64(owner); - low_crc = btrfs_crc32c(low_crc, &lenum, sizeof(lenum)); + low_crc = crc32c(low_crc, &lenum, sizeof(lenum)); lenum = cpu_to_le64(offset); - low_crc = btrfs_crc32c(low_crc, &lenum, sizeof(lenum)); + low_crc = crc32c(low_crc, &lenum, sizeof(lenum)); return ((u64)high_crc << 31) ^ (u64)low_crc; } @@ -594,7 +588,7 @@ static noinline int insert_extent_data_ref(struct btrfs_trans_handle *trans, btrfs_set_extent_data_ref_count(leaf, ref, num_refs); } } - btrfs_mark_buffer_dirty(leaf); + btrfs_mark_buffer_dirty(trans, leaf); ret = 0; fail: btrfs_release_path(path); @@ -624,12 +618,12 @@ static noinline int remove_extent_data_ref(struct btrfs_trans_handle *trans, ref2 = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_shared_data_ref); num_refs = btrfs_shared_data_ref_count(leaf, ref2); - } else if (unlikely(key.type == BTRFS_EXTENT_REF_V0_KEY)) { - btrfs_print_v0_err(trans->fs_info); - btrfs_abort_transaction(trans, -EINVAL); - return -EINVAL; } else { - BUG(); + btrfs_err(trans->fs_info, + "unrecognized backref key (%llu %u %llu)", + key.objectid, key.type, key.offset); + btrfs_abort_transaction(trans, -EUCLEAN); + return -EUCLEAN; } BUG_ON(num_refs < refs_to_drop); @@ -642,7 +636,7 @@ static noinline int remove_extent_data_ref(struct btrfs_trans_handle *trans, btrfs_set_extent_data_ref_count(leaf, ref1, num_refs); else if (key.type == BTRFS_SHARED_DATA_REF_KEY) btrfs_set_shared_data_ref_count(leaf, ref2, num_refs); - btrfs_mark_buffer_dirty(leaf); + btrfs_mark_buffer_dirty(trans, leaf); } return ret; } @@ -660,7 +654,6 @@ static noinline u32 extent_data_ref_count(struct btrfs_path *path, leaf = path->nodes[0]; btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); - BUG_ON(key.type == BTRFS_EXTENT_REF_V0_KEY); if (iref) { /* * If type is invalid, we should have bailed out earlier than @@ -809,7 +802,6 @@ int lookup_inline_extent_backref(struct btrfs_trans_handle *trans, int type; int want; int ret; - int err = 0; bool skinny_metadata = btrfs_fs_incompat(fs_info, SKINNY_METADATA); int needed; @@ -836,10 +828,8 @@ int lookup_inline_extent_backref(struct btrfs_trans_handle *trans, again: ret = btrfs_search_slot(trans, root, &key, path, extra_size, 1); - if (ret < 0) { - err = ret; + if (ret < 0) goto out; - } /* * We may be a newly converted file system which still has the old fat @@ -866,19 +856,26 @@ again: } if (ret && !insert) { - err = -ENOENT; + ret = -ENOENT; goto out; } else if (WARN_ON(ret)) { - err = -EIO; + btrfs_print_leaf(path->nodes[0]); + btrfs_err(fs_info, +"extent item not found for insert, bytenr %llu num_bytes %llu parent %llu root_objectid %llu owner %llu offset %llu", + bytenr, num_bytes, parent, root_objectid, owner, + offset); + ret = -EUCLEAN; goto out; } leaf = path->nodes[0]; item_size = btrfs_item_size(leaf, path->slots[0]); if (unlikely(item_size < sizeof(*ei))) { - err = -EINVAL; - btrfs_print_v0_err(fs_info); - btrfs_abort_transaction(trans, err); + ret = -EUCLEAN; + btrfs_err(fs_info, + "unexpected extent item size, has %llu expect >= %zu", + item_size, sizeof(*ei)); + btrfs_abort_transaction(trans, ret); goto out; } @@ -898,22 +895,17 @@ again: else needed = BTRFS_REF_TYPE_BLOCK; - err = -ENOENT; - while (1) { - if (ptr >= end) { - if (ptr > end) { - err = -EUCLEAN; - btrfs_print_leaf(path->nodes[0]); - btrfs_crit(fs_info, -"overrun extent record at slot %d while looking for inline extent for root %llu owner %llu offset %llu parent %llu", - path->slots[0], root_objectid, owner, offset, parent); - } - break; - } + ret = -ENOENT; + while (ptr < end) { iref = (struct btrfs_extent_inline_ref *)ptr; type = btrfs_get_extent_inline_ref_type(leaf, iref, needed); + if (type == BTRFS_EXTENT_OWNER_REF_KEY) { + ASSERT(btrfs_fs_incompat(fs_info, SIMPLE_QUOTA)); + ptr += btrfs_extent_inline_ref_size(type); + continue; + } if (type == BTRFS_REF_TYPE_INVALID) { - err = -EUCLEAN; + ret = -EUCLEAN; goto out; } @@ -929,7 +921,7 @@ again: dref = (struct btrfs_extent_data_ref *)(&iref->offset); if (match_extent_data_ref(leaf, dref, root_objectid, owner, offset)) { - err = 0; + ret = 0; break; } if (hash_extent_data_ref_item(leaf, dref) < @@ -940,14 +932,14 @@ again: ref_offset = btrfs_extent_inline_ref_offset(leaf, iref); if (parent > 0) { if (parent == ref_offset) { - err = 0; + ret = 0; break; } if (ref_offset < parent) break; } else { if (root_objectid == ref_offset) { - err = 0; + ret = 0; break; } if (ref_offset < root_objectid) @@ -956,10 +948,20 @@ again: } ptr += btrfs_extent_inline_ref_size(type); } - if (err == -ENOENT && insert) { + + if (unlikely(ptr > end)) { + ret = -EUCLEAN; + btrfs_print_leaf(path->nodes[0]); + btrfs_crit(fs_info, +"overrun extent record at slot %d while looking for inline extent for root %llu owner %llu offset %llu parent %llu", + path->slots[0], root_objectid, owner, offset, parent); + goto out; + } + + if (ret == -ENOENT && insert) { if (item_size + extra_size >= BTRFS_MAX_EXTENT_ITEM_SIZE(root)) { - err = -EAGAIN; + ret = -EAGAIN; goto out; } /* @@ -971,7 +973,7 @@ again: if (find_next_key(path, 0, &key) == 0 && key.objectid == bytenr && key.type < BTRFS_BLOCK_GROUP_ITEM_KEY) { - err = -EAGAIN; + ret = -EAGAIN; goto out; } } @@ -982,14 +984,14 @@ out: path->search_for_extension = 0; btrfs_unlock_up_safe(path, 1); } - return err; + return ret; } /* * helper to add new inline back ref */ static noinline_for_stack -void setup_inline_extent_backref(struct btrfs_fs_info *fs_info, +void setup_inline_extent_backref(struct btrfs_trans_handle *trans, struct btrfs_path *path, struct btrfs_extent_inline_ref *iref, u64 parent, u64 root_objectid, @@ -1012,7 +1014,7 @@ void setup_inline_extent_backref(struct btrfs_fs_info *fs_info, type = extent_ref_type(parent, owner); size = btrfs_extent_inline_ref_size(type); - btrfs_extend_item(path, size); + btrfs_extend_item(trans, path, size); ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); refs = btrfs_extent_refs(leaf, ei); @@ -1046,7 +1048,7 @@ void setup_inline_extent_backref(struct btrfs_fs_info *fs_info, } else { btrfs_set_extent_inline_ref_offset(leaf, iref, root_objectid); } - btrfs_mark_buffer_dirty(leaf); + btrfs_mark_buffer_dirty(trans, leaf); } static int lookup_extent_backref(struct btrfs_trans_handle *trans, @@ -1079,13 +1081,15 @@ static int lookup_extent_backref(struct btrfs_trans_handle *trans, /* * helper to update/remove inline back ref */ -static noinline_for_stack -void update_inline_extent_backref(struct btrfs_path *path, +static noinline_for_stack int update_inline_extent_backref( + struct btrfs_trans_handle *trans, + struct btrfs_path *path, struct btrfs_extent_inline_ref *iref, int refs_to_mod, struct btrfs_delayed_extent_op *extent_op) { struct extent_buffer *leaf = path->nodes[0]; + struct btrfs_fs_info *fs_info = leaf->fs_info; struct btrfs_extent_item *ei; struct btrfs_extent_data_ref *dref = NULL; struct btrfs_shared_data_ref *sref = NULL; @@ -1098,18 +1102,33 @@ void update_inline_extent_backref(struct btrfs_path *path, ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); refs = btrfs_extent_refs(leaf, ei); - WARN_ON(refs_to_mod < 0 && refs + refs_to_mod <= 0); + if (unlikely(refs_to_mod < 0 && refs + refs_to_mod <= 0)) { + struct btrfs_key key; + u32 extent_size; + + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + if (key.type == BTRFS_METADATA_ITEM_KEY) + extent_size = fs_info->nodesize; + else + extent_size = key.offset; + btrfs_print_leaf(leaf); + btrfs_err(fs_info, + "invalid refs_to_mod for extent %llu num_bytes %u, has %d expect >= -%llu", + key.objectid, extent_size, refs_to_mod, refs); + return -EUCLEAN; + } refs += refs_to_mod; btrfs_set_extent_refs(leaf, ei, refs); if (extent_op) __run_delayed_extent_op(extent_op, leaf, ei); + type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_ANY); /* - * If type is invalid, we should have bailed out after - * lookup_inline_extent_backref(). + * Function btrfs_get_extent_inline_ref_type() has already printed + * error messages. */ - type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_ANY); - ASSERT(type != BTRFS_REF_TYPE_INVALID); + if (unlikely(type == BTRFS_REF_TYPE_INVALID)) + return -EUCLEAN; if (type == BTRFS_EXTENT_DATA_REF_KEY) { dref = (struct btrfs_extent_data_ref *)(&iref->offset); @@ -1119,10 +1138,43 @@ void update_inline_extent_backref(struct btrfs_path *path, refs = btrfs_shared_data_ref_count(leaf, sref); } else { refs = 1; - BUG_ON(refs_to_mod != -1); + /* + * For tree blocks we can only drop one ref for it, and tree + * blocks should not have refs > 1. + * + * Furthermore if we're inserting a new inline backref, we + * won't reach this path either. That would be + * setup_inline_extent_backref(). + */ + if (unlikely(refs_to_mod != -1)) { + struct btrfs_key key; + + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + + btrfs_print_leaf(leaf); + btrfs_err(fs_info, + "invalid refs_to_mod for tree block %llu, has %d expect -1", + key.objectid, refs_to_mod); + return -EUCLEAN; + } } - BUG_ON(refs_to_mod < 0 && refs < -refs_to_mod); + if (unlikely(refs_to_mod < 0 && refs < -refs_to_mod)) { + struct btrfs_key key; + u32 extent_size; + + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + if (key.type == BTRFS_METADATA_ITEM_KEY) + extent_size = fs_info->nodesize; + else + extent_size = key.offset; + btrfs_print_leaf(leaf); + btrfs_err(fs_info, +"invalid refs_to_mod for backref entry, iref %lu extent %llu num_bytes %u, has %d expect >= -%llu", + (unsigned long)iref, key.objectid, extent_size, + refs_to_mod, refs); + return -EUCLEAN; + } refs += refs_to_mod; if (refs > 0) { @@ -1139,9 +1191,10 @@ void update_inline_extent_backref(struct btrfs_path *path, memmove_extent_buffer(leaf, ptr, ptr + size, end - ptr - size); item_size -= size; - btrfs_truncate_item(path, item_size, 1); + btrfs_truncate_item(trans, path, item_size, 1); } - btrfs_mark_buffer_dirty(leaf); + btrfs_mark_buffer_dirty(trans, leaf); + return 0; } static noinline_for_stack @@ -1170,9 +1223,10 @@ int insert_inline_extent_backref(struct btrfs_trans_handle *trans, bytenr, num_bytes, root_objectid, path->slots[0]); return -EUCLEAN; } - update_inline_extent_backref(path, iref, refs_to_add, extent_op); + ret = update_inline_extent_backref(trans, path, iref, + refs_to_add, extent_op); } else if (ret == -ENOENT) { - setup_inline_extent_backref(trans->fs_info, path, iref, parent, + setup_inline_extent_backref(trans, path, iref, parent, root_objectid, owner, offset, refs_to_add, extent_op); ret = 0; @@ -1190,7 +1244,8 @@ static int remove_extent_backref(struct btrfs_trans_handle *trans, BUG_ON(!is_data && refs_to_drop != 1); if (iref) - update_inline_extent_backref(path, iref, -refs_to_drop, NULL); + ret = update_inline_extent_backref(trans, path, iref, + -refs_to_drop, NULL); else if (is_data) ret = remove_extent_data_ref(trans, root, path, refs_to_drop); else @@ -1386,7 +1441,7 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, ASSERT(generic_ref->type != BTRFS_REF_NOT_SET && generic_ref->action); BUG_ON(generic_ref->type == BTRFS_REF_METADATA && - generic_ref->tree_ref.owning_root == BTRFS_TREE_LOG_OBJECTID); + generic_ref->tree_ref.ref_root == BTRFS_TREE_LOG_OBJECTID); if (generic_ref->type == BTRFS_REF_METADATA) ret = btrfs_add_delayed_tree_ref(trans, generic_ref, NULL); @@ -1399,7 +1454,7 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, } /* - * __btrfs_inc_extent_ref - insert backreference for a given extent + * Insert backreference for a given extent. * * The counterpart is in __btrfs_free_extent(), with examples and more details * how it works. @@ -1429,8 +1484,6 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, * always passed as 0. For data extents it is the fileoffset * this extent belongs to. * - * @refs_to_add Number of references to add - * * @extent_op Pointer to a structure, holding information necessary when * updating a tree block's flags * @@ -1438,7 +1491,7 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_node *node, u64 parent, u64 root_objectid, - u64 owner, u64 offset, int refs_to_add, + u64 owner, u64 offset, struct btrfs_delayed_extent_op *extent_op) { struct btrfs_path *path; @@ -1448,6 +1501,7 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, u64 bytenr = node->bytenr; u64 num_bytes = node->num_bytes; u64 refs; + int refs_to_add = node->ref_mod; int ret; path = btrfs_alloc_path(); @@ -1474,19 +1528,18 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, if (extent_op) __run_delayed_extent_op(extent_op, leaf, item); - btrfs_mark_buffer_dirty(leaf); + btrfs_mark_buffer_dirty(trans, leaf); btrfs_release_path(path); /* now insert the actual backref */ - if (owner < BTRFS_FIRST_FREE_OBJECTID) { - BUG_ON(refs_to_add != 1); + if (owner < BTRFS_FIRST_FREE_OBJECTID) ret = insert_tree_block_ref(trans, path, bytenr, parent, root_objectid); - } else { + else ret = insert_extent_data_ref(trans, path, bytenr, parent, root_objectid, owner, offset, refs_to_add); - } + if (ret) btrfs_abort_transaction(trans, ret); out: @@ -1494,45 +1547,72 @@ out: return ret; } +static void free_head_ref_squota_rsv(struct btrfs_fs_info *fs_info, + struct btrfs_delayed_ref_head *href) +{ + u64 root = href->owning_root; + + /* + * Don't check must_insert_reserved, as this is called from contexts + * where it has already been unset. + */ + if (btrfs_qgroup_mode(fs_info) != BTRFS_QGROUP_MODE_SIMPLE || + !href->is_data || !is_fstree(root)) + return; + + btrfs_qgroup_free_refroot(fs_info, root, href->reserved_bytes, + BTRFS_QGROUP_RSV_DATA); +} + static int run_delayed_data_ref(struct btrfs_trans_handle *trans, + struct btrfs_delayed_ref_head *href, struct btrfs_delayed_ref_node *node, struct btrfs_delayed_extent_op *extent_op, bool insert_reserved) { int ret = 0; struct btrfs_delayed_data_ref *ref; - struct btrfs_key ins; u64 parent = 0; - u64 ref_root = 0; u64 flags = 0; - ins.objectid = node->bytenr; - ins.offset = node->num_bytes; - ins.type = BTRFS_EXTENT_ITEM_KEY; - ref = btrfs_delayed_node_to_data_ref(node); trace_run_delayed_data_ref(trans->fs_info, node, ref, node->action); if (node->type == BTRFS_SHARED_DATA_REF_KEY) parent = ref->parent; - ref_root = ref->root; if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) { + struct btrfs_key key; + struct btrfs_squota_delta delta = { + .root = href->owning_root, + .num_bytes = node->num_bytes, + .is_data = true, + .is_inc = true, + .generation = trans->transid, + }; + if (extent_op) flags |= extent_op->flags_to_set; - ret = alloc_reserved_file_extent(trans, parent, ref_root, + + key.objectid = node->bytenr; + key.type = BTRFS_EXTENT_ITEM_KEY; + key.offset = node->num_bytes; + + ret = alloc_reserved_file_extent(trans, parent, ref->root, flags, ref->objectid, - ref->offset, &ins, - node->ref_mod); + ref->offset, &key, + node->ref_mod, href->owning_root); + free_head_ref_squota_rsv(trans->fs_info, href); + if (!ret) + ret = btrfs_record_squota_delta(trans->fs_info, &delta); } else if (node->action == BTRFS_ADD_DELAYED_REF) { - ret = __btrfs_inc_extent_ref(trans, node, parent, ref_root, + ret = __btrfs_inc_extent_ref(trans, node, parent, ref->root, ref->objectid, ref->offset, - node->ref_mod, extent_op); + extent_op); } else if (node->action == BTRFS_DROP_DELAYED_REF) { - ret = __btrfs_free_extent(trans, node, parent, - ref_root, ref->objectid, - ref->offset, node->ref_mod, - extent_op); + ret = __btrfs_free_extent(trans, href, node, parent, + ref->root, ref->objectid, + ref->offset, extent_op); } else { BUG(); } @@ -1569,7 +1649,6 @@ static int run_delayed_extent_op(struct btrfs_trans_handle *trans, struct extent_buffer *leaf; u32 item_size; int ret; - int err = 0; int metadata = 1; if (TRANS_ABORTED(trans)) @@ -1596,10 +1675,8 @@ static int run_delayed_extent_op(struct btrfs_trans_handle *trans, again: ret = btrfs_search_slot(trans, root, &key, path, 0, 1); if (ret < 0) { - err = ret; goto out; - } - if (ret > 0) { + } else if (ret > 0) { if (metadata) { if (path->slots[0] > 0) { path->slots[0]--; @@ -1620,7 +1697,10 @@ again: goto again; } } else { - err = -EIO; + ret = -EUCLEAN; + btrfs_err(fs_info, + "missing extent item for extent %llu num_bytes %llu level %d", + head->bytenr, head->num_bytes, extent_op->level); goto out; } } @@ -1629,27 +1709,31 @@ again: item_size = btrfs_item_size(leaf, path->slots[0]); if (unlikely(item_size < sizeof(*ei))) { - err = -EINVAL; - btrfs_print_v0_err(fs_info); - btrfs_abort_transaction(trans, err); + ret = -EUCLEAN; + btrfs_err(fs_info, + "unexpected extent item size, has %u expect >= %zu", + item_size, sizeof(*ei)); + btrfs_abort_transaction(trans, ret); goto out; } ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); __run_delayed_extent_op(extent_op, leaf, ei); - btrfs_mark_buffer_dirty(leaf); + btrfs_mark_buffer_dirty(trans, leaf); out: btrfs_free_path(path); - return err; + return ret; } static int run_delayed_tree_ref(struct btrfs_trans_handle *trans, + struct btrfs_delayed_ref_head *href, struct btrfs_delayed_ref_node *node, struct btrfs_delayed_extent_op *extent_op, bool insert_reserved) { int ret = 0; + struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_delayed_tree_ref *ref; u64 parent = 0; u64 ref_root = 0; @@ -1661,22 +1745,32 @@ static int run_delayed_tree_ref(struct btrfs_trans_handle *trans, parent = ref->parent; ref_root = ref->root; - if (node->ref_mod != 1) { + if (unlikely(node->ref_mod != 1)) { btrfs_err(trans->fs_info, - "btree block(%llu) has %d references rather than 1: action %d ref_root %llu parent %llu", + "btree block %llu has %d references rather than 1: action %d ref_root %llu parent %llu", node->bytenr, node->ref_mod, node->action, ref_root, parent); - return -EIO; + return -EUCLEAN; } if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) { + struct btrfs_squota_delta delta = { + .root = href->owning_root, + .num_bytes = fs_info->nodesize, + .is_data = false, + .is_inc = true, + .generation = trans->transid, + }; + BUG_ON(!extent_op || !extent_op->update_flags); ret = alloc_reserved_tree_block(trans, node, extent_op); + if (!ret) + btrfs_record_squota_delta(fs_info, &delta); } else if (node->action == BTRFS_ADD_DELAYED_REF) { ret = __btrfs_inc_extent_ref(trans, node, parent, ref_root, - ref->level, 0, 1, extent_op); + ref->level, 0, extent_op); } else if (node->action == BTRFS_DROP_DELAYED_REF) { - ret = __btrfs_free_extent(trans, node, parent, ref_root, - ref->level, 0, 1, extent_op); + ret = __btrfs_free_extent(trans, href, node, parent, ref_root, + ref->level, 0, extent_op); } else { BUG(); } @@ -1685,6 +1779,7 @@ static int run_delayed_tree_ref(struct btrfs_trans_handle *trans, /* helper function to actually process a single delayed ref entry */ static int run_one_delayed_ref(struct btrfs_trans_handle *trans, + struct btrfs_delayed_ref_head *href, struct btrfs_delayed_ref_node *node, struct btrfs_delayed_extent_op *extent_op, bool insert_reserved) @@ -1692,19 +1787,23 @@ static int run_one_delayed_ref(struct btrfs_trans_handle *trans, int ret = 0; if (TRANS_ABORTED(trans)) { - if (insert_reserved) + if (insert_reserved) { btrfs_pin_extent(trans, node->bytenr, node->num_bytes, 1); + free_head_ref_squota_rsv(trans->fs_info, href); + } return 0; } if (node->type == BTRFS_TREE_BLOCK_REF_KEY || node->type == BTRFS_SHARED_BLOCK_REF_KEY) - ret = run_delayed_tree_ref(trans, node, extent_op, + ret = run_delayed_tree_ref(trans, href, node, extent_op, insert_reserved); else if (node->type == BTRFS_EXTENT_DATA_REF_KEY || node->type == BTRFS_SHARED_DATA_REF_KEY) - ret = run_delayed_data_ref(trans, node, extent_op, + ret = run_delayed_data_ref(trans, href, node, extent_op, insert_reserved); + else if (node->type == BTRFS_EXTENT_OWNER_REF_KEY) + ret = 0; else BUG(); if (ret && insert_reserved) @@ -1783,28 +1882,38 @@ static int run_and_cleanup_extent_op(struct btrfs_trans_handle *trans, return ret ? ret : 1; } -void btrfs_cleanup_ref_head_accounting(struct btrfs_fs_info *fs_info, +u64 btrfs_cleanup_ref_head_accounting(struct btrfs_fs_info *fs_info, struct btrfs_delayed_ref_root *delayed_refs, struct btrfs_delayed_ref_head *head) { - int nr_items = 1; /* Dropping this ref head update. */ + u64 ret = 0; /* * We had csum deletions accounted for in our delayed refs rsv, we need * to drop the csum leaves for this update from our delayed_refs_rsv. */ if (head->total_ref_mod < 0 && head->is_data) { + int nr_csums; + spin_lock(&delayed_refs->lock); delayed_refs->pending_csums -= head->num_bytes; spin_unlock(&delayed_refs->lock); - nr_items += btrfs_csum_bytes_to_leaves(fs_info, head->num_bytes); + nr_csums = btrfs_csum_bytes_to_leaves(fs_info, head->num_bytes); + + btrfs_delayed_refs_rsv_release(fs_info, 0, nr_csums); + + ret = btrfs_calc_delayed_ref_csum_bytes(fs_info, nr_csums); } + /* must_insert_reserved can be set only if we didn't run the head ref. */ + if (head->must_insert_reserved) + free_head_ref_squota_rsv(fs_info, head); - btrfs_delayed_refs_rsv_release(fs_info, nr_items); + return ret; } static int cleanup_ref_head(struct btrfs_trans_handle *trans, - struct btrfs_delayed_ref_head *head) + struct btrfs_delayed_ref_head *head, + u64 *bytes_released) { struct btrfs_fs_info *fs_info = trans->fs_info; @@ -1849,7 +1958,7 @@ static int cleanup_ref_head(struct btrfs_trans_handle *trans, } } - btrfs_cleanup_ref_head_accounting(fs_info, delayed_refs, head); + *bytes_released += btrfs_cleanup_ref_head_accounting(fs_info, delayed_refs, head); trace_run_delayed_ref_head(fs_info, head, 0); btrfs_delayed_ref_unlock(head); @@ -1891,7 +2000,8 @@ static struct btrfs_delayed_ref_head *btrfs_obtain_ref_head( } static int btrfs_run_delayed_refs_for_head(struct btrfs_trans_handle *trans, - struct btrfs_delayed_ref_head *locked_ref) + struct btrfs_delayed_ref_head *locked_ref, + u64 *bytes_released) { struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_delayed_ref_root *delayed_refs; @@ -1939,14 +2049,22 @@ static int btrfs_run_delayed_refs_for_head(struct btrfs_trans_handle *trans, * spin lock. */ must_insert_reserved = locked_ref->must_insert_reserved; + /* + * Unsetting this on the head ref relinquishes ownership of + * the rsv_bytes, so it is critical that every possible code + * path from here forward frees all reserves including qgroup + * reserve. + */ locked_ref->must_insert_reserved = false; extent_op = locked_ref->extent_op; locked_ref->extent_op = NULL; spin_unlock(&locked_ref->lock); - ret = run_one_delayed_ref(trans, ref, extent_op, + ret = run_one_delayed_ref(trans, locked_ref, ref, extent_op, must_insert_reserved); + btrfs_delayed_refs_rsv_release(fs_info, 1, 0); + *bytes_released += btrfs_calc_delayed_ref_bytes(fs_info, 1); btrfs_free_delayed_extent_op(extent_op); if (ret) { @@ -1970,15 +2088,22 @@ static int btrfs_run_delayed_refs_for_head(struct btrfs_trans_handle *trans, * Returns -ENOMEM or -EIO on failure and will abort the transaction. */ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, - unsigned long nr) + u64 min_bytes) { struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_delayed_ref_root *delayed_refs; struct btrfs_delayed_ref_head *locked_ref = NULL; int ret; unsigned long count = 0; + unsigned long max_count = 0; + u64 bytes_processed = 0; delayed_refs = &trans->transaction->delayed_refs; + if (min_bytes == 0) { + max_count = delayed_refs->num_heads_ready; + min_bytes = U64_MAX; + } + do { if (!locked_ref) { locked_ref = btrfs_obtain_ref_head(trans); @@ -2006,7 +2131,7 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, spin_lock(&locked_ref->lock); btrfs_merge_delayed_refs(fs_info, delayed_refs, locked_ref); - ret = btrfs_run_delayed_refs_for_head(trans, locked_ref); + ret = btrfs_run_delayed_refs_for_head(trans, locked_ref, &bytes_processed); if (ret < 0 && ret != -EAGAIN) { /* * Error, btrfs_run_delayed_refs_for_head already @@ -2018,7 +2143,7 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, * Success, perform the usual cleanup of a processed * head */ - ret = cleanup_ref_head(trans, locked_ref); + ret = cleanup_ref_head(trans, locked_ref, &bytes_processed); if (ret > 0 ) { /* We dropped our lock, we need to loop. */ ret = 0; @@ -2035,7 +2160,9 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, locked_ref = NULL; cond_resched(); - } while ((nr != -1 && count < nr) || locked_ref); + } while ((min_bytes != U64_MAX && bytes_processed < min_bytes) || + (max_count > 0 && count < max_count) || + locked_ref); return 0; } @@ -2084,24 +2211,25 @@ static u64 find_middle(struct rb_root *root) #endif /* - * this starts processing the delayed reference count updates and - * extent insertions we have queued up so far. count can be - * 0, which means to process everything in the tree at the start - * of the run (but not newly added entries), or it can be some target - * number you'd like to process. + * Start processing the delayed reference count updates and extent insertions + * we have queued up so far. + * + * @trans: Transaction handle. + * @min_bytes: How many bytes of delayed references to process. After this + * many bytes we stop processing delayed references if there are + * any more. If 0 it means to run all existing delayed references, + * but not new ones added after running all existing ones. + * Use (u64)-1 (U64_MAX) to run all existing delayed references + * plus any new ones that are added. * * Returns 0 on success or if called with an aborted transaction * Returns <0 on error and aborts the transaction */ -int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, - unsigned long count) +int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, u64 min_bytes) { struct btrfs_fs_info *fs_info = trans->fs_info; - struct rb_node *node; struct btrfs_delayed_ref_root *delayed_refs; - struct btrfs_delayed_ref_head *head; int ret; - int run_all = count == (unsigned long)-1; /* We'll clean this up in btrfs_cleanup_transaction */ if (TRANS_ABORTED(trans)) @@ -2111,42 +2239,30 @@ int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, return 0; delayed_refs = &trans->transaction->delayed_refs; - if (count == 0) - count = delayed_refs->num_heads_ready; - again: #ifdef SCRAMBLE_DELAYED_REFS delayed_refs->run_delayed_start = find_middle(&delayed_refs->root); #endif - ret = __btrfs_run_delayed_refs(trans, count); + ret = __btrfs_run_delayed_refs(trans, min_bytes); if (ret < 0) { btrfs_abort_transaction(trans, ret); return ret; } - if (run_all) { + if (min_bytes == U64_MAX) { btrfs_create_pending_block_groups(trans); spin_lock(&delayed_refs->lock); - node = rb_first_cached(&delayed_refs->href_root); - if (!node) { + if (RB_EMPTY_ROOT(&delayed_refs->href_root.rb_root)) { spin_unlock(&delayed_refs->lock); - goto out; + return 0; } - head = rb_entry(node, struct btrfs_delayed_ref_head, - href_node); - refcount_inc(&head->refs); spin_unlock(&delayed_refs->lock); - /* Mutex was contended, block until it's released and retry. */ - mutex_lock(&head->mutex); - mutex_unlock(&head->mutex); - - btrfs_put_delayed_ref_head(head); cond_resched(); goto again; } -out: + return 0; } @@ -2271,6 +2387,7 @@ static noinline int check_committed_ref(struct btrfs_root *root, struct btrfs_extent_item *ei; struct btrfs_key key; u32 item_size; + u32 expected_size; int type; int ret; @@ -2297,10 +2414,22 @@ static noinline int check_committed_ref(struct btrfs_root *root, ret = 1; item_size = btrfs_item_size(leaf, path->slots[0]); ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); + expected_size = sizeof(*ei) + btrfs_extent_inline_ref_size(BTRFS_EXTENT_DATA_REF_KEY); + + /* No inline refs; we need to bail before checking for owner ref. */ + if (item_size == sizeof(*ei)) + goto out; + + /* Check for an owner ref; skip over it to the real inline refs. */ + iref = (struct btrfs_extent_inline_ref *)(ei + 1); + type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_DATA); + if (btrfs_fs_incompat(fs_info, SIMPLE_QUOTA) && type == BTRFS_EXTENT_OWNER_REF_KEY) { + expected_size += btrfs_extent_inline_ref_size(BTRFS_EXTENT_OWNER_REF_KEY); + iref = (struct btrfs_extent_inline_ref *)(iref + 1); + } /* If extent item has more than 1 inline ref then it's shared */ - if (item_size != sizeof(*ei) + - btrfs_extent_inline_ref_size(BTRFS_EXTENT_DATA_REF_KEY)) + if (item_size != expected_size) goto out; /* @@ -2312,8 +2441,6 @@ static noinline int check_committed_ref(struct btrfs_root *root, btrfs_root_last_snapshot(&root->root_item))) goto out; - iref = (struct btrfs_extent_inline_ref *)(ei + 1); - /* If this extent has SHARED_DATA_REF then it's shared */ type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_DATA); if (type != BTRFS_EXTENT_DATA_REF_KEY) @@ -2410,7 +2537,7 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans, num_bytes = btrfs_file_extent_disk_num_bytes(buf, fi); key.offset -= btrfs_file_extent_offset(buf, fi); btrfs_init_generic_ref(&generic_ref, action, bytenr, - num_bytes, parent); + num_bytes, parent, ref_root); btrfs_init_data_ref(&generic_ref, ref_root, key.objectid, key.offset, root->root_key.objectid, for_reloc); @@ -2423,8 +2550,9 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans, } else { bytenr = btrfs_node_blockptr(buf, i); num_bytes = fs_info->nodesize; + /* We don't know the owning_root, use 0. */ btrfs_init_generic_ref(&generic_ref, action, bytenr, - num_bytes, parent); + num_bytes, parent, 0); btrfs_init_tree_ref(&generic_ref, level - 1, ref_root, root->root_key.objectid, for_reloc); if (inc) @@ -2525,16 +2653,13 @@ int btrfs_pin_extent(struct btrfs_trans_handle *trans, return 0; } -/* - * this function must be called within transaction - */ int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans, - u64 bytenr, u64 num_bytes) + const struct extent_buffer *eb) { struct btrfs_block_group *cache; int ret; - cache = btrfs_lookup_block_group(trans->fs_info, bytenr); + cache = btrfs_lookup_block_group(trans->fs_info, eb->start); if (!cache) return -EINVAL; @@ -2546,10 +2671,10 @@ int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans, if (ret) goto out; - pin_down_extent(trans, cache, bytenr, num_bytes, 0); + pin_down_extent(trans, cache, eb->start, eb->len, 0); /* remove us from the free space cache (if we're there at all) */ - ret = btrfs_remove_free_space(cache, bytenr, num_bytes); + ret = btrfs_remove_free_space(cache, eb->start, eb->len); out: btrfs_put_block_group(cache); return ret; @@ -2751,9 +2876,8 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans) struct extent_state *cached_state = NULL; mutex_lock(&fs_info->unused_bg_unpin_mutex); - ret = find_first_extent_bit(unpin, 0, &start, &end, - EXTENT_DIRTY, &cached_state); - if (ret) { + if (!find_first_extent_bit(unpin, 0, &start, &end, + EXTENT_DIRTY, &cached_state)) { mutex_unlock(&fs_info->unused_bg_unpin_mutex); break; } @@ -2805,12 +2929,61 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans) return 0; } +/* + * Parse an extent item's inline extents looking for a simple quotas owner ref. + * + * @fs_info: the btrfs_fs_info for this mount + * @leaf: a leaf in the extent tree containing the extent item + * @slot: the slot in the leaf where the extent item is found + * + * Returns the objectid of the root that originally allocated the extent item + * if the inline owner ref is expected and present, otherwise 0. + * + * If an extent item has an owner ref item, it will be the first inline ref + * item. Therefore the logic is to check whether there are any inline ref + * items, then check the type of the first one. + */ +u64 btrfs_get_extent_owner_root(struct btrfs_fs_info *fs_info, + struct extent_buffer *leaf, int slot) +{ + struct btrfs_extent_item *ei; + struct btrfs_extent_inline_ref *iref; + struct btrfs_extent_owner_ref *oref; + unsigned long ptr; + unsigned long end; + int type; + + if (!btrfs_fs_incompat(fs_info, SIMPLE_QUOTA)) + return 0; + + ei = btrfs_item_ptr(leaf, slot, struct btrfs_extent_item); + ptr = (unsigned long)(ei + 1); + end = (unsigned long)ei + btrfs_item_size(leaf, slot); + + /* No inline ref items of any kind, can't check type. */ + if (ptr == end) + return 0; + + iref = (struct btrfs_extent_inline_ref *)ptr; + type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_ANY); + + /* We found an owner ref, get the root out of it. */ + if (type == BTRFS_EXTENT_OWNER_REF_KEY) { + oref = (struct btrfs_extent_owner_ref *)(&iref->offset); + return btrfs_extent_owner_ref_root_id(leaf, oref); + } + + /* We have inline refs, but not an owner ref. */ + return 0; +} + static int do_free_extent_accounting(struct btrfs_trans_handle *trans, - u64 bytenr, u64 num_bytes, bool is_data) + u64 bytenr, struct btrfs_squota_delta *delta) { int ret; + u64 num_bytes = delta->num_bytes; - if (is_data) { + if (delta->is_data) { struct btrfs_root *csum_root; csum_root = btrfs_csum_root(trans->fs_info, bytenr); @@ -2819,6 +2992,18 @@ static int do_free_extent_accounting(struct btrfs_trans_handle *trans, btrfs_abort_transaction(trans, ret); return ret; } + + ret = btrfs_delete_raid_extent(trans, bytenr, num_bytes); + if (ret) { + btrfs_abort_transaction(trans, ret); + return ret; + } + } + + ret = btrfs_record_squota_delta(trans->fs_info, delta); + if (ret) { + btrfs_abort_transaction(trans, ret); + return ret; } ret = add_to_free_space_tree(trans, bytenr, num_bytes); @@ -2901,9 +3086,10 @@ static int do_free_extent_accounting(struct btrfs_trans_handle *trans, * And that (13631488 EXTENT_DATA_REF <HASH>) gets removed. */ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, + struct btrfs_delayed_ref_head *href, struct btrfs_delayed_ref_node *node, u64 parent, u64 root_objectid, u64 owner_objectid, - u64 owner_offset, int refs_to_drop, + u64 owner_offset, struct btrfs_delayed_extent_op *extent_op) { struct btrfs_fs_info *info = trans->fs_info; @@ -2918,11 +3104,13 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, int extent_slot = 0; int found_extent = 0; int num_to_del = 1; + int refs_to_drop = node->ref_mod; u32 item_size; u64 refs; u64 bytenr = node->bytenr; u64 num_bytes = node->num_bytes; bool skinny_metadata = btrfs_fs_incompat(info, SKINNY_METADATA); + u64 delayed_ref_root = href->owning_root; extent_root = btrfs_extent_root(info, bytenr); ASSERT(extent_root); @@ -3059,8 +3247,10 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, leaf = path->nodes[0]; item_size = btrfs_item_size(leaf, extent_slot); if (unlikely(item_size < sizeof(*ei))) { - ret = -EINVAL; - btrfs_print_v0_err(info); + ret = -EUCLEAN; + btrfs_err(trans->fs_info, + "unexpected extent item size, has %u expect >= %zu", + item_size, sizeof(*ei)); btrfs_abort_transaction(trans, ret); goto out; } @@ -3110,7 +3300,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, } } else { btrfs_set_extent_refs(leaf, ei, refs); - btrfs_mark_buffer_dirty(leaf); + btrfs_mark_buffer_dirty(trans, leaf); } if (found_extent) { ret = remove_extent_backref(trans, extent_root, path, @@ -3121,6 +3311,14 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, } } } else { + struct btrfs_squota_delta delta = { + .root = delayed_ref_root, + .num_bytes = num_bytes, + .is_data = is_data, + .is_inc = false, + .generation = btrfs_extent_generation(leaf, ei), + }; + /* In this branch refs == 1 */ if (found_extent) { if (is_data && refs_to_drop != @@ -3159,6 +3357,16 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, num_to_del = 2; } } + /* + * We can't infer the data owner from the delayed ref, so we need + * to try to get it from the owning ref item. + * + * If it is not present, then that extent was not written under + * simple quotas mode, so we don't need to account for its deletion. + */ + if (is_data) + delta.root = btrfs_get_extent_owner_root(trans->fs_info, + leaf, extent_slot); ret = btrfs_del_items(trans, extent_root, path, path->slots[0], num_to_del); @@ -3168,7 +3376,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, } btrfs_release_path(path); - ret = do_free_extent_accounting(trans, bytenr, num_bytes, is_data); + ret = do_free_extent_accounting(trans, bytenr, &delta); } btrfs_release_path(path); @@ -3242,7 +3450,7 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans, int ret; btrfs_init_generic_ref(&generic_ref, BTRFS_DROP_DELAYED_REF, - buf->start, buf->len, parent); + buf->start, buf->len, parent, btrfs_header_owner(buf)); btrfs_init_tree_ref(&generic_ref, btrfs_header_level(buf), root_id, 0, false); @@ -3329,10 +3537,9 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_ref *ref) * tree, just update pinning info and exit early. */ if ((ref->type == BTRFS_REF_METADATA && - ref->tree_ref.owning_root == BTRFS_TREE_LOG_OBJECTID) || + ref->tree_ref.ref_root == BTRFS_TREE_LOG_OBJECTID) || (ref->type == BTRFS_REF_DATA && - ref->data_ref.owning_root == BTRFS_TREE_LOG_OBJECTID)) { - /* unlocks the pinned mutex */ + ref->data_ref.ref_root == BTRFS_TREE_LOG_OBJECTID)) { btrfs_pin_extent(trans, ref->bytenr, ref->len, 1); ret = 0; } else if (ref->type == BTRFS_REF_METADATA) { @@ -3342,20 +3549,47 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_ref *ref) } if (!((ref->type == BTRFS_REF_METADATA && - ref->tree_ref.owning_root == BTRFS_TREE_LOG_OBJECTID) || + ref->tree_ref.ref_root == BTRFS_TREE_LOG_OBJECTID) || (ref->type == BTRFS_REF_DATA && - ref->data_ref.owning_root == BTRFS_TREE_LOG_OBJECTID))) + ref->data_ref.ref_root == BTRFS_TREE_LOG_OBJECTID))) btrfs_ref_tree_mod(fs_info, ref); return ret; } enum btrfs_loop_type { + /* + * Start caching block groups but do not wait for progress or for them + * to be done. + */ LOOP_CACHING_NOWAIT, + + /* + * Wait for the block group free_space >= the space we're waiting for if + * the block group isn't cached. + */ LOOP_CACHING_WAIT, + + /* + * Allow allocations to happen from block groups that do not yet have a + * size classification. + */ LOOP_UNSET_SIZE_CLASS, + + /* + * Allocate a chunk and then retry the allocation. + */ LOOP_ALLOC_CHUNK, + + /* + * Ignore the size class restrictions for this allocation. + */ LOOP_WRONG_SIZE_CLASS, + + /* + * Ignore the empty size, only try to allocate the number of bytes + * needed for this allocation. + */ LOOP_NO_EMPTY_SIZE, }; @@ -3427,7 +3661,6 @@ btrfs_release_block_group(struct btrfs_block_group *cache, * Helper function for find_free_extent(). * * Return -ENOENT to inform caller that we need fallback to unclustered mode. - * Return -EAGAIN to inform caller that we need to re-search this block group * Return >0 to inform caller that we find nothing * Return 0 means we have found a location and set ffe_ctl->found_offset. */ @@ -3508,14 +3741,6 @@ refill_cluster: trace_btrfs_reserve_extent_cluster(bg, ffe_ctl); return 0; } - } else if (!ffe_ctl->cached && ffe_ctl->loop > LOOP_CACHING_NOWAIT && - !ffe_ctl->retry_clustered) { - spin_unlock(&last_ptr->refill_lock); - - ffe_ctl->retry_clustered = true; - btrfs_wait_block_group_cache_progress(bg, ffe_ctl->num_bytes + - ffe_ctl->empty_cluster + ffe_ctl->empty_size); - return -EAGAIN; } /* * At this point we either didn't find a cluster or we weren't able to @@ -3530,7 +3755,6 @@ refill_cluster: /* * Return >0 to inform caller that we find nothing * Return 0 when we found an free extent and set ffe_ctrl->found_offset - * Return -EAGAIN to inform caller that we need to re-search this block group */ static int find_free_extent_unclustered(struct btrfs_block_group *bg, struct find_free_extent_ctl *ffe_ctl) @@ -3568,25 +3792,8 @@ static int find_free_extent_unclustered(struct btrfs_block_group *bg, offset = btrfs_find_space_for_alloc(bg, ffe_ctl->search_start, ffe_ctl->num_bytes, ffe_ctl->empty_size, &ffe_ctl->max_extent_size); - - /* - * If we didn't find a chunk, and we haven't failed on this block group - * before, and this block group is in the middle of caching and we are - * ok with waiting, then go ahead and wait for progress to be made, and - * set @retry_unclustered to true. - * - * If @retry_unclustered is true then we've already waited on this - * block group once and should move on to the next block group. - */ - if (!offset && !ffe_ctl->retry_unclustered && !ffe_ctl->cached && - ffe_ctl->loop > LOOP_CACHING_NOWAIT) { - btrfs_wait_block_group_cache_progress(bg, ffe_ctl->num_bytes + - ffe_ctl->empty_size); - ffe_ctl->retry_unclustered = true; - return -EAGAIN; - } else if (!offset) { + if (!offset) return 1; - } ffe_ctl->found_offset = offset; return 0; } @@ -3600,7 +3807,7 @@ static int do_allocation_clustered(struct btrfs_block_group *block_group, /* We want to try and use the cluster allocator, so lets look there */ if (ffe_ctl->last_ptr && ffe_ctl->use_cluster) { ret = find_free_extent_clustered(block_group, ffe_ctl, bg_ret); - if (ret >= 0 || ret == -EAGAIN) + if (ret >= 0) return ret; /* ret == -ENOENT case falls through */ } @@ -3685,7 +3892,9 @@ static int do_allocation_zoned(struct btrfs_block_group *block_group, } spin_unlock(&block_group->lock); - if (!ret && !btrfs_zone_activate(block_group)) { + /* Metadata block group is activated at write time. */ + if (!ret && (block_group->flags & BTRFS_BLOCK_GROUP_DATA) && + !btrfs_zone_activate(block_group)) { ret = 1; /* * May need to clear fs_info->{treelog,data_reloc}_bg. @@ -3709,7 +3918,8 @@ static int do_allocation_zoned(struct btrfs_block_group *block_group, fs_info->data_reloc_bg == 0); if (block_group->ro || - test_bit(BLOCK_GROUP_FLAG_ZONED_DATA_RELOC, &block_group->runtime_flags)) { + (!ffe_ctl->for_data_reloc && + test_bit(BLOCK_GROUP_FLAG_ZONED_DATA_RELOC, &block_group->runtime_flags))) { ret = 1; goto out; } @@ -3752,8 +3962,26 @@ static int do_allocation_zoned(struct btrfs_block_group *block_group, if (ffe_ctl->for_treelog && !fs_info->treelog_bg) fs_info->treelog_bg = block_group->start; - if (ffe_ctl->for_data_reloc && !fs_info->data_reloc_bg) - fs_info->data_reloc_bg = block_group->start; + if (ffe_ctl->for_data_reloc) { + if (!fs_info->data_reloc_bg) + fs_info->data_reloc_bg = block_group->start; + /* + * Do not allow allocations from this block group, unless it is + * for data relocation. Compared to increasing the ->ro, setting + * the ->zoned_data_reloc_ongoing flag still allows nocow + * writers to come in. See btrfs_inc_nocow_writers(). + * + * We need to disable an allocation to avoid an allocation of + * regular (non-relocation data) extent. With mix of relocation + * extents and regular extents, we can dispatch WRITE commands + * (for relocation extents) and ZONE APPEND commands (for + * regular extents) at the same time to the same zone, which + * easily break the write pointer. + * + * Also, this flag avoids this block group to be zone finished. + */ + set_bit(BLOCK_GROUP_FLAG_ZONED_DATA_RELOC, &block_group->runtime_flags); + } ffe_ctl->found_offset = start + block_group->alloc_offset; block_group->alloc_offset += num_bytes; @@ -3771,24 +3999,8 @@ static int do_allocation_zoned(struct btrfs_block_group *block_group, out: if (ret && ffe_ctl->for_treelog) fs_info->treelog_bg = 0; - if (ret && ffe_ctl->for_data_reloc && - fs_info->data_reloc_bg == block_group->start) { - /* - * Do not allow further allocations from this block group. - * Compared to increasing the ->ro, setting the - * ->zoned_data_reloc_ongoing flag still allows nocow - * writers to come in. See btrfs_inc_nocow_writers(). - * - * We need to disable an allocation to avoid an allocation of - * regular (non-relocation data) extent. With mix of relocation - * extents and regular extents, we can dispatch WRITE commands - * (for relocation extents) and ZONE APPEND commands (for - * regular extents) at the same time to the same zone, which - * easily break the write pointer. - */ - set_bit(BLOCK_GROUP_FLAG_ZONED_DATA_RELOC, &block_group->runtime_flags); + if (ret && ffe_ctl->for_data_reloc) fs_info->data_reloc_bg = 0; - } spin_unlock(&fs_info->relocation_bg_lock); spin_unlock(&fs_info->treelog_bg_lock); spin_unlock(&block_group->lock); @@ -3816,8 +4028,7 @@ static void release_block_group(struct btrfs_block_group *block_group, { switch (ffe_ctl->policy) { case BTRFS_EXTENT_ALLOC_CLUSTERED: - ffe_ctl->retry_clustered = false; - ffe_ctl->retry_unclustered = false; + ffe_ctl->retry_uncached = false; break; case BTRFS_EXTENT_ALLOC_ZONED: /* Nothing to do */ @@ -3861,6 +4072,10 @@ static void found_extent(struct find_free_extent_ctl *ffe_ctl, static int can_allocate_chunk_zoned(struct btrfs_fs_info *fs_info, struct find_free_extent_ctl *ffe_ctl) { + /* Block group's activeness is not a requirement for METADATA block groups. */ + if (!(ffe_ctl->flags & BTRFS_BLOCK_GROUP_DATA)) + return 0; + /* If we can activate new zone, just allocate a chunk and use it */ if (btrfs_can_activate_zone(fs_info->fs_devices, ffe_ctl->flags)) return 0; @@ -3949,15 +4164,7 @@ static int find_free_extent_update_loop(struct btrfs_fs_info *fs_info, if (ffe_ctl->index < BTRFS_NR_RAID_TYPES) return 1; - /* - * LOOP_CACHING_NOWAIT, search partially cached block groups, kicking - * caching kthreads as we move along - * LOOP_CACHING_WAIT, search everything, and wait if our bg is caching - * LOOP_UNSET_SIZE_CLASS, allow unset size class - * LOOP_ALLOC_CHUNK, force a chunk allocation and try again - * LOOP_NO_EMPTY_SIZE, set empty_size and empty_cluster to 0 and try - * again - */ + /* See the comments for btrfs_loop_type for an explanation of the phases. */ if (ffe_ctl->loop < LOOP_NO_EMPTY_SIZE) { ffe_ctl->index = 0; /* @@ -4168,9 +4375,7 @@ static noinline int find_free_extent(struct btrfs_root *root, ffe_ctl->orig_have_caching_bg = false; ffe_ctl->index = btrfs_bg_flags_to_raid_index(ffe_ctl->flags); ffe_ctl->loop = 0; - /* For clustered allocation */ - ffe_ctl->retry_clustered = false; - ffe_ctl->retry_unclustered = false; + ffe_ctl->retry_uncached = false; ffe_ctl->cached = 0; ffe_ctl->max_extent_size = 0; ffe_ctl->total_free_space = 0; @@ -4321,16 +4526,12 @@ have_block_group: bg_ret = NULL; ret = do_allocation(block_group, ffe_ctl, &bg_ret); - if (ret == 0) { - if (bg_ret && bg_ret != block_group) { - btrfs_release_block_group(block_group, - ffe_ctl->delalloc); - block_group = bg_ret; - } - } else if (ret == -EAGAIN) { - goto have_block_group; - } else if (ret > 0) { + if (ret > 0) goto loop; + + if (bg_ret && bg_ret != block_group) { + btrfs_release_block_group(block_group, ffe_ctl->delalloc); + block_group = bg_ret; } /* Checks */ @@ -4371,6 +4572,15 @@ have_block_group: btrfs_release_block_group(block_group, ffe_ctl->delalloc); break; loop: + if (!ffe_ctl->cached && ffe_ctl->loop > LOOP_CACHING_NOWAIT && + !ffe_ctl->retry_uncached) { + ffe_ctl->retry_uncached = true; + btrfs_wait_block_group_cache_progress(block_group, + ffe_ctl->num_bytes + + ffe_ctl->empty_cluster + + ffe_ctl->empty_size); + goto have_block_group; + } release_block_group(block_group, ffe_ctl, ffe_ctl->delalloc); cond_resched(); } @@ -4398,8 +4608,8 @@ loop: } /* - * btrfs_reserve_extent - entry point to the extent allocator. Tries to find a - * hole that is at least as big as @num_bytes. + * Entry point to the extent allocator. Tries to find a hole that is at least + * as big as @num_bytes. * * @root - The root that will contain this extent * @@ -4518,20 +4728,20 @@ int btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info, return 0; } -int btrfs_pin_reserved_extent(struct btrfs_trans_handle *trans, u64 start, - u64 len) +int btrfs_pin_reserved_extent(struct btrfs_trans_handle *trans, + const struct extent_buffer *eb) { struct btrfs_block_group *cache; int ret = 0; - cache = btrfs_lookup_block_group(trans->fs_info, start); + cache = btrfs_lookup_block_group(trans->fs_info, eb->start); if (!cache) { btrfs_err(trans->fs_info, "unable to find block group for %llu", - start); + eb->start); return -ENOSPC; } - ret = pin_down_extent(trans, cache, start, len, 1); + ret = pin_down_extent(trans, cache, eb->start, eb->len, 1); btrfs_put_block_group(cache); return ret; } @@ -4561,24 +4771,29 @@ static int alloc_reserved_extent(struct btrfs_trans_handle *trans, u64 bytenr, static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans, u64 parent, u64 root_objectid, u64 flags, u64 owner, u64 offset, - struct btrfs_key *ins, int ref_mod) + struct btrfs_key *ins, int ref_mod, u64 oref_root) { struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_root *extent_root; int ret; struct btrfs_extent_item *extent_item; + struct btrfs_extent_owner_ref *oref; struct btrfs_extent_inline_ref *iref; struct btrfs_path *path; struct extent_buffer *leaf; int type; u32 size; + const bool simple_quota = (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_SIMPLE); if (parent > 0) type = BTRFS_SHARED_DATA_REF_KEY; else type = BTRFS_EXTENT_DATA_REF_KEY; - size = sizeof(*extent_item) + btrfs_extent_inline_ref_size(type); + size = sizeof(*extent_item); + if (simple_quota) + size += btrfs_extent_inline_ref_size(BTRFS_EXTENT_OWNER_REF_KEY); + size += btrfs_extent_inline_ref_size(type); path = btrfs_alloc_path(); if (!path) @@ -4600,7 +4815,14 @@ static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans, flags | BTRFS_EXTENT_FLAG_DATA); iref = (struct btrfs_extent_inline_ref *)(extent_item + 1); + if (simple_quota) { + btrfs_set_extent_inline_ref_type(leaf, iref, BTRFS_EXTENT_OWNER_REF_KEY); + oref = (struct btrfs_extent_owner_ref *)(&iref->offset); + btrfs_set_extent_owner_ref_root_id(leaf, oref, oref_root); + iref = (struct btrfs_extent_inline_ref *)(oref + 1); + } btrfs_set_extent_inline_ref_type(leaf, iref, type); + if (parent > 0) { struct btrfs_shared_data_ref *ref; ref = (struct btrfs_shared_data_ref *)(iref + 1); @@ -4615,7 +4837,7 @@ static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans, btrfs_set_extent_data_ref_count(leaf, ref, ref_mod); } - btrfs_mark_buffer_dirty(path->nodes[0]); + btrfs_mark_buffer_dirty(trans, path->nodes[0]); btrfs_free_path(path); return alloc_reserved_extent(trans, ins->objectid, ins->offset); @@ -4690,7 +4912,7 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, btrfs_set_extent_inline_ref_offset(leaf, iref, ref->root); } - btrfs_mark_buffer_dirty(leaf); + btrfs_mark_buffer_dirty(trans, leaf); btrfs_free_path(path); return alloc_reserved_extent(trans, node->bytenr, fs_info->nodesize); @@ -4702,12 +4924,17 @@ int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans, struct btrfs_key *ins) { struct btrfs_ref generic_ref = { 0 }; + u64 root_objectid = root->root_key.objectid; + u64 owning_root = root_objectid; + + BUG_ON(root_objectid == BTRFS_TREE_LOG_OBJECTID); - BUG_ON(root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID); + if (btrfs_is_data_reloc_root(root) && is_fstree(root->relocation_src_root)) + owning_root = root->relocation_src_root; btrfs_init_generic_ref(&generic_ref, BTRFS_ADD_DELAYED_EXTENT, - ins->objectid, ins->offset, 0); - btrfs_init_data_ref(&generic_ref, root->root_key.objectid, owner, + ins->objectid, ins->offset, 0, owning_root); + btrfs_init_data_ref(&generic_ref, root_objectid, owner, offset, 0, false); btrfs_ref_tree_mod(root->fs_info, &generic_ref); @@ -4727,6 +4954,13 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans, int ret; struct btrfs_block_group *block_group; struct btrfs_space_info *space_info; + struct btrfs_squota_delta delta = { + .root = root_objectid, + .num_bytes = ins->offset, + .generation = trans->transid, + .is_data = true, + .is_inc = true, + }; /* * Mixed block groups will exclude before processing the log so we only @@ -4752,13 +4986,36 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans, spin_unlock(&space_info->lock); ret = alloc_reserved_file_extent(trans, 0, root_objectid, 0, owner, - offset, ins, 1); + offset, ins, 1, root_objectid); if (ret) btrfs_pin_extent(trans, ins->objectid, ins->offset, 1); + ret = btrfs_record_squota_delta(fs_info, &delta); btrfs_put_block_group(block_group); return ret; } +#ifdef CONFIG_BTRFS_DEBUG +/* + * Extra safety check in case the extent tree is corrupted and extent allocator + * chooses to use a tree block which is already used and locked. + */ +static bool check_eb_lock_owner(const struct extent_buffer *eb) +{ + if (eb->lock_owner == current->pid) { + btrfs_err_rl(eb->fs_info, +"tree block %llu owner %llu already locked by pid=%d, extent tree corruption detected", + eb->start, btrfs_header_owner(eb), current->pid); + return true; + } + return false; +} +#else +static bool check_eb_lock_owner(struct extent_buffer *eb) +{ + return false; +} +#endif + static struct extent_buffer * btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytenr, int level, u64 owner, @@ -4772,15 +5029,7 @@ btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root, if (IS_ERR(buf)) return buf; - /* - * Extra safety check in case the extent tree is corrupted and extent - * allocator chooses to use a tree block which is already used and - * locked. - */ - if (buf->lock_owner == current->pid) { - btrfs_err_rl(fs_info, -"tree block %llu owner %llu already locked by pid=%d, extent tree corruption detected", - buf->start, btrfs_header_owner(buf), current->pid); + if (check_eb_lock_owner(buf)) { free_extent_buffer(buf); return ERR_PTR(-EUCLEAN); } @@ -4857,6 +5106,7 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans, const struct btrfs_disk_key *key, int level, u64 hint, u64 empty_size, + u64 reloc_src_root, enum btrfs_lock_nesting nest) { struct btrfs_fs_info *fs_info = root->fs_info; @@ -4869,6 +5119,7 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans, int ret; u32 blocksize = fs_info->nodesize; bool skinny_metadata = btrfs_fs_incompat(fs_info, SKINNY_METADATA); + u64 owning_root; #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS if (btrfs_is_testing(fs_info)) { @@ -4895,11 +5146,13 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans, ret = PTR_ERR(buf); goto out_free_reserved; } + owning_root = btrfs_header_owner(buf); if (root_objectid == BTRFS_TREE_RELOC_OBJECTID) { if (parent == 0) parent = ins.objectid; flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF; + owning_root = reloc_src_root; } else BUG_ON(parent > 0); @@ -4919,7 +5172,7 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans, extent_op->level = level; btrfs_init_generic_ref(&generic_ref, BTRFS_ADD_DELAYED_EXTENT, - ins.objectid, ins.offset, parent); + ins.objectid, ins.offset, parent, owning_root); btrfs_init_tree_ref(&generic_ref, level, root_objectid, root->root_key.objectid, false); btrfs_ref_tree_mod(fs_info, &generic_ref); @@ -5007,7 +5260,7 @@ static noinline void reada_walk_down(struct btrfs_trans_handle *trans, /* We don't lock the tree block, it's OK to be racy here */ ret = btrfs_lookup_extent_info(trans, fs_info, bytenr, wc->level - 1, 1, &refs, - &flags); + &flags, NULL); /* We don't care about errors in readahead. */ if (ret < 0) continue; @@ -5074,7 +5327,8 @@ static noinline int walk_down_proc(struct btrfs_trans_handle *trans, ret = btrfs_lookup_extent_info(trans, fs_info, eb->start, level, 1, &wc->refs[level], - &wc->flags[level]); + &wc->flags[level], + NULL); BUG_ON(ret == -ENOMEM); if (ret) return ret; @@ -5164,6 +5418,7 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans, u64 bytenr; u64 generation; u64 parent; + u64 owner_root = 0; struct btrfs_tree_parent_check check = { 0 }; struct btrfs_key key; struct btrfs_ref ref = { 0 }; @@ -5207,7 +5462,8 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans, ret = btrfs_lookup_extent_info(trans, fs_info, bytenr, level - 1, 1, &wc->refs[level - 1], - &wc->flags[level - 1]); + &wc->flags[level - 1], + &owner_root); if (ret < 0) goto out_unlock; @@ -5340,7 +5596,7 @@ skip: find_next_key(path, level, &wc->drop_progress); btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, bytenr, - fs_info->nodesize, parent); + fs_info->nodesize, parent, owner_root); btrfs_init_tree_ref(&ref, level - 1, root->root_key.objectid, 0, false); ret = btrfs_free_extent(trans, &ref); @@ -5407,7 +5663,8 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans, ret = btrfs_lookup_extent_info(trans, fs_info, eb->start, level, 1, &wc->refs[level], - &wc->flags[level]); + &wc->flags[level], + NULL); if (ret < 0) { btrfs_tree_unlock_rw(eb, path->locks[level]); path->locks[level] = 0; @@ -5652,7 +5909,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc) ret = btrfs_lookup_extent_info(trans, fs_info, path->nodes[level]->start, level, 1, &wc->refs[level], - &wc->flags[level]); + &wc->flags[level], NULL); if (ret < 0) { err = ret; goto out_end_trans; diff --git a/fs/btrfs/extent-tree.h b/fs/btrfs/extent-tree.h index 429d5c570061..2e066035ccee 100644 --- a/fs/btrfs/extent-tree.h +++ b/fs/btrfs/extent-tree.h @@ -7,6 +7,7 @@ #include "block-group.h" struct btrfs_free_cluster; +struct btrfs_delayed_ref_head; enum btrfs_extent_allocation_policy { BTRFS_EXTENT_ALLOC_CLUSTERED, @@ -48,16 +49,11 @@ struct find_free_extent_ctl { int loop; /* - * Whether we're refilling a cluster, if true we need to re-search - * current block group but don't try to refill the cluster again. + * Set to true if we're retrying the allocation on this block group + * after waiting for caching progress, this is so that we retry only + * once before moving on to another block group. */ - bool retry_clustered; - - /* - * Whether we're updating free space cache, if true we need to re-search - * current block group but don't try updating free space cache again. - */ - bool retry_unclustered; + bool retry_uncached; /* If current block group is cached */ int cached; @@ -96,21 +92,19 @@ int btrfs_get_extent_inline_ref_type(const struct extent_buffer *eb, enum btrfs_inline_ref_type is_data); u64 hash_extent_data_ref(u64 root_objectid, u64 owner, u64 offset); -int btrfs_add_excluded_extent(struct btrfs_fs_info *fs_info, - u64 start, u64 num_bytes); -void btrfs_free_excluded_extents(struct btrfs_block_group *cache); -int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, unsigned long count); -void btrfs_cleanup_ref_head_accounting(struct btrfs_fs_info *fs_info, +int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, u64 min_bytes); +u64 btrfs_cleanup_ref_head_accounting(struct btrfs_fs_info *fs_info, struct btrfs_delayed_ref_root *delayed_refs, struct btrfs_delayed_ref_head *head); int btrfs_lookup_data_extent(struct btrfs_fs_info *fs_info, u64 start, u64 len); int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, u64 bytenr, - u64 offset, int metadata, u64 *refs, u64 *flags); + u64 offset, int metadata, u64 *refs, u64 *flags, + u64 *owner_root); int btrfs_pin_extent(struct btrfs_trans_handle *trans, u64 bytenr, u64 num, int reserved); int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans, - u64 bytenr, u64 num_bytes); + const struct extent_buffer *eb); int btrfs_exclude_logged_extents(struct extent_buffer *eb); int btrfs_cross_ref_exist(struct btrfs_root *root, u64 objectid, u64 offset, u64 bytenr, bool strict, @@ -121,6 +115,7 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans, const struct btrfs_disk_key *key, int level, u64 hint, u64 empty_size, + u64 reloc_src_root, enum btrfs_lock_nesting nest); void btrfs_free_tree_block(struct btrfs_trans_handle *trans, u64 root_id, @@ -144,12 +139,15 @@ int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans, struct extent_buffer *eb, u64 flags); int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_ref *ref); +u64 btrfs_get_extent_owner_root(struct btrfs_fs_info *fs_info, + struct extent_buffer *leaf, int slot); int btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info, u64 start, u64 len, int delalloc); -int btrfs_pin_reserved_extent(struct btrfs_trans_handle *trans, u64 start, u64 len); +int btrfs_pin_reserved_extent(struct btrfs_trans_handle *trans, + const struct extent_buffer *eb); int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans); int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, struct btrfs_ref *generic_ref); -int __must_check btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, +int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc); int btrfs_drop_subtree(struct btrfs_trans_handle *trans, struct btrfs_root *root, diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 90ad3006ef3a..8f724c54fc8e 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -21,7 +21,6 @@ #include "ctree.h" #include "btrfs_inode.h" #include "bio.h" -#include "check-integrity.h" #include "locking.h" #include "rcu-string.h" #include "backref.h" @@ -181,34 +180,9 @@ void extent_range_clear_dirty_for_io(struct inode *inode, u64 start, u64 end) } } -void extent_range_redirty_for_io(struct inode *inode, u64 start, u64 end) -{ - struct address_space *mapping = inode->i_mapping; - unsigned long index = start >> PAGE_SHIFT; - unsigned long end_index = end >> PAGE_SHIFT; - struct folio *folio; - - while (index <= end_index) { - folio = filemap_get_folio(mapping, index); - filemap_dirty_folio(mapping, folio); - folio_account_redirty(folio); - index += folio_nr_pages(folio); - folio_put(folio); - } -} - -/* - * Process one page for __process_pages_contig(). - * - * Return >0 if we hit @page == @locked_page. - * Return 0 if we updated the page status. - * Return -EGAIN if the we need to try again. - * (For PAGE_LOCK case but got dirty page or page not belong to mapping) - */ -static int process_one_page(struct btrfs_fs_info *fs_info, - struct address_space *mapping, - struct page *page, struct page *locked_page, - unsigned long page_ops, u64 start, u64 end) +static void process_one_page(struct btrfs_fs_info *fs_info, + struct page *page, struct page *locked_page, + unsigned long page_ops, u64 start, u64 end) { u32 len; @@ -224,94 +198,36 @@ static int process_one_page(struct btrfs_fs_info *fs_info, if (page_ops & PAGE_END_WRITEBACK) btrfs_page_clamp_clear_writeback(fs_info, page, start, len); - if (page == locked_page) - return 1; - - if (page_ops & PAGE_LOCK) { - int ret; - - ret = btrfs_page_start_writer_lock(fs_info, page, start, len); - if (ret) - return ret; - if (!PageDirty(page) || page->mapping != mapping) { - btrfs_page_end_writer_lock(fs_info, page, start, len); - return -EAGAIN; - } - } - if (page_ops & PAGE_UNLOCK) + if (page != locked_page && (page_ops & PAGE_UNLOCK)) btrfs_page_end_writer_lock(fs_info, page, start, len); - return 0; } -static int __process_pages_contig(struct address_space *mapping, - struct page *locked_page, - u64 start, u64 end, unsigned long page_ops, - u64 *processed_end) +static void __process_pages_contig(struct address_space *mapping, + struct page *locked_page, u64 start, u64 end, + unsigned long page_ops) { struct btrfs_fs_info *fs_info = btrfs_sb(mapping->host->i_sb); pgoff_t start_index = start >> PAGE_SHIFT; pgoff_t end_index = end >> PAGE_SHIFT; pgoff_t index = start_index; - unsigned long pages_processed = 0; struct folio_batch fbatch; - int err = 0; int i; - if (page_ops & PAGE_LOCK) { - ASSERT(page_ops == PAGE_LOCK); - ASSERT(processed_end && *processed_end == start); - } - folio_batch_init(&fbatch); while (index <= end_index) { int found_folios; found_folios = filemap_get_folios_contig(mapping, &index, end_index, &fbatch); - - if (found_folios == 0) { - /* - * Only if we're going to lock these pages, we can find - * nothing at @index. - */ - ASSERT(page_ops & PAGE_LOCK); - err = -EAGAIN; - goto out; - } - for (i = 0; i < found_folios; i++) { - int process_ret; struct folio *folio = fbatch.folios[i]; - process_ret = process_one_page(fs_info, mapping, - &folio->page, locked_page, page_ops, - start, end); - if (process_ret < 0) { - err = -EAGAIN; - folio_batch_release(&fbatch); - goto out; - } - pages_processed += folio_nr_pages(folio); + + process_one_page(fs_info, &folio->page, locked_page, + page_ops, start, end); } folio_batch_release(&fbatch); cond_resched(); } -out: - if (err && processed_end) { - /* - * Update @processed_end. I know this is awful since it has - * two different return value patterns (inclusive vs exclusive). - * - * But the exclusive pattern is necessary if @start is 0, or we - * underflow and check against processed_end won't work as - * expected. - */ - if (pages_processed) - *processed_end = min(end, - ((u64)(start_index + pages_processed) << PAGE_SHIFT) - 1); - else - *processed_end = start; - } - return err; } static noinline void __unlock_for_delalloc(struct inode *inode, @@ -326,29 +242,63 @@ static noinline void __unlock_for_delalloc(struct inode *inode, return; __process_pages_contig(inode->i_mapping, locked_page, start, end, - PAGE_UNLOCK, NULL); + PAGE_UNLOCK); } static noinline int lock_delalloc_pages(struct inode *inode, struct page *locked_page, - u64 delalloc_start, - u64 delalloc_end) + u64 start, + u64 end) { - unsigned long index = delalloc_start >> PAGE_SHIFT; - unsigned long end_index = delalloc_end >> PAGE_SHIFT; - u64 processed_end = delalloc_start; - int ret; + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct address_space *mapping = inode->i_mapping; + pgoff_t start_index = start >> PAGE_SHIFT; + pgoff_t end_index = end >> PAGE_SHIFT; + pgoff_t index = start_index; + u64 processed_end = start; + struct folio_batch fbatch; - ASSERT(locked_page); if (index == locked_page->index && index == end_index) return 0; - ret = __process_pages_contig(inode->i_mapping, locked_page, delalloc_start, - delalloc_end, PAGE_LOCK, &processed_end); - if (ret == -EAGAIN && processed_end > delalloc_start) - __unlock_for_delalloc(inode, locked_page, delalloc_start, - processed_end); - return ret; + folio_batch_init(&fbatch); + while (index <= end_index) { + unsigned int found_folios, i; + + found_folios = filemap_get_folios_contig(mapping, &index, + end_index, &fbatch); + if (found_folios == 0) + goto out; + + for (i = 0; i < found_folios; i++) { + struct page *page = &fbatch.folios[i]->page; + u32 len = end + 1 - start; + + if (page == locked_page) + continue; + + if (btrfs_page_start_writer_lock(fs_info, page, start, + len)) + goto out; + + if (!PageDirty(page) || page->mapping != mapping) { + btrfs_page_end_writer_lock(fs_info, page, start, + len); + goto out; + } + + processed_end = page_offset(page) + PAGE_SIZE - 1; + } + folio_batch_release(&fbatch); + cond_resched(); + } + + return 0; +out: + folio_batch_release(&fbatch); + if (processed_end > start) + __unlock_for_delalloc(inode, locked_page, start, processed_end); + return -EAGAIN; } /* @@ -444,7 +394,7 @@ again: /* then test to make sure it is all still delalloc */ ret = test_range_bit(tree, delalloc_start, delalloc_end, - EXTENT_DELALLOC, 1, cached_state); + EXTENT_DELALLOC, cached_state); if (!ret) { unlock_extent(tree, delalloc_start, delalloc_end, &cached_state); @@ -467,7 +417,7 @@ void extent_clear_unlock_delalloc(struct btrfs_inode *inode, u64 start, u64 end, clear_extent_bit(&inode->io_tree, start, end, clear_bits, NULL); __process_pages_contig(inode->vfs_inode.i_mapping, locked_page, - start, end, page_ops, NULL); + start, end, page_ops); } static bool btrfs_verify_page(struct page *page, u64 start) @@ -497,31 +447,6 @@ static void end_page_read(struct page *page, bool uptodate, u64 start, u32 len) btrfs_subpage_end_reader(fs_info, page, start, len); } -/* lots and lots of room for performance fixes in the end_bio funcs */ - -void end_extent_writepage(struct page *page, int err, u64 start, u64 end) -{ - struct btrfs_inode *inode; - const bool uptodate = (err == 0); - int ret = 0; - - ASSERT(page && page->mapping); - inode = BTRFS_I(page->mapping->host); - btrfs_writepage_endio_finish_ordered(inode, page, start, end, uptodate); - - if (!uptodate) { - const struct btrfs_fs_info *fs_info = inode->root->fs_info; - u32 len; - - ASSERT(end + 1 - start <= U32_MAX); - len = end + 1 - start; - - btrfs_page_clear_uptodate(fs_info, page, start, len); - ret = err < 0 ? err : -EIO; - mapping_set_error(page->mapping, ret); - } -} - /* * after a writepage IO is done, we need to: * clear the uptodate bits on error @@ -558,10 +483,8 @@ static void end_bio_extent_writepage(struct btrfs_bio *bbio) bvec->bv_offset, bvec->bv_len); btrfs_finish_ordered_extent(bbio->ordered, page, start, len, !error); - if (error) { - btrfs_page_clear_uptodate(fs_info, page, start, len); + if (error) mapping_set_error(page->mapping, error); - } btrfs_page_clear_writeback(fs_info, page, start, len); } @@ -751,8 +674,8 @@ static void end_bio_extent_readpage(struct btrfs_bio *bbio) * the array will be skipped * * Return: 0 if all pages were able to be allocated; - * -ENOMEM otherwise, and the caller is responsible for freeing all - * non-null page pointers in the array. + * -ENOMEM otherwise, the partially allocated pages would be freed and + * the array slots zeroed */ int btrfs_alloc_page_array(unsigned int nr_pages, struct page **page_array) { @@ -771,8 +694,13 @@ int btrfs_alloc_page_array(unsigned int nr_pages, struct page **page_array) * though alloc_pages_bulk_array() falls back to alloc_page() * if it could not bulk-allocate. So we must be out of memory. */ - if (allocated == last) + if (allocated == last) { + for (int i = 0; i < allocated; i++) { + __free_page(page_array[i]); + page_array[i] = NULL; + } return -ENOMEM; + } memalloc_retry_wait(GFP_NOFS); } @@ -1243,38 +1171,45 @@ static inline void contiguous_readpages(struct page *pages[], int nr_pages, static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode, struct page *page, struct writeback_control *wbc) { - const u64 page_end = page_offset(page) + PAGE_SIZE - 1; - u64 delalloc_start = page_offset(page); + const u64 page_start = page_offset(page); + const u64 page_end = page_start + PAGE_SIZE - 1; + u64 delalloc_start = page_start; + u64 delalloc_end = page_end; u64 delalloc_to_write = 0; - /* How many pages are started by btrfs_run_delalloc_range() */ - unsigned long nr_written = 0; - int ret; - int page_started = 0; + int ret = 0; while (delalloc_start < page_end) { - u64 delalloc_end = page_end; - bool found; - - found = find_lock_delalloc_range(&inode->vfs_inode, page, - &delalloc_start, - &delalloc_end); - if (!found) { + delalloc_end = page_end; + if (!find_lock_delalloc_range(&inode->vfs_inode, page, + &delalloc_start, &delalloc_end)) { delalloc_start = delalloc_end + 1; continue; } + ret = btrfs_run_delalloc_range(inode, page, delalloc_start, - delalloc_end, &page_started, &nr_written, wbc); - if (ret) + delalloc_end, wbc); + if (ret < 0) return ret; - /* - * delalloc_end is already one less than the total length, so - * we don't subtract one from PAGE_SIZE - */ - delalloc_to_write += (delalloc_end - delalloc_start + - PAGE_SIZE) >> PAGE_SHIFT; delalloc_start = delalloc_end + 1; } + + /* + * delalloc_end is already one less than the total length, so + * we don't subtract one from PAGE_SIZE + */ + delalloc_to_write += + DIV_ROUND_UP(delalloc_end + 1 - page_start, PAGE_SIZE); + + /* + * If btrfs_run_dealloc_range() already started I/O and unlocked + * the pages, we just need to account for them here. + */ + if (ret == 1) { + wbc->nr_to_write -= delalloc_to_write; + return 1; + } + if (wbc->nr_to_write < delalloc_to_write) { int thresh = 8192; @@ -1284,16 +1219,6 @@ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode, thresh); } - /* Did btrfs_run_dealloc_range() already unlock and start the IO? */ - if (page_started) { - /* - * We've unlocked the page, so we can't update the mapping's - * writeback index, just update nr_to_write. - */ - wbc->nr_to_write -= nr_written; - return 1; - } - return 0; } @@ -1382,6 +1307,7 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode, bio_ctrl->end_io_func = end_bio_extent_writepage; while (cur <= end) { + u32 len = end - cur + 1; u64 disk_bytenr; u64 em_end; u64 dirty_range_start = cur; @@ -1389,8 +1315,8 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode, u32 iosize; if (cur >= i_size) { - btrfs_writepage_endio_finish_ordered(inode, page, cur, - end, true); + btrfs_mark_ordered_io_finished(inode, page, cur, len, + true); /* * This range is beyond i_size, thus we don't need to * bother writing back. @@ -1399,7 +1325,7 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode, * writeback the sectors with subpage dirty bits, * causing writeback without ordered extent. */ - btrfs_page_clear_dirty(fs_info, page, cur, end + 1 - cur); + btrfs_page_clear_dirty(fs_info, page, cur, len); break; } @@ -1410,7 +1336,7 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode, continue; } - em = btrfs_get_extent(inode, NULL, 0, cur, end - cur + 1); + em = btrfs_get_extent(inode, NULL, 0, cur, len); if (IS_ERR(em)) { ret = PTR_ERR_OR_ZERO(em); goto out_error; @@ -1486,7 +1412,6 @@ static int __extent_writepage(struct page *page, struct btrfs_bio_ctrl *bio_ctrl struct folio *folio = page_folio(page); struct inode *inode = page->mapping->host; const u64 page_start = page_offset(page); - const u64 page_end = page_start + PAGE_SIZE - 1; int ret; int nr = 0; size_t pg_offset; @@ -1530,8 +1455,11 @@ done: set_page_writeback(page); end_page_writeback(page); } - if (ret) - end_extent_writepage(page, ret, page_start, page_end); + if (ret) { + btrfs_mark_ordered_io_finished(BTRFS_I(inode), page, page_start, + PAGE_SIZE, !ret); + mapping_set_error(page->mapping, ret); + } unlock_page(page); ASSERT(ret <= 0); return ret; @@ -1696,8 +1624,6 @@ static void extent_buffer_write_end_io(struct btrfs_bio *bbio) struct page *page = bvec->bv_page; u32 len = bvec->bv_len; - if (!uptodate) - btrfs_page_clear_uptodate(fs_info, page, start, len); btrfs_page_clear_writeback(fs_info, page, start, len); bio_offset += len; } @@ -1877,11 +1803,10 @@ static int submit_eb_subpage(struct page *page, struct writeback_control *wbc) * previous call. * Return <0 for fatal error. */ -static int submit_eb_page(struct page *page, struct writeback_control *wbc, - struct extent_buffer **eb_context) +static int submit_eb_page(struct page *page, struct btrfs_eb_write_context *ctx) { + struct writeback_control *wbc = ctx->wbc; struct address_space *mapping = page->mapping; - struct btrfs_block_group *cache = NULL; struct extent_buffer *eb; int ret; @@ -1908,7 +1833,7 @@ static int submit_eb_page(struct page *page, struct writeback_control *wbc, return 0; } - if (eb == *eb_context) { + if (eb == ctx->eb) { spin_unlock(&mapping->private_lock); return 0; } @@ -1917,34 +1842,25 @@ static int submit_eb_page(struct page *page, struct writeback_control *wbc, if (!ret) return 0; - if (!btrfs_check_meta_write_pointer(eb->fs_info, eb, &cache)) { - /* - * If for_sync, this hole will be filled with - * trasnsaction commit. - */ - if (wbc->sync_mode == WB_SYNC_ALL && !wbc->for_sync) - ret = -EAGAIN; - else + ctx->eb = eb; + + ret = btrfs_check_meta_write_pointer(eb->fs_info, ctx); + if (ret) { + if (ret == -EBUSY) ret = 0; free_extent_buffer(eb); return ret; } - *eb_context = eb; - if (!lock_extent_buffer_for_io(eb, wbc)) { - btrfs_revert_meta_write_pointer(cache, eb); - if (cache) - btrfs_put_block_group(cache); free_extent_buffer(eb); return 0; } - if (cache) { - /* - * Implies write in zoned mode. Mark the last eb in a block group. - */ - btrfs_schedule_zone_finish_bg(cache, eb); - btrfs_put_block_group(cache); + /* Implies write in zoned mode. */ + if (ctx->zoned_bg) { + /* Mark the last eb in the block group. */ + btrfs_schedule_zone_finish_bg(ctx->zoned_bg, eb); + ctx->zoned_bg->meta_write_pointer += eb->len; } write_one_eb(eb, wbc); free_extent_buffer(eb); @@ -1954,7 +1870,7 @@ static int submit_eb_page(struct page *page, struct writeback_control *wbc, int btree_write_cache_pages(struct address_space *mapping, struct writeback_control *wbc) { - struct extent_buffer *eb_context = NULL; + struct btrfs_eb_write_context ctx = { .wbc = wbc }; struct btrfs_fs_info *fs_info = BTRFS_I(mapping->host)->root->fs_info; int ret = 0; int done = 0; @@ -1996,7 +1912,7 @@ retry: for (i = 0; i < nr_folios; i++) { struct folio *folio = fbatch.folios[i]; - ret = submit_eb_page(&folio->page, wbc, &eb_context); + ret = submit_eb_page(&folio->page, &ctx); if (ret == 0) continue; if (ret < 0) { @@ -2057,6 +1973,9 @@ retry: ret = 0; if (!ret && BTRFS_FS_ERROR(fs_info)) ret = -EROFS; + + if (ctx.zoned_bg) + btrfs_put_block_group(ctx.zoned_bg); btrfs_zoned_meta_io_unlock(fs_info); return ret; } @@ -2150,7 +2069,7 @@ retry: for (i = 0; i < nr_folios; i++) { struct folio *folio = fbatch.folios[i]; - done_index = folio->index + folio_nr_pages(folio); + done_index = folio_next_index(folio); /* * At this point we hold neither the i_pages lock nor * the page lock: the page may be truncated or @@ -2233,11 +2152,11 @@ retry: * already been ran (aka, ordered extent inserted) and all pages are still * locked. */ -int extent_write_locked_range(struct inode *inode, u64 start, u64 end, - struct writeback_control *wbc) +void extent_write_locked_range(struct inode *inode, struct page *locked_page, + u64 start, u64 end, struct writeback_control *wbc, + bool pages_dirty) { bool found_error = false; - int first_error = 0; int ret = 0; struct address_space *mapping = inode->i_mapping; struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); @@ -2256,18 +2175,16 @@ int extent_write_locked_range(struct inode *inode, u64 start, u64 end, while (cur <= end) { u64 cur_end = min(round_down(cur, PAGE_SIZE) + PAGE_SIZE - 1, end); + u32 cur_len = cur_end + 1 - cur; struct page *page; int nr = 0; page = find_get_page(mapping, cur >> PAGE_SHIFT); - /* - * All pages in the range are locked since - * btrfs_run_delalloc_range(), thus there is no way to clear - * the page dirty flag. - */ ASSERT(PageLocked(page)); - ASSERT(PageDirty(page)); - clear_page_dirty_for_io(page); + if (pages_dirty && page != locked_page) { + ASSERT(PageDirty(page)); + clear_page_dirty_for_io(page); + } ret = __extent_writepage_io(BTRFS_I(inode), page, &bio_ctrl, i_size, &nr); @@ -2279,23 +2196,20 @@ int extent_write_locked_range(struct inode *inode, u64 start, u64 end, set_page_writeback(page); end_page_writeback(page); } - if (ret) - end_extent_writepage(page, ret, cur, cur_end); - btrfs_page_unlock_writer(fs_info, page, cur, cur_end + 1 - cur); - if (ret < 0) { - found_error = true; - first_error = ret; + if (ret) { + btrfs_mark_ordered_io_finished(BTRFS_I(inode), page, + cur, cur_len, !ret); + mapping_set_error(page->mapping, ret); } + btrfs_page_unlock_writer(fs_info, page, cur, cur_len); + if (ret < 0) + found_error = true; next_page: put_page(page); cur = cur_end + 1; } submit_write_bio(&bio_ctrl, found_error ? ret : 0); - - if (found_error) - return first_error; - return ret; } int extent_writepages(struct address_space *mapping, @@ -2384,11 +2298,12 @@ static int try_release_extent_state(struct extent_io_tree *tree, u64 end = start + PAGE_SIZE - 1; int ret = 1; - if (test_range_bit(tree, start, end, EXTENT_LOCKED, 0, NULL)) { + if (test_range_bit_exists(tree, start, end, EXTENT_LOCKED)) { ret = 0; } else { u32 clear_bits = ~(EXTENT_LOCKED | EXTENT_NODATASUM | - EXTENT_DELALLOC_NEW | EXTENT_CTLBITS); + EXTENT_DELALLOC_NEW | EXTENT_CTLBITS | + EXTENT_QGROUP_RESERVED); /* * At this point we can safely clear everything except the @@ -2443,9 +2358,9 @@ int try_release_extent_mapping(struct page *page, gfp_t mask) free_extent_map(em); break; } - if (test_range_bit(tree, em->start, - extent_map_end(em) - 1, - EXTENT_LOCKED, 0, NULL)) + if (test_range_bit_exists(tree, em->start, + extent_map_end(em) - 1, + EXTENT_LOCKED)) goto next; /* * If it's not in the list of modified extents, used @@ -3315,8 +3230,8 @@ struct extent_buffer *btrfs_clone_extent_buffer(const struct extent_buffer *src) return NULL; } WARN_ON(PageDirty(p)); - copy_page(page_address(p), page_address(src->pages[i])); } + copy_extent_buffer_full(new, src); set_extent_buffer_uptodate(new); return new; @@ -3545,6 +3460,12 @@ static int check_eb_alignment(struct btrfs_fs_info *fs_info, u64 start) start, fs_info->nodesize); return -EINVAL; } + if (!IS_ALIGNED(start, fs_info->nodesize) && + !test_and_set_bit(BTRFS_FS_UNALIGNED_TREE_BLOCK, &fs_info->flags)) { + btrfs_warn(fs_info, +"tree block not nodesize aligned, start %llu nodesize %u, can be resolved by a full metadata balance", + start, fs_info->nodesize); + } return 0; } @@ -3559,6 +3480,7 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, struct extent_buffer *exists = NULL; struct page *p; struct address_space *mapping = fs_info->btree_inode->i_mapping; + struct btrfs_subpage *prealloc = NULL; u64 lockdep_owner = owner_root; int uptodate = 1; int ret; @@ -3595,36 +3517,30 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, btrfs_set_buffer_lockdep_class(lockdep_owner, eb, level); num_pages = num_extent_pages(eb); - for (i = 0; i < num_pages; i++, index++) { - struct btrfs_subpage *prealloc = NULL; + /* + * Preallocate page->private for subpage case, so that we won't + * allocate memory with private_lock nor page lock hold. + * + * The memory will be freed by attach_extent_buffer_page() or freed + * manually if we exit earlier. + */ + if (fs_info->nodesize < PAGE_SIZE) { + prealloc = btrfs_alloc_subpage(fs_info, BTRFS_SUBPAGE_METADATA); + if (IS_ERR(prealloc)) { + exists = ERR_CAST(prealloc); + goto free_eb; + } + } + + for (i = 0; i < num_pages; i++, index++) { p = find_or_create_page(mapping, index, GFP_NOFS|__GFP_NOFAIL); if (!p) { exists = ERR_PTR(-ENOMEM); + btrfs_free_subpage(prealloc); goto free_eb; } - /* - * Preallocate page->private for subpage case, so that we won't - * allocate memory with private_lock hold. The memory will be - * freed by attach_extent_buffer_page() or freed manually if - * we exit earlier. - * - * Although we have ensured one subpage eb can only have one - * page, but it may change in the future for 16K page size - * support, so we still preallocate the memory in the loop. - */ - if (fs_info->nodesize < PAGE_SIZE) { - prealloc = btrfs_alloc_subpage(fs_info, BTRFS_SUBPAGE_METADATA); - if (IS_ERR(prealloc)) { - ret = PTR_ERR(prealloc); - unlock_page(p); - put_page(p); - exists = ERR_PTR(ret); - goto free_eb; - } - } - spin_lock(&mapping->private_lock); exists = grab_extent_buffer(fs_info, p); if (exists) { @@ -4090,8 +4006,14 @@ void read_extent_buffer(const struct extent_buffer *eb, void *dstv, char *dst = (char *)dstv; unsigned long i = get_eb_page_index(start); - if (check_eb_range(eb, start, len)) + if (check_eb_range(eb, start, len)) { + /* + * Invalid range hit, reset the memory, so callers won't get + * some random garbage for their uninitialzed memory. + */ + memset(dstv, 0, len); return; + } offset = get_eb_offset_in_page(eb, start); @@ -4210,30 +4132,9 @@ static void assert_eb_page_uptodate(const struct extent_buffer *eb, } } -void write_extent_buffer_chunk_tree_uuid(const struct extent_buffer *eb, - const void *srcv) -{ - char *kaddr; - - assert_eb_page_uptodate(eb, eb->pages[0]); - kaddr = page_address(eb->pages[0]) + - get_eb_offset_in_page(eb, offsetof(struct btrfs_header, - chunk_tree_uuid)); - memcpy(kaddr, srcv, BTRFS_FSID_SIZE); -} - -void write_extent_buffer_fsid(const struct extent_buffer *eb, const void *srcv) -{ - char *kaddr; - - assert_eb_page_uptodate(eb, eb->pages[0]); - kaddr = page_address(eb->pages[0]) + - get_eb_offset_in_page(eb, offsetof(struct btrfs_header, fsid)); - memcpy(kaddr, srcv, BTRFS_FSID_SIZE); -} - -void write_extent_buffer(const struct extent_buffer *eb, const void *srcv, - unsigned long start, unsigned long len) +static void __write_extent_buffer(const struct extent_buffer *eb, + const void *srcv, unsigned long start, + unsigned long len, bool use_memmove) { size_t cur; size_t offset; @@ -4241,6 +4142,8 @@ void write_extent_buffer(const struct extent_buffer *eb, const void *srcv, char *kaddr; char *src = (char *)srcv; unsigned long i = get_eb_page_index(start); + /* For unmapped (dummy) ebs, no need to check their uptodate status. */ + const bool check_uptodate = !test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags); WARN_ON(test_bit(EXTENT_BUFFER_NO_CHECK, &eb->bflags)); @@ -4251,11 +4154,15 @@ void write_extent_buffer(const struct extent_buffer *eb, const void *srcv, while (len > 0) { page = eb->pages[i]; - assert_eb_page_uptodate(eb, page); + if (check_uptodate) + assert_eb_page_uptodate(eb, page); cur = min(len, PAGE_SIZE - offset); kaddr = page_address(page); - memcpy(kaddr + offset, src, cur); + if (use_memmove) + memmove(kaddr + offset, src, cur); + else + memcpy(kaddr + offset, src, cur); src += cur; len -= cur; @@ -4264,55 +4171,54 @@ void write_extent_buffer(const struct extent_buffer *eb, const void *srcv, } } -void memzero_extent_buffer(const struct extent_buffer *eb, unsigned long start, - unsigned long len) +void write_extent_buffer(const struct extent_buffer *eb, const void *srcv, + unsigned long start, unsigned long len) { - size_t cur; - size_t offset; - struct page *page; - char *kaddr; - unsigned long i = get_eb_page_index(start); + return __write_extent_buffer(eb, srcv, start, len, false); +} - if (check_eb_range(eb, start, len)) - return; +static void memset_extent_buffer(const struct extent_buffer *eb, int c, + unsigned long start, unsigned long len) +{ + unsigned long cur = start; - offset = get_eb_offset_in_page(eb, start); + while (cur < start + len) { + unsigned long index = get_eb_page_index(cur); + unsigned int offset = get_eb_offset_in_page(eb, cur); + unsigned int cur_len = min(start + len - cur, PAGE_SIZE - offset); + struct page *page = eb->pages[index]; - while (len > 0) { - page = eb->pages[i]; assert_eb_page_uptodate(eb, page); + memset(page_address(page) + offset, c, cur_len); - cur = min(len, PAGE_SIZE - offset); - kaddr = page_address(page); - memset(kaddr + offset, 0, cur); - - len -= cur; - offset = 0; - i++; + cur += cur_len; } } +void memzero_extent_buffer(const struct extent_buffer *eb, unsigned long start, + unsigned long len) +{ + if (check_eb_range(eb, start, len)) + return; + return memset_extent_buffer(eb, 0, start, len); +} + void copy_extent_buffer_full(const struct extent_buffer *dst, const struct extent_buffer *src) { - int i; - int num_pages; + unsigned long cur = 0; ASSERT(dst->len == src->len); - if (dst->fs_info->nodesize >= PAGE_SIZE) { - num_pages = num_extent_pages(dst); - for (i = 0; i < num_pages; i++) - copy_page(page_address(dst->pages[i]), - page_address(src->pages[i])); - } else { - size_t src_offset = get_eb_offset_in_page(src, 0); - size_t dst_offset = get_eb_offset_in_page(dst, 0); + while (cur < src->len) { + unsigned long index = get_eb_page_index(cur); + unsigned long offset = get_eb_offset_in_page(src, cur); + unsigned long cur_len = min(src->len, PAGE_SIZE - offset); + void *addr = page_address(src->pages[index]) + offset; - ASSERT(src->fs_info->nodesize < PAGE_SIZE); - memcpy(page_address(dst->pages[0]) + dst_offset, - page_address(src->pages[0]) + src_offset, - src->len); + write_extent_buffer(dst, addr, cur, cur_len); + + cur += cur_len; } } @@ -4353,14 +4259,14 @@ void copy_extent_buffer(const struct extent_buffer *dst, } /* - * eb_bitmap_offset() - calculate the page and offset of the byte containing the - * given bit number - * @eb: the extent buffer - * @start: offset of the bitmap item in the extent buffer - * @nr: bit number - * @page_index: return index of the page in the extent buffer that contains the - * given bit number - * @page_offset: return offset into the page given by page_index + * Calculate the page and offset of the byte containing the given bit number. + * + * @eb: the extent buffer + * @start: offset of the bitmap item in the extent buffer + * @nr: bit number + * @page_index: return index of the page in the extent buffer that contains + * the given bit number + * @page_offset: return offset into the page given by page_index * * This helper hides the ugliness of finding the byte in an extent buffer which * contains a given bit. @@ -4406,6 +4312,15 @@ int extent_buffer_test_bit(const struct extent_buffer *eb, unsigned long start, return 1U & (kaddr[offset] >> (nr & (BITS_PER_BYTE - 1))); } +static u8 *extent_buffer_get_byte(const struct extent_buffer *eb, unsigned long bytenr) +{ + unsigned long index = get_eb_page_index(bytenr); + + if (check_eb_range(eb, bytenr, 1)) + return NULL; + return page_address(eb->pages[index]) + get_eb_offset_in_page(eb, bytenr); +} + /* * Set an area of a bitmap to 1. * @@ -4417,35 +4332,28 @@ int extent_buffer_test_bit(const struct extent_buffer *eb, unsigned long start, void extent_buffer_bitmap_set(const struct extent_buffer *eb, unsigned long start, unsigned long pos, unsigned long len) { + unsigned int first_byte = start + BIT_BYTE(pos); + unsigned int last_byte = start + BIT_BYTE(pos + len - 1); + const bool same_byte = (first_byte == last_byte); + u8 mask = BITMAP_FIRST_BYTE_MASK(pos); u8 *kaddr; - struct page *page; - unsigned long i; - size_t offset; - const unsigned int size = pos + len; - int bits_to_set = BITS_PER_BYTE - (pos % BITS_PER_BYTE); - u8 mask_to_set = BITMAP_FIRST_BYTE_MASK(pos); - eb_bitmap_offset(eb, start, pos, &i, &offset); - page = eb->pages[i]; - assert_eb_page_uptodate(eb, page); - kaddr = page_address(page); + if (same_byte) + mask &= BITMAP_LAST_BYTE_MASK(pos + len); - while (len >= bits_to_set) { - kaddr[offset] |= mask_to_set; - len -= bits_to_set; - bits_to_set = BITS_PER_BYTE; - mask_to_set = ~0; - if (++offset >= PAGE_SIZE && len > 0) { - offset = 0; - page = eb->pages[++i]; - assert_eb_page_uptodate(eb, page); - kaddr = page_address(page); - } - } - if (len) { - mask_to_set &= BITMAP_LAST_BYTE_MASK(size); - kaddr[offset] |= mask_to_set; - } + /* Handle the first byte. */ + kaddr = extent_buffer_get_byte(eb, first_byte); + *kaddr |= mask; + if (same_byte) + return; + + /* Handle the byte aligned part. */ + ASSERT(first_byte + 1 <= last_byte); + memset_extent_buffer(eb, 0xff, first_byte + 1, last_byte - first_byte - 1); + + /* Handle the last byte. */ + kaddr = extent_buffer_get_byte(eb, last_byte); + *kaddr |= BITMAP_LAST_BYTE_MASK(pos + len); } @@ -4461,35 +4369,28 @@ void extent_buffer_bitmap_clear(const struct extent_buffer *eb, unsigned long start, unsigned long pos, unsigned long len) { + unsigned int first_byte = start + BIT_BYTE(pos); + unsigned int last_byte = start + BIT_BYTE(pos + len - 1); + const bool same_byte = (first_byte == last_byte); + u8 mask = BITMAP_FIRST_BYTE_MASK(pos); u8 *kaddr; - struct page *page; - unsigned long i; - size_t offset; - const unsigned int size = pos + len; - int bits_to_clear = BITS_PER_BYTE - (pos % BITS_PER_BYTE); - u8 mask_to_clear = BITMAP_FIRST_BYTE_MASK(pos); - eb_bitmap_offset(eb, start, pos, &i, &offset); - page = eb->pages[i]; - assert_eb_page_uptodate(eb, page); - kaddr = page_address(page); + if (same_byte) + mask &= BITMAP_LAST_BYTE_MASK(pos + len); - while (len >= bits_to_clear) { - kaddr[offset] &= ~mask_to_clear; - len -= bits_to_clear; - bits_to_clear = BITS_PER_BYTE; - mask_to_clear = ~0; - if (++offset >= PAGE_SIZE && len > 0) { - offset = 0; - page = eb->pages[++i]; - assert_eb_page_uptodate(eb, page); - kaddr = page_address(page); - } - } - if (len) { - mask_to_clear &= BITMAP_LAST_BYTE_MASK(size); - kaddr[offset] &= ~mask_to_clear; - } + /* Handle the first byte. */ + kaddr = extent_buffer_get_byte(eb, first_byte); + *kaddr &= ~mask; + if (same_byte) + return; + + /* Handle the byte aligned part. */ + ASSERT(first_byte + 1 <= last_byte); + memset_extent_buffer(eb, 0, first_byte + 1, last_byte - first_byte - 1); + + /* Handle the last byte. */ + kaddr = extent_buffer_get_byte(eb, last_byte); + *kaddr &= ~BITMAP_LAST_BYTE_MASK(pos + len); } static inline bool areas_overlap(unsigned long src, unsigned long dst, unsigned long len) @@ -4498,60 +4399,29 @@ static inline bool areas_overlap(unsigned long src, unsigned long dst, unsigned return distance < len; } -static void copy_pages(struct page *dst_page, struct page *src_page, - unsigned long dst_off, unsigned long src_off, - unsigned long len) -{ - char *dst_kaddr = page_address(dst_page); - char *src_kaddr; - int must_memmove = 0; - - if (dst_page != src_page) { - src_kaddr = page_address(src_page); - } else { - src_kaddr = dst_kaddr; - if (areas_overlap(src_off, dst_off, len)) - must_memmove = 1; - } - - if (must_memmove) - memmove(dst_kaddr + dst_off, src_kaddr + src_off, len); - else - memcpy(dst_kaddr + dst_off, src_kaddr + src_off, len); -} - void memcpy_extent_buffer(const struct extent_buffer *dst, unsigned long dst_offset, unsigned long src_offset, unsigned long len) { - size_t cur; - size_t dst_off_in_page; - size_t src_off_in_page; - unsigned long dst_i; - unsigned long src_i; + unsigned long cur_off = 0; if (check_eb_range(dst, dst_offset, len) || check_eb_range(dst, src_offset, len)) return; - while (len > 0) { - dst_off_in_page = get_eb_offset_in_page(dst, dst_offset); - src_off_in_page = get_eb_offset_in_page(dst, src_offset); - - dst_i = get_eb_page_index(dst_offset); - src_i = get_eb_page_index(src_offset); - - cur = min(len, (unsigned long)(PAGE_SIZE - - src_off_in_page)); - cur = min_t(unsigned long, cur, - (unsigned long)(PAGE_SIZE - dst_off_in_page)); - - copy_pages(dst->pages[dst_i], dst->pages[src_i], - dst_off_in_page, src_off_in_page, cur); - - src_offset += cur; - dst_offset += cur; - len -= cur; + while (cur_off < len) { + unsigned long cur_src = cur_off + src_offset; + unsigned long pg_index = get_eb_page_index(cur_src); + unsigned long pg_off = get_eb_offset_in_page(dst, cur_src); + unsigned long cur_len = min(src_offset + len - cur_src, + PAGE_SIZE - pg_off); + void *src_addr = page_address(dst->pages[pg_index]) + pg_off; + const bool use_memmove = areas_overlap(src_offset + cur_off, + dst_offset + cur_off, cur_len); + + __write_extent_buffer(dst, src_addr, dst_offset + cur_off, cur_len, + use_memmove); + cur_off += cur_len; } } @@ -4559,23 +4429,26 @@ void memmove_extent_buffer(const struct extent_buffer *dst, unsigned long dst_offset, unsigned long src_offset, unsigned long len) { - size_t cur; - size_t dst_off_in_page; - size_t src_off_in_page; unsigned long dst_end = dst_offset + len - 1; unsigned long src_end = src_offset + len - 1; - unsigned long dst_i; - unsigned long src_i; if (check_eb_range(dst, dst_offset, len) || check_eb_range(dst, src_offset, len)) return; + if (dst_offset < src_offset) { memcpy_extent_buffer(dst, dst_offset, src_offset, len); return; } + while (len > 0) { - dst_i = get_eb_page_index(dst_end); + unsigned long src_i; + size_t cur; + size_t dst_off_in_page; + size_t src_off_in_page; + void *src_addr; + bool use_memmove; + src_i = get_eb_page_index(src_end); dst_off_in_page = get_eb_offset_in_page(dst, dst_end); @@ -4583,9 +4456,14 @@ void memmove_extent_buffer(const struct extent_buffer *dst, cur = min_t(unsigned long, len, src_off_in_page + 1); cur = min(cur, dst_off_in_page + 1); - copy_pages(dst->pages[dst_i], dst->pages[src_i], - dst_off_in_page - cur + 1, - src_off_in_page - cur + 1, cur); + + src_addr = page_address(dst->pages[src_i]) + src_off_in_page - + cur + 1; + use_memmove = areas_overlap(src_end - cur + 1, dst_end - cur + 1, + cur); + + __write_extent_buffer(dst, src_addr, dst_end - cur + 1, cur, + use_memmove); dst_end -= cur; src_end -= cur; @@ -4747,7 +4625,8 @@ int try_release_extent_buffer(struct page *page) } /* - * btrfs_readahead_tree_block - attempt to readahead a child block + * Attempt to readahead a child block. + * * @fs_info: the fs_info * @bytenr: bytenr to read * @owner_root: objectid of the root that owns this eb @@ -4786,7 +4665,8 @@ void btrfs_readahead_tree_block(struct btrfs_fs_info *fs_info, } /* - * btrfs_readahead_node_child - readahead a node's child block + * Readahead a node's child block. + * * @node: parent node we're reading from * @slot: slot in the parent node for the child we want to read * diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index c5fae3a7d911..2171057a4477 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -40,7 +40,6 @@ enum { ENUM_BIT(PAGE_START_WRITEBACK), ENUM_BIT(PAGE_END_WRITEBACK), ENUM_BIT(PAGE_SET_ORDERED), - ENUM_BIT(PAGE_LOCK), }; /* @@ -81,19 +80,26 @@ struct extent_buffer { spinlock_t refs_lock; atomic_t refs; int read_mirror; - struct rcu_head rcu_head; - pid_t lock_owner; /* >= 0 if eb belongs to a log tree, -1 otherwise */ s8 log_index; + struct rcu_head rcu_head; struct rw_semaphore lock; struct page *pages[INLINE_EXTENT_BUFFER_PAGES]; #ifdef CONFIG_BTRFS_DEBUG struct list_head leak_list; + pid_t lock_owner; #endif }; +struct btrfs_eb_write_context { + struct writeback_control *wbc; + struct extent_buffer *eb; + /* Block group @eb resides in. Only used for zoned mode. */ + struct btrfs_block_group *zoned_bg; +}; + /* * Get the correct offset inside the page of extent buffer. * @@ -178,8 +184,9 @@ int try_release_extent_mapping(struct page *page, gfp_t mask); int try_release_extent_buffer(struct page *page); int btrfs_read_folio(struct file *file, struct folio *folio); -int extent_write_locked_range(struct inode *inode, u64 start, u64 end, - struct writeback_control *wbc); +void extent_write_locked_range(struct inode *inode, struct page *locked_page, + u64 start, u64 end, struct writeback_control *wbc, + bool pages_dirty); int extent_writepages(struct address_space *mapping, struct writeback_control *wbc); int btree_write_cache_pages(struct address_space *mapping, @@ -236,11 +243,24 @@ void read_extent_buffer(const struct extent_buffer *eb, void *dst, int read_extent_buffer_to_user_nofault(const struct extent_buffer *eb, void __user *dst, unsigned long start, unsigned long len); -void write_extent_buffer_fsid(const struct extent_buffer *eb, const void *src); -void write_extent_buffer_chunk_tree_uuid(const struct extent_buffer *eb, - const void *src); void write_extent_buffer(const struct extent_buffer *eb, const void *src, unsigned long start, unsigned long len); + +static inline void write_extent_buffer_chunk_tree_uuid( + const struct extent_buffer *eb, const void *chunk_tree_uuid) +{ + write_extent_buffer(eb, chunk_tree_uuid, + offsetof(struct btrfs_header, chunk_tree_uuid), + BTRFS_FSID_SIZE); +} + +static inline void write_extent_buffer_fsid(const struct extent_buffer *eb, + const void *fsid) +{ + write_extent_buffer(eb, fsid, offsetof(struct btrfs_header, fsid), + BTRFS_FSID_SIZE); +} + void copy_extent_buffer_full(const struct extent_buffer *dst, const struct extent_buffer *src); void copy_extent_buffer(const struct extent_buffer *dst, @@ -266,7 +286,6 @@ void set_extent_buffer_dirty(struct extent_buffer *eb); void set_extent_buffer_uptodate(struct extent_buffer *eb); void clear_extent_buffer_uptodate(struct extent_buffer *eb); void extent_range_clear_dirty_for_io(struct inode *inode, u64 start, u64 end); -void extent_range_redirty_for_io(struct inode *inode, u64 start, u64 end); void extent_clear_unlock_delalloc(struct btrfs_inode *inode, u64 start, u64 end, struct page *locked_page, u32 bits_to_clear, unsigned long page_ops); @@ -277,8 +296,6 @@ void btrfs_clear_buffer_dirty(struct btrfs_trans_handle *trans, int btrfs_alloc_page_array(unsigned int nr_pages, struct page **page_array); -void end_extent_writepage(struct page *page, int err, u64 start, u64 end); - #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS bool find_lock_delalloc_range(struct inode *inode, struct page *locked_page, u64 *start, diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index 696bf695d8eb..45cae356e89b 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -194,7 +194,7 @@ int btrfs_insert_hole_extent(struct btrfs_trans_handle *trans, btrfs_set_file_extent_encryption(leaf, item, 0); btrfs_set_file_extent_other_encoding(leaf, item, 0); - btrfs_mark_buffer_dirty(leaf); + btrfs_mark_buffer_dirty(trans, leaf); out: btrfs_free_path(path); return ret; @@ -597,29 +597,37 @@ fail: * Each bit represents a sector. Thus caller should ensure @csum_buf passed * in is large enough to contain all csums. */ -int btrfs_lookup_csums_bitmap(struct btrfs_root *root, u64 start, u64 end, - u8 *csum_buf, unsigned long *csum_bitmap, - bool search_commit) +int btrfs_lookup_csums_bitmap(struct btrfs_root *root, struct btrfs_path *path, + u64 start, u64 end, u8 *csum_buf, + unsigned long *csum_bitmap) { struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_key key; - struct btrfs_path *path; struct extent_buffer *leaf; struct btrfs_csum_item *item; const u64 orig_start = start; + bool free_path = false; int ret; ASSERT(IS_ALIGNED(start, fs_info->sectorsize) && IS_ALIGNED(end + 1, fs_info->sectorsize)); - path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; + if (!path) { + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + free_path = true; + } - if (search_commit) { - path->skip_locking = 1; - path->reada = READA_FORWARD; - path->search_commit_root = 1; + /* Check if we can reuse the previous path. */ + if (path->nodes[0]) { + btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); + + if (key.objectid == BTRFS_EXTENT_CSUM_OBJECTID && + key.type == BTRFS_EXTENT_CSUM_KEY && + key.offset <= start) + goto search_forward; + btrfs_release_path(path); } key.objectid = BTRFS_EXTENT_CSUM_OBJECTID; @@ -656,6 +664,7 @@ int btrfs_lookup_csums_bitmap(struct btrfs_root *root, u64 start, u64 end, } } +search_forward: while (start <= end) { u64 csum_end; @@ -712,7 +721,8 @@ int btrfs_lookup_csums_bitmap(struct btrfs_root *root, u64 start, u64 end, } ret = 0; fail: - btrfs_free_path(path); + if (free_path) + btrfs_free_path(path); return ret; } @@ -801,11 +811,12 @@ blk_status_t btrfs_alloc_dummy_sum(struct btrfs_bio *bbio) * This calls btrfs_truncate_item with the correct args based on the overlap, * and fixes up the key as required. */ -static noinline void truncate_one_csum(struct btrfs_fs_info *fs_info, +static noinline void truncate_one_csum(struct btrfs_trans_handle *trans, struct btrfs_path *path, struct btrfs_key *key, u64 bytenr, u64 len) { + struct btrfs_fs_info *fs_info = trans->fs_info; struct extent_buffer *leaf; const u32 csum_size = fs_info->csum_size; u64 csum_end; @@ -826,7 +837,7 @@ static noinline void truncate_one_csum(struct btrfs_fs_info *fs_info, */ u32 new_size = (bytenr - key->offset) >> blocksize_bits; new_size *= csum_size; - btrfs_truncate_item(path, new_size, 1); + btrfs_truncate_item(trans, path, new_size, 1); } else if (key->offset >= bytenr && csum_end > end_byte && end_byte > key->offset) { /* @@ -838,10 +849,10 @@ static noinline void truncate_one_csum(struct btrfs_fs_info *fs_info, u32 new_size = (csum_end - end_byte) >> blocksize_bits; new_size *= csum_size; - btrfs_truncate_item(path, new_size, 0); + btrfs_truncate_item(trans, path, new_size, 0); key->offset = end_byte; - btrfs_set_item_key_safe(fs_info, path, key); + btrfs_set_item_key_safe(trans, path, key); } else { BUG(); } @@ -984,7 +995,7 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans, key.offset = end_byte - 1; } else { - truncate_one_csum(fs_info, path, &key, bytenr, len); + truncate_one_csum(trans, path, &key, bytenr, len); if (key.offset < bytenr) break; } @@ -1192,7 +1203,7 @@ extend_csum: diff /= csum_size; diff *= csum_size; - btrfs_extend_item(path, diff); + btrfs_extend_item(trans, path, diff); ret = 0; goto csum; } @@ -1239,7 +1250,7 @@ found: ins_size /= csum_size; total_bytes += ins_size * fs_info->sectorsize; - btrfs_mark_buffer_dirty(path->nodes[0]); + btrfs_mark_buffer_dirty(trans, path->nodes[0]); if (total_bytes < sums->len) { btrfs_release_path(path); cond_resched(); diff --git a/fs/btrfs/file-item.h b/fs/btrfs/file-item.h index 4ec669b69008..04bd2d34efb1 100644 --- a/fs/btrfs/file-item.h +++ b/fs/btrfs/file-item.h @@ -57,9 +57,9 @@ int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end, int btrfs_lookup_csums_list(struct btrfs_root *root, u64 start, u64 end, struct list_head *list, int search_commit, bool nowait); -int btrfs_lookup_csums_bitmap(struct btrfs_root *root, u64 start, u64 end, - u8 *csum_buf, unsigned long *csum_bitmap, - bool search_commit); +int btrfs_lookup_csums_bitmap(struct btrfs_root *root, struct btrfs_path *path, + u64 start, u64 end, u8 *csum_buf, + unsigned long *csum_bitmap); void btrfs_extent_item_to_extent_map(struct btrfs_inode *inode, const struct btrfs_path *path, struct btrfs_file_extent_item *fi, diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index fd03e689a6be..32611a4edd6b 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -17,6 +17,7 @@ #include <linux/uio.h> #include <linux/iversion.h> #include <linux/fsverity.h> +#include <linux/iomap.h> #include "ctree.h" #include "disk-io.h" #include "transaction.h" @@ -368,12 +369,13 @@ next_slot: btrfs_set_file_extent_offset(leaf, fi, extent_offset); btrfs_set_file_extent_num_bytes(leaf, fi, extent_end - args->start); - btrfs_mark_buffer_dirty(leaf); + btrfs_mark_buffer_dirty(trans, leaf); if (update_refs && disk_bytenr > 0) { btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, - disk_bytenr, num_bytes, 0); + disk_bytenr, num_bytes, 0, + root->root_key.objectid); btrfs_init_data_ref(&ref, root->root_key.objectid, new_key.objectid, @@ -405,13 +407,13 @@ next_slot: memcpy(&new_key, &key, sizeof(new_key)); new_key.offset = args->end; - btrfs_set_item_key_safe(fs_info, path, &new_key); + btrfs_set_item_key_safe(trans, path, &new_key); extent_offset += args->end - key.offset; btrfs_set_file_extent_offset(leaf, fi, extent_offset); btrfs_set_file_extent_num_bytes(leaf, fi, extent_end - args->end); - btrfs_mark_buffer_dirty(leaf); + btrfs_mark_buffer_dirty(trans, leaf); if (update_refs && disk_bytenr > 0) args->bytes_found += args->end - key.offset; break; @@ -431,7 +433,7 @@ next_slot: btrfs_set_file_extent_num_bytes(leaf, fi, args->start - key.offset); - btrfs_mark_buffer_dirty(leaf); + btrfs_mark_buffer_dirty(trans, leaf); if (update_refs && disk_bytenr > 0) args->bytes_found += extent_end - args->start; if (args->end == extent_end) @@ -463,7 +465,8 @@ delete_extent_item: } else if (update_refs && disk_bytenr > 0) { btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, - disk_bytenr, num_bytes, 0); + disk_bytenr, num_bytes, 0, + root->root_key.objectid); btrfs_init_data_ref(&ref, root->root_key.objectid, key.objectid, @@ -536,7 +539,8 @@ delete_extent_item: if (btrfs_comp_cpu_keys(&key, &slot_key) > 0) path->slots[0]++; } - btrfs_setup_item_for_insert(root, path, &key, args->extent_item_size); + btrfs_setup_item_for_insert(trans, root, path, &key, + args->extent_item_size); args->extent_inserted = true; } @@ -593,7 +597,6 @@ static int extent_mergeable(struct extent_buffer *leaf, int slot, int btrfs_mark_extent_written(struct btrfs_trans_handle *trans, struct btrfs_inode *inode, u64 start, u64 end) { - struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_root *root = inode->root; struct extent_buffer *leaf; struct btrfs_path *path; @@ -664,7 +667,7 @@ again: ino, bytenr, orig_offset, &other_start, &other_end)) { new_key.offset = end; - btrfs_set_item_key_safe(fs_info, path, &new_key); + btrfs_set_item_key_safe(trans, path, &new_key); fi = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); btrfs_set_file_extent_generation(leaf, fi, @@ -679,7 +682,7 @@ again: trans->transid); btrfs_set_file_extent_num_bytes(leaf, fi, end - other_start); - btrfs_mark_buffer_dirty(leaf); + btrfs_mark_buffer_dirty(trans, leaf); goto out; } } @@ -698,7 +701,7 @@ again: trans->transid); path->slots[0]++; new_key.offset = start; - btrfs_set_item_key_safe(fs_info, path, &new_key); + btrfs_set_item_key_safe(trans, path, &new_key); fi = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); @@ -708,7 +711,7 @@ again: other_end - start); btrfs_set_file_extent_offset(leaf, fi, start - orig_offset); - btrfs_mark_buffer_dirty(leaf); + btrfs_mark_buffer_dirty(trans, leaf); goto out; } } @@ -742,10 +745,10 @@ again: btrfs_set_file_extent_offset(leaf, fi, split - orig_offset); btrfs_set_file_extent_num_bytes(leaf, fi, extent_end - split); - btrfs_mark_buffer_dirty(leaf); + btrfs_mark_buffer_dirty(trans, leaf); btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, bytenr, - num_bytes, 0); + num_bytes, 0, root->root_key.objectid); btrfs_init_data_ref(&ref, root->root_key.objectid, ino, orig_offset, 0, false); ret = btrfs_inc_extent_ref(trans, &ref); @@ -771,7 +774,7 @@ again: other_start = end; other_end = 0; btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, bytenr, - num_bytes, 0); + num_bytes, 0, root->root_key.objectid); btrfs_init_data_ref(&ref, root->root_key.objectid, ino, orig_offset, 0, false); if (extent_mergeable(leaf, path->slots[0] + 1, @@ -814,7 +817,7 @@ again: btrfs_set_file_extent_type(leaf, fi, BTRFS_FILE_EXTENT_REG); btrfs_set_file_extent_generation(leaf, fi, trans->transid); - btrfs_mark_buffer_dirty(leaf); + btrfs_mark_buffer_dirty(trans, leaf); } else { fi = btrfs_item_ptr(leaf, del_slot - 1, struct btrfs_file_extent_item); @@ -823,7 +826,7 @@ again: btrfs_set_file_extent_generation(leaf, fi, trans->transid); btrfs_set_file_extent_num_bytes(leaf, fi, extent_end - key.offset); - btrfs_mark_buffer_dirty(leaf); + btrfs_mark_buffer_dirty(trans, leaf); ret = btrfs_del_items(trans, root, path, del_slot, del_nr); if (ret < 0) { @@ -876,9 +879,9 @@ static int prepare_uptodate_page(struct inode *inode, return 0; } -static unsigned int get_prepare_fgp_flags(bool nowait) +static fgf_t get_prepare_fgp_flags(bool nowait) { - unsigned int fgp_flags = FGP_LOCK | FGP_ACCESSED | FGP_CREAT; + fgf_t fgp_flags = FGP_LOCK | FGP_ACCESSED | FGP_CREAT; if (nowait) fgp_flags |= FGP_NOWAIT; @@ -910,7 +913,7 @@ static noinline int prepare_pages(struct inode *inode, struct page **pages, int i; unsigned long index = pos >> PAGE_SHIFT; gfp_t mask = get_prepare_gfp_flags(inode, nowait); - unsigned int fgp_flags = get_prepare_fgp_flags(nowait); + fgf_t fgp_flags = get_prepare_fgp_flags(nowait); int err = 0; int faili; @@ -1108,17 +1111,19 @@ void btrfs_check_nocow_unlock(struct btrfs_inode *inode) static void update_time_for_write(struct inode *inode) { - struct timespec64 now; + struct timespec64 now, ts; if (IS_NOCMTIME(inode)) return; now = current_time(inode); - if (!timespec64_equal(&inode->i_mtime, &now)) - inode->i_mtime = now; + ts = inode_get_mtime(inode); + if (!timespec64_equal(&ts, &now)) + inode_set_mtime_to_ts(inode, now); - if (!timespec64_equal(&inode->i_ctime, &now)) - inode->i_ctime = now; + ts = inode_get_ctime(inode); + if (!timespec64_equal(&ts, &now)) + inode_set_ctime_to_ts(inode, now); if (IS_I_VERSION(inode)) inode_inc_iversion(inode); @@ -1466,8 +1471,13 @@ static ssize_t btrfs_direct_write(struct kiocb *iocb, struct iov_iter *from) if (iocb->ki_flags & IOCB_NOWAIT) ilock_flags |= BTRFS_ILOCK_TRY; - /* If the write DIO is within EOF, use a shared lock */ - if (iocb->ki_pos + iov_iter_count(from) <= i_size_read(inode)) + /* + * If the write DIO is within EOF, use a shared lock and also only if + * security bits will likely not be dropped by file_remove_privs() called + * from btrfs_write_check(). Either will need to be rechecked after the + * lock was acquired. + */ + if (iocb->ki_pos + iov_iter_count(from) <= i_size_read(inode) && IS_NOSEC(inode)) ilock_flags |= BTRFS_ILOCK_SHARED; relock: @@ -1475,6 +1485,13 @@ relock: if (err < 0) return err; + /* Shared lock cannot be used with security bits set. */ + if ((ilock_flags & BTRFS_ILOCK_SHARED) && !IS_NOSEC(inode)) { + btrfs_inode_unlock(BTRFS_I(inode), ilock_flags); + ilock_flags &= ~BTRFS_ILOCK_SHARED; + goto relock; + } + err = generic_write_checks(iocb, from); if (err <= 0) { btrfs_inode_unlock(BTRFS_I(inode), ilock_flags); @@ -1733,7 +1750,7 @@ static inline bool skip_inode_logging(const struct btrfs_log_ctx *ctx) struct btrfs_inode *inode = BTRFS_I(ctx->inode); struct btrfs_fs_info *fs_info = inode->root->fs_info; - if (btrfs_inode_in_log(inode, fs_info->generation) && + if (btrfs_inode_in_log(inode, btrfs_get_fs_generation(fs_info)) && list_empty(&ctx->ordered_extents)) return true; @@ -1744,7 +1761,7 @@ static inline bool skip_inode_logging(const struct btrfs_log_ctx *ctx) * and for a fast fsync we don't wait for that, we only wait for the * writeback to complete. */ - if (inode->last_trans <= fs_info->last_trans_committed && + if (inode->last_trans <= btrfs_get_last_trans_committed(fs_info) && (test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags) || list_empty(&ctx->ordered_extents))) return true; @@ -1873,7 +1890,6 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) atomic_inc(&root->log_batch); - smp_mb(); if (skip_inode_logging(&ctx)) { /* * We've had everything committed since the last time we were @@ -2091,7 +2107,7 @@ static int fill_holes(struct btrfs_trans_handle *trans, btrfs_set_file_extent_ram_bytes(leaf, fi, num_bytes); btrfs_set_file_extent_offset(leaf, fi, 0); btrfs_set_file_extent_generation(leaf, fi, trans->transid); - btrfs_mark_buffer_dirty(leaf); + btrfs_mark_buffer_dirty(trans, leaf); goto out; } @@ -2099,7 +2115,7 @@ static int fill_holes(struct btrfs_trans_handle *trans, u64 num_bytes; key.offset = offset; - btrfs_set_item_key_safe(fs_info, path, &key); + btrfs_set_item_key_safe(trans, path, &key); fi = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); num_bytes = btrfs_file_extent_num_bytes(leaf, fi) + end - @@ -2108,7 +2124,7 @@ static int fill_holes(struct btrfs_trans_handle *trans, btrfs_set_file_extent_ram_bytes(leaf, fi, num_bytes); btrfs_set_file_extent_offset(leaf, fi, 0); btrfs_set_file_extent_generation(leaf, fi, trans->transid); - btrfs_mark_buffer_dirty(leaf); + btrfs_mark_buffer_dirty(trans, leaf); goto out; } btrfs_release_path(path); @@ -2260,7 +2276,7 @@ static int btrfs_insert_replace_extent(struct btrfs_trans_handle *trans, btrfs_set_file_extent_num_bytes(leaf, extent, replace_len); if (extent_info->is_new_extent) btrfs_set_file_extent_generation(leaf, extent, trans->transid); - btrfs_mark_buffer_dirty(leaf); + btrfs_mark_buffer_dirty(trans, leaf); btrfs_release_path(path); ret = btrfs_inode_set_file_extent_range(inode, extent_info->file_offset, @@ -2290,7 +2306,8 @@ static int btrfs_insert_replace_extent(struct btrfs_trans_handle *trans, btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, extent_info->disk_offset, - extent_info->disk_len, 0); + extent_info->disk_len, 0, + root->root_key.objectid); ref_offset = extent_info->file_offset - extent_info->data_offset; btrfs_init_data_ref(&ref, root->root_key.objectid, btrfs_ino(inode), ref_offset, 0, false); @@ -2459,12 +2476,11 @@ int btrfs_replace_file_extents(struct btrfs_inode *inode, */ inode_inc_iversion(&inode->vfs_inode); - if (!extent_info || extent_info->update_times) { - inode->vfs_inode.i_mtime = current_time(&inode->vfs_inode); - inode->vfs_inode.i_ctime = inode->vfs_inode.i_mtime; - } + if (!extent_info || extent_info->update_times) + inode_set_mtime_to_ts(&inode->vfs_inode, + inode_set_ctime_current(&inode->vfs_inode)); - ret = btrfs_update_inode(trans, root, inode); + ret = btrfs_update_inode(trans, inode); if (ret) break; @@ -2703,9 +2719,8 @@ static int btrfs_punch_hole(struct file *file, loff_t offset, loff_t len) ASSERT(trans != NULL); inode_inc_iversion(inode); - inode->i_mtime = current_time(inode); - inode->i_ctime = inode->i_mtime; - ret = btrfs_update_inode(trans, root, BTRFS_I(inode)); + inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); + ret = btrfs_update_inode(trans, BTRFS_I(inode)); updated_inode = true; btrfs_end_transaction(trans); btrfs_btree_balance_dirty(fs_info); @@ -2721,18 +2736,17 @@ out_only_mutex: * for detecting, at fsync time, if the inode isn't yet in the * log tree or it's there but not up to date. */ - struct timespec64 now = current_time(inode); + struct timespec64 now = inode_set_ctime_current(inode); inode_inc_iversion(inode); - inode->i_mtime = now; - inode->i_ctime = now; + inode_set_mtime_to_ts(inode, now); trans = btrfs_start_transaction(root, 1); if (IS_ERR(trans)) { ret = PTR_ERR(trans); } else { int ret2; - ret = btrfs_update_inode(trans, root, BTRFS_I(inode)); + ret = btrfs_update_inode(trans, BTRFS_I(inode)); ret2 = btrfs_end_transaction(trans); if (!ret) ret = ret2; @@ -2796,10 +2810,10 @@ static int btrfs_fallocate_update_isize(struct inode *inode, if (IS_ERR(trans)) return PTR_ERR(trans); - inode->i_ctime = current_time(inode); + inode_set_ctime_current(inode); i_size_write(inode, end); btrfs_inode_safe_disk_i_size_write(BTRFS_I(inode), 0); - ret = btrfs_update_inode(trans, root, BTRFS_I(inode)); + ret = btrfs_update_inode(trans, BTRFS_I(inode)); ret2 = btrfs_end_transaction(trans); return ret ? ret : ret2; @@ -3018,7 +3032,7 @@ static long btrfs_fallocate(struct file *file, int mode, struct extent_changeset *data_reserved = NULL; struct falloc_range *range; struct falloc_range *tmp; - struct list_head reserve_list; + LIST_HEAD(reserve_list); u64 cur_offset; u64 last_byte; u64 alloc_start; @@ -3110,7 +3124,6 @@ static long btrfs_fallocate(struct file *file, int mode, btrfs_assert_inode_range_clean(BTRFS_I(inode), alloc_start, locked_end); /* First, check if we exceed the qgroup limit */ - INIT_LIST_HEAD(&reserve_list); while (cur_offset < alloc_end) { em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, cur_offset, alloc_end - cur_offset); @@ -3179,7 +3192,7 @@ static long btrfs_fallocate(struct file *file, int mode, qgroup_reserved -= range->len; } else if (qgroup_reserved > 0) { btrfs_qgroup_free_data(BTRFS_I(inode), data_reserved, - range->start, range->len); + range->start, range->len, NULL); qgroup_reserved -= range->len; } list_del(&range->list); diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 880800418075..6f93c9a2c3e3 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -57,6 +57,11 @@ static void bitmap_clear_bits(struct btrfs_free_space_ctl *ctl, struct btrfs_free_space *info, u64 offset, u64 bytes, bool update_stats); +static void btrfs_crc32c_final(u32 crc, u8 *result) +{ + put_unaligned_le32(~crc, result); +} + static void __btrfs_remove_free_space_cache(struct btrfs_free_space_ctl *ctl) { struct btrfs_free_space *info; @@ -195,7 +200,7 @@ static int __create_free_space_inode(struct btrfs_root *root, btrfs_set_inode_nlink(leaf, inode_item, 1); btrfs_set_inode_transid(leaf, inode_item, trans->transid); btrfs_set_inode_block_group(leaf, inode_item, offset); - btrfs_mark_buffer_dirty(leaf); + btrfs_mark_buffer_dirty(trans, leaf); btrfs_release_path(path); key.objectid = BTRFS_FREE_SPACE_OBJECTID; @@ -213,7 +218,7 @@ static int __create_free_space_inode(struct btrfs_root *root, struct btrfs_free_space_header); memzero_extent_buffer(leaf, (unsigned long)header, sizeof(*header)); btrfs_set_free_space_key(leaf, header, &disk_key); - btrfs_mark_buffer_dirty(leaf); + btrfs_mark_buffer_dirty(trans, leaf); btrfs_release_path(path); return 0; @@ -354,7 +359,7 @@ int btrfs_truncate_free_space_cache(struct btrfs_trans_handle *trans, if (ret) goto fail; - ret = btrfs_update_inode(trans, root, inode); + ret = btrfs_update_inode(trans, inode); fail: if (locked) @@ -540,7 +545,7 @@ static void io_ctl_set_crc(struct btrfs_io_ctl *io_ctl, int index) if (index == 0) offset = sizeof(u32) * io_ctl->num_pages; - crc = btrfs_crc32c(crc, io_ctl->orig + offset, PAGE_SIZE - offset); + crc = crc32c(crc, io_ctl->orig + offset, PAGE_SIZE - offset); btrfs_crc32c_final(crc, (u8 *)&crc); io_ctl_unmap_page(io_ctl); tmp = page_address(io_ctl->pages[0]); @@ -562,7 +567,7 @@ static int io_ctl_check_crc(struct btrfs_io_ctl *io_ctl, int index) val = *tmp; io_ctl_map_page(io_ctl, 0); - crc = btrfs_crc32c(crc, io_ctl->orig + offset, PAGE_SIZE - offset); + crc = crc32c(crc, io_ctl->orig + offset, PAGE_SIZE - offset); btrfs_crc32c_final(crc, (u8 *)&crc); if (val != crc) { btrfs_err_rl(io_ctl->fs_info, @@ -1185,7 +1190,7 @@ update_cache_item(struct btrfs_trans_handle *trans, btrfs_set_free_space_entries(leaf, header, entries); btrfs_set_free_space_bitmaps(leaf, header, bitmaps); btrfs_set_free_space_generation(leaf, header, trans->transid); - btrfs_mark_buffer_dirty(leaf); + btrfs_mark_buffer_dirty(trans, leaf); btrfs_release_path(path); return 0; @@ -1219,10 +1224,9 @@ static noinline_for_stack int write_pinned_extent_entries( start = block_group->start; while (start < block_group->start + block_group->length) { - ret = find_first_extent_bit(unpin, start, - &extent_start, &extent_end, - EXTENT_DIRTY, NULL); - if (ret) + if (!find_first_extent_bit(unpin, start, + &extent_start, &extent_end, + EXTENT_DIRTY, NULL)) return 0; /* This pinned extent is out of our range */ @@ -1322,7 +1326,7 @@ out: "failed to write free space cache for block group %llu error %d", block_group->start, ret); } - btrfs_update_inode(trans, root, BTRFS_I(inode)); + btrfs_update_inode(trans, BTRFS_I(inode)); if (block_group) { /* the dirty list is protected by the dirty_bgs_lock */ @@ -1363,7 +1367,6 @@ int btrfs_wait_cache_io(struct btrfs_trans_handle *trans, /* * Write out cached info to an inode. * - * @root: root the inode belongs to * @inode: freespace inode we are writing out * @ctl: free space cache we are going to write out * @block_group: block_group for this cache if it belongs to a block_group @@ -1374,7 +1377,7 @@ int btrfs_wait_cache_io(struct btrfs_trans_handle *trans, * on mount. This will return 0 if it was successful in writing the cache out, * or an errno if it was not. */ -static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, +static int __btrfs_write_out_cache(struct inode *inode, struct btrfs_free_space_ctl *ctl, struct btrfs_block_group *block_group, struct btrfs_io_ctl *io_ctl, @@ -1507,7 +1510,7 @@ out: invalidate_inode_pages2(inode->i_mapping); BTRFS_I(inode)->generation = 0; } - btrfs_update_inode(trans, root, BTRFS_I(inode)); + btrfs_update_inode(trans, BTRFS_I(inode)); if (must_iput) iput(inode); return ret; @@ -1533,8 +1536,8 @@ int btrfs_write_out_cache(struct btrfs_trans_handle *trans, if (IS_ERR(inode)) return 0; - ret = __btrfs_write_out_cache(fs_info->tree_root, inode, ctl, - block_group, &block_group->io_ctl, trans); + ret = __btrfs_write_out_cache(inode, ctl, block_group, + &block_group->io_ctl, trans); if (ret) { btrfs_debug(fs_info, "failed to write free space cache for block group %llu error %d", @@ -2705,13 +2708,8 @@ static int __btrfs_add_free_space_zoned(struct btrfs_block_group *block_group, bg_reclaim_threshold = READ_ONCE(sinfo->bg_reclaim_threshold); spin_lock(&ctl->tree_lock); - /* Count initial region as zone_unusable until it gets activated. */ if (!used) to_free = size; - else if (initial && - test_bit(BTRFS_FS_ACTIVE_ZONE_TRACKING, &block_group->fs_info->flags) && - (block_group->flags & (BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_SYSTEM))) - to_free = 0; else if (initial) to_free = block_group->zone_capacity; else if (offset >= block_group->alloc_offset) @@ -2739,8 +2737,7 @@ static int __btrfs_add_free_space_zoned(struct btrfs_block_group *block_group, reclaimable_unusable = block_group->zone_unusable - (block_group->length - block_group->zone_capacity); /* All the region is now unusable. Mark it as unused and reclaim */ - if (block_group->zone_unusable == block_group->length && - block_group->alloc_offset) { + if (block_group->zone_unusable == block_group->length) { btrfs_mark_bg_unused(block_group); } else if (bg_reclaim_threshold && reclaimable_unusable >= @@ -2944,7 +2941,8 @@ void btrfs_dump_free_space(struct btrfs_block_group *block_group, btrfs_info(fs_info, "block group has cluster?: %s", list_empty(&block_group->cluster_list) ? "no" : "yes"); btrfs_info(fs_info, - "%d blocks of free space at or bigger than bytes is", count); + "%d free space entries at or bigger than %llu bytes", + count, bytes); } void btrfs_init_free_space_ctl(struct btrfs_block_group *block_group, diff --git a/fs/btrfs/free-space-tree.c b/fs/btrfs/free-space-tree.c index f169378e2ca6..7b598b070700 100644 --- a/fs/btrfs/free-space-tree.c +++ b/fs/btrfs/free-space-tree.c @@ -89,7 +89,7 @@ static int add_new_free_space_info(struct btrfs_trans_handle *trans, struct btrfs_free_space_info); btrfs_set_free_space_extent_count(leaf, info, 0); btrfs_set_free_space_flags(leaf, info, 0); - btrfs_mark_buffer_dirty(leaf); + btrfs_mark_buffer_dirty(trans, leaf); ret = 0; out: @@ -287,7 +287,7 @@ int convert_free_space_to_bitmaps(struct btrfs_trans_handle *trans, flags |= BTRFS_FREE_SPACE_USING_BITMAPS; btrfs_set_free_space_flags(leaf, info, flags); expected_extent_count = btrfs_free_space_extent_count(leaf, info); - btrfs_mark_buffer_dirty(leaf); + btrfs_mark_buffer_dirty(trans, leaf); btrfs_release_path(path); if (extent_count != expected_extent_count) { @@ -324,7 +324,7 @@ int convert_free_space_to_bitmaps(struct btrfs_trans_handle *trans, ptr = btrfs_item_ptr_offset(leaf, path->slots[0]); write_extent_buffer(leaf, bitmap_cursor, ptr, data_size); - btrfs_mark_buffer_dirty(leaf); + btrfs_mark_buffer_dirty(trans, leaf); btrfs_release_path(path); i += extent_size; @@ -430,7 +430,7 @@ int convert_free_space_to_extents(struct btrfs_trans_handle *trans, flags &= ~BTRFS_FREE_SPACE_USING_BITMAPS; btrfs_set_free_space_flags(leaf, info, flags); expected_extent_count = btrfs_free_space_extent_count(leaf, info); - btrfs_mark_buffer_dirty(leaf); + btrfs_mark_buffer_dirty(trans, leaf); btrfs_release_path(path); nrbits = block_group->length >> block_group->fs_info->sectorsize_bits; @@ -495,7 +495,7 @@ static int update_free_space_extent_count(struct btrfs_trans_handle *trans, extent_count += new_extents; btrfs_set_free_space_extent_count(path->nodes[0], info, extent_count); - btrfs_mark_buffer_dirty(path->nodes[0]); + btrfs_mark_buffer_dirty(trans, path->nodes[0]); btrfs_release_path(path); if (!(flags & BTRFS_FREE_SPACE_USING_BITMAPS) && @@ -533,7 +533,8 @@ int free_space_test_bit(struct btrfs_block_group *block_group, return !!extent_buffer_test_bit(leaf, ptr, i); } -static void free_space_set_bits(struct btrfs_block_group *block_group, +static void free_space_set_bits(struct btrfs_trans_handle *trans, + struct btrfs_block_group *block_group, struct btrfs_path *path, u64 *start, u64 *size, int bit) { @@ -563,7 +564,7 @@ static void free_space_set_bits(struct btrfs_block_group *block_group, extent_buffer_bitmap_set(leaf, ptr, first, last - first); else extent_buffer_bitmap_clear(leaf, ptr, first, last - first); - btrfs_mark_buffer_dirty(leaf); + btrfs_mark_buffer_dirty(trans, leaf); *size -= end - *start; *start = end; @@ -656,7 +657,7 @@ static int modify_free_space_bitmap(struct btrfs_trans_handle *trans, cur_start = start; cur_size = size; while (1) { - free_space_set_bits(block_group, path, &cur_start, &cur_size, + free_space_set_bits(trans, block_group, path, &cur_start, &cur_size, !remove); if (cur_size == 0) break; @@ -1517,8 +1518,10 @@ static int load_free_space_bitmaps(struct btrfs_caching_control *caching_ctl, } else if (prev_bit == 1 && bit == 0) { u64 space_added; - ret = add_new_free_space(block_group, extent_start, - offset, &space_added); + ret = btrfs_add_new_free_space(block_group, + extent_start, + offset, + &space_added); if (ret) goto out; total_found += space_added; @@ -1533,7 +1536,7 @@ static int load_free_space_bitmaps(struct btrfs_caching_control *caching_ctl, } } if (prev_bit == 1) { - ret = add_new_free_space(block_group, extent_start, end, NULL); + ret = btrfs_add_new_free_space(block_group, extent_start, end, NULL); if (ret) goto out; extent_count++; @@ -1590,8 +1593,9 @@ static int load_free_space_extents(struct btrfs_caching_control *caching_ctl, ASSERT(key.type == BTRFS_FREE_SPACE_EXTENT_KEY); ASSERT(key.objectid < end && key.objectid + key.offset <= end); - ret = add_new_free_space(block_group, key.objectid, - key.objectid + key.offset, &space_added); + ret = btrfs_add_new_free_space(block_group, key.objectid, + key.objectid + key.offset, + &space_added); if (ret) goto out; total_found += space_added; diff --git a/fs/btrfs/fs.h b/fs/btrfs/fs.h index 203d2a267828..318df6f9d9cb 100644 --- a/fs/btrfs/fs.h +++ b/fs/btrfs/fs.h @@ -46,8 +46,6 @@ static_assert(sizeof(struct btrfs_super_block) == BTRFS_SUPER_INFO_SIZE); * Runtime (in-memory) states of filesystem */ enum { - /* Global indicator of serious filesystem errors */ - BTRFS_FS_STATE_ERROR, /* * Filesystem is being remounted, allow to skip some operations, like * defrag @@ -141,6 +139,12 @@ enum { */ BTRFS_FS_FEATURE_CHANGED, + /* + * Indicate that we have found a tree block which is only aligned to + * sectorsize, but not to nodesize. This should be rare nowadays. + */ + BTRFS_FS_UNALIGNED_TREE_BLOCK, + #if BITS_PER_LONG == 32 /* Indicate if we have error/warn message printed on 32bit systems */ BTRFS_FS_32BIT_ERROR, @@ -173,19 +177,17 @@ enum { BTRFS_MOUNT_AUTO_DEFRAG = (1UL << 16), BTRFS_MOUNT_USEBACKUPROOT = (1UL << 17), BTRFS_MOUNT_SKIP_BALANCE = (1UL << 18), - BTRFS_MOUNT_CHECK_INTEGRITY = (1UL << 19), - BTRFS_MOUNT_CHECK_INTEGRITY_DATA = (1UL << 20), - BTRFS_MOUNT_PANIC_ON_FATAL_ERROR = (1UL << 21), - BTRFS_MOUNT_RESCAN_UUID_TREE = (1UL << 22), - BTRFS_MOUNT_FRAGMENT_DATA = (1UL << 23), - BTRFS_MOUNT_FRAGMENT_METADATA = (1UL << 24), - BTRFS_MOUNT_FREE_SPACE_TREE = (1UL << 25), - BTRFS_MOUNT_NOLOGREPLAY = (1UL << 26), - BTRFS_MOUNT_REF_VERIFY = (1UL << 27), - BTRFS_MOUNT_DISCARD_ASYNC = (1UL << 28), - BTRFS_MOUNT_IGNOREBADROOTS = (1UL << 29), - BTRFS_MOUNT_IGNOREDATACSUMS = (1UL << 30), - BTRFS_MOUNT_NODISCARD = (1UL << 31), + BTRFS_MOUNT_PANIC_ON_FATAL_ERROR = (1UL << 19), + BTRFS_MOUNT_RESCAN_UUID_TREE = (1UL << 20), + BTRFS_MOUNT_FRAGMENT_DATA = (1UL << 21), + BTRFS_MOUNT_FRAGMENT_METADATA = (1UL << 22), + BTRFS_MOUNT_FREE_SPACE_TREE = (1UL << 23), + BTRFS_MOUNT_NOLOGREPLAY = (1UL << 24), + BTRFS_MOUNT_REF_VERIFY = (1UL << 25), + BTRFS_MOUNT_DISCARD_ASYNC = (1UL << 26), + BTRFS_MOUNT_IGNOREBADROOTS = (1UL << 27), + BTRFS_MOUNT_IGNOREDATACSUMS = (1UL << 28), + BTRFS_MOUNT_NODISCARD = (1UL << 29), }; /* @@ -218,7 +220,8 @@ enum { BTRFS_FEATURE_INCOMPAT_NO_HOLES | \ BTRFS_FEATURE_INCOMPAT_METADATA_UUID | \ BTRFS_FEATURE_INCOMPAT_RAID1C34 | \ - BTRFS_FEATURE_INCOMPAT_ZONED) + BTRFS_FEATURE_INCOMPAT_ZONED | \ + BTRFS_FEATURE_INCOMPAT_SIMPLE_QUOTA) #ifdef CONFIG_BTRFS_DEBUG /* @@ -227,6 +230,7 @@ enum { */ #define BTRFS_FEATURE_INCOMPAT_SUPP \ (BTRFS_FEATURE_INCOMPAT_SUPP_STABLE | \ + BTRFS_FEATURE_INCOMPAT_RAID_STRIPE_TREE | \ BTRFS_FEATURE_INCOMPAT_EXTENT_TREE_V2) #else @@ -371,6 +375,7 @@ struct btrfs_fs_info { struct btrfs_root *uuid_root; struct btrfs_root *data_reloc_root; struct btrfs_root *block_group_root; + struct btrfs_root *stripe_root; /* The log root tree is a directory of all the other log roots */ struct btrfs_root *log_root_tree; @@ -411,7 +416,17 @@ struct btrfs_fs_info { struct btrfs_block_rsv empty_block_rsv; + /* + * Updated while holding the lock 'trans_lock'. Due to the life cycle of + * a transaction, it can be directly read while holding a transaction + * handle, everywhere else must be read with btrfs_get_fs_generation(). + * Should always be updated using btrfs_set_fs_generation(). + */ u64 generation; + /* + * Always use btrfs_get_last_trans_committed() and + * btrfs_set_last_trans_committed() to read and update this field. + */ u64 last_trans_committed; /* * Generation of the last transaction used for block group relocation @@ -647,9 +662,6 @@ struct btrfs_fs_info { struct btrfs_discard_ctl discard_ctl; -#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY - u32 check_integrity_print_mask; -#endif /* Is qgroup tracking in a consistent state? */ u64 qgroup_flags; @@ -685,6 +697,13 @@ struct btrfs_fs_info { /* Protected by qgroup_rescan_lock */ bool qgroup_rescan_running; u8 qgroup_drop_subtree_thres; + u64 qgroup_enable_gen; + + /* + * If this is not 0, then it indicates a serious filesystem error has + * happened and it contains that error (negative errno value). + */ + int fs_error; /* Filesystem state */ unsigned long fs_state; @@ -766,6 +785,9 @@ struct btrfs_fs_info { u64 data_reloc_bg; struct mutex zoned_data_reloc_io_lock; + struct btrfs_block_group *active_meta_bg; + struct btrfs_block_group *active_system_bg; + u64 nr_global_roots; spinlock_t zone_active_bgs_lock; @@ -805,6 +827,26 @@ struct btrfs_fs_info { #endif }; +static inline u64 btrfs_get_fs_generation(const struct btrfs_fs_info *fs_info) +{ + return READ_ONCE(fs_info->generation); +} + +static inline void btrfs_set_fs_generation(struct btrfs_fs_info *fs_info, u64 gen) +{ + WRITE_ONCE(fs_info->generation, gen); +} + +static inline u64 btrfs_get_last_trans_committed(const struct btrfs_fs_info *fs_info) +{ + return READ_ONCE(fs_info->last_trans_committed); +} + +static inline void btrfs_set_last_trans_committed(struct btrfs_fs_info *fs_info, u64 gen) +{ + WRITE_ONCE(fs_info->last_trans_committed, gen); +} + static inline void btrfs_set_last_root_drop_gen(struct btrfs_fs_info *fs_info, u64 gen) { @@ -962,8 +1004,8 @@ static inline void btrfs_wake_unfinished_drop(struct btrfs_fs_info *fs_info) clear_and_wake_up_bit(BTRFS_FS_UNFINISHED_DROPS, &fs_info->flags); } -#define BTRFS_FS_ERROR(fs_info) (unlikely(test_bit(BTRFS_FS_STATE_ERROR, \ - &(fs_info)->fs_state))) +#define BTRFS_FS_ERROR(fs_info) (READ_ONCE((fs_info)->fs_error)) + #define BTRFS_FS_LOG_CLEANUP_ERROR(fs_info) \ (unlikely(test_bit(BTRFS_FS_STATE_LOG_CLEANUP_ERROR, \ &(fs_info)->fs_state))) diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c index 4c322b720a80..7d734830e514 100644 --- a/fs/btrfs/inode-item.c +++ b/fs/btrfs/inode-item.c @@ -167,7 +167,7 @@ static int btrfs_del_inode_extref(struct btrfs_trans_handle *trans, memmove_extent_buffer(leaf, ptr, ptr + del_len, item_size - (ptr + del_len - item_start)); - btrfs_truncate_item(path, item_size - del_len, 1); + btrfs_truncate_item(trans, path, item_size - del_len, 1); out: btrfs_free_path(path); @@ -229,7 +229,7 @@ int btrfs_del_inode_ref(struct btrfs_trans_handle *trans, item_start = btrfs_item_ptr_offset(leaf, path->slots[0]); memmove_extent_buffer(leaf, ptr, ptr + sub_item_len, item_size - (ptr + sub_item_len - item_start)); - btrfs_truncate_item(path, item_size - sub_item_len, 1); + btrfs_truncate_item(trans, path, item_size - sub_item_len, 1); out: btrfs_free_path(path); @@ -247,7 +247,7 @@ out: } /* - * btrfs_insert_inode_extref() - Inserts an extended inode ref into a tree. + * Insert an extended inode ref into a tree. * * The caller must have checked against BTRFS_LINK_MAX already. */ @@ -282,7 +282,7 @@ static int btrfs_insert_inode_extref(struct btrfs_trans_handle *trans, name)) goto out; - btrfs_extend_item(path, ins_len); + btrfs_extend_item(trans, path, ins_len); ret = 0; } if (ret < 0) @@ -299,7 +299,7 @@ static int btrfs_insert_inode_extref(struct btrfs_trans_handle *trans, ptr = (unsigned long)&extref->name; write_extent_buffer(path->nodes[0], name->name, ptr, name->len); - btrfs_mark_buffer_dirty(path->nodes[0]); + btrfs_mark_buffer_dirty(trans, path->nodes[0]); out: btrfs_free_path(path); @@ -338,7 +338,7 @@ int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans, goto out; old_size = btrfs_item_size(path->nodes[0], path->slots[0]); - btrfs_extend_item(path, ins_len); + btrfs_extend_item(trans, path, ins_len); ref = btrfs_item_ptr(path->nodes[0], path->slots[0], struct btrfs_inode_ref); ref = (struct btrfs_inode_ref *)((unsigned long)ref + old_size); @@ -364,7 +364,7 @@ int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans, ptr = (unsigned long)(ref + 1); } write_extent_buffer(path->nodes[0], name->name, ptr, name->len); - btrfs_mark_buffer_dirty(path->nodes[0]); + btrfs_mark_buffer_dirty(trans, path->nodes[0]); out: btrfs_free_path(path); @@ -591,7 +591,7 @@ search_again: num_dec = (orig_num_bytes - extent_num_bytes); if (extent_start != 0) control->sub_bytes += num_dec; - btrfs_mark_buffer_dirty(leaf); + btrfs_mark_buffer_dirty(trans, leaf); } else { extent_num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi); @@ -617,7 +617,7 @@ search_again: btrfs_set_file_extent_ram_bytes(leaf, fi, size); size = btrfs_file_extent_calc_inline_size(size); - btrfs_truncate_item(path, size, 1); + btrfs_truncate_item(trans, path, size, 1); } else if (!del_item) { /* * We have to bail so the last_size is set to @@ -676,7 +676,8 @@ delete: bytes_deleted += extent_num_bytes; btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, - extent_start, extent_num_bytes, 0); + extent_start, extent_num_bytes, 0, + root->root_key.objectid); btrfs_init_data_ref(&ref, btrfs_header_owner(leaf), control->ino, extent_offset, root->root_key.objectid, false); diff --git a/fs/btrfs/inode-item.h b/fs/btrfs/inode-item.h index ede43b6c6559..4337bb26f419 100644 --- a/fs/btrfs/inode-item.h +++ b/fs/btrfs/inode-item.h @@ -4,6 +4,7 @@ #define BTRFS_INODE_ITEM_H #include <linux/types.h> +#include <linux/crc32c.h> struct btrfs_trans_handle; struct btrfs_root; @@ -12,6 +13,7 @@ struct btrfs_key; struct btrfs_inode_extref; struct btrfs_inode; struct extent_buffer; +struct fscrypt_str; /* * Return this if we need to call truncate_block for the last bit of the @@ -76,6 +78,12 @@ static inline void btrfs_inode_split_flags(u64 inode_item_flags, *ro_flags = (u32)(inode_item_flags >> 32); } +/* Figure the key offset of an extended inode ref. */ +static inline u64 btrfs_extref_hash(u64 parent_objectid, const char *name, int len) +{ + return (u64)crc32c(parent_objectid, name, len); +} + int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_truncate_control *control); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index aa090b0b5d29..fb3c3f43c3fa 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -71,6 +71,7 @@ #include "super.h" #include "orphan.h" #include "backref.h" +#include "raid-stripe-tree.h" struct btrfs_iget_args { u64 ino; @@ -124,11 +125,11 @@ static struct kmem_cache *btrfs_inode_cachep; static int btrfs_setsize(struct inode *inode, struct iattr *attr); static int btrfs_truncate(struct btrfs_inode *inode, bool skip_writeback); -static noinline int cow_file_range(struct btrfs_inode *inode, - struct page *locked_page, - u64 start, u64 end, int *page_started, - unsigned long *nr_written, int unlock, - u64 *done_offset); + +static noinline int run_delalloc_cow(struct btrfs_inode *inode, + struct page *locked_page, u64 start, + u64 end, struct writeback_control *wbc, + bool pages_dirty); static struct extent_map *create_io_em(struct btrfs_inode *inode, u64 start, u64 len, u64 orig_start, u64 block_start, u64 block_len, u64 orig_block_len, @@ -348,7 +349,7 @@ static void __cold btrfs_print_data_csum_error(struct btrfs_inode *inode, } /* - * btrfs_inode_lock - lock inode i_rwsem based on arguments passed + * Lock inode i_rwsem based on arguments passed. * * ilock_flags can have the following bit set: * @@ -382,7 +383,7 @@ int btrfs_inode_lock(struct btrfs_inode *inode, unsigned int ilock_flags) } /* - * btrfs_inode_unlock - unock inode i_rwsem + * Unock inode i_rwsem. * * ilock_flags should contain the same bits set as passed to btrfs_inode_lock() * to decide whether the lock acquired is shared or exclusive. @@ -423,11 +424,10 @@ static inline void btrfs_cleanup_ordered_extents(struct btrfs_inode *inode, while (index <= end_index) { /* - * For locked page, we will call end_extent_writepage() on it - * in run_delalloc_range() for the error handling. That - * end_extent_writepage() function will call - * btrfs_mark_ordered_io_finished() to clear page Ordered and - * run the ordered extent accounting. + * For locked page, we will call btrfs_mark_ordered_io_finished + * through btrfs_mark_ordered_io_finished() on it + * in run_delalloc_range() for the error handling, which will + * clear page Ordered and run the ordered extent accounting. * * Here we can't just clear the Ordered bit, or * btrfs_mark_ordered_io_finished() would skip the accounting @@ -574,7 +574,7 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans, kunmap_local(kaddr); put_page(page); } - btrfs_mark_buffer_dirty(leaf); + btrfs_mark_buffer_dirty(trans, leaf); btrfs_release_path(path); /* @@ -671,7 +671,7 @@ static noinline int cow_file_range_inline(struct btrfs_inode *inode, u64 size, } btrfs_update_inode_bytes(inode, size, drop_args.bytes_found); - ret = btrfs_update_inode(trans, root, inode); + ret = btrfs_update_inode(trans, inode); if (ret && ret != -ENOSPC) { btrfs_abort_transaction(trans, ret); goto out; @@ -688,7 +688,7 @@ out: * And at reserve time, it's always aligned to page size, so * just free one page here. */ - btrfs_qgroup_free_data(inode, NULL, 0, PAGE_SIZE); + btrfs_qgroup_free_data(inode, NULL, 0, PAGE_SIZE, NULL); btrfs_free_path(path); btrfs_end_transaction(trans); return ret; @@ -815,24 +815,22 @@ static inline void inode_should_defrag(struct btrfs_inode *inode, } /* - * we create compressed extents in two phases. The first - * phase compresses a range of pages that have already been - * locked (both pages and state bits are locked). + * Work queue call back to started compression on a file and pages. * - * This is done inside an ordered work queue, and the compression - * is spread across many cpus. The actual IO submission is step - * two, and the ordered work queue takes care of making sure that - * happens in the same order things were put onto the queue by - * writepages and friends. + * This is done inside an ordered work queue, and the compression is spread + * across many cpus. The actual IO submission is step two, and the ordered work + * queue takes care of making sure that happens in the same order things were + * put onto the queue by writepages and friends. * - * If this code finds it can't get good compression, it puts an - * entry onto the work queue to write the uncompressed bytes. This - * makes sure that both compressed inodes and uncompressed inodes - * are written in the same order that the flusher thread sent them - * down. + * If this code finds it can't get good compression, it puts an entry onto the + * work queue to write the uncompressed bytes. This makes sure that both + * compressed inodes and uncompressed inodes are written in the same order that + * the flusher thread sent them down. */ -static noinline int compress_file_range(struct async_chunk *async_chunk) +static void compress_file_range(struct btrfs_work *work) { + struct async_chunk *async_chunk = + container_of(work, struct async_chunk, work); struct btrfs_inode *inode = async_chunk->inode; struct btrfs_fs_info *fs_info = inode->root->fs_info; struct address_space *mapping = inode->vfs_inode.i_mapping; @@ -842,19 +840,24 @@ static noinline int compress_file_range(struct async_chunk *async_chunk) u64 actual_end; u64 i_size; int ret = 0; - struct page **pages = NULL; + struct page **pages; unsigned long nr_pages; unsigned long total_compressed = 0; unsigned long total_in = 0; + unsigned int poff; int i; - int will_compress; int compress_type = fs_info->compress_type; - int compressed_extents = 0; - int redirty = 0; inode_should_defrag(inode, start, end, end - start + 1, SZ_16K); /* + * We need to call clear_page_dirty_for_io on each page in the range. + * Otherwise applications with the file mmap'd can wander in and change + * the page contents while we are compressing them. + */ + extent_range_clear_dirty_for_io(&inode->vfs_inode, start, end); + + /* * We need to save i_size before now because it could change in between * us evaluating the size and assigning it. This is because we lock and * unlock the page in truncate and fallocate, and then modify the i_size @@ -868,7 +871,7 @@ static noinline int compress_file_range(struct async_chunk *async_chunk) barrier(); actual_end = min_t(u64, i_size, end + 1); again: - will_compress = 0; + pages = NULL; nr_pages = (end >> PAGE_SHIFT) - (start >> PAGE_SHIFT) + 1; nr_pages = min_t(unsigned long, nr_pages, BTRFS_MAX_COMPRESSED_PAGES); @@ -912,78 +915,57 @@ again: ret = 0; /* - * we do compression for mount -o compress and when the - * inode has not been flagged as nocompress. This flag can - * change at any time if we discover bad compression ratios. + * We do compression for mount -o compress and when the inode has not + * been flagged as NOCOMPRESS. This flag can change at any time if we + * discover bad compression ratios. */ - if (inode_need_compress(inode, start, end)) { - WARN_ON(pages); - pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS); - if (!pages) { - /* just bail out to the uncompressed code */ - nr_pages = 0; - goto cont; - } - - if (inode->defrag_compress) - compress_type = inode->defrag_compress; - else if (inode->prop_compress) - compress_type = inode->prop_compress; + if (!inode_need_compress(inode, start, end)) + goto cleanup_and_bail_uncompressed; + pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS); + if (!pages) { /* - * we need to call clear_page_dirty_for_io on each - * page in the range. Otherwise applications with the file - * mmap'd can wander in and change the page contents while - * we are compressing them. - * - * If the compression fails for any reason, we set the pages - * dirty again later on. - * - * Note that the remaining part is redirtied, the start pointer - * has moved, the end is the original one. + * Memory allocation failure is not a fatal error, we can fall + * back to uncompressed code. */ - if (!redirty) { - extent_range_clear_dirty_for_io(&inode->vfs_inode, start, end); - redirty = 1; - } + goto cleanup_and_bail_uncompressed; + } - /* Compression level is applied here and only here */ - ret = btrfs_compress_pages( - compress_type | (fs_info->compress_level << 4), - mapping, start, - pages, - &nr_pages, - &total_in, - &total_compressed); + if (inode->defrag_compress) + compress_type = inode->defrag_compress; + else if (inode->prop_compress) + compress_type = inode->prop_compress; + + /* Compression level is applied here. */ + ret = btrfs_compress_pages(compress_type | (fs_info->compress_level << 4), + mapping, start, pages, &nr_pages, &total_in, + &total_compressed); + if (ret) + goto mark_incompressible; - if (!ret) { - unsigned long offset = offset_in_page(total_compressed); - struct page *page = pages[nr_pages - 1]; + /* + * Zero the tail end of the last page, as we might be sending it down + * to disk. + */ + poff = offset_in_page(total_compressed); + if (poff) + memzero_page(pages[nr_pages - 1], poff, PAGE_SIZE - poff); - /* zero the tail end of the last page, we might be - * sending it down to disk - */ - if (offset) - memzero_page(page, offset, PAGE_SIZE - offset); - will_compress = 1; - } - } -cont: /* + * Try to create an inline extent. + * + * If we didn't compress the entire range, try to create an uncompressed + * inline extent, else a compressed one. + * * Check cow_file_range() for why we don't even try to create inline - * extent for subpage case. + * extent for the subpage case. */ if (start == 0 && fs_info->sectorsize == PAGE_SIZE) { - /* lets try to make an inline extent */ - if (ret || total_in < actual_end) { - /* we didn't compress the entire range, try - * to make an uncompressed inline extent. - */ - ret = cow_file_range_inline(inode, actual_end, - 0, BTRFS_COMPRESS_NONE, - NULL, false); + if (total_in < actual_end) { + ret = cow_file_range_inline(inode, actual_end, 0, + BTRFS_COMPRESS_NONE, NULL, + false); } else { - /* try making a compressed inline extent */ ret = cow_file_range_inline(inode, actual_end, total_compressed, compress_type, pages, @@ -1013,99 +995,52 @@ cont: PAGE_UNLOCK | PAGE_START_WRITEBACK | PAGE_END_WRITEBACK); - - /* - * Ensure we only free the compressed pages if we have - * them allocated, as we can still reach here with - * inode_need_compress() == false. - */ - if (pages) { - for (i = 0; i < nr_pages; i++) { - WARN_ON(pages[i]->mapping); - put_page(pages[i]); - } - kfree(pages); - } - return 0; + goto free_pages; } } - if (will_compress) { - /* - * we aren't doing an inline extent round the compressed size - * up to a block size boundary so the allocator does sane - * things - */ - total_compressed = ALIGN(total_compressed, blocksize); + /* + * We aren't doing an inline extent. Round the compressed size up to a + * block size boundary so the allocator does sane things. + */ + total_compressed = ALIGN(total_compressed, blocksize); - /* - * one last check to make sure the compression is really a - * win, compare the page count read with the blocks on disk, - * compression must free at least one sector size - */ - total_in = round_up(total_in, fs_info->sectorsize); - if (total_compressed + blocksize <= total_in) { - compressed_extents++; + /* + * One last check to make sure the compression is really a win, compare + * the page count read with the blocks on disk, compression must free at + * least one sector. + */ + total_in = round_up(total_in, fs_info->sectorsize); + if (total_compressed + blocksize > total_in) + goto mark_incompressible; - /* - * The async work queues will take care of doing actual - * allocation on disk for these compressed pages, and - * will submit them to the elevator. - */ - add_async_extent(async_chunk, start, total_in, - total_compressed, pages, nr_pages, - compress_type); - - if (start + total_in < end) { - start += total_in; - pages = NULL; - cond_resched(); - goto again; - } - return compressed_extents; - } + /* + * The async work queues will take care of doing actual allocation on + * disk for these compressed pages, and will submit the bios. + */ + add_async_extent(async_chunk, start, total_in, total_compressed, pages, + nr_pages, compress_type); + if (start + total_in < end) { + start += total_in; + cond_resched(); + goto again; } + return; + +mark_incompressible: + if (!btrfs_test_opt(fs_info, FORCE_COMPRESS) && !inode->prop_compress) + inode->flags |= BTRFS_INODE_NOCOMPRESS; +cleanup_and_bail_uncompressed: + add_async_extent(async_chunk, start, end - start + 1, 0, NULL, 0, + BTRFS_COMPRESS_NONE); +free_pages: if (pages) { - /* - * the compression code ran but failed to make things smaller, - * free any pages it allocated and our page pointer array - */ for (i = 0; i < nr_pages; i++) { WARN_ON(pages[i]->mapping); put_page(pages[i]); } kfree(pages); - pages = NULL; - total_compressed = 0; - nr_pages = 0; - - /* flag the file so we don't compress in the future */ - if (!btrfs_test_opt(fs_info, FORCE_COMPRESS) && - !(inode->prop_compress)) { - inode->flags |= BTRFS_INODE_NOCOMPRESS; - } - } -cleanup_and_bail_uncompressed: - /* - * No compression, but we still need to write the pages in the file - * we've been given so far. redirty the locked page if it corresponds - * to our extent and set things up for the async work queue to run - * cow_file_range to do the normal delalloc dance. - */ - if (async_chunk->locked_page && - (page_offset(async_chunk->locked_page) >= start && - page_offset(async_chunk->locked_page)) <= end) { - __set_page_dirty_nobuffers(async_chunk->locked_page); - /* unlocked later on in the async handlers */ } - - if (redirty) - extent_range_redirty_for_io(&inode->vfs_inode, start, end); - add_async_extent(async_chunk, start, end - start + 1, 0, NULL, 0, - BTRFS_COMPRESS_NONE); - compressed_extents++; - - return compressed_extents; } static void free_async_extent_pages(struct async_extent *async_extent) @@ -1124,14 +1059,12 @@ static void free_async_extent_pages(struct async_extent *async_extent) async_extent->pages = NULL; } -static int submit_uncompressed_range(struct btrfs_inode *inode, - struct async_extent *async_extent, - struct page *locked_page) +static void submit_uncompressed_range(struct btrfs_inode *inode, + struct async_extent *async_extent, + struct page *locked_page) { u64 start = async_extent->start; u64 end = async_extent->start + async_extent->ram_size - 1; - unsigned long nr_written = 0; - int page_started = 0; int ret; struct writeback_control wbc = { .sync_mode = WB_SYNC_ALL, @@ -1140,45 +1073,30 @@ static int submit_uncompressed_range(struct btrfs_inode *inode, .no_cgroup_owner = 1, }; - /* - * Call cow_file_range() to run the delalloc range directly, since we - * won't go to NOCOW or async path again. - * - * Also we call cow_file_range() with @unlock_page == 0, so that we - * can directly submit them without interruption. - */ - ret = cow_file_range(inode, locked_page, start, end, &page_started, - &nr_written, 0, NULL); - /* Inline extent inserted, page gets unlocked and everything is done */ - if (page_started) - return 0; - + wbc_attach_fdatawrite_inode(&wbc, &inode->vfs_inode); + ret = run_delalloc_cow(inode, locked_page, start, end, &wbc, false); + wbc_detach_inode(&wbc); if (ret < 0) { btrfs_cleanup_ordered_extents(inode, locked_page, start, end - start + 1); if (locked_page) { const u64 page_start = page_offset(locked_page); - const u64 page_end = page_start + PAGE_SIZE - 1; set_page_writeback(locked_page); end_page_writeback(locked_page); - end_extent_writepage(locked_page, ret, page_start, page_end); + btrfs_mark_ordered_io_finished(inode, locked_page, + page_start, PAGE_SIZE, + !ret); + mapping_set_error(locked_page->mapping, ret); unlock_page(locked_page); } - return ret; } - - /* All pages will be unlocked, including @locked_page */ - wbc_attach_fdatawrite_inode(&wbc, &inode->vfs_inode); - ret = extent_write_locked_range(&inode->vfs_inode, start, end, &wbc); - wbc_detach_inode(&wbc); - return ret; } -static int submit_one_async_extent(struct btrfs_inode *inode, - struct async_chunk *async_chunk, - struct async_extent *async_extent, - u64 *alloc_hint) +static void submit_one_async_extent(struct async_chunk *async_chunk, + struct async_extent *async_extent, + u64 *alloc_hint) { + struct btrfs_inode *inode = async_chunk->inode; struct extent_io_tree *io_tree = &inode->io_tree; struct btrfs_root *root = inode->root; struct btrfs_fs_info *fs_info = root->fs_info; @@ -1206,9 +1124,8 @@ static int submit_one_async_extent(struct btrfs_inode *inode, } lock_extent(io_tree, start, end, NULL); - /* We have fall back to uncompressed write */ - if (!async_extent->pages) { - ret = submit_uncompressed_range(inode, async_extent, locked_page); + if (async_extent->compress_type == BTRFS_COMPRESS_NONE) { + submit_uncompressed_range(inode, async_extent, locked_page); goto done; } @@ -1217,7 +1134,6 @@ static int submit_one_async_extent(struct btrfs_inode *inode, async_extent->compressed_size, 0, *alloc_hint, &ins, 1, 1); if (ret) { - free_async_extent_pages(async_extent); /* * Here we used to try again by going back to non-compressed * path for ENOSPC. But we can't reserve space even for @@ -1272,7 +1188,7 @@ done: if (async_chunk->blkcg_css) kthread_associate_blkcg(NULL); kfree(async_extent); - return ret; + return; out_free_reserve: btrfs_dec_block_group_reservations(fs_info, ins.objectid); @@ -1286,39 +1202,13 @@ out_free: PAGE_UNLOCK | PAGE_START_WRITEBACK | PAGE_END_WRITEBACK); free_async_extent_pages(async_extent); - goto done; -} - -/* - * Phase two of compressed writeback. This is the ordered portion of the code, - * which only gets called in the order the work was queued. We walk all the - * async extents created by compress_file_range and send them down to the disk. - */ -static noinline void submit_compressed_extents(struct async_chunk *async_chunk) -{ - struct btrfs_inode *inode = async_chunk->inode; - struct btrfs_fs_info *fs_info = inode->root->fs_info; - struct async_extent *async_extent; - u64 alloc_hint = 0; - int ret = 0; - - while (!list_empty(&async_chunk->extents)) { - u64 extent_start; - u64 ram_size; - - async_extent = list_entry(async_chunk->extents.next, - struct async_extent, list); - list_del(&async_extent->list); - extent_start = async_extent->start; - ram_size = async_extent->ram_size; - - ret = submit_one_async_extent(inode, async_chunk, async_extent, - &alloc_hint); - btrfs_debug(fs_info, + if (async_chunk->blkcg_css) + kthread_associate_blkcg(NULL); + btrfs_debug(fs_info, "async extent submission failed root=%lld inode=%llu start=%llu len=%llu ret=%d", - inode->root->root_key.objectid, - btrfs_ino(inode), extent_start, ram_size, ret); - } + root->root_key.objectid, btrfs_ino(inode), start, + async_extent->ram_size, ret); + kfree(async_extent); } static u64 get_extent_allocation_hint(struct btrfs_inode *inode, u64 start, @@ -1362,25 +1252,18 @@ static u64 get_extent_allocation_hint(struct btrfs_inode *inode, u64 start, * locked_page is the page that writepage had locked already. We use * it to make sure we don't do extra locks or unlocks. * - * *page_started is set to one if we unlock locked_page and do everything - * required to start IO on it. It may be clean and already done with - * IO when we return. - * - * When unlock == 1, we unlock the pages in successfully allocated regions. - * When unlock == 0, we leave them locked for writing them out. + * When this function fails, it unlocks all pages except @locked_page. * - * However, we unlock all the pages except @locked_page in case of failure. + * When this function successfully creates an inline extent, it returns 1 and + * unlocks all pages including locked_page and starts I/O on them. + * (In reality inline extents are limited to a single page, so locked_page is + * the only page handled anyway). * - * In summary, page locking state will be as follow: + * When this function succeed and creates a normal extent, the page locking + * status depends on the passed in flags: * - * - page_started == 1 (return value) - * - All the pages are unlocked. IO is started. - * - Note that this can happen only on success - * - unlock == 1 - * - All the pages except @locked_page are unlocked in any case - * - unlock == 0 - * - On success, all the pages are locked for writing out them - * - On failure, all the pages except @locked_page are unlocked + * - If @keep_locked is set, all pages are kept locked. + * - Else all pages except for @locked_page are unlocked. * * When a failure happens in the second or later iteration of the * while-loop, the ordered extents created in previous iterations are kept @@ -1389,10 +1272,9 @@ static u64 get_extent_allocation_hint(struct btrfs_inode *inode, u64 start, * example. */ static noinline int cow_file_range(struct btrfs_inode *inode, - struct page *locked_page, - u64 start, u64 end, int *page_started, - unsigned long *nr_written, int unlock, - u64 *done_offset) + struct page *locked_page, u64 start, u64 end, + u64 *done_offset, + bool keep_locked, bool no_inline) { struct btrfs_root *root = inode->root; struct btrfs_fs_info *fs_info = root->fs_info; @@ -1431,7 +1313,7 @@ static noinline int cow_file_range(struct btrfs_inode *inode, * This means we can trigger inline extent even if we didn't want to. * So here we skip inline extent creation completely. */ - if (start == 0 && fs_info->sectorsize == PAGE_SIZE) { + if (start == 0 && fs_info->sectorsize == PAGE_SIZE && !no_inline) { u64 actual_end = min_t(u64, i_size_read(&inode->vfs_inode), end + 1); @@ -1451,9 +1333,6 @@ static noinline int cow_file_range(struct btrfs_inode *inode, EXTENT_DELALLOC_NEW | EXTENT_DEFRAG | EXTENT_DO_ACCOUNTING, PAGE_UNLOCK | PAGE_START_WRITEBACK | PAGE_END_WRITEBACK); - *nr_written = *nr_written + - (end - start + PAGE_SIZE) / PAGE_SIZE; - *page_started = 1; /* * locked_page is locked by the caller of * writepage_delalloc(), not locked by @@ -1463,11 +1342,12 @@ static noinline int cow_file_range(struct btrfs_inode *inode, * as it doesn't have any subpage::writers recorded. * * Here we manually unlock the page, since the caller - * can't use page_started to determine if it's an - * inline extent or a compressed extent. + * can't determine if it's an inline extent or a + * compressed extent. */ unlock_page(locked_page); - goto out; + ret = 1; + goto done; } else if (ret < 0) { goto out_unlock; } @@ -1498,6 +1378,31 @@ static noinline int cow_file_range(struct btrfs_inode *inode, ret = btrfs_reserve_extent(root, cur_alloc_size, cur_alloc_size, min_alloc_size, 0, alloc_hint, &ins, 1, 1); + if (ret == -EAGAIN) { + /* + * btrfs_reserve_extent only returns -EAGAIN for zoned + * file systems, which is an indication that there are + * no active zones to allocate from at the moment. + * + * If this is the first loop iteration, wait for at + * least one zone to finish before retrying the + * allocation. Otherwise ask the caller to write out + * the already allocated blocks before coming back to + * us, or return -ENOSPC if it can't handle retries. + */ + ASSERT(btrfs_is_zoned(fs_info)); + if (start == orig_start) { + wait_on_bit_io(&inode->root->fs_info->flags, + BTRFS_FS_NEED_ZONE_FINISH, + TASK_UNINTERRUPTIBLE); + continue; + } + if (done_offset) { + *done_offset = start - 1; + return 0; + } + ret = -ENOSPC; + } if (ret < 0) goto out_unlock; cur_alloc_size = ins.offset; @@ -1558,7 +1463,7 @@ static noinline int cow_file_range(struct btrfs_inode *inode, * Do set the Ordered (Private2) bit so we know this page was * properly setup for writepage. */ - page_ops = unlock ? PAGE_UNLOCK : 0; + page_ops = (keep_locked ? 0 : PAGE_UNLOCK); page_ops |= PAGE_SET_ORDERED; extent_clear_unlock_delalloc(inode, start, start + ram_size - 1, @@ -1581,7 +1486,9 @@ static noinline int cow_file_range(struct btrfs_inode *inode, if (ret) goto out_unlock; } -out: +done: + if (done_offset) + *done_offset = end; return ret; out_drop_extent_cache: @@ -1591,21 +1498,6 @@ out_reserve: btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 1); out_unlock: /* - * If done_offset is non-NULL and ret == -EAGAIN, we expect the - * caller to write out the successfully allocated region and retry. - */ - if (done_offset && ret == -EAGAIN) { - if (orig_start < start) - *done_offset = start - 1; - else - *done_offset = start; - return ret; - } else if (ret == -EAGAIN) { - /* Convert to -ENOSPC since the caller cannot retry. */ - ret = -ENOSPC; - } - - /* * Now, we have three regions to clean up: * * |-------(1)----|---(2)---|-------------(3)----------| @@ -1627,10 +1519,10 @@ out_unlock: * EXTENT_DEFRAG | EXTENT_CLEAR_META_RESV are handled by the cleanup * function. * - * However, in case of unlock == 0, we still need to unlock the pages + * However, in case of @keep_locked, we still need to unlock the pages * (except @locked_page) to ensure all the pages are unlocked. */ - if (!unlock && orig_start < start) { + if (keep_locked && orig_start < start) { if (!locked_page) mapping_set_error(inode->vfs_inode.i_mapping, ret); extent_clear_unlock_delalloc(inode, orig_start, start - 1, @@ -1671,43 +1563,46 @@ out_unlock: } /* - * work queue call back to started compression on a file and pages - */ -static noinline void async_cow_start(struct btrfs_work *work) -{ - struct async_chunk *async_chunk; - int compressed_extents; - - async_chunk = container_of(work, struct async_chunk, work); - - compressed_extents = compress_file_range(async_chunk); - if (compressed_extents == 0) { - btrfs_add_delayed_iput(async_chunk->inode); - async_chunk->inode = NULL; - } -} - -/* - * work queue call back to submit previously compressed pages + * Phase two of compressed writeback. This is the ordered portion of the code, + * which only gets called in the order the work was queued. We walk all the + * async extents created by compress_file_range and send them down to the disk. + * + * If called with @do_free == true then it'll try to finish the work and free + * the work struct eventually. */ -static noinline void async_cow_submit(struct btrfs_work *work) +static noinline void submit_compressed_extents(struct btrfs_work *work, bool do_free) { struct async_chunk *async_chunk = container_of(work, struct async_chunk, work); struct btrfs_fs_info *fs_info = btrfs_work_owner(work); + struct async_extent *async_extent; unsigned long nr_pages; + u64 alloc_hint = 0; + + if (do_free) { + struct async_chunk *async_chunk; + struct async_cow *async_cow; + + async_chunk = container_of(work, struct async_chunk, work); + btrfs_add_delayed_iput(async_chunk->inode); + if (async_chunk->blkcg_css) + css_put(async_chunk->blkcg_css); + + async_cow = async_chunk->async_cow; + if (atomic_dec_and_test(&async_cow->num_chunks)) + kvfree(async_cow); + return; + } nr_pages = (async_chunk->end - async_chunk->start + PAGE_SIZE) >> PAGE_SHIFT; - /* - * ->inode could be NULL if async_chunk_start has failed to compress, - * in which case we don't have anything to submit, yet we need to - * always adjust ->async_delalloc_pages as its paired with the init - * happening in run_delalloc_compressed - */ - if (async_chunk->inode) - submit_compressed_extents(async_chunk); + while (!list_empty(&async_chunk->extents)) { + async_extent = list_entry(async_chunk->extents.next, + struct async_extent, list); + list_del(&async_extent->list); + submit_one_async_extent(async_chunk, async_extent, &alloc_hint); + } /* atomic_sub_return implies a barrier */ if (atomic_sub_return(nr_pages, &fs_info->async_delalloc_pages) < @@ -1715,27 +1610,9 @@ static noinline void async_cow_submit(struct btrfs_work *work) cond_wake_up_nomb(&fs_info->async_submit_wait); } -static noinline void async_cow_free(struct btrfs_work *work) -{ - struct async_chunk *async_chunk; - struct async_cow *async_cow; - - async_chunk = container_of(work, struct async_chunk, work); - if (async_chunk->inode) - btrfs_add_delayed_iput(async_chunk->inode); - if (async_chunk->blkcg_css) - css_put(async_chunk->blkcg_css); - - async_cow = async_chunk->async_cow; - if (atomic_dec_and_test(&async_cow->num_chunks)) - kvfree(async_cow); -} - static bool run_delalloc_compressed(struct btrfs_inode *inode, - struct writeback_control *wbc, - struct page *locked_page, - u64 start, u64 end, int *page_started, - unsigned long *nr_written) + struct page *locked_page, u64 start, + u64 end, struct writeback_control *wbc) { struct btrfs_fs_info *fs_info = inode->root->fs_info; struct cgroup_subsys_state *blkcg_css = wbc_blkcg_css(wbc); @@ -1809,65 +1686,42 @@ static bool run_delalloc_compressed(struct btrfs_inode *inode, async_chunk[i].blkcg_css = NULL; } - btrfs_init_work(&async_chunk[i].work, async_cow_start, - async_cow_submit, async_cow_free); + btrfs_init_work(&async_chunk[i].work, compress_file_range, + submit_compressed_extents); nr_pages = DIV_ROUND_UP(cur_end - start, PAGE_SIZE); atomic_add(nr_pages, &fs_info->async_delalloc_pages); btrfs_queue_work(fs_info->delalloc_workers, &async_chunk[i].work); - *nr_written += nr_pages; start = cur_end + 1; } - *page_started = 1; return true; } -static noinline int run_delalloc_zoned(struct btrfs_inode *inode, - struct page *locked_page, u64 start, - u64 end, int *page_started, - unsigned long *nr_written, - struct writeback_control *wbc) +/* + * Run the delalloc range from start to end, and write back any dirty pages + * covered by the range. + */ +static noinline int run_delalloc_cow(struct btrfs_inode *inode, + struct page *locked_page, u64 start, + u64 end, struct writeback_control *wbc, + bool pages_dirty) { u64 done_offset = end; int ret; - bool locked_page_done = false; while (start <= end) { - ret = cow_file_range(inode, locked_page, start, end, page_started, - nr_written, 0, &done_offset); - if (ret && ret != -EAGAIN) + ret = cow_file_range(inode, locked_page, start, end, &done_offset, + true, false); + if (ret) return ret; - - if (*page_started) { - ASSERT(ret == 0); - return 0; - } - - if (ret == 0) - done_offset = end; - - if (done_offset == start) { - wait_on_bit_io(&inode->root->fs_info->flags, - BTRFS_FS_NEED_ZONE_FINISH, - TASK_UNINTERRUPTIBLE); - continue; - } - - if (!locked_page_done) { - __set_page_dirty_nobuffers(locked_page); - account_page_redirty(locked_page); - } - locked_page_done = true; - extent_write_locked_range(&inode->vfs_inode, start, done_offset, - wbc); + extent_write_locked_range(&inode->vfs_inode, locked_page, start, + done_offset, wbc, pages_dirty); start = done_offset + 1; } - *page_started = 1; - - return 0; + return 1; } static noinline int csum_exist_in_range(struct btrfs_fs_info *fs_info, @@ -1894,8 +1748,7 @@ static noinline int csum_exist_in_range(struct btrfs_fs_info *fs_info, } static int fallback_to_cow(struct btrfs_inode *inode, struct page *locked_page, - const u64 start, const u64 end, - int *page_started, unsigned long *nr_written) + const u64 start, const u64 end) { const bool is_space_ino = btrfs_is_free_space_inode(inode); const bool is_reloc_ino = btrfs_is_data_reloc_root(inode->root); @@ -1903,6 +1756,7 @@ static int fallback_to_cow(struct btrfs_inode *inode, struct page *locked_page, struct extent_io_tree *io_tree = &inode->io_tree; u64 range_start = start; u64 count; + int ret; /* * If EXTENT_NORESERVE is set it means that when the buffered write was @@ -1955,8 +1809,14 @@ static int fallback_to_cow(struct btrfs_inode *inode, struct page *locked_page, NULL); } - return cow_file_range(inode, locked_page, start, end, page_started, - nr_written, 1, NULL); + /* + * Don't try to create inline extents, as a mix of inline extent that + * is written out and unlocked directly and a normal NOCOW extent + * doesn't work. + */ + ret = cow_file_range(inode, locked_page, start, end, NULL, false, true); + ASSERT(ret != 1); + return ret; } struct can_nocow_file_extent_args { @@ -2105,9 +1965,7 @@ static int can_nocow_file_extent(struct btrfs_path *path, */ static noinline int run_delalloc_nocow(struct btrfs_inode *inode, struct page *locked_page, - const u64 start, const u64 end, - int *page_started, - unsigned long *nr_written) + const u64 start, const u64 end) { struct btrfs_fs_info *fs_info = inode->root->fs_info; struct btrfs_root *root = inode->root; @@ -2117,25 +1975,26 @@ static noinline int run_delalloc_nocow(struct btrfs_inode *inode, int ret; bool check_prev = true; u64 ino = btrfs_ino(inode); - struct btrfs_block_group *bg; - bool nocow = false; struct can_nocow_file_extent_args nocow_args = { 0 }; + /* + * Normally on a zoned device we're only doing COW writes, but in case + * of relocation on a zoned filesystem serializes I/O so that we're only + * writing sequentially and can end up here as well. + */ + ASSERT(!btrfs_is_zoned(fs_info) || btrfs_is_data_reloc_root(root)); + path = btrfs_alloc_path(); if (!path) { - extent_clear_unlock_delalloc(inode, start, end, locked_page, - EXTENT_LOCKED | EXTENT_DELALLOC | - EXTENT_DO_ACCOUNTING | - EXTENT_DEFRAG, PAGE_UNLOCK | - PAGE_START_WRITEBACK | - PAGE_END_WRITEBACK); - return -ENOMEM; + ret = -ENOMEM; + goto error; } nocow_args.end = end; nocow_args.writeback_path = true; while (1) { + struct btrfs_block_group *nocow_bg = NULL; struct btrfs_ordered_extent *ordered; struct btrfs_key found_key; struct btrfs_file_extent_item *fi; @@ -2146,8 +2005,6 @@ static noinline int run_delalloc_nocow(struct btrfs_inode *inode, int extent_type; bool is_prealloc; - nocow = false; - ret = btrfs_lookup_file_extent(NULL, root, path, ino, cur_offset, 0); if (ret < 0) @@ -2172,11 +2029,8 @@ next_slot: leaf = path->nodes[0]; if (path->slots[0] >= btrfs_header_nritems(leaf)) { ret = btrfs_next_leaf(root, path); - if (ret < 0) { - if (cow_start != (u64)-1) - cur_offset = cow_start; + if (ret < 0) goto error; - } if (ret > 0) break; leaf = path->nodes[0]; @@ -2209,7 +2063,7 @@ next_slot: if (found_key.offset > cur_offset) { extent_end = found_key.offset; extent_type = 0; - goto out_check; + goto must_cow; } /* @@ -2239,24 +2093,22 @@ next_slot: nocow_args.start = cur_offset; ret = can_nocow_file_extent(path, &found_key, inode, &nocow_args); - if (ret < 0) { - if (cow_start != (u64)-1) - cur_offset = cow_start; + if (ret < 0) goto error; - } else if (ret == 0) { - goto out_check; - } + if (ret == 0) + goto must_cow; ret = 0; - bg = btrfs_inc_nocow_writers(fs_info, nocow_args.disk_bytenr); - if (bg) - nocow = true; -out_check: - /* - * If nocow is false then record the beginning of the range - * that needs to be COWed - */ - if (!nocow) { + nocow_bg = btrfs_inc_nocow_writers(fs_info, nocow_args.disk_bytenr); + if (!nocow_bg) { +must_cow: + /* + * If we can't perform NOCOW writeback for the range, + * then record the beginning of the range that needs to + * be COWed. It will be written out before the next + * NOCOW range if we find one, or when exiting this + * loop. + */ if (cow_start == (u64)-1) cow_start = cur_offset; cur_offset = extent_end; @@ -2275,11 +2127,12 @@ out_check: */ if (cow_start != (u64)-1) { ret = fallback_to_cow(inode, locked_page, - cow_start, found_key.offset - 1, - page_started, nr_written); - if (ret) - goto error; + cow_start, found_key.offset - 1); cow_start = (u64)-1; + if (ret) { + btrfs_dec_nocow_writers(nocow_bg); + goto error; + } } nocow_end = cur_offset + nocow_args.num_bytes - 1; @@ -2296,6 +2149,7 @@ out_check: ram_bytes, BTRFS_COMPRESS_NONE, BTRFS_ORDERED_PREALLOC); if (IS_ERR(em)) { + btrfs_dec_nocow_writers(nocow_bg); ret = PTR_ERR(em); goto error; } @@ -2309,6 +2163,7 @@ out_check: ? (1 << BTRFS_ORDERED_PREALLOC) : (1 << BTRFS_ORDERED_NOCOW), BTRFS_COMPRESS_NONE); + btrfs_dec_nocow_writers(nocow_bg); if (IS_ERR(ordered)) { if (is_prealloc) { btrfs_drop_extent_map_range(inode, cur_offset, @@ -2318,11 +2173,6 @@ out_check: goto error; } - if (nocow) { - btrfs_dec_nocow_writers(bg); - nocow = false; - } - if (btrfs_is_data_reloc_root(root)) /* * Error handled later, as we must prevent @@ -2357,17 +2207,24 @@ out_check: if (cow_start != (u64)-1) { cur_offset = end; - ret = fallback_to_cow(inode, locked_page, cow_start, end, - page_started, nr_written); + ret = fallback_to_cow(inode, locked_page, cow_start, end); + cow_start = (u64)-1; if (ret) goto error; } -error: - if (nocow) - btrfs_dec_nocow_writers(bg); + btrfs_free_path(path); + return 0; - if (ret && cur_offset < end) +error: + /* + * If an error happened while a COW region is outstanding, cur_offset + * needs to be reset to cow_start to ensure the COW region is unlocked + * as well. + */ + if (cow_start != (u64)-1) + cur_offset = cow_start; + if (cur_offset < end) extent_clear_unlock_delalloc(inode, cur_offset, end, locked_page, EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_DEFRAG | @@ -2382,8 +2239,7 @@ static bool should_nocow(struct btrfs_inode *inode, u64 start, u64 end) { if (inode->flags & (BTRFS_INODE_NODATACOW | BTRFS_INODE_PREALLOC)) { if (inode->defrag_bytes && - test_range_bit(&inode->io_tree, start, end, EXTENT_DEFRAG, - 0, NULL)) + test_range_bit_exists(&inode->io_tree, start, end, EXTENT_DEFRAG)) return false; return true; } @@ -2395,49 +2251,37 @@ static bool should_nocow(struct btrfs_inode *inode, u64 start, u64 end) * being touched for the first time. */ int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct page *locked_page, - u64 start, u64 end, int *page_started, unsigned long *nr_written, - struct writeback_control *wbc) + u64 start, u64 end, struct writeback_control *wbc) { - int ret = 0; const bool zoned = btrfs_is_zoned(inode->root->fs_info); + int ret; /* - * The range must cover part of the @locked_page, or the returned - * @page_started can confuse the caller. + * The range must cover part of the @locked_page, or a return of 1 + * can confuse the caller. */ ASSERT(!(end <= page_offset(locked_page) || start >= page_offset(locked_page) + PAGE_SIZE)); if (should_nocow(inode, start, end)) { - /* - * Normally on a zoned device we're only doing COW writes, but - * in case of relocation on a zoned filesystem we have taken - * precaution, that we're only writing sequentially. It's safe - * to use run_delalloc_nocow() here, like for regular - * preallocated inodes. - */ - ASSERT(!zoned || btrfs_is_data_reloc_root(inode->root)); - ret = run_delalloc_nocow(inode, locked_page, start, end, - page_started, nr_written); + ret = run_delalloc_nocow(inode, locked_page, start, end); goto out; } if (btrfs_inode_can_compress(inode) && inode_need_compress(inode, start, end) && - run_delalloc_compressed(inode, wbc, locked_page, start, - end, page_started, nr_written)) - goto out; + run_delalloc_compressed(inode, locked_page, start, end, wbc)) + return 1; if (zoned) - ret = run_delalloc_zoned(inode, locked_page, start, end, - page_started, nr_written, wbc); + ret = run_delalloc_cow(inode, locked_page, start, end, wbc, + true); else - ret = cow_file_range(inode, locked_page, start, end, - page_started, nr_written, 1, NULL); + ret = cow_file_range(inode, locked_page, start, end, NULL, + false, false); out: - ASSERT(ret <= 0); - if (ret) + if (ret < 0) btrfs_cleanup_ordered_extents(inode, locked_page, start, end - start + 1); return ret; @@ -2840,23 +2684,19 @@ struct btrfs_writepage_fixup { static void btrfs_writepage_fixup_worker(struct btrfs_work *work) { - struct btrfs_writepage_fixup *fixup; + struct btrfs_writepage_fixup *fixup = + container_of(work, struct btrfs_writepage_fixup, work); struct btrfs_ordered_extent *ordered; struct extent_state *cached_state = NULL; struct extent_changeset *data_reserved = NULL; - struct page *page; - struct btrfs_inode *inode; - u64 page_start; - u64 page_end; + struct page *page = fixup->page; + struct btrfs_inode *inode = fixup->inode; + struct btrfs_fs_info *fs_info = inode->root->fs_info; + u64 page_start = page_offset(page); + u64 page_end = page_offset(page) + PAGE_SIZE - 1; int ret = 0; bool free_delalloc_space = true; - fixup = container_of(work, struct btrfs_writepage_fixup, work); - page = fixup->page; - inode = fixup->inode; - page_start = page_offset(page); - page_end = page_offset(page) + PAGE_SIZE - 1; - /* * This is similar to page_mkwrite, we need to reserve the space before * we take the page lock. @@ -2949,10 +2789,11 @@ out_page: * to reflect the errors and clean the page. */ mapping_set_error(page->mapping, ret); - end_extent_writepage(page, ret, page_start, page_end); + btrfs_mark_ordered_io_finished(inode, page, page_start, + PAGE_SIZE, !ret); clear_page_dirty_for_io(page); } - btrfs_page_clear_checked(inode->root->fs_info, page, page_start, PAGE_SIZE); + btrfs_page_clear_checked(fs_info, page, page_start, PAGE_SIZE); unlock_page(page); put_page(page); kfree(fixup); @@ -3009,7 +2850,7 @@ int btrfs_writepage_cow_fixup(struct page *page) ihold(inode); btrfs_page_set_checked(fs_info, page, page_offset(page), PAGE_SIZE); get_page(page); - btrfs_init_work(&fixup->work, btrfs_writepage_fixup_worker, NULL, NULL); + btrfs_init_work(&fixup->work, btrfs_writepage_fixup_worker, NULL); fixup->page = page; fixup->inode = BTRFS_I(inode); btrfs_queue_work(fs_info->fixup_workers, &fixup->work); @@ -3074,7 +2915,7 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans, btrfs_item_ptr_offset(leaf, path->slots[0]), sizeof(struct btrfs_file_extent_item)); - btrfs_mark_buffer_dirty(leaf); + btrfs_mark_buffer_dirty(trans, leaf); btrfs_release_path(path); /* @@ -3232,7 +3073,7 @@ int btrfs_finish_one_ordered(struct btrfs_ordered_extent *ordered_extent) goto out; } trans->block_rsv = &inode->block_rsv; - ret = btrfs_update_inode_fallback(trans, root, inode); + ret = btrfs_update_inode_fallback(trans, inode); if (ret) /* -ENOMEM or corruption */ btrfs_abort_transaction(trans, ret); goto out; @@ -3253,6 +3094,10 @@ int btrfs_finish_one_ordered(struct btrfs_ordered_extent *ordered_extent) trans->block_rsv = &inode->block_rsv; + ret = btrfs_insert_raid_extent(trans, ordered_extent); + if (ret) + goto out; + if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered_extent->flags)) compress_type = ordered_extent->compress_type; if (test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) { @@ -3298,7 +3143,7 @@ int btrfs_finish_one_ordered(struct btrfs_ordered_extent *ordered_extent) &cached_state); btrfs_inode_safe_disk_i_size_write(inode, 0); - ret = btrfs_update_inode_fallback(trans, root, inode); + ret = btrfs_update_inode_fallback(trans, inode); if (ret) { /* -ENOMEM or corruption */ btrfs_abort_transaction(trans, ret); goto out; @@ -3359,6 +3204,13 @@ out: btrfs_free_reserved_extent(fs_info, ordered_extent->disk_bytenr, ordered_extent->disk_num_bytes, 1); + /* + * Actually free the qgroup rsv which was released when + * the ordered extent was created. + */ + btrfs_qgroup_free_refroot(fs_info, inode->root->root_key.objectid, + ordered_extent->qgroup_rsv, + BTRFS_QGROUP_RSV_DATA); } } @@ -3379,20 +3231,12 @@ out: int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered) { if (btrfs_is_zoned(btrfs_sb(ordered->inode->i_sb)) && - !test_bit(BTRFS_ORDERED_IOERR, &ordered->flags)) + !test_bit(BTRFS_ORDERED_IOERR, &ordered->flags) && + list_empty(&ordered->bioc_list)) btrfs_finish_ordered_zoned(ordered); return btrfs_finish_one_ordered(ordered); } -void btrfs_writepage_endio_finish_ordered(struct btrfs_inode *inode, - struct page *page, u64 start, - u64 end, bool uptodate) -{ - trace_btrfs_writepage_end_io_hook(inode, start, end, uptodate); - - btrfs_mark_ordered_io_finished(inode, page, start, end + 1 - start, uptodate); -} - /* * Verify the checksum for a single sector without any extra action that depend * on the type of I/O. @@ -3446,7 +3290,7 @@ bool btrfs_data_csum_ok(struct btrfs_bio *bbio, struct btrfs_device *dev, if (btrfs_is_data_reloc_root(inode->root) && test_range_bit(&inode->io_tree, file_offset, end, EXTENT_NODATASUM, - 1, NULL)) { + NULL)) { /* Skip the range without csum for data reloc inode */ clear_extent_bits(&inode->io_tree, file_offset, end, EXTENT_NODATASUM); @@ -3470,7 +3314,7 @@ zeroit: } /* - * btrfs_add_delayed_iput - perform a delayed iput on @inode + * Perform a delayed iput on @inode. * * @inode: The inode we want to perform iput on * @@ -3662,9 +3506,16 @@ int btrfs_orphan_cleanup(struct btrfs_root *root) */ if (found_key.offset == last_objectid) { + /* + * We found the same inode as before. This means we were + * not able to remove its items via eviction triggered + * by an iput(). A transaction abort may have happened, + * due to -ENOSPC for example, so try to grab the error + * that lead to a transaction abort, if any. + */ btrfs_err(fs_info, "Error removing orphan entry, stopping orphan cleanup"); - ret = -EINVAL; + ret = BTRFS_FS_ERROR(fs_info) ?: -EINVAL; goto out; } @@ -3911,19 +3762,17 @@ static int btrfs_read_locked_inode(struct inode *inode, btrfs_inode_set_file_extent_range(BTRFS_I(inode), 0, round_up(i_size_read(inode), fs_info->sectorsize)); - inode->i_atime.tv_sec = btrfs_timespec_sec(leaf, &inode_item->atime); - inode->i_atime.tv_nsec = btrfs_timespec_nsec(leaf, &inode_item->atime); + inode_set_atime(inode, btrfs_timespec_sec(leaf, &inode_item->atime), + btrfs_timespec_nsec(leaf, &inode_item->atime)); - inode->i_mtime.tv_sec = btrfs_timespec_sec(leaf, &inode_item->mtime); - inode->i_mtime.tv_nsec = btrfs_timespec_nsec(leaf, &inode_item->mtime); + inode_set_mtime(inode, btrfs_timespec_sec(leaf, &inode_item->mtime), + btrfs_timespec_nsec(leaf, &inode_item->mtime)); - inode->i_ctime.tv_sec = btrfs_timespec_sec(leaf, &inode_item->ctime); - inode->i_ctime.tv_nsec = btrfs_timespec_nsec(leaf, &inode_item->ctime); + inode_set_ctime(inode, btrfs_timespec_sec(leaf, &inode_item->ctime), + btrfs_timespec_nsec(leaf, &inode_item->ctime)); - BTRFS_I(inode)->i_otime.tv_sec = - btrfs_timespec_sec(leaf, &inode_item->otime); - BTRFS_I(inode)->i_otime.tv_nsec = - btrfs_timespec_nsec(leaf, &inode_item->otime); + BTRFS_I(inode)->i_otime_sec = btrfs_timespec_sec(leaf, &inode_item->otime); + BTRFS_I(inode)->i_otime_nsec = btrfs_timespec_nsec(leaf, &inode_item->otime); inode_set_bytes(inode, btrfs_inode_nbytes(leaf, inode_item)); BTRFS_I(inode)->generation = btrfs_inode_generation(leaf, inode_item); @@ -3949,7 +3798,7 @@ cache_index: * This is required for both inode re-read from disk and delayed inode * in delayed_nodes_tree. */ - if (BTRFS_I(inode)->last_trans == fs_info->generation) + if (BTRFS_I(inode)->last_trans == btrfs_get_fs_generation(fs_info)) set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags); @@ -4079,24 +3928,22 @@ static void fill_inode_item(struct btrfs_trans_handle *trans, btrfs_set_token_inode_nlink(&token, item, inode->i_nlink); btrfs_set_token_timespec_sec(&token, &item->atime, - inode->i_atime.tv_sec); + inode_get_atime_sec(inode)); btrfs_set_token_timespec_nsec(&token, &item->atime, - inode->i_atime.tv_nsec); + inode_get_atime_nsec(inode)); btrfs_set_token_timespec_sec(&token, &item->mtime, - inode->i_mtime.tv_sec); + inode_get_mtime_sec(inode)); btrfs_set_token_timespec_nsec(&token, &item->mtime, - inode->i_mtime.tv_nsec); + inode_get_mtime_nsec(inode)); btrfs_set_token_timespec_sec(&token, &item->ctime, - inode->i_ctime.tv_sec); + inode_get_ctime_sec(inode)); btrfs_set_token_timespec_nsec(&token, &item->ctime, - inode->i_ctime.tv_nsec); + inode_get_ctime_nsec(inode)); - btrfs_set_token_timespec_sec(&token, &item->otime, - BTRFS_I(inode)->i_otime.tv_sec); - btrfs_set_token_timespec_nsec(&token, &item->otime, - BTRFS_I(inode)->i_otime.tv_nsec); + btrfs_set_token_timespec_sec(&token, &item->otime, BTRFS_I(inode)->i_otime_sec); + btrfs_set_token_timespec_nsec(&token, &item->otime, BTRFS_I(inode)->i_otime_nsec); btrfs_set_token_inode_nbytes(&token, item, inode_get_bytes(inode)); btrfs_set_token_inode_generation(&token, item, @@ -4114,8 +3961,7 @@ static void fill_inode_item(struct btrfs_trans_handle *trans, * copy everything in the in-memory inode into the btree. */ static noinline int btrfs_update_inode_item(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_inode *inode) + struct btrfs_inode *inode) { struct btrfs_inode_item *inode_item; struct btrfs_path *path; @@ -4126,7 +3972,7 @@ static noinline int btrfs_update_inode_item(struct btrfs_trans_handle *trans, if (!path) return -ENOMEM; - ret = btrfs_lookup_inode(trans, root, path, &inode->location, 1); + ret = btrfs_lookup_inode(trans, inode->root, path, &inode->location, 1); if (ret) { if (ret > 0) ret = -ENOENT; @@ -4138,7 +3984,7 @@ static noinline int btrfs_update_inode_item(struct btrfs_trans_handle *trans, struct btrfs_inode_item); fill_inode_item(trans, leaf, inode_item, &inode->vfs_inode); - btrfs_mark_buffer_dirty(leaf); + btrfs_mark_buffer_dirty(trans, leaf); btrfs_set_inode_last_trans(trans, inode); ret = 0; failed: @@ -4149,10 +3995,10 @@ failed: /* * copy everything in the in-memory inode into the btree. */ -noinline int btrfs_update_inode(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_inode *inode) +int btrfs_update_inode(struct btrfs_trans_handle *trans, + struct btrfs_inode *inode) { + struct btrfs_root *root = inode->root; struct btrfs_fs_info *fs_info = root->fs_info; int ret; @@ -4168,23 +4014,23 @@ noinline int btrfs_update_inode(struct btrfs_trans_handle *trans, && !test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags)) { btrfs_update_root_times(trans, root); - ret = btrfs_delayed_update_inode(trans, root, inode); + ret = btrfs_delayed_update_inode(trans, inode); if (!ret) btrfs_set_inode_last_trans(trans, inode); return ret; } - return btrfs_update_inode_item(trans, root, inode); + return btrfs_update_inode_item(trans, inode); } int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct btrfs_inode *inode) + struct btrfs_inode *inode) { int ret; - ret = btrfs_update_inode(trans, root, inode); + ret = btrfs_update_inode(trans, inode); if (ret == -ENOSPC) - return btrfs_update_inode_item(trans, root, inode); + return btrfs_update_inode_item(trans, inode); return ret; } @@ -4289,10 +4135,8 @@ err: btrfs_i_size_write(dir, dir->vfs_inode.i_size - name->len * 2); inode_inc_iversion(&inode->vfs_inode); inode_inc_iversion(&dir->vfs_inode); - inode->vfs_inode.i_ctime = current_time(&inode->vfs_inode); - dir->vfs_inode.i_mtime = inode->vfs_inode.i_ctime; - dir->vfs_inode.i_ctime = inode->vfs_inode.i_ctime; - ret = btrfs_update_inode(trans, root, dir); + inode_set_mtime_to_ts(&dir->vfs_inode, inode_set_ctime_current(&dir->vfs_inode)); + ret = btrfs_update_inode(trans, dir); out: return ret; } @@ -4306,7 +4150,7 @@ int btrfs_unlink_inode(struct btrfs_trans_handle *trans, ret = __btrfs_unlink_inode(trans, dir, inode, name, NULL); if (!ret) { drop_nlink(&inode->vfs_inode); - ret = btrfs_update_inode(trans, inode->root, inode); + ret = btrfs_update_inode(trans, inode); } return ret; } @@ -4464,9 +4308,8 @@ static int btrfs_unlink_subvol(struct btrfs_trans_handle *trans, btrfs_i_size_write(dir, dir->vfs_inode.i_size - fname.disk_name.len * 2); inode_inc_iversion(&dir->vfs_inode); - dir->vfs_inode.i_mtime = current_time(&dir->vfs_inode); - dir->vfs_inode.i_ctime = dir->vfs_inode.i_mtime; - ret = btrfs_update_inode_fallback(trans, root, dir); + inode_set_mtime_to_ts(&dir->vfs_inode, inode_set_ctime_current(&dir->vfs_inode)); + ret = btrfs_update_inode_fallback(trans, dir); if (ret) btrfs_abort_transaction(trans, ret); out: @@ -4800,7 +4643,8 @@ out_notrans: } /* - * btrfs_truncate_block - read, zero a chunk and write a block + * Read, zero a chunk and write a block. + * * @inode - inode that we're zeroing * @from - the offset to start zeroing * @len - the length to zero, 0 to zero the entire range respective to the @@ -4950,9 +4794,9 @@ out: return ret; } -static int maybe_insert_hole(struct btrfs_root *root, struct btrfs_inode *inode, - u64 offset, u64 len) +static int maybe_insert_hole(struct btrfs_inode *inode, u64 offset, u64 len) { + struct btrfs_root *root = inode->root; struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_trans_handle *trans; struct btrfs_drop_extents_args drop_args = { 0 }; @@ -4992,7 +4836,7 @@ static int maybe_insert_hole(struct btrfs_root *root, struct btrfs_inode *inode, btrfs_abort_transaction(trans, ret); } else { btrfs_update_inode_bytes(inode, 0, drop_args.bytes_found); - btrfs_update_inode(trans, root, inode); + btrfs_update_inode(trans, inode); } btrfs_end_transaction(trans); return ret; @@ -5048,8 +4892,7 @@ int btrfs_cont_expand(struct btrfs_inode *inode, loff_t oldsize, loff_t size) if (!test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) { struct extent_map *hole_em; - err = maybe_insert_hole(root, inode, cur_offset, - hole_size); + err = maybe_insert_hole(inode, cur_offset, hole_size); if (err) break; @@ -5075,7 +4918,7 @@ int btrfs_cont_expand(struct btrfs_inode *inode, loff_t oldsize, loff_t size) hole_em->orig_block_len = 0; hole_em->ram_bytes = hole_size; hole_em->compress_type = BTRFS_COMPRESS_NONE; - hole_em->generation = fs_info->generation; + hole_em->generation = btrfs_get_fs_generation(fs_info); err = btrfs_replace_extent_map_range(inode, hole_em, true); free_extent_map(hole_em); @@ -5115,8 +4958,8 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr) if (newsize != oldsize) { inode_inc_iversion(inode); if (!(mask & (ATTR_CTIME | ATTR_MTIME))) { - inode->i_mtime = current_time(inode); - inode->i_ctime = inode->i_mtime; + inode_set_mtime_to_ts(inode, + inode_set_ctime_current(inode)); } } @@ -5144,7 +4987,7 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr) i_size_write(inode, newsize); btrfs_inode_safe_disk_i_size_write(BTRFS_I(inode), 0); pagecache_isize_extended(inode, oldsize, newsize); - ret = btrfs_update_inode(trans, root, BTRFS_I(inode)); + ret = btrfs_update_inode(trans, BTRFS_I(inode)); btrfs_drew_write_unlock(&root->snapshot_lock); btrfs_end_transaction(trans); } else { @@ -5289,7 +5132,7 @@ static void evict_inode_truncate_pages(struct inode *inode) */ if (state_flags & EXTENT_DELALLOC) btrfs_qgroup_free_data(BTRFS_I(inode), NULL, start, - end - start + 1); + end - start + 1, NULL); clear_extent_bit(io_tree, start, end, EXTENT_CLEAR_ALL_BITS | EXTENT_DO_ACCOUNTING, @@ -5738,11 +5581,12 @@ struct inode *btrfs_iget(struct super_block *s, u64 ino, struct btrfs_root *root return btrfs_iget_path(s, ino, root, NULL); } -static struct inode *new_simple_dir(struct super_block *s, +static struct inode *new_simple_dir(struct inode *dir, struct btrfs_key *key, struct btrfs_root *root) { - struct inode *inode = new_inode(s); + struct timespec64 ts; + struct inode *inode = new_inode(dir->i_sb); if (!inode) return ERR_PTR(-ENOMEM); @@ -5760,10 +5604,15 @@ static struct inode *new_simple_dir(struct super_block *s, inode->i_opflags &= ~IOP_XATTR; inode->i_fop = &simple_dir_operations; inode->i_mode = S_IFDIR | S_IRUGO | S_IWUSR | S_IXUGO; - inode->i_mtime = current_time(inode); - inode->i_atime = inode->i_mtime; - inode->i_ctime = inode->i_mtime; - BTRFS_I(inode)->i_otime = inode->i_mtime; + + ts = inode_set_ctime_current(inode); + inode_set_mtime_to_ts(inode, ts); + inode_set_atime_to_ts(inode, inode_get_atime(dir)); + BTRFS_I(inode)->i_otime_sec = ts.tv_sec; + BTRFS_I(inode)->i_otime_nsec = ts.tv_nsec; + + inode->i_uid = dir->i_uid; + inode->i_gid = dir->i_gid; return inode; } @@ -5822,7 +5671,7 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry) if (ret != -ENOENT) inode = ERR_PTR(ret); else - inode = new_simple_dir(dir->i_sb, &location, root); + inode = new_simple_dir(dir, &location, root); } else { inode = btrfs_iget(dir->i_sb, location.objectid, sub_root); btrfs_put_root(sub_root); @@ -5924,20 +5773,24 @@ out: static int btrfs_get_dir_last_index(struct btrfs_inode *dir, u64 *index) { - if (dir->index_cnt == (u64)-1) { - int ret; + int ret = 0; + btrfs_inode_lock(dir, 0); + if (dir->index_cnt == (u64)-1) { ret = btrfs_inode_delayed_dir_index_count(dir); if (ret) { ret = btrfs_set_inode_index_count(dir); if (ret) - return ret; + goto out; } } - *index = dir->index_cnt; + /* index_cnt is the index number of next new entry, so decrement it. */ + *index = dir->index_cnt - 1; +out: + btrfs_inode_unlock(dir, 0); - return 0; + return ret; } /* @@ -5972,6 +5825,19 @@ static int btrfs_opendir(struct inode *inode, struct file *file) return 0; } +static loff_t btrfs_dir_llseek(struct file *file, loff_t offset, int whence) +{ + struct btrfs_file_private *private = file->private_data; + int ret; + + ret = btrfs_get_dir_last_index(BTRFS_I(file_inode(file)), + &private->last_index); + if (ret) + return ret; + + return generic_file_llseek(file, offset, whence); +} + struct dir_entry { u64 ino; u64 offset; @@ -6007,8 +5873,8 @@ static int btrfs_real_readdir(struct file *file, struct dir_context *ctx) struct btrfs_key found_key; struct btrfs_path *path; void *addr; - struct list_head ins_list; - struct list_head del_list; + LIST_HEAD(ins_list); + LIST_HEAD(del_list); int ret; char *name_ptr; int name_len; @@ -6027,8 +5893,6 @@ static int btrfs_real_readdir(struct file *file, struct dir_context *ctx) addr = private->filldir_buf; path->reada = READA_FORWARD; - INIT_LIST_HEAD(&ins_list); - INIT_LIST_HEAD(&del_list); put = btrfs_readdir_get_delayed_items(inode, private->last_index, &ins_list, &del_list); @@ -6144,15 +6008,15 @@ static int btrfs_dirty_inode(struct btrfs_inode *inode) if (IS_ERR(trans)) return PTR_ERR(trans); - ret = btrfs_update_inode(trans, root, inode); - if (ret && (ret == -ENOSPC || ret == -EDQUOT)) { + ret = btrfs_update_inode(trans, inode); + if (ret == -ENOSPC || ret == -EDQUOT) { /* whoops, lets try again with the full transaction */ btrfs_end_transaction(trans); trans = btrfs_start_transaction(root, 1); if (IS_ERR(trans)) return PTR_ERR(trans); - ret = btrfs_update_inode(trans, root, inode); + ret = btrfs_update_inode(trans, inode); } btrfs_end_transaction(trans); if (inode->delayed_node) @@ -6165,23 +6029,15 @@ static int btrfs_dirty_inode(struct btrfs_inode *inode) * This is a copy of file_update_time. We need this so we can return error on * ENOSPC for updating the inode in the case of file write and mmap writes. */ -static int btrfs_update_time(struct inode *inode, struct timespec64 *now, - int flags) +static int btrfs_update_time(struct inode *inode, int flags) { struct btrfs_root *root = BTRFS_I(inode)->root; - bool dirty = flags & ~S_VERSION; + bool dirty; if (btrfs_root_readonly(root)) return -EROFS; - if (flags & S_VERSION) - dirty |= inode_maybe_inc_iversion(inode, dirty); - if (flags & S_CTIME) - inode->i_ctime = *now; - if (flags & S_MTIME) - inode->i_mtime = *now; - if (flags & S_ATIME) - inode->i_atime = *now; + dirty = inode_update_timestamps(inode, flags); return dirty ? btrfs_dirty_inode(BTRFS_I(inode)) : 0; } @@ -6312,6 +6168,7 @@ static void btrfs_inherit_iflags(struct btrfs_inode *inode, struct btrfs_inode * int btrfs_create_new_inode(struct btrfs_trans_handle *trans, struct btrfs_new_inode_args *args) { + struct timespec64 ts; struct inode *dir = args->dir; struct inode *inode = args->inode; const struct fscrypt_str *name = args->orphan ? NULL : &args->fname.disk_name; @@ -6429,10 +6286,9 @@ int btrfs_create_new_inode(struct btrfs_trans_handle *trans, goto discard; } - inode->i_mtime = current_time(inode); - inode->i_atime = inode->i_mtime; - inode->i_ctime = inode->i_mtime; - BTRFS_I(inode)->i_otime = inode->i_mtime; + ts = simple_inode_init_ts(inode); + BTRFS_I(inode)->i_otime_sec = ts.tv_sec; + BTRFS_I(inode)->i_otime_nsec = ts.tv_nsec; /* * We're going to fill the inode item now, so at this point the inode @@ -6463,7 +6319,7 @@ int btrfs_create_new_inode(struct btrfs_trans_handle *trans, } } - btrfs_mark_buffer_dirty(path->nodes[0]); + btrfs_mark_buffer_dirty(trans, path->nodes[0]); /* * We don't need the path anymore, plus inheriting properties, adding * ACLs, security xattrs, orphan item or adding the link, will result in @@ -6596,13 +6452,11 @@ int btrfs_add_link(struct btrfs_trans_handle *trans, * log replay procedure is responsible for setting them to their correct * values (the ones it had when the fsync was done). */ - if (!test_bit(BTRFS_FS_LOG_RECOVERING, &root->fs_info->flags)) { - struct timespec64 now = current_time(&parent_inode->vfs_inode); + if (!test_bit(BTRFS_FS_LOG_RECOVERING, &root->fs_info->flags)) + inode_set_mtime_to_ts(&parent_inode->vfs_inode, + inode_set_ctime_current(&parent_inode->vfs_inode)); - parent_inode->vfs_inode.i_mtime = now; - parent_inode->vfs_inode.i_ctime = now; - } - ret = btrfs_update_inode(trans, root, parent_inode); + ret = btrfs_update_inode(trans, parent_inode); if (ret) btrfs_abort_transaction(trans, ret); return ret; @@ -6741,7 +6595,7 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir, BTRFS_I(inode)->dir_index = 0ULL; inc_nlink(inode); inode_inc_iversion(inode); - inode->i_ctime = current_time(inode); + inode_set_ctime_current(inode); ihold(inode); set_bit(BTRFS_INODE_COPY_EVERYTHING, &BTRFS_I(inode)->runtime_flags); @@ -6753,7 +6607,7 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir, } else { struct dentry *parent = dentry->d_parent; - err = btrfs_update_inode(trans, root, BTRFS_I(inode)); + err = btrfs_update_inode(trans, BTRFS_I(inode)); if (err) goto fail; if (inode->i_nlink == 1) { @@ -7129,8 +6983,15 @@ static struct extent_map *btrfs_new_extent_direct(struct btrfs_inode *inode, int ret; alloc_hint = get_extent_allocation_hint(inode, start, len); +again: ret = btrfs_reserve_extent(root, len, len, fs_info->sectorsize, 0, alloc_hint, &ins, 1, 1); + if (ret == -EAGAIN) { + ASSERT(btrfs_is_zoned(fs_info)); + wait_on_bit_io(&inode->root->fs_info->flags, BTRFS_FS_NEED_ZONE_FINISH, + TASK_UNINTERRUPTIBLE); + goto again; + } if (ret) return ERR_PTR(ret); @@ -7258,8 +7119,7 @@ noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len, range_end = round_up(offset + nocow_args.num_bytes, root->fs_info->sectorsize) - 1; - ret = test_range_bit(io_tree, offset, range_end, - EXTENT_DELALLOC, 0, NULL); + ret = test_range_bit_exists(io_tree, offset, range_end, EXTENT_DELALLOC); if (ret) { ret = -EAGAIN; goto out; @@ -8160,11 +8020,11 @@ static void btrfs_invalidate_folio(struct folio *folio, size_t offset, EXTENT_LOCKED | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, &cached_state); - spin_lock_irq(&inode->ordered_tree.lock); + spin_lock_irq(&inode->ordered_tree_lock); set_bit(BTRFS_ORDERED_TRUNCATED, &ordered->flags); ordered->truncated_len = min(ordered->truncated_len, cur - ordered->file_offset); - spin_unlock_irq(&inode->ordered_tree.lock); + spin_unlock_irq(&inode->ordered_tree_lock); /* * If the ordered extent has finished, we're safe to delete all @@ -8199,7 +8059,7 @@ next: * reserved data space. * Since the IO will never happen for this page. */ - btrfs_qgroup_free_data(inode, NULL, cur, range_end + 1 - cur); + btrfs_qgroup_free_data(inode, NULL, cur, range_end + 1 - cur, NULL); if (!inode_evicting) { clear_extent_bit(tree, cur, range_end, EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_UPTODATE | @@ -8494,7 +8354,7 @@ static int btrfs_truncate(struct btrfs_inode *inode, bool skip_writeback) if (ret != -ENOSPC && ret != -EAGAIN) break; - ret = btrfs_update_inode(trans, root, inode); + ret = btrfs_update_inode(trans, inode); if (ret) break; @@ -8547,7 +8407,7 @@ static int btrfs_truncate(struct btrfs_inode *inode, bool skip_writeback) int ret2; trans->block_rsv = &fs_info->trans_block_rsv; - ret2 = btrfs_update_inode(trans, root, inode); + ret2 = btrfs_update_inode(trans, inode); if (ret2 && !ret) ret = ret2; @@ -8636,8 +8496,8 @@ struct inode *btrfs_alloc_inode(struct super_block *sb) ei->delayed_node = NULL; - ei->i_otime.tv_sec = 0; - ei->i_otime.tv_nsec = 0; + ei->i_otime_sec = 0; + ei->i_otime_nsec = 0; inode = &ei->vfs_inode; extent_map_tree_init(&ei->extent_tree); @@ -8646,7 +8506,9 @@ struct inode *btrfs_alloc_inode(struct super_block *sb) extent_io_tree_init(fs_info, &ei->file_extent_tree, IO_TREE_INODE_FILE_EXTENT); mutex_init(&ei->log_mutex); - btrfs_ordered_inode_tree_init(&ei->ordered_tree); + spin_lock_init(&ei->ordered_tree_lock); + ei->ordered_tree = RB_ROOT; + ei->ordered_tree_last = NULL; INIT_LIST_HEAD(&ei->delalloc_inodes); INIT_LIST_HEAD(&ei->delayed_iput); RB_CLEAR_NODE(&ei->rb_node); @@ -8789,8 +8651,8 @@ static int btrfs_getattr(struct mnt_idmap *idmap, u32 bi_ro_flags = BTRFS_I(inode)->ro_flags; stat->result_mask |= STATX_BTIME; - stat->btime.tv_sec = BTRFS_I(inode)->i_otime.tv_sec; - stat->btime.tv_nsec = BTRFS_I(inode)->i_otime.tv_nsec; + stat->btime.tv_sec = BTRFS_I(inode)->i_otime_sec; + stat->btime.tv_nsec = BTRFS_I(inode)->i_otime_nsec; if (bi_flags & BTRFS_INODE_APPEND) stat->attributes |= STATX_ATTR_APPEND; if (bi_flags & BTRFS_INODE_COMPRESS) @@ -8807,7 +8669,7 @@ static int btrfs_getattr(struct mnt_idmap *idmap, STATX_ATTR_IMMUTABLE | STATX_ATTR_NODUMP); - generic_fillattr(idmap, inode, stat); + generic_fillattr(idmap, request_mask, inode, stat); stat->dev = BTRFS_I(inode)->root->anon_dev; spin_lock(&BTRFS_I(inode)->lock); @@ -8831,7 +8693,6 @@ static int btrfs_rename_exchange(struct inode *old_dir, struct btrfs_root *dest = BTRFS_I(new_dir)->root; struct inode *new_inode = new_dentry->d_inode; struct inode *old_inode = old_dentry->d_inode; - struct timespec64 ctime = current_time(old_inode); struct btrfs_rename_ctx old_rename_ctx; struct btrfs_rename_ctx new_rename_ctx; u64 old_ino = btrfs_ino(BTRFS_I(old_inode)); @@ -8962,12 +8823,7 @@ static int btrfs_rename_exchange(struct inode *old_dir, inode_inc_iversion(new_dir); inode_inc_iversion(old_inode); inode_inc_iversion(new_inode); - old_dir->i_mtime = ctime; - old_dir->i_ctime = ctime; - new_dir->i_mtime = ctime; - new_dir->i_ctime = ctime; - old_inode->i_ctime = ctime; - new_inode->i_ctime = ctime; + simple_rename_timestamp(old_dir, old_dentry, new_dir, new_dentry); if (old_dentry->d_parent != new_dentry->d_parent) { btrfs_record_unlink_dir(trans, BTRFS_I(old_dir), @@ -8984,7 +8840,7 @@ static int btrfs_rename_exchange(struct inode *old_dir, BTRFS_I(old_dentry->d_inode), old_name, &old_rename_ctx); if (!ret) - ret = btrfs_update_inode(trans, root, BTRFS_I(old_inode)); + ret = btrfs_update_inode(trans, BTRFS_I(old_inode)); } if (ret) { btrfs_abort_transaction(trans, ret); @@ -8999,7 +8855,7 @@ static int btrfs_rename_exchange(struct inode *old_dir, BTRFS_I(new_dentry->d_inode), new_name, &new_rename_ctx); if (!ret) - ret = btrfs_update_inode(trans, dest, BTRFS_I(new_inode)); + ret = btrfs_update_inode(trans, BTRFS_I(new_inode)); } if (ret) { btrfs_abort_transaction(trans, ret); @@ -9231,11 +9087,7 @@ static int btrfs_rename(struct mnt_idmap *idmap, inode_inc_iversion(old_dir); inode_inc_iversion(new_dir); inode_inc_iversion(old_inode); - old_dir->i_mtime = current_time(old_dir); - old_dir->i_ctime = old_dir->i_mtime; - new_dir->i_mtime = old_dir->i_mtime; - new_dir->i_ctime = old_dir->i_mtime; - old_inode->i_ctime = old_dir->i_mtime; + simple_rename_timestamp(old_dir, old_dentry, new_dir, new_dentry); if (old_dentry->d_parent != new_dentry->d_parent) btrfs_record_unlink_dir(trans, BTRFS_I(old_dir), @@ -9248,7 +9100,7 @@ static int btrfs_rename(struct mnt_idmap *idmap, BTRFS_I(d_inode(old_dentry)), &old_fname.disk_name, &rename_ctx); if (!ret) - ret = btrfs_update_inode(trans, root, BTRFS_I(old_inode)); + ret = btrfs_update_inode(trans, BTRFS_I(old_inode)); } if (ret) { btrfs_abort_transaction(trans, ret); @@ -9257,7 +9109,6 @@ static int btrfs_rename(struct mnt_idmap *idmap, if (new_inode) { inode_inc_iversion(new_inode); - new_inode->i_ctime = current_time(new_inode); if (unlikely(btrfs_ino(BTRFS_I(new_inode)) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) { ret = btrfs_unlink_subvol(trans, BTRFS_I(new_dir), new_dentry); @@ -9374,7 +9225,7 @@ static struct btrfs_delalloc_work *btrfs_alloc_delalloc_work(struct inode *inode init_completion(&work->completion); INIT_LIST_HEAD(&work->list); work->inode = inode; - btrfs_init_work(&work->work, btrfs_run_delalloc_work, NULL, NULL); + btrfs_init_work(&work->work, btrfs_run_delalloc_work, NULL); return work; } @@ -9390,14 +9241,11 @@ static int start_delalloc_inodes(struct btrfs_root *root, struct btrfs_inode *binode; struct inode *inode; struct btrfs_delalloc_work *work, *next; - struct list_head works; - struct list_head splice; + LIST_HEAD(works); + LIST_HEAD(splice); int ret = 0; bool full_flush = wbc->nr_to_write == LONG_MAX; - INIT_LIST_HEAD(&works); - INIT_LIST_HEAD(&splice); - mutex_lock(&root->delalloc_mutex); spin_lock(&root->delalloc_lock); list_splice_init(&root->delalloc_inodes, &splice); @@ -9485,14 +9333,12 @@ int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, long nr, .range_end = LLONG_MAX, }; struct btrfs_root *root; - struct list_head splice; + LIST_HEAD(splice); int ret; if (BTRFS_FS_ERROR(fs_info)) return -EROFS; - INIT_LIST_HEAD(&splice); - mutex_lock(&fs_info->delalloc_root_mutex); spin_lock(&fs_info->delalloc_root_lock); list_splice_init(&fs_info->delalloc_roots, &splice); @@ -9617,7 +9463,7 @@ static int btrfs_symlink(struct mnt_idmap *idmap, struct inode *dir, ptr = btrfs_file_extent_inline_start(ei); write_extent_buffer(leaf, symname, ptr, name_len); - btrfs_mark_buffer_dirty(leaf); + btrfs_mark_buffer_dirty(trans, leaf); btrfs_free_path(path); d_instantiate_new(dentry, inode); @@ -9645,7 +9491,7 @@ static struct btrfs_trans_handle *insert_prealloc_file_extent( struct btrfs_path *path; u64 start = ins->objectid; u64 len = ins->offset; - int qgroup_released; + u64 qgroup_released = 0; int ret; memset(&stack_fi, 0, sizeof(stack_fi)); @@ -9658,9 +9504,9 @@ static struct btrfs_trans_handle *insert_prealloc_file_extent( btrfs_set_stack_file_extent_compression(&stack_fi, BTRFS_COMPRESS_NONE); /* Encryption and other encoding is reserved and all 0 */ - qgroup_released = btrfs_qgroup_release_data(inode, file_offset, len); - if (qgroup_released < 0) - return ERR_PTR(qgroup_released); + ret = btrfs_qgroup_release_data(inode, file_offset, len, &qgroup_released); + if (ret < 0) + return ERR_PTR(ret); if (trans) { ret = insert_reserved_file_extent(trans, inode, @@ -9797,7 +9643,7 @@ next: *alloc_hint = ins.objectid + ins.offset; inode_inc_iversion(inode); - inode->i_ctime = current_time(inode); + inode_set_ctime_current(inode); BTRFS_I(inode)->flags |= BTRFS_INODE_PREALLOC; if (!(mode & FALLOC_FL_KEEP_SIZE) && (actual_len > inode->i_size) && @@ -9810,7 +9656,7 @@ next: btrfs_inode_safe_disk_i_size_write(BTRFS_I(inode), 0); } - ret = btrfs_update_inode(trans, root, BTRFS_I(inode)); + ret = btrfs_update_inode(trans, BTRFS_I(inode)); if (ret) { btrfs_abort_transaction(trans, ret); @@ -10555,7 +10401,7 @@ out_delalloc_release: btrfs_delalloc_release_metadata(inode, disk_num_bytes, ret < 0); out_qgroup_free_data: if (ret < 0) - btrfs_qgroup_free_data(inode, data_reserved, start, num_bytes); + btrfs_qgroup_free_data(inode, data_reserved, start, num_bytes, NULL); out_free_data_space: /* * If btrfs_reserve_extent() succeeded, then we already decremented @@ -11052,7 +10898,7 @@ static const struct inode_operations btrfs_dir_inode_operations = { }; static const struct file_operations btrfs_dir_file_operations = { - .llseek = generic_file_llseek, + .llseek = btrfs_dir_llseek, .read = generic_read_dir, .iterate_shared = btrfs_real_readdir, .open = btrfs_opendir, diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index a895d105464b..a1743904202b 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -384,8 +384,8 @@ update_flags: binode->flags = binode_flags; btrfs_sync_inode_flags_to_i_flags(inode); inode_inc_iversion(inode); - inode->i_ctime = current_time(inode); - ret = btrfs_update_inode(trans, root, BTRFS_I(inode)); + inode_set_ctime_current(inode); + ret = btrfs_update_inode(trans, BTRFS_I(inode)); out_end_trans: btrfs_end_transaction(trans); @@ -652,18 +652,18 @@ static noinline int create_subvol(struct mnt_idmap *idmap, /* Tree log can't currently deal with an inode which is a new root. */ btrfs_set_log_full_commit(trans); - ret = btrfs_qgroup_inherit(trans, 0, objectid, inherit); + ret = btrfs_qgroup_inherit(trans, 0, objectid, root->root_key.objectid, inherit); if (ret) goto out; leaf = btrfs_alloc_tree_block(trans, root, 0, objectid, NULL, 0, 0, 0, - BTRFS_NESTING_NORMAL); + 0, BTRFS_NESTING_NORMAL); if (IS_ERR(leaf)) { ret = PTR_ERR(leaf); goto out; } - btrfs_mark_buffer_dirty(leaf); + btrfs_mark_buffer_dirty(trans, leaf); inode_item = &root_item->inode; btrfs_set_stack_inode_generation(inode_item, 1); @@ -1290,6 +1290,15 @@ static noinline int __btrfs_ioctl_snap_create(struct file *file, * are limited to own subvolumes only */ ret = -EPERM; + } else if (btrfs_ino(BTRFS_I(src_inode)) != BTRFS_FIRST_FREE_OBJECTID) { + /* + * Snapshots must be made with the src_inode referring + * to the subvolume inode, otherwise the permission + * checking above is useless because we may have + * permission on a lower directory but not the subvol + * itself. + */ + ret = -EINVAL; } else { ret = btrfs_mksnapshot(&file->f_path, idmap, name, namelen, @@ -1528,7 +1537,7 @@ static noinline int key_in_sk(struct btrfs_key *key, static noinline int copy_to_sk(struct btrfs_path *path, struct btrfs_key *key, struct btrfs_ioctl_search_key *sk, - size_t *buf_size, + u64 *buf_size, char __user *ubuf, unsigned long *sk_offset, int *num_found) @@ -1660,7 +1669,7 @@ out: static noinline int search_ioctl(struct inode *inode, struct btrfs_ioctl_search_key *sk, - size_t *buf_size, + u64 *buf_size, char __user *ubuf) { struct btrfs_fs_info *info = btrfs_sb(inode->i_sb); @@ -1733,7 +1742,7 @@ static noinline int btrfs_ioctl_tree_search(struct inode *inode, struct btrfs_ioctl_search_args __user *uargs = argp; struct btrfs_ioctl_search_key sk; int ret; - size_t buf_size; + u64 buf_size; if (!capable(CAP_SYS_ADMIN)) return -EPERM; @@ -1763,8 +1772,8 @@ static noinline int btrfs_ioctl_tree_search_v2(struct inode *inode, struct btrfs_ioctl_search_args_v2 __user *uarg = argp; struct btrfs_ioctl_search_args_v2 args; int ret; - size_t buf_size; - const size_t buf_limit = SZ_16M; + u64 buf_size; + const u64 buf_limit = SZ_16M; if (!capable(CAP_SYS_ADMIN)) return -EPERM; @@ -1958,6 +1967,13 @@ static int btrfs_search_path_in_tree_user(struct mnt_idmap *idmap, goto out_put; } + /* + * We don't need the path anymore, so release it and + * avoid deadlocks and lockdep warnings in case + * btrfs_iget() needs to lookup the inode from its root + * btree and lock the same leaf. + */ + btrfs_release_path(path); temp_inode = btrfs_iget(sb, key2.objectid, root); if (IS_ERR(temp_inode)) { ret = PTR_ERR(temp_inode); @@ -1978,7 +1994,6 @@ static int btrfs_search_path_in_tree_user(struct mnt_idmap *idmap, goto out_put; } - btrfs_release_path(path); key.objectid = key.offset; key.offset = (u64)-1; dirid = key.objectid; @@ -2629,6 +2644,12 @@ static long btrfs_ioctl_add_dev(struct btrfs_fs_info *fs_info, void __user *arg) return -EINVAL; } + if (fs_info->fs_devices->temp_fsid) { + btrfs_err(fs_info, + "device add not supported on cloned temp-fsid mount"); + return -EINVAL; + } + if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_DEV_ADD)) { if (!btrfs_exclop_start_try_lock(fs_info, BTRFS_EXCLOP_DEV_ADD)) return BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS; @@ -2670,8 +2691,7 @@ static long btrfs_ioctl_rm_dev_v2(struct file *file, void __user *arg) struct inode *inode = file_inode(file); struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct btrfs_ioctl_vol_args_v2 *vol_args; - struct block_device *bdev = NULL; - void *holder; + struct bdev_handle *bdev_handle = NULL; int ret; bool cancel = false; @@ -2708,7 +2728,7 @@ static long btrfs_ioctl_rm_dev_v2(struct file *file, void __user *arg) goto err_drop; /* Exclusive operation is now claimed */ - ret = btrfs_rm_device(fs_info, &args, &bdev, &holder); + ret = btrfs_rm_device(fs_info, &args, &bdev_handle); btrfs_exclop_finish(fs_info); @@ -2722,8 +2742,8 @@ static long btrfs_ioctl_rm_dev_v2(struct file *file, void __user *arg) } err_drop: mnt_drop_write_file(file); - if (bdev) - blkdev_put(bdev, holder); + if (bdev_handle) + bdev_release(bdev_handle); out: btrfs_put_dev_args_from_path(&args); kfree(vol_args); @@ -2736,8 +2756,7 @@ static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg) struct inode *inode = file_inode(file); struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct btrfs_ioctl_vol_args *vol_args; - struct block_device *bdev = NULL; - void *holder; + struct bdev_handle *bdev_handle = NULL; int ret; bool cancel = false; @@ -2764,15 +2783,15 @@ static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg) ret = exclop_start_or_cancel_reloc(fs_info, BTRFS_EXCLOP_DEV_REMOVE, cancel); if (ret == 0) { - ret = btrfs_rm_device(fs_info, &args, &bdev, &holder); + ret = btrfs_rm_device(fs_info, &args, &bdev_handle); if (!ret) btrfs_info(fs_info, "disk deleted %s", vol_args->name); btrfs_exclop_finish(fs_info); } mnt_drop_write_file(file); - if (bdev) - blkdev_put(bdev, holder); + if (bdev_handle) + bdev_release(bdev_handle); out: btrfs_put_dev_args_from_path(&args); kfree(vol_args); @@ -2816,7 +2835,7 @@ static long btrfs_ioctl_fs_info(struct btrfs_fs_info *fs_info, } if (flags_in & BTRFS_FS_INFO_FLAG_GENERATION) { - fi_args->generation = fs_info->generation; + fi_args->generation = btrfs_get_fs_generation(fs_info); fi_args->flags |= BTRFS_FS_INFO_FLAG_GENERATION; } @@ -2941,7 +2960,7 @@ static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp) btrfs_cpu_key_to_disk(&disk_key, &new_root->root_key); btrfs_set_dir_item_key(path->nodes[0], di, &disk_key); - btrfs_mark_buffer_dirty(path->nodes[0]); + btrfs_mark_buffer_dirty(trans, path->nodes[0]); btrfs_release_path(path); btrfs_set_fs_incompat(fs_info, DEFAULT_SUBVOL); @@ -2972,7 +2991,7 @@ static void get_block_group_info(struct list_head *groups_list, static long btrfs_ioctl_space_info(struct btrfs_fs_info *fs_info, void __user *arg) { - struct btrfs_ioctl_space_args space_args; + struct btrfs_ioctl_space_args space_args = { 0 }; struct btrfs_ioctl_space_info space; struct btrfs_ioctl_space_info *dest; struct btrfs_ioctl_space_info *dest_orig; @@ -3125,7 +3144,7 @@ static noinline long btrfs_ioctl_start_sync(struct btrfs_root *root, return PTR_ERR(trans); /* No running transaction, don't bother */ - transid = root->fs_info->last_trans_committed; + transid = btrfs_get_last_trans_committed(root->fs_info); goto out; } transid = trans->transid; @@ -3691,7 +3710,8 @@ static long btrfs_ioctl_quota_ctl(struct file *file, void __user *arg) switch (sa->cmd) { case BTRFS_QUOTA_CTL_ENABLE: - ret = btrfs_quota_enable(fs_info); + case BTRFS_QUOTA_CTL_ENABLE_SIMPLE_QUOTA: + ret = btrfs_quota_enable(fs_info, sa); break; case BTRFS_QUOTA_CTL_DISABLE: ret = btrfs_quota_disable(fs_info); @@ -4332,7 +4352,7 @@ static int _btrfs_ioctl_send(struct inode *inode, void __user *argp, bool compat if (compat) { #if defined(CONFIG_64BIT) && defined(CONFIG_COMPAT) - struct btrfs_ioctl_send_args_32 args32; + struct btrfs_ioctl_send_args_32 args32 = { 0 }; ret = copy_from_user(&args32, argp, sizeof(args32)); if (ret) @@ -4345,6 +4365,7 @@ static int _btrfs_ioctl_send(struct inode *inode, void __user *argp, bool compat arg->clone_sources = compat_ptr(args32.clone_sources); arg->parent_root = args32.parent_root; arg->flags = args32.flags; + arg->version = args32.version; memcpy(arg->reserved, args32.reserved, sizeof(args32.reserved)); #else diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c index 7979449a58d6..74d8e2003f58 100644 --- a/fs/btrfs/locking.c +++ b/fs/btrfs/locking.c @@ -8,6 +8,7 @@ #include <linux/spinlock.h> #include <linux/page-flags.h> #include <asm/bug.h> +#include <trace/events/btrfs.h> #include "misc.h" #include "ctree.h" #include "extent_io.h" @@ -73,6 +74,7 @@ static struct btrfs_lockdep_keyset { { .id = BTRFS_UUID_TREE_OBJECTID, DEFINE_NAME("uuid") }, { .id = BTRFS_FREE_SPACE_TREE_OBJECTID, DEFINE_NAME("free-space") }, { .id = BTRFS_BLOCK_GROUP_TREE_OBJECTID, DEFINE_NAME("block-group") }, + { .id = BTRFS_RAID_STRIPE_TREE_OBJECTID, DEFINE_NAME("raid-stripe") }, { .id = 0, DEFINE_NAME("tree") }, }; @@ -102,6 +104,15 @@ void btrfs_maybe_reset_lockdep_class(struct btrfs_root *root, struct extent_buff #endif +#ifdef CONFIG_BTRFS_DEBUG +static void btrfs_set_eb_lock_owner(struct extent_buffer *eb, pid_t owner) +{ + eb->lock_owner = owner; +} +#else +static void btrfs_set_eb_lock_owner(struct extent_buffer *eb, pid_t owner) { } +#endif + /* * Extent buffer locking * ===================== @@ -164,7 +175,7 @@ int btrfs_try_tree_read_lock(struct extent_buffer *eb) int btrfs_try_tree_write_lock(struct extent_buffer *eb) { if (down_write_trylock(&eb->lock)) { - eb->lock_owner = current->pid; + btrfs_set_eb_lock_owner(eb, current->pid); trace_btrfs_try_tree_write_lock(eb); return 1; } @@ -181,7 +192,8 @@ void btrfs_tree_read_unlock(struct extent_buffer *eb) } /* - * __btrfs_tree_lock - lock eb for write + * Lock eb for write. + * * @eb: the eb to lock * @nest: the nesting to use for the lock * @@ -196,7 +208,7 @@ void __btrfs_tree_lock(struct extent_buffer *eb, enum btrfs_lock_nesting nest) start_ns = ktime_get_ns(); down_write_nested(&eb->lock, nest); - eb->lock_owner = current->pid; + btrfs_set_eb_lock_owner(eb, current->pid); trace_btrfs_tree_lock(eb, start_ns); } @@ -211,7 +223,7 @@ void btrfs_tree_lock(struct extent_buffer *eb) void btrfs_tree_unlock(struct extent_buffer *eb) { trace_btrfs_tree_unlock(eb); - eb->lock_owner = 0; + btrfs_set_eb_lock_owner(eb, 0); up_write(&eb->lock); } diff --git a/fs/btrfs/locking.h b/fs/btrfs/locking.h index edb9b4a0dba1..7d6ee1e609bf 100644 --- a/fs/btrfs/locking.h +++ b/fs/btrfs/locking.h @@ -79,7 +79,7 @@ enum btrfs_lock_nesting { }; enum btrfs_lockdep_trans_states { - BTRFS_LOCKDEP_TRANS_COMMIT_START, + BTRFS_LOCKDEP_TRANS_COMMIT_PREP, BTRFS_LOCKDEP_TRANS_UNBLOCKED, BTRFS_LOCKDEP_TRANS_SUPER_COMMITTED, BTRFS_LOCKDEP_TRANS_COMPLETED, diff --git a/fs/btrfs/messages.c b/fs/btrfs/messages.c index 23fc11af498a..b8f9c9e56c8c 100644 --- a/fs/btrfs/messages.c +++ b/fs/btrfs/messages.c @@ -10,14 +10,13 @@ #ifdef CONFIG_PRINTK #define STATE_STRING_PREFACE ": state " -#define STATE_STRING_BUF_LEN (sizeof(STATE_STRING_PREFACE) + BTRFS_FS_STATE_COUNT) +#define STATE_STRING_BUF_LEN (sizeof(STATE_STRING_PREFACE) + BTRFS_FS_STATE_COUNT + 1) /* * Characters to print to indicate error conditions or uncommon filesystem state. * RO is not an error. */ static const char fs_state_chars[] = { - [BTRFS_FS_STATE_ERROR] = 'E', [BTRFS_FS_STATE_REMOUNTING] = 'M', [BTRFS_FS_STATE_RO] = 0, [BTRFS_FS_STATE_TRANS_ABORTED] = 'A', @@ -37,6 +36,11 @@ static void btrfs_state_to_string(const struct btrfs_fs_info *info, char *buf) memcpy(curr, STATE_STRING_PREFACE, sizeof(STATE_STRING_PREFACE)); curr += sizeof(STATE_STRING_PREFACE) - 1; + if (BTRFS_FS_ERROR(info)) { + *curr++ = 'E'; + states_printed = true; + } + for_each_set_bit(bit, &fs_state, sizeof(fs_state)) { WARN_ON_ONCE(bit >= BTRFS_FS_STATE_COUNT); if ((bit < BTRFS_FS_STATE_COUNT) && fs_state_chars[bit]) { @@ -68,11 +72,11 @@ static void btrfs_state_to_string(const struct btrfs_fs_info *info, char *buf) * over the error. Each subsequent error that doesn't have any context * of the original error should use EROFS when handling BTRFS_FS_STATE_ERROR. */ -const char * __attribute_const__ btrfs_decode_error(int errno) +const char * __attribute_const__ btrfs_decode_error(int error) { char *errstr = "unknown"; - switch (errno) { + switch (error) { case -ENOENT: /* -2 */ errstr = "No such entry"; break; @@ -106,12 +110,12 @@ const char * __attribute_const__ btrfs_decode_error(int errno) } /* - * __btrfs_handle_fs_error decodes expected errors from the caller and - * invokes the appropriate error response. + * Decodes expected errors from the caller and invokes the appropriate error + * response. */ __cold void __btrfs_handle_fs_error(struct btrfs_fs_info *fs_info, const char *function, - unsigned int line, int errno, const char *fmt, ...) + unsigned int line, int error, const char *fmt, ...) { struct super_block *sb = fs_info->sb; #ifdef CONFIG_PRINTK @@ -128,11 +132,11 @@ void __btrfs_handle_fs_error(struct btrfs_fs_info *fs_info, const char *function * Special case: if the error is EROFS, and we're already under * SB_RDONLY, then it is safe here. */ - if (errno == -EROFS && sb_rdonly(sb)) + if (error == -EROFS && sb_rdonly(sb)) return; #ifdef CONFIG_PRINTK - errstr = btrfs_decode_error(errno); + errstr = btrfs_decode_error(error); btrfs_state_to_string(fs_info, statestr); if (fmt) { struct va_format vaf; @@ -143,11 +147,11 @@ void __btrfs_handle_fs_error(struct btrfs_fs_info *fs_info, const char *function vaf.va = &args; pr_crit("BTRFS: error (device %s%s) in %s:%d: errno=%d %s (%pV)\n", - sb->s_id, statestr, function, line, errno, errstr, &vaf); + sb->s_id, statestr, function, line, error, errstr, &vaf); va_end(args); } else { pr_crit("BTRFS: error (device %s%s) in %s:%d: errno=%d %s\n", - sb->s_id, statestr, function, line, errno, errstr); + sb->s_id, statestr, function, line, error, errstr); } #endif @@ -155,7 +159,7 @@ void __btrfs_handle_fs_error(struct btrfs_fs_info *fs_info, const char *function * Today we only save the error info to memory. Long term we'll also * send it down to the disk. */ - set_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state); + WRITE_ONCE(fs_info->fs_error, error); /* Don't go through full error handling during mount. */ if (!(sb->s_flags & SB_BORN)) @@ -252,12 +256,6 @@ void __cold _btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, } #endif -void __cold btrfs_print_v0_err(struct btrfs_fs_info *fs_info) -{ - btrfs_err(fs_info, -"Unsupported V0 extent filesystem detected. Aborting. Please re-create your filesystem with a newer kernel"); -} - #if BITS_PER_LONG == 32 void __cold btrfs_warn_32bit_limit(struct btrfs_fs_info *fs_info) { @@ -285,12 +283,12 @@ void __cold btrfs_err_32bit_limit(struct btrfs_fs_info *fs_info) #endif /* - * __btrfs_panic decodes unexpected, fatal errors from the caller, issues an - * alert, and either panics or BUGs, depending on mount options. + * Decode unexpected, fatal errors from the caller, issue an alert, and either + * panic or BUGs, depending on mount options. */ __cold void __btrfs_panic(struct btrfs_fs_info *fs_info, const char *function, - unsigned int line, int errno, const char *fmt, ...) + unsigned int line, int error, const char *fmt, ...) { char *s_id = "<unknown>"; const char *errstr; @@ -303,13 +301,13 @@ void __btrfs_panic(struct btrfs_fs_info *fs_info, const char *function, va_start(args, fmt); vaf.va = &args; - errstr = btrfs_decode_error(errno); + errstr = btrfs_decode_error(error); if (fs_info && (btrfs_test_opt(fs_info, PANIC_ON_FATAL_ERROR))) panic(KERN_CRIT "BTRFS panic (device %s) in %s:%d: %pV (errno=%d %s)\n", - s_id, function, line, &vaf, errno, errstr); + s_id, function, line, &vaf, error, errstr); btrfs_crit(fs_info, "panic in %s:%d: %pV (errno=%d %s)", - function, line, &vaf, errno, errstr); + function, line, &vaf, error, errstr); va_end(args); /* Caller calls BUG() */ } diff --git a/fs/btrfs/messages.h b/fs/btrfs/messages.h index deedc1a168e2..4d04c1fa5899 100644 --- a/fs/btrfs/messages.h +++ b/fs/btrfs/messages.h @@ -181,30 +181,28 @@ do { \ #define ASSERT(expr) (void)(expr) #endif -void __cold btrfs_print_v0_err(struct btrfs_fs_info *fs_info); - __printf(5, 6) __cold void __btrfs_handle_fs_error(struct btrfs_fs_info *fs_info, const char *function, - unsigned int line, int errno, const char *fmt, ...); + unsigned int line, int error, const char *fmt, ...); -const char * __attribute_const__ btrfs_decode_error(int errno); +const char * __attribute_const__ btrfs_decode_error(int error); -#define btrfs_handle_fs_error(fs_info, errno, fmt, args...) \ +#define btrfs_handle_fs_error(fs_info, error, fmt, args...) \ __btrfs_handle_fs_error((fs_info), __func__, __LINE__, \ - (errno), fmt, ##args) + (error), fmt, ##args) __printf(5, 6) __cold void __btrfs_panic(struct btrfs_fs_info *fs_info, const char *function, - unsigned int line, int errno, const char *fmt, ...); + unsigned int line, int error, const char *fmt, ...); /* * If BTRFS_MOUNT_PANIC_ON_FATAL_ERROR is in mount_opt, __btrfs_panic * will panic(). Otherwise we BUG() here. */ -#define btrfs_panic(fs_info, errno, fmt, args...) \ +#define btrfs_panic(fs_info, error, fmt, args...) \ do { \ - __btrfs_panic(fs_info, __func__, __LINE__, errno, fmt, ##args); \ + __btrfs_panic(fs_info, __func__, __LINE__, error, fmt, ##args); \ BUG(); \ } while (0) diff --git a/fs/btrfs/misc.h b/fs/btrfs/misc.h index 005751a12911..40f2d9f1a17a 100644 --- a/fs/btrfs/misc.h +++ b/fs/btrfs/misc.h @@ -8,8 +8,6 @@ #include <linux/math64.h> #include <linux/rbtree.h> -#define in_range(b, first, len) ((b) >= (first) && (b) < (first) + (len)) - /* * Enumerate bits using enum autoincrement. Define the @name as the n-th bit. */ diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index a629532283bc..a82e1417c4d2 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -124,25 +124,24 @@ static int range_overlaps(struct btrfs_ordered_extent *entry, u64 file_offset, * look find the first ordered struct that has this offset, otherwise * the first one less than this offset */ -static inline struct rb_node *tree_search(struct btrfs_ordered_inode_tree *tree, - u64 file_offset) +static inline struct rb_node *ordered_tree_search(struct btrfs_inode *inode, + u64 file_offset) { - struct rb_root *root = &tree->tree; struct rb_node *prev = NULL; struct rb_node *ret; struct btrfs_ordered_extent *entry; - if (tree->last) { - entry = rb_entry(tree->last, struct btrfs_ordered_extent, + if (inode->ordered_tree_last) { + entry = rb_entry(inode->ordered_tree_last, struct btrfs_ordered_extent, rb_node); if (in_range(file_offset, entry->file_offset, entry->num_bytes)) - return tree->last; + return inode->ordered_tree_last; } - ret = __tree_search(root, file_offset, &prev); + ret = __tree_search(&inode->ordered_tree, file_offset, &prev); if (!ret) ret = prev; if (ret) - tree->last = ret; + inode->ordered_tree_last = ret; return ret; } @@ -153,11 +152,12 @@ static struct btrfs_ordered_extent *alloc_ordered_extent( { struct btrfs_ordered_extent *entry; int ret; + u64 qgroup_rsv = 0; if (flags & ((1 << BTRFS_ORDERED_NOCOW) | (1 << BTRFS_ORDERED_PREALLOC))) { /* For nocow write, we can release the qgroup rsv right now */ - ret = btrfs_qgroup_free_data(inode, NULL, file_offset, num_bytes); + ret = btrfs_qgroup_free_data(inode, NULL, file_offset, num_bytes, &qgroup_rsv); if (ret < 0) return ERR_PTR(ret); } else { @@ -165,7 +165,7 @@ static struct btrfs_ordered_extent *alloc_ordered_extent( * The ordered extent has reserved qgroup space, release now * and pass the reserved number for qgroup_record to free. */ - ret = btrfs_qgroup_release_data(inode, file_offset, num_bytes); + ret = btrfs_qgroup_release_data(inode, file_offset, num_bytes, &qgroup_rsv); if (ret < 0) return ERR_PTR(ret); } @@ -183,7 +183,7 @@ static struct btrfs_ordered_extent *alloc_ordered_extent( entry->inode = igrab(&inode->vfs_inode); entry->compress_type = compress_type; entry->truncated_len = (u64)-1; - entry->qgroup_rsv = ret; + entry->qgroup_rsv = qgroup_rsv; entry->flags = flags; refcount_set(&entry->refs, 1); init_waitqueue_head(&entry->wait); @@ -191,6 +191,7 @@ static struct btrfs_ordered_extent *alloc_ordered_extent( INIT_LIST_HEAD(&entry->log_list); INIT_LIST_HEAD(&entry->root_extent_list); INIT_LIST_HEAD(&entry->work_list); + INIT_LIST_HEAD(&entry->bioc_list); init_completion(&entry->completion); /* @@ -208,7 +209,6 @@ static struct btrfs_ordered_extent *alloc_ordered_extent( static void insert_ordered_extent(struct btrfs_ordered_extent *entry) { struct btrfs_inode *inode = BTRFS_I(entry->inode); - struct btrfs_ordered_inode_tree *tree = &inode->ordered_tree; struct btrfs_root *root = inode->root; struct btrfs_fs_info *fs_info = root->fs_info; struct rb_node *node; @@ -221,13 +221,14 @@ static void insert_ordered_extent(struct btrfs_ordered_extent *entry) /* One ref for the tree. */ refcount_inc(&entry->refs); - spin_lock_irq(&tree->lock); - node = tree_insert(&tree->tree, entry->file_offset, &entry->rb_node); + spin_lock_irq(&inode->ordered_tree_lock); + node = tree_insert(&inode->ordered_tree, entry->file_offset, + &entry->rb_node); if (node) btrfs_panic(fs_info, -EEXIST, "inconsistency in ordered tree at offset %llu", entry->file_offset); - spin_unlock_irq(&tree->lock); + spin_unlock_irq(&inode->ordered_tree_lock); spin_lock(&root->ordered_extent_lock); list_add_tail(&entry->root_extent_list, @@ -287,12 +288,11 @@ struct btrfs_ordered_extent *btrfs_alloc_ordered_extent( void btrfs_add_ordered_sum(struct btrfs_ordered_extent *entry, struct btrfs_ordered_sum *sum) { - struct btrfs_ordered_inode_tree *tree; + struct btrfs_inode *inode = BTRFS_I(entry->inode); - tree = &BTRFS_I(entry->inode)->ordered_tree; - spin_lock_irq(&tree->lock); + spin_lock_irq(&inode->ordered_tree_lock); list_add_tail(&sum->list, &entry->list); - spin_unlock_irq(&tree->lock); + spin_unlock_irq(&inode->ordered_tree_lock); } static void finish_ordered_fn(struct btrfs_work *work) @@ -310,7 +310,7 @@ static bool can_finish_ordered_extent(struct btrfs_ordered_extent *ordered, struct btrfs_inode *inode = BTRFS_I(ordered->inode); struct btrfs_fs_info *fs_info = inode->root->fs_info; - lockdep_assert_held(&inode->ordered_tree.lock); + lockdep_assert_held(&inode->ordered_tree_lock); if (page) { ASSERT(page->mapping); @@ -364,7 +364,7 @@ static void btrfs_queue_ordered_fn(struct btrfs_ordered_extent *ordered) struct btrfs_workqueue *wq = btrfs_is_free_space_inode(inode) ? fs_info->endio_freespace_worker : fs_info->endio_write_workers; - btrfs_init_work(&ordered->work, finish_ordered_fn, NULL, NULL); + btrfs_init_work(&ordered->work, finish_ordered_fn, NULL); btrfs_queue_work(wq, &ordered->work); } @@ -378,9 +378,9 @@ bool btrfs_finish_ordered_extent(struct btrfs_ordered_extent *ordered, trace_btrfs_finish_ordered_extent(inode, file_offset, len, uptodate); - spin_lock_irqsave(&inode->ordered_tree.lock, flags); + spin_lock_irqsave(&inode->ordered_tree_lock, flags); ret = can_finish_ordered_extent(ordered, page, file_offset, len, uptodate); - spin_unlock_irqrestore(&inode->ordered_tree.lock, flags); + spin_unlock_irqrestore(&inode->ordered_tree_lock, flags); if (ret) btrfs_queue_ordered_fn(ordered); @@ -404,19 +404,22 @@ void btrfs_mark_ordered_io_finished(struct btrfs_inode *inode, struct page *page, u64 file_offset, u64 num_bytes, bool uptodate) { - struct btrfs_ordered_inode_tree *tree = &inode->ordered_tree; struct rb_node *node; struct btrfs_ordered_extent *entry = NULL; unsigned long flags; u64 cur = file_offset; - spin_lock_irqsave(&tree->lock, flags); + trace_btrfs_writepage_end_io_hook(inode, file_offset, + file_offset + num_bytes - 1, + uptodate); + + spin_lock_irqsave(&inode->ordered_tree_lock, flags); while (cur < file_offset + num_bytes) { u64 entry_end; u64 end; u32 len; - node = tree_search(tree, cur); + node = ordered_tree_search(inode, cur); /* No ordered extents at all */ if (!node) break; @@ -463,13 +466,13 @@ void btrfs_mark_ordered_io_finished(struct btrfs_inode *inode, len = end + 1 - cur; if (can_finish_ordered_extent(entry, page, cur, len, uptodate)) { - spin_unlock_irqrestore(&tree->lock, flags); + spin_unlock_irqrestore(&inode->ordered_tree_lock, flags); btrfs_queue_ordered_fn(entry); - spin_lock_irqsave(&tree->lock, flags); + spin_lock_irqsave(&inode->ordered_tree_lock, flags); } cur += len; } - spin_unlock_irqrestore(&tree->lock, flags); + spin_unlock_irqrestore(&inode->ordered_tree_lock, flags); } /* @@ -493,19 +496,18 @@ bool btrfs_dec_test_ordered_pending(struct btrfs_inode *inode, struct btrfs_ordered_extent **cached, u64 file_offset, u64 io_size) { - struct btrfs_ordered_inode_tree *tree = &inode->ordered_tree; struct rb_node *node; struct btrfs_ordered_extent *entry = NULL; unsigned long flags; bool finished = false; - spin_lock_irqsave(&tree->lock, flags); + spin_lock_irqsave(&inode->ordered_tree_lock, flags); if (cached && *cached) { entry = *cached; goto have_entry; } - node = tree_search(tree, file_offset); + node = ordered_tree_search(inode, file_offset); if (!node) goto out; @@ -536,7 +538,7 @@ out: refcount_inc(&entry->refs); trace_btrfs_ordered_extent_dec_test_pending(inode, entry); } - spin_unlock_irqrestore(&tree->lock, flags); + spin_unlock_irqrestore(&inode->ordered_tree_lock, flags); return finished; } @@ -574,7 +576,6 @@ void btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry) void btrfs_remove_ordered_extent(struct btrfs_inode *btrfs_inode, struct btrfs_ordered_extent *entry) { - struct btrfs_ordered_inode_tree *tree; struct btrfs_root *root = btrfs_inode->root; struct btrfs_fs_info *fs_info = root->fs_info; struct rb_node *node; @@ -599,22 +600,23 @@ void btrfs_remove_ordered_extent(struct btrfs_inode *btrfs_inode, release = entry->disk_num_bytes; else release = entry->num_bytes; - btrfs_delalloc_release_metadata(btrfs_inode, release, false); + btrfs_delalloc_release_metadata(btrfs_inode, release, + test_bit(BTRFS_ORDERED_IOERR, + &entry->flags)); } percpu_counter_add_batch(&fs_info->ordered_bytes, -entry->num_bytes, fs_info->delalloc_batch); - tree = &btrfs_inode->ordered_tree; - spin_lock_irq(&tree->lock); + spin_lock_irq(&btrfs_inode->ordered_tree_lock); node = &entry->rb_node; - rb_erase(node, &tree->tree); + rb_erase(node, &btrfs_inode->ordered_tree); RB_CLEAR_NODE(node); - if (tree->last == node) - tree->last = NULL; + if (btrfs_inode->ordered_tree_last == node) + btrfs_inode->ordered_tree_last = NULL; set_bit(BTRFS_ORDERED_COMPLETE, &entry->flags); pending = test_and_clear_bit(BTRFS_ORDERED_PENDING, &entry->flags); - spin_unlock_irq(&tree->lock); + spin_unlock_irq(&btrfs_inode->ordered_tree_lock); /* * The current running transaction is waiting on us, we need to let it @@ -635,7 +637,7 @@ void btrfs_remove_ordered_extent(struct btrfs_inode *btrfs_inode, refcount_inc(&trans->use_count); spin_unlock(&fs_info->trans_lock); - ASSERT(trans); + ASSERT(trans || BTRFS_FS_ERROR(fs_info)); if (trans) { if (atomic_dec_and_test(&trans->pending_ordered)) wake_up(&trans->pending_wait); @@ -707,7 +709,7 @@ u64 btrfs_wait_ordered_extents(struct btrfs_root *root, u64 nr, spin_unlock(&root->ordered_extent_lock); btrfs_init_work(&ordered->flush_work, - btrfs_run_ordered_extent_work, NULL, NULL); + btrfs_run_ordered_extent_work, NULL); list_add_tail(&ordered->work_list, &works); btrfs_queue_work(fs_info->flush_workers, &ordered->flush_work); @@ -736,11 +738,9 @@ void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, u64 nr, const u64 range_start, const u64 range_len) { struct btrfs_root *root; - struct list_head splice; + LIST_HEAD(splice); u64 done; - INIT_LIST_HEAD(&splice); - mutex_lock(&fs_info->ordered_operations_mutex); spin_lock(&fs_info->ordered_root_lock); list_splice_init(&fs_info->ordered_roots, &splice); @@ -873,14 +873,12 @@ int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len) struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct btrfs_inode *inode, u64 file_offset) { - struct btrfs_ordered_inode_tree *tree; struct rb_node *node; struct btrfs_ordered_extent *entry = NULL; unsigned long flags; - tree = &inode->ordered_tree; - spin_lock_irqsave(&tree->lock, flags); - node = tree_search(tree, file_offset); + spin_lock_irqsave(&inode->ordered_tree_lock, flags); + node = ordered_tree_search(inode, file_offset); if (!node) goto out; @@ -892,7 +890,7 @@ struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct btrfs_inode *ino trace_btrfs_ordered_extent_lookup(inode, entry); } out: - spin_unlock_irqrestore(&tree->lock, flags); + spin_unlock_irqrestore(&inode->ordered_tree_lock, flags); return entry; } @@ -902,15 +900,13 @@ out: struct btrfs_ordered_extent *btrfs_lookup_ordered_range( struct btrfs_inode *inode, u64 file_offset, u64 len) { - struct btrfs_ordered_inode_tree *tree; struct rb_node *node; struct btrfs_ordered_extent *entry = NULL; - tree = &inode->ordered_tree; - spin_lock_irq(&tree->lock); - node = tree_search(tree, file_offset); + spin_lock_irq(&inode->ordered_tree_lock); + node = ordered_tree_search(inode, file_offset); if (!node) { - node = tree_search(tree, file_offset + len); + node = ordered_tree_search(inode, file_offset + len); if (!node) goto out; } @@ -934,7 +930,7 @@ out: refcount_inc(&entry->refs); trace_btrfs_ordered_extent_lookup_range(inode, entry); } - spin_unlock_irq(&tree->lock); + spin_unlock_irq(&inode->ordered_tree_lock); return entry; } @@ -945,13 +941,12 @@ out: void btrfs_get_ordered_extents_for_logging(struct btrfs_inode *inode, struct list_head *list) { - struct btrfs_ordered_inode_tree *tree = &inode->ordered_tree; struct rb_node *n; ASSERT(inode_is_locked(&inode->vfs_inode)); - spin_lock_irq(&tree->lock); - for (n = rb_first(&tree->tree); n; n = rb_next(n)) { + spin_lock_irq(&inode->ordered_tree_lock); + for (n = rb_first(&inode->ordered_tree); n; n = rb_next(n)) { struct btrfs_ordered_extent *ordered; ordered = rb_entry(n, struct btrfs_ordered_extent, rb_node); @@ -964,7 +959,7 @@ void btrfs_get_ordered_extents_for_logging(struct btrfs_inode *inode, refcount_inc(&ordered->refs); trace_btrfs_ordered_extent_lookup_for_logging(inode, ordered); } - spin_unlock_irq(&tree->lock); + spin_unlock_irq(&inode->ordered_tree_lock); } /* @@ -974,13 +969,11 @@ void btrfs_get_ordered_extents_for_logging(struct btrfs_inode *inode, struct btrfs_ordered_extent * btrfs_lookup_first_ordered_extent(struct btrfs_inode *inode, u64 file_offset) { - struct btrfs_ordered_inode_tree *tree; struct rb_node *node; struct btrfs_ordered_extent *entry = NULL; - tree = &inode->ordered_tree; - spin_lock_irq(&tree->lock); - node = tree_search(tree, file_offset); + spin_lock_irq(&inode->ordered_tree_lock); + node = ordered_tree_search(inode, file_offset); if (!node) goto out; @@ -988,7 +981,7 @@ btrfs_lookup_first_ordered_extent(struct btrfs_inode *inode, u64 file_offset) refcount_inc(&entry->refs); trace_btrfs_ordered_extent_lookup_first(inode, entry); out: - spin_unlock_irq(&tree->lock); + spin_unlock_irq(&inode->ordered_tree_lock); return entry; } @@ -1004,15 +997,14 @@ out: struct btrfs_ordered_extent *btrfs_lookup_first_ordered_range( struct btrfs_inode *inode, u64 file_offset, u64 len) { - struct btrfs_ordered_inode_tree *tree = &inode->ordered_tree; struct rb_node *node; struct rb_node *cur; struct rb_node *prev; struct rb_node *next; struct btrfs_ordered_extent *entry = NULL; - spin_lock_irq(&tree->lock); - node = tree->tree.rb_node; + spin_lock_irq(&inode->ordered_tree_lock); + node = inode->ordered_tree.rb_node; /* * Here we don't want to use tree_search() which will use tree->last * and screw up the search order. @@ -1066,7 +1058,7 @@ out: trace_btrfs_ordered_extent_lookup_first_range(inode, entry); } - spin_unlock_irq(&tree->lock); + spin_unlock_irq(&inode->ordered_tree_lock); return entry; } @@ -1145,7 +1137,6 @@ struct btrfs_ordered_extent *btrfs_split_ordered_extent( struct btrfs_ordered_extent *ordered, u64 len) { struct btrfs_inode *inode = BTRFS_I(ordered->inode); - struct btrfs_ordered_inode_tree *tree = &inode->ordered_tree; struct btrfs_root *root = inode->root; struct btrfs_fs_info *fs_info = root->fs_info; u64 file_offset = ordered->file_offset; @@ -1185,13 +1176,13 @@ struct btrfs_ordered_extent *btrfs_split_ordered_extent( refcount_inc(&new->refs); spin_lock_irq(&root->ordered_extent_lock); - spin_lock(&tree->lock); + spin_lock(&inode->ordered_tree_lock); /* Remove from tree once */ node = &ordered->rb_node; - rb_erase(node, &tree->tree); + rb_erase(node, &inode->ordered_tree); RB_CLEAR_NODE(node); - if (tree->last == node) - tree->last = NULL; + if (inode->ordered_tree_last == node) + inode->ordered_tree_last = NULL; ordered->file_offset += len; ordered->disk_bytenr += len; @@ -1222,18 +1213,19 @@ struct btrfs_ordered_extent *btrfs_split_ordered_extent( } /* Re-insert the node */ - node = tree_insert(&tree->tree, ordered->file_offset, &ordered->rb_node); + node = tree_insert(&inode->ordered_tree, ordered->file_offset, + &ordered->rb_node); if (node) btrfs_panic(fs_info, -EEXIST, "zoned: inconsistency in ordered tree at offset %llu", ordered->file_offset); - node = tree_insert(&tree->tree, new->file_offset, &new->rb_node); + node = tree_insert(&inode->ordered_tree, new->file_offset, &new->rb_node); if (node) btrfs_panic(fs_info, -EEXIST, "zoned: inconsistency in ordered tree at offset %llu", new->file_offset); - spin_unlock(&tree->lock); + spin_unlock(&inode->ordered_tree_lock); list_add_tail(&new->root_extent_list, &root->ordered_extents); root->nr_ordered_extents++; diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h index 173bd5c5df26..567a6d3d4712 100644 --- a/fs/btrfs/ordered-data.h +++ b/fs/btrfs/ordered-data.h @@ -6,13 +6,6 @@ #ifndef BTRFS_ORDERED_DATA_H #define BTRFS_ORDERED_DATA_H -/* one of these per inode */ -struct btrfs_ordered_inode_tree { - spinlock_t lock; - struct rb_root tree; - struct rb_node *last; -}; - struct btrfs_ordered_sum { /* * Logical start address and length for of the blocks covered by @@ -151,15 +144,9 @@ struct btrfs_ordered_extent { struct completion completion; struct btrfs_work flush_work; struct list_head work_list; -}; -static inline void -btrfs_ordered_inode_tree_init(struct btrfs_ordered_inode_tree *t) -{ - spin_lock_init(&t->lock); - t->tree = RB_ROOT; - t->last = NULL; -} + struct list_head bioc_list; +}; int btrfs_finish_one_ordered(struct btrfs_ordered_extent *ordered_extent); int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent); diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c index aa06d9ca911d..7e46aa8a0444 100644 --- a/fs/btrfs/print-tree.c +++ b/fs/btrfs/print-tree.c @@ -9,6 +9,8 @@ #include "print-tree.h" #include "accessors.h" #include "tree-checker.h" +#include "volumes.h" +#include "raid-stripe-tree.h" struct root_name_map { u64 id; @@ -28,6 +30,7 @@ static const struct root_name_map root_map[] = { { BTRFS_FREE_SPACE_TREE_OBJECTID, "FREE_SPACE_TREE" }, { BTRFS_BLOCK_GROUP_TREE_OBJECTID, "BLOCK_GROUP_TREE" }, { BTRFS_DATA_RELOC_TREE_OBJECTID, "DATA_RELOC_TREE" }, + { BTRFS_RAID_STRIPE_TREE_OBJECTID, "RAID_STRIPE_TREE" }, }; const char *btrfs_root_name(const struct btrfs_key *key, char *buf) @@ -80,12 +83,20 @@ static void print_extent_data_ref(const struct extent_buffer *eb, btrfs_extent_data_ref_count(eb, ref)); } +static void print_extent_owner_ref(const struct extent_buffer *eb, + const struct btrfs_extent_owner_ref *ref) +{ + ASSERT(btrfs_fs_incompat(eb->fs_info, SIMPLE_QUOTA)); + pr_cont("extent data owner root %llu\n", btrfs_extent_owner_ref_root_id(eb, ref)); +} + static void print_extent_item(const struct extent_buffer *eb, int slot, int type) { struct btrfs_extent_item *ei; struct btrfs_extent_inline_ref *iref; struct btrfs_extent_data_ref *dref; struct btrfs_shared_data_ref *sref; + struct btrfs_extent_owner_ref *oref; struct btrfs_disk_key key; unsigned long end; unsigned long ptr; @@ -95,8 +106,10 @@ static void print_extent_item(const struct extent_buffer *eb, int slot, int type int ref_index = 0; if (unlikely(item_size < sizeof(*ei))) { - btrfs_print_v0_err(eb->fs_info); - btrfs_handle_fs_error(eb->fs_info, -EINVAL, NULL); + btrfs_err(eb->fs_info, + "unexpected extent item size, has %u expect >= %zu", + item_size, sizeof(*ei)); + btrfs_handle_fs_error(eb->fs_info, -EUCLEAN, NULL); } ei = btrfs_item_ptr(eb, slot, struct btrfs_extent_item); @@ -159,6 +172,10 @@ static void print_extent_item(const struct extent_buffer *eb, int slot, int type "\t\t\t(parent %llu not aligned to sectorsize %u)\n", offset, eb->fs_info->sectorsize); break; + case BTRFS_EXTENT_OWNER_REF_KEY: + oref = (struct btrfs_extent_owner_ref *)(&iref->offset); + print_extent_owner_ref(eb, oref); + break; default: pr_cont("(extent %llu has INVALID ref type %d)\n", eb->start, type); @@ -187,6 +204,22 @@ static void print_uuid_item(const struct extent_buffer *l, unsigned long offset, } } +static void print_raid_stripe_key(const struct extent_buffer *eb, u32 item_size, + struct btrfs_stripe_extent *stripe) +{ + const int num_stripes = btrfs_num_raid_stripes(item_size); + const u8 encoding = btrfs_stripe_extent_encoding(eb, stripe); + + pr_info("\t\t\tencoding: %s\n", + (encoding && encoding < BTRFS_NR_RAID_TYPES) ? + btrfs_raid_array[encoding].raid_name : "unknown"); + + for (int i = 0; i < num_stripes; i++) + pr_info("\t\t\tstride %d devid %llu physical %llu\n", + i, btrfs_raid_stride_devid(eb, &stripe->strides[i]), + btrfs_raid_stride_physical(eb, &stripe->strides[i])); +} + /* * Helper to output refs and locking status of extent buffer. Useful to debug * race condition related problems. @@ -291,10 +324,6 @@ void btrfs_print_leaf(const struct extent_buffer *l) btrfs_file_extent_num_bytes(l, fi), btrfs_file_extent_ram_bytes(l, fi)); break; - case BTRFS_EXTENT_REF_V0_KEY: - btrfs_print_v0_err(fs_info); - btrfs_handle_fs_error(fs_info, -EINVAL, NULL); - break; case BTRFS_BLOCK_GROUP_ITEM_KEY: bi = btrfs_item_ptr(l, i, struct btrfs_block_group_item); @@ -351,6 +380,10 @@ void btrfs_print_leaf(const struct extent_buffer *l) print_uuid_item(l, btrfs_item_ptr_offset(l, i), btrfs_item_size(l, i)); break; + case BTRFS_RAID_STRIPE_KEY: + print_raid_stripe_key(l, btrfs_item_size(l, i), + btrfs_item_ptr(l, i, struct btrfs_stripe_extent)); + break; } } } diff --git a/fs/btrfs/props.c b/fs/btrfs/props.c index 0755af0e53e3..f9bf591a0718 100644 --- a/fs/btrfs/props.c +++ b/fs/btrfs/props.c @@ -15,6 +15,7 @@ #include "fs.h" #include "accessors.h" #include "super.h" +#include "dir-item.h" #define BTRFS_PROP_HANDLERS_HT_BITS 8 static DEFINE_HASHTABLE(prop_handlers_ht, BTRFS_PROP_HANDLERS_HT_BITS); diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 2637d6b157ff..e46774e8f49f 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -30,6 +30,25 @@ #include "root-tree.h" #include "tree-checker.h" +enum btrfs_qgroup_mode btrfs_qgroup_mode(struct btrfs_fs_info *fs_info) +{ + if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) + return BTRFS_QGROUP_MODE_DISABLED; + if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_SIMPLE_MODE) + return BTRFS_QGROUP_MODE_SIMPLE; + return BTRFS_QGROUP_MODE_FULL; +} + +bool btrfs_qgroup_enabled(struct btrfs_fs_info *fs_info) +{ + return btrfs_qgroup_mode(fs_info) != BTRFS_QGROUP_MODE_DISABLED; +} + +bool btrfs_qgroup_full_accounting(struct btrfs_fs_info *fs_info) +{ + return btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_FULL; +} + /* * Helpers to access qgroup reservation * @@ -146,16 +165,6 @@ struct btrfs_qgroup_list { struct btrfs_qgroup *member; }; -static inline u64 qgroup_to_aux(struct btrfs_qgroup *qg) -{ - return (u64)(uintptr_t)qg; -} - -static inline struct btrfs_qgroup* unode_aux_to_qgroup(struct ulist_node *n) -{ - return (struct btrfs_qgroup *)(uintptr_t)n->aux; -} - static int qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid, int init_flags); @@ -180,34 +189,46 @@ static struct btrfs_qgroup *find_qgroup_rb(struct btrfs_fs_info *fs_info, return NULL; } -/* must be called with qgroup_lock held */ +/* + * Add qgroup to the filesystem's qgroup tree. + * + * Must be called with qgroup_lock held and @prealloc preallocated. + * + * The control on the lifespan of @prealloc would be transfered to this + * function, thus caller should no longer touch @prealloc. + */ static struct btrfs_qgroup *add_qgroup_rb(struct btrfs_fs_info *fs_info, + struct btrfs_qgroup *prealloc, u64 qgroupid) { struct rb_node **p = &fs_info->qgroup_tree.rb_node; struct rb_node *parent = NULL; struct btrfs_qgroup *qgroup; + /* Caller must have pre-allocated @prealloc. */ + ASSERT(prealloc); + while (*p) { parent = *p; qgroup = rb_entry(parent, struct btrfs_qgroup, node); - if (qgroup->qgroupid < qgroupid) + if (qgroup->qgroupid < qgroupid) { p = &(*p)->rb_left; - else if (qgroup->qgroupid > qgroupid) + } else if (qgroup->qgroupid > qgroupid) { p = &(*p)->rb_right; - else + } else { + kfree(prealloc); return qgroup; + } } - qgroup = kzalloc(sizeof(*qgroup), GFP_ATOMIC); - if (!qgroup) - return ERR_PTR(-ENOMEM); - + qgroup = prealloc; qgroup->qgroupid = qgroupid; INIT_LIST_HEAD(&qgroup->groups); INIT_LIST_HEAD(&qgroup->members); INIT_LIST_HEAD(&qgroup->dirty); + INIT_LIST_HEAD(&qgroup->iterator); + INIT_LIST_HEAD(&qgroup->nested_iterator); rb_link_node(&qgroup->node, parent, p); rb_insert_color(&qgroup->node, &fs_info->qgroup_tree); @@ -254,27 +275,26 @@ static int del_qgroup_rb(struct btrfs_fs_info *fs_info, u64 qgroupid) /* * Add relation specified by two qgroups. * - * Must be called with qgroup_lock held. + * Must be called with qgroup_lock held, the ownership of @prealloc is + * transferred to this function and caller should not touch it anymore. * * Return: 0 on success * -ENOENT if one of the qgroups is NULL * <0 other errors */ -static int __add_relation_rb(struct btrfs_qgroup *member, struct btrfs_qgroup *parent) +static int __add_relation_rb(struct btrfs_qgroup_list *prealloc, + struct btrfs_qgroup *member, + struct btrfs_qgroup *parent) { - struct btrfs_qgroup_list *list; - - if (!member || !parent) + if (!member || !parent) { + kfree(prealloc); return -ENOENT; + } - list = kzalloc(sizeof(*list), GFP_ATOMIC); - if (!list) - return -ENOMEM; - - list->group = parent; - list->member = member; - list_add_tail(&list->next_group, &member->groups); - list_add_tail(&list->next_member, &parent->members); + prealloc->group = parent; + prealloc->member = member; + list_add_tail(&prealloc->next_group, &member->groups); + list_add_tail(&prealloc->next_member, &parent->members); return 0; } @@ -288,7 +308,9 @@ static int __add_relation_rb(struct btrfs_qgroup *member, struct btrfs_qgroup *p * -ENOENT if one of the ids does not exist * <0 other errors */ -static int add_relation_rb(struct btrfs_fs_info *fs_info, u64 memberid, u64 parentid) +static int add_relation_rb(struct btrfs_fs_info *fs_info, + struct btrfs_qgroup_list *prealloc, + u64 memberid, u64 parentid) { struct btrfs_qgroup *member; struct btrfs_qgroup *parent; @@ -296,7 +318,7 @@ static int add_relation_rb(struct btrfs_fs_info *fs_info, u64 memberid, u64 pare member = find_qgroup_rb(fs_info, memberid); parent = find_qgroup_rb(fs_info, parentid); - return __add_relation_rb(member, parent); + return __add_relation_rb(prealloc, member, parent); } /* Must be called with qgroup_lock held */ @@ -340,11 +362,22 @@ int btrfs_verify_qgroup_counts(struct btrfs_fs_info *fs_info, u64 qgroupid, static void qgroup_mark_inconsistent(struct btrfs_fs_info *fs_info) { + if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_SIMPLE) + return; fs_info->qgroup_flags |= (BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT | BTRFS_QGROUP_RUNTIME_FLAG_CANCEL_RESCAN | BTRFS_QGROUP_RUNTIME_FLAG_NO_ACCOUNTING); } +static void qgroup_read_enable_gen(struct btrfs_fs_info *fs_info, + struct extent_buffer *leaf, int slot, + struct btrfs_qgroup_status_item *ptr) +{ + ASSERT(btrfs_fs_incompat(fs_info, SIMPLE_QUOTA)); + ASSERT(btrfs_item_size(leaf, slot) >= sizeof(*ptr)); + fs_info->qgroup_enable_gen = btrfs_qgroup_status_enable_gen(leaf, ptr); +} + /* * The full config is read in one go, only called from open_ctree() * It doesn't use any locking, as at this point we're still single-threaded @@ -361,7 +394,7 @@ int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info) u64 flags = 0; u64 rescan_progress = 0; - if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) + if (!fs_info->quota_root) return 0; fs_info->qgroup_ulist = ulist_alloc(GFP_KERNEL); @@ -411,14 +444,14 @@ int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info) "old qgroup version, quota disabled"); goto out; } - if (btrfs_qgroup_status_generation(l, ptr) != - fs_info->generation) { + fs_info->qgroup_flags = btrfs_qgroup_status_flags(l, ptr); + if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_SIMPLE_MODE) { + qgroup_read_enable_gen(fs_info, l, slot, ptr); + } else if (btrfs_qgroup_status_generation(l, ptr) != fs_info->generation) { qgroup_mark_inconsistent(fs_info); btrfs_err(fs_info, "qgroup generation mismatch, marked as inconsistent"); } - fs_info->qgroup_flags = btrfs_qgroup_status_flags(l, - ptr); rescan_progress = btrfs_qgroup_status_rescan(l, ptr); goto next1; } @@ -434,11 +467,14 @@ int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info) qgroup_mark_inconsistent(fs_info); } if (!qgroup) { - qgroup = add_qgroup_rb(fs_info, found_key.offset); - if (IS_ERR(qgroup)) { - ret = PTR_ERR(qgroup); + struct btrfs_qgroup *prealloc; + + prealloc = kzalloc(sizeof(*prealloc), GFP_KERNEL); + if (!prealloc) { + ret = -ENOMEM; goto out; } + qgroup = add_qgroup_rb(fs_info, prealloc, found_key.offset); } ret = btrfs_sysfs_add_one_qgroup(fs_info, qgroup); if (ret < 0) @@ -489,6 +525,8 @@ next1: if (ret) goto out; while (1) { + struct btrfs_qgroup_list *list = NULL; + slot = path->slots[0]; l = path->nodes[0]; btrfs_item_key_to_cpu(l, &found_key, slot); @@ -502,8 +540,14 @@ next1: goto next2; } - ret = add_relation_rb(fs_info, found_key.objectid, + list = kzalloc(sizeof(*list), GFP_KERNEL); + if (!list) { + ret = -ENOMEM; + goto out; + } + ret = add_relation_rb(fs_info, list, found_key.objectid, found_key.offset); + list = NULL; if (ret == -ENOENT) { btrfs_warn(fs_info, "orphan qgroup relation 0x%llx->0x%llx", @@ -522,13 +566,12 @@ next2: out: btrfs_free_path(path); fs_info->qgroup_flags |= flags; - if (!(fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_ON)) - clear_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags); - else if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN && - ret >= 0) - ret = qgroup_rescan_init(fs_info, rescan_progress, 0); - - if (ret < 0) { + if (ret >= 0) { + if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_ON) + set_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags); + if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) + ret = qgroup_rescan_init(fs_info, rescan_progress, 0); + } else { ulist_free(fs_info->qgroup_ulist); fs_info->qgroup_ulist = NULL; fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN; @@ -550,7 +593,7 @@ bool btrfs_check_quota_leak(struct btrfs_fs_info *fs_info) struct rb_node *node; bool ret = false; - if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) + if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_DISABLED) return ret; /* * Since we're unmounting, there is no race and no need to grab qgroup @@ -622,7 +665,7 @@ static int add_qgroup_relation_item(struct btrfs_trans_handle *trans, u64 src, ret = btrfs_insert_empty_item(trans, quota_root, path, &key, 0); - btrfs_mark_buffer_dirty(path->nodes[0]); + btrfs_mark_buffer_dirty(trans, path->nodes[0]); btrfs_free_path(path); return ret; @@ -700,7 +743,7 @@ static int add_qgroup_item(struct btrfs_trans_handle *trans, btrfs_set_qgroup_info_excl(leaf, qgroup_info, 0); btrfs_set_qgroup_info_excl_cmpr(leaf, qgroup_info, 0); - btrfs_mark_buffer_dirty(leaf); + btrfs_mark_buffer_dirty(trans, leaf); btrfs_release_path(path); @@ -719,7 +762,7 @@ static int add_qgroup_item(struct btrfs_trans_handle *trans, btrfs_set_qgroup_limit_rsv_rfer(leaf, qgroup_limit, 0); btrfs_set_qgroup_limit_rsv_excl(leaf, qgroup_limit, 0); - btrfs_mark_buffer_dirty(leaf); + btrfs_mark_buffer_dirty(trans, leaf); ret = 0; out: @@ -808,7 +851,7 @@ static int update_qgroup_limit_item(struct btrfs_trans_handle *trans, btrfs_set_qgroup_limit_rsv_rfer(l, qgroup_limit, qgroup->rsv_rfer); btrfs_set_qgroup_limit_rsv_excl(l, qgroup_limit, qgroup->rsv_excl); - btrfs_mark_buffer_dirty(l); + btrfs_mark_buffer_dirty(trans, l); out: btrfs_free_path(path); @@ -854,7 +897,7 @@ static int update_qgroup_info_item(struct btrfs_trans_handle *trans, btrfs_set_qgroup_info_excl(l, qgroup_info, qgroup->excl); btrfs_set_qgroup_info_excl_cmpr(l, qgroup_info, qgroup->excl_cmpr); - btrfs_mark_buffer_dirty(l); + btrfs_mark_buffer_dirty(trans, l); out: btrfs_free_path(path); @@ -896,7 +939,7 @@ static int update_qgroup_status_item(struct btrfs_trans_handle *trans) btrfs_set_qgroup_status_rescan(l, ptr, fs_info->qgroup_rescan_progress.objectid); - btrfs_mark_buffer_dirty(l); + btrfs_mark_buffer_dirty(trans, l); out: btrfs_free_path(path); @@ -949,7 +992,8 @@ out: return ret; } -int btrfs_quota_enable(struct btrfs_fs_info *fs_info) +int btrfs_quota_enable(struct btrfs_fs_info *fs_info, + struct btrfs_ioctl_quota_ctl_args *quota_ctl_args) { struct btrfs_root *quota_root; struct btrfs_root *tree_root = fs_info->tree_root; @@ -959,8 +1003,10 @@ int btrfs_quota_enable(struct btrfs_fs_info *fs_info) struct btrfs_key key; struct btrfs_key found_key; struct btrfs_qgroup *qgroup = NULL; + struct btrfs_qgroup *prealloc = NULL; struct btrfs_trans_handle *trans = NULL; struct ulist *ulist = NULL; + const bool simple = (quota_ctl_args->cmd == BTRFS_QUOTA_CTL_ENABLE_SIMPLE_QUOTA); int ret = 0; int slot; @@ -1063,13 +1109,18 @@ int btrfs_quota_enable(struct btrfs_fs_info *fs_info) struct btrfs_qgroup_status_item); btrfs_set_qgroup_status_generation(leaf, ptr, trans->transid); btrfs_set_qgroup_status_version(leaf, ptr, BTRFS_QGROUP_STATUS_VERSION); - fs_info->qgroup_flags = BTRFS_QGROUP_STATUS_FLAG_ON | - BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT; + fs_info->qgroup_flags = BTRFS_QGROUP_STATUS_FLAG_ON; + if (simple) { + fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_SIMPLE_MODE; + btrfs_set_qgroup_status_enable_gen(leaf, ptr, trans->transid); + } else { + fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT; + } btrfs_set_qgroup_status_flags(leaf, ptr, fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAGS_MASK); btrfs_set_qgroup_status_rescan(leaf, ptr, 0); - btrfs_mark_buffer_dirty(leaf); + btrfs_mark_buffer_dirty(trans, leaf); key.objectid = 0; key.type = BTRFS_ROOT_REF_KEY; @@ -1094,6 +1145,15 @@ int btrfs_quota_enable(struct btrfs_fs_info *fs_info) /* Release locks on tree_root before we access quota_root */ btrfs_release_path(path); + /* We should not have a stray @prealloc pointer. */ + ASSERT(prealloc == NULL); + prealloc = kzalloc(sizeof(*prealloc), GFP_NOFS); + if (!prealloc) { + ret = -ENOMEM; + btrfs_abort_transaction(trans, ret); + goto out_free_path; + } + ret = add_qgroup_item(trans, quota_root, found_key.offset); if (ret) { @@ -1101,7 +1161,8 @@ int btrfs_quota_enable(struct btrfs_fs_info *fs_info) goto out_free_path; } - qgroup = add_qgroup_rb(fs_info, found_key.offset); + qgroup = add_qgroup_rb(fs_info, prealloc, found_key.offset); + prealloc = NULL; if (IS_ERR(qgroup)) { ret = PTR_ERR(qgroup); btrfs_abort_transaction(trans, ret); @@ -1144,18 +1205,22 @@ out_add_root: goto out_free_path; } - qgroup = add_qgroup_rb(fs_info, BTRFS_FS_TREE_OBJECTID); - if (IS_ERR(qgroup)) { - ret = PTR_ERR(qgroup); - btrfs_abort_transaction(trans, ret); + ASSERT(prealloc == NULL); + prealloc = kzalloc(sizeof(*prealloc), GFP_NOFS); + if (!prealloc) { + ret = -ENOMEM; goto out_free_path; } + qgroup = add_qgroup_rb(fs_info, prealloc, BTRFS_FS_TREE_OBJECTID); + prealloc = NULL; ret = btrfs_sysfs_add_one_qgroup(fs_info, qgroup); if (ret < 0) { btrfs_abort_transaction(trans, ret); goto out_free_path; } + fs_info->qgroup_enable_gen = trans->transid; + mutex_unlock(&fs_info->qgroup_ioctl_lock); /* * Commit the transaction while not holding qgroup_ioctl_lock, to avoid @@ -1180,8 +1245,14 @@ out_add_root: spin_lock(&fs_info->qgroup_lock); fs_info->quota_root = quota_root; set_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags); + if (simple) + btrfs_set_fs_incompat(fs_info, SIMPLE_QUOTA); spin_unlock(&fs_info->qgroup_lock); + /* Skip rescan for simple qgroups. */ + if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_SIMPLE) + goto out_free_path; + ret = qgroup_rescan_init(fs_info, 0, 1); if (!ret) { qgroup_rescan_zero_tracking(fs_info); @@ -1222,6 +1293,39 @@ out: else if (trans) ret = btrfs_end_transaction(trans); ulist_free(ulist); + kfree(prealloc); + return ret; +} + +/* + * It is possible to have outstanding ordered extents which reserved bytes + * before we disabled. We need to fully flush delalloc, ordered extents, and a + * commit to ensure that we don't leak such reservations, only to have them + * come back if we re-enable. + * + * - enable simple quotas + * - reserve space + * - release it, store rsv_bytes in OE + * - disable quotas + * - enable simple quotas (qgroup rsv are all 0) + * - OE finishes + * - run delayed refs + * - free rsv_bytes, resulting in miscounting or even underflow + */ +static int flush_reservations(struct btrfs_fs_info *fs_info) +{ + struct btrfs_trans_handle *trans; + int ret; + + ret = btrfs_start_delalloc_roots(fs_info, LONG_MAX, false); + if (ret) + return ret; + btrfs_wait_ordered_roots(fs_info, U64_MAX, 0, (u64)-1); + trans = btrfs_join_transaction(fs_info->tree_root); + if (IS_ERR(trans)) + return PTR_ERR(trans); + btrfs_commit_transaction(trans); + return ret; } @@ -1269,6 +1373,10 @@ int btrfs_quota_disable(struct btrfs_fs_info *fs_info) clear_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags); btrfs_qgroup_wait_for_completion(fs_info, false); + ret = flush_reservations(fs_info); + if (ret) + goto out_unlock_cleaner; + /* * 1 For the root item * @@ -1295,6 +1403,7 @@ int btrfs_quota_disable(struct btrfs_fs_info *fs_info) quota_root = fs_info->quota_root; fs_info->quota_root = NULL; fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_ON; + fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_SIMPLE_MODE; fs_info->qgroup_drop_subtree_thres = BTRFS_MAX_LEVEL; spin_unlock(&fs_info->qgroup_lock); @@ -1329,7 +1438,8 @@ out: if (ret && trans) btrfs_end_transaction(trans); else if (trans) - ret = btrfs_end_transaction(trans); + ret = btrfs_commit_transaction(trans); +out_unlock_cleaner: mutex_unlock(&fs_info->cleaner_mutex); return ret; @@ -1342,6 +1452,24 @@ static void qgroup_dirty(struct btrfs_fs_info *fs_info, list_add(&qgroup->dirty, &fs_info->dirty_qgroups); } +static void qgroup_iterator_add(struct list_head *head, struct btrfs_qgroup *qgroup) +{ + if (!list_empty(&qgroup->iterator)) + return; + + list_add_tail(&qgroup->iterator, head); +} + +static void qgroup_iterator_clean(struct list_head *head) +{ + while (!list_empty(head)) { + struct btrfs_qgroup *qgroup; + + qgroup = list_first_entry(head, struct btrfs_qgroup, iterator); + list_del_init(&qgroup->iterator); + } +} + /* * The easy accounting, we're updating qgroup relationship whose child qgroup * only has exclusive extents. @@ -1356,14 +1484,12 @@ static void qgroup_dirty(struct btrfs_fs_info *fs_info, * * Caller should hold fs_info->qgroup_lock. */ -static int __qgroup_excl_accounting(struct btrfs_fs_info *fs_info, - struct ulist *tmp, u64 ref_root, +static int __qgroup_excl_accounting(struct btrfs_fs_info *fs_info, u64 ref_root, struct btrfs_qgroup *src, int sign) { struct btrfs_qgroup *qgroup; - struct btrfs_qgroup_list *glist; - struct ulist_node *unode; - struct ulist_iterator uiter; + struct btrfs_qgroup *cur; + LIST_HEAD(qgroup_list); u64 num_bytes = src->excl; int ret = 0; @@ -1371,53 +1497,30 @@ static int __qgroup_excl_accounting(struct btrfs_fs_info *fs_info, if (!qgroup) goto out; - qgroup->rfer += sign * num_bytes; - qgroup->rfer_cmpr += sign * num_bytes; - - WARN_ON(sign < 0 && qgroup->excl < num_bytes); - qgroup->excl += sign * num_bytes; - qgroup->excl_cmpr += sign * num_bytes; - - if (sign > 0) - qgroup_rsv_add_by_qgroup(fs_info, qgroup, src); - else - qgroup_rsv_release_by_qgroup(fs_info, qgroup, src); - - qgroup_dirty(fs_info, qgroup); - - /* Get all of the parent groups that contain this qgroup */ - list_for_each_entry(glist, &qgroup->groups, next_group) { - ret = ulist_add(tmp, glist->group->qgroupid, - qgroup_to_aux(glist->group), GFP_ATOMIC); - if (ret < 0) - goto out; - } + qgroup_iterator_add(&qgroup_list, qgroup); + list_for_each_entry(cur, &qgroup_list, iterator) { + struct btrfs_qgroup_list *glist; - /* Iterate all of the parents and adjust their reference counts */ - ULIST_ITER_INIT(&uiter); - while ((unode = ulist_next(tmp, &uiter))) { - qgroup = unode_aux_to_qgroup(unode); qgroup->rfer += sign * num_bytes; qgroup->rfer_cmpr += sign * num_bytes; + WARN_ON(sign < 0 && qgroup->excl < num_bytes); qgroup->excl += sign * num_bytes; + qgroup->excl_cmpr += sign * num_bytes; + if (sign > 0) qgroup_rsv_add_by_qgroup(fs_info, qgroup, src); else qgroup_rsv_release_by_qgroup(fs_info, qgroup, src); - qgroup->excl_cmpr += sign * num_bytes; qgroup_dirty(fs_info, qgroup); - /* Add any parents of the parents */ - list_for_each_entry(glist, &qgroup->groups, next_group) { - ret = ulist_add(tmp, glist->group->qgroupid, - qgroup_to_aux(glist->group), GFP_ATOMIC); - if (ret < 0) - goto out; - } + /* Append parent qgroups to @qgroup_list. */ + list_for_each_entry(glist, &qgroup->groups, next_group) + qgroup_iterator_add(&qgroup_list, glist->group); } ret = 0; out: + qgroup_iterator_clean(&qgroup_list); return ret; } @@ -1434,8 +1537,7 @@ out: * Return < 0 for other error. */ static int quick_update_accounting(struct btrfs_fs_info *fs_info, - struct ulist *tmp, u64 src, u64 dst, - int sign) + u64 src, u64 dst, int sign) { struct btrfs_qgroup *qgroup; int ret = 1; @@ -1446,8 +1548,7 @@ static int quick_update_accounting(struct btrfs_fs_info *fs_info, goto out; if (qgroup->excl == qgroup->rfer) { ret = 0; - err = __qgroup_excl_accounting(fs_info, tmp, dst, - qgroup, sign); + err = __qgroup_excl_accounting(fs_info, dst, qgroup, sign); if (err < 0) { ret = err; goto out; @@ -1459,28 +1560,19 @@ out: return ret; } -int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans, u64 src, - u64 dst) +int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans, u64 src, u64 dst) { struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_qgroup *parent; struct btrfs_qgroup *member; struct btrfs_qgroup_list *list; - struct ulist *tmp; - unsigned int nofs_flag; + struct btrfs_qgroup_list *prealloc = NULL; int ret = 0; /* Check the level of src and dst first */ if (btrfs_qgroup_level(src) >= btrfs_qgroup_level(dst)) return -EINVAL; - /* We hold a transaction handle open, must do a NOFS allocation. */ - nofs_flag = memalloc_nofs_save(); - tmp = ulist_alloc(GFP_KERNEL); - memalloc_nofs_restore(nofs_flag); - if (!tmp) - return -ENOMEM; - mutex_lock(&fs_info->qgroup_ioctl_lock); if (!fs_info->quota_root) { ret = -ENOTCONN; @@ -1501,6 +1593,11 @@ int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans, u64 src, } } + prealloc = kzalloc(sizeof(*list), GFP_NOFS); + if (!prealloc) { + ret = -ENOMEM; + goto out; + } ret = add_qgroup_relation_item(trans, src, dst); if (ret) goto out; @@ -1512,16 +1609,17 @@ int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans, u64 src, } spin_lock(&fs_info->qgroup_lock); - ret = __add_relation_rb(member, parent); + ret = __add_relation_rb(prealloc, member, parent); + prealloc = NULL; if (ret < 0) { spin_unlock(&fs_info->qgroup_lock); goto out; } - ret = quick_update_accounting(fs_info, tmp, src, dst, 1); + ret = quick_update_accounting(fs_info, src, dst, 1); spin_unlock(&fs_info->qgroup_lock); out: + kfree(prealloc); mutex_unlock(&fs_info->qgroup_ioctl_lock); - ulist_free(tmp); return ret; } @@ -1532,19 +1630,10 @@ static int __del_qgroup_relation(struct btrfs_trans_handle *trans, u64 src, struct btrfs_qgroup *parent; struct btrfs_qgroup *member; struct btrfs_qgroup_list *list; - struct ulist *tmp; bool found = false; - unsigned int nofs_flag; int ret = 0; int ret2; - /* We hold a transaction handle open, must do a NOFS allocation. */ - nofs_flag = memalloc_nofs_save(); - tmp = ulist_alloc(GFP_KERNEL); - memalloc_nofs_restore(nofs_flag); - if (!tmp) - return -ENOMEM; - if (!fs_info->quota_root) { ret = -ENOTCONN; goto out; @@ -1582,11 +1671,10 @@ delete_item: if (found) { spin_lock(&fs_info->qgroup_lock); del_relation_rb(fs_info, src, dst); - ret = quick_update_accounting(fs_info, tmp, src, dst, -1); + ret = quick_update_accounting(fs_info, src, dst, -1); spin_unlock(&fs_info->qgroup_lock); } out: - ulist_free(tmp); return ret; } @@ -1608,8 +1696,12 @@ int btrfs_create_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid) struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_root *quota_root; struct btrfs_qgroup *qgroup; + struct btrfs_qgroup *prealloc = NULL; int ret = 0; + if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_DISABLED) + return 0; + mutex_lock(&fs_info->qgroup_ioctl_lock); if (!fs_info->quota_root) { ret = -ENOTCONN; @@ -1622,21 +1714,25 @@ int btrfs_create_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid) goto out; } + prealloc = kzalloc(sizeof(*prealloc), GFP_NOFS); + if (!prealloc) { + ret = -ENOMEM; + goto out; + } + ret = add_qgroup_item(trans, quota_root, qgroupid); if (ret) goto out; spin_lock(&fs_info->qgroup_lock); - qgroup = add_qgroup_rb(fs_info, qgroupid); + qgroup = add_qgroup_rb(fs_info, prealloc, qgroupid); spin_unlock(&fs_info->qgroup_lock); + prealloc = NULL; - if (IS_ERR(qgroup)) { - ret = PTR_ERR(qgroup); - goto out; - } ret = btrfs_sysfs_add_one_qgroup(fs_info, qgroup); out: mutex_unlock(&fs_info->qgroup_ioctl_lock); + kfree(prealloc); return ret; } @@ -1771,6 +1867,17 @@ out: return ret; } +/* + * Inform qgroup to trace one dirty extent, its info is recorded in @record. + * So qgroup can account it at transaction committing time. + * + * No lock version, caller must acquire delayed ref lock and allocated memory, + * then call btrfs_qgroup_trace_extent_post() after exiting lock context. + * + * Return 0 for success insert + * Return >0 for existing record, caller can free @record safely. + * Error is not possible + */ int btrfs_qgroup_trace_extent_nolock(struct btrfs_fs_info *fs_info, struct btrfs_delayed_ref_root *delayed_refs, struct btrfs_qgroup_extent_record *record) @@ -1780,6 +1887,9 @@ int btrfs_qgroup_trace_extent_nolock(struct btrfs_fs_info *fs_info, struct btrfs_qgroup_extent_record *entry; u64 bytenr = record->bytenr; + if (!btrfs_qgroup_full_accounting(fs_info)) + return 1; + lockdep_assert_held(&delayed_refs->lock); trace_btrfs_qgroup_trace_extent(fs_info, record); @@ -1806,12 +1916,35 @@ int btrfs_qgroup_trace_extent_nolock(struct btrfs_fs_info *fs_info, return 0; } +/* + * Post handler after qgroup_trace_extent_nolock(). + * + * NOTE: Current qgroup does the expensive backref walk at transaction + * committing time with TRANS_STATE_COMMIT_DOING, this blocks incoming + * new transaction. + * This is designed to allow btrfs_find_all_roots() to get correct new_roots + * result. + * + * However for old_roots there is no need to do backref walk at that time, + * since we search commit roots to walk backref and result will always be + * correct. + * + * Due to the nature of no lock version, we can't do backref there. + * So we must call btrfs_qgroup_trace_extent_post() after exiting + * spinlock context. + * + * TODO: If we can fix and prove btrfs_find_all_roots() can get correct result + * using current root, then we can move all expensive backref walk out of + * transaction committing, but not now as qgroup accounting will be wrong again. + */ int btrfs_qgroup_trace_extent_post(struct btrfs_trans_handle *trans, struct btrfs_qgroup_extent_record *qrecord) { struct btrfs_backref_walk_ctx ctx = { 0 }; int ret; + if (!btrfs_qgroup_full_accounting(trans->fs_info)) + return 0; /* * We are always called in a context where we are already holding a * transaction handle. Often we are called when adding a data delayed @@ -1859,6 +1992,19 @@ int btrfs_qgroup_trace_extent_post(struct btrfs_trans_handle *trans, return 0; } +/* + * Inform qgroup to trace one dirty extent, specified by @bytenr and + * @num_bytes. + * So qgroup can account it at commit trans time. + * + * Better encapsulated version, with memory allocation and backref walk for + * commit roots. + * So this can sleep. + * + * Return 0 if the operation is done. + * Return <0 for error, like memory allocation failure or invalid parameter + * (NULL trans) + */ int btrfs_qgroup_trace_extent(struct btrfs_trans_handle *trans, u64 bytenr, u64 num_bytes) { @@ -1867,8 +2013,7 @@ int btrfs_qgroup_trace_extent(struct btrfs_trans_handle *trans, u64 bytenr, struct btrfs_delayed_ref_root *delayed_refs; int ret; - if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) - || bytenr == 0 || num_bytes == 0) + if (!btrfs_qgroup_full_accounting(fs_info) || bytenr == 0 || num_bytes == 0) return 0; record = kzalloc(sizeof(*record), GFP_NOFS); if (!record) @@ -1889,6 +2034,12 @@ int btrfs_qgroup_trace_extent(struct btrfs_trans_handle *trans, u64 bytenr, return btrfs_qgroup_trace_extent_post(trans, record); } +/* + * Inform qgroup to trace all leaf items of data + * + * Return 0 for success + * Return <0 for error(ENOMEM) + */ int btrfs_qgroup_trace_leaf_items(struct btrfs_trans_handle *trans, struct extent_buffer *eb) { @@ -1900,7 +2051,7 @@ int btrfs_qgroup_trace_leaf_items(struct btrfs_trans_handle *trans, u64 bytenr, num_bytes; /* We can be called directly from walk_up_proc() */ - if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) + if (!btrfs_qgroup_full_accounting(fs_info)) return 0; for (i = 0; i < nr; i++) { @@ -2276,7 +2427,7 @@ static int qgroup_trace_subtree_swap(struct btrfs_trans_handle *trans, int level; int ret; - if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) + if (!btrfs_qgroup_full_accounting(fs_info)) return 0; /* Wrong parameter order */ @@ -2319,6 +2470,16 @@ out: return ret; } +/* + * Inform qgroup to trace a whole subtree, including all its child tree + * blocks and data. + * The root tree block is specified by @root_eb. + * + * Normally used by relocation(tree block swap) and subvolume deletion. + * + * Return 0 for success + * Return <0 for error(ENOMEM or tree search error) + */ int btrfs_qgroup_trace_subtree(struct btrfs_trans_handle *trans, struct extent_buffer *root_eb, u64 root_gen, int root_level) @@ -2333,7 +2494,7 @@ int btrfs_qgroup_trace_subtree(struct btrfs_trans_handle *trans, BUG_ON(root_level < 0 || root_level >= BTRFS_MAX_LEVEL); BUG_ON(root_eb == NULL); - if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) + if (!btrfs_qgroup_full_accounting(fs_info)) return 0; spin_lock(&fs_info->qgroup_lock); @@ -2445,62 +2606,64 @@ out: return ret; } +static void qgroup_iterator_nested_add(struct list_head *head, struct btrfs_qgroup *qgroup) +{ + if (!list_empty(&qgroup->nested_iterator)) + return; + + list_add_tail(&qgroup->nested_iterator, head); +} + +static void qgroup_iterator_nested_clean(struct list_head *head) +{ + while (!list_empty(head)) { + struct btrfs_qgroup *qgroup; + + qgroup = list_first_entry(head, struct btrfs_qgroup, nested_iterator); + list_del_init(&qgroup->nested_iterator); + } +} + #define UPDATE_NEW 0 #define UPDATE_OLD 1 /* * Walk all of the roots that points to the bytenr and adjust their refcnts. */ -static int qgroup_update_refcnt(struct btrfs_fs_info *fs_info, - struct ulist *roots, struct ulist *tmp, - struct ulist *qgroups, u64 seq, int update_old) +static void qgroup_update_refcnt(struct btrfs_fs_info *fs_info, + struct ulist *roots, struct list_head *qgroups, + u64 seq, int update_old) { struct ulist_node *unode; struct ulist_iterator uiter; - struct ulist_node *tmp_unode; - struct ulist_iterator tmp_uiter; struct btrfs_qgroup *qg; - int ret = 0; if (!roots) - return 0; + return; ULIST_ITER_INIT(&uiter); while ((unode = ulist_next(roots, &uiter))) { + LIST_HEAD(tmp); + qg = find_qgroup_rb(fs_info, unode->val); if (!qg) continue; - ulist_reinit(tmp); - ret = ulist_add(qgroups, qg->qgroupid, qgroup_to_aux(qg), - GFP_ATOMIC); - if (ret < 0) - return ret; - ret = ulist_add(tmp, qg->qgroupid, qgroup_to_aux(qg), GFP_ATOMIC); - if (ret < 0) - return ret; - ULIST_ITER_INIT(&tmp_uiter); - while ((tmp_unode = ulist_next(tmp, &tmp_uiter))) { + qgroup_iterator_nested_add(qgroups, qg); + qgroup_iterator_add(&tmp, qg); + list_for_each_entry(qg, &tmp, iterator) { struct btrfs_qgroup_list *glist; - qg = unode_aux_to_qgroup(tmp_unode); if (update_old) btrfs_qgroup_update_old_refcnt(qg, seq, 1); else btrfs_qgroup_update_new_refcnt(qg, seq, 1); + list_for_each_entry(glist, &qg->groups, next_group) { - ret = ulist_add(qgroups, glist->group->qgroupid, - qgroup_to_aux(glist->group), - GFP_ATOMIC); - if (ret < 0) - return ret; - ret = ulist_add(tmp, glist->group->qgroupid, - qgroup_to_aux(glist->group), - GFP_ATOMIC); - if (ret < 0) - return ret; + qgroup_iterator_nested_add(qgroups, glist->group); + qgroup_iterator_add(&tmp, glist->group); } } + qgroup_iterator_clean(&tmp); } - return 0; } /* @@ -2539,22 +2702,16 @@ static int qgroup_update_refcnt(struct btrfs_fs_info *fs_info, * But this time we don't need to consider other things, the codes and logic * is easy to understand now. */ -static int qgroup_update_counters(struct btrfs_fs_info *fs_info, - struct ulist *qgroups, - u64 nr_old_roots, - u64 nr_new_roots, - u64 num_bytes, u64 seq) +static void qgroup_update_counters(struct btrfs_fs_info *fs_info, + struct list_head *qgroups, u64 nr_old_roots, + u64 nr_new_roots, u64 num_bytes, u64 seq) { - struct ulist_node *unode; - struct ulist_iterator uiter; struct btrfs_qgroup *qg; - u64 cur_new_count, cur_old_count; - ULIST_ITER_INIT(&uiter); - while ((unode = ulist_next(qgroups, &uiter))) { + list_for_each_entry(qg, qgroups, nested_iterator) { + u64 cur_new_count, cur_old_count; bool dirty = false; - qg = unode_aux_to_qgroup(unode); cur_old_count = btrfs_qgroup_get_old_refcnt(qg, seq); cur_new_count = btrfs_qgroup_get_new_refcnt(qg, seq); @@ -2625,7 +2782,6 @@ static int qgroup_update_counters(struct btrfs_fs_info *fs_info, if (dirty) qgroup_dirty(fs_info, qg); } - return 0; } /* @@ -2662,8 +2818,7 @@ int btrfs_qgroup_account_extent(struct btrfs_trans_handle *trans, u64 bytenr, struct ulist *new_roots) { struct btrfs_fs_info *fs_info = trans->fs_info; - struct ulist *qgroups = NULL; - struct ulist *tmp = NULL; + LIST_HEAD(qgroups); u64 seq; u64 nr_new_roots = 0; u64 nr_old_roots = 0; @@ -2673,7 +2828,7 @@ int btrfs_qgroup_account_extent(struct btrfs_trans_handle *trans, u64 bytenr, * If quotas get disabled meanwhile, the resources need to be freed and * we can't just exit here. */ - if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) || + if (!btrfs_qgroup_full_accounting(fs_info) || fs_info->qgroup_flags & BTRFS_QGROUP_RUNTIME_FLAG_NO_ACCOUNTING) goto out_free; @@ -2697,17 +2852,6 @@ int btrfs_qgroup_account_extent(struct btrfs_trans_handle *trans, u64 bytenr, trace_btrfs_qgroup_account_extent(fs_info, trans->transid, bytenr, num_bytes, nr_old_roots, nr_new_roots); - qgroups = ulist_alloc(GFP_NOFS); - if (!qgroups) { - ret = -ENOMEM; - goto out_free; - } - tmp = ulist_alloc(GFP_NOFS); - if (!tmp) { - ret = -ENOMEM; - goto out_free; - } - mutex_lock(&fs_info->qgroup_rescan_lock); if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) { if (fs_info->qgroup_rescan_progress.objectid <= bytenr) { @@ -2722,29 +2866,27 @@ int btrfs_qgroup_account_extent(struct btrfs_trans_handle *trans, u64 bytenr, seq = fs_info->qgroup_seq; /* Update old refcnts using old_roots */ - ret = qgroup_update_refcnt(fs_info, old_roots, tmp, qgroups, seq, - UPDATE_OLD); - if (ret < 0) - goto out; + qgroup_update_refcnt(fs_info, old_roots, &qgroups, seq, UPDATE_OLD); /* Update new refcnts using new_roots */ - ret = qgroup_update_refcnt(fs_info, new_roots, tmp, qgroups, seq, - UPDATE_NEW); - if (ret < 0) - goto out; + qgroup_update_refcnt(fs_info, new_roots, &qgroups, seq, UPDATE_NEW); - qgroup_update_counters(fs_info, qgroups, nr_old_roots, nr_new_roots, + qgroup_update_counters(fs_info, &qgroups, nr_old_roots, nr_new_roots, num_bytes, seq); /* + * We're done using the iterator, release all its qgroups while holding + * fs_info->qgroup_lock so that we don't race with btrfs_remove_qgroup() + * and trigger use-after-free accesses to qgroups. + */ + qgroup_iterator_nested_clean(&qgroups); + + /* * Bump qgroup_seq to avoid seq overlap */ fs_info->qgroup_seq += max(nr_old_roots, nr_new_roots) + 1; -out: spin_unlock(&fs_info->qgroup_lock); out_free: - ulist_free(tmp); - ulist_free(qgroups); ulist_free(old_roots); ulist_free(new_roots); return ret; @@ -2761,6 +2903,9 @@ int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans) u64 qgroup_to_skip; int ret = 0; + if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_SIMPLE) + return 0; + delayed_refs = &trans->transaction->delayed_refs; qgroup_to_skip = delayed_refs->qgroup_to_skip; while ((node = rb_first(&delayed_refs->dirty_extent_root))) { @@ -2876,7 +3021,7 @@ int btrfs_run_qgroups(struct btrfs_trans_handle *trans) qgroup_mark_inconsistent(fs_info); spin_lock(&fs_info->qgroup_lock); } - if (test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) + if (btrfs_qgroup_enabled(fs_info)) fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_ON; else fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_ON; @@ -2889,6 +3034,47 @@ int btrfs_run_qgroups(struct btrfs_trans_handle *trans) return ret; } +static int qgroup_auto_inherit(struct btrfs_fs_info *fs_info, + u64 inode_rootid, + struct btrfs_qgroup_inherit **inherit) +{ + int i = 0; + u64 num_qgroups = 0; + struct btrfs_qgroup *inode_qg; + struct btrfs_qgroup_list *qg_list; + struct btrfs_qgroup_inherit *res; + size_t struct_sz; + u64 *qgids; + + if (*inherit) + return -EEXIST; + + inode_qg = find_qgroup_rb(fs_info, inode_rootid); + if (!inode_qg) + return -ENOENT; + + num_qgroups = list_count_nodes(&inode_qg->groups); + + if (!num_qgroups) + return 0; + + struct_sz = struct_size(res, qgroups, num_qgroups); + if (struct_sz == SIZE_MAX) + return -ERANGE; + + res = kzalloc(struct_sz, GFP_NOFS); + if (!res) + return -ENOMEM; + res->num_qgroups = num_qgroups; + qgids = res->qgroups; + + list_for_each_entry(qg_list, &inode_qg->groups, next_group) + qgids[i] = qg_list->group->qgroupid; + + *inherit = res; + return 0; +} + /* * Copy the accounting information between qgroups. This is necessary * when a snapshot or a subvolume is created. Throwing an error will @@ -2896,7 +3082,8 @@ int btrfs_run_qgroups(struct btrfs_trans_handle *trans) * when a readonly fs is a reasonable outcome. */ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, u64 srcid, - u64 objectid, struct btrfs_qgroup_inherit *inherit) + u64 objectid, u64 inode_rootid, + struct btrfs_qgroup_inherit *inherit) { int ret = 0; int i; @@ -2906,10 +3093,17 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, u64 srcid, struct btrfs_root *quota_root; struct btrfs_qgroup *srcgroup; struct btrfs_qgroup *dstgroup; + struct btrfs_qgroup *prealloc; + struct btrfs_qgroup_list **qlist_prealloc = NULL; + bool free_inherit = false; bool need_rescan = false; u32 level_size = 0; u64 nums; + prealloc = kzalloc(sizeof(*prealloc), GFP_NOFS); + if (!prealloc) + return -ENOMEM; + /* * There are only two callers of this function. * @@ -2929,7 +3123,7 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, u64 srcid, if (!committing) mutex_lock(&fs_info->qgroup_ioctl_lock); - if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) + if (!btrfs_qgroup_enabled(fs_info)) goto out; quota_root = fs_info->quota_root; @@ -2938,6 +3132,13 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, u64 srcid, goto out; } + if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_SIMPLE && !inherit) { + ret = qgroup_auto_inherit(fs_info, inode_rootid, &inherit); + if (ret) + goto out; + free_inherit = true; + } + if (inherit) { i_qgroups = (u64 *)(inherit + 1); nums = inherit->num_qgroups + 2 * inherit->num_ref_copies + @@ -2982,16 +3183,28 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, u64 srcid, goto out; } ret = 0; - } + qlist_prealloc = kcalloc(inherit->num_qgroups, + sizeof(struct btrfs_qgroup_list *), + GFP_NOFS); + if (!qlist_prealloc) { + ret = -ENOMEM; + goto out; + } + for (int i = 0; i < inherit->num_qgroups; i++) { + qlist_prealloc[i] = kzalloc(sizeof(struct btrfs_qgroup_list), + GFP_NOFS); + if (!qlist_prealloc[i]) { + ret = -ENOMEM; + goto out; + } + } + } spin_lock(&fs_info->qgroup_lock); - dstgroup = add_qgroup_rb(fs_info, objectid); - if (IS_ERR(dstgroup)) { - ret = PTR_ERR(dstgroup); - goto unlock; - } + dstgroup = add_qgroup_rb(fs_info, prealloc, objectid); + prealloc = NULL; if (inherit && inherit->flags & BTRFS_QGROUP_INHERIT_SET_LIMITS) { dstgroup->lim_flags = inherit->lim.flags; @@ -3003,7 +3216,7 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, u64 srcid, qgroup_dirty(fs_info, dstgroup); } - if (srcid) { + if (srcid && btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_FULL) { srcgroup = find_qgroup_rb(fs_info, srcid); if (!srcgroup) goto unlock; @@ -3038,7 +3251,9 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, u64 srcid, i_qgroups = (u64 *)(inherit + 1); for (i = 0; i < inherit->num_qgroups; ++i) { if (*i_qgroups) { - ret = add_relation_rb(fs_info, objectid, *i_qgroups); + ret = add_relation_rb(fs_info, qlist_prealloc[i], objectid, + *i_qgroups); + qlist_prealloc[i] = NULL; if (ret) goto unlock; } @@ -3102,6 +3317,14 @@ out: mutex_unlock(&fs_info->qgroup_ioctl_lock); if (need_rescan) qgroup_mark_inconsistent(fs_info); + if (qlist_prealloc) { + for (int i = 0; i < inherit->num_qgroups; i++) + kfree(qlist_prealloc[i]); + kfree(qlist_prealloc); + } + if (free_inherit) + kfree(inherit); + kfree(prealloc); return ret; } @@ -3125,8 +3348,7 @@ static int qgroup_reserve(struct btrfs_root *root, u64 num_bytes, bool enforce, struct btrfs_fs_info *fs_info = root->fs_info; u64 ref_root = root->root_key.objectid; int ret = 0; - struct ulist_node *unode; - struct ulist_iterator uiter; + LIST_HEAD(qgroup_list); if (!is_fstree(ref_root)) return 0; @@ -3146,49 +3368,28 @@ static int qgroup_reserve(struct btrfs_root *root, u64 num_bytes, bool enforce, if (!qgroup) goto out; - /* - * in a first step, we check all affected qgroups if any limits would - * be exceeded - */ - ulist_reinit(fs_info->qgroup_ulist); - ret = ulist_add(fs_info->qgroup_ulist, qgroup->qgroupid, - qgroup_to_aux(qgroup), GFP_ATOMIC); - if (ret < 0) - goto out; - ULIST_ITER_INIT(&uiter); - while ((unode = ulist_next(fs_info->qgroup_ulist, &uiter))) { - struct btrfs_qgroup *qg; + qgroup_iterator_add(&qgroup_list, qgroup); + list_for_each_entry(qgroup, &qgroup_list, iterator) { struct btrfs_qgroup_list *glist; - qg = unode_aux_to_qgroup(unode); - - if (enforce && !qgroup_check_limits(qg, num_bytes)) { + if (enforce && !qgroup_check_limits(qgroup, num_bytes)) { ret = -EDQUOT; goto out; } - list_for_each_entry(glist, &qg->groups, next_group) { - ret = ulist_add(fs_info->qgroup_ulist, - glist->group->qgroupid, - qgroup_to_aux(glist->group), GFP_ATOMIC); - if (ret < 0) - goto out; - } + list_for_each_entry(glist, &qgroup->groups, next_group) + qgroup_iterator_add(&qgroup_list, glist->group); } + ret = 0; /* * no limits exceeded, now record the reservation into all qgroups */ - ULIST_ITER_INIT(&uiter); - while ((unode = ulist_next(fs_info->qgroup_ulist, &uiter))) { - struct btrfs_qgroup *qg; - - qg = unode_aux_to_qgroup(unode); - - qgroup_rsv_add(fs_info, qg, num_bytes, type); - } + list_for_each_entry(qgroup, &qgroup_list, iterator) + qgroup_rsv_add(fs_info, qgroup, num_bytes, type); out: + qgroup_iterator_clean(&qgroup_list); spin_unlock(&fs_info->qgroup_lock); return ret; } @@ -3207,9 +3408,7 @@ void btrfs_qgroup_free_refroot(struct btrfs_fs_info *fs_info, enum btrfs_qgroup_rsv_type type) { struct btrfs_qgroup *qgroup; - struct ulist_node *unode; - struct ulist_iterator uiter; - int ret = 0; + LIST_HEAD(qgroup_list); if (!is_fstree(ref_root)) return; @@ -3237,30 +3436,17 @@ void btrfs_qgroup_free_refroot(struct btrfs_fs_info *fs_info, */ num_bytes = qgroup->rsv.values[type]; - ulist_reinit(fs_info->qgroup_ulist); - ret = ulist_add(fs_info->qgroup_ulist, qgroup->qgroupid, - qgroup_to_aux(qgroup), GFP_ATOMIC); - if (ret < 0) - goto out; - ULIST_ITER_INIT(&uiter); - while ((unode = ulist_next(fs_info->qgroup_ulist, &uiter))) { - struct btrfs_qgroup *qg; + qgroup_iterator_add(&qgroup_list, qgroup); + list_for_each_entry(qgroup, &qgroup_list, iterator) { struct btrfs_qgroup_list *glist; - qg = unode_aux_to_qgroup(unode); - - qgroup_rsv_release(fs_info, qg, num_bytes, type); - - list_for_each_entry(glist, &qg->groups, next_group) { - ret = ulist_add(fs_info->qgroup_ulist, - glist->group->qgroupid, - qgroup_to_aux(glist->group), GFP_ATOMIC); - if (ret < 0) - goto out; + qgroup_rsv_release(fs_info, qgroup, num_bytes, type); + list_for_each_entry(glist, &qgroup->groups, next_group) { + qgroup_iterator_add(&qgroup_list, glist->group); } } - out: + qgroup_iterator_clean(&qgroup_list); spin_unlock(&fs_info->qgroup_lock); } @@ -3295,6 +3481,9 @@ static int qgroup_rescan_leaf(struct btrfs_trans_handle *trans, int slot; int ret; + if (!btrfs_qgroup_full_accounting(fs_info)) + return 1; + mutex_lock(&fs_info->qgroup_rescan_lock); extent_root = btrfs_extent_root(fs_info, fs_info->qgroup_rescan_progress.objectid); @@ -3375,10 +3564,15 @@ out: static bool rescan_should_stop(struct btrfs_fs_info *fs_info) { - return btrfs_fs_closing(fs_info) || - test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state) || - !test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) || - fs_info->qgroup_flags & BTRFS_QGROUP_RUNTIME_FLAG_CANCEL_RESCAN; + if (btrfs_fs_closing(fs_info)) + return true; + if (test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state)) + return true; + if (!btrfs_qgroup_enabled(fs_info)) + return true; + if (fs_info->qgroup_flags & BTRFS_QGROUP_RUNTIME_FLAG_CANCEL_RESCAN) + return true; + return false; } static void btrfs_qgroup_rescan_worker(struct btrfs_work *work) @@ -3392,6 +3586,9 @@ static void btrfs_qgroup_rescan_worker(struct btrfs_work *work) bool stopped = false; bool did_leaf_rescans = false; + if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_SIMPLE) + return; + path = btrfs_alloc_path(); if (!path) goto out; @@ -3495,6 +3692,11 @@ qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid, { int ret = 0; + if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_SIMPLE) { + btrfs_warn(fs_info, "qgroup rescan init failed, running in simple mode"); + return -EINVAL; + } + if (!init_flags) { /* we're resuming qgroup rescan at mount time */ if (!(fs_info->qgroup_flags & @@ -3525,7 +3727,7 @@ qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid, btrfs_warn(fs_info, "qgroup rescan init failed, qgroup is not enabled"); ret = -EINVAL; - } else if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) { + } else if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_DISABLED) { /* Quota disable is in progress */ ret = -EBUSY; } @@ -3546,7 +3748,7 @@ qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid, mutex_unlock(&fs_info->qgroup_rescan_lock); btrfs_init_work(&fs_info->qgroup_rescan_work, - btrfs_qgroup_rescan_worker, NULL, NULL); + btrfs_qgroup_rescan_worker, NULL); return 0; } @@ -3590,15 +3792,16 @@ btrfs_qgroup_rescan(struct btrfs_fs_info *fs_info) * going to clear all tracking information for a clean start. */ - trans = btrfs_join_transaction(fs_info->fs_root); - if (IS_ERR(trans)) { + trans = btrfs_attach_transaction_barrier(fs_info->fs_root); + if (IS_ERR(trans) && trans != ERR_PTR(-ENOENT)) { fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN; return PTR_ERR(trans); - } - ret = btrfs_commit_transaction(trans); - if (ret) { - fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN; - return ret; + } else if (trans != ERR_PTR(-ENOENT)) { + ret = btrfs_commit_transaction(trans); + if (ret) { + fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN; + return ret; + } } qgroup_rescan_zero_tracking(fs_info); @@ -3757,9 +3960,11 @@ static int try_flush_qgroup(struct btrfs_root *root) goto out; btrfs_wait_ordered_extents(root, U64_MAX, 0, (u64)-1); - trans = btrfs_join_transaction(root); + trans = btrfs_attach_transaction_barrier(root); if (IS_ERR(trans)) { ret = PTR_ERR(trans); + if (ret == -ENOENT) + ret = 0; goto out; } @@ -3781,7 +3986,7 @@ static int qgroup_reserve_data(struct btrfs_inode *inode, u64 to_reserve; int ret; - if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &root->fs_info->flags) || + if (btrfs_qgroup_mode(root->fs_info) == BTRFS_QGROUP_MODE_DISABLED || !is_fstree(root->root_key.objectid) || len == 0) return 0; @@ -3852,13 +4057,14 @@ int btrfs_qgroup_reserve_data(struct btrfs_inode *inode, /* Free ranges specified by @reserved, normally in error path */ static int qgroup_free_reserved_data(struct btrfs_inode *inode, - struct extent_changeset *reserved, u64 start, u64 len) + struct extent_changeset *reserved, + u64 start, u64 len, u64 *freed_ret) { struct btrfs_root *root = inode->root; struct ulist_node *unode; struct ulist_iterator uiter; struct extent_changeset changeset; - int freed = 0; + u64 freed = 0; int ret; extent_changeset_init(&changeset); @@ -3899,7 +4105,9 @@ static int qgroup_free_reserved_data(struct btrfs_inode *inode, } btrfs_qgroup_free_refroot(root->fs_info, root->root_key.objectid, freed, BTRFS_QGROUP_RSV_DATA); - ret = freed; + if (freed_ret) + *freed_ret = freed; + ret = 0; out: extent_changeset_release(&changeset); return ret; @@ -3907,19 +4115,23 @@ out: static int __btrfs_qgroup_release_data(struct btrfs_inode *inode, struct extent_changeset *reserved, u64 start, u64 len, - int free) + u64 *released, int free) { struct extent_changeset changeset; int trace_op = QGROUP_RELEASE; int ret; - if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &inode->root->fs_info->flags)) - return 0; + if (btrfs_qgroup_mode(inode->root->fs_info) == BTRFS_QGROUP_MODE_DISABLED) { + extent_changeset_init(&changeset); + return clear_record_extent_bits(&inode->io_tree, start, + start + len - 1, + EXTENT_QGROUP_RESERVED, &changeset); + } /* In release case, we shouldn't have @reserved */ WARN_ON(!free && reserved); if (free && reserved) - return qgroup_free_reserved_data(inode, reserved, start, len); + return qgroup_free_reserved_data(inode, reserved, start, len, released); extent_changeset_init(&changeset); ret = clear_record_extent_bits(&inode->io_tree, start, start + len -1, EXTENT_QGROUP_RESERVED, &changeset); @@ -3934,7 +4146,8 @@ static int __btrfs_qgroup_release_data(struct btrfs_inode *inode, btrfs_qgroup_free_refroot(inode->root->fs_info, inode->root->root_key.objectid, changeset.bytes_changed, BTRFS_QGROUP_RSV_DATA); - ret = changeset.bytes_changed; + if (released) + *released = changeset.bytes_changed; out: extent_changeset_release(&changeset); return ret; @@ -3953,9 +4166,10 @@ out: * NOTE: This function may sleep for memory allocation. */ int btrfs_qgroup_free_data(struct btrfs_inode *inode, - struct extent_changeset *reserved, u64 start, u64 len) + struct extent_changeset *reserved, + u64 start, u64 len, u64 *freed) { - return __btrfs_qgroup_release_data(inode, reserved, start, len, 1); + return __btrfs_qgroup_release_data(inode, reserved, start, len, freed, 1); } /* @@ -3973,9 +4187,9 @@ int btrfs_qgroup_free_data(struct btrfs_inode *inode, * * NOTE: This function may sleep for memory allocation. */ -int btrfs_qgroup_release_data(struct btrfs_inode *inode, u64 start, u64 len) +int btrfs_qgroup_release_data(struct btrfs_inode *inode, u64 start, u64 len, u64 *released) { - return __btrfs_qgroup_release_data(inode, NULL, start, len, 0); + return __btrfs_qgroup_release_data(inode, NULL, start, len, released, 0); } static void add_root_meta_rsv(struct btrfs_root *root, int num_bytes, @@ -4024,7 +4238,7 @@ int btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes, struct btrfs_fs_info *fs_info = root->fs_info; int ret; - if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) || + if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_DISABLED || !is_fstree(root->root_key.objectid) || num_bytes == 0) return 0; @@ -4061,11 +4275,15 @@ int __btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes, return btrfs_qgroup_reserve_meta(root, num_bytes, type, enforce); } +/* + * Per-transaction meta reservation should be all freed at transaction commit + * time + */ void btrfs_qgroup_free_meta_all_pertrans(struct btrfs_root *root) { struct btrfs_fs_info *fs_info = root->fs_info; - if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) || + if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_DISABLED || !is_fstree(root->root_key.objectid)) return; @@ -4081,7 +4299,7 @@ void __btrfs_qgroup_free_meta(struct btrfs_root *root, int num_bytes, { struct btrfs_fs_info *fs_info = root->fs_info; - if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) || + if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_DISABLED || !is_fstree(root->root_key.objectid)) return; @@ -4101,9 +4319,7 @@ static void qgroup_convert_meta(struct btrfs_fs_info *fs_info, u64 ref_root, int num_bytes) { struct btrfs_qgroup *qgroup; - struct ulist_node *unode; - struct ulist_iterator uiter; - int ret = 0; + LIST_HEAD(qgroup_list); if (num_bytes == 0) return; @@ -4114,39 +4330,36 @@ static void qgroup_convert_meta(struct btrfs_fs_info *fs_info, u64 ref_root, qgroup = find_qgroup_rb(fs_info, ref_root); if (!qgroup) goto out; - ulist_reinit(fs_info->qgroup_ulist); - ret = ulist_add(fs_info->qgroup_ulist, qgroup->qgroupid, - qgroup_to_aux(qgroup), GFP_ATOMIC); - if (ret < 0) - goto out; - ULIST_ITER_INIT(&uiter); - while ((unode = ulist_next(fs_info->qgroup_ulist, &uiter))) { - struct btrfs_qgroup *qg; - struct btrfs_qgroup_list *glist; - qg = unode_aux_to_qgroup(unode); + qgroup_iterator_add(&qgroup_list, qgroup); + list_for_each_entry(qgroup, &qgroup_list, iterator) { + struct btrfs_qgroup_list *glist; - qgroup_rsv_release(fs_info, qg, num_bytes, + qgroup_rsv_release(fs_info, qgroup, num_bytes, BTRFS_QGROUP_RSV_META_PREALLOC); - qgroup_rsv_add(fs_info, qg, num_bytes, - BTRFS_QGROUP_RSV_META_PERTRANS); - list_for_each_entry(glist, &qg->groups, next_group) { - ret = ulist_add(fs_info->qgroup_ulist, - glist->group->qgroupid, - qgroup_to_aux(glist->group), GFP_ATOMIC); - if (ret < 0) - goto out; - } + if (!sb_rdonly(fs_info->sb)) + qgroup_rsv_add(fs_info, qgroup, num_bytes, + BTRFS_QGROUP_RSV_META_PERTRANS); + + list_for_each_entry(glist, &qgroup->groups, next_group) + qgroup_iterator_add(&qgroup_list, glist->group); } out: + qgroup_iterator_clean(&qgroup_list); spin_unlock(&fs_info->qgroup_lock); } +/* + * Convert @num_bytes of META_PREALLOCATED reservation to META_PERTRANS. + * + * This is called when preallocated meta reservation needs to be used. + * Normally after btrfs_join_transaction() call. + */ void btrfs_qgroup_convert_reserved_meta(struct btrfs_root *root, int num_bytes) { struct btrfs_fs_info *fs_info = root->fs_info; - if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) || + if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_DISABLED || !is_fstree(root->root_key.objectid)) return; /* Same as btrfs_qgroup_free_meta_prealloc() */ @@ -4254,7 +4467,7 @@ int btrfs_qgroup_add_swapped_blocks(struct btrfs_trans_handle *trans, int level = btrfs_header_level(subvol_parent) - 1; int ret = 0; - if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) + if (!btrfs_qgroup_full_accounting(fs_info)) return 0; if (btrfs_node_ptr_generation(subvol_parent, subvol_slot) > @@ -4364,7 +4577,7 @@ int btrfs_qgroup_trace_subtree_after_cow(struct btrfs_trans_handle *trans, int ret = 0; int i; - if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) + if (!btrfs_qgroup_full_accounting(fs_info)) return 0; if (!is_fstree(root->root_key.objectid) || !root->reloc_root) return 0; @@ -4447,3 +4660,61 @@ void btrfs_qgroup_destroy_extent_records(struct btrfs_transaction *trans) } *root = RB_ROOT; } + +void btrfs_free_squota_rsv(struct btrfs_fs_info *fs_info, u64 root, u64 rsv_bytes) +{ + if (btrfs_qgroup_mode(fs_info) != BTRFS_QGROUP_MODE_SIMPLE) + return; + + if (!is_fstree(root)) + return; + + btrfs_qgroup_free_refroot(fs_info, root, rsv_bytes, BTRFS_QGROUP_RSV_DATA); +} + +int btrfs_record_squota_delta(struct btrfs_fs_info *fs_info, + struct btrfs_squota_delta *delta) +{ + int ret; + struct btrfs_qgroup *qgroup; + struct btrfs_qgroup *qg; + LIST_HEAD(qgroup_list); + u64 root = delta->root; + u64 num_bytes = delta->num_bytes; + const int sign = (delta->is_inc ? 1 : -1); + + if (btrfs_qgroup_mode(fs_info) != BTRFS_QGROUP_MODE_SIMPLE) + return 0; + + if (!is_fstree(root)) + return 0; + + /* If the extent predates enabling quotas, don't count it. */ + if (delta->generation < fs_info->qgroup_enable_gen) + return 0; + + spin_lock(&fs_info->qgroup_lock); + qgroup = find_qgroup_rb(fs_info, root); + if (!qgroup) { + ret = -ENOENT; + goto out; + } + + ret = 0; + qgroup_iterator_add(&qgroup_list, qgroup); + list_for_each_entry(qg, &qgroup_list, iterator) { + struct btrfs_qgroup_list *glist; + + qg->excl += num_bytes * sign; + qg->rfer += num_bytes * sign; + qgroup_dirty(fs_info, qg); + + list_for_each_entry(glist, &qg->groups, next_group) + qgroup_iterator_add(&qgroup_list, glist->group); + } + qgroup_iterator_clean(&qgroup_list); + +out: + spin_unlock(&fs_info->qgroup_lock); + return ret; +} diff --git a/fs/btrfs/qgroup.h b/fs/btrfs/qgroup.h index 7bffa10589d6..be18c862e64e 100644 --- a/fs/btrfs/qgroup.h +++ b/fs/btrfs/qgroup.h @@ -101,8 +101,15 @@ * subtree rescan for them. */ -#define BTRFS_QGROUP_RUNTIME_FLAG_CANCEL_RESCAN (1UL << 3) -#define BTRFS_QGROUP_RUNTIME_FLAG_NO_ACCOUNTING (1UL << 4) +/* + * These flags share the flags field of the btrfs_qgroup_status_item with the + * persisted flags defined in btrfs_tree.h. + * + * To minimize the chance of collision with new persisted status flags, these + * count backwards from the MSB. + */ +#define BTRFS_QGROUP_RUNTIME_FLAG_CANCEL_RESCAN (1ULL << 63) +#define BTRFS_QGROUP_RUNTIME_FLAG_NO_ACCOUNTING (1ULL << 62) /* * Record a dirty extent, and info qgroup to update quota on it @@ -220,6 +227,33 @@ struct btrfs_qgroup { struct list_head groups; /* groups this group is member of */ struct list_head members; /* groups that are members of this group */ struct list_head dirty; /* dirty groups */ + + /* + * For qgroup iteration usage. + * + * The iteration list should always be empty until qgroup_iterator_add() + * is called. And should be reset to empty after the iteration is + * finished. + */ + struct list_head iterator; + + /* + * For nested iterator usage. + * + * Here we support at most one level of nested iterator calls like: + * + * LIST_HEAD(all_qgroups); + * { + * LIST_HEAD(local_qgroups); + * qgroup_iterator_add(local_qgroups, qg); + * qgroup_iterator_nested_add(all_qgroups, qg); + * do_some_work(local_qgroups); + * qgroup_iterator_clean(local_qgroups); + * } + * do_some_work(all_qgroups); + * qgroup_iterator_nested_clean(all_qgroups); + */ + struct list_head nested_iterator; struct rb_node node; /* tree of qgroups */ /* @@ -235,6 +269,19 @@ struct btrfs_qgroup { struct kobject kobj; }; +struct btrfs_squota_delta { + /* The fstree root this delta counts against. */ + u64 root; + /* The number of bytes in the extent being counted. */ + u64 num_bytes; + /* The generation the extent was created in. */ + u64 generation; + /* Whether we are using or freeing the extent. */ + bool is_inc; + /* Whether the extent is data or metadata. */ + bool is_data; +}; + static inline u64 btrfs_qgroup_subvolid(u64 qgroupid) { return (qgroupid & ((1ULL << BTRFS_QGROUP_LEVEL_SHIFT) - 1)); @@ -249,14 +296,23 @@ enum { ENUM_BIT(QGROUP_FREE), }; -int btrfs_quota_enable(struct btrfs_fs_info *fs_info); +enum btrfs_qgroup_mode { + BTRFS_QGROUP_MODE_DISABLED, + BTRFS_QGROUP_MODE_FULL, + BTRFS_QGROUP_MODE_SIMPLE +}; + +enum btrfs_qgroup_mode btrfs_qgroup_mode(struct btrfs_fs_info *fs_info); +bool btrfs_qgroup_enabled(struct btrfs_fs_info *fs_info); +bool btrfs_qgroup_full_accounting(struct btrfs_fs_info *fs_info); +int btrfs_quota_enable(struct btrfs_fs_info *fs_info, + struct btrfs_ioctl_quota_ctl_args *quota_ctl_args); int btrfs_quota_disable(struct btrfs_fs_info *fs_info); int btrfs_qgroup_rescan(struct btrfs_fs_info *fs_info); void btrfs_qgroup_rescan_resume(struct btrfs_fs_info *fs_info); int btrfs_qgroup_wait_for_completion(struct btrfs_fs_info *fs_info, bool interruptible); -int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans, u64 src, - u64 dst); +int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans, u64 src, u64 dst); int btrfs_del_qgroup_relation(struct btrfs_trans_handle *trans, u64 src, u64 dst); int btrfs_create_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid); @@ -267,80 +323,16 @@ int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info); void btrfs_free_qgroup_config(struct btrfs_fs_info *fs_info); struct btrfs_delayed_extent_op; -/* - * Inform qgroup to trace one dirty extent, its info is recorded in @record. - * So qgroup can account it at transaction committing time. - * - * No lock version, caller must acquire delayed ref lock and allocated memory, - * then call btrfs_qgroup_trace_extent_post() after exiting lock context. - * - * Return 0 for success insert - * Return >0 for existing record, caller can free @record safely. - * Error is not possible - */ int btrfs_qgroup_trace_extent_nolock( struct btrfs_fs_info *fs_info, struct btrfs_delayed_ref_root *delayed_refs, struct btrfs_qgroup_extent_record *record); - -/* - * Post handler after qgroup_trace_extent_nolock(). - * - * NOTE: Current qgroup does the expensive backref walk at transaction - * committing time with TRANS_STATE_COMMIT_DOING, this blocks incoming - * new transaction. - * This is designed to allow btrfs_find_all_roots() to get correct new_roots - * result. - * - * However for old_roots there is no need to do backref walk at that time, - * since we search commit roots to walk backref and result will always be - * correct. - * - * Due to the nature of no lock version, we can't do backref there. - * So we must call btrfs_qgroup_trace_extent_post() after exiting - * spinlock context. - * - * TODO: If we can fix and prove btrfs_find_all_roots() can get correct result - * using current root, then we can move all expensive backref walk out of - * transaction committing, but not now as qgroup accounting will be wrong again. - */ int btrfs_qgroup_trace_extent_post(struct btrfs_trans_handle *trans, struct btrfs_qgroup_extent_record *qrecord); - -/* - * Inform qgroup to trace one dirty extent, specified by @bytenr and - * @num_bytes. - * So qgroup can account it at commit trans time. - * - * Better encapsulated version, with memory allocation and backref walk for - * commit roots. - * So this can sleep. - * - * Return 0 if the operation is done. - * Return <0 for error, like memory allocation failure or invalid parameter - * (NULL trans) - */ int btrfs_qgroup_trace_extent(struct btrfs_trans_handle *trans, u64 bytenr, u64 num_bytes); - -/* - * Inform qgroup to trace all leaf items of data - * - * Return 0 for success - * Return <0 for error(ENOMEM) - */ int btrfs_qgroup_trace_leaf_items(struct btrfs_trans_handle *trans, struct extent_buffer *eb); -/* - * Inform qgroup to trace a whole subtree, including all its child tree - * blocks and data. - * The root tree block is specified by @root_eb. - * - * Normally used by relocation(tree block swap) and subvolume deletion. - * - * Return 0 for success - * Return <0 for error(ENOMEM or tree search error) - */ int btrfs_qgroup_trace_subtree(struct btrfs_trans_handle *trans, struct extent_buffer *root_eb, u64 root_gen, int root_level); @@ -350,7 +342,8 @@ int btrfs_qgroup_account_extent(struct btrfs_trans_handle *trans, u64 bytenr, int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans); int btrfs_run_qgroups(struct btrfs_trans_handle *trans); int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, u64 srcid, - u64 objectid, struct btrfs_qgroup_inherit *inherit); + u64 objectid, u64 inode_rootid, + struct btrfs_qgroup_inherit *inherit); void btrfs_qgroup_free_refroot(struct btrfs_fs_info *fs_info, u64 ref_root, u64 num_bytes, enum btrfs_qgroup_rsv_type type); @@ -363,10 +356,10 @@ int btrfs_verify_qgroup_counts(struct btrfs_fs_info *fs_info, u64 qgroupid, /* New io_tree based accurate qgroup reserve API */ int btrfs_qgroup_reserve_data(struct btrfs_inode *inode, struct extent_changeset **reserved, u64 start, u64 len); -int btrfs_qgroup_release_data(struct btrfs_inode *inode, u64 start, u64 len); +int btrfs_qgroup_release_data(struct btrfs_inode *inode, u64 start, u64 len, u64 *released); int btrfs_qgroup_free_data(struct btrfs_inode *inode, struct extent_changeset *reserved, u64 start, - u64 len); + u64 len, u64 *freed); int btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes, enum btrfs_qgroup_rsv_type type, bool enforce); int __btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes, @@ -408,20 +401,8 @@ static inline void btrfs_qgroup_free_meta_prealloc(struct btrfs_root *root, BTRFS_QGROUP_RSV_META_PREALLOC); } -/* - * Per-transaction meta reservation should be all freed at transaction commit - * time - */ void btrfs_qgroup_free_meta_all_pertrans(struct btrfs_root *root); - -/* - * Convert @num_bytes of META_PREALLOCATED reservation to META_PERTRANS. - * - * This is called when preallocated meta reservation needs to be used. - * Normally after btrfs_join_transaction() call. - */ void btrfs_qgroup_convert_reserved_meta(struct btrfs_root *root, int num_bytes); - void btrfs_qgroup_check_reserved_leak(struct btrfs_inode *inode); /* btrfs_qgroup_swapped_blocks related functions */ @@ -439,5 +420,8 @@ int btrfs_qgroup_trace_subtree_after_cow(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *eb); void btrfs_qgroup_destroy_extent_records(struct btrfs_transaction *trans); bool btrfs_check_quota_leak(struct btrfs_fs_info *fs_info); +void btrfs_free_squota_rsv(struct btrfs_fs_info *fs_info, u64 root, u64 rsv_bytes); +int btrfs_record_squota_delta(struct btrfs_fs_info *fs_info, + struct btrfs_squota_delta *delta); #endif diff --git a/fs/btrfs/raid-stripe-tree.c b/fs/btrfs/raid-stripe-tree.c new file mode 100644 index 000000000000..9589362acfbf --- /dev/null +++ b/fs/btrfs/raid-stripe-tree.c @@ -0,0 +1,274 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2023 Western Digital Corporation or its affiliates. + */ + +#include <linux/btrfs_tree.h> +#include "ctree.h" +#include "fs.h" +#include "accessors.h" +#include "transaction.h" +#include "disk-io.h" +#include "raid-stripe-tree.h" +#include "volumes.h" +#include "misc.h" +#include "print-tree.h" + +int btrfs_delete_raid_extent(struct btrfs_trans_handle *trans, u64 start, u64 length) +{ + struct btrfs_fs_info *fs_info = trans->fs_info; + struct btrfs_root *stripe_root = fs_info->stripe_root; + struct btrfs_path *path; + struct btrfs_key key; + struct extent_buffer *leaf; + u64 found_start; + u64 found_end; + u64 end = start + length; + int slot; + int ret; + + if (!stripe_root) + return 0; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + while (1) { + key.objectid = start; + key.type = BTRFS_RAID_STRIPE_KEY; + key.offset = length; + + ret = btrfs_search_slot(trans, stripe_root, &key, path, -1, 1); + if (ret < 0) + break; + if (ret > 0) { + ret = 0; + if (path->slots[0] == 0) + break; + path->slots[0]--; + } + + leaf = path->nodes[0]; + slot = path->slots[0]; + btrfs_item_key_to_cpu(leaf, &key, slot); + found_start = key.objectid; + found_end = found_start + key.offset; + + /* That stripe ends before we start, we're done. */ + if (found_end <= start) + break; + + trace_btrfs_raid_extent_delete(fs_info, start, end, + found_start, found_end); + + ASSERT(found_start >= start && found_end <= end); + ret = btrfs_del_item(trans, stripe_root, path); + if (ret) + break; + + btrfs_release_path(path); + } + + btrfs_free_path(path); + return ret; +} + +static int btrfs_insert_one_raid_extent(struct btrfs_trans_handle *trans, + struct btrfs_io_context *bioc) +{ + struct btrfs_fs_info *fs_info = trans->fs_info; + struct btrfs_key stripe_key; + struct btrfs_root *stripe_root = fs_info->stripe_root; + const int num_stripes = btrfs_bg_type_to_factor(bioc->map_type); + u8 encoding = btrfs_bg_flags_to_raid_index(bioc->map_type); + struct btrfs_stripe_extent *stripe_extent; + const size_t item_size = struct_size(stripe_extent, strides, num_stripes); + int ret; + + stripe_extent = kzalloc(item_size, GFP_NOFS); + if (!stripe_extent) { + btrfs_abort_transaction(trans, -ENOMEM); + btrfs_end_transaction(trans); + return -ENOMEM; + } + + trace_btrfs_insert_one_raid_extent(fs_info, bioc->logical, bioc->size, + num_stripes); + btrfs_set_stack_stripe_extent_encoding(stripe_extent, encoding); + for (int i = 0; i < num_stripes; i++) { + u64 devid = bioc->stripes[i].dev->devid; + u64 physical = bioc->stripes[i].physical; + u64 length = bioc->stripes[i].length; + struct btrfs_raid_stride *raid_stride = &stripe_extent->strides[i]; + + if (length == 0) + length = bioc->size; + + btrfs_set_stack_raid_stride_devid(raid_stride, devid); + btrfs_set_stack_raid_stride_physical(raid_stride, physical); + } + + stripe_key.objectid = bioc->logical; + stripe_key.type = BTRFS_RAID_STRIPE_KEY; + stripe_key.offset = bioc->size; + + ret = btrfs_insert_item(trans, stripe_root, &stripe_key, stripe_extent, + item_size); + if (ret) + btrfs_abort_transaction(trans, ret); + + kfree(stripe_extent); + + return ret; +} + +int btrfs_insert_raid_extent(struct btrfs_trans_handle *trans, + struct btrfs_ordered_extent *ordered_extent) +{ + struct btrfs_io_context *bioc; + int ret; + + if (!btrfs_fs_incompat(trans->fs_info, RAID_STRIPE_TREE)) + return 0; + + list_for_each_entry(bioc, &ordered_extent->bioc_list, rst_ordered_entry) { + ret = btrfs_insert_one_raid_extent(trans, bioc); + if (ret) + return ret; + } + + while (!list_empty(&ordered_extent->bioc_list)) { + bioc = list_first_entry(&ordered_extent->bioc_list, + typeof(*bioc), rst_ordered_entry); + list_del(&bioc->rst_ordered_entry); + btrfs_put_bioc(bioc); + } + + return 0; +} + +int btrfs_get_raid_extent_offset(struct btrfs_fs_info *fs_info, + u64 logical, u64 *length, u64 map_type, + u32 stripe_index, struct btrfs_io_stripe *stripe) +{ + struct btrfs_root *stripe_root = fs_info->stripe_root; + struct btrfs_stripe_extent *stripe_extent; + struct btrfs_key stripe_key; + struct btrfs_key found_key; + struct btrfs_path *path; + struct extent_buffer *leaf; + const u64 end = logical + *length; + int num_stripes; + u8 encoding; + u64 offset; + u64 found_logical; + u64 found_length; + u64 found_end; + int slot; + int ret; + + stripe_key.objectid = logical; + stripe_key.type = BTRFS_RAID_STRIPE_KEY; + stripe_key.offset = 0; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + if (stripe->is_scrub) { + path->skip_locking = 1; + path->search_commit_root = 1; + } + + ret = btrfs_search_slot(NULL, stripe_root, &stripe_key, path, 0, 0); + if (ret < 0) + goto free_path; + if (ret) { + if (path->slots[0] != 0) + path->slots[0]--; + } + + while (1) { + leaf = path->nodes[0]; + slot = path->slots[0]; + + btrfs_item_key_to_cpu(leaf, &found_key, slot); + found_logical = found_key.objectid; + found_length = found_key.offset; + found_end = found_logical + found_length; + + if (found_logical > end) { + ret = -ENOENT; + goto out; + } + + if (in_range(logical, found_logical, found_length)) + break; + + ret = btrfs_next_item(stripe_root, path); + if (ret) + goto out; + } + + offset = logical - found_logical; + + /* + * If we have a logically contiguous, but physically non-continuous + * range, we need to split the bio. Record the length after which we + * must split the bio. + */ + if (end > found_end) + *length -= end - found_end; + + num_stripes = btrfs_num_raid_stripes(btrfs_item_size(leaf, slot)); + stripe_extent = btrfs_item_ptr(leaf, slot, struct btrfs_stripe_extent); + encoding = btrfs_stripe_extent_encoding(leaf, stripe_extent); + + if (encoding != btrfs_bg_flags_to_raid_index(map_type)) { + ret = -EUCLEAN; + btrfs_handle_fs_error(fs_info, ret, + "on-disk stripe encoding %d doesn't match RAID index %d", + encoding, + btrfs_bg_flags_to_raid_index(map_type)); + goto out; + } + + for (int i = 0; i < num_stripes; i++) { + struct btrfs_raid_stride *stride = &stripe_extent->strides[i]; + u64 devid = btrfs_raid_stride_devid(leaf, stride); + u64 physical = btrfs_raid_stride_physical(leaf, stride); + + if (devid != stripe->dev->devid) + continue; + + if ((map_type & BTRFS_BLOCK_GROUP_DUP) && stripe_index != i) + continue; + + stripe->physical = physical + offset; + + trace_btrfs_get_raid_extent_offset(fs_info, logical, *length, + stripe->physical, devid); + + ret = 0; + goto free_path; + } + + /* If we're here, we haven't found the requested devid in the stripe. */ + ret = -ENOENT; +out: + if (ret > 0) + ret = -ENOENT; + if (ret && ret != -EIO && !stripe->is_scrub) { + if (IS_ENABLED(CONFIG_BTRFS_DEBUG)) + btrfs_print_tree(leaf, 1); + btrfs_err(fs_info, + "cannot find raid-stripe for logical [%llu, %llu] devid %llu, profile %s", + logical, logical + *length, stripe->dev->devid, + btrfs_bg_type_to_raid_name(map_type)); + } +free_path: + btrfs_free_path(path); + + return ret; +} diff --git a/fs/btrfs/raid-stripe-tree.h b/fs/btrfs/raid-stripe-tree.h new file mode 100644 index 000000000000..cdb58b38fcb5 --- /dev/null +++ b/fs/btrfs/raid-stripe-tree.h @@ -0,0 +1,50 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2023 Western Digital Corporation or its affiliates. + */ + +#ifndef BTRFS_RAID_STRIPE_TREE_H +#define BTRFS_RAID_STRIPE_TREE_H + +#define BTRFS_RST_SUPP_BLOCK_GROUP_MASK (BTRFS_BLOCK_GROUP_DUP | \ + BTRFS_BLOCK_GROUP_RAID1_MASK | \ + BTRFS_BLOCK_GROUP_RAID0 | \ + BTRFS_BLOCK_GROUP_RAID10) + +struct btrfs_io_context; +struct btrfs_io_stripe; +struct btrfs_ordered_extent; +struct btrfs_trans_handle; + +int btrfs_delete_raid_extent(struct btrfs_trans_handle *trans, u64 start, u64 length); +int btrfs_get_raid_extent_offset(struct btrfs_fs_info *fs_info, + u64 logical, u64 *length, u64 map_type, + u32 stripe_index, struct btrfs_io_stripe *stripe); +int btrfs_insert_raid_extent(struct btrfs_trans_handle *trans, + struct btrfs_ordered_extent *ordered_extent); + +static inline bool btrfs_need_stripe_tree_update(struct btrfs_fs_info *fs_info, + u64 map_type) +{ + u64 type = map_type & BTRFS_BLOCK_GROUP_TYPE_MASK; + u64 profile = map_type & BTRFS_BLOCK_GROUP_PROFILE_MASK; + + if (!btrfs_fs_incompat(fs_info, RAID_STRIPE_TREE)) + return false; + + if (type != BTRFS_BLOCK_GROUP_DATA) + return false; + + if (profile & BTRFS_RST_SUPP_BLOCK_GROUP_MASK) + return true; + + return false; +} + +static inline int btrfs_num_raid_stripes(u32 item_size) +{ + return (item_size - offsetof(struct btrfs_stripe_extent, strides)) / + sizeof(struct btrfs_raid_stride); +} + +#endif diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index 0249ea52bb80..3e014b9370a3 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -584,8 +584,7 @@ static int rbio_can_merge(struct btrfs_raid_bio *last, if (last->operation == BTRFS_RBIO_PARITY_SCRUB) return 0; - if (last->operation == BTRFS_RBIO_REBUILD_MISSING || - last->operation == BTRFS_RBIO_READ_REBUILD) + if (last->operation == BTRFS_RBIO_READ_REBUILD) return 0; return 1; @@ -784,10 +783,7 @@ static noinline void unlock_stripe(struct btrfs_raid_bio *rbio) spin_unlock(&rbio->bio_list_lock); spin_unlock(&h->lock); - if (next->operation == BTRFS_RBIO_READ_REBUILD) - start_async_work(next, recover_rbio_work_locked); - else if (next->operation == BTRFS_RBIO_REBUILD_MISSING) { - steal_rbio(rbio, next); + if (next->operation == BTRFS_RBIO_READ_REBUILD) { start_async_work(next, recover_rbio_work_locked); } else if (next->operation == BTRFS_RBIO_WRITE) { steal_rbio(rbio, next); @@ -1517,11 +1513,11 @@ static void submit_read_wait_bio_list(struct btrfs_raid_bio *rbio, while ((bio = bio_list_pop(bio_list))) { bio->bi_end_io = raid_wait_read_end_io; - if (trace_raid56_scrub_read_recover_enabled()) { + if (trace_raid56_read_enabled()) { struct raid56_bio_trace_info trace_info = { 0 }; bio_get_trace_info(rbio, bio, &trace_info); - trace_raid56_scrub_read_recover(rbio, bio, &trace_info); + trace_raid56_read(rbio, bio, &trace_info); } submit_bio(bio); } @@ -1698,8 +1694,7 @@ static int verify_one_sector(struct btrfs_raid_bio *rbio, * If we're rebuilding a read, we have to use pages from the * bio list if possible. */ - if ((rbio->operation == BTRFS_RBIO_READ_REBUILD || - rbio->operation == BTRFS_RBIO_REBUILD_MISSING)) { + if (rbio->operation == BTRFS_RBIO_READ_REBUILD) { sector = sector_in_rbio(rbio, stripe_nr, sector_nr, 0); } else { sector = rbio_stripe_sector(rbio, stripe_nr, sector_nr); @@ -1763,8 +1758,7 @@ static int recover_vertical(struct btrfs_raid_bio *rbio, int sector_nr, * If we're rebuilding a read, we have to use pages from the * bio list if possible. */ - if ((rbio->operation == BTRFS_RBIO_READ_REBUILD || - rbio->operation == BTRFS_RBIO_REBUILD_MISSING)) { + if (rbio->operation == BTRFS_RBIO_READ_REBUILD) { sector = sector_in_rbio(rbio, stripe_nr, sector_nr, 0); } else { sector = rbio_stripe_sector(rbio, stripe_nr, sector_nr); @@ -1897,8 +1891,7 @@ static int recover_sectors(struct btrfs_raid_bio *rbio) goto out; } - if (rbio->operation == BTRFS_RBIO_READ_REBUILD || - rbio->operation == BTRFS_RBIO_REBUILD_MISSING) { + if (rbio->operation == BTRFS_RBIO_READ_REBUILD) { spin_lock(&rbio->bio_list_lock); set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags); spin_unlock(&rbio->bio_list_lock); @@ -2112,8 +2105,8 @@ static void fill_data_csums(struct btrfs_raid_bio *rbio) goto error; } - ret = btrfs_lookup_csums_bitmap(csum_root, start, start + len - 1, - rbio->csum_buf, rbio->csum_bitmap, false); + ret = btrfs_lookup_csums_bitmap(csum_root, NULL, start, start + len - 1, + rbio->csum_buf, rbio->csum_bitmap); if (ret < 0) goto error; if (bitmap_empty(rbio->csum_bitmap, len >> fs_info->sectorsize_bits)) @@ -2198,11 +2191,11 @@ static void submit_write_bios(struct btrfs_raid_bio *rbio, while ((bio = bio_list_pop(bio_list))) { bio->bi_end_io = raid_wait_write_end_io; - if (trace_raid56_write_stripe_enabled()) { + if (trace_raid56_write_enabled()) { struct raid56_bio_trace_info trace_info = { 0 }; bio_get_trace_info(rbio, bio, &trace_info); - trace_raid56_write_stripe(rbio, bio, &trace_info); + trace_raid56_write(rbio, bio, &trace_info); } submit_bio(bio); } diff --git a/fs/btrfs/raid56.h b/fs/btrfs/raid56.h index 0e84c9c9293f..45e6ff78316f 100644 --- a/fs/btrfs/raid56.h +++ b/fs/btrfs/raid56.h @@ -14,7 +14,6 @@ enum btrfs_rbio_ops { BTRFS_RBIO_WRITE, BTRFS_RBIO_READ_REBUILD, BTRFS_RBIO_PARITY_SCRUB, - BTRFS_RBIO_REBUILD_MISSING, }; struct btrfs_raid_bio { diff --git a/fs/btrfs/ref-verify.c b/fs/btrfs/ref-verify.c index 95d28497de7c..6486f0d7e993 100644 --- a/fs/btrfs/ref-verify.c +++ b/fs/btrfs/ref-verify.c @@ -485,6 +485,9 @@ static int process_extent_item(struct btrfs_fs_info *fs_info, ret = add_shared_data_ref(fs_info, offset, count, key->objectid, key->offset); break; + case BTRFS_EXTENT_OWNER_REF_KEY: + WARN_ON(!btrfs_fs_incompat(fs_info, SIMPLE_QUOTA)); + break; default: btrfs_err(fs_info, "invalid key type in iref"); ret = -EINVAL; @@ -652,7 +655,7 @@ static void dump_block_entry(struct btrfs_fs_info *fs_info, } /* - * btrfs_ref_tree_mod: called when we modify a ref for a bytenr + * Called when we modify a ref for a bytenr. * * This will add an action item to the given bytenr and do sanity checks to make * sure we haven't messed something up. If we are making a new allocation and @@ -681,10 +684,10 @@ int btrfs_ref_tree_mod(struct btrfs_fs_info *fs_info, if (generic_ref->type == BTRFS_REF_METADATA) { if (!parent) - ref_root = generic_ref->tree_ref.owning_root; + ref_root = generic_ref->tree_ref.ref_root; owner = generic_ref->tree_ref.level; } else if (!parent) { - ref_root = generic_ref->data_ref.owning_root; + ref_root = generic_ref->data_ref.ref_root; owner = generic_ref->data_ref.ino; offset = generic_ref->data_ref.offset; } @@ -791,6 +794,7 @@ int btrfs_ref_tree_mod(struct btrfs_fs_info *fs_info, dump_ref_action(fs_info, ra); kfree(ref); kfree(ra); + kfree(re); goto out_unlock; } else if (be->num_refs == 0) { btrfs_err(fs_info, @@ -800,6 +804,7 @@ int btrfs_ref_tree_mod(struct btrfs_fs_info *fs_info, dump_ref_action(fs_info, ra); kfree(ref); kfree(ra); + kfree(re); goto out_unlock; } diff --git a/fs/btrfs/reflink.c b/fs/btrfs/reflink.c index 0474bbe39da7..f88b0c2ac3fe 100644 --- a/fs/btrfs/reflink.c +++ b/fs/btrfs/reflink.c @@ -25,13 +25,11 @@ static int clone_finish_inode_update(struct btrfs_trans_handle *trans, const u64 olen, int no_time_update) { - struct btrfs_root *root = BTRFS_I(inode)->root; int ret; inode_inc_iversion(inode); if (!no_time_update) { - inode->i_mtime = current_time(inode); - inode->i_ctime = inode->i_mtime; + inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); } /* * We round up to the block size at eof when determining which @@ -44,7 +42,7 @@ static int clone_finish_inode_update(struct btrfs_trans_handle *trans, btrfs_inode_safe_disk_i_size_write(BTRFS_I(inode), 0); } - ret = btrfs_update_inode(trans, root, BTRFS_I(inode)); + ret = btrfs_update_inode(trans, BTRFS_I(inode)); if (ret) { btrfs_abort_transaction(trans, ret); btrfs_end_transaction(trans); diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 46c3c1d57266..f5d9e5f74a52 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -111,8 +111,8 @@ struct tree_block { }; /* Use rb_simple_node for search/insert */ u64 owner; struct btrfs_key key; - unsigned int level:8; - unsigned int key_ready:1; + u8 level; + bool key_ready; }; #define MAX_EXTENTS 128 @@ -122,6 +122,13 @@ struct file_extent_cluster { u64 end; u64 boundary[MAX_EXTENTS]; unsigned int nr; + u64 owning_root; +}; + +/* Stages of data relocation. */ +enum reloc_stage { + MOVE_DATA_EXTENTS, + UPDATE_DATA_PTRS }; struct reloc_control { @@ -155,16 +162,12 @@ struct reloc_control { u64 search_start; u64 extents_found; - unsigned int stage:8; - unsigned int create_reloc_tree:1; - unsigned int merge_reloc_tree:1; - unsigned int found_file_extent:1; + enum reloc_stage stage; + bool create_reloc_tree; + bool merge_reloc_tree; + bool found_file_extent; }; -/* stages of data relocation */ -#define MOVE_DATA_EXTENTS 0 -#define UPDATE_DATA_PTRS 1 - static void mark_block_processed(struct reloc_control *rc, struct btrfs_backref_node *node) { @@ -180,13 +183,6 @@ static void mark_block_processed(struct reloc_control *rc, node->processed = 1; } - -static void mapping_tree_init(struct mapping_tree *tree) -{ - tree->rb_root = RB_ROOT; - spin_lock_init(&tree->lock); -} - /* * walk up backref nodes until reach node presents tree root */ @@ -299,7 +295,7 @@ static int update_backref_cache(struct btrfs_trans_handle *trans, return 1; } -static bool reloc_root_is_dead(struct btrfs_root *root) +static bool reloc_root_is_dead(const struct btrfs_root *root) { /* * Pair with set_bit/clear_bit in clean_dirty_subvols and @@ -320,7 +316,7 @@ static bool reloc_root_is_dead(struct btrfs_root *root) * from no reloc root. But btrfs_should_ignore_reloc_root() below is a * special case. */ -static bool have_reloc_root(struct btrfs_root *root) +static bool have_reloc_root(const struct btrfs_root *root) { if (reloc_root_is_dead(root)) return false; @@ -329,31 +325,30 @@ static bool have_reloc_root(struct btrfs_root *root) return true; } -int btrfs_should_ignore_reloc_root(struct btrfs_root *root) +bool btrfs_should_ignore_reloc_root(const struct btrfs_root *root) { struct btrfs_root *reloc_root; if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) - return 0; + return false; /* This root has been merged with its reloc tree, we can ignore it */ if (reloc_root_is_dead(root)) - return 1; + return true; reloc_root = root->reloc_root; if (!reloc_root) - return 0; + return false; if (btrfs_header_generation(reloc_root->commit_root) == root->fs_info->running_transaction->transid) - return 0; + return false; /* - * if there is reloc tree and it was created in previous - * transaction backref lookup can find the reloc tree, - * so backref node for the fs tree root is useless for - * relocation. + * If there is reloc tree and it was created in previous transaction + * backref lookup can find the reloc tree, so backref node for the fs + * tree root is useless for relocation. */ - return 1; + return true; } /* @@ -466,6 +461,7 @@ static bool handle_useless_nodes(struct reloc_control *rc, * cached. */ static noinline_for_stack struct btrfs_backref_node *build_backref_tree( + struct btrfs_trans_handle *trans, struct reloc_control *rc, struct btrfs_key *node_key, int level, u64 bytenr) { @@ -499,8 +495,8 @@ static noinline_for_stack struct btrfs_backref_node *build_backref_tree( /* Breadth-first search to build backref cache */ do { - ret = btrfs_backref_add_tree_node(cache, path, iter, node_key, - cur); + ret = btrfs_backref_add_tree_node(trans, cache, path, iter, + node_key, cur); if (ret < 0) { err = ret; goto out; @@ -546,7 +542,7 @@ out: */ static int clone_backref_node(struct btrfs_trans_handle *trans, struct reloc_control *rc, - struct btrfs_root *src, + const struct btrfs_root *src, struct btrfs_root *dest) { struct btrfs_root *reloc_root = src->reloc_root; @@ -631,7 +627,7 @@ fail: /* * helper to add 'address of tree root -> reloc tree' mapping */ -static int __must_check __add_reloc_root(struct btrfs_root *root) +static int __add_reloc_root(struct btrfs_root *root) { struct btrfs_fs_info *fs_info = root->fs_info; struct rb_node *rb_node; @@ -1158,7 +1154,7 @@ int replace_file_extents(struct btrfs_trans_handle *trans, key.offset -= btrfs_file_extent_offset(leaf, fi); btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, new_bytenr, - num_bytes, parent); + num_bytes, parent, root->root_key.objectid); btrfs_init_data_ref(&ref, btrfs_header_owner(leaf), key.objectid, key.offset, root->root_key.objectid, false); @@ -1169,7 +1165,7 @@ int replace_file_extents(struct btrfs_trans_handle *trans, } btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, bytenr, - num_bytes, parent); + num_bytes, parent, root->root_key.objectid); btrfs_init_data_ref(&ref, btrfs_header_owner(leaf), key.objectid, key.offset, root->root_key.objectid, false); @@ -1180,15 +1176,15 @@ int replace_file_extents(struct btrfs_trans_handle *trans, } } if (dirty) - btrfs_mark_buffer_dirty(leaf); + btrfs_mark_buffer_dirty(trans, leaf); if (inode) btrfs_add_delayed_iput(BTRFS_I(inode)); return ret; } -static noinline_for_stack -int memcmp_node_keys(struct extent_buffer *eb, int slot, - struct btrfs_path *path, int level) +static noinline_for_stack int memcmp_node_keys(const struct extent_buffer *eb, + int slot, const struct btrfs_path *path, + int level) { struct btrfs_disk_key key1; struct btrfs_disk_key key2; @@ -1373,16 +1369,17 @@ again: */ btrfs_set_node_blockptr(parent, slot, new_bytenr); btrfs_set_node_ptr_generation(parent, slot, new_ptr_gen); - btrfs_mark_buffer_dirty(parent); + btrfs_mark_buffer_dirty(trans, parent); btrfs_set_node_blockptr(path->nodes[level], path->slots[level], old_bytenr); btrfs_set_node_ptr_generation(path->nodes[level], path->slots[level], old_ptr_gen); - btrfs_mark_buffer_dirty(path->nodes[level]); + btrfs_mark_buffer_dirty(trans, path->nodes[level]); btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, old_bytenr, - blocksize, path->nodes[level]->start); + blocksize, path->nodes[level]->start, + src->root_key.objectid); btrfs_init_tree_ref(&ref, level - 1, src->root_key.objectid, 0, true); ret = btrfs_inc_extent_ref(trans, &ref); @@ -1391,7 +1388,7 @@ again: break; } btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, new_bytenr, - blocksize, 0); + blocksize, 0, dest->root_key.objectid); btrfs_init_tree_ref(&ref, level - 1, dest->root_key.objectid, 0, true); ret = btrfs_inc_extent_ref(trans, &ref); @@ -1400,8 +1397,9 @@ again: break; } + /* We don't know the real owning_root, use 0. */ btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, new_bytenr, - blocksize, path->nodes[level]->start); + blocksize, path->nodes[level]->start, 0); btrfs_init_tree_ref(&ref, level - 1, src->root_key.objectid, 0, true); ret = btrfs_free_extent(trans, &ref); @@ -1410,8 +1408,9 @@ again: break; } + /* We don't know the real owning_root, use 0. */ btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, old_bytenr, - blocksize, 0); + blocksize, 0, 0); btrfs_init_tree_ref(&ref, level - 1, dest->root_key.objectid, 0, true); ret = btrfs_free_extent(trans, &ref); @@ -1517,8 +1516,8 @@ int walk_down_reloc_tree(struct btrfs_root *root, struct btrfs_path *path, * [min_key, max_key) */ static int invalidate_extent_cache(struct btrfs_root *root, - struct btrfs_key *min_key, - struct btrfs_key *max_key) + const struct btrfs_key *min_key, + const struct btrfs_key *max_key) { struct btrfs_fs_info *fs_info = root->fs_info; struct inode *inode = NULL; @@ -1896,7 +1895,7 @@ again: } } - rc->merge_reloc_tree = 1; + rc->merge_reloc_tree = true; while (!list_empty(&rc->reloc_roots)) { reloc_root = list_entry(rc->reloc_roots.next, @@ -2516,11 +2515,12 @@ static int do_relocation(struct btrfs_trans_handle *trans, node->eb->start); btrfs_set_node_ptr_generation(upper->eb, slot, trans->transid); - btrfs_mark_buffer_dirty(upper->eb); + btrfs_mark_buffer_dirty(trans, upper->eb); btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, node->eb->start, blocksize, - upper->eb->start); + upper->eb->start, + btrfs_header_owner(upper->eb)); btrfs_init_tree_ref(&ref, node->level, btrfs_header_owner(upper->eb), root->root_key.objectid, false); @@ -2632,7 +2632,7 @@ static int tree_block_processed(u64 bytenr, struct reloc_control *rc) u32 blocksize = rc->extent_root->fs_info->nodesize; if (test_range_bit(&rc->processed_blocks, bytenr, - bytenr + blocksize - 1, EXTENT_DIRTY, 1, NULL)) + bytenr + blocksize - 1, EXTENT_DIRTY, NULL)) return 1; return 0; } @@ -2659,7 +2659,7 @@ static int get_tree_block_key(struct btrfs_fs_info *fs_info, else btrfs_node_key_to_cpu(eb, &block->key, 0); free_extent_buffer(eb); - block->key_ready = 1; + block->key_ready = true; return 0; } @@ -2803,7 +2803,7 @@ int relocate_tree_blocks(struct btrfs_trans_handle *trans, /* Do tree relocation */ rbtree_postorder_for_each_entry_safe(block, next, blocks, rb_node) { - node = build_backref_tree(rc, &block->key, + node = build_backref_tree(trans, rc, &block->key, block->level, block->bytenr); if (IS_ERR(node)) { err = PTR_ERR(node); @@ -2829,7 +2829,7 @@ out_free_blocks: static noinline_for_stack int prealloc_file_extent_cluster( struct btrfs_inode *inode, - struct file_extent_cluster *cluster) + const struct file_extent_cluster *cluster) { u64 alloc_hint = 0; u64 start; @@ -2964,7 +2964,7 @@ static noinline_for_stack int setup_relocation_extent_mapping(struct inode *inod /* * Allow error injection to test balance/relocation cancellation */ -noinline int btrfs_should_cancel_balance(struct btrfs_fs_info *fs_info) +noinline int btrfs_should_cancel_balance(const struct btrfs_fs_info *fs_info) { return atomic_read(&fs_info->balance_cancel_req) || atomic_read(&fs_info->reloc_cancel_req) || @@ -2972,7 +2972,7 @@ noinline int btrfs_should_cancel_balance(struct btrfs_fs_info *fs_info) } ALLOW_ERROR_INJECTION(btrfs_should_cancel_balance, TRUE); -static u64 get_cluster_boundary_end(struct file_extent_cluster *cluster, +static u64 get_cluster_boundary_end(const struct file_extent_cluster *cluster, int cluster_nr) { /* Last extent, use cluster end directly */ @@ -2984,7 +2984,7 @@ static u64 get_cluster_boundary_end(struct file_extent_cluster *cluster, } static int relocate_one_page(struct inode *inode, struct file_ra_state *ra, - struct file_extent_cluster *cluster, + const struct file_extent_cluster *cluster, int *cluster_nr, unsigned long page_index) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); @@ -3006,9 +3006,6 @@ static int relocate_one_page(struct inode *inode, struct file_ra_state *ra, if (!page) return -ENOMEM; } - ret = set_page_extent_mapped(page); - if (ret < 0) - goto release_page; if (PageReadahead(page)) page_cache_async_readahead(inode->i_mapping, ra, NULL, @@ -3024,6 +3021,15 @@ static int relocate_one_page(struct inode *inode, struct file_ra_state *ra, } } + /* + * We could have lost page private when we dropped the lock to read the + * page above, make sure we set_page_extent_mapped here so we have any + * of the subpage blocksize stuff we need in place. + */ + ret = set_page_extent_mapped(page); + if (ret < 0) + goto release_page; + page_start = page_offset(page); page_end = page_start + PAGE_SIZE - 1; @@ -3113,7 +3119,7 @@ release_page: } static int relocate_file_extent_cluster(struct inode *inode, - struct file_extent_cluster *cluster) + const struct file_extent_cluster *cluster) { u64 offset = BTRFS_I(inode)->index_cnt; unsigned long index; @@ -3151,11 +3157,12 @@ out: return ret; } -static noinline_for_stack -int relocate_data_extent(struct inode *inode, struct btrfs_key *extent_key, - struct file_extent_cluster *cluster) +static noinline_for_stack int relocate_data_extent(struct inode *inode, + const struct btrfs_key *extent_key, + struct file_extent_cluster *cluster) { int ret; + struct btrfs_root *root = BTRFS_I(inode)->root; if (cluster->nr > 0 && extent_key->objectid != cluster->end + 1) { ret = relocate_file_extent_cluster(inode, cluster); @@ -3164,8 +3171,38 @@ int relocate_data_extent(struct inode *inode, struct btrfs_key *extent_key, cluster->nr = 0; } - if (!cluster->nr) + /* + * Under simple quotas, we set root->relocation_src_root when we find + * the extent. If adjacent extents have different owners, we can't merge + * them while relocating. Handle this by storing the owning root that + * started a cluster and if we see an extent from a different root break + * cluster formation (just like the above case of non-adjacent extents). + * + * Without simple quotas, relocation_src_root is always 0, so we should + * never see a mismatch, and it should have no effect on relocation + * clusters. + */ + if (cluster->nr > 0 && cluster->owning_root != root->relocation_src_root) { + u64 tmp = root->relocation_src_root; + + /* + * root->relocation_src_root is the state that actually affects + * the preallocation we do here, so set it to the root owning + * the cluster we need to relocate. + */ + root->relocation_src_root = cluster->owning_root; + ret = relocate_file_extent_cluster(inode, cluster); + if (ret) + return ret; + cluster->nr = 0; + /* And reset it back for the current extent's owning root. */ + root->relocation_src_root = tmp; + } + + if (!cluster->nr) { cluster->start = extent_key->objectid; + cluster->owning_root = root->relocation_src_root; + } else BUG_ON(cluster->nr >= MAX_EXTENTS); cluster->end = extent_key->objectid + extent_key->offset - 1; @@ -3186,7 +3223,7 @@ int relocate_data_extent(struct inode *inode, struct btrfs_key *extent_key, * the major work is getting the generation and level of the block */ static int add_tree_block(struct reloc_control *rc, - struct btrfs_key *extent_key, + const struct btrfs_key *extent_key, struct btrfs_path *path, struct rb_root *blocks) { @@ -3250,12 +3287,13 @@ static int add_tree_block(struct reloc_control *rc, if (type == BTRFS_TREE_BLOCK_REF_KEY) owner = btrfs_extent_inline_ref_offset(eb, iref); } - } else if (unlikely(item_size == sizeof(struct btrfs_extent_item_v0))) { - btrfs_print_v0_err(eb->fs_info); - btrfs_handle_fs_error(eb->fs_info, -EINVAL, NULL); - return -EINVAL; } else { - BUG(); + btrfs_print_leaf(eb); + btrfs_err(rc->block_group->fs_info, + "unrecognized tree backref at tree block %llu slot %u", + eb->start, path->slots[0]); + btrfs_release_path(path); + return -EUCLEAN; } btrfs_release_path(path); @@ -3270,7 +3308,7 @@ static int add_tree_block(struct reloc_control *rc, block->key.objectid = rc->extent_root->fs_info->nodesize; block->key.offset = generation; block->level = level; - block->key_ready = 0; + block->key_ready = false; block->owner = owner; rb_node = rb_simple_insert(blocks, block->bytenr, &block->rb_node); @@ -3436,11 +3474,10 @@ static int delete_v1_space_cache(struct extent_buffer *leaf, /* * helper to find all tree blocks that reference a given data extent */ -static noinline_for_stack -int add_data_references(struct reloc_control *rc, - struct btrfs_key *extent_key, - struct btrfs_path *path, - struct rb_root *blocks) +static noinline_for_stack int add_data_references(struct reloc_control *rc, + const struct btrfs_key *extent_key, + struct btrfs_path *path, + struct rb_root *blocks) { struct btrfs_backref_walk_ctx ctx = { 0 }; struct ulist_iterator leaf_uiter; @@ -3498,6 +3535,8 @@ int find_next_extent(struct reloc_control *rc, struct btrfs_path *path, last = rc->block_group->start + rc->block_group->length; while (1) { + bool block_found; + cond_resched(); if (rc->search_start >= last) { ret = 1; @@ -3548,11 +3587,11 @@ next: goto next; } - ret = find_first_extent_bit(&rc->processed_blocks, - key.objectid, &start, &end, - EXTENT_DIRTY, NULL); + block_found = find_first_extent_bit(&rc->processed_blocks, + key.objectid, &start, &end, + EXTENT_DIRTY, NULL); - if (ret == 0 && start <= key.objectid) { + if (block_found && start <= key.objectid) { btrfs_release_path(path); rc->search_start = end + 1; } else { @@ -3612,7 +3651,7 @@ int prepare_to_relocate(struct reloc_control *rc) if (ret) return ret; - rc->create_reloc_tree = 1; + rc->create_reloc_tree = true; set_reloc_control(rc); trans = btrfs_join_transaction(rc->extent_root); @@ -3692,6 +3731,21 @@ restart: struct btrfs_extent_item); flags = btrfs_extent_flags(path->nodes[0], ei); + /* + * If we are relocating a simple quota owned extent item, we + * need to note the owner on the reloc data root so that when + * we allocate the replacement item, we can attribute it to the + * correct eventual owner (rather than the reloc data root). + */ + if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_SIMPLE) { + struct btrfs_root *root = BTRFS_I(rc->data_inode)->root; + u64 owning_root_id = btrfs_get_extent_owner_root(fs_info, + path->nodes[0], + path->slots[0]); + + root->relocation_src_root = owning_root_id; + } + if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { ret = add_tree_block(rc, &key, path, &blocks); } else if (rc->stage == UPDATE_DATA_PTRS && @@ -3724,7 +3778,7 @@ restart: if (rc->stage == MOVE_DATA_EXTENTS && (flags & BTRFS_EXTENT_FLAG_DATA)) { - rc->found_file_extent = 1; + rc->found_file_extent = true; ret = relocate_data_extent(rc->data_inode, &key, &rc->cluster); if (ret < 0) { @@ -3761,7 +3815,7 @@ restart: err = ret; } - rc->create_reloc_tree = 0; + rc->create_reloc_tree = false; set_reloc_control(rc); btrfs_backref_release_cache(&rc->backref_cache); @@ -3779,7 +3833,7 @@ restart: merge_reloc_roots(rc); - rc->merge_reloc_tree = 0; + rc->merge_reloc_tree = false; unset_reloc_control(rc); btrfs_block_rsv_release(fs_info, rc->block_rsv, (u64)-1, NULL); @@ -3825,7 +3879,7 @@ static int __insert_orphan_inode(struct btrfs_trans_handle *trans, btrfs_set_inode_mode(leaf, item, S_IFREG | 0600); btrfs_set_inode_flags(leaf, item, BTRFS_INODE_NOCOMPRESS | BTRFS_INODE_PREALLOC); - btrfs_mark_buffer_dirty(leaf); + btrfs_mark_buffer_dirty(trans, leaf); out: btrfs_free_path(path); return ret; @@ -3864,9 +3918,9 @@ out: * helper to create inode for data relocation. * the inode is in data relocation tree and its link count is 0 */ -static noinline_for_stack -struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info, - struct btrfs_block_group *group) +static noinline_for_stack struct inode *create_reloc_inode( + struct btrfs_fs_info *fs_info, + const struct btrfs_block_group *group) { struct inode *inode = NULL; struct btrfs_trans_handle *trans; @@ -3961,8 +4015,9 @@ static struct reloc_control *alloc_reloc_control(struct btrfs_fs_info *fs_info) INIT_LIST_HEAD(&rc->reloc_roots); INIT_LIST_HEAD(&rc->dirty_subvol_roots); - btrfs_backref_init_cache(fs_info, &rc->backref_cache, 1); - mapping_tree_init(&rc->reloc_root_tree); + btrfs_backref_init_cache(fs_info, &rc->backref_cache, true); + rc->reloc_root_tree.rb_root = RB_ROOT; + spin_lock_init(&rc->reloc_root_tree.lock); extent_io_tree_init(fs_info, &rc->processed_blocks, IO_TREE_RELOC_BLOCKS); return rc; } @@ -3994,7 +4049,7 @@ static void describe_relocation(struct btrfs_fs_info *fs_info, block_group->start, buf); } -static const char *stage_to_string(int stage) +static const char *stage_to_string(enum reloc_stage stage) { if (stage == MOVE_DATA_EXTENTS) return "move data extents"; @@ -4110,7 +4165,7 @@ int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start) WARN_ON(ret && ret != -EAGAIN); while (1) { - int finishes_stage; + enum reloc_stage finishes_stage; mutex_lock(&fs_info->cleaner_mutex); ret = relocate_block_group(rc); @@ -4293,7 +4348,7 @@ int btrfs_recover_relocation(struct btrfs_fs_info *fs_info) goto out_unset; } - rc->merge_reloc_tree = 1; + rc->merge_reloc_tree = true; while (!list_empty(&reloc_roots)) { reloc_root = list_entry(reloc_roots.next, @@ -4412,7 +4467,8 @@ int btrfs_reloc_clone_csums(struct btrfs_ordered_extent *ordered) } int btrfs_reloc_cow_block(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct extent_buffer *buf, + struct btrfs_root *root, + const struct extent_buffer *buf, struct extent_buffer *cow) { struct btrfs_fs_info *fs_info = root->fs_info; @@ -4551,7 +4607,7 @@ int btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans, * * Return U64_MAX if no running relocation. */ -u64 btrfs_get_reloc_bg_bytenr(struct btrfs_fs_info *fs_info) +u64 btrfs_get_reloc_bg_bytenr(const struct btrfs_fs_info *fs_info) { u64 logical = U64_MAX; diff --git a/fs/btrfs/relocation.h b/fs/btrfs/relocation.h index 77d69f6ae967..5fb60f2deb53 100644 --- a/fs/btrfs/relocation.h +++ b/fs/btrfs/relocation.h @@ -10,15 +10,16 @@ int btrfs_update_reloc_root(struct btrfs_trans_handle *trans, int btrfs_recover_relocation(struct btrfs_fs_info *fs_info); int btrfs_reloc_clone_csums(struct btrfs_ordered_extent *ordered); int btrfs_reloc_cow_block(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct extent_buffer *buf, + struct btrfs_root *root, + const struct extent_buffer *buf, struct extent_buffer *cow); void btrfs_reloc_pre_snapshot(struct btrfs_pending_snapshot *pending, u64 *bytes_to_reserve); int btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans, struct btrfs_pending_snapshot *pending); -int btrfs_should_cancel_balance(struct btrfs_fs_info *fs_info); +int btrfs_should_cancel_balance(const struct btrfs_fs_info *fs_info); struct btrfs_root *find_reloc_root(struct btrfs_fs_info *fs_info, u64 bytenr); -int btrfs_should_ignore_reloc_root(struct btrfs_root *root); -u64 btrfs_get_reloc_bg_bytenr(struct btrfs_fs_info *fs_info); +bool btrfs_should_ignore_reloc_root(const struct btrfs_root *root); +u64 btrfs_get_reloc_bg_bytenr(const struct btrfs_fs_info *fs_info); #endif diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c index 859874579456..603ad1459368 100644 --- a/fs/btrfs/root-tree.c +++ b/fs/btrfs/root-tree.c @@ -51,7 +51,8 @@ static void btrfs_read_root_item(struct extent_buffer *eb, int slot, } /* - * btrfs_find_root - lookup the root by the key. + * Lookup the root by the key. + * * root: the root of the root tree * search_key: the key to search * path: the path we search @@ -191,7 +192,7 @@ int btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root btrfs_set_root_generation_v2(item, btrfs_root_generation(item)); write_extent_buffer(l, item, ptr, sizeof(*item)); - btrfs_mark_buffer_dirty(path->nodes[0]); + btrfs_mark_buffer_dirty(trans, path->nodes[0]); out: btrfs_free_path(path); return ret; @@ -438,7 +439,7 @@ again: btrfs_set_root_ref_name_len(leaf, ref, name->len); ptr = (unsigned long)(ref + 1); write_extent_buffer(leaf, name->name, ptr, name->len); - btrfs_mark_buffer_dirty(leaf); + btrfs_mark_buffer_dirty(trans, leaf); if (key.type == BTRFS_ROOT_BACKREF_KEY) { btrfs_release_path(path); @@ -485,7 +486,8 @@ void btrfs_update_root_times(struct btrfs_trans_handle *trans, } /* - * btrfs_subvolume_reserve_metadata() - reserve space for subvolume operation + * Reserve space for subvolume operation. + * * root: the root of the parent directory * rsv: block reservation * items: the number of items that we need do reservation @@ -508,7 +510,7 @@ int btrfs_subvolume_reserve_metadata(struct btrfs_root *root, struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; - if (test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) { + if (btrfs_qgroup_enabled(fs_info)) { /* One for parent inode, two for dir entries */ qgroup_num_bytes = 3 * fs_info->nodesize; ret = btrfs_qgroup_reserve_meta_prealloc(root, diff --git a/fs/btrfs/root-tree.h b/fs/btrfs/root-tree.h index cbbaca32126e..8b2c3859e464 100644 --- a/fs/btrfs/root-tree.h +++ b/fs/btrfs/root-tree.h @@ -3,6 +3,8 @@ #ifndef BTRFS_ROOT_TREE_H #define BTRFS_ROOT_TREE_H +struct fscrypt_str; + int btrfs_subvolume_reserve_metadata(struct btrfs_root *root, struct btrfs_block_rsv *rsv, int nitems, bool use_global_rsv); @@ -18,10 +20,8 @@ int btrfs_del_root(struct btrfs_trans_handle *trans, const struct btrfs_key *key int btrfs_insert_root(struct btrfs_trans_handle *trans, struct btrfs_root *root, const struct btrfs_key *key, struct btrfs_root_item *item); -int __must_check btrfs_update_root(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_key *key, - struct btrfs_root_item *item); +int btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root *root, + struct btrfs_key *key, struct btrfs_root_item *item); int btrfs_find_root(struct btrfs_root *root, const struct btrfs_key *search_key, struct btrfs_path *path, struct btrfs_root_item *root_item, struct btrfs_key *root_key); diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 7289f5bff397..f62a408671cb 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -16,7 +16,6 @@ #include "backref.h" #include "extent_io.h" #include "dev-replace.h" -#include "check-integrity.h" #include "raid56.h" #include "block-group.h" #include "zoned.h" @@ -24,6 +23,7 @@ #include "accessors.h" #include "file-item.h" #include "scrub.h" +#include "raid-stripe-tree.h" /* * This is only the first step towards a full-features scrub. It reads all @@ -43,9 +43,20 @@ struct scrub_ctx; /* * The following value only influences the performance. * - * This determines the batch size for stripe submitted in one go. + * This detemines how many stripes would be submitted in one go, + * which is 512KiB (BTRFS_STRIPE_LEN * SCRUB_STRIPES_PER_GROUP). */ -#define SCRUB_STRIPES_PER_SCTX 8 /* That would be 8 64K stripe per-device. */ +#define SCRUB_STRIPES_PER_GROUP 8 + +/* + * How many groups we have for each sctx. + * + * This would be 8M per device, the same value as the old scrub in-flight bios + * size limit. + */ +#define SCRUB_GROUPS_PER_SCTX 16 + +#define SCRUB_TOTAL_STRIPES (SCRUB_GROUPS_PER_SCTX * SCRUB_STRIPES_PER_GROUP) /* * The following value times PAGE_SIZE needs to be large enough to match the @@ -172,9 +183,11 @@ struct scrub_stripe { }; struct scrub_ctx { - struct scrub_stripe stripes[SCRUB_STRIPES_PER_SCTX]; + struct scrub_stripe stripes[SCRUB_TOTAL_STRIPES]; struct scrub_stripe *raid56_data_stripes; struct btrfs_fs_info *fs_info; + struct btrfs_path extent_path; + struct btrfs_path csum_path; int first_free; int cur_stripe; atomic_t cancel_req; @@ -315,10 +328,10 @@ static noinline_for_stack void scrub_free_ctx(struct scrub_ctx *sctx) if (!sctx) return; - for (i = 0; i < SCRUB_STRIPES_PER_SCTX; i++) + for (i = 0; i < SCRUB_TOTAL_STRIPES; i++) release_scrub_stripe(&sctx->stripes[i]); - kfree(sctx); + kvfree(sctx); } static void scrub_put_ctx(struct scrub_ctx *sctx) @@ -333,13 +346,20 @@ static noinline_for_stack struct scrub_ctx *scrub_setup_ctx( struct scrub_ctx *sctx; int i; - sctx = kzalloc(sizeof(*sctx), GFP_KERNEL); + /* Since sctx has inline 128 stripes, it can go beyond 64K easily. Use + * kvzalloc(). + */ + sctx = kvzalloc(sizeof(*sctx), GFP_KERNEL); if (!sctx) goto nomem; refcount_set(&sctx->refs, 1); sctx->is_dev_replace = is_dev_replace; sctx->fs_info = fs_info; - for (i = 0; i < SCRUB_STRIPES_PER_SCTX; i++) { + sctx->extent_path.search_commit_root = 1; + sctx->extent_path.skip_locking = 1; + sctx->csum_path.search_commit_root = 1; + sctx->csum_path.skip_locking = 1; + for (i = 0; i < SCRUB_TOTAL_STRIPES; i++) { int ret; ret = init_scrub_stripe(fs_info, &sctx->stripes[i]); @@ -877,7 +897,7 @@ static void scrub_stripe_report_errors(struct scrub_ctx *sctx, ASSERT(stripe->mirror_num >= 1); ret = btrfs_map_block(fs_info, BTRFS_MAP_GET_READ_MIRRORS, stripe->logical, &mapped_len, &bioc, - NULL, NULL, 1); + NULL, NULL); /* * If we failed, dev will be NULL, and later detailed reports * will just be skipped. @@ -970,6 +990,9 @@ skip: spin_unlock(&sctx->stat_lock); } +static void scrub_write_sectors(struct scrub_ctx *sctx, struct scrub_stripe *stripe, + unsigned long write_bitmap, bool dev_replace); + /* * The main entrance for all read related scrub work, including: * @@ -978,13 +1001,16 @@ skip: * - Go through the remaining mirrors and try to read as large blocksize as * possible * - Go through all mirrors (including the failed mirror) sector-by-sector + * - Submit writeback for repaired sectors * - * Writeback does not happen here, it needs extra synchronization. + * Writeback for dev-replace does not happen here, it needs extra + * synchronization for zoned devices. */ static void scrub_stripe_read_repair_worker(struct work_struct *work) { struct scrub_stripe *stripe = container_of(work, struct scrub_stripe, work); - struct btrfs_fs_info *fs_info = stripe->bg->fs_info; + struct scrub_ctx *sctx = stripe->sctx; + struct btrfs_fs_info *fs_info = sctx->fs_info; int num_copies = btrfs_num_copies(fs_info, stripe->bg->start, stripe->bg->length); int mirror; @@ -1049,7 +1075,23 @@ static void scrub_stripe_read_repair_worker(struct work_struct *work) goto out; } out: - scrub_stripe_report_errors(stripe->sctx, stripe); + /* + * Submit the repaired sectors. For zoned case, we cannot do repair + * in-place, but queue the bg to be relocated. + */ + if (btrfs_is_zoned(fs_info)) { + if (!bitmap_empty(&stripe->error_bitmap, stripe->nr_sectors)) + btrfs_repair_one_zone(fs_info, sctx->stripes[0].bg->start); + } else if (!sctx->readonly) { + unsigned long repaired; + + bitmap_andnot(&repaired, &stripe->init_error_bitmap, + &stripe->error_bitmap, stripe->nr_sectors); + scrub_write_sectors(sctx, stripe, repaired, false); + wait_scrub_stripe_io(stripe); + } + + scrub_stripe_report_errors(sctx, stripe); set_bit(SCRUB_STRIPE_FLAG_REPAIR_DONE, &stripe->state); wake_up(&stripe->repair_wait); } @@ -1262,7 +1304,6 @@ static int get_raid56_logic_offset(u64 physical, int num, /* Work out the disk rotation on this stripe-set */ rot = stripe_nr % map->num_stripes; - stripe_nr /= map->num_stripes; /* calculate which stripe this data locates */ rot += i; stripe_index = rot % map->num_stripes; @@ -1468,6 +1509,8 @@ static void scrub_stripe_reset_bitmaps(struct scrub_stripe *stripe) * Return <0 for error. */ static int scrub_find_fill_first_stripe(struct btrfs_block_group *bg, + struct btrfs_path *extent_path, + struct btrfs_path *csum_path, struct btrfs_device *dev, u64 physical, int mirror_num, u64 logical_start, u32 logical_len, @@ -1477,7 +1520,6 @@ static int scrub_find_fill_first_stripe(struct btrfs_block_group *bg, struct btrfs_root *extent_root = btrfs_extent_root(fs_info, bg->start); struct btrfs_root *csum_root = btrfs_csum_root(fs_info, bg->start); const u64 logical_end = logical_start + logical_len; - struct btrfs_path path = { 0 }; u64 cur_logical = logical_start; u64 stripe_end; u64 extent_start; @@ -1493,14 +1535,13 @@ static int scrub_find_fill_first_stripe(struct btrfs_block_group *bg, /* The range must be inside the bg. */ ASSERT(logical_start >= bg->start && logical_end <= bg->start + bg->length); - path.search_commit_root = 1; - path.skip_locking = 1; - - ret = find_first_extent_item(extent_root, &path, logical_start, logical_len); + ret = find_first_extent_item(extent_root, extent_path, logical_start, + logical_len); /* Either error or not found. */ if (ret) goto out; - get_extent_info(&path, &extent_start, &extent_len, &extent_flags, &extent_gen); + get_extent_info(extent_path, &extent_start, &extent_len, &extent_flags, + &extent_gen); if (extent_flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) stripe->nr_meta_extents++; if (extent_flags & BTRFS_EXTENT_FLAG_DATA) @@ -1528,7 +1569,7 @@ static int scrub_find_fill_first_stripe(struct btrfs_block_group *bg, /* Fill the extent info for the remaining sectors. */ while (cur_logical <= stripe_end) { - ret = find_first_extent_item(extent_root, &path, cur_logical, + ret = find_first_extent_item(extent_root, extent_path, cur_logical, stripe_end - cur_logical + 1); if (ret < 0) goto out; @@ -1536,7 +1577,7 @@ static int scrub_find_fill_first_stripe(struct btrfs_block_group *bg, ret = 0; break; } - get_extent_info(&path, &extent_start, &extent_len, + get_extent_info(extent_path, &extent_start, &extent_len, &extent_flags, &extent_gen); if (extent_flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) stripe->nr_meta_extents++; @@ -1561,9 +1602,9 @@ static int scrub_find_fill_first_stripe(struct btrfs_block_group *bg, */ ASSERT(BITS_PER_LONG >= BTRFS_STRIPE_LEN >> fs_info->sectorsize_bits); - ret = btrfs_lookup_csums_bitmap(csum_root, stripe->logical, - stripe_end, stripe->csums, - &csum_bitmap, true); + ret = btrfs_lookup_csums_bitmap(csum_root, csum_path, + stripe->logical, stripe_end, + stripe->csums, &csum_bitmap); if (ret < 0) goto out; if (ret > 0) @@ -1576,7 +1617,6 @@ static int scrub_find_fill_first_stripe(struct btrfs_block_group *bg, } set_bit(SCRUB_STRIPE_FLAG_INITIALIZED, &stripe->state); out: - btrfs_release_path(&path); return ret; } @@ -1595,6 +1635,71 @@ static void scrub_reset_stripe(struct scrub_stripe *stripe) } } +static void scrub_submit_extent_sector_read(struct scrub_ctx *sctx, + struct scrub_stripe *stripe) +{ + struct btrfs_fs_info *fs_info = stripe->bg->fs_info; + struct btrfs_bio *bbio = NULL; + u64 stripe_len = BTRFS_STRIPE_LEN; + int mirror = stripe->mirror_num; + int i; + + atomic_inc(&stripe->pending_io); + + for_each_set_bit(i, &stripe->extent_sector_bitmap, stripe->nr_sectors) { + struct page *page = scrub_stripe_get_page(stripe, i); + unsigned int pgoff = scrub_stripe_get_page_offset(stripe, i); + + /* The current sector cannot be merged, submit the bio. */ + if (bbio && + ((i > 0 && + !test_bit(i - 1, &stripe->extent_sector_bitmap)) || + bbio->bio.bi_iter.bi_size >= stripe_len)) { + ASSERT(bbio->bio.bi_iter.bi_size); + atomic_inc(&stripe->pending_io); + btrfs_submit_bio(bbio, mirror); + bbio = NULL; + } + + if (!bbio) { + struct btrfs_io_stripe io_stripe = {}; + struct btrfs_io_context *bioc = NULL; + const u64 logical = stripe->logical + + (i << fs_info->sectorsize_bits); + int err; + + bbio = btrfs_bio_alloc(stripe->nr_sectors, REQ_OP_READ, + fs_info, scrub_read_endio, stripe); + bbio->bio.bi_iter.bi_sector = logical >> SECTOR_SHIFT; + + io_stripe.is_scrub = true; + err = btrfs_map_block(fs_info, BTRFS_MAP_READ, logical, + &stripe_len, &bioc, &io_stripe, + &mirror); + btrfs_put_bioc(bioc); + if (err) { + btrfs_bio_end_io(bbio, + errno_to_blk_status(err)); + return; + } + } + + __bio_add_page(&bbio->bio, page, fs_info->sectorsize, pgoff); + } + + if (bbio) { + ASSERT(bbio->bio.bi_iter.bi_size); + atomic_inc(&stripe->pending_io); + btrfs_submit_bio(bbio, mirror); + } + + if (atomic_dec_and_test(&stripe->pending_io)) { + wake_up(&stripe->io_wait); + INIT_WORK(&stripe->work, scrub_stripe_read_repair_worker); + queue_work(stripe->bg->fs_info->scrub_workers, &stripe->work); + } +} + static void scrub_submit_initial_read(struct scrub_ctx *sctx, struct scrub_stripe *stripe) { @@ -1606,6 +1711,11 @@ static void scrub_submit_initial_read(struct scrub_ctx *sctx, ASSERT(stripe->mirror_num > 0); ASSERT(test_bit(SCRUB_STRIPE_FLAG_INITIALIZED, &stripe->state)); + if (btrfs_need_stripe_tree_update(fs_info, stripe->bg->flags)) { + scrub_submit_extent_sector_read(sctx, stripe); + return; + } + bbio = btrfs_bio_alloc(SCRUB_STRIPE_PAGES, REQ_OP_READ, fs_info, scrub_read_endio, stripe); @@ -1654,6 +1764,28 @@ static bool stripe_has_metadata_error(struct scrub_stripe *stripe) return false; } +static void submit_initial_group_read(struct scrub_ctx *sctx, + unsigned int first_slot, + unsigned int nr_stripes) +{ + struct blk_plug plug; + + ASSERT(first_slot < SCRUB_TOTAL_STRIPES); + ASSERT(first_slot + nr_stripes <= SCRUB_TOTAL_STRIPES); + + scrub_throttle_dev_io(sctx, sctx->stripes[0].dev, + btrfs_stripe_nr_to_offset(nr_stripes)); + blk_start_plug(&plug); + for (int i = 0; i < nr_stripes; i++) { + struct scrub_stripe *stripe = &sctx->stripes[first_slot + i]; + + /* Those stripes should be initialized. */ + ASSERT(test_bit(SCRUB_STRIPE_FLAG_INITIALIZED, &stripe->state)); + scrub_submit_initial_read(sctx, stripe); + } + blk_finish_plug(&plug); +} + static int flush_scrub_stripes(struct scrub_ctx *sctx) { struct btrfs_fs_info *fs_info = sctx->fs_info; @@ -1666,11 +1798,11 @@ static int flush_scrub_stripes(struct scrub_ctx *sctx) ASSERT(test_bit(SCRUB_STRIPE_FLAG_INITIALIZED, &sctx->stripes[0].state)); - scrub_throttle_dev_io(sctx, sctx->stripes[0].dev, - btrfs_stripe_nr_to_offset(nr_stripes)); - for (int i = 0; i < nr_stripes; i++) { - stripe = &sctx->stripes[i]; - scrub_submit_initial_read(sctx, stripe); + /* Submit the stripes which are populated but not submitted. */ + if (nr_stripes % SCRUB_STRIPES_PER_GROUP) { + const int first_slot = round_down(nr_stripes, SCRUB_STRIPES_PER_GROUP); + + submit_initial_group_read(sctx, first_slot, nr_stripes - first_slot); } for (int i = 0; i < nr_stripes; i++) { @@ -1680,32 +1812,6 @@ static int flush_scrub_stripes(struct scrub_ctx *sctx) test_bit(SCRUB_STRIPE_FLAG_REPAIR_DONE, &stripe->state)); } - /* - * Submit the repaired sectors. For zoned case, we cannot do repair - * in-place, but queue the bg to be relocated. - */ - if (btrfs_is_zoned(fs_info)) { - for (int i = 0; i < nr_stripes; i++) { - stripe = &sctx->stripes[i]; - - if (!bitmap_empty(&stripe->error_bitmap, stripe->nr_sectors)) { - btrfs_repair_one_zone(fs_info, - sctx->stripes[0].bg->start); - break; - } - } - } else if (!sctx->readonly) { - for (int i = 0; i < nr_stripes; i++) { - unsigned long repaired; - - stripe = &sctx->stripes[i]; - - bitmap_andnot(&repaired, &stripe->init_error_bitmap, - &stripe->error_bitmap, stripe->nr_sectors); - scrub_write_sectors(sctx, stripe, repaired, false); - } - } - /* Submit for dev-replace. */ if (sctx->is_dev_replace) { /* @@ -1750,28 +1856,42 @@ static void raid56_scrub_wait_endio(struct bio *bio) static int queue_scrub_stripe(struct scrub_ctx *sctx, struct btrfs_block_group *bg, struct btrfs_device *dev, int mirror_num, - u64 logical, u32 length, u64 physical) + u64 logical, u32 length, u64 physical, + u64 *found_logical_ret) { struct scrub_stripe *stripe; int ret; - /* No available slot, submit all stripes and wait for them. */ - if (sctx->cur_stripe >= SCRUB_STRIPES_PER_SCTX) { - ret = flush_scrub_stripes(sctx); - if (ret < 0) - return ret; - } + /* + * There should always be one slot left, as caller filling the last + * slot should flush them all. + */ + ASSERT(sctx->cur_stripe < SCRUB_TOTAL_STRIPES); - stripe = &sctx->stripes[sctx->cur_stripe]; + /* @found_logical_ret must be specified. */ + ASSERT(found_logical_ret); - /* We can queue one stripe using the remaining slot. */ + stripe = &sctx->stripes[sctx->cur_stripe]; scrub_reset_stripe(stripe); - ret = scrub_find_fill_first_stripe(bg, dev, physical, mirror_num, - logical, length, stripe); + ret = scrub_find_fill_first_stripe(bg, &sctx->extent_path, + &sctx->csum_path, dev, physical, + mirror_num, logical, length, stripe); /* Either >0 as no more extents or <0 for error. */ if (ret) return ret; + *found_logical_ret = stripe->logical; sctx->cur_stripe++; + + /* We filled one group, submit it. */ + if (sctx->cur_stripe % SCRUB_STRIPES_PER_GROUP == 0) { + const int first_slot = sctx->cur_stripe - SCRUB_STRIPES_PER_GROUP; + + submit_initial_group_read(sctx, first_slot, SCRUB_STRIPES_PER_GROUP); + } + + /* Last slot used, flush them all. */ + if (sctx->cur_stripe == SCRUB_TOTAL_STRIPES) + return flush_scrub_stripes(sctx); return 0; } @@ -1785,6 +1905,8 @@ static int scrub_raid56_parity_stripe(struct scrub_ctx *sctx, struct btrfs_fs_info *fs_info = sctx->fs_info; struct btrfs_raid_bio *rbio; struct btrfs_io_context *bioc = NULL; + struct btrfs_path extent_path = { 0 }; + struct btrfs_path csum_path = { 0 }; struct bio *bio; struct scrub_stripe *stripe; bool all_empty = true; @@ -1795,6 +1917,16 @@ static int scrub_raid56_parity_stripe(struct scrub_ctx *sctx, ASSERT(sctx->raid56_data_stripes); + /* + * For data stripe search, we cannot re-use the same extent/csum paths, + * as the data stripe bytenr may be smaller than previous extent. Thus + * we have to use our own extent/csum paths. + */ + extent_path.search_commit_root = 1; + extent_path.skip_locking = 1; + csum_path.search_commit_root = 1; + csum_path.skip_locking = 1; + for (int i = 0; i < data_stripes; i++) { int stripe_index; int rot; @@ -1809,7 +1941,7 @@ static int scrub_raid56_parity_stripe(struct scrub_ctx *sctx, scrub_reset_stripe(stripe); set_bit(SCRUB_STRIPE_FLAG_NO_REPORT, &stripe->state); - ret = scrub_find_fill_first_stripe(bg, + ret = scrub_find_fill_first_stripe(bg, &extent_path, &csum_path, map->stripes[stripe_index].dev, physical, 1, full_stripe_start + btrfs_stripe_nr_to_offset(i), BTRFS_STRIPE_LEN, stripe); @@ -1854,24 +1986,6 @@ static int scrub_raid56_parity_stripe(struct scrub_ctx *sctx, /* For now, no zoned support for RAID56. */ ASSERT(!btrfs_is_zoned(sctx->fs_info)); - /* Writeback for the repaired sectors. */ - for (int i = 0; i < data_stripes; i++) { - unsigned long repaired; - - stripe = &sctx->raid56_data_stripes[i]; - - bitmap_andnot(&repaired, &stripe->init_error_bitmap, - &stripe->error_bitmap, stripe->nr_sectors); - scrub_write_sectors(sctx, stripe, repaired, false); - } - - /* Wait for the above writebacks to finish. */ - for (int i = 0; i < data_stripes; i++) { - stripe = &sctx->raid56_data_stripes[i]; - - wait_scrub_stripe_io(stripe); - } - /* * Now all data stripes are properly verified. Check if we have any * unrepaired, if so abort immediately or we could further corrupt the @@ -1910,7 +2024,7 @@ static int scrub_raid56_parity_stripe(struct scrub_ctx *sctx, btrfs_bio_counter_inc_blocked(fs_info); ret = btrfs_map_block(fs_info, BTRFS_MAP_WRITE, full_stripe_start, - &length, &bioc, NULL, NULL, 1); + &length, &bioc, NULL, NULL); if (ret < 0) { btrfs_put_bioc(bioc); btrfs_bio_counter_dec(fs_info); @@ -1937,6 +2051,8 @@ static int scrub_raid56_parity_stripe(struct scrub_ctx *sctx, bio_put(bio); btrfs_bio_counter_dec(fs_info); + btrfs_release_path(&extent_path); + btrfs_release_path(&csum_path); out: return ret; } @@ -1958,18 +2074,15 @@ static int scrub_simple_mirror(struct scrub_ctx *sctx, { struct btrfs_fs_info *fs_info = sctx->fs_info; const u64 logical_end = logical_start + logical_length; - /* An artificial limit, inherit from old scrub behavior */ - struct btrfs_path path = { 0 }; u64 cur_logical = logical_start; int ret; /* The range must be inside the bg */ ASSERT(logical_start >= bg->start && logical_end <= bg->start + bg->length); - path.search_commit_root = 1; - path.skip_locking = 1; /* Go through each extent items inside the logical range */ while (cur_logical < logical_end) { + u64 found_logical = U64_MAX; u64 cur_physical = physical + cur_logical - logical_start; /* Canceled? */ @@ -1994,7 +2107,7 @@ static int scrub_simple_mirror(struct scrub_ctx *sctx, ret = queue_scrub_stripe(sctx, bg, device, mirror_num, cur_logical, logical_end - cur_logical, - cur_physical); + cur_physical, &found_logical); if (ret > 0) { /* No more extent, just update the accounting */ sctx->stat.last_physical = physical + logical_length; @@ -2004,14 +2117,13 @@ static int scrub_simple_mirror(struct scrub_ctx *sctx, if (ret < 0) break; - ASSERT(sctx->cur_stripe > 0); - cur_logical = sctx->stripes[sctx->cur_stripe - 1].logical - + BTRFS_STRIPE_LEN; + /* queue_scrub_stripe() returned 0, @found_logical must be updated. */ + ASSERT(found_logical != U64_MAX); + cur_logical = found_logical + BTRFS_STRIPE_LEN; /* Don't hold CPU for too long time */ cond_resched(); } - btrfs_release_path(&path); return ret; } @@ -2109,6 +2221,9 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, u64 stripe_logical; int stop_loop = 0; + /* Extent_path should be released by now. */ + ASSERT(sctx->extent_path.nodes[0] == NULL); + scrub_blocked_if_needed(fs_info); if (sctx->is_dev_replace && @@ -2227,6 +2342,9 @@ out: ret2 = flush_scrub_stripes(sctx); if (!ret) ret = ret2; + btrfs_release_path(&sctx->extent_path); + btrfs_release_path(&sctx->csum_path); + if (sctx->raid56_data_stripes) { for (int i = 0; i < nr_data_stripes(map); i++) release_scrub_stripe(&sctx->raid56_data_stripes[i]); @@ -2673,7 +2791,7 @@ static noinline_for_stack int scrub_supers(struct scrub_ctx *sctx, if (scrub_dev->fs_devices != fs_info->fs_devices) gen = scrub_dev->generation; else - gen = fs_info->last_trans_committed; + gen = btrfs_get_last_trans_committed(fs_info); for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) { bytenr = btrfs_sb_offset(i); @@ -2711,8 +2829,7 @@ static void scrub_workers_put(struct btrfs_fs_info *fs_info) /* * get a reference count on fs_info->scrub_workers. start worker if necessary */ -static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info, - int is_dev_replace) +static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info) { struct workqueue_struct *scrub_workers = NULL; unsigned int flags = WQ_FREEZABLE | WQ_UNBOUND; @@ -2722,10 +2839,7 @@ static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info, if (refcount_inc_not_zero(&fs_info->scrub_workers_refcnt)) return 0; - if (is_dev_replace) - scrub_workers = alloc_ordered_workqueue("btrfs-scrub", flags); - else - scrub_workers = alloc_workqueue("btrfs-scrub", flags, max_active); + scrub_workers = alloc_workqueue("btrfs-scrub", flags, max_active); if (!scrub_workers) return -ENOMEM; @@ -2777,7 +2891,7 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, if (IS_ERR(sctx)) return PTR_ERR(sctx); - ret = scrub_workers_get(fs_info, is_dev_replace); + ret = scrub_workers_get(fs_info); if (ret) goto out_free_ctx; diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 8bfd44750efe..4e36550618e5 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -796,7 +796,7 @@ static int send_cmd(struct send_ctx *sctx) put_unaligned_le32(sctx->send_size - sizeof(*hdr), &hdr->len); put_unaligned_le32(0, &hdr->crc); - crc = btrfs_crc32c(0, (unsigned char *)sctx->send_buf, sctx->send_size); + crc = crc32c(0, (unsigned char *)sctx->send_buf, sctx->send_size); put_unaligned_le32(crc, &hdr->crc); ret = write_buf(sctx->send_filp, sctx->send_buf, sctx->send_size, @@ -3685,7 +3685,7 @@ static void tail_append_pending_moves(struct send_ctx *sctx, static int apply_children_dir_moves(struct send_ctx *sctx) { struct pending_dir_move *pm; - struct list_head stack; + LIST_HEAD(stack); u64 parent_ino = sctx->cur_ino; int ret = 0; @@ -3693,7 +3693,6 @@ static int apply_children_dir_moves(struct send_ctx *sctx) if (!pm) return 0; - INIT_LIST_HEAD(&stack); tail_append_pending_moves(sctx, pm, &stack); while (!list_empty(&stack)) { @@ -4165,7 +4164,7 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move) int ret = 0; struct recorded_ref *cur; struct recorded_ref *cur2; - struct list_head check_dirs; + LIST_HEAD(check_dirs); struct fs_path *valid_path = NULL; u64 ow_inode = 0; u64 ow_gen; @@ -4184,7 +4183,6 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move) * which is always '..' */ BUG_ON(sctx->cur_ino <= BTRFS_FIRST_FREE_OBJECTID); - INIT_LIST_HEAD(&check_dirs); valid_path = fs_path_alloc(); if (!valid_path) { @@ -5671,8 +5669,8 @@ static int send_encoded_extent(struct send_ctx *sctx, struct btrfs_path *path, hdr = (struct btrfs_cmd_header *)sctx->send_buf; hdr->len = cpu_to_le32(sctx->send_size + disk_num_bytes - sizeof(*hdr)); hdr->crc = 0; - crc = btrfs_crc32c(0, sctx->send_buf, sctx->send_size); - crc = btrfs_crc32c(crc, sctx->send_buf + data_offset, disk_num_bytes); + crc = crc32c(0, sctx->send_buf, sctx->send_size); + crc = crc32c(crc, sctx->send_buf + data_offset, disk_num_bytes); hdr->crc = cpu_to_le32(crc); ret = write_buf(sctx->send_filp, sctx->send_buf, sctx->send_size, @@ -8160,7 +8158,7 @@ long btrfs_ioctl_send(struct inode *inode, struct btrfs_ioctl_send_args *arg) } sctx->send_filp = fget(arg->send_fd); - if (!sctx->send_filp) { + if (!sctx->send_filp || !(sctx->send_filp->f_mode & FMODE_WRITE)) { ret = -EBADF; goto out; } diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c index 75e7fa337e66..571bb13587d5 100644 --- a/fs/btrfs/space-info.c +++ b/fs/btrfs/space-info.c @@ -345,8 +345,10 @@ static u64 calc_available_free_space(struct btrfs_fs_info *fs_info, struct btrfs_space_info *space_info, enum btrfs_reserve_flush_enum flush) { + struct btrfs_space_info *data_sinfo; u64 profile; u64 avail; + u64 data_chunk_size; int factor; if (space_info->flags & BTRFS_BLOCK_GROUP_SYSTEM) @@ -364,6 +366,36 @@ static u64 calc_available_free_space(struct btrfs_fs_info *fs_info, */ factor = btrfs_bg_type_to_factor(profile); avail = div_u64(avail, factor); + if (avail == 0) + return 0; + + /* + * Calculate the data_chunk_size, space_info->chunk_size is the + * "optimal" chunk size based on the fs size. However when we actually + * allocate the chunk we will strip this down further, making it no more + * than 10% of the disk or 1G, whichever is smaller. + */ + data_sinfo = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_DATA); + data_chunk_size = min(data_sinfo->chunk_size, + mult_perc(fs_info->fs_devices->total_rw_bytes, 10)); + data_chunk_size = min_t(u64, data_chunk_size, SZ_1G); + + /* + * Since data allocations immediately use block groups as part of the + * reservation, because we assume that data reservations will == actual + * usage, we could potentially overcommit and then immediately have that + * available space used by a data allocation, which could put us in a + * bind when we get close to filling the file system. + * + * To handle this simply remove the data_chunk_size from the available + * space. If we are relatively empty this won't affect our ability to + * overcommit much, and if we're very close to full it'll keep us from + * getting into a position where we've given ourselves very little + * metadata wiggle room. + */ + if (avail <= data_chunk_size) + return 0; + avail -= data_chunk_size; /* * If we aren't flushing all things, let us overcommit up to @@ -389,11 +421,7 @@ int btrfs_can_overcommit(struct btrfs_fs_info *fs_info, return 0; used = btrfs_space_info_used(space_info, true); - if (test_bit(BTRFS_FS_ACTIVE_ZONE_TRACKING, &fs_info->flags) && - (space_info->flags & BTRFS_BLOCK_GROUP_METADATA)) - avail = 0; - else - avail = calc_available_free_space(fs_info, space_info, flush); + avail = calc_available_free_space(fs_info, space_info, flush); if (used + bytes < space_info->total_bytes + avail) return 1; @@ -510,6 +538,7 @@ void btrfs_dump_space_info(struct btrfs_fs_info *fs_info, int dump_block_groups) { struct btrfs_block_group *cache; + u64 total_avail = 0; int index = 0; spin_lock(&info->lock); @@ -523,18 +552,27 @@ void btrfs_dump_space_info(struct btrfs_fs_info *fs_info, down_read(&info->groups_sem); again: list_for_each_entry(cache, &info->block_groups[index], list) { + u64 avail; + spin_lock(&cache->lock); + avail = cache->length - cache->used - cache->pinned - + cache->reserved - cache->delalloc_bytes - + cache->bytes_super - cache->zone_unusable; btrfs_info(fs_info, - "block group %llu has %llu bytes, %llu used %llu pinned %llu reserved %llu zone_unusable %s", - cache->start, cache->length, cache->used, cache->pinned, - cache->reserved, cache->zone_unusable, - cache->ro ? "[readonly]" : ""); +"block group %llu has %llu bytes, %llu used %llu pinned %llu reserved %llu delalloc %llu super %llu zone_unusable (%llu bytes available) %s", + cache->start, cache->length, cache->used, cache->pinned, + cache->reserved, cache->delalloc_bytes, + cache->bytes_super, cache->zone_unusable, + avail, cache->ro ? "[readonly]" : ""); spin_unlock(&cache->lock); btrfs_dump_free_space(cache, bytes); + total_avail += avail; } if (++index < BTRFS_NR_RAID_TYPES) goto again; up_read(&info->groups_sem); + + btrfs_info(fs_info, "%llu bytes available across all block groups", total_avail); } static inline u64 calc_reclaim_items_nr(const struct btrfs_fs_info *fs_info, @@ -550,18 +588,6 @@ static inline u64 calc_reclaim_items_nr(const struct btrfs_fs_info *fs_info, return nr; } -static inline u64 calc_delayed_refs_nr(const struct btrfs_fs_info *fs_info, - u64 to_reclaim) -{ - const u64 bytes = btrfs_calc_delayed_ref_bytes(fs_info, 1); - u64 nr; - - nr = div64_u64(to_reclaim, bytes); - if (!nr) - nr = 1; - return nr; -} - #define EXTENT_SIZE_PER_ITEM SZ_256K /* @@ -715,9 +741,11 @@ static void flush_space(struct btrfs_fs_info *fs_info, else nr = -1; - trans = btrfs_join_transaction(root); + trans = btrfs_join_transaction_nostart(root); if (IS_ERR(trans)) { ret = PTR_ERR(trans); + if (ret == -ENOENT) + ret = 0; break; } ret = btrfs_run_delayed_items_nr(trans, nr); @@ -733,32 +761,21 @@ static void flush_space(struct btrfs_fs_info *fs_info, break; case FLUSH_DELAYED_REFS_NR: case FLUSH_DELAYED_REFS: - trans = btrfs_join_transaction(root); + trans = btrfs_join_transaction_nostart(root); if (IS_ERR(trans)) { ret = PTR_ERR(trans); + if (ret == -ENOENT) + ret = 0; break; } if (state == FLUSH_DELAYED_REFS_NR) - nr = calc_delayed_refs_nr(fs_info, num_bytes); + btrfs_run_delayed_refs(trans, num_bytes); else - nr = 0; - btrfs_run_delayed_refs(trans, nr); + btrfs_run_delayed_refs(trans, 0); btrfs_end_transaction(trans); break; case ALLOC_CHUNK: case ALLOC_CHUNK_FORCE: - /* - * For metadata space on zoned filesystem, reaching here means we - * don't have enough space left in active_total_bytes. Try to - * activate a block group first, because we may have inactive - * block group already allocated. - */ - ret = btrfs_zoned_activate_one_bg(fs_info, space_info, false); - if (ret < 0) - break; - else if (ret == 1) - break; - trans = btrfs_join_transaction(root); if (IS_ERR(trans)) { ret = PTR_ERR(trans); @@ -770,22 +787,6 @@ static void flush_space(struct btrfs_fs_info *fs_info, CHUNK_ALLOC_FORCE); btrfs_end_transaction(trans); - /* - * For metadata space on zoned filesystem, allocating a new chunk - * is not enough. We still need to activate the block * group. - * Active the newly allocated block group by (maybe) finishing - * a block group. - */ - if (ret == 1) { - ret = btrfs_zoned_activate_one_bg(fs_info, space_info, true); - /* - * Revert to the original ret regardless we could finish - * one block group or not. - */ - if (ret >= 0) - ret = 1; - } - if (ret > 0 || ret == -ENOSPC) ret = 0; break; @@ -800,9 +801,18 @@ static void flush_space(struct btrfs_fs_info *fs_info, break; case COMMIT_TRANS: ASSERT(current->journal_info == NULL); - trans = btrfs_join_transaction(root); + /* + * We don't want to start a new transaction, just attach to the + * current one or wait it fully commits in case its commit is + * happening at the moment. Note: we don't use a nostart join + * because that does not wait for a transaction to fully commit + * (only for it to be unblocked, state TRANS_STATE_UNBLOCKED). + */ + trans = btrfs_attach_transaction_barrier(root); if (IS_ERR(trans)) { ret = PTR_ERR(trans); + if (ret == -ENOENT) + ret = 0; break; } ret = btrfs_commit_transaction(trans); @@ -987,7 +997,8 @@ static bool steal_from_global_rsv(struct btrfs_fs_info *fs_info, } /* - * maybe_fail_all_tickets - we've exhausted our flushing, start failing tickets + * We've exhausted our flushing, start failing tickets. + * * @fs_info - fs_info for this fs * @space_info - the space info we were flushing * @@ -1408,8 +1419,18 @@ static void priority_reclaim_metadata_space(struct btrfs_fs_info *fs_info, } } - /* Attempt to steal from the global rsv if we can. */ - if (!steal_from_global_rsv(fs_info, space_info, ticket)) { + /* + * Attempt to steal from the global rsv if we can, except if the fs was + * turned into error mode due to a transaction abort when flushing space + * above, in that case fail with the abort error instead of returning + * success to the caller if we can steal from the global rsv - this is + * just to have caller fail immeditelly instead of later when trying to + * modify the fs, making it easier to debug -ENOSPC problems. + */ + if (BTRFS_FS_ERROR(fs_info)) { + ticket->error = BTRFS_FS_ERROR(fs_info); + remove_ticket(space_info, ticket); + } else if (!steal_from_global_rsv(fs_info, space_info, ticket)) { ticket->error = -ENOSPC; remove_ticket(space_info, ticket); } @@ -1741,7 +1762,7 @@ static int __reserve_bytes(struct btrfs_fs_info *fs_info, * Try to reserve metadata bytes from the block_rsv's space. * * @fs_info: the filesystem - * @block_rsv: block_rsv we're allocating for + * @space_info: the space_info we're allocating for * @orig_bytes: number of bytes we want * @flush: whether or not we can flush to make our reservation * @@ -1753,21 +1774,19 @@ static int __reserve_bytes(struct btrfs_fs_info *fs_info, * space already. */ int btrfs_reserve_metadata_bytes(struct btrfs_fs_info *fs_info, - struct btrfs_block_rsv *block_rsv, + struct btrfs_space_info *space_info, u64 orig_bytes, enum btrfs_reserve_flush_enum flush) { int ret; - ret = __reserve_bytes(fs_info, block_rsv->space_info, orig_bytes, flush); + ret = __reserve_bytes(fs_info, space_info, orig_bytes, flush); if (ret == -ENOSPC) { trace_btrfs_space_reservation(fs_info, "space_info:enospc", - block_rsv->space_info->flags, - orig_bytes, 1); + space_info->flags, orig_bytes, 1); if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) - btrfs_dump_space_info(fs_info, block_rsv->space_info, - orig_bytes, 0); + btrfs_dump_space_info(fs_info, space_info, orig_bytes, 0); } return ret; } diff --git a/fs/btrfs/space-info.h b/fs/btrfs/space-info.h index 0bb9d14e60a8..92c595fed1b0 100644 --- a/fs/btrfs/space-info.h +++ b/fs/btrfs/space-info.h @@ -3,6 +3,7 @@ #ifndef BTRFS_SPACE_INFO_H #define BTRFS_SPACE_INFO_H +#include <trace/events/btrfs.h> #include "volumes.h" /* @@ -212,7 +213,7 @@ void btrfs_dump_space_info(struct btrfs_fs_info *fs_info, struct btrfs_space_info *info, u64 bytes, int dump_block_groups); int btrfs_reserve_metadata_bytes(struct btrfs_fs_info *fs_info, - struct btrfs_block_rsv *block_rsv, + struct btrfs_space_info *space_info, u64 orig_bytes, enum btrfs_reserve_flush_enum flush); void btrfs_try_granting_tickets(struct btrfs_fs_info *fs_info, diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index f1dd172d8d5b..ef256b944c72 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -26,6 +26,7 @@ #include <linux/ratelimit.h> #include <linux/crc32c.h> #include <linux/btrfs.h> +#include <linux/security.h> #include "messages.h" #include "delayed-inode.h" #include "ctree.h" @@ -79,7 +80,10 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data); static void btrfs_put_super(struct super_block *sb) { - close_ctree(btrfs_sb(sb)); + struct btrfs_fs_info *fs_info = btrfs_sb(sb); + + btrfs_info(fs_info, "last unmount of filesystem %pU", fs_info->fs_devices->fsid); + close_ctree(fs_info); } enum { @@ -129,9 +133,6 @@ enum { Opt_inode_cache, Opt_noinode_cache, /* Debugging options */ - Opt_check_integrity, - Opt_check_integrity_including_extent_data, - Opt_check_integrity_print_mask, Opt_enospc_debug, Opt_noenospc_debug, #ifdef CONFIG_BTRFS_DEBUG Opt_fragment_data, Opt_fragment_metadata, Opt_fragment_all, @@ -200,9 +201,6 @@ static const match_table_t tokens = { {Opt_recovery, "recovery"}, /* Debugging options */ - {Opt_check_integrity, "check_int"}, - {Opt_check_integrity_including_extent_data, "check_int_data"}, - {Opt_check_integrity_print_mask, "check_int_print_mask=%u"}, {Opt_enospc_debug, "enospc_debug"}, {Opt_noenospc_debug, "noenospc_debug"}, #ifdef CONFIG_BTRFS_DEBUG @@ -707,38 +705,6 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options, case Opt_skip_balance: btrfs_set_opt(info->mount_opt, SKIP_BALANCE); break; -#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY - case Opt_check_integrity_including_extent_data: - btrfs_info(info, - "enabling check integrity including extent data"); - btrfs_set_opt(info->mount_opt, CHECK_INTEGRITY_DATA); - btrfs_set_opt(info->mount_opt, CHECK_INTEGRITY); - break; - case Opt_check_integrity: - btrfs_info(info, "enabling check integrity"); - btrfs_set_opt(info->mount_opt, CHECK_INTEGRITY); - break; - case Opt_check_integrity_print_mask: - ret = match_int(&args[0], &intarg); - if (ret) { - btrfs_err(info, - "unrecognized check_integrity_print_mask value %s", - args[0].from); - goto out; - } - info->check_integrity_print_mask = intarg; - btrfs_info(info, "check_integrity_print_mask 0x%x", - info->check_integrity_print_mask); - break; -#else - case Opt_check_integrity_including_extent_data: - case Opt_check_integrity: - case Opt_check_integrity_print_mask: - btrfs_err(info, - "support for check_integrity* not compiled in!"); - ret = -EINVAL; - goto out; -#endif case Opt_fatal_errors: if (strcmp(args[0].from, "panic") == 0) { btrfs_set_opt(info->mount_opt, @@ -883,7 +849,7 @@ static int btrfs_parse_device_options(const char *options, blk_mode_t flags) error = -ENOMEM; goto out; } - device = btrfs_scan_one_device(device_name, flags); + device = btrfs_scan_one_device(device_name, flags, false); kfree(device_name); if (IS_ERR(device)) { error = PTR_ERR(device); @@ -1299,15 +1265,6 @@ static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry) seq_puts(seq, ",autodefrag"); if (btrfs_test_opt(info, SKIP_BALANCE)) seq_puts(seq, ",skip_balance"); -#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY - if (btrfs_test_opt(info, CHECK_INTEGRITY_DATA)) - seq_puts(seq, ",check_int_data"); - else if (btrfs_test_opt(info, CHECK_INTEGRITY)) - seq_puts(seq, ",check_int"); - if (info->check_integrity_print_mask) - seq_printf(seq, ",check_int_print_mask=%d", - info->check_integrity_print_mask); -#endif if (info->metadata_ratio) seq_printf(seq, ",metadata_ratio=%u", info->metadata_ratio); if (btrfs_test_opt(info, PANIC_ON_FATAL_ERROR)) @@ -1478,7 +1435,12 @@ static struct dentry *btrfs_mount_root(struct file_system_type *fs_type, goto error_fs_info; } - device = btrfs_scan_one_device(device_name, mode); + /* + * With 'true' passed to btrfs_scan_one_device() (mount time) we expect + * either a valid device or an error. + */ + device = btrfs_scan_one_device(device_name, mode, true); + ASSERT(device != NULL); if (IS_ERR(device)) { mutex_unlock(&uuid_mutex); error = PTR_ERR(device); @@ -1513,7 +1475,7 @@ static struct dentry *btrfs_mount_root(struct file_system_type *fs_type, error = -EBUSY; } else { snprintf(s->s_id, sizeof(s->s_id), "%pg", bdev); - shrinker_debugfs_rename(&s->s_shrink, "sb-%s:%s", fs_type->name, + shrinker_debugfs_rename(s->s_shrink, "sb-%s:%s", fs_type->name, s->s_id); btrfs_sb(s)->bdev_holder = fs_type; error = btrfs_fill_super(s, fs_devices, data); @@ -2111,7 +2073,7 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf) * calculated f_bavail. */ if (!mixed && block_rsv->space_info->full && - total_free_meta - thresh < block_rsv->size) + (total_free_meta < thresh || total_free_meta - thresh < block_rsv->size)) buf->f_bavail = 0; buf->f_type = BTRFS_SUPER_MAGIC; @@ -2190,7 +2152,11 @@ static long btrfs_control_ioctl(struct file *file, unsigned int cmd, switch (cmd) { case BTRFS_IOC_SCAN_DEV: mutex_lock(&uuid_mutex); - device = btrfs_scan_one_device(vol->name, BLK_OPEN_READ); + /* + * Scanning outside of mount can return NULL which would turn + * into 0 error code. + */ + device = btrfs_scan_one_device(vol->name, BLK_OPEN_READ, false); ret = PTR_ERR_OR_ZERO(device); mutex_unlock(&uuid_mutex); break; @@ -2204,8 +2170,12 @@ static long btrfs_control_ioctl(struct file *file, unsigned int cmd, break; case BTRFS_IOC_DEVICES_READY: mutex_lock(&uuid_mutex); - device = btrfs_scan_one_device(vol->name, BLK_OPEN_READ); - if (IS_ERR(device)) { + /* + * Scanning outside of mount can return NULL which would turn + * into 0 error code. + */ + device = btrfs_scan_one_device(vol->name, BLK_OPEN_READ, false); + if (IS_ERR_OR_NULL(device)) { mutex_unlock(&uuid_mutex); ret = PTR_ERR(device); break; @@ -2250,6 +2220,7 @@ static int check_dev_super(struct btrfs_device *dev) { struct btrfs_fs_info *fs_info = dev->fs_info; struct btrfs_super_block *sb; + u64 last_trans; u16 csum_type; int ret = 0; @@ -2285,10 +2256,10 @@ static int check_dev_super(struct btrfs_device *dev) if (ret < 0) goto out; - if (btrfs_super_generation(sb) != fs_info->last_trans_committed) { + last_trans = btrfs_get_last_trans_committed(fs_info); + if (btrfs_super_generation(sb) != last_trans) { btrfs_err(fs_info, "transid mismatch, has %llu expect %llu", - btrfs_super_generation(sb), - fs_info->last_trans_committed); + btrfs_super_generation(sb), last_trans); ret = -EUCLEAN; goto out; } @@ -2398,9 +2369,6 @@ static int __init btrfs_print_mod_info(void) #ifdef CONFIG_BTRFS_ASSERT ", assert=on" #endif -#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY - ", integrity-checker=on" -#endif #ifdef CONFIG_BTRFS_FS_REF_VERIFY ", ref-verify=on" #endif diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index 25294e624851..e6b51fb3ddc1 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -291,12 +291,15 @@ BTRFS_FEAT_ATTR_INCOMPAT(metadata_uuid, METADATA_UUID); BTRFS_FEAT_ATTR_COMPAT_RO(free_space_tree, FREE_SPACE_TREE); BTRFS_FEAT_ATTR_COMPAT_RO(block_group_tree, BLOCK_GROUP_TREE); BTRFS_FEAT_ATTR_INCOMPAT(raid1c34, RAID1C34); +BTRFS_FEAT_ATTR_INCOMPAT(simple_quota, SIMPLE_QUOTA); #ifdef CONFIG_BLK_DEV_ZONED BTRFS_FEAT_ATTR_INCOMPAT(zoned, ZONED); #endif #ifdef CONFIG_BTRFS_DEBUG /* Remove once support for extent tree v2 is feature complete */ BTRFS_FEAT_ATTR_INCOMPAT(extent_tree_v2, EXTENT_TREE_V2); +/* Remove once support for raid stripe tree is feature complete. */ +BTRFS_FEAT_ATTR_INCOMPAT(raid_stripe_tree, RAID_STRIPE_TREE); #endif #ifdef CONFIG_FS_VERITY BTRFS_FEAT_ATTR_COMPAT_RO(verity, VERITY); @@ -322,11 +325,13 @@ static struct attribute *btrfs_supported_feature_attrs[] = { BTRFS_FEAT_ATTR_PTR(free_space_tree), BTRFS_FEAT_ATTR_PTR(raid1c34), BTRFS_FEAT_ATTR_PTR(block_group_tree), + BTRFS_FEAT_ATTR_PTR(simple_quota), #ifdef CONFIG_BLK_DEV_ZONED BTRFS_FEAT_ATTR_PTR(zoned), #endif #ifdef CONFIG_BTRFS_DEBUG BTRFS_FEAT_ATTR_PTR(extent_tree_v2), + BTRFS_FEAT_ATTR_PTR(raid_stripe_tree), #endif #ifdef CONFIG_FS_VERITY BTRFS_FEAT_ATTR_PTR(verity), @@ -414,6 +419,19 @@ static ssize_t supported_sectorsizes_show(struct kobject *kobj, BTRFS_ATTR(static_feature, supported_sectorsizes, supported_sectorsizes_show); +static ssize_t acl_show(struct kobject *kobj, struct kobj_attribute *a, char *buf) +{ + return sysfs_emit(buf, "%d\n", !!IS_ENABLED(CONFIG_BTRFS_FS_POSIX_ACL)); +} +BTRFS_ATTR(static_feature, acl, acl_show); + +static ssize_t temp_fsid_supported_show(struct kobject *kobj, + struct kobj_attribute *a, char *buf) +{ + return sysfs_emit(buf, "0\n"); +} +BTRFS_ATTR(static_feature, temp_fsid, temp_fsid_supported_show); + /* * Features which only depend on kernel version. * @@ -421,11 +439,13 @@ BTRFS_ATTR(static_feature, supported_sectorsizes, * btrfs_supported_feature_attrs. */ static struct attribute *btrfs_supported_static_feature_attrs[] = { + BTRFS_ATTR_PTR(static_feature, acl), BTRFS_ATTR_PTR(static_feature, rmdir_subvol), BTRFS_ATTR_PTR(static_feature, supported_checksums), BTRFS_ATTR_PTR(static_feature, send_stream_version), BTRFS_ATTR_PTR(static_feature, supported_rescue_options), BTRFS_ATTR_PTR(static_feature, supported_sectorsizes), + BTRFS_ATTR_PTR(static_feature, temp_fsid), NULL }; @@ -1189,10 +1209,19 @@ static ssize_t btrfs_generation_show(struct kobject *kobj, { struct btrfs_fs_info *fs_info = to_fs_info(kobj); - return sysfs_emit(buf, "%llu\n", fs_info->generation); + return sysfs_emit(buf, "%llu\n", btrfs_get_fs_generation(fs_info)); } BTRFS_ATTR(, generation, btrfs_generation_show); +static ssize_t btrfs_temp_fsid_show(struct kobject *kobj, + struct kobj_attribute *a, char *buf) +{ + struct btrfs_fs_info *fs_info = to_fs_info(kobj); + + return sysfs_emit(buf, "%d\n", fs_info->fs_devices->temp_fsid); +} +BTRFS_ATTR(, temp_fsid, btrfs_temp_fsid_show); + static const char * const btrfs_read_policy_name[] = { "pid" }; static ssize_t btrfs_read_policy_show(struct kobject *kobj, @@ -1295,6 +1324,7 @@ static const struct attribute *btrfs_attrs[] = { BTRFS_ATTR_PTR(, read_policy), BTRFS_ATTR_PTR(, bg_reclaim_threshold), BTRFS_ATTR_PTR(, commit_stats), + BTRFS_ATTR_PTR(, temp_fsid), NULL, }; @@ -2079,6 +2109,33 @@ static ssize_t qgroup_enabled_show(struct kobject *qgroups_kobj, } BTRFS_ATTR(qgroups, enabled, qgroup_enabled_show); +static ssize_t qgroup_mode_show(struct kobject *qgroups_kobj, + struct kobj_attribute *a, + char *buf) +{ + struct btrfs_fs_info *fs_info = to_fs_info(qgroups_kobj->parent); + ssize_t ret = 0; + + spin_lock(&fs_info->qgroup_lock); + ASSERT(btrfs_qgroup_enabled(fs_info)); + switch (btrfs_qgroup_mode(fs_info)) { + case BTRFS_QGROUP_MODE_FULL: + ret = sysfs_emit(buf, "qgroup\n"); + break; + case BTRFS_QGROUP_MODE_SIMPLE: + ret = sysfs_emit(buf, "squota\n"); + break; + default: + btrfs_warn(fs_info, "unexpected qgroup mode %d\n", + btrfs_qgroup_mode(fs_info)); + break; + } + spin_unlock(&fs_info->qgroup_lock); + + return ret; +} +BTRFS_ATTR(qgroups, mode, qgroup_mode_show); + static ssize_t qgroup_inconsistent_show(struct kobject *qgroups_kobj, struct kobj_attribute *a, char *buf) @@ -2141,6 +2198,7 @@ static struct attribute *qgroups_attrs[] = { BTRFS_ATTR_PTR(qgroups, enabled), BTRFS_ATTR_PTR(qgroups, inconsistent), BTRFS_ATTR_PTR(qgroups, drop_subtree_threshold), + BTRFS_ATTR_PTR(qgroups, mode), NULL }; ATTRIBUTE_GROUPS(qgroups); diff --git a/fs/btrfs/tests/extent-buffer-tests.c b/fs/btrfs/tests/extent-buffer-tests.c index 5ef0b90e25c3..6a43a64ba55a 100644 --- a/fs/btrfs/tests/extent-buffer-tests.c +++ b/fs/btrfs/tests/extent-buffer-tests.c @@ -61,7 +61,11 @@ static int test_btrfs_split_item(u32 sectorsize, u32 nodesize) key.type = BTRFS_EXTENT_CSUM_KEY; key.offset = 0; - btrfs_setup_item_for_insert(root, path, &key, value_len); + /* + * Passing a NULL trans handle is fine here, we have a dummy root eb + * and the tree is a single node (level 0). + */ + btrfs_setup_item_for_insert(NULL, root, path, &key, value_len); write_extent_buffer(eb, value, btrfs_item_ptr_offset(eb, 0), value_len); diff --git a/fs/btrfs/tests/extent-io-tests.c b/fs/btrfs/tests/extent-io-tests.c index f6bc6d738555..1cc86af97dc6 100644 --- a/fs/btrfs/tests/extent-io-tests.c +++ b/fs/btrfs/tests/extent-io-tests.c @@ -319,86 +319,139 @@ out: return ret; } -static int check_eb_bitmap(unsigned long *bitmap, struct extent_buffer *eb, - unsigned long len) +static int check_eb_bitmap(unsigned long *bitmap, struct extent_buffer *eb) { unsigned long i; - for (i = 0; i < len * BITS_PER_BYTE; i++) { + for (i = 0; i < eb->len * BITS_PER_BYTE; i++) { int bit, bit1; bit = !!test_bit(i, bitmap); bit1 = !!extent_buffer_test_bit(eb, 0, i); if (bit1 != bit) { - test_err("bits do not match"); + u8 has; + u8 expect; + + read_extent_buffer(eb, &has, i / BITS_PER_BYTE, 1); + expect = bitmap_get_value8(bitmap, ALIGN(i, BITS_PER_BYTE)); + + test_err( + "bits do not match, start byte 0 bit %lu, byte %lu has 0x%02x expect 0x%02x", + i, i / BITS_PER_BYTE, has, expect); return -EINVAL; } bit1 = !!extent_buffer_test_bit(eb, i / BITS_PER_BYTE, i % BITS_PER_BYTE); if (bit1 != bit) { - test_err("offset bits do not match"); + u8 has; + u8 expect; + + read_extent_buffer(eb, &has, i / BITS_PER_BYTE, 1); + expect = bitmap_get_value8(bitmap, ALIGN(i, BITS_PER_BYTE)); + + test_err( + "bits do not match, start byte %lu bit %lu, byte %lu has 0x%02x expect 0x%02x", + i / BITS_PER_BYTE, i % BITS_PER_BYTE, + i / BITS_PER_BYTE, has, expect); return -EINVAL; } } return 0; } -static int __test_eb_bitmaps(unsigned long *bitmap, struct extent_buffer *eb, - unsigned long len) +static int test_bitmap_set(const char *name, unsigned long *bitmap, + struct extent_buffer *eb, + unsigned long byte_start, unsigned long bit_start, + unsigned long bit_len) +{ + int ret; + + bitmap_set(bitmap, byte_start * BITS_PER_BYTE + bit_start, bit_len); + extent_buffer_bitmap_set(eb, byte_start, bit_start, bit_len); + ret = check_eb_bitmap(bitmap, eb); + if (ret < 0) + test_err("%s test failed", name); + return ret; +} + +static int test_bitmap_clear(const char *name, unsigned long *bitmap, + struct extent_buffer *eb, + unsigned long byte_start, unsigned long bit_start, + unsigned long bit_len) +{ + int ret; + + bitmap_clear(bitmap, byte_start * BITS_PER_BYTE + bit_start, bit_len); + extent_buffer_bitmap_clear(eb, byte_start, bit_start, bit_len); + ret = check_eb_bitmap(bitmap, eb); + if (ret < 0) + test_err("%s test failed", name); + return ret; +} +static int __test_eb_bitmaps(unsigned long *bitmap, struct extent_buffer *eb) { unsigned long i, j; + unsigned long byte_len = eb->len; u32 x; int ret; - memset(bitmap, 0, len); - memzero_extent_buffer(eb, 0, len); - if (memcmp_extent_buffer(eb, bitmap, 0, len) != 0) { - test_err("bitmap was not zeroed"); - return -EINVAL; - } + ret = test_bitmap_clear("clear all run 1", bitmap, eb, 0, 0, + byte_len * BITS_PER_BYTE); + if (ret < 0) + return ret; - bitmap_set(bitmap, 0, len * BITS_PER_BYTE); - extent_buffer_bitmap_set(eb, 0, 0, len * BITS_PER_BYTE); - ret = check_eb_bitmap(bitmap, eb, len); - if (ret) { - test_err("setting all bits failed"); + ret = test_bitmap_set("set all", bitmap, eb, 0, 0, byte_len * BITS_PER_BYTE); + if (ret < 0) return ret; - } - bitmap_clear(bitmap, 0, len * BITS_PER_BYTE); - extent_buffer_bitmap_clear(eb, 0, 0, len * BITS_PER_BYTE); - ret = check_eb_bitmap(bitmap, eb, len); - if (ret) { - test_err("clearing all bits failed"); + ret = test_bitmap_clear("clear all run 2", bitmap, eb, 0, 0, + byte_len * BITS_PER_BYTE); + if (ret < 0) + return ret; + + ret = test_bitmap_set("same byte set", bitmap, eb, 0, 2, 4); + if (ret < 0) + return ret; + + ret = test_bitmap_clear("same byte partial clear", bitmap, eb, 0, 4, 1); + if (ret < 0) + return ret; + + ret = test_bitmap_set("cross byte set", bitmap, eb, 2, 4, 8); + if (ret < 0) + return ret; + + ret = test_bitmap_set("cross multi byte set", bitmap, eb, 4, 4, 24); + if (ret < 0) + return ret; + + ret = test_bitmap_clear("cross byte clear", bitmap, eb, 2, 6, 4); + if (ret < 0) + return ret; + + ret = test_bitmap_clear("cross multi byte clear", bitmap, eb, 4, 6, 20); + if (ret < 0) return ret; - } /* Straddling pages test */ - if (len > PAGE_SIZE) { - bitmap_set(bitmap, - (PAGE_SIZE - sizeof(long) / 2) * BITS_PER_BYTE, - sizeof(long) * BITS_PER_BYTE); - extent_buffer_bitmap_set(eb, PAGE_SIZE - sizeof(long) / 2, 0, - sizeof(long) * BITS_PER_BYTE); - ret = check_eb_bitmap(bitmap, eb, len); - if (ret) { - test_err("setting straddling pages failed"); + if (byte_len > PAGE_SIZE) { + ret = test_bitmap_set("cross page set", bitmap, eb, + PAGE_SIZE - sizeof(long) / 2, 0, + sizeof(long) * BITS_PER_BYTE); + if (ret < 0) + return ret; + + ret = test_bitmap_set("cross page set all", bitmap, eb, 0, 0, + byte_len * BITS_PER_BYTE); + if (ret < 0) return ret; - } - bitmap_set(bitmap, 0, len * BITS_PER_BYTE); - bitmap_clear(bitmap, - (PAGE_SIZE - sizeof(long) / 2) * BITS_PER_BYTE, - sizeof(long) * BITS_PER_BYTE); - extent_buffer_bitmap_set(eb, 0, 0, len * BITS_PER_BYTE); - extent_buffer_bitmap_clear(eb, PAGE_SIZE - sizeof(long) / 2, 0, + ret = test_bitmap_clear("cross page clear", bitmap, eb, + PAGE_SIZE - sizeof(long) / 2, 0, sizeof(long) * BITS_PER_BYTE); - ret = check_eb_bitmap(bitmap, eb, len); - if (ret) { - test_err("clearing straddling pages failed"); + if (ret < 0) return ret; - } } /* @@ -406,9 +459,12 @@ static int __test_eb_bitmaps(unsigned long *bitmap, struct extent_buffer *eb, * something repetitive that could miss some hypothetical off-by-n bug. */ x = 0; - bitmap_clear(bitmap, 0, len * BITS_PER_BYTE); - extent_buffer_bitmap_clear(eb, 0, 0, len * BITS_PER_BYTE); - for (i = 0; i < len * BITS_PER_BYTE / 32; i++) { + ret = test_bitmap_clear("clear all run 3", bitmap, eb, 0, 0, + byte_len * BITS_PER_BYTE); + if (ret < 0) + return ret; + + for (i = 0; i < byte_len * BITS_PER_BYTE / 32; i++) { x = (0x19660dULL * (u64)x + 0x3c6ef35fULL) & 0xffffffffU; for (j = 0; j < 32; j++) { if (x & (1U << j)) { @@ -418,7 +474,7 @@ static int __test_eb_bitmaps(unsigned long *bitmap, struct extent_buffer *eb, } } - ret = check_eb_bitmap(bitmap, eb, len); + ret = check_eb_bitmap(bitmap, eb); if (ret) { test_err("random bit pattern failed"); return ret; @@ -456,7 +512,7 @@ static int test_eb_bitmaps(u32 sectorsize, u32 nodesize) goto out; } - ret = __test_eb_bitmaps(bitmap, eb, nodesize); + ret = __test_eb_bitmaps(bitmap, eb); if (ret) goto out; @@ -473,7 +529,7 @@ static int test_eb_bitmaps(u32 sectorsize, u32 nodesize) goto out; } - ret = __test_eb_bitmaps(bitmap, eb, nodesize); + ret = __test_eb_bitmaps(bitmap, eb); out: free_extent_buffer(eb); kfree(bitmap); @@ -592,6 +648,146 @@ out: return ret; } +static void dump_eb_and_memory_contents(struct extent_buffer *eb, void *memory, + const char *test_name) +{ + for (int i = 0; i < eb->len; i++) { + struct page *page = eb->pages[i >> PAGE_SHIFT]; + void *addr = page_address(page) + offset_in_page(i); + + if (memcmp(addr, memory + i, 1) != 0) { + test_err("%s failed", test_name); + test_err("eb and memory diffs at byte %u, eb has 0x%02x memory has 0x%02x", + i, *(u8 *)addr, *(u8 *)(memory + i)); + return; + } + } +} + +static int verify_eb_and_memory(struct extent_buffer *eb, void *memory, + const char *test_name) +{ + for (int i = 0; i < (eb->len >> PAGE_SHIFT); i++) { + void *eb_addr = page_address(eb->pages[i]); + + if (memcmp(memory + (i << PAGE_SHIFT), eb_addr, PAGE_SIZE) != 0) { + dump_eb_and_memory_contents(eb, memory, test_name); + return -EUCLEAN; + } + } + return 0; +} + +/* + * Init both memory and extent buffer contents to the same randomly generated + * contents. + */ +static void init_eb_and_memory(struct extent_buffer *eb, void *memory) +{ + get_random_bytes(memory, eb->len); + write_extent_buffer(eb, memory, 0, eb->len); +} + +static int test_eb_mem_ops(u32 sectorsize, u32 nodesize) +{ + struct btrfs_fs_info *fs_info; + struct extent_buffer *eb = NULL; + void *memory = NULL; + int ret; + + test_msg("running extent buffer memory operation tests"); + + fs_info = btrfs_alloc_dummy_fs_info(nodesize, sectorsize); + if (!fs_info) { + test_std_err(TEST_ALLOC_FS_INFO); + return -ENOMEM; + } + + memory = kvzalloc(nodesize, GFP_KERNEL); + if (!memory) { + test_err("failed to allocate memory"); + ret = -ENOMEM; + goto out; + } + + eb = __alloc_dummy_extent_buffer(fs_info, SZ_1M, nodesize); + if (!eb) { + test_std_err(TEST_ALLOC_EXTENT_BUFFER); + ret = -ENOMEM; + goto out; + } + + init_eb_and_memory(eb, memory); + ret = verify_eb_and_memory(eb, memory, "full eb write"); + if (ret < 0) + goto out; + + memcpy(memory, memory + 16, 16); + memcpy_extent_buffer(eb, 0, 16, 16); + ret = verify_eb_and_memory(eb, memory, "same page non-overlapping memcpy 1"); + if (ret < 0) + goto out; + + memcpy(memory, memory + 2048, 16); + memcpy_extent_buffer(eb, 0, 2048, 16); + ret = verify_eb_and_memory(eb, memory, "same page non-overlapping memcpy 2"); + if (ret < 0) + goto out; + memcpy(memory, memory + 2048, 2048); + memcpy_extent_buffer(eb, 0, 2048, 2048); + ret = verify_eb_and_memory(eb, memory, "same page non-overlapping memcpy 3"); + if (ret < 0) + goto out; + + memmove(memory + 512, memory + 256, 512); + memmove_extent_buffer(eb, 512, 256, 512); + ret = verify_eb_and_memory(eb, memory, "same page overlapping memcpy 1"); + if (ret < 0) + goto out; + + memmove(memory + 2048, memory + 512, 2048); + memmove_extent_buffer(eb, 2048, 512, 2048); + ret = verify_eb_and_memory(eb, memory, "same page overlapping memcpy 2"); + if (ret < 0) + goto out; + memmove(memory + 512, memory + 2048, 2048); + memmove_extent_buffer(eb, 512, 2048, 2048); + ret = verify_eb_and_memory(eb, memory, "same page overlapping memcpy 3"); + if (ret < 0) + goto out; + + if (nodesize > PAGE_SIZE) { + memcpy(memory, memory + 4096 - 128, 256); + memcpy_extent_buffer(eb, 0, 4096 - 128, 256); + ret = verify_eb_and_memory(eb, memory, "cross page non-overlapping memcpy 1"); + if (ret < 0) + goto out; + + memcpy(memory + 4096 - 128, memory + 4096 + 128, 256); + memcpy_extent_buffer(eb, 4096 - 128, 4096 + 128, 256); + ret = verify_eb_and_memory(eb, memory, "cross page non-overlapping memcpy 2"); + if (ret < 0) + goto out; + + memmove(memory + 4096 - 128, memory + 4096 - 64, 256); + memmove_extent_buffer(eb, 4096 - 128, 4096 - 64, 256); + ret = verify_eb_and_memory(eb, memory, "cross page overlapping memcpy 1"); + if (ret < 0) + goto out; + + memmove(memory + 4096 - 64, memory + 4096 - 128, 256); + memmove_extent_buffer(eb, 4096 - 64, 4096 - 128, 256); + ret = verify_eb_and_memory(eb, memory, "cross page overlapping memcpy 2"); + if (ret < 0) + goto out; + } +out: + free_extent_buffer(eb); + kvfree(memory); + btrfs_free_dummy_fs_info(fs_info); + return ret; +} + int btrfs_test_extent_io(u32 sectorsize, u32 nodesize) { int ret; @@ -607,6 +803,10 @@ int btrfs_test_extent_io(u32 sectorsize, u32 nodesize) goto out; ret = test_eb_bitmaps(sectorsize, nodesize); + if (ret) + goto out; + + ret = test_eb_mem_ops(sectorsize, nodesize); out: return ret; } diff --git a/fs/btrfs/tests/extent-map-tests.c b/fs/btrfs/tests/extent-map-tests.c index ed0f36ae5346..29bdd08b241f 100644 --- a/fs/btrfs/tests/extent-map-tests.c +++ b/fs/btrfs/tests/extent-map-tests.c @@ -6,6 +6,7 @@ #include <linux/types.h> #include "btrfs-tests.h" #include "../ctree.h" +#include "../btrfs_inode.h" #include "../volumes.h" #include "../disk-io.h" #include "../block-group.h" @@ -442,6 +443,406 @@ static int test_case_4(struct btrfs_fs_info *fs_info, return ret; } +static int add_compressed_extent(struct extent_map_tree *em_tree, + u64 start, u64 len, u64 block_start) +{ + struct extent_map *em; + int ret; + + em = alloc_extent_map(); + if (!em) { + test_std_err(TEST_ALLOC_EXTENT_MAP); + return -ENOMEM; + } + + em->start = start; + em->len = len; + em->block_start = block_start; + em->block_len = SZ_4K; + set_bit(EXTENT_FLAG_COMPRESSED, &em->flags); + write_lock(&em_tree->lock); + ret = add_extent_mapping(em_tree, em, 0); + write_unlock(&em_tree->lock); + free_extent_map(em); + if (ret < 0) { + test_err("cannot add extent map [%llu, %llu)", start, start + len); + return ret; + } + + return 0; +} + +struct extent_range { + u64 start; + u64 len; +}; + +/* The valid states of the tree after every drop, as described below. */ +struct extent_range valid_ranges[][7] = { + { + { .start = 0, .len = SZ_8K }, /* [0, 8K) */ + { .start = SZ_4K * 3, .len = SZ_4K * 3}, /* [12k, 24k) */ + { .start = SZ_4K * 6, .len = SZ_4K * 3}, /* [24k, 36k) */ + { .start = SZ_32K + SZ_4K, .len = SZ_4K}, /* [36k, 40k) */ + { .start = SZ_4K * 10, .len = SZ_4K * 6}, /* [40k, 64k) */ + }, + { + { .start = 0, .len = SZ_8K }, /* [0, 8K) */ + { .start = SZ_4K * 5, .len = SZ_4K}, /* [20k, 24k) */ + { .start = SZ_4K * 6, .len = SZ_4K * 3}, /* [24k, 36k) */ + { .start = SZ_32K + SZ_4K, .len = SZ_4K}, /* [36k, 40k) */ + { .start = SZ_4K * 10, .len = SZ_4K * 6}, /* [40k, 64k) */ + }, + { + { .start = 0, .len = SZ_8K }, /* [0, 8K) */ + { .start = SZ_4K * 5, .len = SZ_4K}, /* [20k, 24k) */ + { .start = SZ_4K * 6, .len = SZ_4K}, /* [24k, 28k) */ + { .start = SZ_32K, .len = SZ_4K}, /* [32k, 36k) */ + { .start = SZ_32K + SZ_4K, .len = SZ_4K}, /* [36k, 40k) */ + { .start = SZ_4K * 10, .len = SZ_4K * 6}, /* [40k, 64k) */ + }, + { + { .start = 0, .len = SZ_8K}, /* [0, 8K) */ + { .start = SZ_4K * 5, .len = SZ_4K}, /* [20k, 24k) */ + { .start = SZ_4K * 6, .len = SZ_4K}, /* [24k, 28k) */ + } +}; + +static int validate_range(struct extent_map_tree *em_tree, int index) +{ + struct rb_node *n; + int i; + + for (i = 0, n = rb_first_cached(&em_tree->map); + valid_ranges[index][i].len && n; + i++, n = rb_next(n)) { + struct extent_map *entry = rb_entry(n, struct extent_map, rb_node); + + if (entry->start != valid_ranges[index][i].start) { + test_err("mapping has start %llu expected %llu", + entry->start, valid_ranges[index][i].start); + return -EINVAL; + } + + if (entry->len != valid_ranges[index][i].len) { + test_err("mapping has len %llu expected %llu", + entry->len, valid_ranges[index][i].len); + return -EINVAL; + } + } + + /* + * We exited because we don't have any more entries in the extent_map + * but we still expect more valid entries. + */ + if (valid_ranges[index][i].len) { + test_err("missing an entry"); + return -EINVAL; + } + + /* We exited the loop but still have entries in the extent map. */ + if (n) { + test_err("we have a left over entry in the extent map we didn't expect"); + return -EINVAL; + } + + return 0; +} + +/* + * Test scenario: + * + * Test the various edge cases of btrfs_drop_extent_map_range, create the + * following ranges + * + * [0, 12k)[12k, 24k)[24k, 36k)[36k, 40k)[40k,64k) + * + * And then we'll drop: + * + * [8k, 12k) - test the single front split + * [12k, 20k) - test the single back split + * [28k, 32k) - test the double split + * [32k, 64k) - test whole em dropping + * + * They'll have the EXTENT_FLAG_COMPRESSED flag set to keep the em tree from + * merging the em's. + */ +static int test_case_5(void) +{ + struct extent_map_tree *em_tree; + struct inode *inode; + u64 start, end; + int ret; + + test_msg("Running btrfs_drop_extent_map_range tests"); + + inode = btrfs_new_test_inode(); + if (!inode) { + test_std_err(TEST_ALLOC_INODE); + return -ENOMEM; + } + + em_tree = &BTRFS_I(inode)->extent_tree; + + /* [0, 12k) */ + ret = add_compressed_extent(em_tree, 0, SZ_4K * 3, 0); + if (ret) { + test_err("cannot add extent range [0, 12K)"); + goto out; + } + + /* [12k, 24k) */ + ret = add_compressed_extent(em_tree, SZ_4K * 3, SZ_4K * 3, SZ_4K); + if (ret) { + test_err("cannot add extent range [12k, 24k)"); + goto out; + } + + /* [24k, 36k) */ + ret = add_compressed_extent(em_tree, SZ_4K * 6, SZ_4K * 3, SZ_8K); + if (ret) { + test_err("cannot add extent range [12k, 24k)"); + goto out; + } + + /* [36k, 40k) */ + ret = add_compressed_extent(em_tree, SZ_32K + SZ_4K, SZ_4K, SZ_4K * 3); + if (ret) { + test_err("cannot add extent range [12k, 24k)"); + goto out; + } + + /* [40k, 64k) */ + ret = add_compressed_extent(em_tree, SZ_4K * 10, SZ_4K * 6, SZ_16K); + if (ret) { + test_err("cannot add extent range [12k, 24k)"); + goto out; + } + + /* Drop [8k, 12k) */ + start = SZ_8K; + end = (3 * SZ_4K) - 1; + btrfs_drop_extent_map_range(BTRFS_I(inode), start, end, false); + ret = validate_range(&BTRFS_I(inode)->extent_tree, 0); + if (ret) + goto out; + + /* Drop [12k, 20k) */ + start = SZ_4K * 3; + end = SZ_16K + SZ_4K - 1; + btrfs_drop_extent_map_range(BTRFS_I(inode), start, end, false); + ret = validate_range(&BTRFS_I(inode)->extent_tree, 1); + if (ret) + goto out; + + /* Drop [28k, 32k) */ + start = SZ_32K - SZ_4K; + end = SZ_32K - 1; + btrfs_drop_extent_map_range(BTRFS_I(inode), start, end, false); + ret = validate_range(&BTRFS_I(inode)->extent_tree, 2); + if (ret) + goto out; + + /* Drop [32k, 64k) */ + start = SZ_32K; + end = SZ_64K - 1; + btrfs_drop_extent_map_range(BTRFS_I(inode), start, end, false); + ret = validate_range(&BTRFS_I(inode)->extent_tree, 3); + if (ret) + goto out; +out: + iput(inode); + return ret; +} + +/* + * Test the btrfs_add_extent_mapping helper which will attempt to create an em + * for areas between two existing ems. Validate it doesn't do this when there + * are two unmerged em's side by side. + */ +static int test_case_6(struct btrfs_fs_info *fs_info, struct extent_map_tree *em_tree) +{ + struct extent_map *em = NULL; + int ret; + + ret = add_compressed_extent(em_tree, 0, SZ_4K, 0); + if (ret) + goto out; + + ret = add_compressed_extent(em_tree, SZ_4K, SZ_4K, 0); + if (ret) + goto out; + + em = alloc_extent_map(); + if (!em) { + test_std_err(TEST_ALLOC_EXTENT_MAP); + return -ENOMEM; + } + + em->start = SZ_4K; + em->len = SZ_4K; + em->block_start = SZ_16K; + em->block_len = SZ_16K; + write_lock(&em_tree->lock); + ret = btrfs_add_extent_mapping(fs_info, em_tree, &em, 0, SZ_8K); + write_unlock(&em_tree->lock); + + if (ret != 0) { + test_err("got an error when adding our em: %d", ret); + goto out; + } + + ret = -EINVAL; + if (em->start != 0) { + test_err("unexpected em->start at %llu, wanted 0", em->start); + goto out; + } + if (em->len != SZ_4K) { + test_err("unexpected em->len %llu, expected 4K", em->len); + goto out; + } + ret = 0; +out: + free_extent_map(em); + free_extent_map_tree(em_tree); + return ret; +} + +/* + * Regression test for btrfs_drop_extent_map_range. Calling with skip_pinned == + * true would mess up the start/end calculations and subsequent splits would be + * incorrect. + */ +static int test_case_7(void) +{ + struct extent_map_tree *em_tree; + struct extent_map *em; + struct inode *inode; + int ret; + + test_msg("Running btrfs_drop_extent_cache with pinned"); + + inode = btrfs_new_test_inode(); + if (!inode) { + test_std_err(TEST_ALLOC_INODE); + return -ENOMEM; + } + + em_tree = &BTRFS_I(inode)->extent_tree; + + em = alloc_extent_map(); + if (!em) { + test_std_err(TEST_ALLOC_EXTENT_MAP); + ret = -ENOMEM; + goto out; + } + + /* [0, 16K), pinned */ + em->start = 0; + em->len = SZ_16K; + em->block_start = 0; + em->block_len = SZ_4K; + set_bit(EXTENT_FLAG_PINNED, &em->flags); + write_lock(&em_tree->lock); + ret = add_extent_mapping(em_tree, em, 0); + write_unlock(&em_tree->lock); + if (ret < 0) { + test_err("couldn't add extent map"); + goto out; + } + free_extent_map(em); + + em = alloc_extent_map(); + if (!em) { + test_std_err(TEST_ALLOC_EXTENT_MAP); + ret = -ENOMEM; + goto out; + } + + /* [32K, 48K), not pinned */ + em->start = SZ_32K; + em->len = SZ_16K; + em->block_start = SZ_32K; + em->block_len = SZ_16K; + write_lock(&em_tree->lock); + ret = add_extent_mapping(em_tree, em, 0); + write_unlock(&em_tree->lock); + if (ret < 0) { + test_err("couldn't add extent map"); + goto out; + } + free_extent_map(em); + + /* + * Drop [0, 36K) This should skip the [0, 4K) extent and then split the + * [32K, 48K) extent. + */ + btrfs_drop_extent_map_range(BTRFS_I(inode), 0, (36 * SZ_1K) - 1, true); + + /* Make sure our extent maps look sane. */ + ret = -EINVAL; + + em = lookup_extent_mapping(em_tree, 0, SZ_16K); + if (!em) { + test_err("didn't find an em at 0 as expected"); + goto out; + } + + if (em->start != 0) { + test_err("em->start is %llu, expected 0", em->start); + goto out; + } + + if (em->len != SZ_16K) { + test_err("em->len is %llu, expected 16K", em->len); + goto out; + } + + free_extent_map(em); + + read_lock(&em_tree->lock); + em = lookup_extent_mapping(em_tree, SZ_16K, SZ_16K); + read_unlock(&em_tree->lock); + if (em) { + test_err("found an em when we weren't expecting one"); + goto out; + } + + read_lock(&em_tree->lock); + em = lookup_extent_mapping(em_tree, SZ_32K, SZ_16K); + read_unlock(&em_tree->lock); + if (!em) { + test_err("didn't find an em at 32K as expected"); + goto out; + } + + if (em->start != (36 * SZ_1K)) { + test_err("em->start is %llu, expected 36K", em->start); + goto out; + } + + if (em->len != (12 * SZ_1K)) { + test_err("em->len is %llu, expected 12K", em->len); + goto out; + } + + free_extent_map(em); + + read_lock(&em_tree->lock); + em = lookup_extent_mapping(em_tree, 48 * SZ_1K, (u64)-1); + read_unlock(&em_tree->lock); + if (em) { + test_err("found an unexpected em above 48K"); + goto out; + } + + ret = 0; +out: + free_extent_map(em); + iput(inode); + return ret; +} + struct rmap_test_vector { u64 raid_type; u64 physical_start; @@ -619,6 +1020,17 @@ int btrfs_test_extent_map(void) if (ret) goto out; ret = test_case_4(fs_info, em_tree); + if (ret) + goto out; + ret = test_case_5(); + if (ret) + goto out; + ret = test_case_6(fs_info, em_tree); + if (ret) + goto out; + ret = test_case_7(); + if (ret) + goto out; test_msg("running rmap tests"); for (i = 0; i < ARRAY_SIZE(rmap_tests); i++) { diff --git a/fs/btrfs/tests/inode-tests.c b/fs/btrfs/tests/inode-tests.c index 05b03f5eab83..492d69d2fa73 100644 --- a/fs/btrfs/tests/inode-tests.c +++ b/fs/btrfs/tests/inode-tests.c @@ -34,7 +34,11 @@ static void insert_extent(struct btrfs_root *root, u64 start, u64 len, key.type = BTRFS_EXTENT_DATA_KEY; key.offset = start; - btrfs_setup_item_for_insert(root, &path, &key, value_len); + /* + * Passing a NULL trans handle is fine here, we have a dummy root eb + * and the tree is a single node (level 0). + */ + btrfs_setup_item_for_insert(NULL, root, &path, &key, value_len); fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item); btrfs_set_file_extent_generation(leaf, fi, 1); btrfs_set_file_extent_type(leaf, fi, type); @@ -64,7 +68,11 @@ static void insert_inode_item_key(struct btrfs_root *root) key.type = BTRFS_INODE_ITEM_KEY; key.offset = 0; - btrfs_setup_item_for_insert(root, &path, &key, value_len); + /* + * Passing a NULL trans handle is fine here, we have a dummy root eb + * and the tree is a single node (level 0). + */ + btrfs_setup_item_for_insert(NULL, root, &path, &key, value_len); } /* diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 91b6c2fdc420..5b3333ceef04 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -37,8 +37,6 @@ static struct kmem_cache *btrfs_trans_handle_cachep; -#define BTRFS_ROOT_TRANS_TAG 0 - /* * Transaction states and transitions * @@ -56,12 +54,17 @@ static struct kmem_cache *btrfs_trans_handle_cachep; * | Call btrfs_commit_transaction() on any trans handle attached to * | transaction N * V - * Transaction N [[TRANS_STATE_COMMIT_START]] + * Transaction N [[TRANS_STATE_COMMIT_PREP]] + * | + * | If there are simultaneous calls to btrfs_commit_transaction() one will win + * | the race and the rest will wait for the winner to commit the transaction. * | - * | Will wait for previous running transaction to completely finish if there - * | is one + * | The winner will wait for previous running transaction to completely finish + * | if there is one. * | - * | Then one of the following happes: + * Transaction N [[TRANS_STATE_COMMIT_START]] + * | + * | Then one of the following happens: * | - Wait for all other trans handle holders to release. * | The btrfs_commit_transaction() caller will do the commit work. * | - Wait for current transaction to be committed by others. @@ -112,6 +115,7 @@ static struct kmem_cache *btrfs_trans_handle_cachep; */ static const unsigned int btrfs_blocked_trans_types[TRANS_STATE_MAX] = { [TRANS_STATE_RUNNING] = 0U, + [TRANS_STATE_COMMIT_PREP] = 0U, [TRANS_STATE_COMMIT_START] = (__TRANS_START | __TRANS_ATTACH), [TRANS_STATE_COMMIT_DOING] = (__TRANS_START | __TRANS_ATTACH | @@ -292,10 +296,11 @@ loop: spin_unlock(&fs_info->trans_lock); /* - * If we are ATTACH, we just want to catch the current transaction, - * and commit it. If there is no transaction, just return ENOENT. + * If we are ATTACH or TRANS_JOIN_NOSTART, we just want to catch the + * current transaction, and commit it. If there is no transaction, just + * return ENOENT. */ - if (type == TRANS_ATTACH) + if (type == TRANS_ATTACH || type == TRANS_JOIN_NOSTART) return -ENOENT; /* @@ -379,7 +384,7 @@ loop: IO_TREE_TRANS_DIRTY_PAGES); extent_io_tree_init(fs_info, &cur_trans->pinned_extents, IO_TREE_FS_PINNED_EXTENTS); - fs_info->generation++; + btrfs_set_fs_generation(fs_info, fs_info->generation + 1); cur_trans->transid = fs_info->generation; fs_info->running_transaction = cur_trans; cur_trans->aborted = 0; @@ -554,6 +559,69 @@ static inline bool need_reserve_reloc_root(struct btrfs_root *root) return true; } +static int btrfs_reserve_trans_metadata(struct btrfs_fs_info *fs_info, + enum btrfs_reserve_flush_enum flush, + u64 num_bytes, + u64 *delayed_refs_bytes) +{ + struct btrfs_block_rsv *delayed_refs_rsv = &fs_info->delayed_refs_rsv; + struct btrfs_space_info *si = fs_info->trans_block_rsv.space_info; + u64 extra_delayed_refs_bytes = 0; + u64 bytes; + int ret; + + /* + * If there's a gap between the size of the delayed refs reserve and + * its reserved space, than some tasks have added delayed refs or bumped + * its size otherwise (due to block group creation or removal, or block + * group item update). Also try to allocate that gap in order to prevent + * using (and possibly abusing) the global reserve when committing the + * transaction. + */ + if (flush == BTRFS_RESERVE_FLUSH_ALL && + !btrfs_block_rsv_full(delayed_refs_rsv)) { + spin_lock(&delayed_refs_rsv->lock); + if (delayed_refs_rsv->size > delayed_refs_rsv->reserved) + extra_delayed_refs_bytes = delayed_refs_rsv->size - + delayed_refs_rsv->reserved; + spin_unlock(&delayed_refs_rsv->lock); + } + + bytes = num_bytes + *delayed_refs_bytes + extra_delayed_refs_bytes; + + /* + * We want to reserve all the bytes we may need all at once, so we only + * do 1 enospc flushing cycle per transaction start. + */ + ret = btrfs_reserve_metadata_bytes(fs_info, si, bytes, flush); + if (ret == 0) { + if (extra_delayed_refs_bytes > 0) + btrfs_migrate_to_delayed_refs_rsv(fs_info, + extra_delayed_refs_bytes); + return 0; + } + + if (extra_delayed_refs_bytes > 0) { + bytes -= extra_delayed_refs_bytes; + ret = btrfs_reserve_metadata_bytes(fs_info, si, bytes, flush); + if (ret == 0) + return 0; + } + + /* + * If we are an emergency flush, which can steal from the global block + * reserve, then attempt to not reserve space for the delayed refs, as + * we will consume space for them from the global block reserve. + */ + if (flush == BTRFS_RESERVE_FLUSH_ALL_STEAL) { + bytes -= *delayed_refs_bytes; + *delayed_refs_bytes = 0; + ret = btrfs_reserve_metadata_bytes(fs_info, si, bytes, flush); + } + + return ret; +} + static struct btrfs_trans_handle * start_transaction(struct btrfs_root *root, unsigned int num_items, unsigned int type, enum btrfs_reserve_flush_enum flush, @@ -561,10 +629,12 @@ start_transaction(struct btrfs_root *root, unsigned int num_items, { struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_block_rsv *delayed_refs_rsv = &fs_info->delayed_refs_rsv; + struct btrfs_block_rsv *trans_rsv = &fs_info->trans_block_rsv; struct btrfs_trans_handle *h; struct btrfs_transaction *cur_trans; u64 num_bytes = 0; u64 qgroup_reserved = 0; + u64 delayed_refs_bytes = 0; bool reloc_reserved = false; bool do_chunk_alloc = false; int ret; @@ -587,29 +657,27 @@ start_transaction(struct btrfs_root *root, unsigned int num_items, * the appropriate flushing if need be. */ if (num_items && root != fs_info->chunk_root) { - struct btrfs_block_rsv *rsv = &fs_info->trans_block_rsv; - u64 delayed_refs_bytes = 0; - qgroup_reserved = num_items * fs_info->nodesize; - ret = btrfs_qgroup_reserve_meta_pertrans(root, qgroup_reserved, - enforce_qgroups); + /* + * Use prealloc for now, as there might be a currently running + * transaction that could free this reserved space prematurely + * by committing. + */ + ret = btrfs_qgroup_reserve_meta_prealloc(root, qgroup_reserved, + enforce_qgroups, false); if (ret) return ERR_PTR(ret); + num_bytes = btrfs_calc_insert_metadata_size(fs_info, num_items); /* - * We want to reserve all the bytes we may need all at once, so - * we only do 1 enospc flushing cycle per transaction start. We - * accomplish this by simply assuming we'll do num_items worth - * of delayed refs updates in this trans handle, and refill that - * amount for whatever is missing in the reserve. + * If we plan to insert/update/delete "num_items" from a btree, + * we will also generate delayed refs for extent buffers in the + * respective btree paths, so reserve space for the delayed refs + * that will be generated by the caller as it modifies btrees. + * Try to reserve them to avoid excessive use of the global + * block reserve. */ - num_bytes = btrfs_calc_insert_metadata_size(fs_info, num_items); - if (flush == BTRFS_RESERVE_FLUSH_ALL && - !btrfs_block_rsv_full(delayed_refs_rsv)) { - delayed_refs_bytes = btrfs_calc_delayed_ref_bytes(fs_info, - num_items); - num_bytes += delayed_refs_bytes; - } + delayed_refs_bytes = btrfs_calc_delayed_ref_bytes(fs_info, num_items); /* * Do the reservation for the relocation root creation @@ -619,16 +687,14 @@ start_transaction(struct btrfs_root *root, unsigned int num_items, reloc_reserved = true; } - ret = btrfs_block_rsv_add(fs_info, rsv, num_bytes, flush); + ret = btrfs_reserve_trans_metadata(fs_info, flush, num_bytes, + &delayed_refs_bytes); if (ret) goto reserve_fail; - if (delayed_refs_bytes) { - btrfs_migrate_to_delayed_refs_rsv(fs_info, rsv, - delayed_refs_bytes); - num_bytes -= delayed_refs_bytes; - } - if (rsv->space_info->force_alloc) + btrfs_block_rsv_add_bytes(trans_rsv, num_bytes, true); + + if (trans_rsv->space_info->force_alloc) do_chunk_alloc = true; } else if (num_items == 0 && flush == BTRFS_RESERVE_FLUSH_ALL && !btrfs_block_rsv_full(delayed_refs_rsv)) { @@ -688,6 +754,7 @@ again: h->type = type; INIT_LIST_HEAD(&h->new_bgs); + btrfs_init_metadata_block_rsv(fs_info, &h->delayed_rsv, BTRFS_BLOCK_RSV_DELOPS); smp_mb(); if (cur_trans->state >= TRANS_STATE_COMMIT_START && @@ -700,11 +767,28 @@ again: if (num_bytes) { trace_btrfs_space_reservation(fs_info, "transaction", h->transid, num_bytes, 1); - h->block_rsv = &fs_info->trans_block_rsv; + h->block_rsv = trans_rsv; h->bytes_reserved = num_bytes; + if (delayed_refs_bytes > 0) { + trace_btrfs_space_reservation(fs_info, + "local_delayed_refs_rsv", + h->transid, + delayed_refs_bytes, 1); + h->delayed_refs_bytes_reserved = delayed_refs_bytes; + btrfs_block_rsv_add_bytes(&h->delayed_rsv, delayed_refs_bytes, true); + delayed_refs_bytes = 0; + } h->reloc_reserved = reloc_reserved; } + /* + * Now that we have found a transaction to be a part of, convert the + * qgroup reservation from prealloc to pertrans. A different transaction + * can't race in and free our pertrans out from under us. + */ + if (qgroup_reserved) + btrfs_qgroup_convert_reserved_meta(root, qgroup_reserved); + got_it: if (!current->journal_info) current->journal_info = h; @@ -749,10 +833,12 @@ join_fail: kmem_cache_free(btrfs_trans_handle_cachep, h); alloc_fail: if (num_bytes) - btrfs_block_rsv_release(fs_info, &fs_info->trans_block_rsv, - num_bytes, NULL); + btrfs_block_rsv_release(fs_info, trans_rsv, num_bytes, NULL); + if (delayed_refs_bytes) + btrfs_space_info_free_bytes_may_use(fs_info, trans_rsv->space_info, + delayed_refs_bytes); reserve_fail: - btrfs_qgroup_free_meta_pertrans(root, qgroup_reserved); + btrfs_qgroup_free_meta_prealloc(root, qgroup_reserved); return ERR_PTR(ret); } @@ -785,7 +871,10 @@ struct btrfs_trans_handle *btrfs_join_transaction_spacecache(struct btrfs_root * /* * Similar to regular join but it never starts a transaction when none is - * running or after waiting for the current one to finish. + * running or when there's a running one at a state >= TRANS_STATE_UNBLOCKED. + * This is similar to btrfs_attach_transaction() but it allows the join to + * happen if the transaction commit already started but it's not yet in the + * "doing" phase (the state is < TRANS_STATE_COMMIT_DOING). */ struct btrfs_trans_handle *btrfs_join_transaction_nostart(struct btrfs_root *root) { @@ -794,7 +883,7 @@ struct btrfs_trans_handle *btrfs_join_transaction_nostart(struct btrfs_root *roo } /* - * btrfs_attach_transaction() - catch the running transaction + * Catch the running transaction. * * It is used when we want to commit the current the transaction, but * don't want to start a new one. @@ -813,7 +902,7 @@ struct btrfs_trans_handle *btrfs_attach_transaction(struct btrfs_root *root) } /* - * btrfs_attach_transaction_barrier() - catch the running transaction + * Catch the running transaction. * * It is similar to the above function, the difference is this one * will wait for all the inactive transactions until they fully @@ -889,7 +978,7 @@ int btrfs_wait_for_commit(struct btrfs_fs_info *fs_info, u64 transid) int ret = 0; if (transid) { - if (transid <= fs_info->last_trans_committed) + if (transid <= btrfs_get_last_trans_committed(fs_info)) goto out; /* find specified transaction */ @@ -913,7 +1002,7 @@ int btrfs_wait_for_commit(struct btrfs_fs_info *fs_info, u64 transid) * raced with btrfs_commit_transaction */ if (!cur_trans) { - if (transid > fs_info->last_trans_committed) + if (transid > btrfs_get_last_trans_committed(fs_info)) ret = -EINVAL; goto out; } @@ -968,11 +1057,14 @@ static void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans) if (!trans->block_rsv) { ASSERT(!trans->bytes_reserved); + ASSERT(!trans->delayed_refs_bytes_reserved); return; } - if (!trans->bytes_reserved) + if (!trans->bytes_reserved) { + ASSERT(!trans->delayed_refs_bytes_reserved); return; + } ASSERT(trans->block_rsv == &fs_info->trans_block_rsv); trace_btrfs_space_reservation(fs_info, "transaction", @@ -980,6 +1072,16 @@ static void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans) btrfs_block_rsv_release(fs_info, trans->block_rsv, trans->bytes_reserved, NULL); trans->bytes_reserved = 0; + + if (!trans->delayed_refs_bytes_reserved) + return; + + trace_btrfs_space_reservation(fs_info, "local_delayed_refs_rsv", + trans->transid, + trans->delayed_refs_bytes_reserved, 0); + btrfs_block_rsv_release(fs_info, &trans->delayed_rsv, + trans->delayed_refs_bytes_reserved, NULL); + trans->delayed_refs_bytes_reserved = 0; } static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, @@ -1060,8 +1162,8 @@ int btrfs_write_marked_extents(struct btrfs_fs_info *fs_info, u64 start = 0; u64 end; - while (!find_first_extent_bit(dirty_pages, start, &start, &end, - mark, &cached_state)) { + while (find_first_extent_bit(dirty_pages, start, &start, &end, + mark, &cached_state)) { bool wait_writeback = false; err = convert_extent_bit(dirty_pages, start, end, @@ -1114,8 +1216,8 @@ static int __btrfs_wait_marked_extents(struct btrfs_fs_info *fs_info, u64 start = 0; u64 end; - while (!find_first_extent_bit(dirty_pages, start, &start, &end, - EXTENT_NEED_WAIT, &cached_state)) { + while (find_first_extent_bit(dirty_pages, start, &start, &end, + EXTENT_NEED_WAIT, &cached_state)) { /* * Ignore -ENOMEM errors returned by clear_extent_bit(). * When committing the transaction, we'll remove any entries @@ -1311,7 +1413,7 @@ again: } /* Now flush any delayed refs generated by updating all of the roots */ - ret = btrfs_run_delayed_refs(trans, (unsigned long)-1); + ret = btrfs_run_delayed_refs(trans, U64_MAX); if (ret) return ret; @@ -1326,7 +1428,7 @@ again: * so we want to keep this flushing in this loop to make sure * everything gets run. */ - ret = btrfs_run_delayed_refs(trans, (unsigned long)-1); + ret = btrfs_run_delayed_refs(trans, U64_MAX); if (ret) return ret; } @@ -1461,45 +1563,6 @@ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans) } /* - * defrag a given btree. - * Every leaf in the btree is read and defragged. - */ -int btrfs_defrag_root(struct btrfs_root *root) -{ - struct btrfs_fs_info *info = root->fs_info; - struct btrfs_trans_handle *trans; - int ret; - - if (test_and_set_bit(BTRFS_ROOT_DEFRAG_RUNNING, &root->state)) - return 0; - - while (1) { - trans = btrfs_start_transaction(root, 0); - if (IS_ERR(trans)) { - ret = PTR_ERR(trans); - break; - } - - ret = btrfs_defrag_leaves(trans, root); - - btrfs_end_transaction(trans); - btrfs_btree_balance_dirty(info); - cond_resched(); - - if (btrfs_fs_closing(info) || ret != -EAGAIN) - break; - - if (btrfs_defrag_cancelled(info)) { - btrfs_debug(info, "defrag_root cancelled"); - ret = -EAGAIN; - break; - } - } - clear_bit(BTRFS_ROOT_DEFRAG_RUNNING, &root->state); - return ret; -} - -/* * Do all special snapshot related qgroup dirty hack. * * Will do all needed qgroup inherit and dirty hack like switch commit @@ -1516,11 +1579,10 @@ static int qgroup_account_snapshot(struct btrfs_trans_handle *trans, int ret; /* - * Save some performance in the case that qgroups are not - * enabled. If this check races with the ioctl, rescan will - * kick in anyway. + * Save some performance in the case that qgroups are not enabled. If + * this check races with the ioctl, rescan will kick in anyway. */ - if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) + if (!btrfs_qgroup_full_accounting(fs_info)) return 0; /* @@ -1544,7 +1606,7 @@ static int qgroup_account_snapshot(struct btrfs_trans_handle *trans, * for now flush the delayed refs to narrow the race window where the * qgroup counters could end up wrong. */ - ret = btrfs_run_delayed_refs(trans, (unsigned long)-1); + ret = btrfs_run_delayed_refs(trans, U64_MAX); if (ret) { btrfs_abort_transaction(trans, ret); return ret; @@ -1559,7 +1621,7 @@ static int qgroup_account_snapshot(struct btrfs_trans_handle *trans, /* Now qgroup are all updated, we can inherit it to new qgroups */ ret = btrfs_qgroup_inherit(trans, src->root_key.objectid, dst_objectid, - inherit); + parent->root_key.objectid, inherit); if (ret < 0) goto out; @@ -1709,6 +1771,12 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, } btrfs_release_path(path); + ret = btrfs_create_qgroup(trans, objectid); + if (ret && ret != -EEXIST) { + btrfs_abort_transaction(trans, ret); + goto fail; + } + /* * pull in the delayed directory update * and the delayed inode item @@ -1820,8 +1888,12 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, * To co-operate with that hack, we do hack again. * Or snapshot will be greatly slowed down by a subtree qgroup rescan */ - ret = qgroup_account_snapshot(trans, root, parent_root, - pending->inherit, objectid); + if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_FULL) + ret = qgroup_account_snapshot(trans, root, parent_root, + pending->inherit, objectid); + else if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_SIMPLE) + ret = btrfs_qgroup_inherit(trans, root->root_key.objectid, objectid, + parent_root->root_key.objectid, pending->inherit); if (ret < 0) goto fail; @@ -1837,9 +1909,9 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, btrfs_i_size_write(BTRFS_I(parent_inode), parent_inode->i_size + fname.disk_name.len * 2); - parent_inode->i_mtime = current_time(parent_inode); - parent_inode->i_ctime = parent_inode->i_mtime; - ret = btrfs_update_inode_fallback(trans, parent_root, BTRFS_I(parent_inode)); + inode_set_mtime_to_ts(parent_inode, + inode_set_ctime_current(parent_inode)); + ret = btrfs_update_inode_fallback(trans, BTRFS_I(parent_inode)); if (ret) { btrfs_abort_transaction(trans, ret); goto fail; @@ -1966,7 +2038,7 @@ void btrfs_commit_transaction_async(struct btrfs_trans_handle *trans) * Wait for the current transaction commit to start and block * subsequent transaction joins */ - btrfs_might_wait_for_state(fs_info, BTRFS_LOCKDEP_TRANS_COMMIT_START); + btrfs_might_wait_for_state(fs_info, BTRFS_LOCKDEP_TRANS_COMMIT_PREP); wait_event(fs_info->transaction_blocked_wait, cur_trans->state >= TRANS_STATE_COMMIT_START || TRANS_ABORTED(cur_trans)); @@ -2062,7 +2134,7 @@ static void btrfs_cleanup_pending_block_groups(struct btrfs_trans_handle *trans) struct btrfs_block_group *block_group, *tmp; list_for_each_entry_safe(block_group, tmp, &trans->new_bgs, bg_list) { - btrfs_delayed_refs_rsv_release(fs_info, 1); + btrfs_dec_delayed_refs_rsv_bg_inserts(fs_info); list_del_init(&block_group->bg_list); } } @@ -2113,7 +2185,7 @@ static void add_pending_snapshot(struct btrfs_trans_handle *trans) return; lockdep_assert_held(&trans->fs_info->trans_lock); - ASSERT(cur_trans->state >= TRANS_STATE_COMMIT_START); + ASSERT(cur_trans->state >= TRANS_STATE_COMMIT_PREP); list_add(&trans->pending_snapshot->list, &cur_trans->pending_snapshots); } @@ -2137,7 +2209,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) ktime_t interval; ASSERT(refcount_read(&trans->use_count) == 1); - btrfs_trans_state_lockdep_acquire(fs_info, BTRFS_LOCKDEP_TRANS_COMMIT_START); + btrfs_trans_state_lockdep_acquire(fs_info, BTRFS_LOCKDEP_TRANS_COMMIT_PREP); clear_bit(BTRFS_FS_NEED_TRANS_COMMIT, &fs_info->flags); @@ -2197,7 +2269,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) } spin_lock(&fs_info->trans_lock); - if (cur_trans->state >= TRANS_STATE_COMMIT_START) { + if (cur_trans->state >= TRANS_STATE_COMMIT_PREP) { enum btrfs_trans_state want_state = TRANS_STATE_COMPLETED; add_pending_snapshot(trans); @@ -2209,7 +2281,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) want_state = TRANS_STATE_SUPER_COMMITTED; btrfs_trans_state_lockdep_release(fs_info, - BTRFS_LOCKDEP_TRANS_COMMIT_START); + BTRFS_LOCKDEP_TRANS_COMMIT_PREP); ret = btrfs_end_transaction(trans); wait_for_commit(cur_trans, want_state); @@ -2221,9 +2293,9 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) return ret; } - cur_trans->state = TRANS_STATE_COMMIT_START; + cur_trans->state = TRANS_STATE_COMMIT_PREP; wake_up(&fs_info->transaction_blocked_wait); - btrfs_trans_state_lockdep_release(fs_info, BTRFS_LOCKDEP_TRANS_COMMIT_START); + btrfs_trans_state_lockdep_release(fs_info, BTRFS_LOCKDEP_TRANS_COMMIT_PREP); if (cur_trans->list.prev != &fs_info->trans_list) { enum btrfs_trans_state want_state = TRANS_STATE_COMPLETED; @@ -2244,11 +2316,9 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) btrfs_put_transaction(prev_trans); if (ret) goto lockdep_release; - } else { - spin_unlock(&fs_info->trans_lock); + spin_lock(&fs_info->trans_lock); } } else { - spin_unlock(&fs_info->trans_lock); /* * The previous transaction was aborted and was already removed * from the list of transactions at fs_info->trans_list. So we @@ -2256,11 +2326,16 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) * corrupt state (pointing to trees with unwritten nodes/leafs). */ if (BTRFS_FS_ERROR(fs_info)) { + spin_unlock(&fs_info->trans_lock); ret = -EROFS; goto lockdep_release; } } + cur_trans->state = TRANS_STATE_COMMIT_START; + wake_up(&fs_info->transaction_blocked_wait); + spin_unlock(&fs_info->trans_lock); + /* * Get the time spent on the work done by the commit thread and not * the time spent waiting on a previous commit @@ -2378,7 +2453,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) if (ret) goto unlock_reloc; - ret = btrfs_run_delayed_refs(trans, (unsigned long)-1); + ret = btrfs_run_delayed_refs(trans, U64_MAX); if (ret) goto unlock_reloc; @@ -2511,7 +2586,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) if (test_bit(BTRFS_TRANS_HAVE_FREE_BGS, &cur_trans->flags)) btrfs_clear_space_info_full(fs_info); - fs_info->last_trans_committed = cur_trans->transid; + btrfs_set_last_trans_committed(fs_info, cur_trans->transid); /* * We needn't acquire the lock here because there is no other task * which can change it. @@ -2570,7 +2645,7 @@ lockdep_release: goto cleanup_transaction; lockdep_trans_commit_start_release: - btrfs_trans_state_lockdep_release(fs_info, BTRFS_LOCKDEP_TRANS_COMMIT_START); + btrfs_trans_state_lockdep_release(fs_info, BTRFS_LOCKDEP_TRANS_COMMIT_PREP); btrfs_end_transaction(trans); return ret; } @@ -2629,18 +2704,18 @@ int btrfs_clean_one_deleted_snapshot(struct btrfs_fs_info *fs_info) */ void __cold __btrfs_abort_transaction(struct btrfs_trans_handle *trans, const char *function, - unsigned int line, int errno, bool first_hit) + unsigned int line, int error, bool first_hit) { struct btrfs_fs_info *fs_info = trans->fs_info; - WRITE_ONCE(trans->aborted, errno); - WRITE_ONCE(trans->transaction->aborted, errno); - if (first_hit && errno == -ENOSPC) + WRITE_ONCE(trans->aborted, error); + WRITE_ONCE(trans->transaction->aborted, error); + if (first_hit && error == -ENOSPC) btrfs_dump_space_info_for_trans_abort(fs_info); /* Wake up anybody who may be waiting on this transaction */ wake_up(&fs_info->transaction_wait); wake_up(&fs_info->transaction_blocked_wait); - __btrfs_handle_fs_error(fs_info, function, line, errno, NULL); + __btrfs_handle_fs_error(fs_info, function, line, error, NULL); } int __init btrfs_transaction_init(void) diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index 8e9fa23bd7fe..2bf8bbdfd0b3 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h @@ -12,8 +12,12 @@ #include "ctree.h" #include "misc.h" +/* Radix-tree tag for roots that are part of the trasaction. */ +#define BTRFS_ROOT_TRANS_TAG 0 + enum btrfs_trans_state { TRANS_STATE_RUNNING, + TRANS_STATE_COMMIT_PREP, TRANS_STATE_COMMIT_START, TRANS_STATE_COMMIT_DOING, TRANS_STATE_UNBLOCKED, @@ -117,8 +121,10 @@ enum { struct btrfs_trans_handle { u64 transid; u64 bytes_reserved; + u64 delayed_refs_bytes_reserved; u64 chunk_bytes_reserved; unsigned long delayed_ref_updates; + unsigned long delayed_ref_csum_deletions; struct btrfs_transaction *transaction; struct btrfs_block_rsv *block_rsv; struct btrfs_block_rsv *orig_rsv; @@ -138,6 +144,7 @@ struct btrfs_trans_handle { bool in_fsync; struct btrfs_fs_info *fs_info; struct list_head new_bgs; + struct btrfs_block_rsv delayed_rsv; }; /* @@ -171,7 +178,7 @@ static inline void btrfs_set_inode_last_trans(struct btrfs_trans_handle *trans, { spin_lock(&inode->lock); inode->last_trans = trans->transaction->transid; - inode->last_sub_trans = inode->root->log_transid; + inode->last_sub_trans = btrfs_get_root_log_transid(inode->root); inode->last_log_commit = inode->last_sub_trans - 1; spin_unlock(&inode->lock); } @@ -199,32 +206,32 @@ static inline void btrfs_clear_skip_qgroup(struct btrfs_trans_handle *trans) delayed_refs->qgroup_to_skip = 0; } -bool __cold abort_should_print_stack(int errno); +bool __cold abort_should_print_stack(int error); /* * Call btrfs_abort_transaction as early as possible when an error condition is * detected, that way the exact stack trace is reported for some errors. */ -#define btrfs_abort_transaction(trans, errno) \ +#define btrfs_abort_transaction(trans, error) \ do { \ bool first = false; \ /* Report first abort since mount */ \ if (!test_and_set_bit(BTRFS_FS_STATE_TRANS_ABORTED, \ &((trans)->fs_info->fs_state))) { \ first = true; \ - if (WARN(abort_should_print_stack(errno), \ + if (WARN(abort_should_print_stack(error), \ KERN_ERR \ "BTRFS: Transaction aborted (error %d)\n", \ - (errno))) { \ + (error))) { \ /* Stack trace printed. */ \ } else { \ - btrfs_debug((trans)->fs_info, \ - "Transaction aborted (error %d)", \ - (errno)); \ + btrfs_err((trans)->fs_info, \ + "Transaction aborted (error %d)", \ + (error)); \ } \ } \ __btrfs_abort_transaction((trans), __func__, \ - __LINE__, (errno), first); \ + __LINE__, (error), first); \ } while (0) int btrfs_end_transaction(struct btrfs_trans_handle *trans); @@ -242,7 +249,6 @@ struct btrfs_trans_handle *btrfs_attach_transaction_barrier( int btrfs_wait_for_commit(struct btrfs_fs_info *fs_info, u64 transid); void btrfs_add_dead_root(struct btrfs_root *root); -int btrfs_defrag_root(struct btrfs_root *root); void btrfs_maybe_wake_unfinished_drop(struct btrfs_fs_info *fs_info); int btrfs_clean_one_deleted_snapshot(struct btrfs_fs_info *fs_info); int btrfs_commit_transaction(struct btrfs_trans_handle *trans); @@ -263,7 +269,7 @@ void btrfs_add_dropped_root(struct btrfs_trans_handle *trans, void btrfs_trans_release_chunk_metadata(struct btrfs_trans_handle *trans); void __cold __btrfs_abort_transaction(struct btrfs_trans_handle *trans, const char *function, - unsigned int line, int errno, bool first_hit); + unsigned int line, int error, bool first_hit); int __init btrfs_transaction_init(void); void __cold btrfs_transaction_exit(void); diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c index ab08a0b01311..50fdc69fdddf 100644 --- a/fs/btrfs/tree-checker.c +++ b/fs/btrfs/tree-checker.c @@ -29,6 +29,9 @@ #include "accessors.h" #include "file-item.h" #include "inode-item.h" +#include "dir-item.h" +#include "raid-stripe-tree.h" +#include "extent-tree.h" /* * Error message should follow the following format: @@ -1274,6 +1277,8 @@ static int check_extent_item(struct extent_buffer *leaf, unsigned long ptr; /* Current pointer inside inline refs */ unsigned long end; /* Extent item end */ const u32 item_size = btrfs_item_size(leaf, slot); + u8 last_type = 0; + u64 last_seq = U64_MAX; u64 flags; u64 generation; u64 total_refs; /* Total refs in btrfs_extent_item */ @@ -1320,6 +1325,18 @@ static int check_extent_item(struct extent_buffer *leaf, * 2.2) Ref type specific data * Either using btrfs_extent_inline_ref::offset, or specific * data structure. + * + * All above inline items should follow the order: + * + * - All btrfs_extent_inline_ref::type should be in an ascending + * order + * + * - Within the same type, the items should follow a descending + * order by their sequence number. The sequence number is + * determined by: + * * btrfs_extent_inline_ref::offset for all types other than + * EXTENT_DATA_REF + * * hash_extent_data_ref() for EXTENT_DATA_REF */ if (unlikely(item_size < sizeof(*ei))) { extent_err(leaf, slot, @@ -1401,6 +1418,7 @@ static int check_extent_item(struct extent_buffer *leaf, struct btrfs_extent_inline_ref *iref; struct btrfs_extent_data_ref *dref; struct btrfs_shared_data_ref *sref; + u64 seq; u64 dref_offset; u64 inline_offset; u8 inline_type; @@ -1414,6 +1432,7 @@ static int check_extent_item(struct extent_buffer *leaf, iref = (struct btrfs_extent_inline_ref *)ptr; inline_type = btrfs_extent_inline_ref_type(leaf, iref); inline_offset = btrfs_extent_inline_ref_offset(leaf, iref); + seq = inline_offset; if (unlikely(ptr + btrfs_extent_inline_ref_size(inline_type) > end)) { extent_err(leaf, slot, "inline ref item overflows extent item, ptr %lu iref size %u end %lu", @@ -1444,6 +1463,10 @@ static int check_extent_item(struct extent_buffer *leaf, case BTRFS_EXTENT_DATA_REF_KEY: dref = (struct btrfs_extent_data_ref *)(&iref->offset); dref_offset = btrfs_extent_data_ref_offset(leaf, dref); + seq = hash_extent_data_ref( + btrfs_extent_data_ref_root(leaf, dref), + btrfs_extent_data_ref_objectid(leaf, dref), + btrfs_extent_data_ref_offset(leaf, dref)); if (unlikely(!IS_ALIGNED(dref_offset, fs_info->sectorsize))) { extent_err(leaf, slot, @@ -1465,11 +1488,32 @@ static int check_extent_item(struct extent_buffer *leaf, } inline_refs += btrfs_shared_data_ref_count(leaf, sref); break; + case BTRFS_EXTENT_OWNER_REF_KEY: + WARN_ON(!btrfs_fs_incompat(fs_info, SIMPLE_QUOTA)); + break; default: extent_err(leaf, slot, "unknown inline ref type: %u", inline_type); return -EUCLEAN; } + if (inline_type < last_type) { + extent_err(leaf, slot, + "inline ref out-of-order: has type %u, prev type %u", + inline_type, last_type); + return -EUCLEAN; + } + /* Type changed, allow the sequence starts from U64_MAX again. */ + if (inline_type > last_type) + last_seq = U64_MAX; + if (seq > last_seq) { + extent_err(leaf, slot, +"inline ref out-of-order: has type %u offset %llu seq 0x%llx, prev type %u seq 0x%llx", + inline_type, inline_offset, seq, + last_type, last_seq); + return -EUCLEAN; + } + last_type = inline_type; + last_seq = seq; ptr += btrfs_extent_inline_ref_size(inline_type); } /* No padding is allowed */ @@ -1631,6 +1675,44 @@ static int check_inode_ref(struct extent_buffer *leaf, return 0; } +static int check_raid_stripe_extent(const struct extent_buffer *leaf, + const struct btrfs_key *key, int slot) +{ + struct btrfs_stripe_extent *stripe_extent = + btrfs_item_ptr(leaf, slot, struct btrfs_stripe_extent); + + if (unlikely(!IS_ALIGNED(key->objectid, leaf->fs_info->sectorsize))) { + generic_err(leaf, slot, +"invalid key objectid for raid stripe extent, have %llu expect aligned to %u", + key->objectid, leaf->fs_info->sectorsize); + return -EUCLEAN; + } + + if (unlikely(!btrfs_fs_incompat(leaf->fs_info, RAID_STRIPE_TREE))) { + generic_err(leaf, slot, + "RAID_STRIPE_EXTENT present but RAID_STRIPE_TREE incompat bit unset"); + return -EUCLEAN; + } + + switch (btrfs_stripe_extent_encoding(leaf, stripe_extent)) { + case BTRFS_STRIPE_RAID0: + case BTRFS_STRIPE_RAID1: + case BTRFS_STRIPE_DUP: + case BTRFS_STRIPE_RAID10: + case BTRFS_STRIPE_RAID5: + case BTRFS_STRIPE_RAID6: + case BTRFS_STRIPE_RAID1C3: + case BTRFS_STRIPE_RAID1C4: + break; + default: + generic_err(leaf, slot, "invalid raid stripe encoding %u", + btrfs_stripe_extent_encoding(leaf, stripe_extent)); + return -EUCLEAN; + } + + return 0; +} + /* * Common point to switch the item-specific validation. */ @@ -1685,6 +1767,9 @@ static enum btrfs_tree_block_status check_leaf_item(struct extent_buffer *leaf, case BTRFS_EXTENT_DATA_REF_KEY: ret = check_extent_data_ref(leaf, key, slot); break; + case BTRFS_RAID_STRIPE_KEY: + ret = check_raid_stripe_extent(leaf, key, slot); + break; } if (ret) @@ -2005,7 +2090,7 @@ int btrfs_verify_level_key(struct extent_buffer *eb, int level, * So we only checks tree blocks which is read from disk, whose * generation <= fs_info->last_trans_committed. */ - if (btrfs_header_generation(eb) > fs_info->last_trans_committed) + if (btrfs_header_generation(eb) > btrfs_get_last_trans_committed(fs_info)) return 0; /* We have @first_key, so this @eb must have at least one item */ diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 365a1cc0a3c3..7d6729d9fd2f 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -347,8 +347,7 @@ static int process_one_buffer(struct btrfs_root *log, } if (wc->pin) { - ret = btrfs_pin_extent_for_log_replay(wc->trans, eb->start, - eb->len); + ret = btrfs_pin_extent_for_log_replay(wc->trans, eb); if (ret) return ret; @@ -504,9 +503,9 @@ insert: found_size = btrfs_item_size(path->nodes[0], path->slots[0]); if (found_size > item_size) - btrfs_truncate_item(path, item_size, 1); + btrfs_truncate_item(trans, path, item_size, 1); else if (found_size < item_size) - btrfs_extend_item(path, item_size - found_size); + btrfs_extend_item(trans, path, item_size - found_size); } else if (ret) { return ret; } @@ -574,7 +573,7 @@ insert: } } no_copy: - btrfs_mark_buffer_dirty(path->nodes[0]); + btrfs_mark_buffer_dirty(trans, path->nodes[0]); btrfs_release_path(path); return 0; } @@ -767,7 +766,8 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans, } else if (ret == 0) { btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, - ins.objectid, ins.offset, 0); + ins.objectid, ins.offset, 0, + root->root_key.objectid); btrfs_init_data_ref(&ref, root->root_key.objectid, key->objectid, offset, 0, false); @@ -890,7 +890,7 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans, update_inode: btrfs_update_inode_bytes(BTRFS_I(inode), nbytes, drop_args.bytes_found); - ret = btrfs_update_inode(trans, root, BTRFS_I(inode)); + ret = btrfs_update_inode(trans, BTRFS_I(inode)); out: iput(inode); return ret; @@ -1445,7 +1445,7 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans, if (ret) goto out; - ret = btrfs_update_inode(trans, root, BTRFS_I(inode)); + ret = btrfs_update_inode(trans, BTRFS_I(inode)); if (ret) goto out; } @@ -1483,8 +1483,7 @@ out: return ret; } -static int count_inode_extrefs(struct btrfs_root *root, - struct btrfs_inode *inode, struct btrfs_path *path) +static int count_inode_extrefs(struct btrfs_inode *inode, struct btrfs_path *path) { int ret = 0; int name_len; @@ -1498,8 +1497,8 @@ static int count_inode_extrefs(struct btrfs_root *root, struct extent_buffer *leaf; while (1) { - ret = btrfs_find_one_extref(root, inode_objectid, offset, path, - &extref, &offset); + ret = btrfs_find_one_extref(inode->root, inode_objectid, offset, + path, &extref, &offset); if (ret) break; @@ -1527,8 +1526,7 @@ static int count_inode_extrefs(struct btrfs_root *root, return nlink; } -static int count_inode_refs(struct btrfs_root *root, - struct btrfs_inode *inode, struct btrfs_path *path) +static int count_inode_refs(struct btrfs_inode *inode, struct btrfs_path *path) { int ret; struct btrfs_key key; @@ -1543,7 +1541,7 @@ static int count_inode_refs(struct btrfs_root *root, key.offset = (u64)-1; while (1) { - ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + ret = btrfs_search_slot(NULL, inode->root, &key, path, 0, 0); if (ret < 0) break; if (ret > 0) { @@ -1595,9 +1593,9 @@ process_slot: * will free the inode. */ static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct inode *inode) { + struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_path *path; int ret; u64 nlink = 0; @@ -1607,13 +1605,13 @@ static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans, if (!path) return -ENOMEM; - ret = count_inode_refs(root, BTRFS_I(inode), path); + ret = count_inode_refs(BTRFS_I(inode), path); if (ret < 0) goto out; nlink = ret; - ret = count_inode_extrefs(root, BTRFS_I(inode), path); + ret = count_inode_extrefs(BTRFS_I(inode), path); if (ret < 0) goto out; @@ -1623,7 +1621,7 @@ static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans, if (nlink != inode->i_nlink) { set_nlink(inode, nlink); - ret = btrfs_update_inode(trans, root, BTRFS_I(inode)); + ret = btrfs_update_inode(trans, BTRFS_I(inode)); if (ret) goto out; } @@ -1685,7 +1683,7 @@ static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans, break; } - ret = fixup_inode_link_count(trans, root, inode); + ret = fixup_inode_link_count(trans, inode); iput(inode); if (ret) break; @@ -1732,7 +1730,7 @@ static noinline int link_to_fixup_dir(struct btrfs_trans_handle *trans, set_nlink(inode, 1); else inc_nlink(inode); - ret = btrfs_update_inode(trans, root, BTRFS_I(inode)); + ret = btrfs_update_inode(trans, BTRFS_I(inode)); } else if (ret == -EEXIST) { ret = 0; } @@ -1939,7 +1937,7 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans, out: if (!ret && update_size) { btrfs_i_size_write(BTRFS_I(dir), dir->i_size + name.len * 2); - ret = btrfs_update_inode(trans, root, BTRFS_I(dir)); + ret = btrfs_update_inode(trans, BTRFS_I(dir)); } kfree(name.name); iput(dir); @@ -2483,7 +2481,7 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb, drop_args.bytes_found); /* Update the inode's nbytes. */ ret = btrfs_update_inode(wc->trans, - root, BTRFS_I(inode)); + BTRFS_I(inode)); } iput(inode); if (ret) @@ -2574,7 +2572,7 @@ static int clean_log_buffer(struct btrfs_trans_handle *trans, btrfs_tree_unlock(eb); if (trans) { - ret = btrfs_pin_reserved_extent(trans, eb->start, eb->len); + ret = btrfs_pin_reserved_extent(trans, eb); if (ret) return ret; btrfs_redirty_list_add(trans->transaction, eb); @@ -2848,10 +2846,9 @@ static inline void btrfs_remove_all_log_ctxs(struct btrfs_root *root, } /* - * btrfs_sync_log does sends a given tree log down to the disk and - * updates the super blocks to record it. When this call is done, - * you know that any inodes previously logged are safely on disk only - * if it returns 0. + * Sends a given tree log down to the disk and updates the super blocks to + * record it. When this call is done, you know that any inodes previously + * logged are safely on disk only if it returns 0. * * Any other return value means you need to call btrfs_commit_transaction. * Some of the edge cases for fsyncing directories that have had unlinks @@ -2961,7 +2958,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, btrfs_set_root_node(&log->root_item, log->node); memcpy(&new_root_item, &log->root_item, sizeof(new_root_item)); - root->log_transid++; + btrfs_set_root_log_transid(root, root->log_transid + 1); log->log_transid = root->log_transid; root->log_start_pid = 0; /* @@ -2999,9 +2996,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, */ ret = update_log_root(trans, log, &new_root_item); if (ret) { - if (!list_empty(&root_log_ctx.list)) - list_del_init(&root_log_ctx.list); - + list_del_init(&root_log_ctx.list); blk_finish_plug(&plug); btrfs_set_log_full_commit(trans); if (ret != -ENOSPC) @@ -3021,7 +3016,6 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, goto out; } - index2 = root_log_ctx.log_transid % 2; if (atomic_read(&log_root_tree->log_commit[index2])) { blk_finish_plug(&plug); ret = btrfs_wait_tree_log_extents(log, mark); @@ -3136,8 +3130,8 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, * someone else already started it. We use <= and not < because the * first log transaction has an ID of 0. */ - ASSERT(root->last_log_commit <= log_transid); - root->last_log_commit = log_transid; + ASSERT(btrfs_get_root_last_log_commit(root) <= log_transid); + btrfs_set_root_last_log_commit(root, log_transid); out_wake_log_root: mutex_lock(&log_root_tree->log_mutex); @@ -3211,8 +3205,7 @@ static void free_log_tree(struct btrfs_trans_handle *trans, } } - clear_extent_bits(&log->dirty_log_pages, 0, (u64)-1, - EXTENT_DIRTY | EXTENT_NEW | EXTENT_NEED_WAIT); + extent_io_tree_release(&log->dirty_log_pages); extent_io_tree_release(&log->log_csum_range); btrfs_put_root(log); @@ -3530,7 +3523,7 @@ static noinline int insert_dir_log_key(struct btrfs_trans_handle *trans, last_offset = max(last_offset, curr_end); } btrfs_set_dir_log_end(path->nodes[0], item, last_offset); - btrfs_mark_buffer_dirty(path->nodes[0]); + btrfs_mark_buffer_dirty(trans, path->nodes[0]); btrfs_release_path(path); return 0; } @@ -4138,19 +4131,19 @@ static void fill_inode_item(struct btrfs_trans_handle *trans, btrfs_set_token_inode_nlink(&token, item, inode->i_nlink); btrfs_set_token_timespec_sec(&token, &item->atime, - inode->i_atime.tv_sec); + inode_get_atime_sec(inode)); btrfs_set_token_timespec_nsec(&token, &item->atime, - inode->i_atime.tv_nsec); + inode_get_atime_nsec(inode)); btrfs_set_token_timespec_sec(&token, &item->mtime, - inode->i_mtime.tv_sec); + inode_get_mtime_sec(inode)); btrfs_set_token_timespec_nsec(&token, &item->mtime, - inode->i_mtime.tv_nsec); + inode_get_mtime_nsec(inode)); btrfs_set_token_timespec_sec(&token, &item->ctime, - inode->i_ctime.tv_sec); + inode_get_ctime_sec(inode)); btrfs_set_token_timespec_nsec(&token, &item->ctime, - inode->i_ctime.tv_nsec); + inode_get_ctime_nsec(inode)); /* * We do not need to set the nbytes field, in fact during a fast fsync @@ -4488,7 +4481,7 @@ copy_item: dst_index++; } - btrfs_mark_buffer_dirty(dst_path->nodes[0]); + btrfs_mark_buffer_dirty(trans, dst_path->nodes[0]); btrfs_release_path(dst_path); out: kfree(ins_data); @@ -4693,7 +4686,7 @@ static int log_one_extent(struct btrfs_trans_handle *trans, write_extent_buffer(leaf, &fi, btrfs_item_ptr_offset(leaf, path->slots[0]), sizeof(fi)); - btrfs_mark_buffer_dirty(leaf); + btrfs_mark_buffer_dirty(trans, leaf); btrfs_release_path(path); @@ -4722,7 +4715,7 @@ static int btrfs_log_prealloc_extents(struct btrfs_trans_handle *trans, struct extent_buffer *leaf; int slot; int ins_nr = 0; - int start_slot; + int start_slot = 0; int ret; if (!(inode->flags & BTRFS_INODE_PREALLOC)) @@ -4841,13 +4834,11 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans, struct btrfs_ordered_extent *ordered; struct btrfs_ordered_extent *tmp; struct extent_map *em, *n; - struct list_head extents; + LIST_HEAD(extents); struct extent_map_tree *tree = &inode->extent_tree; int ret = 0; int num = 0; - INIT_LIST_HEAD(&extents); - write_lock(&tree->lock); list_for_each_entry_safe(em, n, &tree->modified_extents, list) { @@ -4923,12 +4914,12 @@ process: set_bit(BTRFS_ORDERED_LOGGED, &ordered->flags); if (!test_bit(BTRFS_ORDERED_COMPLETE, &ordered->flags)) { - spin_lock_irq(&inode->ordered_tree.lock); + spin_lock_irq(&inode->ordered_tree_lock); if (!test_bit(BTRFS_ORDERED_COMPLETE, &ordered->flags)) { set_bit(BTRFS_ORDERED_PENDING, &ordered->flags); atomic_inc(&trans->transaction->pending_ordered); } - spin_unlock_irq(&inode->ordered_tree.lock); + spin_unlock_irq(&inode->ordered_tree_lock); } btrfs_put_ordered_extent(ordered); } @@ -6794,8 +6785,8 @@ static int log_new_ancestors(struct btrfs_trans_handle *trans, while (true) { struct btrfs_fs_info *fs_info = root->fs_info; - struct extent_buffer *leaf = path->nodes[0]; - int slot = path->slots[0]; + struct extent_buffer *leaf; + int slot; struct btrfs_key search_key; struct inode *inode; u64 ino; @@ -7206,9 +7197,7 @@ again: * each subsequent pass. */ if (ret == -ENOENT) - ret = btrfs_pin_extent_for_log_replay(trans, - log->node->start, - log->node->len); + ret = btrfs_pin_extent_for_log_replay(trans, log->node); btrfs_put_root(log); if (!ret) diff --git a/fs/btrfs/ulist.c b/fs/btrfs/ulist.c index 33606025513d..b4ac2b0cd235 100644 --- a/fs/btrfs/ulist.c +++ b/fs/btrfs/ulist.c @@ -223,7 +223,8 @@ int ulist_add_merge(struct ulist *ulist, u64 val, u64 aux, } /* - * ulist_del - delete one node from ulist + * Delete one node from ulist. + * * @ulist: ulist to remove node from * @val: value to delete * @aux: aux to delete diff --git a/fs/btrfs/uuid-tree.c b/fs/btrfs/uuid-tree.c index 7c7001f42b14..5be74f9e47eb 100644 --- a/fs/btrfs/uuid-tree.c +++ b/fs/btrfs/uuid-tree.c @@ -124,7 +124,7 @@ int btrfs_uuid_tree_add(struct btrfs_trans_handle *trans, u8 *uuid, u8 type, * An item with that type already exists. * Extend the item and store the new subid at the end. */ - btrfs_extend_item(path, sizeof(subid_le)); + btrfs_extend_item(trans, path, sizeof(subid_le)); eb = path->nodes[0]; slot = path->slots[0]; offset = btrfs_item_ptr_offset(eb, slot); @@ -139,7 +139,7 @@ int btrfs_uuid_tree_add(struct btrfs_trans_handle *trans, u8 *uuid, u8 type, ret = 0; subid_le = cpu_to_le64(subid_cpu); write_extent_buffer(eb, &subid_le, offset, sizeof(subid_le)); - btrfs_mark_buffer_dirty(eb); + btrfs_mark_buffer_dirty(trans, eb); out: btrfs_free_path(path); @@ -221,7 +221,7 @@ int btrfs_uuid_tree_remove(struct btrfs_trans_handle *trans, u8 *uuid, u8 type, move_src = offset + sizeof(subid); move_len = item_size - (move_src - btrfs_item_ptr_offset(eb, slot)); memmove_extent_buffer(eb, move_dst, move_src, move_len); - btrfs_truncate_item(path, item_size - sizeof(subid), 1); + btrfs_truncate_item(trans, path, item_size - sizeof(subid), 1); out: btrfs_free_path(path); diff --git a/fs/btrfs/verity.c b/fs/btrfs/verity.c index c5ff16f9e9fa..66e2270b0dae 100644 --- a/fs/btrfs/verity.c +++ b/fs/btrfs/verity.c @@ -487,7 +487,7 @@ static int rollback_verity(struct btrfs_inode *inode) } inode->ro_flags &= ~BTRFS_INODE_RO_VERITY; btrfs_sync_inode_flags_to_i_flags(&inode->vfs_inode); - ret = btrfs_update_inode(trans, root, inode); + ret = btrfs_update_inode(trans, inode); if (ret) { btrfs_abort_transaction(trans, ret); goto out; @@ -554,7 +554,7 @@ static int finish_verity(struct btrfs_inode *inode, const void *desc, } inode->ro_flags |= BTRFS_INODE_RO_VERITY; btrfs_sync_inode_flags_to_i_flags(&inode->vfs_inode); - ret = btrfs_update_inode(trans, root, inode); + ret = btrfs_update_inode(trans, inode); if (ret) goto end_trans; ret = del_orphan(trans, inode); @@ -715,7 +715,7 @@ static struct page *btrfs_read_merkle_tree_page(struct inode *inode, pgoff_t index, unsigned long num_ra_pages) { - struct page *page; + struct folio *folio; u64 off = (u64)index << PAGE_SHIFT; loff_t merkle_pos = merkle_file_pos(inode); int ret; @@ -726,29 +726,36 @@ static struct page *btrfs_read_merkle_tree_page(struct inode *inode, return ERR_PTR(-EFBIG); index += merkle_pos >> PAGE_SHIFT; again: - page = find_get_page_flags(inode->i_mapping, index, FGP_ACCESSED); - if (page) { - if (PageUptodate(page)) - return page; + folio = __filemap_get_folio(inode->i_mapping, index, FGP_ACCESSED, 0); + if (!IS_ERR(folio)) { + if (folio_test_uptodate(folio)) + goto out; - lock_page(page); - /* - * We only insert uptodate pages, so !Uptodate has to be - * an error - */ - if (!PageUptodate(page)) { - unlock_page(page); - put_page(page); + folio_lock(folio); + /* If it's not uptodate after we have the lock, we got a read error. */ + if (!folio_test_uptodate(folio)) { + folio_unlock(folio); + folio_put(folio); return ERR_PTR(-EIO); } - unlock_page(page); - return page; + folio_unlock(folio); + goto out; } - page = __page_cache_alloc(mapping_gfp_constraint(inode->i_mapping, ~__GFP_FS)); - if (!page) + folio = filemap_alloc_folio(mapping_gfp_constraint(inode->i_mapping, ~__GFP_FS), + 0); + if (!folio) return ERR_PTR(-ENOMEM); + ret = filemap_add_folio(inode->i_mapping, folio, index, GFP_NOFS); + if (ret) { + folio_put(folio); + /* Did someone else insert a folio here? */ + if (ret == -EEXIST) + goto again; + return ERR_PTR(ret); + } + /* * Merkle item keys are indexed from byte 0 in the merkle tree. * They have the form: @@ -756,28 +763,19 @@ again: * [ inode objectid, BTRFS_MERKLE_ITEM_KEY, offset in bytes ] */ ret = read_key_bytes(BTRFS_I(inode), BTRFS_VERITY_MERKLE_ITEM_KEY, off, - page_address(page), PAGE_SIZE, page); + folio_address(folio), PAGE_SIZE, &folio->page); if (ret < 0) { - put_page(page); + folio_put(folio); return ERR_PTR(ret); } if (ret < PAGE_SIZE) - memzero_page(page, ret, PAGE_SIZE - ret); + folio_zero_segment(folio, ret, PAGE_SIZE); - SetPageUptodate(page); - ret = add_to_page_cache_lru(page, inode->i_mapping, index, GFP_NOFS); + folio_mark_uptodate(folio); + folio_unlock(folio); - if (!ret) { - /* Inserted and ready for fsverity */ - unlock_page(page); - } else { - put_page(page); - /* Did someone race us into inserting this page? */ - if (ret == -EEXIST) - goto again; - page = ERR_PTR(ret); - } - return page; +out: + return folio_file_page(folio, index); } /* diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 6aa9bf3661ac..f627674b37db 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -35,6 +35,7 @@ #include "relocation.h" #include "scrub.h" #include "super.h" +#include "raid-stripe-tree.h" #define BTRFS_BLOCK_GROUP_STRIPE_MASK (BTRFS_BLOCK_GROUP_RAID0 | \ BTRFS_BLOCK_GROUP_RAID10 | \ @@ -357,21 +358,19 @@ struct list_head * __attribute_const__ btrfs_get_fs_uuids(void) } /* - * alloc_fs_devices - allocate struct btrfs_fs_devices - * @fsid: if not NULL, copy the UUID to fs_devices::fsid - * @metadata_fsid: if not NULL, copy the UUID to fs_devices::metadata_fsid + * Allocate new btrfs_fs_devices structure identified by a fsid. + * + * @fsid: if not NULL, copy the UUID to fs_devices::fsid and to + * fs_devices::metadata_fsid * * Return a pointer to a new struct btrfs_fs_devices on success, or ERR_PTR(). * The returned struct is not linked onto any lists and can be destroyed with * kfree() right away. */ -static struct btrfs_fs_devices *alloc_fs_devices(const u8 *fsid, - const u8 *metadata_fsid) +static struct btrfs_fs_devices *alloc_fs_devices(const u8 *fsid) { struct btrfs_fs_devices *fs_devs; - ASSERT(fsid || !metadata_fsid); - fs_devs = kzalloc(sizeof(*fs_devs), GFP_KERNEL); if (!fs_devs) return ERR_PTR(-ENOMEM); @@ -385,8 +384,7 @@ static struct btrfs_fs_devices *alloc_fs_devices(const u8 *fsid, if (fsid) { memcpy(fs_devs->fsid, fsid, BTRFS_FSID_SIZE); - memcpy(fs_devs->metadata_uuid, - metadata_fsid ?: fsid, BTRFS_FSID_SIZE); + memcpy(fs_devs->metadata_uuid, fsid, BTRFS_FSID_SIZE); } return fs_devs; @@ -457,91 +455,41 @@ static noinline struct btrfs_fs_devices *find_fsid( return NULL; } -/* - * First check if the metadata_uuid is different from the fsid in the given - * fs_devices. Then check if the given fsid is the same as the metadata_uuid - * in the fs_devices. If it is, return true; otherwise, return false. - */ -static inline bool check_fsid_changed(const struct btrfs_fs_devices *fs_devices, - const u8 *fsid) -{ - return memcmp(fs_devices->fsid, fs_devices->metadata_uuid, - BTRFS_FSID_SIZE) != 0 && - memcmp(fs_devices->metadata_uuid, fsid, BTRFS_FSID_SIZE) == 0; -} - -static struct btrfs_fs_devices *find_fsid_with_metadata_uuid( - struct btrfs_super_block *disk_super) -{ - - struct btrfs_fs_devices *fs_devices; - - /* - * Handle scanned device having completed its fsid change but - * belonging to a fs_devices that was created by first scanning - * a device which didn't have its fsid/metadata_uuid changed - * at all and the CHANGING_FSID_V2 flag set. - */ - list_for_each_entry(fs_devices, &fs_uuids, fs_list) { - if (!fs_devices->fsid_change) - continue; - - if (match_fsid_fs_devices(fs_devices, disk_super->metadata_uuid, - fs_devices->fsid)) - return fs_devices; - } - - /* - * Handle scanned device having completed its fsid change but - * belonging to a fs_devices that was created by a device that - * has an outdated pair of fsid/metadata_uuid and - * CHANGING_FSID_V2 flag set. - */ - list_for_each_entry(fs_devices, &fs_uuids, fs_list) { - if (!fs_devices->fsid_change) - continue; - - if (check_fsid_changed(fs_devices, disk_super->metadata_uuid)) - return fs_devices; - } - - return find_fsid(disk_super->fsid, disk_super->metadata_uuid); -} - - static int btrfs_get_bdev_and_sb(const char *device_path, blk_mode_t flags, void *holder, - int flush, struct block_device **bdev, + int flush, struct bdev_handle **bdev_handle, struct btrfs_super_block **disk_super) { + struct block_device *bdev; int ret; - *bdev = blkdev_get_by_path(device_path, flags, holder, NULL); + *bdev_handle = bdev_open_by_path(device_path, flags, holder, NULL); - if (IS_ERR(*bdev)) { - ret = PTR_ERR(*bdev); + if (IS_ERR(*bdev_handle)) { + ret = PTR_ERR(*bdev_handle); goto error; } + bdev = (*bdev_handle)->bdev; if (flush) - sync_blockdev(*bdev); - ret = set_blocksize(*bdev, BTRFS_BDEV_BLOCKSIZE); + sync_blockdev(bdev); + ret = set_blocksize(bdev, BTRFS_BDEV_BLOCKSIZE); if (ret) { - blkdev_put(*bdev, holder); + bdev_release(*bdev_handle); goto error; } - invalidate_bdev(*bdev); - *disk_super = btrfs_read_dev_super(*bdev); + invalidate_bdev(bdev); + *disk_super = btrfs_read_dev_super(bdev); if (IS_ERR(*disk_super)) { ret = PTR_ERR(*disk_super); - blkdev_put(*bdev, holder); + bdev_release(*bdev_handle); goto error; } return 0; error: - *bdev = NULL; + *bdev_handle = NULL; return ret; } @@ -562,13 +510,13 @@ static int btrfs_free_stale_devices(dev_t devt, struct btrfs_device *skip_device { struct btrfs_fs_devices *fs_devices, *tmp_fs_devices; struct btrfs_device *device, *tmp_device; - int ret = 0; + int ret; + bool freed = false; lockdep_assert_held(&uuid_mutex); - if (devt) - ret = -ENOENT; - + /* Return good status if there is no instance of devt. */ + ret = 0; list_for_each_entry_safe(fs_devices, tmp_fs_devices, &fs_uuids, fs_list) { mutex_lock(&fs_devices->device_list_mutex); @@ -579,8 +527,7 @@ static int btrfs_free_stale_devices(dev_t devt, struct btrfs_device *skip_device if (devt && devt != device->devt) continue; if (fs_devices->opened) { - /* for an already deleted device return 0 */ - if (devt && ret != 0) + if (devt) ret = -EBUSY; break; } @@ -590,7 +537,7 @@ static int btrfs_free_stale_devices(dev_t devt, struct btrfs_device *skip_device list_del(&device->dev_list); btrfs_free_device(device); - ret = 0; + freed = true; } mutex_unlock(&fs_devices->device_list_mutex); @@ -601,9 +548,81 @@ static int btrfs_free_stale_devices(dev_t devt, struct btrfs_device *skip_device } } + /* If there is at least one freed device return 0. */ + if (freed) + return 0; + return ret; } +static struct btrfs_fs_devices *find_fsid_by_device( + struct btrfs_super_block *disk_super, + dev_t devt, bool *same_fsid_diff_dev) +{ + struct btrfs_fs_devices *fsid_fs_devices; + struct btrfs_fs_devices *devt_fs_devices; + const bool has_metadata_uuid = (btrfs_super_incompat_flags(disk_super) & + BTRFS_FEATURE_INCOMPAT_METADATA_UUID); + bool found_by_devt = false; + + /* Find the fs_device by the usual method, if found use it. */ + fsid_fs_devices = find_fsid(disk_super->fsid, + has_metadata_uuid ? disk_super->metadata_uuid : NULL); + + /* The temp_fsid feature is supported only with single device filesystem. */ + if (btrfs_super_num_devices(disk_super) != 1) + return fsid_fs_devices; + + /* + * A seed device is an integral component of the sprout device, which + * functions as a multi-device filesystem. So, temp-fsid feature is + * not supported. + */ + if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_SEEDING) + return fsid_fs_devices; + + /* Try to find a fs_devices by matching devt. */ + list_for_each_entry(devt_fs_devices, &fs_uuids, fs_list) { + struct btrfs_device *device; + + list_for_each_entry(device, &devt_fs_devices->devices, dev_list) { + if (device->devt == devt) { + found_by_devt = true; + break; + } + } + if (found_by_devt) + break; + } + + if (found_by_devt) { + /* Existing device. */ + if (fsid_fs_devices == NULL) { + if (devt_fs_devices->opened == 0) { + /* Stale device. */ + return NULL; + } else { + /* temp_fsid is mounting a subvol. */ + return devt_fs_devices; + } + } else { + /* Regular or temp_fsid device mounting a subvol. */ + return devt_fs_devices; + } + } else { + /* New device. */ + if (fsid_fs_devices == NULL) { + return NULL; + } else { + /* sb::fsid is already used create a new temp_fsid. */ + *same_fsid_diff_dev = true; + return NULL; + } + } + + /* Not reached. */ +} + /* * This is only used on mount, and we are protected from competing things * messing with our fs_devices by the uuid_mutex, thus we do not need the @@ -613,7 +632,7 @@ static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices, struct btrfs_device *device, blk_mode_t flags, void *holder) { - struct block_device *bdev; + struct bdev_handle *bdev_handle; struct btrfs_super_block *disk_super; u64 devid; int ret; @@ -624,7 +643,7 @@ static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices, return -EINVAL; ret = btrfs_get_bdev_and_sb(device->name->str, flags, holder, 1, - &bdev, &disk_super); + &bdev_handle, &disk_super); if (ret) return ret; @@ -648,21 +667,21 @@ static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices, clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state); fs_devices->seeding = true; } else { - if (bdev_read_only(bdev)) + if (bdev_read_only(bdev_handle->bdev)) clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state); else set_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state); } - if (!bdev_nonrot(bdev)) + if (!bdev_nonrot(bdev_handle->bdev)) fs_devices->rotating = true; - if (bdev_max_discard_sectors(bdev)) + if (bdev_max_discard_sectors(bdev_handle->bdev)) fs_devices->discardable = true; - device->bdev = bdev; + device->bdev_handle = bdev_handle; + device->bdev = bdev_handle->bdev; clear_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state); - device->holder = holder; fs_devices->open_devices++; if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) && @@ -676,89 +695,19 @@ static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices, error_free_page: btrfs_release_disk_super(disk_super); - blkdev_put(bdev, holder); + bdev_release(bdev_handle); return -EINVAL; } -/* - * Handle scanned device having its CHANGING_FSID_V2 flag set and the fs_devices - * being created with a disk that has already completed its fsid change. Such - * disk can belong to an fs which has its FSID changed or to one which doesn't. - * Handle both cases here. - */ -static struct btrfs_fs_devices *find_fsid_inprogress( - struct btrfs_super_block *disk_super) -{ - struct btrfs_fs_devices *fs_devices; - - list_for_each_entry(fs_devices, &fs_uuids, fs_list) { - if (fs_devices->fsid_change) - continue; - - if (check_fsid_changed(fs_devices, disk_super->fsid)) - return fs_devices; - } - - return find_fsid(disk_super->fsid, NULL); -} - -static struct btrfs_fs_devices *find_fsid_changed( - struct btrfs_super_block *disk_super) +u8 *btrfs_sb_fsid_ptr(struct btrfs_super_block *sb) { - struct btrfs_fs_devices *fs_devices; - - /* - * Handles the case where scanned device is part of an fs that had - * multiple successful changes of FSID but currently device didn't - * observe it. Meaning our fsid will be different than theirs. We need - * to handle two subcases : - * 1 - The fs still continues to have different METADATA/FSID uuids. - * 2 - The fs is switched back to its original FSID (METADATA/FSID - * are equal). - */ - list_for_each_entry(fs_devices, &fs_uuids, fs_list) { - /* Changed UUIDs */ - if (check_fsid_changed(fs_devices, disk_super->metadata_uuid) && - memcmp(fs_devices->fsid, disk_super->fsid, - BTRFS_FSID_SIZE) != 0) - return fs_devices; + bool has_metadata_uuid = (btrfs_super_incompat_flags(sb) & + BTRFS_FEATURE_INCOMPAT_METADATA_UUID); - /* Unchanged UUIDs */ - if (memcmp(fs_devices->metadata_uuid, fs_devices->fsid, - BTRFS_FSID_SIZE) == 0 && - memcmp(fs_devices->fsid, disk_super->metadata_uuid, - BTRFS_FSID_SIZE) == 0) - return fs_devices; - } - - return NULL; + return has_metadata_uuid ? sb->metadata_uuid : sb->fsid; } -static struct btrfs_fs_devices *find_fsid_reverted_metadata( - struct btrfs_super_block *disk_super) -{ - struct btrfs_fs_devices *fs_devices; - - /* - * Handle the case where the scanned device is part of an fs whose last - * metadata UUID change reverted it to the original FSID. At the same - * time fs_devices was first created by another constituent device - * which didn't fully observe the operation. This results in an - * btrfs_fs_devices created with metadata/fsid different AND - * btrfs_fs_devices::fsid_change set AND the metadata_uuid of the - * fs_devices equal to the FSID of the disk. - */ - list_for_each_entry(fs_devices, &fs_uuids, fs_list) { - if (!fs_devices->fsid_change) - continue; - - if (check_fsid_changed(fs_devices, disk_super->fsid)) - return fs_devices; - } - - return NULL; -} /* * Add new device to list of registered devices * @@ -777,10 +726,16 @@ static noinline struct btrfs_device *device_list_add(const char *path, u64 devid = btrfs_stack_device_id(&disk_super->dev_item); dev_t path_devt; int error; + bool same_fsid_diff_dev = false; bool has_metadata_uuid = (btrfs_super_incompat_flags(disk_super) & BTRFS_FEATURE_INCOMPAT_METADATA_UUID); - bool fsid_change_in_progress = (btrfs_super_flags(disk_super) & - BTRFS_SUPER_FLAG_CHANGING_FSID_V2); + + if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_CHANGING_FSID_V2) { + btrfs_err(NULL, +"device %s has incomplete metadata_uuid change, please use btrfstune to complete", + path); + return ERR_PTR(-EAGAIN); + } error = lookup_bdev(path, &path_devt); if (error) { @@ -789,27 +744,23 @@ static noinline struct btrfs_device *device_list_add(const char *path, return ERR_PTR(error); } - if (fsid_change_in_progress) { - if (!has_metadata_uuid) - fs_devices = find_fsid_inprogress(disk_super); - else - fs_devices = find_fsid_changed(disk_super); - } else if (has_metadata_uuid) { - fs_devices = find_fsid_with_metadata_uuid(disk_super); - } else { - fs_devices = find_fsid_reverted_metadata(disk_super); - if (!fs_devices) - fs_devices = find_fsid(disk_super->fsid, NULL); - } - + fs_devices = find_fsid_by_device(disk_super, path_devt, &same_fsid_diff_dev); if (!fs_devices) { - fs_devices = alloc_fs_devices(disk_super->fsid, - has_metadata_uuid ? disk_super->metadata_uuid : NULL); + fs_devices = alloc_fs_devices(disk_super->fsid); if (IS_ERR(fs_devices)) return ERR_CAST(fs_devices); - fs_devices->fsid_change = fsid_change_in_progress; + if (has_metadata_uuid) + memcpy(fs_devices->metadata_uuid, + disk_super->metadata_uuid, BTRFS_FSID_SIZE); + + if (same_fsid_diff_dev) { + generate_random_uuid(fs_devices->fsid); + fs_devices->temp_fsid = true; + pr_info("BTRFS: device %s using temp-fsid %pU\n", + path, fs_devices->fsid); + } mutex_lock(&fs_devices->device_list_mutex); list_add(&fs_devices->fs_list, &fs_uuids); @@ -824,25 +775,11 @@ static noinline struct btrfs_device *device_list_add(const char *path, mutex_lock(&fs_devices->device_list_mutex); device = btrfs_find_device(fs_devices, &args); - /* - * If this disk has been pulled into an fs devices created by - * a device which had the CHANGING_FSID_V2 flag then replace the - * metadata_uuid/fsid values of the fs_devices. - */ - if (fs_devices->fsid_change && - found_transid > fs_devices->latest_generation) { + if (found_transid > fs_devices->latest_generation) { memcpy(fs_devices->fsid, disk_super->fsid, BTRFS_FSID_SIZE); - - if (has_metadata_uuid) - memcpy(fs_devices->metadata_uuid, - disk_super->metadata_uuid, - BTRFS_FSID_SIZE); - else - memcpy(fs_devices->metadata_uuid, - disk_super->fsid, BTRFS_FSID_SIZE); - - fs_devices->fsid_change = false; + memcpy(fs_devices->metadata_uuid, + btrfs_sb_fsid_ptr(disk_super), BTRFS_FSID_SIZE); } } @@ -851,8 +788,9 @@ static noinline struct btrfs_device *device_list_add(const char *path, if (fs_devices->opened) { btrfs_err(NULL, - "device %s belongs to fsid %pU, and the fs is already mounted", - path, fs_devices->fsid); +"device %s belongs to fsid %pU, and the fs is already mounted, scanned by %s (%d)", + path, fs_devices->fsid, current->comm, + task_pid_nr(current)); mutex_unlock(&fs_devices->device_list_mutex); return ERR_PTR(-EBUSY); } @@ -995,7 +933,7 @@ static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig) lockdep_assert_held(&uuid_mutex); - fs_devices = alloc_fs_devices(orig->fsid, NULL); + fs_devices = alloc_fs_devices(orig->fsid); if (IS_ERR(fs_devices)) return fs_devices; @@ -1066,9 +1004,10 @@ static void __btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices, if (device->devid == BTRFS_DEV_REPLACE_DEVID) continue; - if (device->bdev) { - blkdev_put(device->bdev, device->holder); + if (device->bdev_handle) { + bdev_release(device->bdev_handle); device->bdev = NULL; + device->bdev_handle = NULL; fs_devices->open_devices--; } if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) { @@ -1113,7 +1052,7 @@ static void btrfs_close_bdev(struct btrfs_device *device) invalidate_bdev(device->bdev); } - blkdev_put(device->bdev, device->holder); + bdev_release(device->bdev_handle); } static void btrfs_close_one_device(struct btrfs_device *device) @@ -1354,14 +1293,19 @@ int btrfs_forget_devices(dev_t devt) /* * Look for a btrfs signature on a device. This may be called out of the mount path * and we are not allowed to call set_blocksize during the scan. The superblock - * is read via pagecache + * is read via pagecache. + * + * With @mount_arg_dev it's a scan during mount time that will always register + * the device or return an error. Multi-device and seeding devices are registered + * in both cases. */ -struct btrfs_device *btrfs_scan_one_device(const char *path, blk_mode_t flags) +struct btrfs_device *btrfs_scan_one_device(const char *path, blk_mode_t flags, + bool mount_arg_dev) { struct btrfs_super_block *disk_super; bool new_device_added = false; struct btrfs_device *device = NULL; - struct block_device *bdev; + struct bdev_handle *bdev_handle; u64 bytenr, bytenr_orig; int ret; @@ -1384,31 +1328,49 @@ struct btrfs_device *btrfs_scan_one_device(const char *path, blk_mode_t flags) * values temporarily, as the device paths of the fsid are the only * required information for assembling the volume. */ - bdev = blkdev_get_by_path(path, flags, NULL, NULL); - if (IS_ERR(bdev)) - return ERR_CAST(bdev); + bdev_handle = bdev_open_by_path(path, flags, NULL, NULL); + if (IS_ERR(bdev_handle)) + return ERR_CAST(bdev_handle); bytenr_orig = btrfs_sb_offset(0); - ret = btrfs_sb_log_location_bdev(bdev, 0, READ, &bytenr); + ret = btrfs_sb_log_location_bdev(bdev_handle->bdev, 0, READ, &bytenr); if (ret) { device = ERR_PTR(ret); goto error_bdev_put; } - disk_super = btrfs_read_disk_super(bdev, bytenr, bytenr_orig); + disk_super = btrfs_read_disk_super(bdev_handle->bdev, bytenr, + bytenr_orig); if (IS_ERR(disk_super)) { device = ERR_CAST(disk_super); goto error_bdev_put; } + if (!mount_arg_dev && btrfs_super_num_devices(disk_super) == 1 && + !(btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_SEEDING)) { + dev_t devt; + + ret = lookup_bdev(path, &devt); + if (ret) + btrfs_warn(NULL, "lookup bdev failed for path %s: %d", + path, ret); + else + btrfs_free_stale_devices(devt, NULL); + + pr_debug("BTRFS: skip registering single non-seed device %s\n", path); + device = NULL; + goto free_disk_super; + } + device = device_list_add(path, disk_super, &new_device_added); if (!IS_ERR(device) && new_device_added) btrfs_free_stale_devices(device->devt, device); +free_disk_super: btrfs_release_disk_super(disk_super); error_bdev_put: - blkdev_put(bdev, NULL); + bdev_release(bdev_handle); return device; } @@ -1424,9 +1386,9 @@ static bool contains_pending_extent(struct btrfs_device *device, u64 *start, lockdep_assert_held(&device->fs_info->chunk_mutex); - if (!find_first_extent_bit(&device->alloc_state, *start, - &physical_start, &physical_end, - CHUNK_ALLOCATED, NULL)) { + if (find_first_extent_bit(&device->alloc_state, *start, + &physical_start, &physical_end, + CHUNK_ALLOCATED, NULL)) { if (in_range(physical_start, *start, len) || in_range(*start, physical_start, @@ -1438,18 +1400,18 @@ static bool contains_pending_extent(struct btrfs_device *device, u64 *start, return false; } -static u64 dev_extent_search_start(struct btrfs_device *device, u64 start) +static u64 dev_extent_search_start(struct btrfs_device *device) { switch (device->fs_devices->chunk_alloc_policy) { case BTRFS_CHUNK_ALLOC_REGULAR: - return max_t(u64, start, BTRFS_DEVICE_RANGE_RESERVED); + return BTRFS_DEVICE_RANGE_RESERVED; case BTRFS_CHUNK_ALLOC_ZONED: /* * We don't care about the starting region like regular * allocator, because we anyway use/reserve the first two zones * for superblock logging. */ - return ALIGN(start, device->zone_info->zone_size); + return 0; default: BUG(); } @@ -1581,36 +1543,35 @@ static bool dev_extent_hole_check(struct btrfs_device *device, u64 *hole_start, * correct usable device space, as device extent freed in current transaction * is not reported as available. */ -static int find_free_dev_extent_start(struct btrfs_device *device, - u64 num_bytes, u64 search_start, u64 *start, - u64 *len) +static int find_free_dev_extent(struct btrfs_device *device, u64 num_bytes, + u64 *start, u64 *len) { struct btrfs_fs_info *fs_info = device->fs_info; struct btrfs_root *root = fs_info->dev_root; struct btrfs_key key; struct btrfs_dev_extent *dev_extent; struct btrfs_path *path; + u64 search_start; u64 hole_size; u64 max_hole_start; - u64 max_hole_size; + u64 max_hole_size = 0; u64 extent_end; u64 search_end = device->total_bytes; int ret; int slot; struct extent_buffer *l; - search_start = dev_extent_search_start(device, search_start); + search_start = dev_extent_search_start(device); + max_hole_start = search_start; WARN_ON(device->zone_info && !IS_ALIGNED(num_bytes, device->zone_info->zone_size)); path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; - - max_hole_start = search_start; - max_hole_size = 0; - + if (!path) { + ret = -ENOMEM; + goto out; + } again: if (search_start >= search_end || test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) { @@ -1725,13 +1686,6 @@ out: return ret; } -int find_free_dev_extent(struct btrfs_device *device, u64 num_bytes, - u64 *start, u64 *len) -{ - /* FIXME use last free of some kind */ - return find_free_dev_extent_start(device, num_bytes, 0, start, len); -} - static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans, struct btrfs_device *device, u64 start, u64 *dev_extent_len) @@ -1900,7 +1854,7 @@ static int btrfs_add_dev_item(struct btrfs_trans_handle *trans, ptr = btrfs_device_fsid(dev_item); write_extent_buffer(leaf, trans->fs_info->fs_devices->metadata_uuid, ptr, BTRFS_FSID_SIZE); - btrfs_mark_buffer_dirty(leaf); + btrfs_mark_buffer_dirty(trans, leaf); ret = 0; out: @@ -1917,15 +1871,13 @@ out: static void update_dev_time(const char *device_path) { struct path path; - struct timespec64 now; int ret; ret = kern_path(device_path, LOOKUP_FOLLOW, &path); if (ret) return; - now = current_time(d_inode(path.dentry)); - inode_update_time(d_inode(path.dentry), &now, S_MTIME | S_CTIME | S_VERSION); + inode_update_time(d_inode(path.dentry), S_MTIME | S_CTIME | S_VERSION); path_put(&path); } @@ -2095,7 +2047,7 @@ void btrfs_scratch_superblocks(struct btrfs_fs_info *fs_info, int btrfs_rm_device(struct btrfs_fs_info *fs_info, struct btrfs_dev_lookup_args *args, - struct block_device **bdev, void **holder) + struct bdev_handle **bdev_handle) { struct btrfs_trans_handle *trans; struct btrfs_device *device; @@ -2204,7 +2156,7 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, btrfs_assign_next_active_device(device, NULL); - if (device->bdev) { + if (device->bdev_handle) { cur_devices->open_devices--; /* remove sysfs entry */ btrfs_sysfs_remove_device(device); @@ -2220,9 +2172,9 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, * free the device. * * We cannot call btrfs_close_bdev() here because we're holding the sb - * write lock, and blkdev_put() will pull in the ->open_mutex on the - * block device and it's dependencies. Instead just flush the device - * and let the caller do the final blkdev_put. + * write lock, and bdev_release() will pull in the ->open_mutex on + * the block device and it's dependencies. Instead just flush the + * device and let the caller do the final bdev_release. */ if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) { btrfs_scratch_superblocks(fs_info, device->bdev, @@ -2233,8 +2185,7 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, } } - *bdev = device->bdev; - *holder = device->holder; + *bdev_handle = device->bdev_handle; synchronize_rcu(); btrfs_free_device(device); @@ -2371,7 +2322,7 @@ int btrfs_get_dev_args_from_path(struct btrfs_fs_info *fs_info, const char *path) { struct btrfs_super_block *disk_super; - struct block_device *bdev; + struct bdev_handle *bdev_handle; int ret; if (!path || !path[0]) @@ -2389,7 +2340,7 @@ int btrfs_get_dev_args_from_path(struct btrfs_fs_info *fs_info, } ret = btrfs_get_bdev_and_sb(path, BLK_OPEN_READ, NULL, 0, - &bdev, &disk_super); + &bdev_handle, &disk_super); if (ret) { btrfs_put_dev_args_from_path(args); return ret; @@ -2402,7 +2353,7 @@ int btrfs_get_dev_args_from_path(struct btrfs_fs_info *fs_info, else memcpy(args->fsid, disk_super->fsid, BTRFS_FSID_SIZE); btrfs_release_disk_super(disk_super); - blkdev_put(bdev, NULL); + bdev_release(bdev_handle); return 0; } @@ -2459,7 +2410,7 @@ static struct btrfs_fs_devices *btrfs_init_sprout(struct btrfs_fs_info *fs_info) * Private copy of the seed devices, anchored at * fs_info->fs_devices->seed_list */ - seed_devices = alloc_fs_devices(NULL, NULL); + seed_devices = alloc_fs_devices(NULL); if (IS_ERR(seed_devices)) return seed_devices; @@ -2605,7 +2556,7 @@ next_slot: if (device->fs_devices->seeding) { btrfs_set_device_generation(leaf, dev_item, device->generation); - btrfs_mark_buffer_dirty(leaf); + btrfs_mark_buffer_dirty(trans, leaf); } path->slots[0]++; @@ -2622,7 +2573,7 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path struct btrfs_root *root = fs_info->dev_root; struct btrfs_trans_handle *trans; struct btrfs_device *device; - struct block_device *bdev; + struct bdev_handle *bdev_handle; struct super_block *sb = fs_info->sb; struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; struct btrfs_fs_devices *seed_devices = NULL; @@ -2635,12 +2586,12 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path if (sb_rdonly(sb) && !fs_devices->seeding) return -EROFS; - bdev = blkdev_get_by_path(device_path, BLK_OPEN_WRITE, - fs_info->bdev_holder, NULL); - if (IS_ERR(bdev)) - return PTR_ERR(bdev); + bdev_handle = bdev_open_by_path(device_path, BLK_OPEN_WRITE, + fs_info->bdev_holder, NULL); + if (IS_ERR(bdev_handle)) + return PTR_ERR(bdev_handle); - if (!btrfs_check_device_zone_type(fs_info, bdev)) { + if (!btrfs_check_device_zone_type(fs_info, bdev_handle->bdev)) { ret = -EINVAL; goto error; } @@ -2652,11 +2603,11 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path locked = true; } - sync_blockdev(bdev); + sync_blockdev(bdev_handle->bdev); rcu_read_lock(); list_for_each_entry_rcu(device, &fs_devices->devices, dev_list) { - if (device->bdev == bdev) { + if (device->bdev == bdev_handle->bdev) { ret = -EEXIST; rcu_read_unlock(); goto error; @@ -2672,7 +2623,8 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path } device->fs_info = fs_info; - device->bdev = bdev; + device->bdev_handle = bdev_handle; + device->bdev = bdev_handle->bdev; ret = lookup_bdev(device_path, &device->devt); if (ret) goto error_free_device; @@ -2693,12 +2645,11 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path device->io_align = fs_info->sectorsize; device->sector_size = fs_info->sectorsize; device->total_bytes = - round_down(bdev_nr_bytes(bdev), fs_info->sectorsize); + round_down(bdev_nr_bytes(device->bdev), fs_info->sectorsize); device->disk_total_bytes = device->total_bytes; device->commit_total_bytes = device->total_bytes; set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state); clear_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state); - device->holder = fs_info->bdev_holder; device->dev_stats_valid = 1; set_blocksize(device->bdev, BTRFS_BDEV_BLOCKSIZE); @@ -2734,7 +2685,7 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path atomic64_add(device->total_bytes, &fs_info->free_chunk_space); - if (!bdev_nonrot(bdev)) + if (!bdev_nonrot(device->bdev)) fs_devices->rotating = true; orig_super_total_bytes = btrfs_super_total_bytes(fs_info->super_copy); @@ -2856,7 +2807,7 @@ error_free_zone: error_free_device: btrfs_free_device(device); error: - blkdev_put(bdev, fs_info->bdev_holder); + bdev_release(bdev_handle); if (locked) { mutex_unlock(&uuid_mutex); up_write(&sb->s_umount); @@ -2903,7 +2854,7 @@ static noinline int btrfs_update_device(struct btrfs_trans_handle *trans, btrfs_device_get_disk_total_bytes(device)); btrfs_set_device_bytes_used(leaf, dev_item, btrfs_device_get_bytes_used(device)); - btrfs_mark_buffer_dirty(leaf); + btrfs_mark_buffer_dirty(trans, leaf); out: btrfs_free_path(path); @@ -2937,6 +2888,7 @@ int btrfs_grow_device(struct btrfs_trans_handle *trans, btrfs_set_super_total_bytes(super_copy, round_down(old_total + diff, fs_info->sectorsize)); device->fs_devices->total_rw_bytes += diff; + atomic64_add(diff, &fs_info->free_chunk_space); btrfs_device_set_total_bytes(device, new_size); btrfs_device_set_disk_total_bytes(device, new_size); @@ -3035,7 +2987,8 @@ static int btrfs_del_sys_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset) } /* - * btrfs_get_chunk_map() - Find the mapping containing the given logical extent. + * Find the mapping containing the given logical extent. + * * @logical: Logical block offset in bytes. * @length: Length of extent in bytes. * @@ -3053,15 +3006,16 @@ struct extent_map *btrfs_get_chunk_map(struct btrfs_fs_info *fs_info, read_unlock(&em_tree->lock); if (!em) { - btrfs_crit(fs_info, "unable to find logical %llu length %llu", + btrfs_crit(fs_info, + "unable to find chunk map for logical %llu length %llu", logical, length); return ERR_PTR(-EINVAL); } - if (em->start > logical || em->start + em->len < logical) { + if (em->start > logical || em->start + em->len <= logical) { btrfs_crit(fs_info, - "found a bad mapping, wanted %llu-%llu, found %llu-%llu", - logical, length, em->start, em->start + em->len); + "found a bad chunk map, wanted %llu-%llu, found %llu-%llu", + logical, logical + length, em->start, em->start + em->len); free_extent_map(em); return ERR_PTR(-EINVAL); } @@ -3491,7 +3445,7 @@ static int insert_balance_item(struct btrfs_fs_info *fs_info, btrfs_set_balance_flags(leaf, item, bctl->flags); - btrfs_mark_buffer_dirty(leaf); + btrfs_mark_buffer_dirty(trans, leaf); out: btrfs_free_path(path); err = btrfs_commit_transaction(trans); @@ -4846,6 +4800,7 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size) u64 old_size = btrfs_device_get_total_bytes(device); u64 diff; u64 start; + u64 free_diff = 0; new_size = round_down(new_size, fs_info->sectorsize); start = new_size; @@ -4871,7 +4826,19 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size) btrfs_device_set_total_bytes(device, new_size); if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) { device->fs_devices->total_rw_bytes -= diff; - atomic64_sub(diff, &fs_info->free_chunk_space); + + /* + * The new free_chunk_space is new_size - used, so we have to + * subtract the delta of the old free_chunk_space which included + * old_size - used. If used > new_size then just subtract this + * entire device's free space. + */ + if (device->bytes_used < new_size) + free_diff = (old_size - device->bytes_used) - + (new_size - device->bytes_used); + else + free_diff = old_size - device->bytes_used; + atomic64_sub(free_diff, &fs_info->free_chunk_space); } /* @@ -5006,9 +4973,10 @@ done: if (ret) { mutex_lock(&fs_info->chunk_mutex); btrfs_device_set_total_bytes(device, old_size); - if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) + if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) { device->fs_devices->total_rw_bytes += diff; - atomic64_add(diff, &fs_info->free_chunk_space); + atomic64_add(free_diff, &fs_info->free_chunk_space); + } mutex_unlock(&fs_info->chunk_mutex); } return ret; @@ -5117,7 +5085,7 @@ static void init_alloc_chunk_ctl_policy_regular( ASSERT(space_info); ctl->max_chunk_size = READ_ONCE(space_info->chunk_size); - ctl->max_stripe_size = ctl->max_chunk_size; + ctl->max_stripe_size = min_t(u64, ctl->max_chunk_size, SZ_1G); if (ctl->type & BTRFS_BLOCK_GROUP_SYSTEM) ctl->devs_max = min_t(int, ctl->devs_max, BTRFS_MAX_DEVS_SYS_CHUNK); @@ -5888,6 +5856,7 @@ static int find_live_mirror(struct btrfs_fs_info *fs_info, } static struct btrfs_io_context *alloc_btrfs_io_context(struct btrfs_fs_info *fs_info, + u64 logical, u16 total_stripes) { struct btrfs_io_context *bioc; @@ -5907,6 +5876,7 @@ static struct btrfs_io_context *alloc_btrfs_io_context(struct btrfs_fs_info *fs_ bioc->fs_info = fs_info; bioc->replace_stripe_src = -1; bioc->full_stripe_logical = (u64)-1; + bioc->logical = logical; return bioc; } @@ -6211,19 +6181,61 @@ static u64 btrfs_max_io_len(struct map_lookup *map, enum btrfs_map_op op, return U64_MAX; } -static void set_io_stripe(struct btrfs_io_stripe *dst, const struct map_lookup *map, - u32 stripe_index, u64 stripe_offset, u32 stripe_nr) +static int set_io_stripe(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, + u64 logical, u64 *length, struct btrfs_io_stripe *dst, + struct map_lookup *map, u32 stripe_index, + u64 stripe_offset, u64 stripe_nr) { dst->dev = map->stripes[stripe_index].dev; + + if (op == BTRFS_MAP_READ && btrfs_need_stripe_tree_update(fs_info, map->type)) + return btrfs_get_raid_extent_offset(fs_info, logical, length, + map->type, stripe_index, dst); + dst->physical = map->stripes[stripe_index].physical + stripe_offset + btrfs_stripe_nr_to_offset(stripe_nr); + return 0; } +/* + * Map one logical range to one or more physical ranges. + * + * @length: (Mandatory) mapped length of this run. + * One logical range can be split into different segments + * due to factors like zones and RAID0/5/6/10 stripe + * boundaries. + * + * @bioc_ret: (Mandatory) returned btrfs_io_context structure. + * which has one or more physical ranges (btrfs_io_stripe) + * recorded inside. + * Caller should call btrfs_put_bioc() to free it after use. + * + * @smap: (Optional) single physical range optimization. + * If the map request can be fulfilled by one single + * physical range, and this is parameter is not NULL, + * then @bioc_ret would be NULL, and @smap would be + * updated. + * + * @mirror_num_ret: (Mandatory) returned mirror number if the original + * value is 0. + * + * Mirror number 0 means to choose any live mirrors. + * + * For non-RAID56 profiles, non-zero mirror_num means + * the Nth mirror. (e.g. mirror_num 1 means the first + * copy). + * + * For RAID56 profile, mirror 1 means rebuild from P and + * the remaining data stripes. + * + * For RAID6 profile, mirror > 2 means mark another + * data/P stripe error and rebuild from the remaining + * stripes.. + */ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, u64 logical, u64 *length, struct btrfs_io_context **bioc_ret, - struct btrfs_io_stripe *smap, int *mirror_num_ret, - int need_raid_map) + struct btrfs_io_stripe *smap, int *mirror_num_ret) { struct extent_map *em; struct map_lookup *map; @@ -6318,8 +6330,10 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, } } else if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { - if (need_raid_map && (op != BTRFS_MAP_READ || mirror_num > 1)) { + if (op != BTRFS_MAP_READ || mirror_num > 1) { /* + * Needs full stripe mapping. + * * Push stripe_nr back to the start of the full stripe * For those cases needing a full stripe, @stripe_nr * is the full stripe number. @@ -6342,19 +6356,14 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, stripe_index = 0; stripe_offset = 0; } else { - /* - * Mirror #0 or #1 means the original data block. - * Mirror #2 is RAID5 parity block. - * Mirror #3 is RAID6 Q block. - */ + ASSERT(mirror_num <= 1); + /* Just grab the data stripe directly. */ stripe_index = stripe_nr % data_stripes; stripe_nr /= data_stripes; - if (mirror_num > 1) - stripe_index = data_stripes + mirror_num - 2; /* We distribute the parity blocks across stripes */ stripe_index = (stripe_nr + stripe_index) % map->num_stripes; - if (op == BTRFS_MAP_READ && mirror_num <= 1) + if (op == BTRFS_MAP_READ && mirror_num < 1) mirror_num = 1; } } else { @@ -6393,18 +6402,18 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, * I/O context structure. */ if (smap && num_alloc_stripes == 1 && - !((map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) && mirror_num > 1) && - (op == BTRFS_MAP_READ || !dev_replace_is_ongoing || - !dev_replace->tgtdev)) { - set_io_stripe(smap, map, stripe_index, stripe_offset, stripe_nr); + !(btrfs_need_stripe_tree_update(fs_info, map->type) && + op != BTRFS_MAP_READ) && + !((map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) && mirror_num > 1)) { + ret = set_io_stripe(fs_info, op, logical, length, smap, map, + stripe_index, stripe_offset, stripe_nr); if (mirror_num_ret) *mirror_num_ret = mirror_num; *bioc_ret = NULL; - ret = 0; goto out; } - bioc = alloc_btrfs_io_context(fs_info, num_alloc_stripes); + bioc = alloc_btrfs_io_context(fs_info, logical, num_alloc_stripes); if (!bioc) { ret = -ENOMEM; goto out; @@ -6418,7 +6427,7 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, * * It's still mostly the same as other profiles, just with extra rotation. */ - if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK && need_raid_map && + if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK && (op != BTRFS_MAP_READ || mirror_num > 1)) { /* * For RAID56 @stripe_nr is already the number of full stripes @@ -6430,22 +6439,35 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, */ bioc->full_stripe_logical = em->start + btrfs_stripe_nr_to_offset(stripe_nr * data_stripes); - for (i = 0; i < num_stripes; i++) - set_io_stripe(&bioc->stripes[i], map, - (i + stripe_nr) % num_stripes, - stripe_offset, stripe_nr); + for (int i = 0; i < num_stripes; i++) { + ret = set_io_stripe(fs_info, op, logical, length, + &bioc->stripes[i], map, + (i + stripe_nr) % num_stripes, + stripe_offset, stripe_nr); + if (ret < 0) + break; + } } else { /* * For all other non-RAID56 profiles, just copy the target * stripe into the bioc. */ for (i = 0; i < num_stripes; i++) { - set_io_stripe(&bioc->stripes[i], map, stripe_index, - stripe_offset, stripe_nr); + ret = set_io_stripe(fs_info, op, logical, length, + &bioc->stripes[i], map, stripe_index, + stripe_offset, stripe_nr); + if (ret < 0) + break; stripe_index++; } } + if (ret) { + *bioc_ret = NULL; + btrfs_put_bioc(bioc); + goto out; + } + if (op != BTRFS_MAP_READ) max_errors = btrfs_chunk_max_errors(map); @@ -6872,7 +6894,7 @@ static struct btrfs_fs_devices *open_seed_devices(struct btrfs_fs_info *fs_info, if (!btrfs_test_opt(fs_info, DEGRADED)) return ERR_PTR(-ENOENT); - fs_devices = alloc_fs_devices(fsid, NULL); + fs_devices = alloc_fs_devices(fsid); if (IS_ERR(fs_devices)) return fs_devices; @@ -7505,7 +7527,7 @@ static int update_dev_stat_item(struct btrfs_trans_handle *trans, for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) btrfs_set_dev_stats_value(eb, ptr, i, btrfs_dev_stat_read(device, i)); - btrfs_mark_buffer_dirty(eb); + btrfs_mark_buffer_dirty(trans, eb); out: btrfs_free_path(path); @@ -8047,7 +8069,7 @@ int btrfs_map_repair_block(struct btrfs_fs_info *fs_info, ASSERT(mirror_num > 0); ret = btrfs_map_block(fs_info, BTRFS_MAP_WRITE, logical, &map_length, - &bioc, smap, &mirror_ret, true); + &bioc, smap, &mirror_ret); if (ret < 0) return ret; diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index b8c51f16ba86..9cc374864a79 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -90,13 +90,11 @@ struct btrfs_device { u64 generation; + struct bdev_handle *bdev_handle; struct block_device *bdev; struct btrfs_zoned_device_info *zone_info; - /* block device holder for blkdev_get/put */ - void *holder; - /* * Device's major-minor number. Must be set even if the device is not * opened (bdev == NULL), unless the device is missing. @@ -290,6 +288,19 @@ struct btrfs_fs_devices { * - Following shall be true at all times: * - metadata_uuid == btrfs_header::fsid * - metadata_uuid == btrfs_dev_item::fsid + * + * - Relations between fsid and metadata_uuid in sb and fs_devices: + * - Normal: + * fs_devices->fsid == fs_devices->metadata_uuid == sb->fsid + * sb->metadata_uuid == 0 + * + * - When the BTRFS_FEATURE_INCOMPAT_METADATA_UUID flag is set: + * fs_devices->fsid == sb->fsid + * fs_devices->metadata_uuid == sb->metadata_uuid + * + * - When in-memory fs_devices->temp_fsid is true + * fs_devices->fsid = random + * fs_devices->metadata_uuid == sb->fsid */ u8 metadata_uuid[BTRFS_FSID_SIZE]; @@ -353,9 +364,10 @@ struct btrfs_fs_devices { bool rotating; /* Devices support TRIM/discard commands. */ bool discardable; - bool fsid_change; /* The filesystem is a seed filesystem. */ bool seeding; + /* The mount needs to use a randomly generated fsid. */ + bool temp_fsid; struct btrfs_fs_info *fs_info; /* sysfs kobjects */ @@ -381,12 +393,12 @@ struct btrfs_fs_devices { struct btrfs_io_stripe { struct btrfs_device *dev; - union { - /* Block mapping */ - u64 physical; - /* For the endio handler */ - struct btrfs_io_context *bioc; - }; + /* Block mapping. */ + u64 physical; + u64 length; + bool is_scrub; + /* For the endio handler. */ + struct btrfs_io_context *bioc; }; struct btrfs_discard_stripe { @@ -419,6 +431,11 @@ struct btrfs_io_context { atomic_t error; u16 max_errors; + u64 logical; + u64 size; + /* Raid stripe tree ordered entry. */ + struct list_head rst_ordered_entry; + /* * The total number of stripes, including the extra duplicated * stripe for replace. @@ -596,8 +613,7 @@ void btrfs_put_bioc(struct btrfs_io_context *bioc); int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, u64 logical, u64 *length, struct btrfs_io_context **bioc_ret, - struct btrfs_io_stripe *smap, int *mirror_num_ret, - int need_raid_map); + struct btrfs_io_stripe *smap, int *mirror_num_ret); int btrfs_map_repair_block(struct btrfs_fs_info *fs_info, struct btrfs_io_stripe *smap, u64 logical, u32 length, int mirror_num); @@ -611,7 +627,8 @@ struct btrfs_block_group *btrfs_create_chunk(struct btrfs_trans_handle *trans, void btrfs_mapping_tree_free(struct extent_map_tree *tree); int btrfs_open_devices(struct btrfs_fs_devices *fs_devices, blk_mode_t flags, void *holder); -struct btrfs_device *btrfs_scan_one_device(const char *path, blk_mode_t flags); +struct btrfs_device *btrfs_scan_one_device(const char *path, blk_mode_t flags, + bool mount_arg_dev); int btrfs_forget_devices(dev_t devt); void btrfs_close_devices(struct btrfs_fs_devices *fs_devices); void btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices); @@ -629,7 +646,7 @@ struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info, void btrfs_put_dev_args_from_path(struct btrfs_dev_lookup_args *args); int btrfs_rm_device(struct btrfs_fs_info *fs_info, struct btrfs_dev_lookup_args *args, - struct block_device **bdev, void **holder); + struct bdev_handle **bdev_handle); void __exit btrfs_cleanup_fs_uuids(void); int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len); int btrfs_grow_device(struct btrfs_trans_handle *trans, @@ -650,8 +667,6 @@ int btrfs_cancel_balance(struct btrfs_fs_info *fs_info); int btrfs_create_uuid_tree(struct btrfs_fs_info *fs_info); int btrfs_uuid_scan_kthread(void *data); bool btrfs_chunk_writeable(struct btrfs_fs_info *fs_info, u64 chunk_offset); -int find_free_dev_extent(struct btrfs_device *device, u64 num_bytes, - u64 *start, u64 *max_avail); void btrfs_dev_stat_inc_and_print(struct btrfs_device *dev, int index); int btrfs_get_dev_stats(struct btrfs_fs_info *fs_info, struct btrfs_ioctl_get_dev_stats *stats); @@ -749,5 +764,6 @@ int btrfs_verify_dev_extents(struct btrfs_fs_info *fs_info); bool btrfs_repair_one_zone(struct btrfs_fs_info *fs_info, u64 logical); bool btrfs_pinned_by_swapfile(struct btrfs_fs_info *fs_info, void *ptr); +u8 *btrfs_sb_fsid_ptr(struct btrfs_super_block *sb); #endif diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c index fc4b20c2688a..3cf236fb40a4 100644 --- a/fs/btrfs/xattr.c +++ b/fs/btrfs/xattr.c @@ -188,15 +188,15 @@ int btrfs_setxattr(struct btrfs_trans_handle *trans, struct inode *inode, if (old_data_len + name_len + sizeof(*di) == item_size) { /* No other xattrs packed in the same leaf item. */ if (size > old_data_len) - btrfs_extend_item(path, size - old_data_len); + btrfs_extend_item(trans, path, size - old_data_len); else if (size < old_data_len) - btrfs_truncate_item(path, data_size, 1); + btrfs_truncate_item(trans, path, data_size, 1); } else { /* There are other xattrs packed in the same item. */ ret = btrfs_delete_one_dir_name(trans, root, path, di); if (ret) goto out; - btrfs_extend_item(path, data_size); + btrfs_extend_item(trans, path, data_size); } ptr = btrfs_item_ptr(leaf, slot, char); @@ -205,7 +205,7 @@ int btrfs_setxattr(struct btrfs_trans_handle *trans, struct inode *inode, btrfs_set_dir_data_len(leaf, di, size); data_ptr = ((unsigned long)(di + 1)) + name_len; write_extent_buffer(leaf, value, data_ptr, size); - btrfs_mark_buffer_dirty(leaf); + btrfs_mark_buffer_dirty(trans, leaf); } else { /* * Insert, and we had space for the xattr, so path->slots[0] is @@ -264,8 +264,8 @@ int btrfs_setxattr_trans(struct inode *inode, const char *name, goto out; inode_inc_iversion(inode); - inode->i_ctime = current_time(inode); - ret = btrfs_update_inode(trans, root, BTRFS_I(inode)); + inode_set_ctime_current(inode); + ret = btrfs_update_inode(trans, BTRFS_I(inode)); if (ret) btrfs_abort_transaction(trans, ret); out: @@ -407,8 +407,8 @@ static int btrfs_xattr_handler_set_prop(const struct xattr_handler *handler, ret = btrfs_set_prop(trans, inode, name, value, size, flags); if (!ret) { inode_inc_iversion(inode); - inode->i_ctime = current_time(inode); - ret = btrfs_update_inode(trans, root, BTRFS_I(inode)); + inode_set_ctime_current(inode); + ret = btrfs_update_inode(trans, BTRFS_I(inode)); if (ret) btrfs_abort_transaction(trans, ret); } @@ -442,7 +442,7 @@ static const struct xattr_handler btrfs_btrfs_xattr_handler = { .set = btrfs_xattr_handler_set_prop, }; -const struct xattr_handler *btrfs_xattr_handlers[] = { +const struct xattr_handler * const btrfs_xattr_handlers[] = { &btrfs_security_xattr_handler, &btrfs_trusted_xattr_handler, &btrfs_user_xattr_handler, diff --git a/fs/btrfs/xattr.h b/fs/btrfs/xattr.h index 1cd3fc0a8f17..118118ca3e1d 100644 --- a/fs/btrfs/xattr.h +++ b/fs/btrfs/xattr.h @@ -8,7 +8,7 @@ #include <linux/xattr.h> -extern const struct xattr_handler *btrfs_xattr_handlers[]; +extern const struct xattr_handler * const btrfs_xattr_handlers[]; int btrfs_getxattr(struct inode *inode, const char *name, void *buffer, size_t size); diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c index 72b90bc19a19..188378ca19c7 100644 --- a/fs/btrfs/zoned.c +++ b/fs/btrfs/zoned.c @@ -65,6 +65,9 @@ #define SUPER_INFO_SECTORS ((u64)BTRFS_SUPER_INFO_SIZE >> SECTOR_SHIFT) +static void wait_eb_writebacks(struct btrfs_block_group *block_group); +static int do_zone_finish(struct btrfs_block_group *block_group, bool fully_written); + static inline bool sb_zone_is_full(const struct blk_zone *zone) { return (zone->cond == BLK_ZONE_COND_FULL) || @@ -465,8 +468,8 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache) * use the cache. */ if (populate_cache && bdev_is_zoned(device->bdev)) { - zone_info->zone_cache = vzalloc(sizeof(struct blk_zone) * - zone_info->nr_zones); + zone_info->zone_cache = vcalloc(zone_info->nr_zones, + sizeof(struct blk_zone)); if (!zone_info->zone_cache) { btrfs_err_in_rcu(device->fs_info, "zoned: failed to allocate zone cache for %s", @@ -1279,21 +1282,284 @@ out: return ret; } +struct zone_info { + u64 physical; + u64 capacity; + u64 alloc_offset; +}; + +static int btrfs_load_zone_info(struct btrfs_fs_info *fs_info, int zone_idx, + struct zone_info *info, unsigned long *active, + struct map_lookup *map) +{ + struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; + struct btrfs_device *device = map->stripes[zone_idx].dev; + int dev_replace_is_ongoing = 0; + unsigned int nofs_flag; + struct blk_zone zone; + int ret; + + info->physical = map->stripes[zone_idx].physical; + + if (!device->bdev) { + info->alloc_offset = WP_MISSING_DEV; + return 0; + } + + /* Consider a zone as active if we can allow any number of active zones. */ + if (!device->zone_info->max_active_zones) + __set_bit(zone_idx, active); + + if (!btrfs_dev_is_sequential(device, info->physical)) { + info->alloc_offset = WP_CONVENTIONAL; + return 0; + } + + /* This zone will be used for allocation, so mark this zone non-empty. */ + btrfs_dev_clear_zone_empty(device, info->physical); + + down_read(&dev_replace->rwsem); + dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(dev_replace); + if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL) + btrfs_dev_clear_zone_empty(dev_replace->tgtdev, info->physical); + up_read(&dev_replace->rwsem); + + /* + * The group is mapped to a sequential zone. Get the zone write pointer + * to determine the allocation offset within the zone. + */ + WARN_ON(!IS_ALIGNED(info->physical, fs_info->zone_size)); + nofs_flag = memalloc_nofs_save(); + ret = btrfs_get_dev_zone(device, info->physical, &zone); + memalloc_nofs_restore(nofs_flag); + if (ret) { + if (ret != -EIO && ret != -EOPNOTSUPP) + return ret; + info->alloc_offset = WP_MISSING_DEV; + return 0; + } + + if (zone.type == BLK_ZONE_TYPE_CONVENTIONAL) { + btrfs_err_in_rcu(fs_info, + "zoned: unexpected conventional zone %llu on device %s (devid %llu)", + zone.start << SECTOR_SHIFT, rcu_str_deref(device->name), + device->devid); + return -EIO; + } + + info->capacity = (zone.capacity << SECTOR_SHIFT); + + switch (zone.cond) { + case BLK_ZONE_COND_OFFLINE: + case BLK_ZONE_COND_READONLY: + btrfs_err(fs_info, + "zoned: offline/readonly zone %llu on device %s (devid %llu)", + (info->physical >> device->zone_info->zone_size_shift), + rcu_str_deref(device->name), device->devid); + info->alloc_offset = WP_MISSING_DEV; + break; + case BLK_ZONE_COND_EMPTY: + info->alloc_offset = 0; + break; + case BLK_ZONE_COND_FULL: + info->alloc_offset = info->capacity; + break; + default: + /* Partially used zone. */ + info->alloc_offset = ((zone.wp - zone.start) << SECTOR_SHIFT); + __set_bit(zone_idx, active); + break; + } + + return 0; +} + +static int btrfs_load_block_group_single(struct btrfs_block_group *bg, + struct zone_info *info, + unsigned long *active) +{ + if (info->alloc_offset == WP_MISSING_DEV) { + btrfs_err(bg->fs_info, + "zoned: cannot recover write pointer for zone %llu", + info->physical); + return -EIO; + } + + bg->alloc_offset = info->alloc_offset; + bg->zone_capacity = info->capacity; + if (test_bit(0, active)) + set_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &bg->runtime_flags); + return 0; +} + +static int btrfs_load_block_group_dup(struct btrfs_block_group *bg, + struct map_lookup *map, + struct zone_info *zone_info, + unsigned long *active) +{ + struct btrfs_fs_info *fs_info = bg->fs_info; + + if ((map->type & BTRFS_BLOCK_GROUP_DATA) && !fs_info->stripe_root) { + btrfs_err(fs_info, "zoned: data DUP profile needs raid-stripe-tree"); + return -EINVAL; + } + + if (zone_info[0].alloc_offset == WP_MISSING_DEV) { + btrfs_err(bg->fs_info, + "zoned: cannot recover write pointer for zone %llu", + zone_info[0].physical); + return -EIO; + } + if (zone_info[1].alloc_offset == WP_MISSING_DEV) { + btrfs_err(bg->fs_info, + "zoned: cannot recover write pointer for zone %llu", + zone_info[1].physical); + return -EIO; + } + if (zone_info[0].alloc_offset != zone_info[1].alloc_offset) { + btrfs_err(bg->fs_info, + "zoned: write pointer offset mismatch of zones in DUP profile"); + return -EIO; + } + + if (test_bit(0, active) != test_bit(1, active)) { + if (!btrfs_zone_activate(bg)) + return -EIO; + } else if (test_bit(0, active)) { + set_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &bg->runtime_flags); + } + + bg->alloc_offset = zone_info[0].alloc_offset; + bg->zone_capacity = min(zone_info[0].capacity, zone_info[1].capacity); + return 0; +} + +static int btrfs_load_block_group_raid1(struct btrfs_block_group *bg, + struct map_lookup *map, + struct zone_info *zone_info, + unsigned long *active) +{ + struct btrfs_fs_info *fs_info = bg->fs_info; + int i; + + if ((map->type & BTRFS_BLOCK_GROUP_DATA) && !fs_info->stripe_root) { + btrfs_err(fs_info, "zoned: data %s needs raid-stripe-tree", + btrfs_bg_type_to_raid_name(map->type)); + return -EINVAL; + } + + for (i = 0; i < map->num_stripes; i++) { + if (zone_info[i].alloc_offset == WP_MISSING_DEV || + zone_info[i].alloc_offset == WP_CONVENTIONAL) + continue; + + if ((zone_info[0].alloc_offset != zone_info[i].alloc_offset) && + !btrfs_test_opt(fs_info, DEGRADED)) { + btrfs_err(fs_info, + "zoned: write pointer offset mismatch of zones in %s profile", + btrfs_bg_type_to_raid_name(map->type)); + return -EIO; + } + if (test_bit(0, active) != test_bit(i, active)) { + if (!btrfs_test_opt(fs_info, DEGRADED) && + !btrfs_zone_activate(bg)) { + return -EIO; + } + } else { + if (test_bit(0, active)) + set_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &bg->runtime_flags); + } + /* In case a device is missing we have a cap of 0, so don't use it. */ + bg->zone_capacity = min_not_zero(zone_info[0].capacity, + zone_info[1].capacity); + } + + if (zone_info[0].alloc_offset != WP_MISSING_DEV) + bg->alloc_offset = zone_info[0].alloc_offset; + else + bg->alloc_offset = zone_info[i - 1].alloc_offset; + + return 0; +} + +static int btrfs_load_block_group_raid0(struct btrfs_block_group *bg, + struct map_lookup *map, + struct zone_info *zone_info, + unsigned long *active) +{ + struct btrfs_fs_info *fs_info = bg->fs_info; + + if ((map->type & BTRFS_BLOCK_GROUP_DATA) && !fs_info->stripe_root) { + btrfs_err(fs_info, "zoned: data %s needs raid-stripe-tree", + btrfs_bg_type_to_raid_name(map->type)); + return -EINVAL; + } + + for (int i = 0; i < map->num_stripes; i++) { + if (zone_info[i].alloc_offset == WP_MISSING_DEV || + zone_info[i].alloc_offset == WP_CONVENTIONAL) + continue; + + if (test_bit(0, active) != test_bit(i, active)) { + if (!btrfs_zone_activate(bg)) + return -EIO; + } else { + if (test_bit(0, active)) + set_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &bg->runtime_flags); + } + bg->zone_capacity += zone_info[i].capacity; + bg->alloc_offset += zone_info[i].alloc_offset; + } + + return 0; +} + +static int btrfs_load_block_group_raid10(struct btrfs_block_group *bg, + struct map_lookup *map, + struct zone_info *zone_info, + unsigned long *active) +{ + struct btrfs_fs_info *fs_info = bg->fs_info; + + if ((map->type & BTRFS_BLOCK_GROUP_DATA) && !fs_info->stripe_root) { + btrfs_err(fs_info, "zoned: data %s needs raid-stripe-tree", + btrfs_bg_type_to_raid_name(map->type)); + return -EINVAL; + } + + for (int i = 0; i < map->num_stripes; i++) { + if (zone_info[i].alloc_offset == WP_MISSING_DEV || + zone_info[i].alloc_offset == WP_CONVENTIONAL) + continue; + + if (test_bit(0, active) != test_bit(i, active)) { + if (!btrfs_zone_activate(bg)) + return -EIO; + } else { + if (test_bit(0, active)) + set_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &bg->runtime_flags); + } + + if ((i % map->sub_stripes) == 0) { + bg->zone_capacity += zone_info[i].capacity; + bg->alloc_offset += zone_info[i].alloc_offset; + } + } + + return 0; +} + int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new) { struct btrfs_fs_info *fs_info = cache->fs_info; struct extent_map_tree *em_tree = &fs_info->mapping_tree; struct extent_map *em; struct map_lookup *map; - struct btrfs_device *device; u64 logical = cache->start; u64 length = cache->length; + struct zone_info *zone_info = NULL; int ret; int i; - unsigned int nofs_flag; - u64 *alloc_offsets = NULL; - u64 *caps = NULL; - u64 *physical = NULL; unsigned long *active = NULL; u64 last_alloc = 0; u32 num_sequential = 0, num_conventional = 0; @@ -1325,20 +1591,8 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new) goto out; } - alloc_offsets = kcalloc(map->num_stripes, sizeof(*alloc_offsets), GFP_NOFS); - if (!alloc_offsets) { - ret = -ENOMEM; - goto out; - } - - caps = kcalloc(map->num_stripes, sizeof(*caps), GFP_NOFS); - if (!caps) { - ret = -ENOMEM; - goto out; - } - - physical = kcalloc(map->num_stripes, sizeof(*physical), GFP_NOFS); - if (!physical) { + zone_info = kcalloc(map->num_stripes, sizeof(*zone_info), GFP_NOFS); + if (!zone_info) { ret = -ENOMEM; goto out; } @@ -1350,98 +1604,14 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new) } for (i = 0; i < map->num_stripes; i++) { - bool is_sequential; - struct blk_zone zone; - struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; - int dev_replace_is_ongoing = 0; - - device = map->stripes[i].dev; - physical[i] = map->stripes[i].physical; - - if (device->bdev == NULL) { - alloc_offsets[i] = WP_MISSING_DEV; - continue; - } - - is_sequential = btrfs_dev_is_sequential(device, physical[i]); - if (is_sequential) - num_sequential++; - else - num_conventional++; - - /* - * Consider a zone as active if we can allow any number of - * active zones. - */ - if (!device->zone_info->max_active_zones) - __set_bit(i, active); - - if (!is_sequential) { - alloc_offsets[i] = WP_CONVENTIONAL; - continue; - } - - /* - * This zone will be used for allocation, so mark this zone - * non-empty. - */ - btrfs_dev_clear_zone_empty(device, physical[i]); - - down_read(&dev_replace->rwsem); - dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(dev_replace); - if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL) - btrfs_dev_clear_zone_empty(dev_replace->tgtdev, physical[i]); - up_read(&dev_replace->rwsem); - - /* - * The group is mapped to a sequential zone. Get the zone write - * pointer to determine the allocation offset within the zone. - */ - WARN_ON(!IS_ALIGNED(physical[i], fs_info->zone_size)); - nofs_flag = memalloc_nofs_save(); - ret = btrfs_get_dev_zone(device, physical[i], &zone); - memalloc_nofs_restore(nofs_flag); - if (ret == -EIO || ret == -EOPNOTSUPP) { - ret = 0; - alloc_offsets[i] = WP_MISSING_DEV; - continue; - } else if (ret) { - goto out; - } - - if (zone.type == BLK_ZONE_TYPE_CONVENTIONAL) { - btrfs_err_in_rcu(fs_info, - "zoned: unexpected conventional zone %llu on device %s (devid %llu)", - zone.start << SECTOR_SHIFT, - rcu_str_deref(device->name), device->devid); - ret = -EIO; + ret = btrfs_load_zone_info(fs_info, i, &zone_info[i], active, map); + if (ret) goto out; - } - - caps[i] = (zone.capacity << SECTOR_SHIFT); - switch (zone.cond) { - case BLK_ZONE_COND_OFFLINE: - case BLK_ZONE_COND_READONLY: - btrfs_err(fs_info, - "zoned: offline/readonly zone %llu on device %s (devid %llu)", - physical[i] >> device->zone_info->zone_size_shift, - rcu_str_deref(device->name), device->devid); - alloc_offsets[i] = WP_MISSING_DEV; - break; - case BLK_ZONE_COND_EMPTY: - alloc_offsets[i] = 0; - break; - case BLK_ZONE_COND_FULL: - alloc_offsets[i] = caps[i]; - break; - default: - /* Partially used zone */ - alloc_offsets[i] = - ((zone.wp - zone.start) << SECTOR_SHIFT); - __set_bit(i, active); - break; - } + if (zone_info[i].alloc_offset == WP_CONVENTIONAL) + num_conventional++; + else + num_sequential++; } if (num_sequential > 0) @@ -1465,63 +1635,24 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new) switch (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) { case 0: /* single */ - if (alloc_offsets[0] == WP_MISSING_DEV) { - btrfs_err(fs_info, - "zoned: cannot recover write pointer for zone %llu", - physical[0]); - ret = -EIO; - goto out; - } - cache->alloc_offset = alloc_offsets[0]; - cache->zone_capacity = caps[0]; - if (test_bit(0, active)) - set_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &cache->runtime_flags); + ret = btrfs_load_block_group_single(cache, &zone_info[0], active); break; case BTRFS_BLOCK_GROUP_DUP: - if (map->type & BTRFS_BLOCK_GROUP_DATA) { - btrfs_err(fs_info, "zoned: profile DUP not yet supported on data bg"); - ret = -EINVAL; - goto out; - } - if (alloc_offsets[0] == WP_MISSING_DEV) { - btrfs_err(fs_info, - "zoned: cannot recover write pointer for zone %llu", - physical[0]); - ret = -EIO; - goto out; - } - if (alloc_offsets[1] == WP_MISSING_DEV) { - btrfs_err(fs_info, - "zoned: cannot recover write pointer for zone %llu", - physical[1]); - ret = -EIO; - goto out; - } - if (alloc_offsets[0] != alloc_offsets[1]) { - btrfs_err(fs_info, - "zoned: write pointer offset mismatch of zones in DUP profile"); - ret = -EIO; - goto out; - } - if (test_bit(0, active) != test_bit(1, active)) { - if (!btrfs_zone_activate(cache)) { - ret = -EIO; - goto out; - } - } else { - if (test_bit(0, active)) - set_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, - &cache->runtime_flags); - } - cache->alloc_offset = alloc_offsets[0]; - cache->zone_capacity = min(caps[0], caps[1]); + ret = btrfs_load_block_group_dup(cache, map, zone_info, active); break; case BTRFS_BLOCK_GROUP_RAID1: + case BTRFS_BLOCK_GROUP_RAID1C3: + case BTRFS_BLOCK_GROUP_RAID1C4: + ret = btrfs_load_block_group_raid1(cache, map, zone_info, active); + break; case BTRFS_BLOCK_GROUP_RAID0: + ret = btrfs_load_block_group_raid0(cache, map, zone_info, active); + break; case BTRFS_BLOCK_GROUP_RAID10: + ret = btrfs_load_block_group_raid10(cache, map, zone_info, active); + break; case BTRFS_BLOCK_GROUP_RAID5: case BTRFS_BLOCK_GROUP_RAID6: - /* non-single profiles are not supported yet */ default: btrfs_err(fs_info, "zoned: profile %s not yet supported", btrfs_bg_type_to_raid_name(map->type)); @@ -1530,13 +1661,6 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new) } out: - if (cache->alloc_offset > fs_info->zone_size) { - btrfs_err(fs_info, - "zoned: invalid write pointer %llu in block group %llu", - cache->alloc_offset, cache->start); - ret = -EIO; - } - if (cache->alloc_offset > cache->zone_capacity) { btrfs_err(fs_info, "zoned: invalid write pointer %llu (larger than zone capacity %llu) in block group %llu", @@ -1567,9 +1691,7 @@ out: cache->physical_map = NULL; } bitmap_free(active); - kfree(physical); - kfree(caps); - kfree(alloc_offsets); + kfree(zone_info); free_extent_map(em); return ret; @@ -1583,19 +1705,9 @@ void btrfs_calc_zone_unusable(struct btrfs_block_group *cache) return; WARN_ON(cache->bytes_super != 0); - - /* Check for block groups never get activated */ - if (test_bit(BTRFS_FS_ACTIVE_ZONE_TRACKING, &cache->fs_info->flags) && - cache->flags & (BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_SYSTEM) && - !test_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &cache->runtime_flags) && - cache->alloc_offset == 0) { - unusable = cache->length; - free = 0; - } else { - unusable = (cache->alloc_offset - cache->used) + - (cache->length - cache->zone_capacity); - free = cache->zone_capacity - cache->alloc_offset; - } + unusable = (cache->alloc_offset - cache->used) + + (cache->length - cache->zone_capacity); + free = cache->zone_capacity - cache->alloc_offset; /* We only need ->free_space in ALLOC_SEQ block groups */ cache->cached = BTRFS_CACHE_FINISHED; @@ -1616,7 +1728,7 @@ void btrfs_redirty_list_add(struct btrfs_transaction *trans, set_bit(EXTENT_BUFFER_NO_CHECK, &eb->bflags); set_extent_buffer_dirty(eb); set_extent_bit(&trans->dirty_pages, eb->start, eb->start + eb->len - 1, - EXTENT_DIRTY | EXTENT_NOWAIT, NULL); + EXTENT_DIRTY, NULL); } bool btrfs_use_zone_append(struct btrfs_bio *bbio) @@ -1707,10 +1819,21 @@ void btrfs_finish_ordered_zoned(struct btrfs_ordered_extent *ordered) { struct btrfs_inode *inode = BTRFS_I(ordered->inode); struct btrfs_fs_info *fs_info = inode->root->fs_info; - struct btrfs_ordered_sum *sum = - list_first_entry(&ordered->list, typeof(*sum), list); - u64 logical = sum->logical; - u64 len = sum->len; + struct btrfs_ordered_sum *sum; + u64 logical, len; + + /* + * Write to pre-allocated region is for the data relocation, and so + * it should use WRITE operation. No split/rewrite are necessary. + */ + if (test_bit(BTRFS_ORDERED_PREALLOC, &ordered->flags)) + return; + + ASSERT(!list_empty(&ordered->list)); + /* The ordered->list can be empty in the above pre-alloc case. */ + sum = list_first_entry(&ordered->list, struct btrfs_ordered_sum, list); + logical = sum->logical; + len = sum->len; while (len < ordered->disk_num_bytes) { sum = list_next_entry(sum, list); @@ -1747,41 +1870,121 @@ out: } } -bool btrfs_check_meta_write_pointer(struct btrfs_fs_info *fs_info, - struct extent_buffer *eb, - struct btrfs_block_group **cache_ret) +static bool check_bg_is_active(struct btrfs_eb_write_context *ctx, + struct btrfs_block_group **active_bg) { - struct btrfs_block_group *cache; - bool ret = true; + const struct writeback_control *wbc = ctx->wbc; + struct btrfs_block_group *block_group = ctx->zoned_bg; + struct btrfs_fs_info *fs_info = block_group->fs_info; - if (!btrfs_is_zoned(fs_info)) + if (test_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &block_group->runtime_flags)) return true; - cache = btrfs_lookup_block_group(fs_info, eb->start); - if (!cache) - return true; + if (fs_info->treelog_bg == block_group->start) { + if (!btrfs_zone_activate(block_group)) { + int ret_fin = btrfs_zone_finish_one_bg(fs_info); - if (cache->meta_write_pointer != eb->start) { - btrfs_put_block_group(cache); - cache = NULL; - ret = false; - } else { - cache->meta_write_pointer = eb->start + eb->len; - } + if (ret_fin != 1 || !btrfs_zone_activate(block_group)) + return false; + } + } else if (*active_bg != block_group) { + struct btrfs_block_group *tgt = *active_bg; - *cache_ret = cache; + /* zoned_meta_io_lock protects fs_info->active_{meta,system}_bg. */ + lockdep_assert_held(&fs_info->zoned_meta_io_lock); - return ret; + if (tgt) { + /* + * If there is an unsent IO left in the allocated area, + * we cannot wait for them as it may cause a deadlock. + */ + if (tgt->meta_write_pointer < tgt->start + tgt->alloc_offset) { + if (wbc->sync_mode == WB_SYNC_NONE || + (wbc->sync_mode == WB_SYNC_ALL && !wbc->for_sync)) + return false; + } + + /* Pivot active metadata/system block group. */ + btrfs_zoned_meta_io_unlock(fs_info); + wait_eb_writebacks(tgt); + do_zone_finish(tgt, true); + btrfs_zoned_meta_io_lock(fs_info); + if (*active_bg == tgt) { + btrfs_put_block_group(tgt); + *active_bg = NULL; + } + } + if (!btrfs_zone_activate(block_group)) + return false; + if (*active_bg != block_group) { + ASSERT(*active_bg == NULL); + *active_bg = block_group; + btrfs_get_block_group(block_group); + } + } + + return true; } -void btrfs_revert_meta_write_pointer(struct btrfs_block_group *cache, - struct extent_buffer *eb) +/* + * Check if @ctx->eb is aligned to the write pointer. + * + * Return: + * 0: @ctx->eb is at the write pointer. You can write it. + * -EAGAIN: There is a hole. The caller should handle the case. + * -EBUSY: There is a hole, but the caller can just bail out. + */ +int btrfs_check_meta_write_pointer(struct btrfs_fs_info *fs_info, + struct btrfs_eb_write_context *ctx) { - if (!btrfs_is_zoned(eb->fs_info) || !cache) - return; + const struct writeback_control *wbc = ctx->wbc; + const struct extent_buffer *eb = ctx->eb; + struct btrfs_block_group *block_group = ctx->zoned_bg; + + if (!btrfs_is_zoned(fs_info)) + return 0; + + if (block_group) { + if (block_group->start > eb->start || + block_group->start + block_group->length <= eb->start) { + btrfs_put_block_group(block_group); + block_group = NULL; + ctx->zoned_bg = NULL; + } + } + + if (!block_group) { + block_group = btrfs_lookup_block_group(fs_info, eb->start); + if (!block_group) + return 0; + ctx->zoned_bg = block_group; + } + + if (block_group->meta_write_pointer == eb->start) { + struct btrfs_block_group **tgt; - ASSERT(cache->meta_write_pointer == eb->start + eb->len); - cache->meta_write_pointer = eb->start; + if (!test_bit(BTRFS_FS_ACTIVE_ZONE_TRACKING, &fs_info->flags)) + return 0; + + if (block_group->flags & BTRFS_BLOCK_GROUP_SYSTEM) + tgt = &fs_info->active_system_bg; + else + tgt = &fs_info->active_meta_bg; + if (check_bg_is_active(ctx, tgt)) + return 0; + } + + /* + * Since we may release fs_info->zoned_meta_io_lock, someone can already + * start writing this eb. In that case, we can just bail out. + */ + if (block_group->meta_write_pointer > eb->start) + return -EBUSY; + + /* If for_sync, this hole will be filled with trasnsaction commit. */ + if (wbc->sync_mode == WB_SYNC_ALL && !wbc->for_sync) + return -EAGAIN; + return -EBUSY; } int btrfs_zoned_issue_zeroout(struct btrfs_device *device, u64 physical, u64 length) @@ -1803,7 +2006,7 @@ static int read_zone_info(struct btrfs_fs_info *fs_info, u64 logical, int i, ret; ret = btrfs_map_block(fs_info, BTRFS_MAP_GET_READ_MIRRORS, logical, - &mapped_length, &bioc, NULL, NULL, 1); + &mapped_length, &bioc, NULL, NULL); if (ret || !bioc || mapped_length < PAGE_SIZE) { ret = -EIO; goto out_put_bioc; @@ -1879,10 +2082,10 @@ int btrfs_sync_zone_write_pointer(struct btrfs_device *tgt_dev, u64 logical, bool btrfs_zone_activate(struct btrfs_block_group *block_group) { struct btrfs_fs_info *fs_info = block_group->fs_info; - struct btrfs_space_info *space_info = block_group->space_info; struct map_lookup *map; struct btrfs_device *device; u64 physical; + const bool is_data = (block_group->flags & BTRFS_BLOCK_GROUP_DATA); bool ret; int i; @@ -1891,7 +2094,6 @@ bool btrfs_zone_activate(struct btrfs_block_group *block_group) map = block_group->physical_map; - spin_lock(&space_info->lock); spin_lock(&block_group->lock); if (test_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &block_group->runtime_flags)) { ret = true; @@ -1904,30 +2106,44 @@ bool btrfs_zone_activate(struct btrfs_block_group *block_group) goto out_unlock; } + spin_lock(&fs_info->zone_active_bgs_lock); for (i = 0; i < map->num_stripes; i++) { + struct btrfs_zoned_device_info *zinfo; + int reserved = 0; + device = map->stripes[i].dev; physical = map->stripes[i].physical; + zinfo = device->zone_info; - if (device->zone_info->max_active_zones == 0) + if (zinfo->max_active_zones == 0) continue; + if (is_data) + reserved = zinfo->reserved_active_zones; + /* + * For the data block group, leave active zones for one + * metadata block group and one system block group. + */ + if (atomic_read(&zinfo->active_zones_left) <= reserved) { + ret = false; + spin_unlock(&fs_info->zone_active_bgs_lock); + goto out_unlock; + } + if (!btrfs_dev_set_active_zone(device, physical)) { /* Cannot activate the zone */ ret = false; + spin_unlock(&fs_info->zone_active_bgs_lock); goto out_unlock; } + if (!is_data) + zinfo->reserved_active_zones--; } + spin_unlock(&fs_info->zone_active_bgs_lock); /* Successfully activated all the zones */ set_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &block_group->runtime_flags); - WARN_ON(block_group->alloc_offset != 0); - if (block_group->zone_unusable == block_group->length) { - block_group->zone_unusable = block_group->length - block_group->zone_capacity; - space_info->bytes_zone_unusable -= block_group->zone_capacity; - } spin_unlock(&block_group->lock); - btrfs_try_granting_tickets(fs_info, space_info); - spin_unlock(&space_info->lock); /* For the active block group list */ btrfs_get_block_group(block_group); @@ -1940,7 +2156,6 @@ bool btrfs_zone_activate(struct btrfs_block_group *block_group) out_unlock: spin_unlock(&block_group->lock); - spin_unlock(&space_info->lock); return ret; } @@ -2006,6 +2221,10 @@ static int do_zone_finish(struct btrfs_block_group *block_group, bool fully_writ * and block_group->meta_write_pointer for metadata. */ if (!fully_written) { + if (test_bit(BLOCK_GROUP_FLAG_ZONED_DATA_RELOC, &block_group->runtime_flags)) { + spin_unlock(&block_group->lock); + return -EAGAIN; + } spin_unlock(&block_group->lock); ret = btrfs_inc_block_group_ro(block_group, false); @@ -2034,7 +2253,9 @@ static int do_zone_finish(struct btrfs_block_group *block_group, bool fully_writ return 0; } - if (block_group->reserved) { + if (block_group->reserved || + test_bit(BLOCK_GROUP_FLAG_ZONED_DATA_RELOC, + &block_group->runtime_flags)) { spin_unlock(&block_group->lock); btrfs_dec_block_group_ro(block_group); return -EAGAIN; @@ -2043,6 +2264,9 @@ static int do_zone_finish(struct btrfs_block_group *block_group, bool fully_writ clear_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &block_group->runtime_flags); block_group->alloc_offset = block_group->zone_capacity; + if (block_group->flags & (BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_SYSTEM)) + block_group->meta_write_pointer = block_group->start + + block_group->zone_capacity; block_group->free_space_ctl->free_space = 0; btrfs_clear_treelog_bg(block_group); btrfs_clear_data_reloc_bg(block_group); @@ -2052,18 +2276,21 @@ static int do_zone_finish(struct btrfs_block_group *block_group, bool fully_writ for (i = 0; i < map->num_stripes; i++) { struct btrfs_device *device = map->stripes[i].dev; const u64 physical = map->stripes[i].physical; + struct btrfs_zoned_device_info *zinfo = device->zone_info; - if (device->zone_info->max_active_zones == 0) + if (zinfo->max_active_zones == 0) continue; ret = blkdev_zone_mgmt(device->bdev, REQ_OP_ZONE_FINISH, physical >> SECTOR_SHIFT, - device->zone_info->zone_size >> SECTOR_SHIFT, + zinfo->zone_size >> SECTOR_SHIFT, GFP_NOFS); if (ret) return ret; + if (!(block_group->flags & BTRFS_BLOCK_GROUP_DATA)) + zinfo->reserved_active_zones++; btrfs_dev_clear_active_zone(device, physical); } @@ -2102,8 +2329,10 @@ bool btrfs_can_activate_zone(struct btrfs_fs_devices *fs_devices, u64 flags) /* Check if there is a device with active zones left */ mutex_lock(&fs_info->chunk_mutex); + spin_lock(&fs_info->zone_active_bgs_lock); list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) { struct btrfs_zoned_device_info *zinfo = device->zone_info; + int reserved = 0; if (!device->bdev) continue; @@ -2113,17 +2342,21 @@ bool btrfs_can_activate_zone(struct btrfs_fs_devices *fs_devices, u64 flags) break; } + if (flags & BTRFS_BLOCK_GROUP_DATA) + reserved = zinfo->reserved_active_zones; + switch (flags & BTRFS_BLOCK_GROUP_PROFILE_MASK) { case 0: /* single */ - ret = (atomic_read(&zinfo->active_zones_left) >= 1); + ret = (atomic_read(&zinfo->active_zones_left) >= (1 + reserved)); break; case BTRFS_BLOCK_GROUP_DUP: - ret = (atomic_read(&zinfo->active_zones_left) >= 2); + ret = (atomic_read(&zinfo->active_zones_left) >= (2 + reserved)); break; } if (ret) break; } + spin_unlock(&fs_info->zone_active_bgs_lock); mutex_unlock(&fs_info->chunk_mutex); if (!ret) @@ -2265,7 +2498,10 @@ void btrfs_zoned_release_data_reloc_bg(struct btrfs_fs_info *fs_info, u64 logica /* All relocation extents are written. */ if (block_group->start + block_group->alloc_offset == logical + length) { - /* Now, release this block group for further allocations. */ + /* + * Now, release this block group for further allocations and + * zone finish. + */ clear_bit(BLOCK_GROUP_FLAG_ZONED_DATA_RELOC, &block_group->runtime_flags); } @@ -2289,7 +2525,8 @@ int btrfs_zone_finish_one_bg(struct btrfs_fs_info *fs_info) spin_lock(&block_group->lock); if (block_group->reserved || block_group->alloc_offset == 0 || - (block_group->flags & BTRFS_BLOCK_GROUP_SYSTEM)) { + (block_group->flags & BTRFS_BLOCK_GROUP_SYSTEM) || + test_bit(BLOCK_GROUP_FLAG_ZONED_DATA_RELOC, &block_group->runtime_flags)) { spin_unlock(&block_group->lock); continue; } @@ -2365,3 +2602,55 @@ int btrfs_zoned_activate_one_bg(struct btrfs_fs_info *fs_info, return 0; } + +/* + * Reserve zones for one metadata block group, one tree-log block group, and one + * system block group. + */ +void btrfs_check_active_zone_reservation(struct btrfs_fs_info *fs_info) +{ + struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; + struct btrfs_block_group *block_group; + struct btrfs_device *device; + /* Reserve zones for normal SINGLE metadata and tree-log block group. */ + unsigned int metadata_reserve = 2; + /* Reserve a zone for SINGLE system block group. */ + unsigned int system_reserve = 1; + + if (!test_bit(BTRFS_FS_ACTIVE_ZONE_TRACKING, &fs_info->flags)) + return; + + /* + * This function is called from the mount context. So, there is no + * parallel process touching the bits. No need for read_seqretry(). + */ + if (fs_info->avail_metadata_alloc_bits & BTRFS_BLOCK_GROUP_DUP) + metadata_reserve = 4; + if (fs_info->avail_system_alloc_bits & BTRFS_BLOCK_GROUP_DUP) + system_reserve = 2; + + /* Apply the reservation on all the devices. */ + mutex_lock(&fs_devices->device_list_mutex); + list_for_each_entry(device, &fs_devices->devices, dev_list) { + if (!device->bdev) + continue; + + device->zone_info->reserved_active_zones = + metadata_reserve + system_reserve; + } + mutex_unlock(&fs_devices->device_list_mutex); + + /* Release reservation for currently active block groups. */ + spin_lock(&fs_info->zone_active_bgs_lock); + list_for_each_entry(block_group, &fs_info->zone_active_bgs, active_bg_list) { + struct map_lookup *map = block_group->physical_map; + + if (!(block_group->flags & + (BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_SYSTEM))) + continue; + + for (int i = 0; i < map->num_stripes; i++) + map->stripes[i].dev->zone_info->reserved_active_zones--; + } + spin_unlock(&fs_info->zone_active_bgs_lock); +} diff --git a/fs/btrfs/zoned.h b/fs/btrfs/zoned.h index 27322b926038..b9cec523b778 100644 --- a/fs/btrfs/zoned.h +++ b/fs/btrfs/zoned.h @@ -22,6 +22,11 @@ struct btrfs_zoned_device_info { u8 zone_size_shift; u32 nr_zones; unsigned int max_active_zones; + /* + * Reserved active zones for one metadata and one system block group. + * It can vary per-device depending on the allocation status. + */ + int reserved_active_zones; atomic_t active_zones_left; unsigned long *seq_zones; unsigned long *empty_zones; @@ -58,11 +63,8 @@ void btrfs_redirty_list_add(struct btrfs_transaction *trans, struct extent_buffer *eb); bool btrfs_use_zone_append(struct btrfs_bio *bbio); void btrfs_record_physical_zoned(struct btrfs_bio *bbio); -bool btrfs_check_meta_write_pointer(struct btrfs_fs_info *fs_info, - struct extent_buffer *eb, - struct btrfs_block_group **cache_ret); -void btrfs_revert_meta_write_pointer(struct btrfs_block_group *cache, - struct extent_buffer *eb); +int btrfs_check_meta_write_pointer(struct btrfs_fs_info *fs_info, + struct btrfs_eb_write_context *ctx); int btrfs_zoned_issue_zeroout(struct btrfs_device *device, u64 physical, u64 length); int btrfs_sync_zone_write_pointer(struct btrfs_device *tgt_dev, u64 logical, u64 physical_start, u64 physical_pos); @@ -81,6 +83,7 @@ void btrfs_zoned_release_data_reloc_bg(struct btrfs_fs_info *fs_info, u64 logica int btrfs_zone_finish_one_bg(struct btrfs_fs_info *fs_info); int btrfs_zoned_activate_one_bg(struct btrfs_fs_info *fs_info, struct btrfs_space_info *space_info, bool do_finish); +void btrfs_check_active_zone_reservation(struct btrfs_fs_info *fs_info); #else /* CONFIG_BLK_DEV_ZONED */ static inline int btrfs_get_dev_zone(struct btrfs_device *device, u64 pos, struct blk_zone *zone) @@ -189,17 +192,10 @@ static inline void btrfs_record_physical_zoned(struct btrfs_bio *bbio) { } -static inline bool btrfs_check_meta_write_pointer(struct btrfs_fs_info *fs_info, - struct extent_buffer *eb, - struct btrfs_block_group **cache_ret) -{ - return true; -} - -static inline void btrfs_revert_meta_write_pointer( - struct btrfs_block_group *cache, - struct extent_buffer *eb) +static inline int btrfs_check_meta_write_pointer(struct btrfs_fs_info *fs_info, + struct btrfs_eb_write_context *ctx) { + return 0; } static inline int btrfs_zoned_issue_zeroout(struct btrfs_device *device, @@ -262,6 +258,8 @@ static inline int btrfs_zoned_activate_one_bg(struct btrfs_fs_info *fs_info, return 0; } +static inline void btrfs_check_active_zone_reservation(struct btrfs_fs_info *fs_info) { } + #endif static inline bool btrfs_dev_is_sequential(struct btrfs_device *device, u64 pos) diff --git a/fs/btrfs/zstd.c b/fs/btrfs/zstd.c index e7ac4ec809a4..5511766485cd 100644 --- a/fs/btrfs/zstd.c +++ b/fs/btrfs/zstd.c @@ -145,7 +145,7 @@ static void zstd_reclaim_timer_fn(struct timer_list *timer) } /* - * zstd_calc_ws_mem_sizes - calculate monotonic memory bounds + * Calculate monotonic memory bounds. * * It is possible based on the level configurations that a higher level * workspace uses less memory than a lower level workspace. In order to reuse @@ -218,7 +218,8 @@ void zstd_cleanup_workspace_manager(void) } /* - * zstd_find_workspace - find workspace + * Find workspace for given level. + * * @level: compression level * * This iterates over the set bits in the active_map beginning at the requested @@ -256,7 +257,8 @@ static struct list_head *zstd_find_workspace(unsigned int level) } /* - * zstd_get_workspace - zstd's get_workspace + * Zstd get_workspace for level. + * * @level: compression level * * If @level is 0, then any compression level can be used. Therefore, we begin @@ -296,7 +298,8 @@ again: } /* - * zstd_put_workspace - zstd put_workspace + * Zstd put_workspace. + * * @ws: list_head for the workspace * * When putting back a workspace, we only need to update the LRU if we are of diff --git a/fs/buffer.c b/fs/buffer.c index bd091329026c..967f34b70aa8 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -49,6 +49,7 @@ #include <trace/events/block.h> #include <linux/fscrypt.h> #include <linux/fsverity.h> +#include <linux/sched/isolation.h> #include "internal.h" @@ -281,13 +282,7 @@ static void end_buffer_async_read(struct buffer_head *bh, int uptodate) } while (tmp != bh); spin_unlock_irqrestore(&first->b_uptodate_lock, flags); - /* - * If all of the buffers are uptodate then we can set the page - * uptodate. - */ - if (folio_uptodate) - folio_mark_uptodate(folio); - folio_unlock(folio); + folio_end_read(folio, folio_uptodate); return; still_busy: @@ -562,12 +557,6 @@ repeat: return err; } -void emergency_thaw_bdev(struct super_block *sb) -{ - while (sb->s_bdev && !thaw_bdev(sb->s_bdev)) - printk(KERN_WARNING "Emergency Thaw on %pg\n", sb->s_bdev); -} - /** * sync_mapping_buffers - write out & wait upon a mapping's "associated" buffers * @mapping: the mapping which wants those buffers written @@ -920,16 +909,12 @@ int remove_inode_buffers(struct inode *inode) * which may not fail from ordinary buffer allocations. */ struct buffer_head *folio_alloc_buffers(struct folio *folio, unsigned long size, - bool retry) + gfp_t gfp) { struct buffer_head *bh, *head; - gfp_t gfp = GFP_NOFS | __GFP_ACCOUNT; long offset; struct mem_cgroup *memcg, *old_memcg; - if (retry) - gfp |= __GFP_NOFAIL; - /* The folio lock pins the memcg */ memcg = folio_memcg(folio); old_memcg = set_active_memcg(memcg); @@ -972,7 +957,11 @@ EXPORT_SYMBOL_GPL(folio_alloc_buffers); struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size, bool retry) { - return folio_alloc_buffers(page_folio(page), size, retry); + gfp_t gfp = GFP_NOFS | __GFP_ACCOUNT; + if (retry) + gfp |= __GFP_NOFAIL; + + return folio_alloc_buffers(page_folio(page), size, gfp); } EXPORT_SYMBOL_GPL(alloc_page_buffers); @@ -1048,20 +1037,11 @@ grow_dev_page(struct block_device *bdev, sector_t block, struct buffer_head *bh; sector_t end_block; int ret = 0; - gfp_t gfp_mask; - - gfp_mask = mapping_gfp_constraint(inode->i_mapping, ~__GFP_FS) | gfp; - - /* - * XXX: __getblk_slow() can not really deal with failure and - * will endlessly loop on improvised global reclaim. Prefer - * looping in the allocator rather than here, at least that - * code knows what it's doing. - */ - gfp_mask |= __GFP_NOFAIL; folio = __filemap_get_folio(inode->i_mapping, index, - FGP_LOCK | FGP_ACCESSED | FGP_CREAT, gfp_mask); + FGP_LOCK | FGP_ACCESSED | FGP_CREAT, gfp); + if (IS_ERR(folio)) + return PTR_ERR(folio); bh = folio_buffers(folio); if (bh) { @@ -1074,7 +1054,10 @@ grow_dev_page(struct block_device *bdev, sector_t block, goto failed; } - bh = folio_alloc_buffers(folio, size, true); + ret = -ENOMEM; + bh = folio_alloc_buffers(folio, size, gfp | __GFP_ACCOUNT); + if (!bh) + goto failed; /* * Link the folio to the buffers and initialise them. Take the @@ -1225,19 +1208,14 @@ EXPORT_SYMBOL(mark_buffer_dirty); void mark_buffer_write_io_error(struct buffer_head *bh) { - struct super_block *sb; - set_buffer_write_io_error(bh); /* FIXME: do we need to set this in both places? */ if (bh->b_folio && bh->b_folio->mapping) mapping_set_error(bh->b_folio->mapping, -EIO); - if (bh->b_assoc_map) + if (bh->b_assoc_map) { mapping_set_error(bh->b_assoc_map, -EIO); - rcu_read_lock(); - sb = READ_ONCE(bh->b_bdev->bd_super); - if (sb) - errseq_set(&sb->s_wb_err, -EIO); - rcu_read_unlock(); + errseq_set(&bh->b_assoc_map->host->i_sb->s_wb_err, -EIO); + } } EXPORT_SYMBOL(mark_buffer_write_io_error); @@ -1352,7 +1330,7 @@ static void bh_lru_install(struct buffer_head *bh) * failing page migration. * Skip putting upcoming bh into bh_lru until migration is done. */ - if (lru_cache_disabled()) { + if (lru_cache_disabled() || cpu_is_isolated(smp_processor_id())) { bh_lru_unlock(); return; } @@ -1382,6 +1360,10 @@ lookup_bh_lru(struct block_device *bdev, sector_t block, unsigned size) check_irqs_on(); bh_lru_lock(); + if (cpu_is_isolated(smp_processor_id())) { + bh_lru_unlock(); + return NULL; + } for (i = 0; i < BH_LRU_SIZE; i++) { struct buffer_head *bh = __this_cpu_read(bh_lrus.bhs[i]); @@ -1426,33 +1408,36 @@ __find_get_block(struct block_device *bdev, sector_t block, unsigned size) } EXPORT_SYMBOL(__find_get_block); -/* - * __getblk_gfp() will locate (and, if necessary, create) the buffer_head - * which corresponds to the passed block_device, block and size. The - * returned buffer has its reference count incremented. +/** + * bdev_getblk - Get a buffer_head in a block device's buffer cache. + * @bdev: The block device. + * @block: The block number. + * @size: The size of buffer_heads for this @bdev. + * @gfp: The memory allocation flags to use. * - * __getblk_gfp() will lock up the machine if grow_dev_page's - * try_to_free_buffers() attempt is failing. FIXME, perhaps? + * Return: The buffer head, or NULL if memory could not be allocated. */ -struct buffer_head * -__getblk_gfp(struct block_device *bdev, sector_t block, - unsigned size, gfp_t gfp) +struct buffer_head *bdev_getblk(struct block_device *bdev, sector_t block, + unsigned size, gfp_t gfp) { struct buffer_head *bh = __find_get_block(bdev, block, size); - might_sleep(); - if (bh == NULL) - bh = __getblk_slow(bdev, block, size, gfp); - return bh; + might_alloc(gfp); + if (bh) + return bh; + + return __getblk_slow(bdev, block, size, gfp); } -EXPORT_SYMBOL(__getblk_gfp); +EXPORT_SYMBOL(bdev_getblk); /* * Do async read-ahead on a buffer.. */ void __breadahead(struct block_device *bdev, sector_t block, unsigned size) { - struct buffer_head *bh = __getblk(bdev, block, size); + struct buffer_head *bh = bdev_getblk(bdev, block, size, + GFP_NOWAIT | __GFP_MOVABLE); + if (likely(bh)) { bh_readahead(bh, REQ_RAHEAD); brelse(bh); @@ -1476,7 +1461,17 @@ struct buffer_head * __bread_gfp(struct block_device *bdev, sector_t block, unsigned size, gfp_t gfp) { - struct buffer_head *bh = __getblk_gfp(bdev, block, size, gfp); + struct buffer_head *bh; + + gfp |= mapping_gfp_constraint(bdev->bd_inode->i_mapping, ~__GFP_FS); + + /* + * Prefer looping in the allocator rather than here, at least that + * code knows what it's doing. + */ + gfp |= __GFP_NOFAIL; + + bh = bdev_getblk(bdev, block, size, gfp); if (likely(bh) && !buffer_uptodate(bh)) bh = __bread_slow(bh); @@ -1539,21 +1534,6 @@ void invalidate_bh_lrus_cpu(void) bh_lru_unlock(); } -void set_bh_page(struct buffer_head *bh, - struct page *page, unsigned long offset) -{ - bh->b_page = page; - BUG_ON(offset >= PAGE_SIZE); - if (PageHighMem(page)) - /* - * This catches illegal uses and preserves the offset: - */ - bh->b_data = (char *)(0 + offset); - else - bh->b_data = page_address(page) + offset; -} -EXPORT_SYMBOL(set_bh_page); - void folio_set_bh(struct buffer_head *bh, struct folio *folio, unsigned long offset) { @@ -1661,12 +1641,13 @@ EXPORT_SYMBOL(block_invalidate_folio); * block_dirty_folio() via private_lock. try_to_free_buffers * is already excluded via the folio lock. */ -void folio_create_empty_buffers(struct folio *folio, unsigned long blocksize, - unsigned long b_state) +struct buffer_head *create_empty_buffers(struct folio *folio, + unsigned long blocksize, unsigned long b_state) { struct buffer_head *bh, *head, *tail; + gfp_t gfp = GFP_NOFS | __GFP_ACCOUNT | __GFP_NOFAIL; - head = folio_alloc_buffers(folio, blocksize, true); + head = folio_alloc_buffers(folio, blocksize, gfp); bh = head; do { bh->b_state |= b_state; @@ -1688,13 +1669,8 @@ void folio_create_empty_buffers(struct folio *folio, unsigned long blocksize, } folio_attach_private(folio, head); spin_unlock(&folio->mapping->private_lock); -} -EXPORT_SYMBOL(folio_create_empty_buffers); -void create_empty_buffers(struct page *page, - unsigned long blocksize, unsigned long b_state) -{ - folio_create_empty_buffers(page_folio(page), blocksize, b_state); + return head; } EXPORT_SYMBOL(create_empty_buffers); @@ -1789,13 +1765,15 @@ static struct buffer_head *folio_create_buffers(struct folio *folio, struct inode *inode, unsigned int b_state) { + struct buffer_head *bh; + BUG_ON(!folio_test_locked(folio)); - if (!folio_buffers(folio)) - folio_create_empty_buffers(folio, - 1 << READ_ONCE(inode->i_blkbits), - b_state); - return folio_buffers(folio); + bh = folio_buffers(folio); + if (!bh) + bh = create_empty_buffers(folio, + 1 << READ_ONCE(inode->i_blkbits), b_state); + return bh; } /* @@ -2032,7 +2010,7 @@ void folio_zero_new_buffers(struct folio *folio, size_t from, size_t to) } EXPORT_SYMBOL(folio_zero_new_buffers); -static void +static int iomap_to_bh(struct inode *inode, sector_t block, struct buffer_head *bh, const struct iomap *iomap) { @@ -2046,7 +2024,8 @@ iomap_to_bh(struct inode *inode, sector_t block, struct buffer_head *bh, * current block, then do not map the buffer and let the caller * handle it. */ - BUG_ON(offset >= iomap->offset + iomap->length); + if (offset >= iomap->offset + iomap->length) + return -EIO; switch (iomap->type) { case IOMAP_HOLE: @@ -2058,7 +2037,7 @@ iomap_to_bh(struct inode *inode, sector_t block, struct buffer_head *bh, if (!buffer_uptodate(bh) || (offset >= i_size_read(inode))) set_buffer_new(bh); - break; + return 0; case IOMAP_DELALLOC: if (!buffer_uptodate(bh) || (offset >= i_size_read(inode))) @@ -2066,7 +2045,7 @@ iomap_to_bh(struct inode *inode, sector_t block, struct buffer_head *bh, set_buffer_uptodate(bh); set_buffer_mapped(bh); set_buffer_delay(bh); - break; + return 0; case IOMAP_UNWRITTEN: /* * For unwritten regions, we always need to ensure that regions @@ -2078,12 +2057,24 @@ iomap_to_bh(struct inode *inode, sector_t block, struct buffer_head *bh, fallthrough; case IOMAP_MAPPED: if ((iomap->flags & IOMAP_F_NEW) || - offset >= i_size_read(inode)) + offset >= i_size_read(inode)) { + /* + * This can happen if truncating the block device races + * with the check in the caller as i_size updates on + * block devices aren't synchronized by i_rwsem for + * block devices. + */ + if (S_ISBLK(inode->i_mode)) + return -EIO; set_buffer_new(bh); + } bh->b_blocknr = (iomap->addr + offset - iomap->offset) >> inode->i_blkbits; set_buffer_mapped(bh); - break; + return 0; + default: + WARN_ON_ONCE(1); + return -EIO; } } @@ -2124,13 +2115,12 @@ int __block_write_begin_int(struct folio *folio, loff_t pos, unsigned len, clear_buffer_new(bh); if (!buffer_mapped(bh)) { WARN_ON(bh->b_size != blocksize); - if (get_block) { + if (get_block) err = get_block(inode, block, bh, 1); - if (err) - break; - } else { - iomap_to_bh(inode, block, bh, iomap); - } + else + err = iomap_to_bh(inode, block, bh, iomap); + if (err) + break; if (buffer_new(bh)) { clean_bdev_bh_alias(bh); @@ -2180,8 +2170,7 @@ int __block_write_begin(struct page *page, loff_t pos, unsigned len, } EXPORT_SYMBOL(__block_write_begin); -static int __block_commit_write(struct inode *inode, struct folio *folio, - size_t from, size_t to) +static void __block_commit_write(struct folio *folio, size_t from, size_t to) { size_t block_start, block_end; bool partial = false; @@ -2216,7 +2205,6 @@ static int __block_commit_write(struct inode *inode, struct folio *folio, */ if (!partial) folio_mark_uptodate(folio); - return 0; } /* @@ -2253,7 +2241,6 @@ int block_write_end(struct file *file, struct address_space *mapping, struct page *page, void *fsdata) { struct folio *folio = page_folio(page); - struct inode *inode = mapping->host; size_t start = pos - folio_pos(folio); if (unlikely(copied < len)) { @@ -2277,7 +2264,7 @@ int block_write_end(struct file *file, struct address_space *mapping, flush_dcache_folio(folio); /* This could be a short (even 0-length) commit */ - __block_commit_write(inode, folio, start, start + copied); + __block_commit_write(folio, start, start + copied); return copied; } @@ -2437,12 +2424,10 @@ int block_read_full_folio(struct folio *folio, get_block_t *get_block) if (!nr) { /* - * All buffers are uptodate - we can set the folio uptodate - * as well. But not if get_block() returned an error. + * All buffers are uptodate or get_block() returned an + * error when trying to map them - we can finish the read. */ - if (!page_error) - folio_mark_uptodate(folio); - folio_unlock(folio); + folio_end_read(folio, !page_error); return 0; } @@ -2598,12 +2583,10 @@ int cont_write_begin(struct file *file, struct address_space *mapping, } EXPORT_SYMBOL(cont_write_begin); -int block_commit_write(struct page *page, unsigned from, unsigned to) +void block_commit_write(struct page *page, unsigned from, unsigned to) { struct folio *folio = page_folio(page); - struct inode *inode = folio->mapping->host; - __block_commit_write(inode, folio, from, to); - return 0; + __block_commit_write(folio, from, to); } EXPORT_SYMBOL(block_commit_write); @@ -2649,11 +2632,11 @@ int block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf, end = size - folio_pos(folio); ret = __block_write_begin_int(folio, 0, end, get_block, NULL); - if (!ret) - ret = __block_commit_write(inode, folio, 0, end); - - if (unlikely(ret < 0)) + if (unlikely(ret)) goto out_unlock; + + __block_commit_write(folio, 0, end); + folio_mark_dirty(folio); folio_wait_stable(folio); return 0; @@ -2690,10 +2673,8 @@ int block_truncate_page(struct address_space *mapping, return PTR_ERR(folio); bh = folio_buffers(folio); - if (!bh) { - folio_create_empty_buffers(folio, blocksize, 0); - bh = folio_buffers(folio); - } + if (!bh) + bh = create_empty_buffers(folio, blocksize, 0); /* Find the buffer that contains "offset" */ offset = offset_in_folio(folio, from); @@ -3002,13 +2983,13 @@ EXPORT_SYMBOL(try_to_free_buffers); /* * Buffer-head allocation */ -static struct kmem_cache *bh_cachep __read_mostly; +static struct kmem_cache *bh_cachep __ro_after_init; /* * Once the number of bh's in the machine exceeds this level, we start * stripping them in writeback. */ -static unsigned long max_buffer_heads; +static unsigned long max_buffer_heads __ro_after_init; int buffer_heads_over_limit; diff --git a/fs/cachefiles/io.c b/fs/cachefiles/io.c index 175a25fcade8..009d23cd435b 100644 --- a/fs/cachefiles/io.c +++ b/fs/cachefiles/io.c @@ -259,9 +259,7 @@ static void cachefiles_write_complete(struct kiocb *iocb, long ret) _enter("%ld", ret); - /* Tell lockdep we inherited freeze protection from submission thread */ - __sb_writers_acquired(inode->i_sb, SB_FREEZE_WRITE); - __sb_end_write(inode->i_sb, SB_FREEZE_WRITE); + kiocb_end_write(iocb); if (ret < 0) trace_cachefiles_io_error(object, inode, ret, @@ -286,7 +284,6 @@ int __cachefiles_write(struct cachefiles_object *object, { struct cachefiles_cache *cache; struct cachefiles_kiocb *ki; - struct inode *inode; unsigned int old_nofs; ssize_t ret; size_t len = iov_iter_count(iter); @@ -322,19 +319,12 @@ int __cachefiles_write(struct cachefiles_object *object, ki->iocb.ki_complete = cachefiles_write_complete; atomic_long_add(ki->b_writing, &cache->b_writing); - /* Open-code file_start_write here to grab freeze protection, which - * will be released by another thread in aio_complete_rw(). Fool - * lockdep by telling it the lock got released so that it doesn't - * complain about the held lock when we return to userspace. - */ - inode = file_inode(file); - __sb_start_write(inode->i_sb, SB_FREEZE_WRITE); - __sb_writers_release(inode->i_sb, SB_FREEZE_WRITE); + kiocb_start_write(&ki->iocb); get_file(ki->iocb.ki_filp); cachefiles_grab_object(object, cachefiles_obj_get_ioreq); - trace_cachefiles_write(object, inode, ki->iocb.ki_pos, len); + trace_cachefiles_write(object, file_inode(file), ki->iocb.ki_pos, len); old_nofs = memalloc_nofs_save(); ret = cachefiles_inject_write_error(); if (ret == 0) diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c index d9d22d0ec38a..7bf7a5fcc045 100644 --- a/fs/cachefiles/namei.c +++ b/fs/cachefiles/namei.c @@ -585,6 +585,8 @@ static bool cachefiles_open_file(struct cachefiles_object *object, if (ret < 0) goto check_failed; + clear_bit(FSCACHE_COOKIE_NO_DATA_TO_READ, &object->cookie->flags); + object->file = file; /* Always update the atime on an object we've just looked up (this is diff --git a/fs/ceph/Makefile b/fs/ceph/Makefile index 50c635dc7f71..1f77ca04c426 100644 --- a/fs/ceph/Makefile +++ b/fs/ceph/Makefile @@ -12,3 +12,4 @@ ceph-y := super.o inode.o dir.o file.o locks.o addr.o ioctl.o \ ceph-$(CONFIG_CEPH_FSCACHE) += cache.o ceph-$(CONFIG_CEPH_FS_POSIX_ACL) += acl.o +ceph-$(CONFIG_FS_ENCRYPTION) += crypto.o diff --git a/fs/ceph/acl.c b/fs/ceph/acl.c index 6945a938d396..1564eacc253d 100644 --- a/fs/ceph/acl.c +++ b/fs/ceph/acl.c @@ -15,6 +15,7 @@ #include <linux/slab.h> #include "super.h" +#include "mds_client.h" static inline void ceph_set_cached_acl(struct inode *inode, int type, struct posix_acl *acl) @@ -31,6 +32,7 @@ static inline void ceph_set_cached_acl(struct inode *inode, struct posix_acl *ceph_get_acl(struct inode *inode, int type, bool rcu) { + struct ceph_client *cl = ceph_inode_to_client(inode); int size; unsigned int retry_cnt = 0; const char *name; @@ -72,8 +74,8 @@ retry: } else if (size == -ENODATA || size == 0) { acl = NULL; } else { - pr_err_ratelimited("get acl %llx.%llx failed, err=%d\n", - ceph_vinop(inode), size); + pr_err_ratelimited_client(cl, "%llx.%llx failed, err=%d\n", + ceph_vinop(inode), size); acl = ERR_PTR(-EIO); } @@ -93,7 +95,7 @@ int ceph_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, char *value = NULL; struct iattr newattrs; struct inode *inode = d_inode(dentry); - struct timespec64 old_ctime = inode->i_ctime; + struct timespec64 old_ctime = inode_get_ctime(inode); umode_t new_mode = inode->i_mode, old_mode = inode->i_mode; if (ceph_snap(inode) != CEPH_NOSNAP) { @@ -105,7 +107,7 @@ int ceph_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, case ACL_TYPE_ACCESS: name = XATTR_NAME_POSIX_ACL_ACCESS; if (acl) { - ret = posix_acl_update_mode(&nop_mnt_idmap, inode, + ret = posix_acl_update_mode(idmap, inode, &new_mode, &acl); if (ret) goto out; @@ -140,7 +142,7 @@ int ceph_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, newattrs.ia_ctime = current_time(inode); newattrs.ia_mode = new_mode; newattrs.ia_valid = ATTR_MODE | ATTR_CTIME; - ret = __ceph_setattr(inode, &newattrs); + ret = __ceph_setattr(idmap, inode, &newattrs, NULL); if (ret) goto out_free; } @@ -151,7 +153,7 @@ int ceph_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, newattrs.ia_ctime = old_ctime; newattrs.ia_mode = old_mode; newattrs.ia_valid = ATTR_MODE | ATTR_CTIME; - __ceph_setattr(inode, &newattrs); + __ceph_setattr(idmap, inode, &newattrs, NULL); } goto out_free; } diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 59cbfb80edbd..85be3bf18cdf 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -18,6 +18,7 @@ #include "mds_client.h" #include "cache.h" #include "metric.h" +#include "crypto.h" #include <linux/ceph/osd_client.h> #include <linux/ceph/striper.h> @@ -78,18 +79,18 @@ static inline struct ceph_snap_context *page_snap_context(struct page *page) */ static bool ceph_dirty_folio(struct address_space *mapping, struct folio *folio) { - struct inode *inode; + struct inode *inode = mapping->host; + struct ceph_client *cl = ceph_inode_to_client(inode); struct ceph_inode_info *ci; struct ceph_snap_context *snapc; if (folio_test_dirty(folio)) { - dout("%p dirty_folio %p idx %lu -- already dirty\n", - mapping->host, folio, folio->index); + doutc(cl, "%llx.%llx %p idx %lu -- already dirty\n", + ceph_vinop(inode), folio, folio->index); VM_BUG_ON_FOLIO(!folio_test_private(folio), folio); return false; } - inode = mapping->host; ci = ceph_inode(inode); /* dirty the head */ @@ -110,12 +111,12 @@ static bool ceph_dirty_folio(struct address_space *mapping, struct folio *folio) if (ci->i_wrbuffer_ref == 0) ihold(inode); ++ci->i_wrbuffer_ref; - dout("%p dirty_folio %p idx %lu head %d/%d -> %d/%d " - "snapc %p seq %lld (%d snaps)\n", - mapping->host, folio, folio->index, - ci->i_wrbuffer_ref-1, ci->i_wrbuffer_ref_head-1, - ci->i_wrbuffer_ref, ci->i_wrbuffer_ref_head, - snapc, snapc->seq, snapc->num_snaps); + doutc(cl, "%llx.%llx %p idx %lu head %d/%d -> %d/%d " + "snapc %p seq %lld (%d snaps)\n", + ceph_vinop(inode), folio, folio->index, + ci->i_wrbuffer_ref-1, ci->i_wrbuffer_ref_head-1, + ci->i_wrbuffer_ref, ci->i_wrbuffer_ref_head, + snapc, snapc->seq, snapc->num_snaps); spin_unlock(&ci->i_ceph_lock); /* @@ -136,23 +137,22 @@ static bool ceph_dirty_folio(struct address_space *mapping, struct folio *folio) static void ceph_invalidate_folio(struct folio *folio, size_t offset, size_t length) { - struct inode *inode; - struct ceph_inode_info *ci; + struct inode *inode = folio->mapping->host; + struct ceph_client *cl = ceph_inode_to_client(inode); + struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_snap_context *snapc; - inode = folio->mapping->host; - ci = ceph_inode(inode); if (offset != 0 || length != folio_size(folio)) { - dout("%p invalidate_folio idx %lu partial dirty page %zu~%zu\n", - inode, folio->index, offset, length); + doutc(cl, "%llx.%llx idx %lu partial dirty page %zu~%zu\n", + ceph_vinop(inode), folio->index, offset, length); return; } WARN_ON(!folio_test_locked(folio)); if (folio_test_private(folio)) { - dout("%p invalidate_folio idx %lu full dirty page\n", - inode, folio->index); + doutc(cl, "%llx.%llx idx %lu full dirty page\n", + ceph_vinop(inode), folio->index); snapc = folio_detach_private(folio); ceph_put_wrbuffer_cap_refs(ci, 1, snapc); @@ -165,10 +165,10 @@ static void ceph_invalidate_folio(struct folio *folio, size_t offset, static bool ceph_release_folio(struct folio *folio, gfp_t gfp) { struct inode *inode = folio->mapping->host; + struct ceph_client *cl = ceph_inode_to_client(inode); - dout("%llx:%llx release_folio idx %lu (%sdirty)\n", - ceph_vinop(inode), - folio->index, folio_test_dirty(folio) ? "" : "not "); + doutc(cl, "%llx.%llx idx %lu (%sdirty)\n", ceph_vinop(inode), + folio->index, folio_test_dirty(folio) ? "" : "not "); if (folio_test_private(folio)) return false; @@ -228,7 +228,7 @@ static void ceph_netfs_expand_readahead(struct netfs_io_request *rreq) static bool ceph_netfs_clamp_length(struct netfs_io_subrequest *subreq) { struct inode *inode = subreq->rreq->inode; - struct ceph_fs_client *fsc = ceph_inode_to_client(inode); + struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode); struct ceph_inode_info *ci = ceph_inode(inode); u64 objno, objoff; u32 xlen; @@ -242,17 +242,20 @@ static bool ceph_netfs_clamp_length(struct netfs_io_subrequest *subreq) static void finish_netfs_read(struct ceph_osd_request *req) { - struct ceph_fs_client *fsc = ceph_inode_to_client(req->r_inode); + struct inode *inode = req->r_inode; + struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode); + struct ceph_client *cl = fsc->client; struct ceph_osd_data *osd_data = osd_req_op_extent_osd_data(req, 0); struct netfs_io_subrequest *subreq = req->r_priv; - int num_pages; + struct ceph_osd_req_op *op = &req->r_ops[0]; int err = req->r_result; + bool sparse = (op->op == CEPH_OSD_OP_SPARSE_READ); ceph_update_read_metrics(&fsc->mdsc->metric, req->r_start_latency, req->r_end_latency, osd_data->length, err); - dout("%s: result %d subreq->len=%zu i_size=%lld\n", __func__, req->r_result, - subreq->len, i_size_read(req->r_inode)); + doutc(cl, "result %d subreq->len=%zu i_size=%lld\n", req->r_result, + subreq->len, i_size_read(req->r_inode)); /* no object means success but no data */ if (err == -ENOENT) @@ -260,14 +263,29 @@ static void finish_netfs_read(struct ceph_osd_request *req) else if (err == -EBLOCKLISTED) fsc->blocklisted = true; - if (err >= 0 && err < subreq->len) - __set_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags); + if (err >= 0) { + if (sparse && err > 0) + err = ceph_sparse_ext_map_end(op); + if (err < subreq->len) + __set_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags); + if (IS_ENCRYPTED(inode) && err > 0) { + err = ceph_fscrypt_decrypt_extents(inode, + osd_data->pages, subreq->start, + op->extent.sparse_ext, + op->extent.sparse_ext_cnt); + if (err > subreq->len) + err = subreq->len; + } + } + if (osd_data->type == CEPH_OSD_DATA_TYPE_PAGES) { + ceph_put_page_vector(osd_data->pages, + calc_pages_for(osd_data->alignment, + osd_data->length), false); + } netfs_subreq_terminated(subreq, err, false); - - num_pages = calc_pages_for(osd_data->alignment, osd_data->length); - ceph_put_page_vector(osd_data->pages, num_pages, false); iput(req->r_inode); + ceph_dec_osd_stopping_blocker(fsc->mdsc); } static bool ceph_netfs_issue_op_inline(struct netfs_io_subrequest *subreq) @@ -330,14 +348,15 @@ static void ceph_netfs_issue_read(struct netfs_io_subrequest *subreq) struct netfs_io_request *rreq = subreq->rreq; struct inode *inode = rreq->inode; struct ceph_inode_info *ci = ceph_inode(inode); - struct ceph_fs_client *fsc = ceph_inode_to_client(inode); + struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode); + struct ceph_client *cl = fsc->client; struct ceph_osd_request *req = NULL; struct ceph_vino vino = ceph_vino(inode); struct iov_iter iter; - struct page **pages; - size_t page_off; int err = 0; u64 len = subreq->len; + bool sparse = IS_ENCRYPTED(inode) || ceph_test_mount_opt(fsc, SPARSEREAD); + u64 off = subreq->start; if (ceph_inode_is_shutdown(inode)) { err = -EIO; @@ -347,8 +366,10 @@ static void ceph_netfs_issue_read(struct netfs_io_subrequest *subreq) if (ceph_has_inline_data(ci) && ceph_netfs_issue_op_inline(subreq)) return; - req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, vino, subreq->start, &len, - 0, 1, CEPH_OSD_OP_READ, + ceph_fscrypt_adjust_off_and_len(inode, &off, &len); + + req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, vino, + off, &len, 0, 1, sparse ? CEPH_OSD_OP_SPARSE_READ : CEPH_OSD_OP_READ, CEPH_OSD_FLAG_READ | fsc->client->osdc.client->options->read_from_replica, NULL, ci->i_truncate_seq, ci->i_truncate_size, false); if (IS_ERR(req)) { @@ -357,20 +378,49 @@ static void ceph_netfs_issue_read(struct netfs_io_subrequest *subreq) goto out; } - dout("%s: pos=%llu orig_len=%zu len=%llu\n", __func__, subreq->start, subreq->len, len); - iov_iter_xarray(&iter, ITER_DEST, &rreq->mapping->i_pages, subreq->start, len); - err = iov_iter_get_pages_alloc2(&iter, &pages, len, &page_off); - if (err < 0) { - dout("%s: iov_ter_get_pages_alloc returned %d\n", __func__, err); - goto out; + if (sparse) { + err = ceph_alloc_sparse_ext_map(&req->r_ops[0]); + if (err) + goto out; } - /* should always give us a page-aligned read */ - WARN_ON_ONCE(page_off); - len = err; - err = 0; + doutc(cl, "%llx.%llx pos=%llu orig_len=%zu len=%llu\n", + ceph_vinop(inode), subreq->start, subreq->len, len); + + iov_iter_xarray(&iter, ITER_DEST, &rreq->mapping->i_pages, subreq->start, len); + + /* + * FIXME: For now, use CEPH_OSD_DATA_TYPE_PAGES instead of _ITER for + * encrypted inodes. We'd need infrastructure that handles an iov_iter + * instead of page arrays, and we don't have that as of yet. Once the + * dust settles on the write helpers and encrypt/decrypt routines for + * netfs, we should be able to rework this. + */ + if (IS_ENCRYPTED(inode)) { + struct page **pages; + size_t page_off; + + err = iov_iter_get_pages_alloc2(&iter, &pages, len, &page_off); + if (err < 0) { + doutc(cl, "%llx.%llx failed to allocate pages, %d\n", + ceph_vinop(inode), err); + goto out; + } - osd_req_op_extent_osd_data_pages(req, 0, pages, len, 0, false, false); + /* should always give us a page-aligned read */ + WARN_ON_ONCE(page_off); + len = err; + err = 0; + + osd_req_op_extent_osd_data_pages(req, 0, pages, len, 0, false, + false); + } else { + osd_req_op_extent_osd_iter(req, 0, &iter); + } + if (!ceph_inc_osd_stopping_blocker(fsc->mdsc)) { + err = -EIO; + goto out; + } req->r_callback = finish_netfs_read; req->r_priv = subreq; req->r_inode = inode; @@ -381,12 +431,13 @@ out: ceph_osdc_put_request(req); if (err) netfs_subreq_terminated(subreq, err, false); - dout("%s: result %d\n", __func__, err); + doutc(cl, "%llx.%llx result %d\n", ceph_vinop(inode), err); } static int ceph_init_request(struct netfs_io_request *rreq, struct file *file) { struct inode *inode = rreq->inode; + struct ceph_client *cl = ceph_inode_to_client(inode); int got = 0, want = CEPH_CAP_FILE_CACHE; struct ceph_netfs_request_data *priv; int ret = 0; @@ -418,12 +469,12 @@ static int ceph_init_request(struct netfs_io_request *rreq, struct file *file) */ ret = ceph_try_get_caps(inode, CEPH_CAP_FILE_RD, want, true, &got); if (ret < 0) { - dout("start_read %p, error getting cap\n", inode); + doutc(cl, "%llx.%llx, error getting cap\n", ceph_vinop(inode)); goto out; } if (!(got & want)) { - dout("start_read %p, no cache cap\n", inode); + doutc(cl, "%llx.%llx, no cache cap\n", ceph_vinop(inode)); ret = -EACCES; goto out; } @@ -515,13 +566,14 @@ get_oldest_context(struct inode *inode, struct ceph_writeback_ctl *ctl, struct ceph_snap_context *page_snapc) { struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_client *cl = ceph_inode_to_client(inode); struct ceph_snap_context *snapc = NULL; struct ceph_cap_snap *capsnap = NULL; spin_lock(&ci->i_ceph_lock); list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) { - dout(" cap_snap %p snapc %p has %d dirty pages\n", capsnap, - capsnap->context, capsnap->dirty_pages); + doutc(cl, " capsnap %p snapc %p has %d dirty pages\n", + capsnap, capsnap->context, capsnap->dirty_pages); if (!capsnap->dirty_pages) continue; @@ -553,8 +605,8 @@ get_oldest_context(struct inode *inode, struct ceph_writeback_ctl *ctl, } if (!snapc && ci->i_wrbuffer_ref_head) { snapc = ceph_get_snap_context(ci->i_head_snapc); - dout(" head snapc %p has %d dirty pages\n", - snapc, ci->i_wrbuffer_ref_head); + doutc(cl, " head snapc %p has %d dirty pages\n", snapc, + ci->i_wrbuffer_ref_head); if (ctl) { ctl->i_size = i_size_read(inode); ctl->truncate_size = ci->i_truncate_size; @@ -571,10 +623,12 @@ static u64 get_writepages_data_length(struct inode *inode, struct page *page, u64 start) { struct ceph_inode_info *ci = ceph_inode(inode); - struct ceph_snap_context *snapc = page_snap_context(page); + struct ceph_snap_context *snapc; struct ceph_cap_snap *capsnap = NULL; u64 end = i_size_read(inode); + u64 ret; + snapc = page_snap_context(ceph_fscrypt_pagecache_page(page)); if (snapc != ci->i_head_snapc) { bool found = false; spin_lock(&ci->i_ceph_lock); @@ -589,9 +643,12 @@ static u64 get_writepages_data_length(struct inode *inode, spin_unlock(&ci->i_ceph_lock); WARN_ON(!found); } - if (end > page_offset(page) + thp_size(page)) - end = page_offset(page) + thp_size(page); - return end > start ? end - start : 0; + if (end > ceph_fscrypt_page_offset(page) + thp_size(page)) + end = ceph_fscrypt_page_offset(page) + thp_size(page); + ret = end > start ? end - start : 0; + if (ret && fscrypt_is_bounce_page(page)) + ret = round_up(ret, CEPH_FSCRYPT_BLOCK_SIZE); + return ret; } /* @@ -605,17 +662,21 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc) struct folio *folio = page_folio(page); struct inode *inode = page->mapping->host; struct ceph_inode_info *ci = ceph_inode(inode); - struct ceph_fs_client *fsc = ceph_inode_to_client(inode); + struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode); + struct ceph_client *cl = fsc->client; struct ceph_snap_context *snapc, *oldest; loff_t page_off = page_offset(page); int err; loff_t len = thp_size(page); + loff_t wlen; struct ceph_writeback_ctl ceph_wbc; struct ceph_osd_client *osdc = &fsc->client->osdc; struct ceph_osd_request *req; bool caching = ceph_is_cache_enabled(inode); + struct page *bounce_page = NULL; - dout("writepage %p idx %lu\n", page, page->index); + doutc(cl, "%llx.%llx page %p idx %lu\n", ceph_vinop(inode), page, + page->index); if (ceph_inode_is_shutdown(inode)) return -EIO; @@ -623,13 +684,14 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc) /* verify this is a writeable snap context */ snapc = page_snap_context(page); if (!snapc) { - dout("writepage %p page %p not dirty?\n", inode, page); + doutc(cl, "%llx.%llx page %p not dirty?\n", ceph_vinop(inode), + page); return 0; } oldest = get_oldest_context(inode, &ceph_wbc, snapc); if (snapc->seq > oldest->seq) { - dout("writepage %p page %p snapc %p not writeable - noop\n", - inode, page, snapc); + doutc(cl, "%llx.%llx page %p snapc %p not writeable - noop\n", + ceph_vinop(inode), page, snapc); /* we should only noop if called by kswapd */ WARN_ON(!(current->flags & PF_MEMALLOC)); ceph_put_snap_context(oldest); @@ -640,8 +702,8 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc) /* is this a partial page at end of file? */ if (page_off >= ceph_wbc.i_size) { - dout("folio at %lu beyond eof %llu\n", folio->index, - ceph_wbc.i_size); + doutc(cl, "%llx.%llx folio at %lu beyond eof %llu\n", + ceph_vinop(inode), folio->index, ceph_wbc.i_size); folio_invalidate(folio, 0, folio_size(folio)); return 0; } @@ -649,39 +711,61 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc) if (ceph_wbc.i_size < page_off + len) len = ceph_wbc.i_size - page_off; - dout("writepage %p page %p index %lu on %llu~%llu snapc %p seq %lld\n", - inode, page, page->index, page_off, len, snapc, snapc->seq); + wlen = IS_ENCRYPTED(inode) ? round_up(len, CEPH_FSCRYPT_BLOCK_SIZE) : len; + doutc(cl, "%llx.%llx page %p index %lu on %llu~%llu snapc %p seq %lld\n", + ceph_vinop(inode), page, page->index, page_off, wlen, snapc, + snapc->seq); if (atomic_long_inc_return(&fsc->writeback_count) > CONGESTION_ON_THRESH(fsc->mount_options->congestion_kb)) fsc->write_congested = true; - req = ceph_osdc_new_request(osdc, &ci->i_layout, ceph_vino(inode), page_off, &len, 0, 1, - CEPH_OSD_OP_WRITE, CEPH_OSD_FLAG_WRITE, snapc, - ceph_wbc.truncate_seq, ceph_wbc.truncate_size, - true); + req = ceph_osdc_new_request(osdc, &ci->i_layout, ceph_vino(inode), + page_off, &wlen, 0, 1, CEPH_OSD_OP_WRITE, + CEPH_OSD_FLAG_WRITE, snapc, + ceph_wbc.truncate_seq, + ceph_wbc.truncate_size, true); if (IS_ERR(req)) { redirty_page_for_writepage(wbc, page); return PTR_ERR(req); } + if (wlen < len) + len = wlen; + set_page_writeback(page); if (caching) ceph_set_page_fscache(page); ceph_fscache_write_to_cache(inode, page_off, len, caching); + if (IS_ENCRYPTED(inode)) { + bounce_page = fscrypt_encrypt_pagecache_blocks(page, + CEPH_FSCRYPT_BLOCK_SIZE, 0, + GFP_NOFS); + if (IS_ERR(bounce_page)) { + redirty_page_for_writepage(wbc, page); + end_page_writeback(page); + ceph_osdc_put_request(req); + return PTR_ERR(bounce_page); + } + } + /* it may be a short write due to an object boundary */ WARN_ON_ONCE(len > thp_size(page)); - osd_req_op_extent_osd_data_pages(req, 0, &page, len, 0, false, false); - dout("writepage %llu~%llu (%llu bytes)\n", page_off, len, len); - - req->r_mtime = inode->i_mtime; + osd_req_op_extent_osd_data_pages(req, 0, + bounce_page ? &bounce_page : &page, wlen, 0, + false, false); + doutc(cl, "%llx.%llx %llu~%llu (%llu bytes, %sencrypted)\n", + ceph_vinop(inode), page_off, len, wlen, + IS_ENCRYPTED(inode) ? "" : "not "); + + req->r_mtime = inode_get_mtime(inode); ceph_osdc_start_request(osdc, req); err = ceph_osdc_wait_request(osdc, req); ceph_update_write_metrics(&fsc->mdsc->metric, req->r_start_latency, req->r_end_latency, len, err); - + fscrypt_free_bounce_page(bounce_page); ceph_osdc_put_request(req); if (err == 0) err = len; @@ -692,19 +776,21 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc) wbc = &tmp_wbc; if (err == -ERESTARTSYS) { /* killed by SIGKILL */ - dout("writepage interrupted page %p\n", page); + doutc(cl, "%llx.%llx interrupted page %p\n", + ceph_vinop(inode), page); redirty_page_for_writepage(wbc, page); end_page_writeback(page); return err; } if (err == -EBLOCKLISTED) fsc->blocklisted = true; - dout("writepage setting page/mapping error %d %p\n", - err, page); + doutc(cl, "%llx.%llx setting page/mapping error %d %p\n", + ceph_vinop(inode), err, page); mapping_set_error(&inode->i_data, err); wbc->pages_skipped++; } else { - dout("writepage cleaned page %p\n", page); + doutc(cl, "%llx.%llx cleaned page %p\n", + ceph_vinop(inode), page); err = 0; /* vfs expects us to return 0 */ } oldest = detach_page_private(page); @@ -728,7 +814,7 @@ static int ceph_writepage(struct page *page, struct writeback_control *wbc) ihold(inode); if (wbc->sync_mode == WB_SYNC_NONE && - ceph_inode_to_client(inode)->write_congested) + ceph_inode_to_fs_client(inode)->write_congested) return AOP_WRITEPAGE_ACTIVATE; wait_on_page_fscache(page); @@ -754,6 +840,7 @@ static void writepages_finish(struct ceph_osd_request *req) { struct inode *inode = req->r_inode; struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_client *cl = ceph_inode_to_client(inode); struct ceph_osd_data *osd_data; struct page *page; int num_pages, total_pages = 0; @@ -761,11 +848,11 @@ static void writepages_finish(struct ceph_osd_request *req) int rc = req->r_result; struct ceph_snap_context *snapc = req->r_snapc; struct address_space *mapping = inode->i_mapping; - struct ceph_fs_client *fsc = ceph_inode_to_client(inode); + struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode); unsigned int len = 0; bool remove_page; - dout("writepages_finish %p rc %d\n", inode, rc); + doutc(cl, "%llx.%llx rc %d\n", ceph_vinop(inode), rc); if (rc < 0) { mapping_set_error(mapping, rc); ceph_set_error_write(ci); @@ -787,8 +874,10 @@ static void writepages_finish(struct ceph_osd_request *req) /* clean all pages */ for (i = 0; i < req->r_num_ops; i++) { if (req->r_ops[i].op != CEPH_OSD_OP_WRITE) { - pr_warn("%s incorrect op %d req %p index %d tid %llu\n", - __func__, req->r_ops[i].op, req, i, req->r_tid); + pr_warn_client(cl, + "%llx.%llx incorrect op %d req %p index %d tid %llu\n", + ceph_vinop(inode), req->r_ops[i].op, req, i, + req->r_tid); break; } @@ -800,6 +889,11 @@ static void writepages_finish(struct ceph_osd_request *req) total_pages += num_pages; for (j = 0; j < num_pages; j++) { page = osd_data->pages[j]; + if (fscrypt_is_bounce_page(page)) { + page = fscrypt_pagecache_page(page); + fscrypt_free_bounce_page(osd_data->pages[j]); + osd_data->pages[j] = page; + } BUG_ON(!page); WARN_ON(!PageUptodate(page)); @@ -810,7 +904,7 @@ static void writepages_finish(struct ceph_osd_request *req) ceph_put_snap_context(detach_page_private(page)); end_page_writeback(page); - dout("unlocking %p\n", page); + doutc(cl, "unlocking %p\n", page); if (remove_page) generic_error_remove_page(inode->i_mapping, @@ -818,8 +912,9 @@ static void writepages_finish(struct ceph_osd_request *req) unlock_page(page); } - dout("writepages_finish %p wrote %llu bytes cleaned %d pages\n", - inode, osd_data->length, rc >= 0 ? num_pages : 0); + doutc(cl, "%llx.%llx wrote %llu bytes cleaned %d pages\n", + ceph_vinop(inode), osd_data->length, + rc >= 0 ? num_pages : 0); release_pages(osd_data->pages, num_pages); } @@ -835,6 +930,7 @@ static void writepages_finish(struct ceph_osd_request *req) else kfree(osd_data->pages); ceph_osdc_put_request(req); + ceph_dec_osd_stopping_blocker(fsc->mdsc); } /* @@ -845,7 +941,8 @@ static int ceph_writepages_start(struct address_space *mapping, { struct inode *inode = mapping->host; struct ceph_inode_info *ci = ceph_inode(inode); - struct ceph_fs_client *fsc = ceph_inode_to_client(inode); + struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode); + struct ceph_client *cl = fsc->client; struct ceph_vino vino = ceph_vino(inode); pgoff_t index, start_index, end = -1; struct ceph_snap_context *snapc = NULL, *last_snapc = NULL, *pgsnapc; @@ -863,15 +960,15 @@ static int ceph_writepages_start(struct address_space *mapping, fsc->write_congested) return 0; - dout("writepages_start %p (mode=%s)\n", inode, - wbc->sync_mode == WB_SYNC_NONE ? "NONE" : - (wbc->sync_mode == WB_SYNC_ALL ? "ALL" : "HOLD")); + doutc(cl, "%llx.%llx (mode=%s)\n", ceph_vinop(inode), + wbc->sync_mode == WB_SYNC_NONE ? "NONE" : + (wbc->sync_mode == WB_SYNC_ALL ? "ALL" : "HOLD")); if (ceph_inode_is_shutdown(inode)) { if (ci->i_wrbuffer_ref > 0) { - pr_warn_ratelimited( - "writepage_start %p %lld forced umount\n", - inode, ceph_ino(inode)); + pr_warn_ratelimited_client(cl, + "%llx.%llx %lld forced umount\n", + ceph_vinop(inode), ceph_ino(inode)); } mapping_set_error(mapping, -EIO); return -EIO; /* we're in a forced umount, don't write! */ @@ -895,11 +992,11 @@ retry: if (!snapc) { /* hmm, why does writepages get called when there is no dirty data? */ - dout(" no snap context with dirty data?\n"); + doutc(cl, " no snap context with dirty data?\n"); goto out; } - dout(" oldest snapc is %p seq %lld (%d snaps)\n", - snapc, snapc->seq, snapc->num_snaps); + doutc(cl, " oldest snapc is %p seq %lld (%d snaps)\n", snapc, + snapc->seq, snapc->num_snaps); should_loop = false; if (ceph_wbc.head_snapc && snapc != last_snapc) { @@ -909,13 +1006,13 @@ retry: end = -1; if (index > 0) should_loop = true; - dout(" cyclic, start at %lu\n", index); + doutc(cl, " cyclic, start at %lu\n", index); } else { index = wbc->range_start >> PAGE_SHIFT; end = wbc->range_end >> PAGE_SHIFT; if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) range_whole = true; - dout(" not cyclic, %lu to %lu\n", index, end); + doutc(cl, " not cyclic, %lu to %lu\n", index, end); } } else if (!ceph_wbc.head_snapc) { /* Do not respect wbc->range_{start,end}. Dirty pages @@ -924,7 +1021,7 @@ retry: * associated with 'snapc' get written */ if (index > 0) should_loop = true; - dout(" non-head snapc, range whole\n"); + doutc(cl, " non-head snapc, range whole\n"); } if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) @@ -947,12 +1044,12 @@ retry: get_more_pages: nr_folios = filemap_get_folios_tag(mapping, &index, end, tag, &fbatch); - dout("pagevec_lookup_range_tag got %d\n", nr_folios); + doutc(cl, "pagevec_lookup_range_tag got %d\n", nr_folios); if (!nr_folios && !locked_pages) break; for (i = 0; i < nr_folios && locked_pages < max_pages; i++) { page = &fbatch.folios[i]->page; - dout("? %p idx %lu\n", page, page->index); + doutc(cl, "? %p idx %lu\n", page, page->index); if (locked_pages == 0) lock_page(page); /* first page */ else if (!trylock_page(page)) @@ -961,15 +1058,15 @@ get_more_pages: /* only dirty pages, or our accounting breaks */ if (unlikely(!PageDirty(page)) || unlikely(page->mapping != mapping)) { - dout("!dirty or !mapping %p\n", page); + doutc(cl, "!dirty or !mapping %p\n", page); unlock_page(page); continue; } /* only if matching snap context */ pgsnapc = page_snap_context(page); if (pgsnapc != snapc) { - dout("page snapc %p %lld != oldest %p %lld\n", - pgsnapc, pgsnapc->seq, snapc, snapc->seq); + doutc(cl, "page snapc %p %lld != oldest %p %lld\n", + pgsnapc, pgsnapc->seq, snapc, snapc->seq); if (!should_loop && !ceph_wbc.head_snapc && wbc->sync_mode != WB_SYNC_NONE) @@ -980,8 +1077,8 @@ get_more_pages: if (page_offset(page) >= ceph_wbc.i_size) { struct folio *folio = page_folio(page); - dout("folio at %lu beyond eof %llu\n", - folio->index, ceph_wbc.i_size); + doutc(cl, "folio at %lu beyond eof %llu\n", + folio->index, ceph_wbc.i_size); if ((ceph_wbc.size_stable || folio_pos(folio) >= i_size_read(inode)) && folio_clear_dirty_for_io(folio)) @@ -991,23 +1088,23 @@ get_more_pages: continue; } if (strip_unit_end && (page->index > strip_unit_end)) { - dout("end of strip unit %p\n", page); + doutc(cl, "end of strip unit %p\n", page); unlock_page(page); break; } if (PageWriteback(page) || PageFsCache(page)) { if (wbc->sync_mode == WB_SYNC_NONE) { - dout("%p under writeback\n", page); + doutc(cl, "%p under writeback\n", page); unlock_page(page); continue; } - dout("waiting on writeback %p\n", page); + doutc(cl, "waiting on writeback %p\n", page); wait_on_page_writeback(page); wait_on_page_fscache(page); } if (!clear_page_dirty_for_io(page)) { - dout("%p !clear_page_dirty_for_io\n", page); + doutc(cl, "%p !clear_page_dirty_for_io\n", page); unlock_page(page); continue; } @@ -1062,17 +1159,37 @@ get_more_pages: } /* note position of first page in fbatch */ - dout("%p will write page %p idx %lu\n", - inode, page, page->index); + doutc(cl, "%llx.%llx will write page %p idx %lu\n", + ceph_vinop(inode), page, page->index); if (atomic_long_inc_return(&fsc->writeback_count) > CONGESTION_ON_THRESH( fsc->mount_options->congestion_kb)) fsc->write_congested = true; - pages[locked_pages++] = page; - fbatch.folios[i] = NULL; + if (IS_ENCRYPTED(inode)) { + pages[locked_pages] = + fscrypt_encrypt_pagecache_blocks(page, + PAGE_SIZE, 0, + locked_pages ? GFP_NOWAIT : GFP_NOFS); + if (IS_ERR(pages[locked_pages])) { + if (PTR_ERR(pages[locked_pages]) == -EINVAL) + pr_err_client(cl, + "inode->i_blkbits=%hhu\n", + inode->i_blkbits); + /* better not fail on first page! */ + BUG_ON(locked_pages == 0); + pages[locked_pages] = NULL; + redirty_page_for_writepage(wbc, page); + unlock_page(page); + break; + } + ++locked_pages; + } else { + pages[locked_pages++] = page; + } + fbatch.folios[i] = NULL; len += thp_size(page); } @@ -1093,14 +1210,14 @@ get_more_pages: if (nr_folios && i == nr_folios && locked_pages < max_pages) { - dout("reached end fbatch, trying for more\n"); + doutc(cl, "reached end fbatch, trying for more\n"); folio_batch_release(&fbatch); goto get_more_pages; } } new_request: - offset = page_offset(pages[0]); + offset = ceph_fscrypt_page_offset(pages[0]); len = wsize; req = ceph_osdc_new_request(&fsc->client->osdc, @@ -1121,9 +1238,13 @@ new_request: ceph_wbc.truncate_size, true); BUG_ON(IS_ERR(req)); } - BUG_ON(len < page_offset(pages[locked_pages - 1]) + - thp_size(page) - offset); + BUG_ON(len < ceph_fscrypt_page_offset(pages[locked_pages - 1]) + + thp_size(pages[locked_pages - 1]) - offset); + if (!ceph_inc_osd_stopping_blocker(fsc->mdsc)) { + rc = -EIO; + goto release_folios; + } req->r_callback = writepages_finish; req->r_inode = inode; @@ -1132,7 +1253,9 @@ new_request: data_pages = pages; op_idx = 0; for (i = 0; i < locked_pages; i++) { - u64 cur_offset = page_offset(pages[i]); + struct page *page = ceph_fscrypt_pagecache_page(pages[i]); + + u64 cur_offset = page_offset(page); /* * Discontinuity in page range? Ceph can handle that by just passing * multiple extents in the write op. @@ -1148,8 +1271,8 @@ new_request: /* Start a new extent */ osd_req_op_extent_dup_last(req, op_idx, cur_offset - offset); - dout("writepages got pages at %llu~%llu\n", - offset, len); + doutc(cl, "got pages at %llu~%llu\n", offset, + len); osd_req_op_extent_osd_data_pages(req, op_idx, data_pages, len, 0, from_pool, false); @@ -1161,9 +1284,9 @@ new_request: op_idx++; } - set_page_writeback(pages[i]); + set_page_writeback(page); if (caching) - ceph_set_page_fscache(pages[i]); + ceph_set_page_fscache(page); len += thp_size(page); } ceph_fscache_write_to_cache(inode, offset, len, caching); @@ -1179,7 +1302,16 @@ new_request: offset); len = max(len, min_len); } - dout("writepages got pages at %llu~%llu\n", offset, len); + if (IS_ENCRYPTED(inode)) + len = round_up(len, CEPH_FSCRYPT_BLOCK_SIZE); + + doutc(cl, "got pages at %llu~%llu\n", offset, len); + + if (IS_ENCRYPTED(inode) && + ((offset | len) & ~CEPH_FSCRYPT_BLOCK_MASK)) + pr_warn_client(cl, + "bad encrypted write offset=%lld len=%llu\n", + offset, len); osd_req_op_extent_osd_data_pages(req, op_idx, data_pages, len, 0, from_pool, false); @@ -1213,7 +1345,7 @@ new_request: pages = NULL; } - req->r_mtime = inode->i_mtime; + req->r_mtime = inode_get_mtime(inode); ceph_osdc_start_request(&fsc->client->osdc, req); req = NULL; @@ -1231,14 +1363,14 @@ new_request: done = true; release_folios: - dout("folio_batch release on %d folios (%p)\n", (int)fbatch.nr, - fbatch.nr ? fbatch.folios[0] : NULL); + doutc(cl, "folio_batch release on %d folios (%p)\n", + (int)fbatch.nr, fbatch.nr ? fbatch.folios[0] : NULL); folio_batch_release(&fbatch); } if (should_loop && !done) { /* more to do; loop back to beginning of file */ - dout("writepages looping back to beginning of file\n"); + doutc(cl, "looping back to beginning of file\n"); end = start_index - 1; /* OK even when start_index == 0 */ /* to write dirty pages associated with next snapc, @@ -1276,7 +1408,8 @@ release_folios: out: ceph_osdc_put_request(req); ceph_put_snap_context(last_snapc); - dout("writepages dend - startone, rc = %d\n", rc); + doutc(cl, "%llx.%llx dend - startone, rc = %d\n", ceph_vinop(inode), + rc); return rc; } @@ -1310,11 +1443,12 @@ static struct ceph_snap_context * ceph_find_incompatible(struct page *page) { struct inode *inode = page->mapping->host; + struct ceph_client *cl = ceph_inode_to_client(inode); struct ceph_inode_info *ci = ceph_inode(inode); if (ceph_inode_is_shutdown(inode)) { - dout(" page %p %llx:%llx is shutdown\n", page, - ceph_vinop(inode)); + doutc(cl, " %llx.%llx page %p is shutdown\n", + ceph_vinop(inode), page); return ERR_PTR(-ESTALE); } @@ -1335,13 +1469,15 @@ ceph_find_incompatible(struct page *page) if (snapc->seq > oldest->seq) { /* not writeable -- return it for the caller to deal with */ ceph_put_snap_context(oldest); - dout(" page %p snapc %p not current or oldest\n", page, snapc); + doutc(cl, " %llx.%llx page %p snapc %p not current or oldest\n", + ceph_vinop(inode), page, snapc); return ceph_get_snap_context(snapc); } ceph_put_snap_context(oldest); /* yay, writeable, do it now (without dropping page lock) */ - dout(" page %p snapc %p not current, but oldest\n", page, snapc); + doutc(cl, " %llx.%llx page %p snapc %p not current, but oldest\n", + ceph_vinop(inode), page, snapc); if (clear_page_dirty_for_io(page)) { int r = writepage_nounlock(page, NULL); if (r < 0) @@ -1410,10 +1546,11 @@ static int ceph_write_end(struct file *file, struct address_space *mapping, { struct folio *folio = page_folio(subpage); struct inode *inode = file_inode(file); + struct ceph_client *cl = ceph_inode_to_client(inode); bool check_cap = false; - dout("write_end file %p inode %p folio %p %d~%d (%d)\n", file, - inode, folio, (int)pos, (int)copied, (int)len); + doutc(cl, "%llx.%llx file %p folio %p %d~%d (%d)\n", ceph_vinop(inode), + file, folio, (int)pos, (int)copied, (int)len); if (!folio_test_uptodate(folio)) { /* just return that nothing was copied on a short copy */ @@ -1473,6 +1610,7 @@ static vm_fault_t ceph_filemap_fault(struct vm_fault *vmf) struct vm_area_struct *vma = vmf->vma; struct inode *inode = file_inode(vma->vm_file); struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_client *cl = ceph_inode_to_client(inode); struct ceph_file_info *fi = vma->vm_file->private_data; loff_t off = (loff_t)vmf->pgoff << PAGE_SHIFT; int want, got, err; @@ -1484,8 +1622,8 @@ static vm_fault_t ceph_filemap_fault(struct vm_fault *vmf) ceph_block_sigs(&oldset); - dout("filemap_fault %p %llx.%llx %llu trying to get caps\n", - inode, ceph_vinop(inode), off); + doutc(cl, "%llx.%llx %llu trying to get caps\n", + ceph_vinop(inode), off); if (fi->fmode & CEPH_FILE_MODE_LAZY) want = CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO; else @@ -1496,8 +1634,8 @@ static vm_fault_t ceph_filemap_fault(struct vm_fault *vmf) if (err < 0) goto out_restore; - dout("filemap_fault %p %llu got cap refs on %s\n", - inode, off, ceph_cap_string(got)); + doutc(cl, "%llx.%llx %llu got cap refs on %s\n", ceph_vinop(inode), + off, ceph_cap_string(got)); if ((got & (CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO)) || !ceph_has_inline_data(ci)) { @@ -1505,8 +1643,8 @@ static vm_fault_t ceph_filemap_fault(struct vm_fault *vmf) ceph_add_rw_context(fi, &rw_ctx); ret = filemap_fault(vmf); ceph_del_rw_context(fi, &rw_ctx); - dout("filemap_fault %p %llu drop cap refs %s ret %x\n", - inode, off, ceph_cap_string(got), ret); + doutc(cl, "%llx.%llx %llu drop cap refs %s ret %x\n", + ceph_vinop(inode), off, ceph_cap_string(got), ret); } else err = -EAGAIN; @@ -1547,8 +1685,8 @@ static vm_fault_t ceph_filemap_fault(struct vm_fault *vmf) ret = VM_FAULT_MAJOR | VM_FAULT_LOCKED; out_inline: filemap_invalidate_unlock_shared(mapping); - dout("filemap_fault %p %llu read inline data ret %x\n", - inode, off, ret); + doutc(cl, "%llx.%llx %llu read inline data ret %x\n", + ceph_vinop(inode), off, ret); } out_restore: ceph_restore_sigs(&oldset); @@ -1562,6 +1700,7 @@ static vm_fault_t ceph_page_mkwrite(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; struct inode *inode = file_inode(vma->vm_file); + struct ceph_client *cl = ceph_inode_to_client(inode); struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_file_info *fi = vma->vm_file->private_data; struct ceph_cap_flush *prealloc_cf; @@ -1588,8 +1727,8 @@ static vm_fault_t ceph_page_mkwrite(struct vm_fault *vmf) else len = offset_in_thp(page, size); - dout("page_mkwrite %p %llx.%llx %llu~%zd getting caps i_size %llu\n", - inode, ceph_vinop(inode), off, len, size); + doutc(cl, "%llx.%llx %llu~%zd getting caps i_size %llu\n", + ceph_vinop(inode), off, len, size); if (fi->fmode & CEPH_FILE_MODE_LAZY) want = CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO; else @@ -1600,8 +1739,8 @@ static vm_fault_t ceph_page_mkwrite(struct vm_fault *vmf) if (err < 0) goto out_free; - dout("page_mkwrite %p %llu~%zd got cap refs on %s\n", - inode, off, len, ceph_cap_string(got)); + doutc(cl, "%llx.%llx %llu~%zd got cap refs on %s\n", ceph_vinop(inode), + off, len, ceph_cap_string(got)); /* Update time before taking page lock */ file_update_time(vma->vm_file); @@ -1649,8 +1788,8 @@ static vm_fault_t ceph_page_mkwrite(struct vm_fault *vmf) __mark_inode_dirty(inode, dirty); } - dout("page_mkwrite %p %llu~%zd dropping cap refs on %s ret %x\n", - inode, off, len, ceph_cap_string(got), ret); + doutc(cl, "%llx.%llx %llu~%zd dropping cap refs on %s ret %x\n", + ceph_vinop(inode), off, len, ceph_cap_string(got), ret); ceph_put_cap_refs_async(ci, got); out_free: ceph_restore_sigs(&oldset); @@ -1664,6 +1803,7 @@ out_free: void ceph_fill_inline_data(struct inode *inode, struct page *locked_page, char *data, size_t len) { + struct ceph_client *cl = ceph_inode_to_client(inode); struct address_space *mapping = inode->i_mapping; struct page *page; @@ -1684,8 +1824,8 @@ void ceph_fill_inline_data(struct inode *inode, struct page *locked_page, } } - dout("fill_inline_data %p %llx.%llx len %zu locked_page %p\n", - inode, ceph_vinop(inode), len, locked_page); + doutc(cl, "%p %llx.%llx len %zu locked_page %p\n", inode, + ceph_vinop(inode), len, locked_page); if (len > 0) { void *kaddr = kmap_atomic(page); @@ -1709,7 +1849,8 @@ int ceph_uninline_data(struct file *file) { struct inode *inode = file_inode(file); struct ceph_inode_info *ci = ceph_inode(inode); - struct ceph_fs_client *fsc = ceph_inode_to_client(inode); + struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode); + struct ceph_client *cl = fsc->client; struct ceph_osd_request *req = NULL; struct ceph_cap_flush *prealloc_cf = NULL; struct folio *folio = NULL; @@ -1722,8 +1863,8 @@ int ceph_uninline_data(struct file *file) inline_version = ci->i_inline_version; spin_unlock(&ci->i_ceph_lock); - dout("uninline_data %p %llx.%llx inline_version %llu\n", - inode, ceph_vinop(inode), inline_version); + doutc(cl, "%llx.%llx inline_version %llu\n", ceph_vinop(inode), + inline_version); if (ceph_inode_is_shutdown(inode)) { err = -EIO; @@ -1761,7 +1902,7 @@ int ceph_uninline_data(struct file *file) goto out_unlock; } - req->r_mtime = inode->i_mtime; + req->r_mtime = inode_get_mtime(inode); ceph_osdc_start_request(&fsc->client->osdc, req); err = ceph_osdc_wait_request(&fsc->client->osdc, req); ceph_osdc_put_request(req); @@ -1803,7 +1944,7 @@ int ceph_uninline_data(struct file *file) goto out_put_req; } - req->r_mtime = inode->i_mtime; + req->r_mtime = inode_get_mtime(inode); ceph_osdc_start_request(&fsc->client->osdc, req); err = ceph_osdc_wait_request(&fsc->client->osdc, req); @@ -1835,8 +1976,8 @@ out_unlock: } out: ceph_free_cap_flush(prealloc_cf); - dout("uninline_data %p %llx.%llx inline_version %llu = %d\n", - inode, ceph_vinop(inode), inline_version, err); + doutc(cl, "%llx.%llx inline_version %llu = %d\n", + ceph_vinop(inode), inline_version, err); return err; } @@ -1863,8 +2004,9 @@ enum { static int __ceph_pool_perm_get(struct ceph_inode_info *ci, s64 pool, struct ceph_string *pool_ns) { - struct ceph_fs_client *fsc = ceph_inode_to_client(&ci->netfs.inode); + struct ceph_fs_client *fsc = ceph_inode_to_fs_client(&ci->netfs.inode); struct ceph_mds_client *mdsc = fsc->mdsc; + struct ceph_client *cl = fsc->client; struct ceph_osd_request *rd_req = NULL, *wr_req = NULL; struct rb_node **p, *parent; struct ceph_pool_perm *perm; @@ -1899,10 +2041,10 @@ static int __ceph_pool_perm_get(struct ceph_inode_info *ci, goto out; if (pool_ns) - dout("__ceph_pool_perm_get pool %lld ns %.*s no perm cached\n", - pool, (int)pool_ns->len, pool_ns->str); + doutc(cl, "pool %lld ns %.*s no perm cached\n", pool, + (int)pool_ns->len, pool_ns->str); else - dout("__ceph_pool_perm_get pool %lld no perm cached\n", pool); + doutc(cl, "pool %lld no perm cached\n", pool); down_write(&mdsc->pool_perm_rwsem); p = &mdsc->pool_perm_tree.rb_node; @@ -1978,7 +2120,7 @@ static int __ceph_pool_perm_get(struct ceph_inode_info *ci, 0, false, true); ceph_osdc_start_request(&fsc->client->osdc, rd_req); - wr_req->r_mtime = ci->netfs.inode.i_mtime; + wr_req->r_mtime = inode_get_mtime(&ci->netfs.inode); ceph_osdc_start_request(&fsc->client->osdc, wr_req); err = ceph_osdc_wait_request(&fsc->client->osdc, rd_req); @@ -2027,15 +2169,16 @@ out: if (!err) err = have; if (pool_ns) - dout("__ceph_pool_perm_get pool %lld ns %.*s result = %d\n", - pool, (int)pool_ns->len, pool_ns->str, err); + doutc(cl, "pool %lld ns %.*s result = %d\n", pool, + (int)pool_ns->len, pool_ns->str, err); else - dout("__ceph_pool_perm_get pool %lld result = %d\n", pool, err); + doutc(cl, "pool %lld result = %d\n", pool, err); return err; } int ceph_pool_perm_check(struct inode *inode, int need) { + struct ceph_client *cl = ceph_inode_to_client(inode); struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_string *pool_ns; s64 pool; @@ -2054,7 +2197,7 @@ int ceph_pool_perm_check(struct inode *inode, int need) return 0; } - if (ceph_test_mount_opt(ceph_inode_to_client(inode), + if (ceph_test_mount_opt(ceph_inode_to_fs_client(inode), NOPOOLPERM)) return 0; @@ -2065,13 +2208,11 @@ int ceph_pool_perm_check(struct inode *inode, int need) check: if (flags & CEPH_I_POOL_PERM) { if ((need & CEPH_CAP_FILE_RD) && !(flags & CEPH_I_POOL_RD)) { - dout("ceph_pool_perm_check pool %lld no read perm\n", - pool); + doutc(cl, "pool %lld no read perm\n", pool); return -EPERM; } if ((need & CEPH_CAP_FILE_WR) && !(flags & CEPH_I_POOL_WR)) { - dout("ceph_pool_perm_check pool %lld no write perm\n", - pool); + doutc(cl, "pool %lld no write perm\n", pool); return -EPERM; } return 0; diff --git a/fs/ceph/cache.c b/fs/ceph/cache.c index 177d8e8d73fe..930fbd54d2c8 100644 --- a/fs/ceph/cache.c +++ b/fs/ceph/cache.c @@ -15,7 +15,7 @@ void ceph_fscache_register_inode_cookie(struct inode *inode) { struct ceph_inode_info *ci = ceph_inode(inode); - struct ceph_fs_client *fsc = ceph_inode_to_client(inode); + struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode); /* No caching for filesystem? */ if (!fsc->fscache) @@ -36,6 +36,8 @@ void ceph_fscache_register_inode_cookie(struct inode *inode) &ci->i_vino, sizeof(ci->i_vino), &ci->i_version, sizeof(ci->i_version), i_size_read(inode)); + if (ci->netfs.cache) + mapping_set_release_always(inode->i_mapping); } void ceph_fscache_unregister_inode_cookie(struct ceph_inode_info *ci) diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index e2bb0d0072da..2c0b8dc3dd0d 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -14,6 +14,7 @@ #include "super.h" #include "mds_client.h" #include "cache.h" +#include "crypto.h" #include <linux/ceph/decode.h> #include <linux/ceph/messenger.h> @@ -185,10 +186,10 @@ static void __ceph_unreserve_caps(struct ceph_mds_client *mdsc, int nr_caps) mdsc->caps_avail_count += nr_caps; } - dout("%s: caps %d = %d used + %d resv + %d avail\n", - __func__, - mdsc->caps_total_count, mdsc->caps_use_count, - mdsc->caps_reserve_count, mdsc->caps_avail_count); + doutc(mdsc->fsc->client, + "caps %d = %d used + %d resv + %d avail\n", + mdsc->caps_total_count, mdsc->caps_use_count, + mdsc->caps_reserve_count, mdsc->caps_avail_count); BUG_ON(mdsc->caps_total_count != mdsc->caps_use_count + mdsc->caps_reserve_count + mdsc->caps_avail_count); @@ -201,6 +202,7 @@ static void __ceph_unreserve_caps(struct ceph_mds_client *mdsc, int nr_caps) int ceph_reserve_caps(struct ceph_mds_client *mdsc, struct ceph_cap_reservation *ctx, int need) { + struct ceph_client *cl = mdsc->fsc->client; int i, j; struct ceph_cap *cap; int have; @@ -211,7 +213,7 @@ int ceph_reserve_caps(struct ceph_mds_client *mdsc, struct ceph_mds_session *s; LIST_HEAD(newcaps); - dout("reserve caps ctx=%p need=%d\n", ctx, need); + doutc(cl, "ctx=%p need=%d\n", ctx, need); /* first reserve any caps that are already allocated */ spin_lock(&mdsc->caps_list_lock); @@ -271,8 +273,8 @@ int ceph_reserve_caps(struct ceph_mds_client *mdsc, continue; } - pr_warn("reserve caps ctx=%p ENOMEM need=%d got=%d\n", - ctx, need, have + alloc); + pr_warn_client(cl, "ctx=%p ENOMEM need=%d got=%d\n", ctx, need, + have + alloc); err = -ENOMEM; break; } @@ -297,20 +299,21 @@ int ceph_reserve_caps(struct ceph_mds_client *mdsc, spin_unlock(&mdsc->caps_list_lock); - dout("reserve caps ctx=%p %d = %d used + %d resv + %d avail\n", - ctx, mdsc->caps_total_count, mdsc->caps_use_count, - mdsc->caps_reserve_count, mdsc->caps_avail_count); + doutc(cl, "ctx=%p %d = %d used + %d resv + %d avail\n", ctx, + mdsc->caps_total_count, mdsc->caps_use_count, + mdsc->caps_reserve_count, mdsc->caps_avail_count); return err; } void ceph_unreserve_caps(struct ceph_mds_client *mdsc, struct ceph_cap_reservation *ctx) { + struct ceph_client *cl = mdsc->fsc->client; bool reclaim = false; if (!ctx->count) return; - dout("unreserve caps ctx=%p count=%d\n", ctx, ctx->count); + doutc(cl, "ctx=%p count=%d\n", ctx, ctx->count); spin_lock(&mdsc->caps_list_lock); __ceph_unreserve_caps(mdsc, ctx->count); ctx->count = 0; @@ -327,6 +330,7 @@ void ceph_unreserve_caps(struct ceph_mds_client *mdsc, struct ceph_cap *ceph_get_cap(struct ceph_mds_client *mdsc, struct ceph_cap_reservation *ctx) { + struct ceph_client *cl = mdsc->fsc->client; struct ceph_cap *cap = NULL; /* temporary, until we do something about cap import/export */ @@ -358,9 +362,9 @@ struct ceph_cap *ceph_get_cap(struct ceph_mds_client *mdsc, } spin_lock(&mdsc->caps_list_lock); - dout("get_cap ctx=%p (%d) %d = %d used + %d resv + %d avail\n", - ctx, ctx->count, mdsc->caps_total_count, mdsc->caps_use_count, - mdsc->caps_reserve_count, mdsc->caps_avail_count); + doutc(cl, "ctx=%p (%d) %d = %d used + %d resv + %d avail\n", ctx, + ctx->count, mdsc->caps_total_count, mdsc->caps_use_count, + mdsc->caps_reserve_count, mdsc->caps_avail_count); BUG_ON(!ctx->count); BUG_ON(ctx->count > mdsc->caps_reserve_count); BUG_ON(list_empty(&mdsc->caps_list)); @@ -381,10 +385,12 @@ struct ceph_cap *ceph_get_cap(struct ceph_mds_client *mdsc, void ceph_put_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap) { + struct ceph_client *cl = mdsc->fsc->client; + spin_lock(&mdsc->caps_list_lock); - dout("put_cap %p %d = %d used + %d resv + %d avail\n", - cap, mdsc->caps_total_count, mdsc->caps_use_count, - mdsc->caps_reserve_count, mdsc->caps_avail_count); + doutc(cl, "%p %d = %d used + %d resv + %d avail\n", cap, + mdsc->caps_total_count, mdsc->caps_use_count, + mdsc->caps_reserve_count, mdsc->caps_avail_count); mdsc->caps_use_count--; /* * Keep some preallocated caps around (ceph_min_count), to @@ -490,11 +496,13 @@ static void __insert_cap_node(struct ceph_inode_info *ci, static void __cap_set_timeouts(struct ceph_mds_client *mdsc, struct ceph_inode_info *ci) { + struct inode *inode = &ci->netfs.inode; struct ceph_mount_options *opt = mdsc->fsc->mount_options; + ci->i_hold_caps_max = round_jiffies(jiffies + opt->caps_wanted_delay_max * HZ); - dout("__cap_set_timeouts %p %lu\n", &ci->netfs.inode, - ci->i_hold_caps_max - jiffies); + doutc(mdsc->fsc->client, "%p %llx.%llx %lu\n", inode, + ceph_vinop(inode), ci->i_hold_caps_max - jiffies); } /* @@ -508,8 +516,11 @@ static void __cap_set_timeouts(struct ceph_mds_client *mdsc, static void __cap_delay_requeue(struct ceph_mds_client *mdsc, struct ceph_inode_info *ci) { - dout("__cap_delay_requeue %p flags 0x%lx at %lu\n", &ci->netfs.inode, - ci->i_ceph_flags, ci->i_hold_caps_max); + struct inode *inode = &ci->netfs.inode; + + doutc(mdsc->fsc->client, "%p %llx.%llx flags 0x%lx at %lu\n", + inode, ceph_vinop(inode), ci->i_ceph_flags, + ci->i_hold_caps_max); if (!mdsc->stopping) { spin_lock(&mdsc->cap_delay_lock); if (!list_empty(&ci->i_cap_delay_list)) { @@ -532,7 +543,9 @@ no_change: static void __cap_delay_requeue_front(struct ceph_mds_client *mdsc, struct ceph_inode_info *ci) { - dout("__cap_delay_requeue_front %p\n", &ci->netfs.inode); + struct inode *inode = &ci->netfs.inode; + + doutc(mdsc->fsc->client, "%p %llx.%llx\n", inode, ceph_vinop(inode)); spin_lock(&mdsc->cap_delay_lock); ci->i_ceph_flags |= CEPH_I_FLUSH; if (!list_empty(&ci->i_cap_delay_list)) @@ -549,7 +562,9 @@ static void __cap_delay_requeue_front(struct ceph_mds_client *mdsc, static void __cap_delay_cancel(struct ceph_mds_client *mdsc, struct ceph_inode_info *ci) { - dout("__cap_delay_cancel %p\n", &ci->netfs.inode); + struct inode *inode = &ci->netfs.inode; + + doutc(mdsc->fsc->client, "%p %llx.%llx\n", inode, ceph_vinop(inode)); if (list_empty(&ci->i_cap_delay_list)) return; spin_lock(&mdsc->cap_delay_lock); @@ -561,6 +576,9 @@ static void __cap_delay_cancel(struct ceph_mds_client *mdsc, static void __check_cap_issue(struct ceph_inode_info *ci, struct ceph_cap *cap, unsigned issued) { + struct inode *inode = &ci->netfs.inode; + struct ceph_client *cl = ceph_inode_to_client(inode); + unsigned had = __ceph_caps_issued(ci, NULL); lockdep_assert_held(&ci->i_ceph_lock); @@ -585,7 +603,7 @@ static void __check_cap_issue(struct ceph_inode_info *ci, struct ceph_cap *cap, if (issued & CEPH_CAP_FILE_SHARED) atomic_inc(&ci->i_shared_gen); if (S_ISDIR(ci->netfs.inode.i_mode)) { - dout(" marking %p NOT complete\n", &ci->netfs.inode); + doutc(cl, " marking %p NOT complete\n", inode); __ceph_dir_clear_complete(ci); } } @@ -634,7 +652,8 @@ void ceph_add_cap(struct inode *inode, unsigned seq, unsigned mseq, u64 realmino, int flags, struct ceph_cap **new_cap) { - struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc; + struct ceph_mds_client *mdsc = ceph_inode_to_fs_client(inode)->mdsc; + struct ceph_client *cl = ceph_inode_to_client(inode); struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_cap *cap; int mds = session->s_mds; @@ -643,8 +662,9 @@ void ceph_add_cap(struct inode *inode, lockdep_assert_held(&ci->i_ceph_lock); - dout("add_cap %p mds%d cap %llx %s seq %d\n", inode, - session->s_mds, cap_id, ceph_cap_string(issued), seq); + doutc(cl, "%p %llx.%llx mds%d cap %llx %s seq %d\n", inode, + ceph_vinop(inode), session->s_mds, cap_id, + ceph_cap_string(issued), seq); gen = atomic_read(&session->s_cap_gen); @@ -722,9 +742,9 @@ void ceph_add_cap(struct inode *inode, actual_wanted = __ceph_caps_wanted(ci); if ((wanted & ~actual_wanted) || (issued & ~actual_wanted & CEPH_CAP_ANY_WR)) { - dout(" issued %s, mds wanted %s, actual %s, queueing\n", - ceph_cap_string(issued), ceph_cap_string(wanted), - ceph_cap_string(actual_wanted)); + doutc(cl, "issued %s, mds wanted %s, actual %s, queueing\n", + ceph_cap_string(issued), ceph_cap_string(wanted), + ceph_cap_string(actual_wanted)); __cap_delay_requeue(mdsc, ci); } @@ -741,9 +761,9 @@ void ceph_add_cap(struct inode *inode, WARN_ON(ci->i_auth_cap == cap); } - dout("add_cap inode %p (%llx.%llx) cap %p %s now %s seq %d mds%d\n", - inode, ceph_vinop(inode), cap, ceph_cap_string(issued), - ceph_cap_string(issued|cap->issued), seq, mds); + doutc(cl, "inode %p %llx.%llx cap %p %s now %s seq %d mds%d\n", + inode, ceph_vinop(inode), cap, ceph_cap_string(issued), + ceph_cap_string(issued|cap->issued), seq, mds); cap->cap_id = cap_id; cap->issued = issued; cap->implemented |= issued; @@ -765,6 +785,8 @@ void ceph_add_cap(struct inode *inode, */ static int __cap_is_valid(struct ceph_cap *cap) { + struct inode *inode = &cap->ci->netfs.inode; + struct ceph_client *cl = cap->session->s_mdsc->fsc->client; unsigned long ttl; u32 gen; @@ -772,9 +794,9 @@ static int __cap_is_valid(struct ceph_cap *cap) ttl = cap->session->s_cap_ttl; if (cap->cap_gen < gen || time_after_eq(jiffies, ttl)) { - dout("__cap_is_valid %p cap %p issued %s " - "but STALE (gen %u vs %u)\n", &cap->ci->netfs.inode, - cap, ceph_cap_string(cap->issued), cap->cap_gen, gen); + doutc(cl, "%p %llx.%llx cap %p issued %s but STALE (gen %u vs %u)\n", + inode, ceph_vinop(inode), cap, + ceph_cap_string(cap->issued), cap->cap_gen, gen); return 0; } @@ -788,6 +810,8 @@ static int __cap_is_valid(struct ceph_cap *cap) */ int __ceph_caps_issued(struct ceph_inode_info *ci, int *implemented) { + struct inode *inode = &ci->netfs.inode; + struct ceph_client *cl = ceph_inode_to_client(inode); int have = ci->i_snap_caps; struct ceph_cap *cap; struct rb_node *p; @@ -798,8 +822,8 @@ int __ceph_caps_issued(struct ceph_inode_info *ci, int *implemented) cap = rb_entry(p, struct ceph_cap, ci_node); if (!__cap_is_valid(cap)) continue; - dout("__ceph_caps_issued %p cap %p issued %s\n", - &ci->netfs.inode, cap, ceph_cap_string(cap->issued)); + doutc(cl, "%p %llx.%llx cap %p issued %s\n", inode, + ceph_vinop(inode), cap, ceph_cap_string(cap->issued)); have |= cap->issued; if (implemented) *implemented |= cap->implemented; @@ -842,16 +866,18 @@ int __ceph_caps_issued_other(struct ceph_inode_info *ci, struct ceph_cap *ocap) */ static void __touch_cap(struct ceph_cap *cap) { + struct inode *inode = &cap->ci->netfs.inode; struct ceph_mds_session *s = cap->session; + struct ceph_client *cl = s->s_mdsc->fsc->client; spin_lock(&s->s_cap_lock); if (!s->s_cap_iterator) { - dout("__touch_cap %p cap %p mds%d\n", &cap->ci->netfs.inode, cap, - s->s_mds); + doutc(cl, "%p %llx.%llx cap %p mds%d\n", inode, + ceph_vinop(inode), cap, s->s_mds); list_move_tail(&cap->session_caps, &s->s_caps); } else { - dout("__touch_cap %p cap %p mds%d NOP, iterating over caps\n", - &cap->ci->netfs.inode, cap, s->s_mds); + doutc(cl, "%p %llx.%llx cap %p mds%d NOP, iterating over caps\n", + inode, ceph_vinop(inode), cap, s->s_mds); } spin_unlock(&s->s_cap_lock); } @@ -863,15 +889,16 @@ static void __touch_cap(struct ceph_cap *cap) */ int __ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask, int touch) { + struct inode *inode = &ci->netfs.inode; + struct ceph_client *cl = ceph_inode_to_client(inode); struct ceph_cap *cap; struct rb_node *p; int have = ci->i_snap_caps; if ((have & mask) == mask) { - dout("__ceph_caps_issued_mask ino 0x%llx snap issued %s" - " (mask %s)\n", ceph_ino(&ci->netfs.inode), - ceph_cap_string(have), - ceph_cap_string(mask)); + doutc(cl, "mask %p %llx.%llx snap issued %s (mask %s)\n", + inode, ceph_vinop(inode), ceph_cap_string(have), + ceph_cap_string(mask)); return 1; } @@ -880,10 +907,10 @@ int __ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask, int touch) if (!__cap_is_valid(cap)) continue; if ((cap->issued & mask) == mask) { - dout("__ceph_caps_issued_mask ino 0x%llx cap %p issued %s" - " (mask %s)\n", ceph_ino(&ci->netfs.inode), cap, - ceph_cap_string(cap->issued), - ceph_cap_string(mask)); + doutc(cl, "mask %p %llx.%llx cap %p issued %s (mask %s)\n", + inode, ceph_vinop(inode), cap, + ceph_cap_string(cap->issued), + ceph_cap_string(mask)); if (touch) __touch_cap(cap); return 1; @@ -892,10 +919,10 @@ int __ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask, int touch) /* does a combination of caps satisfy mask? */ have |= cap->issued; if ((have & mask) == mask) { - dout("__ceph_caps_issued_mask ino 0x%llx combo issued %s" - " (mask %s)\n", ceph_ino(&ci->netfs.inode), - ceph_cap_string(cap->issued), - ceph_cap_string(mask)); + doutc(cl, "mask %p %llx.%llx combo issued %s (mask %s)\n", + inode, ceph_vinop(inode), + ceph_cap_string(cap->issued), + ceph_cap_string(mask)); if (touch) { struct rb_node *q; @@ -921,7 +948,7 @@ int __ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask, int touch) int __ceph_caps_issued_mask_metric(struct ceph_inode_info *ci, int mask, int touch) { - struct ceph_fs_client *fsc = ceph_sb_to_client(ci->netfs.inode.i_sb); + struct ceph_fs_client *fsc = ceph_sb_to_fs_client(ci->netfs.inode.i_sb); int r; r = __ceph_caps_issued_mask(ci, mask, touch); @@ -953,13 +980,14 @@ int __ceph_caps_revoking_other(struct ceph_inode_info *ci, int ceph_caps_revoking(struct ceph_inode_info *ci, int mask) { struct inode *inode = &ci->netfs.inode; + struct ceph_client *cl = ceph_inode_to_client(inode); int ret; spin_lock(&ci->i_ceph_lock); ret = __ceph_caps_revoking_other(ci, NULL, mask); spin_unlock(&ci->i_ceph_lock); - dout("ceph_caps_revoking %p %s = %d\n", inode, - ceph_cap_string(mask), ret); + doutc(cl, "%p %llx.%llx %s = %d\n", inode, ceph_vinop(inode), + ceph_cap_string(mask), ret); return ret; } @@ -995,7 +1023,7 @@ int __ceph_caps_file_wanted(struct ceph_inode_info *ci) const int WR_SHIFT = ffs(CEPH_FILE_MODE_WR); const int LAZY_SHIFT = ffs(CEPH_FILE_MODE_LAZY); struct ceph_mount_options *opt = - ceph_inode_to_client(&ci->netfs.inode)->mount_options; + ceph_inode_to_fs_client(&ci->netfs.inode)->mount_options; unsigned long used_cutoff = jiffies - opt->caps_wanted_delay_max * HZ; unsigned long idle_cutoff = jiffies - opt->caps_wanted_delay_min * HZ; @@ -1106,21 +1134,23 @@ int ceph_is_any_caps(struct inode *inode) void __ceph_remove_cap(struct ceph_cap *cap, bool queue_release) { struct ceph_mds_session *session = cap->session; + struct ceph_client *cl = session->s_mdsc->fsc->client; struct ceph_inode_info *ci = cap->ci; + struct inode *inode = &ci->netfs.inode; struct ceph_mds_client *mdsc; int removed = 0; /* 'ci' being NULL means the remove have already occurred */ if (!ci) { - dout("%s: cap inode is NULL\n", __func__); + doutc(cl, "inode is NULL\n"); return; } lockdep_assert_held(&ci->i_ceph_lock); - dout("__ceph_remove_cap %p from %p\n", cap, &ci->netfs.inode); + doutc(cl, "%p from %p %llx.%llx\n", cap, inode, ceph_vinop(inode)); - mdsc = ceph_inode_to_client(&ci->netfs.inode)->mdsc; + mdsc = ceph_inode_to_fs_client(&ci->netfs.inode)->mdsc; /* remove from inode's cap rbtree, and clear auth cap */ rb_erase(&cap->ci_node, &ci->i_caps); @@ -1131,8 +1161,8 @@ void __ceph_remove_cap(struct ceph_cap *cap, bool queue_release) spin_lock(&session->s_cap_lock); if (session->s_cap_iterator == cap) { /* not yet, we are iterating over this very cap */ - dout("__ceph_remove_cap delaying %p removal from session %p\n", - cap, cap->session); + doutc(cl, "delaying %p removal from session %p\n", cap, + cap->session); } else { list_del_init(&cap->session_caps); session->s_nr_caps--; @@ -1177,20 +1207,21 @@ void __ceph_remove_cap(struct ceph_cap *cap, bool queue_release) } } -void ceph_remove_cap(struct ceph_cap *cap, bool queue_release) +void ceph_remove_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap, + bool queue_release) { struct ceph_inode_info *ci = cap->ci; struct ceph_fs_client *fsc; /* 'ci' being NULL means the remove have already occurred */ if (!ci) { - dout("%s: cap inode is NULL\n", __func__); + doutc(mdsc->fsc->client, "inode is NULL\n"); return; } lockdep_assert_held(&ci->i_ceph_lock); - fsc = ceph_inode_to_client(&ci->netfs.inode); + fsc = ceph_inode_to_fs_client(&ci->netfs.inode); WARN_ON_ONCE(ci->i_auth_cap == cap && !list_empty(&ci->i_dirty_item) && !fsc->blocklisted && @@ -1216,31 +1247,31 @@ struct cap_msg_args { umode_t mode; bool inline_data; bool wake; + bool encrypted; + u32 fscrypt_auth_len; + u8 fscrypt_auth[sizeof(struct ceph_fscrypt_auth)]; // for context }; -/* - * cap struct size + flock buffer size + inline version + inline data size + - * osd_epoch_barrier + oldest_flush_tid - */ -#define CAP_MSG_SIZE (sizeof(struct ceph_mds_caps) + \ - 4 + 8 + 4 + 4 + 8 + 4 + 4 + 4 + 8 + 8 + 4) - /* Marshal up the cap msg to the MDS */ static void encode_cap_msg(struct ceph_msg *msg, struct cap_msg_args *arg) { struct ceph_mds_caps *fc; void *p; - struct ceph_osd_client *osdc = &arg->session->s_mdsc->fsc->client->osdc; - - dout("%s %s %llx %llx caps %s wanted %s dirty %s seq %u/%u tid %llu/%llu mseq %u follows %lld size %llu/%llu xattr_ver %llu xattr_len %d\n", - __func__, ceph_cap_op_name(arg->op), arg->cid, arg->ino, - ceph_cap_string(arg->caps), ceph_cap_string(arg->wanted), - ceph_cap_string(arg->dirty), arg->seq, arg->issue_seq, - arg->flush_tid, arg->oldest_flush_tid, arg->mseq, arg->follows, - arg->size, arg->max_size, arg->xattr_version, - arg->xattr_buf ? (int)arg->xattr_buf->vec.iov_len : 0); - - msg->hdr.version = cpu_to_le16(10); + struct ceph_mds_client *mdsc = arg->session->s_mdsc; + struct ceph_osd_client *osdc = &mdsc->fsc->client->osdc; + + doutc(mdsc->fsc->client, + "%s %llx %llx caps %s wanted %s dirty %s seq %u/%u" + " tid %llu/%llu mseq %u follows %lld size %llu/%llu" + " xattr_ver %llu xattr_len %d\n", + ceph_cap_op_name(arg->op), arg->cid, arg->ino, + ceph_cap_string(arg->caps), ceph_cap_string(arg->wanted), + ceph_cap_string(arg->dirty), arg->seq, arg->issue_seq, + arg->flush_tid, arg->oldest_flush_tid, arg->mseq, arg->follows, + arg->size, arg->max_size, arg->xattr_version, + arg->xattr_buf ? (int)arg->xattr_buf->vec.iov_len : 0); + + msg->hdr.version = cpu_to_le16(12); msg->hdr.tid = cpu_to_le64(arg->flush_tid); fc = msg->front.iov_base; @@ -1257,7 +1288,13 @@ static void encode_cap_msg(struct ceph_msg *msg, struct cap_msg_args *arg) fc->ino = cpu_to_le64(arg->ino); fc->snap_follows = cpu_to_le64(arg->follows); - fc->size = cpu_to_le64(arg->size); +#if IS_ENABLED(CONFIG_FS_ENCRYPTION) + if (arg->encrypted) + fc->size = cpu_to_le64(round_up(arg->size, + CEPH_FSCRYPT_BLOCK_SIZE)); + else +#endif + fc->size = cpu_to_le64(arg->size); fc->max_size = cpu_to_le64(arg->max_size); ceph_encode_timespec64(&fc->mtime, &arg->mtime); ceph_encode_timespec64(&fc->atime, &arg->atime); @@ -1311,6 +1348,27 @@ static void encode_cap_msg(struct ceph_msg *msg, struct cap_msg_args *arg) /* Advisory flags (version 10) */ ceph_encode_32(&p, arg->flags); + + /* dirstats (version 11) - these are r/o on the client */ + ceph_encode_64(&p, 0); + ceph_encode_64(&p, 0); + +#if IS_ENABLED(CONFIG_FS_ENCRYPTION) + /* + * fscrypt_auth and fscrypt_file (version 12) + * + * fscrypt_auth holds the crypto context (if any). fscrypt_file + * tracks the real i_size as an __le64 field (and we use a rounded-up + * i_size in the traditional size field). + */ + ceph_encode_32(&p, arg->fscrypt_auth_len); + ceph_encode_copy(&p, arg->fscrypt_auth, arg->fscrypt_auth_len); + ceph_encode_32(&p, sizeof(__le64)); + ceph_encode_64(&p, arg->size); +#else /* CONFIG_FS_ENCRYPTION */ + ceph_encode_32(&p, 0); + ceph_encode_32(&p, 0); +#endif /* CONFIG_FS_ENCRYPTION */ } /* @@ -1318,6 +1376,8 @@ static void encode_cap_msg(struct ceph_msg *msg, struct cap_msg_args *arg) */ void __ceph_remove_caps(struct ceph_inode_info *ci) { + struct inode *inode = &ci->netfs.inode; + struct ceph_mds_client *mdsc = ceph_inode_to_fs_client(inode)->mdsc; struct rb_node *p; /* lock i_ceph_lock, because ceph_d_revalidate(..., LOOKUP_RCU) @@ -1327,7 +1387,7 @@ void __ceph_remove_caps(struct ceph_inode_info *ci) while (p) { struct ceph_cap *cap = rb_entry(p, struct ceph_cap, ci_node); p = rb_next(p); - ceph_remove_cap(cap, true); + ceph_remove_cap(mdsc, cap, true); } spin_unlock(&ci->i_ceph_lock); } @@ -1346,6 +1406,7 @@ static void __prep_cap(struct cap_msg_args *arg, struct ceph_cap *cap, { struct ceph_inode_info *ci = cap->ci; struct inode *inode = &ci->netfs.inode; + struct ceph_client *cl = ceph_inode_to_client(inode); int held, revoking; lockdep_assert_held(&ci->i_ceph_lock); @@ -1354,10 +1415,10 @@ static void __prep_cap(struct cap_msg_args *arg, struct ceph_cap *cap, revoking = cap->implemented & ~cap->issued; retain &= ~revoking; - dout("%s %p cap %p session %p %s -> %s (revoking %s)\n", - __func__, inode, cap, cap->session, - ceph_cap_string(held), ceph_cap_string(held & retain), - ceph_cap_string(revoking)); + doutc(cl, "%p %llx.%llx cap %p session %p %s -> %s (revoking %s)\n", + inode, ceph_vinop(inode), cap, cap->session, + ceph_cap_string(held), ceph_cap_string(held & retain), + ceph_cap_string(revoking)); BUG_ON((retain & CEPH_CAP_PIN) == 0); ci->i_ceph_flags &= ~CEPH_I_FLUSH; @@ -1378,7 +1439,6 @@ static void __prep_cap(struct cap_msg_args *arg, struct ceph_cap *cap, arg->follows = flushing ? ci->i_head_snapc->seq : 0; arg->flush_tid = flush_tid; arg->oldest_flush_tid = oldest_flush_tid; - arg->size = i_size_read(inode); ci->i_reported_size = arg->size; arg->max_size = ci->i_wanted_max_size; @@ -1398,9 +1458,9 @@ static void __prep_cap(struct cap_msg_args *arg, struct ceph_cap *cap, arg->old_xattr_buf = NULL; } - arg->mtime = inode->i_mtime; - arg->atime = inode->i_atime; - arg->ctime = inode->i_ctime; + arg->mtime = inode_get_mtime(inode); + arg->atime = inode_get_atime(inode); + arg->ctime = inode_get_ctime(inode); arg->btime = ci->i_btime; arg->change_attr = inode_peek_iversion_raw(inode); @@ -1432,7 +1492,38 @@ static void __prep_cap(struct cap_msg_args *arg, struct ceph_cap *cap, } } arg->flags = flags; + arg->encrypted = IS_ENCRYPTED(inode); +#if IS_ENABLED(CONFIG_FS_ENCRYPTION) + if (ci->fscrypt_auth_len && + WARN_ON_ONCE(ci->fscrypt_auth_len > sizeof(struct ceph_fscrypt_auth))) { + /* Don't set this if it's too big */ + arg->fscrypt_auth_len = 0; + } else { + arg->fscrypt_auth_len = ci->fscrypt_auth_len; + memcpy(arg->fscrypt_auth, ci->fscrypt_auth, + min_t(size_t, ci->fscrypt_auth_len, + sizeof(arg->fscrypt_auth))); + } +#endif /* CONFIG_FS_ENCRYPTION */ +} + +#if IS_ENABLED(CONFIG_FS_ENCRYPTION) +#define CAP_MSG_FIXED_FIELDS (sizeof(struct ceph_mds_caps) + \ + 4 + 8 + 4 + 4 + 8 + 4 + 4 + 4 + 8 + 8 + 4 + 8 + 8 + 4 + 4 + 8) + +static inline int cap_msg_size(struct cap_msg_args *arg) +{ + return CAP_MSG_FIXED_FIELDS + arg->fscrypt_auth_len; +} +#else +#define CAP_MSG_FIXED_FIELDS (sizeof(struct ceph_mds_caps) + \ + 4 + 8 + 4 + 4 + 8 + 4 + 4 + 4 + 8 + 8 + 4 + 8 + 8 + 4 + 4) + +static inline int cap_msg_size(struct cap_msg_args *arg) +{ + return CAP_MSG_FIXED_FIELDS; } +#endif /* CONFIG_FS_ENCRYPTION */ /* * Send a cap msg on the given inode. @@ -1443,12 +1534,16 @@ static void __send_cap(struct cap_msg_args *arg, struct ceph_inode_info *ci) { struct ceph_msg *msg; struct inode *inode = &ci->netfs.inode; + struct ceph_client *cl = ceph_inode_to_client(inode); - msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPS, CAP_MSG_SIZE, GFP_NOFS, false); + msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPS, cap_msg_size(arg), GFP_NOFS, + false); if (!msg) { - pr_err("error allocating cap msg: ino (%llx.%llx) flushing %s tid %llu, requeuing cap.\n", - ceph_vinop(inode), ceph_cap_string(arg->dirty), - arg->flush_tid); + pr_err_client(cl, + "error allocating cap msg: ino (%llx.%llx)" + " flushing %s tid %llu, requeuing cap.\n", + ceph_vinop(inode), ceph_cap_string(arg->dirty), + arg->flush_tid); spin_lock(&ci->i_ceph_lock); __cap_delay_requeue(arg->session->s_mdsc, ci); spin_unlock(&ci->i_ceph_lock); @@ -1470,10 +1565,6 @@ static inline int __send_flush_snap(struct inode *inode, struct cap_msg_args arg; struct ceph_msg *msg; - msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPS, CAP_MSG_SIZE, GFP_NOFS, false); - if (!msg) - return -ENOMEM; - arg.session = session; arg.ino = ceph_vino(inode).ino; arg.cid = 0; @@ -1510,6 +1601,15 @@ static inline int __send_flush_snap(struct inode *inode, arg.inline_data = capsnap->inline_data; arg.flags = 0; arg.wake = false; + arg.encrypted = IS_ENCRYPTED(inode); + + /* No fscrypt_auth changes from a capsnap.*/ + arg.fscrypt_auth_len = 0; + + msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPS, cap_msg_size(&arg), + GFP_NOFS, false); + if (!msg) + return -ENOMEM; encode_cap_msg(msg, &arg); ceph_con_send(&arg.session->s_con, msg); @@ -1532,11 +1632,13 @@ static void __ceph_flush_snaps(struct ceph_inode_info *ci, { struct inode *inode = &ci->netfs.inode; struct ceph_mds_client *mdsc = session->s_mdsc; + struct ceph_client *cl = mdsc->fsc->client; struct ceph_cap_snap *capsnap; u64 oldest_flush_tid = 0; u64 first_tid = 1, last_tid = 0; - dout("__flush_snaps %p session %p\n", inode, session); + doutc(cl, "%p %llx.%llx session %p\n", inode, ceph_vinop(inode), + session); list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) { /* @@ -1551,7 +1653,7 @@ static void __ceph_flush_snaps(struct ceph_inode_info *ci, /* only flush each capsnap once */ if (capsnap->cap_flush.tid > 0) { - dout(" already flushed %p, skipping\n", capsnap); + doutc(cl, "already flushed %p, skipping\n", capsnap); continue; } @@ -1583,8 +1685,8 @@ static void __ceph_flush_snaps(struct ceph_inode_info *ci, int ret; if (!(cap && cap->session == session)) { - dout("__flush_snaps %p auth cap %p not mds%d, " - "stop\n", inode, cap, session->s_mds); + doutc(cl, "%p %llx.%llx auth cap %p not mds%d, stop\n", + inode, ceph_vinop(inode), cap, session->s_mds); break; } @@ -1605,15 +1707,17 @@ static void __ceph_flush_snaps(struct ceph_inode_info *ci, refcount_inc(&capsnap->nref); spin_unlock(&ci->i_ceph_lock); - dout("__flush_snaps %p capsnap %p tid %llu %s\n", - inode, capsnap, cf->tid, ceph_cap_string(capsnap->dirty)); + doutc(cl, "%p %llx.%llx capsnap %p tid %llu %s\n", inode, + ceph_vinop(inode), capsnap, cf->tid, + ceph_cap_string(capsnap->dirty)); ret = __send_flush_snap(inode, session, capsnap, cap->mseq, oldest_flush_tid); if (ret < 0) { - pr_err("__flush_snaps: error sending cap flushsnap, " - "ino (%llx.%llx) tid %llu follows %llu\n", - ceph_vinop(inode), cf->tid, capsnap->follows); + pr_err_client(cl, "error sending cap flushsnap, " + "ino (%llx.%llx) tid %llu follows %llu\n", + ceph_vinop(inode), cf->tid, + capsnap->follows); } ceph_put_cap_snap(capsnap); @@ -1625,28 +1729,29 @@ void ceph_flush_snaps(struct ceph_inode_info *ci, struct ceph_mds_session **psession) { struct inode *inode = &ci->netfs.inode; - struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc; + struct ceph_mds_client *mdsc = ceph_inode_to_fs_client(inode)->mdsc; + struct ceph_client *cl = ceph_inode_to_client(inode); struct ceph_mds_session *session = NULL; bool need_put = false; int mds; - dout("ceph_flush_snaps %p\n", inode); + doutc(cl, "%p %llx.%llx\n", inode, ceph_vinop(inode)); if (psession) session = *psession; retry: spin_lock(&ci->i_ceph_lock); if (!(ci->i_ceph_flags & CEPH_I_FLUSH_SNAPS)) { - dout(" no capsnap needs flush, doing nothing\n"); + doutc(cl, " no capsnap needs flush, doing nothing\n"); goto out; } if (!ci->i_auth_cap) { - dout(" no auth cap (migrating?), doing nothing\n"); + doutc(cl, " no auth cap (migrating?), doing nothing\n"); goto out; } mds = ci->i_auth_cap->session->s_mds; if (session && session->s_mds != mds) { - dout(" oops, wrong session %p mutex\n", session); + doutc(cl, " oops, wrong session %p mutex\n", session); ceph_put_mds_session(session); session = NULL; } @@ -1690,23 +1795,25 @@ int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask, struct ceph_cap_flush **pcf) { struct ceph_mds_client *mdsc = - ceph_sb_to_client(ci->netfs.inode.i_sb)->mdsc; + ceph_sb_to_fs_client(ci->netfs.inode.i_sb)->mdsc; struct inode *inode = &ci->netfs.inode; + struct ceph_client *cl = ceph_inode_to_client(inode); int was = ci->i_dirty_caps; int dirty = 0; lockdep_assert_held(&ci->i_ceph_lock); if (!ci->i_auth_cap) { - pr_warn("__mark_dirty_caps %p %llx mask %s, " - "but no auth cap (session was closed?)\n", - inode, ceph_ino(inode), ceph_cap_string(mask)); + pr_warn_client(cl, "%p %llx.%llx mask %s, " + "but no auth cap (session was closed?)\n", + inode, ceph_vinop(inode), + ceph_cap_string(mask)); return 0; } - dout("__mark_dirty_caps %p %s dirty %s -> %s\n", &ci->netfs.inode, - ceph_cap_string(mask), ceph_cap_string(was), - ceph_cap_string(was | mask)); + doutc(cl, "%p %llx.%llx %s dirty %s -> %s\n", inode, + ceph_vinop(inode), ceph_cap_string(mask), + ceph_cap_string(was), ceph_cap_string(was | mask)); ci->i_dirty_caps |= mask; if (was == 0) { struct ceph_mds_session *session = ci->i_auth_cap->session; @@ -1719,8 +1826,9 @@ int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask, ci->i_head_snapc = ceph_get_snap_context( ci->i_snap_realm->cached_context); } - dout(" inode %p now dirty snapc %p auth cap %p\n", - &ci->netfs.inode, ci->i_head_snapc, ci->i_auth_cap); + doutc(cl, "%p %llx.%llx now dirty snapc %p auth cap %p\n", + inode, ceph_vinop(inode), ci->i_head_snapc, + ci->i_auth_cap); BUG_ON(!list_empty(&ci->i_dirty_item)); spin_lock(&mdsc->cap_dirty_lock); list_add(&ci->i_dirty_item, &session->s_cap_dirty); @@ -1813,7 +1921,8 @@ static u64 __mark_caps_flushing(struct inode *inode, struct ceph_mds_session *session, bool wake, u64 *oldest_flush_tid) { - struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc; + struct ceph_mds_client *mdsc = ceph_sb_to_fs_client(inode->i_sb)->mdsc; + struct ceph_client *cl = ceph_inode_to_client(inode); struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_cap_flush *cf = NULL; int flushing; @@ -1824,13 +1933,13 @@ static u64 __mark_caps_flushing(struct inode *inode, BUG_ON(!ci->i_prealloc_cap_flush); flushing = ci->i_dirty_caps; - dout("__mark_caps_flushing flushing %s, flushing_caps %s -> %s\n", - ceph_cap_string(flushing), - ceph_cap_string(ci->i_flushing_caps), - ceph_cap_string(ci->i_flushing_caps | flushing)); + doutc(cl, "flushing %s, flushing_caps %s -> %s\n", + ceph_cap_string(flushing), + ceph_cap_string(ci->i_flushing_caps), + ceph_cap_string(ci->i_flushing_caps | flushing)); ci->i_flushing_caps |= flushing; ci->i_dirty_caps = 0; - dout(" inode %p now !dirty\n", inode); + doutc(cl, "%p %llx.%llx now !dirty\n", inode, ceph_vinop(inode)); swap(cf, ci->i_prealloc_cap_flush); cf->caps = flushing; @@ -1861,6 +1970,7 @@ static int try_nonblocking_invalidate(struct inode *inode) __releases(ci->i_ceph_lock) __acquires(ci->i_ceph_lock) { + struct ceph_client *cl = ceph_inode_to_client(inode); struct ceph_inode_info *ci = ceph_inode(inode); u32 invalidating_gen = ci->i_rdcache_gen; @@ -1872,12 +1982,13 @@ static int try_nonblocking_invalidate(struct inode *inode) if (inode->i_data.nrpages == 0 && invalidating_gen == ci->i_rdcache_gen) { /* success. */ - dout("try_nonblocking_invalidate %p success\n", inode); + doutc(cl, "%p %llx.%llx success\n", inode, + ceph_vinop(inode)); /* save any racing async invalidate some trouble */ ci->i_rdcache_revoking = ci->i_rdcache_gen - 1; return 0; } - dout("try_nonblocking_invalidate %p failed\n", inode); + doutc(cl, "%p %llx.%llx failed\n", inode, ceph_vinop(inode)); return -1; } @@ -1909,6 +2020,7 @@ void ceph_check_caps(struct ceph_inode_info *ci, int flags) { struct inode *inode = &ci->netfs.inode; struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb); + struct ceph_client *cl = ceph_inode_to_client(inode); struct ceph_cap *cap; u64 flush_tid, oldest_flush_tid; int file_wanted, used, cap_used; @@ -1983,9 +2095,9 @@ retry: } } - dout("check_caps %llx.%llx file_want %s used %s dirty %s flushing %s" - " issued %s revoking %s retain %s %s%s%s\n", ceph_vinop(inode), - ceph_cap_string(file_wanted), + doutc(cl, "%p %llx.%llx file_want %s used %s dirty %s " + "flushing %s issued %s revoking %s retain %s %s%s%s\n", + inode, ceph_vinop(inode), ceph_cap_string(file_wanted), ceph_cap_string(used), ceph_cap_string(ci->i_dirty_caps), ceph_cap_string(ci->i_flushing_caps), ceph_cap_string(issued), ceph_cap_string(revoking), @@ -2006,10 +2118,10 @@ retry: (revoking & (CEPH_CAP_FILE_CACHE| CEPH_CAP_FILE_LAZYIO)) && /* or revoking cache */ !tried_invalidate) { - dout("check_caps trying to invalidate on %llx.%llx\n", - ceph_vinop(inode)); + doutc(cl, "trying to invalidate on %p %llx.%llx\n", + inode, ceph_vinop(inode)); if (try_nonblocking_invalidate(inode) < 0) { - dout("check_caps queuing invalidate\n"); + doutc(cl, "queuing invalidate\n"); queue_invalidate = true; ci->i_rdcache_revoking = ci->i_rdcache_gen; } @@ -2037,35 +2149,35 @@ retry: cap_used &= ~ci->i_auth_cap->issued; revoking = cap->implemented & ~cap->issued; - dout(" mds%d cap %p used %s issued %s implemented %s revoking %s\n", - cap->mds, cap, ceph_cap_string(cap_used), - ceph_cap_string(cap->issued), - ceph_cap_string(cap->implemented), - ceph_cap_string(revoking)); + doutc(cl, " mds%d cap %p used %s issued %s implemented %s revoking %s\n", + cap->mds, cap, ceph_cap_string(cap_used), + ceph_cap_string(cap->issued), + ceph_cap_string(cap->implemented), + ceph_cap_string(revoking)); if (cap == ci->i_auth_cap && (cap->issued & CEPH_CAP_FILE_WR)) { /* request larger max_size from MDS? */ if (ci->i_wanted_max_size > ci->i_max_size && ci->i_wanted_max_size > ci->i_requested_max_size) { - dout("requesting new max_size\n"); + doutc(cl, "requesting new max_size\n"); goto ack; } /* approaching file_max? */ if (__ceph_should_report_size(ci)) { - dout("i_size approaching max_size\n"); + doutc(cl, "i_size approaching max_size\n"); goto ack; } } /* flush anything dirty? */ if (cap == ci->i_auth_cap) { if ((flags & CHECK_CAPS_FLUSH) && ci->i_dirty_caps) { - dout("flushing dirty caps\n"); + doutc(cl, "flushing dirty caps\n"); goto ack; } if (ci->i_ceph_flags & CEPH_I_FLUSH_SNAPS) { - dout("flushing snap caps\n"); + doutc(cl, "flushing snap caps\n"); goto ack; } } @@ -2073,7 +2185,7 @@ retry: /* completed revocation? going down and there are no caps? */ if (revoking) { if ((revoking & cap_used) == 0) { - dout("completed revocation of %s\n", + doutc(cl, "completed revocation of %s\n", ceph_cap_string(cap->implemented & ~cap->issued)); goto ack; } @@ -2172,7 +2284,7 @@ ack: */ static int try_flush_caps(struct inode *inode, u64 *ptid) { - struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc; + struct ceph_mds_client *mdsc = ceph_sb_to_fs_client(inode->i_sb)->mdsc; struct ceph_inode_info *ci = ceph_inode(inode); int flushing = 0; u64 flush_tid = 0, oldest_flush_tid = 0; @@ -2250,7 +2362,8 @@ static int caps_are_flushed(struct inode *inode, u64 flush_tid) */ static int flush_mdlog_and_wait_inode_unsafe_requests(struct inode *inode) { - struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc; + struct ceph_mds_client *mdsc = ceph_sb_to_fs_client(inode->i_sb)->mdsc; + struct ceph_client *cl = ceph_inode_to_client(inode); struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_mds_request *req1 = NULL, *req2 = NULL; int ret, err = 0; @@ -2340,8 +2453,9 @@ static int flush_mdlog_and_wait_inode_unsafe_requests(struct inode *inode) kfree(sessions); } - dout("%s %p wait on tid %llu %llu\n", __func__, - inode, req1 ? req1->r_tid : 0ULL, req2 ? req2->r_tid : 0ULL); + doutc(cl, "%p %llx.%llx wait on tid %llu %llu\n", inode, + ceph_vinop(inode), req1 ? req1->r_tid : 0ULL, + req2 ? req2->r_tid : 0ULL); if (req1) { ret = !wait_for_completion_timeout(&req1->r_safe_completion, ceph_timeout_jiffies(req1->r_timeout)); @@ -2367,11 +2481,13 @@ int ceph_fsync(struct file *file, loff_t start, loff_t end, int datasync) { struct inode *inode = file->f_mapping->host; struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_client *cl = ceph_inode_to_client(inode); u64 flush_tid; int ret, err; int dirty; - dout("fsync %p%s\n", inode, datasync ? " datasync" : ""); + doutc(cl, "%p %llx.%llx%s\n", inode, ceph_vinop(inode), + datasync ? " datasync" : ""); ret = file_write_and_wait_range(file, start, end); if (datasync) @@ -2382,7 +2498,7 @@ int ceph_fsync(struct file *file, loff_t start, loff_t end, int datasync) goto out; dirty = try_flush_caps(inode, &flush_tid); - dout("fsync dirty caps are %s\n", ceph_cap_string(dirty)); + doutc(cl, "dirty caps are %s\n", ceph_cap_string(dirty)); err = flush_mdlog_and_wait_inode_unsafe_requests(inode); @@ -2403,7 +2519,8 @@ int ceph_fsync(struct file *file, loff_t start, loff_t end, int datasync) if (err < 0) ret = err; out: - dout("fsync %p%s result=%d\n", inode, datasync ? " datasync" : "", ret); + doutc(cl, "%p %llx.%llx%s result=%d\n", inode, ceph_vinop(inode), + datasync ? " datasync" : "", ret); return ret; } @@ -2416,12 +2533,13 @@ out: int ceph_write_inode(struct inode *inode, struct writeback_control *wbc) { struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_client *cl = ceph_inode_to_client(inode); u64 flush_tid; int err = 0; int dirty; int wait = (wbc->sync_mode == WB_SYNC_ALL && !wbc->for_sync); - dout("write_inode %p wait=%d\n", inode, wait); + doutc(cl, "%p %llx.%llx wait=%d\n", inode, ceph_vinop(inode), wait); ceph_fscache_unpin_writeback(inode, wbc); if (wait) { err = ceph_wait_on_async_create(inode); @@ -2433,7 +2551,7 @@ int ceph_write_inode(struct inode *inode, struct writeback_control *wbc) caps_are_flushed(inode, flush_tid)); } else { struct ceph_mds_client *mdsc = - ceph_sb_to_client(inode->i_sb)->mdsc; + ceph_sb_to_fs_client(inode->i_sb)->mdsc; spin_lock(&ci->i_ceph_lock); if (__ceph_caps_dirty(ci)) @@ -2451,6 +2569,7 @@ static void __kick_flushing_caps(struct ceph_mds_client *mdsc, __acquires(ci->i_ceph_lock) { struct inode *inode = &ci->netfs.inode; + struct ceph_client *cl = mdsc->fsc->client; struct ceph_cap *cap; struct ceph_cap_flush *cf; int ret; @@ -2476,8 +2595,8 @@ static void __kick_flushing_caps(struct ceph_mds_client *mdsc, cap = ci->i_auth_cap; if (!(cap && cap->session == session)) { - pr_err("%p auth cap %p not mds%d ???\n", - inode, cap, session->s_mds); + pr_err_client(cl, "%p auth cap %p not mds%d ???\n", + inode, cap, session->s_mds); break; } @@ -2486,8 +2605,9 @@ static void __kick_flushing_caps(struct ceph_mds_client *mdsc, if (!cf->is_capsnap) { struct cap_msg_args arg; - dout("kick_flushing_caps %p cap %p tid %llu %s\n", - inode, cap, cf->tid, ceph_cap_string(cf->caps)); + doutc(cl, "%p %llx.%llx cap %p tid %llu %s\n", + inode, ceph_vinop(inode), cap, cf->tid, + ceph_cap_string(cf->caps)); __prep_cap(&arg, cap, CEPH_CAP_OP_FLUSH, (cf->tid < last_snap_flush ? CEPH_CLIENT_CAPS_PENDING_CAPSNAP : 0), @@ -2501,9 +2621,9 @@ static void __kick_flushing_caps(struct ceph_mds_client *mdsc, struct ceph_cap_snap *capsnap = container_of(cf, struct ceph_cap_snap, cap_flush); - dout("kick_flushing_caps %p capsnap %p tid %llu %s\n", - inode, capsnap, cf->tid, - ceph_cap_string(capsnap->dirty)); + doutc(cl, "%p %llx.%llx capsnap %p tid %llu %s\n", + inode, ceph_vinop(inode), capsnap, cf->tid, + ceph_cap_string(capsnap->dirty)); refcount_inc(&capsnap->nref); spin_unlock(&ci->i_ceph_lock); @@ -2511,11 +2631,10 @@ static void __kick_flushing_caps(struct ceph_mds_client *mdsc, ret = __send_flush_snap(inode, session, capsnap, cap->mseq, oldest_flush_tid); if (ret < 0) { - pr_err("kick_flushing_caps: error sending " - "cap flushsnap, ino (%llx.%llx) " - "tid %llu follows %llu\n", - ceph_vinop(inode), cf->tid, - capsnap->follows); + pr_err_client(cl, "error sending cap flushsnap," + " %p %llx.%llx tid %llu follows %llu\n", + inode, ceph_vinop(inode), cf->tid, + capsnap->follows); } ceph_put_cap_snap(capsnap); @@ -2528,22 +2647,26 @@ static void __kick_flushing_caps(struct ceph_mds_client *mdsc, void ceph_early_kick_flushing_caps(struct ceph_mds_client *mdsc, struct ceph_mds_session *session) { + struct ceph_client *cl = mdsc->fsc->client; struct ceph_inode_info *ci; struct ceph_cap *cap; u64 oldest_flush_tid; - dout("early_kick_flushing_caps mds%d\n", session->s_mds); + doutc(cl, "mds%d\n", session->s_mds); spin_lock(&mdsc->cap_dirty_lock); oldest_flush_tid = __get_oldest_flush_tid(mdsc); spin_unlock(&mdsc->cap_dirty_lock); list_for_each_entry(ci, &session->s_cap_flushing, i_flushing_item) { + struct inode *inode = &ci->netfs.inode; + spin_lock(&ci->i_ceph_lock); cap = ci->i_auth_cap; if (!(cap && cap->session == session)) { - pr_err("%p auth cap %p not mds%d ???\n", - &ci->netfs.inode, cap, session->s_mds); + pr_err_client(cl, "%p %llx.%llx auth cap %p not mds%d ???\n", + inode, ceph_vinop(inode), cap, + session->s_mds); spin_unlock(&ci->i_ceph_lock); continue; } @@ -2576,24 +2699,28 @@ void ceph_early_kick_flushing_caps(struct ceph_mds_client *mdsc, void ceph_kick_flushing_caps(struct ceph_mds_client *mdsc, struct ceph_mds_session *session) { + struct ceph_client *cl = mdsc->fsc->client; struct ceph_inode_info *ci; struct ceph_cap *cap; u64 oldest_flush_tid; lockdep_assert_held(&session->s_mutex); - dout("kick_flushing_caps mds%d\n", session->s_mds); + doutc(cl, "mds%d\n", session->s_mds); spin_lock(&mdsc->cap_dirty_lock); oldest_flush_tid = __get_oldest_flush_tid(mdsc); spin_unlock(&mdsc->cap_dirty_lock); list_for_each_entry(ci, &session->s_cap_flushing, i_flushing_item) { + struct inode *inode = &ci->netfs.inode; + spin_lock(&ci->i_ceph_lock); cap = ci->i_auth_cap; if (!(cap && cap->session == session)) { - pr_err("%p auth cap %p not mds%d ???\n", - &ci->netfs.inode, cap, session->s_mds); + pr_err_client(cl, "%p %llx.%llx auth cap %p not mds%d ???\n", + inode, ceph_vinop(inode), cap, + session->s_mds); spin_unlock(&ci->i_ceph_lock); continue; } @@ -2610,11 +2737,13 @@ void ceph_kick_flushing_inode_caps(struct ceph_mds_session *session, { struct ceph_mds_client *mdsc = session->s_mdsc; struct ceph_cap *cap = ci->i_auth_cap; + struct inode *inode = &ci->netfs.inode; lockdep_assert_held(&ci->i_ceph_lock); - dout("%s %p flushing %s\n", __func__, &ci->netfs.inode, - ceph_cap_string(ci->i_flushing_caps)); + doutc(mdsc->fsc->client, "%p %llx.%llx flushing %s\n", + inode, ceph_vinop(inode), + ceph_cap_string(ci->i_flushing_caps)); if (!list_empty(&ci->i_cap_flush_list)) { u64 oldest_flush_tid; @@ -2636,6 +2765,9 @@ void ceph_kick_flushing_inode_caps(struct ceph_mds_session *session, void ceph_take_cap_refs(struct ceph_inode_info *ci, int got, bool snap_rwsem_locked) { + struct inode *inode = &ci->netfs.inode; + struct ceph_client *cl = ceph_inode_to_client(inode); + lockdep_assert_held(&ci->i_ceph_lock); if (got & CEPH_CAP_PIN) @@ -2656,10 +2788,10 @@ void ceph_take_cap_refs(struct ceph_inode_info *ci, int got, } if (got & CEPH_CAP_FILE_BUFFER) { if (ci->i_wb_ref == 0) - ihold(&ci->netfs.inode); + ihold(inode); ci->i_wb_ref++; - dout("%s %p wb %d -> %d (?)\n", __func__, - &ci->netfs.inode, ci->i_wb_ref-1, ci->i_wb_ref); + doutc(cl, "%p %llx.%llx wb %d -> %d (?)\n", inode, + ceph_vinop(inode), ci->i_wb_ref-1, ci->i_wb_ref); } } @@ -2686,20 +2818,23 @@ static int try_get_cap_refs(struct inode *inode, int need, int want, loff_t endoff, int flags, int *got) { struct ceph_inode_info *ci = ceph_inode(inode); - struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc; + struct ceph_mds_client *mdsc = ceph_inode_to_fs_client(inode)->mdsc; + struct ceph_client *cl = ceph_inode_to_client(inode); int ret = 0; int have, implemented; bool snap_rwsem_locked = false; - dout("get_cap_refs %p need %s want %s\n", inode, - ceph_cap_string(need), ceph_cap_string(want)); + doutc(cl, "%p %llx.%llx need %s want %s\n", inode, + ceph_vinop(inode), ceph_cap_string(need), + ceph_cap_string(want)); again: spin_lock(&ci->i_ceph_lock); if ((flags & CHECK_FILELOCK) && (ci->i_ceph_flags & CEPH_I_ERROR_FILELOCK)) { - dout("try_get_cap_refs %p error filelock\n", inode); + doutc(cl, "%p %llx.%llx error filelock\n", inode, + ceph_vinop(inode)); ret = -EIO; goto out_unlock; } @@ -2719,8 +2854,8 @@ again: if (have & need & CEPH_CAP_FILE_WR) { if (endoff >= 0 && endoff > (loff_t)ci->i_max_size) { - dout("get_cap_refs %p endoff %llu > maxsize %llu\n", - inode, endoff, ci->i_max_size); + doutc(cl, "%p %llx.%llx endoff %llu > maxsize %llu\n", + inode, ceph_vinop(inode), endoff, ci->i_max_size); if (endoff > ci->i_requested_max_size) ret = ci->i_auth_cap ? -EFBIG : -EUCLEAN; goto out_unlock; @@ -2730,7 +2865,8 @@ again: * can get a final snapshot value for size+mtime. */ if (__ceph_have_pending_cap_snap(ci)) { - dout("get_cap_refs %p cap_snap_pending\n", inode); + doutc(cl, "%p %llx.%llx cap_snap_pending\n", inode, + ceph_vinop(inode)); goto out_unlock; } } @@ -2748,9 +2884,9 @@ again: int not = want & ~(have & need); int revoking = implemented & ~have; int exclude = revoking & not; - dout("get_cap_refs %p have %s but not %s (revoking %s)\n", - inode, ceph_cap_string(have), ceph_cap_string(not), - ceph_cap_string(revoking)); + doutc(cl, "%p %llx.%llx have %s but not %s (revoking %s)\n", + inode, ceph_vinop(inode), ceph_cap_string(have), + ceph_cap_string(not), ceph_cap_string(revoking)); if (!exclude || !(exclude & CEPH_CAP_FILE_BUFFER)) { if (!snap_rwsem_locked && !ci->i_head_snapc && @@ -2790,28 +2926,31 @@ again: spin_unlock(&s->s_cap_lock); } if (session_readonly) { - dout("get_cap_refs %p need %s but mds%d readonly\n", - inode, ceph_cap_string(need), ci->i_auth_cap->mds); + doutc(cl, "%p %llx.%llx need %s but mds%d readonly\n", + inode, ceph_vinop(inode), ceph_cap_string(need), + ci->i_auth_cap->mds); ret = -EROFS; goto out_unlock; } if (ceph_inode_is_shutdown(inode)) { - dout("get_cap_refs %p inode is shutdown\n", inode); + doutc(cl, "%p %llx.%llx inode is shutdown\n", + inode, ceph_vinop(inode)); ret = -ESTALE; goto out_unlock; } mds_wanted = __ceph_caps_mds_wanted(ci, false); if (need & ~mds_wanted) { - dout("get_cap_refs %p need %s > mds_wanted %s\n", - inode, ceph_cap_string(need), - ceph_cap_string(mds_wanted)); + doutc(cl, "%p %llx.%llx need %s > mds_wanted %s\n", + inode, ceph_vinop(inode), ceph_cap_string(need), + ceph_cap_string(mds_wanted)); ret = -EUCLEAN; goto out_unlock; } - dout("get_cap_refs %p have %s need %s\n", inode, - ceph_cap_string(have), ceph_cap_string(need)); + doutc(cl, "%p %llx.%llx have %s need %s\n", inode, + ceph_vinop(inode), ceph_cap_string(have), + ceph_cap_string(need)); } out_unlock: @@ -2826,8 +2965,8 @@ out_unlock: else if (ret == 1) ceph_update_cap_hit(&mdsc->metric); - dout("get_cap_refs %p ret %d got %s\n", inode, - ret, ceph_cap_string(*got)); + doutc(cl, "%p %llx.%llx ret %d got %s\n", inode, + ceph_vinop(inode), ret, ceph_cap_string(*got)); return ret; } @@ -2839,13 +2978,14 @@ out_unlock: static void check_max_size(struct inode *inode, loff_t endoff) { struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_client *cl = ceph_inode_to_client(inode); int check = 0; /* do we need to explicitly request a larger max_size? */ spin_lock(&ci->i_ceph_lock); if (endoff >= ci->i_max_size && endoff > ci->i_wanted_max_size) { - dout("write %p at large endoff %llu, req max_size\n", - inode, endoff); + doutc(cl, "write %p %llx.%llx at large endoff %llu, req max_size\n", + inode, ceph_vinop(inode), endoff); ci->i_wanted_max_size = endoff; } /* duplicate ceph_check_caps()'s logic */ @@ -2900,19 +3040,18 @@ int ceph_try_get_caps(struct inode *inode, int need, int want, * due to a small max_size, make sure we check_max_size (and possibly * ask the mds) so we don't get hung up indefinitely. */ -int ceph_get_caps(struct file *filp, int need, int want, loff_t endoff, int *got) +int __ceph_get_caps(struct inode *inode, struct ceph_file_info *fi, int need, + int want, loff_t endoff, int *got) { - struct ceph_file_info *fi = filp->private_data; - struct inode *inode = file_inode(filp); struct ceph_inode_info *ci = ceph_inode(inode); - struct ceph_fs_client *fsc = ceph_inode_to_client(inode); + struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode); int ret, _got, flags; ret = ceph_pool_perm_check(inode, need); if (ret < 0) return ret; - if ((fi->fmode & CEPH_FILE_MODE_WR) && + if (fi && (fi->fmode & CEPH_FILE_MODE_WR) && fi->filp_gen != READ_ONCE(fsc->filp_gen)) return -EBADF; @@ -2965,7 +3104,7 @@ int ceph_get_caps(struct file *filp, int need, int want, loff_t endoff, int *got continue; } - if ((fi->fmode & CEPH_FILE_MODE_WR) && + if (fi && (fi->fmode & CEPH_FILE_MODE_WR) && fi->filp_gen != READ_ONCE(fsc->filp_gen)) { if (ret >= 0 && _got) ceph_put_cap_refs(ci, _got); @@ -3028,6 +3167,15 @@ int ceph_get_caps(struct file *filp, int need, int want, loff_t endoff, int *got return 0; } +int ceph_get_caps(struct file *filp, int need, int want, loff_t endoff, + int *got) +{ + struct ceph_file_info *fi = filp->private_data; + struct inode *inode = file_inode(filp); + + return __ceph_get_caps(inode, fi, need, want, endoff, got); +} + /* * Take cap refs. Caller must already know we hold at least one ref * on the caps in question or we don't know this is safe. @@ -3047,10 +3195,12 @@ void ceph_get_cap_refs(struct ceph_inode_info *ci, int caps) static int ceph_try_drop_cap_snap(struct ceph_inode_info *ci, struct ceph_cap_snap *capsnap) { + struct inode *inode = &ci->netfs.inode; + struct ceph_client *cl = ceph_inode_to_client(inode); + if (!capsnap->need_flush && !capsnap->writing && !capsnap->dirty_pages) { - dout("dropping cap_snap %p follows %llu\n", - capsnap, capsnap->follows); + doutc(cl, "%p follows %llu\n", capsnap, capsnap->follows); BUG_ON(capsnap->cap_flush.tid > 0); ceph_put_snap_context(capsnap->context); if (!list_is_last(&capsnap->ci_item, &ci->i_cap_snaps)) @@ -3082,6 +3232,7 @@ static void __ceph_put_cap_refs(struct ceph_inode_info *ci, int had, enum put_cap_refs_mode mode) { struct inode *inode = &ci->netfs.inode; + struct ceph_client *cl = ceph_inode_to_client(inode); int last = 0, put = 0, flushsnaps = 0, wake = 0; bool check_flushsnaps = false; @@ -3104,8 +3255,8 @@ static void __ceph_put_cap_refs(struct ceph_inode_info *ci, int had, put++; check_flushsnaps = true; } - dout("put_cap_refs %p wb %d -> %d (?)\n", - inode, ci->i_wb_ref+1, ci->i_wb_ref); + doutc(cl, "%p %llx.%llx wb %d -> %d (?)\n", inode, + ceph_vinop(inode), ci->i_wb_ref+1, ci->i_wb_ref); } if (had & CEPH_CAP_FILE_WR) { if (--ci->i_wr_ref == 0) { @@ -3145,8 +3296,8 @@ static void __ceph_put_cap_refs(struct ceph_inode_info *ci, int had, } spin_unlock(&ci->i_ceph_lock); - dout("put_cap_refs %p had %s%s%s\n", inode, ceph_cap_string(had), - last ? " last" : "", put ? " put" : ""); + doutc(cl, "%p %llx.%llx had %s%s%s\n", inode, ceph_vinop(inode), + ceph_cap_string(had), last ? " last" : "", put ? " put" : ""); switch (mode) { case PUT_CAP_REFS_SYNC: @@ -3196,6 +3347,7 @@ void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr, struct ceph_snap_context *snapc) { struct inode *inode = &ci->netfs.inode; + struct ceph_client *cl = ceph_inode_to_client(inode); struct ceph_cap_snap *capsnap = NULL, *iter; int put = 0; bool last = false; @@ -3219,11 +3371,10 @@ void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr, ceph_put_snap_context(ci->i_head_snapc); ci->i_head_snapc = NULL; } - dout("put_wrbuffer_cap_refs on %p head %d/%d -> %d/%d %s\n", - inode, - ci->i_wrbuffer_ref+nr, ci->i_wrbuffer_ref_head+nr, - ci->i_wrbuffer_ref, ci->i_wrbuffer_ref_head, - last ? " LAST" : ""); + doutc(cl, "on %p %llx.%llx head %d/%d -> %d/%d %s\n", + inode, ceph_vinop(inode), ci->i_wrbuffer_ref+nr, + ci->i_wrbuffer_ref_head+nr, ci->i_wrbuffer_ref, + ci->i_wrbuffer_ref_head, last ? " LAST" : ""); } else { list_for_each_entry(iter, &ci->i_cap_snaps, ci_item) { if (iter->context == snapc) { @@ -3253,13 +3404,12 @@ void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr, } } } - dout("put_wrbuffer_cap_refs on %p cap_snap %p " - " snap %lld %d/%d -> %d/%d %s%s\n", - inode, capsnap, capsnap->context->seq, - ci->i_wrbuffer_ref+nr, capsnap->dirty_pages + nr, - ci->i_wrbuffer_ref, capsnap->dirty_pages, - last ? " (wrbuffer last)" : "", - complete_capsnap ? " (complete capsnap)" : ""); + doutc(cl, "%p %llx.%llx cap_snap %p snap %lld %d/%d -> %d/%d %s%s\n", + inode, ceph_vinop(inode), capsnap, capsnap->context->seq, + ci->i_wrbuffer_ref+nr, capsnap->dirty_pages + nr, + ci->i_wrbuffer_ref, capsnap->dirty_pages, + last ? " (wrbuffer last)" : "", + complete_capsnap ? " (complete capsnap)" : ""); } unlock: @@ -3282,9 +3432,10 @@ unlock: */ static void invalidate_aliases(struct inode *inode) { + struct ceph_client *cl = ceph_inode_to_client(inode); struct dentry *dn, *prev = NULL; - dout("invalidate_aliases inode %p\n", inode); + doutc(cl, "%p %llx.%llx\n", inode, ceph_vinop(inode)); d_prune_aliases(inode); /* * For non-directory inode, d_find_alias() only returns @@ -3323,6 +3474,9 @@ struct cap_extra_info { /* currently issued */ int issued; struct timespec64 btime; + u8 *fscrypt_auth; + u32 fscrypt_auth_len; + u64 fscrypt_file_size; }; /* @@ -3340,6 +3494,7 @@ static void handle_cap_grant(struct inode *inode, __releases(ci->i_ceph_lock) __releases(session->s_mdsc->snap_rwsem) { + struct ceph_client *cl = ceph_inode_to_client(inode); struct ceph_inode_info *ci = ceph_inode(inode); int seq = le32_to_cpu(grant->seq); int newcaps = le32_to_cpu(grant->caps); @@ -3355,10 +3510,19 @@ static void handle_cap_grant(struct inode *inode, bool deleted_inode = false; bool fill_inline = false; - dout("handle_cap_grant inode %p cap %p mds%d seq %d %s\n", - inode, cap, session->s_mds, seq, ceph_cap_string(newcaps)); - dout(" size %llu max_size %llu, i_size %llu\n", size, max_size, - i_size_read(inode)); + /* + * If there is at least one crypto block then we'll trust + * fscrypt_file_size. If the real length of the file is 0, then + * ignore it (it has probably been truncated down to 0 by the MDS). + */ + if (IS_ENCRYPTED(inode) && size) + size = extra_info->fscrypt_file_size; + + doutc(cl, "%p %llx.%llx cap %p mds%d seq %d %s\n", inode, + ceph_vinop(inode), cap, session->s_mds, seq, + ceph_cap_string(newcaps)); + doutc(cl, " size %llu max_size %llu, i_size %llu\n", size, + max_size, i_size_read(inode)); /* @@ -3418,9 +3582,19 @@ static void handle_cap_grant(struct inode *inode, inode->i_uid = make_kuid(&init_user_ns, le32_to_cpu(grant->uid)); inode->i_gid = make_kgid(&init_user_ns, le32_to_cpu(grant->gid)); ci->i_btime = extra_info->btime; - dout("%p mode 0%o uid.gid %d.%d\n", inode, inode->i_mode, - from_kuid(&init_user_ns, inode->i_uid), - from_kgid(&init_user_ns, inode->i_gid)); + doutc(cl, "%p %llx.%llx mode 0%o uid.gid %d.%d\n", inode, + ceph_vinop(inode), inode->i_mode, + from_kuid(&init_user_ns, inode->i_uid), + from_kgid(&init_user_ns, inode->i_gid)); +#if IS_ENABLED(CONFIG_FS_ENCRYPTION) + if (ci->fscrypt_auth_len != extra_info->fscrypt_auth_len || + memcmp(ci->fscrypt_auth, extra_info->fscrypt_auth, + ci->fscrypt_auth_len)) + pr_warn_ratelimited_client(cl, + "cap grant attempt to change fscrypt_auth on non-I_NEW inode (old len %d new len %d)\n", + ci->fscrypt_auth_len, + extra_info->fscrypt_auth_len); +#endif } if ((newcaps & CEPH_CAP_LINK_SHARED) && @@ -3436,8 +3610,8 @@ static void handle_cap_grant(struct inode *inode, u64 version = le64_to_cpu(grant->xattr_version); if (version > ci->i_xattrs.version) { - dout(" got new xattrs v%llu on %p len %d\n", - version, inode, len); + doutc(cl, " got new xattrs v%llu on %p %llx.%llx len %d\n", + version, inode, ceph_vinop(inode), len); if (ci->i_xattrs.blob) ceph_buffer_put(ci->i_xattrs.blob); ci->i_xattrs.blob = ceph_buffer_get(xattr_buf); @@ -3488,8 +3662,8 @@ static void handle_cap_grant(struct inode *inode, if (ci->i_auth_cap == cap && (newcaps & CEPH_CAP_ANY_FILE_WR)) { if (max_size != ci->i_max_size) { - dout("max_size %lld -> %llu\n", - ci->i_max_size, max_size); + doutc(cl, "max_size %lld -> %llu\n", ci->i_max_size, + max_size); ci->i_max_size = max_size; if (max_size >= ci->i_wanted_max_size) { ci->i_wanted_max_size = 0; /* reset */ @@ -3503,10 +3677,9 @@ static void handle_cap_grant(struct inode *inode, wanted = __ceph_caps_wanted(ci); used = __ceph_caps_used(ci); dirty = __ceph_caps_dirty(ci); - dout(" my wanted = %s, used = %s, dirty %s\n", - ceph_cap_string(wanted), - ceph_cap_string(used), - ceph_cap_string(dirty)); + doutc(cl, " my wanted = %s, used = %s, dirty %s\n", + ceph_cap_string(wanted), ceph_cap_string(used), + ceph_cap_string(dirty)); if ((was_stale || le32_to_cpu(grant->op) == CEPH_CAP_OP_IMPORT) && (wanted & ~(cap->mds_wanted | newcaps))) { @@ -3527,10 +3700,9 @@ static void handle_cap_grant(struct inode *inode, if (cap->issued & ~newcaps) { int revoking = cap->issued & ~newcaps; - dout("revocation: %s -> %s (revoking %s)\n", - ceph_cap_string(cap->issued), - ceph_cap_string(newcaps), - ceph_cap_string(revoking)); + doutc(cl, "revocation: %s -> %s (revoking %s)\n", + ceph_cap_string(cap->issued), ceph_cap_string(newcaps), + ceph_cap_string(revoking)); if (S_ISREG(inode->i_mode) && (revoking & used & CEPH_CAP_FILE_BUFFER)) writeback = true; /* initiate writeback; will delay ack */ @@ -3548,11 +3720,12 @@ static void handle_cap_grant(struct inode *inode, cap->issued = newcaps; cap->implemented |= newcaps; } else if (cap->issued == newcaps) { - dout("caps unchanged: %s -> %s\n", - ceph_cap_string(cap->issued), ceph_cap_string(newcaps)); + doutc(cl, "caps unchanged: %s -> %s\n", + ceph_cap_string(cap->issued), + ceph_cap_string(newcaps)); } else { - dout("grant: %s -> %s\n", ceph_cap_string(cap->issued), - ceph_cap_string(newcaps)); + doutc(cl, "grant: %s -> %s\n", ceph_cap_string(cap->issued), + ceph_cap_string(newcaps)); /* non-auth MDS is revoking the newly grant caps ? */ if (cap == ci->i_auth_cap && __ceph_caps_revoking_other(ci, cap, newcaps)) @@ -3640,7 +3813,8 @@ static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid, __releases(ci->i_ceph_lock) { struct ceph_inode_info *ci = ceph_inode(inode); - struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc; + struct ceph_mds_client *mdsc = ceph_sb_to_fs_client(inode->i_sb)->mdsc; + struct ceph_client *cl = mdsc->fsc->client; struct ceph_cap_flush *cf, *tmp_cf; LIST_HEAD(to_remove); unsigned seq = le32_to_cpu(m->seq); @@ -3677,11 +3851,11 @@ static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid, } } - dout("handle_cap_flush_ack inode %p mds%d seq %d on %s cleaned %s," - " flushing %s -> %s\n", - inode, session->s_mds, seq, ceph_cap_string(dirty), - ceph_cap_string(cleaned), ceph_cap_string(ci->i_flushing_caps), - ceph_cap_string(ci->i_flushing_caps & ~cleaned)); + doutc(cl, "%p %llx.%llx mds%d seq %d on %s cleaned %s, flushing %s -> %s\n", + inode, ceph_vinop(inode), session->s_mds, seq, + ceph_cap_string(dirty), ceph_cap_string(cleaned), + ceph_cap_string(ci->i_flushing_caps), + ceph_cap_string(ci->i_flushing_caps & ~cleaned)); if (list_empty(&to_remove) && !cleaned) goto out; @@ -3697,18 +3871,21 @@ static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid, if (list_empty(&ci->i_cap_flush_list)) { list_del_init(&ci->i_flushing_item); if (!list_empty(&session->s_cap_flushing)) { - dout(" mds%d still flushing cap on %p\n", - session->s_mds, - &list_first_entry(&session->s_cap_flushing, - struct ceph_inode_info, - i_flushing_item)->netfs.inode); + struct inode *inode = + &list_first_entry(&session->s_cap_flushing, + struct ceph_inode_info, + i_flushing_item)->netfs.inode; + doutc(cl, " mds%d still flushing cap on %p %llx.%llx\n", + session->s_mds, inode, ceph_vinop(inode)); } } mdsc->num_cap_flushing--; - dout(" inode %p now !flushing\n", inode); + doutc(cl, " %p %llx.%llx now !flushing\n", inode, + ceph_vinop(inode)); if (ci->i_dirty_caps == 0) { - dout(" inode %p now clean\n", inode); + doutc(cl, " %p %llx.%llx now clean\n", inode, + ceph_vinop(inode)); BUG_ON(!list_empty(&ci->i_dirty_item)); drop = true; if (ci->i_wr_ref == 0 && @@ -3746,12 +3923,14 @@ void __ceph_remove_capsnap(struct inode *inode, struct ceph_cap_snap *capsnap, bool *wake_ci, bool *wake_mdsc) { struct ceph_inode_info *ci = ceph_inode(inode); - struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc; + struct ceph_mds_client *mdsc = ceph_sb_to_fs_client(inode->i_sb)->mdsc; + struct ceph_client *cl = mdsc->fsc->client; bool ret; lockdep_assert_held(&ci->i_ceph_lock); - dout("removing capsnap %p, inode %p ci %p\n", capsnap, inode, ci); + doutc(cl, "removing capsnap %p, %p %llx.%llx ci %p\n", capsnap, + inode, ceph_vinop(inode), ci); list_del_init(&capsnap->ci_item); ret = __detach_cap_flush_from_ci(ci, &capsnap->cap_flush); @@ -3790,29 +3969,31 @@ static void handle_cap_flushsnap_ack(struct inode *inode, u64 flush_tid, struct ceph_mds_session *session) { struct ceph_inode_info *ci = ceph_inode(inode); - struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc; + struct ceph_mds_client *mdsc = ceph_sb_to_fs_client(inode->i_sb)->mdsc; + struct ceph_client *cl = mdsc->fsc->client; u64 follows = le64_to_cpu(m->snap_follows); struct ceph_cap_snap *capsnap = NULL, *iter; bool wake_ci = false; bool wake_mdsc = false; - dout("handle_cap_flushsnap_ack inode %p ci %p mds%d follows %lld\n", - inode, ci, session->s_mds, follows); + doutc(cl, "%p %llx.%llx ci %p mds%d follows %lld\n", inode, + ceph_vinop(inode), ci, session->s_mds, follows); spin_lock(&ci->i_ceph_lock); list_for_each_entry(iter, &ci->i_cap_snaps, ci_item) { if (iter->follows == follows) { if (iter->cap_flush.tid != flush_tid) { - dout(" cap_snap %p follows %lld tid %lld !=" - " %lld\n", iter, follows, - flush_tid, iter->cap_flush.tid); + doutc(cl, " cap_snap %p follows %lld " + "tid %lld != %lld\n", iter, + follows, flush_tid, + iter->cap_flush.tid); break; } capsnap = iter; break; } else { - dout(" skipping cap_snap %p follows %lld\n", - iter, iter->follows); + doutc(cl, " skipping cap_snap %p follows %lld\n", + iter, iter->follows); } } if (capsnap) @@ -3837,9 +4018,11 @@ static void handle_cap_flushsnap_ack(struct inode *inode, u64 flush_tid, */ static bool handle_cap_trunc(struct inode *inode, struct ceph_mds_caps *trunc, - struct ceph_mds_session *session) + struct ceph_mds_session *session, + struct cap_extra_info *extra_info) { struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_client *cl = ceph_inode_to_client(inode); int mds = session->s_mds; int seq = le32_to_cpu(trunc->seq); u32 truncate_seq = le32_to_cpu(trunc->truncate_seq); @@ -3854,8 +4037,16 @@ static bool handle_cap_trunc(struct inode *inode, issued |= implemented | dirty; - dout("handle_cap_trunc inode %p mds%d seq %d to %lld seq %d\n", - inode, mds, seq, truncate_size, truncate_seq); + /* + * If there is at least one crypto block then we'll trust + * fscrypt_file_size. If the real length of the file is 0, then + * ignore it (it has probably been truncated down to 0 by the MDS). + */ + if (IS_ENCRYPTED(inode) && size) + size = extra_info->fscrypt_file_size; + + doutc(cl, "%p %llx.%llx mds%d seq %d to %lld truncate seq %d\n", + inode, ceph_vinop(inode), mds, seq, truncate_size, truncate_seq); queue_trunc = ceph_fill_file_size(inode, issued, truncate_seq, truncate_size, size); return queue_trunc; @@ -3873,7 +4064,8 @@ static void handle_cap_export(struct inode *inode, struct ceph_mds_caps *ex, struct ceph_mds_cap_peer *ph, struct ceph_mds_session *session) { - struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc; + struct ceph_mds_client *mdsc = ceph_inode_to_fs_client(inode)->mdsc; + struct ceph_client *cl = mdsc->fsc->client; struct ceph_mds_session *tsession = NULL; struct ceph_cap *cap, *tcap, *new_cap = NULL; struct ceph_inode_info *ci = ceph_inode(inode); @@ -3893,8 +4085,8 @@ static void handle_cap_export(struct inode *inode, struct ceph_mds_caps *ex, target = -1; } - dout("handle_cap_export inode %p ci %p mds%d mseq %d target %d\n", - inode, ci, mds, mseq, target); + doutc(cl, "%p %llx.%llx ci %p mds%d mseq %d target %d\n", + inode, ceph_vinop(inode), ci, mds, mseq, target); retry: down_read(&mdsc->snap_rwsem); spin_lock(&ci->i_ceph_lock); @@ -3903,7 +4095,7 @@ retry: goto out_unlock; if (target < 0) { - ceph_remove_cap(cap, false); + ceph_remove_cap(mdsc, cap, false); goto out_unlock; } @@ -3914,12 +4106,13 @@ retry: issued = cap->issued; if (issued != cap->implemented) - pr_err_ratelimited("handle_cap_export: issued != implemented: " - "ino (%llx.%llx) mds%d seq %d mseq %d " - "issued %s implemented %s\n", - ceph_vinop(inode), mds, cap->seq, cap->mseq, - ceph_cap_string(issued), - ceph_cap_string(cap->implemented)); + pr_err_ratelimited_client(cl, "issued != implemented: " + "%p %llx.%llx mds%d seq %d mseq %d" + " issued %s implemented %s\n", + inode, ceph_vinop(inode), mds, + cap->seq, cap->mseq, + ceph_cap_string(issued), + ceph_cap_string(cap->implemented)); tcap = __get_cap_for_mds(ci, target); @@ -3927,7 +4120,8 @@ retry: /* already have caps from the target */ if (tcap->cap_id == t_cap_id && ceph_seq_cmp(tcap->seq, t_seq) < 0) { - dout(" updating import cap %p mds%d\n", tcap, target); + doutc(cl, " updating import cap %p mds%d\n", tcap, + target); tcap->cap_id = t_cap_id; tcap->seq = t_seq - 1; tcap->issue_seq = t_seq - 1; @@ -3938,7 +4132,7 @@ retry: change_auth_cap_ses(ci, tcap->session); } } - ceph_remove_cap(cap, false); + ceph_remove_cap(mdsc, cap, false); goto out_unlock; } else if (tsession) { /* add placeholder for the export tagert */ @@ -3955,7 +4149,7 @@ retry: spin_unlock(&mdsc->cap_dirty_lock); } - ceph_remove_cap(cap, false); + ceph_remove_cap(mdsc, cap, false); goto out_unlock; } @@ -4008,6 +4202,7 @@ static void handle_cap_import(struct ceph_mds_client *mdsc, struct ceph_cap **target_cap, int *old_issued) { struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_client *cl = mdsc->fsc->client; struct ceph_cap *cap, *ocap, *new_cap = NULL; int mds = session->s_mds; int issued; @@ -4028,8 +4223,8 @@ static void handle_cap_import(struct ceph_mds_client *mdsc, peer = -1; } - dout("handle_cap_import inode %p ci %p mds%d mseq %d peer %d\n", - inode, ci, mds, mseq, peer); + doutc(cl, "%p %llx.%llx ci %p mds%d mseq %d peer %d\n", + inode, ceph_vinop(inode), ci, mds, mseq, peer); retry: cap = __get_cap_for_mds(ci, mds); if (!cap) { @@ -4055,26 +4250,72 @@ retry: ocap = peer >= 0 ? __get_cap_for_mds(ci, peer) : NULL; if (ocap && ocap->cap_id == p_cap_id) { - dout(" remove export cap %p mds%d flags %d\n", - ocap, peer, ph->flags); + doutc(cl, " remove export cap %p mds%d flags %d\n", + ocap, peer, ph->flags); if ((ph->flags & CEPH_CAP_FLAG_AUTH) && (ocap->seq != le32_to_cpu(ph->seq) || ocap->mseq != le32_to_cpu(ph->mseq))) { - pr_err_ratelimited("handle_cap_import: " - "mismatched seq/mseq: ino (%llx.%llx) " - "mds%d seq %d mseq %d importer mds%d " - "has peer seq %d mseq %d\n", - ceph_vinop(inode), peer, ocap->seq, - ocap->mseq, mds, le32_to_cpu(ph->seq), + pr_err_ratelimited_client(cl, "mismatched seq/mseq: " + "%p %llx.%llx mds%d seq %d mseq %d" + " importer mds%d has peer seq %d mseq %d\n", + inode, ceph_vinop(inode), peer, + ocap->seq, ocap->mseq, mds, + le32_to_cpu(ph->seq), le32_to_cpu(ph->mseq)); } - ceph_remove_cap(ocap, (ph->flags & CEPH_CAP_FLAG_RELEASE)); + ceph_remove_cap(mdsc, ocap, (ph->flags & CEPH_CAP_FLAG_RELEASE)); } *old_issued = issued; *target_cap = cap; } +#ifdef CONFIG_FS_ENCRYPTION +static int parse_fscrypt_fields(void **p, void *end, + struct cap_extra_info *extra) +{ + u32 len; + + ceph_decode_32_safe(p, end, extra->fscrypt_auth_len, bad); + if (extra->fscrypt_auth_len) { + ceph_decode_need(p, end, extra->fscrypt_auth_len, bad); + extra->fscrypt_auth = kmalloc(extra->fscrypt_auth_len, + GFP_KERNEL); + if (!extra->fscrypt_auth) + return -ENOMEM; + ceph_decode_copy_safe(p, end, extra->fscrypt_auth, + extra->fscrypt_auth_len, bad); + } + + ceph_decode_32_safe(p, end, len, bad); + if (len >= sizeof(u64)) { + ceph_decode_64_safe(p, end, extra->fscrypt_file_size, bad); + len -= sizeof(u64); + } + ceph_decode_skip_n(p, end, len, bad); + return 0; +bad: + return -EIO; +} +#else +static int parse_fscrypt_fields(void **p, void *end, + struct cap_extra_info *extra) +{ + u32 len; + + /* Don't care about these fields unless we're encryption-capable */ + ceph_decode_32_safe(p, end, len, bad); + if (len) + ceph_decode_skip_n(p, end, len, bad); + ceph_decode_32_safe(p, end, len, bad); + if (len) + ceph_decode_skip_n(p, end, len, bad); + return 0; +bad: + return -EIO; +} +#endif + /* * Handle a caps message from the MDS. * @@ -4085,6 +4326,7 @@ void ceph_handle_caps(struct ceph_mds_session *session, struct ceph_msg *msg) { struct ceph_mds_client *mdsc = session->s_mdsc; + struct ceph_client *cl = mdsc->fsc->client; struct inode *inode; struct ceph_inode_info *ci; struct ceph_cap *cap; @@ -4103,7 +4345,10 @@ void ceph_handle_caps(struct ceph_mds_session *session, bool close_sessions = false; bool do_cap_release = false; - dout("handle_caps from mds%d\n", session->s_mds); + doutc(cl, "from mds%d\n", session->s_mds); + + if (!ceph_inc_mds_stopping_blocker(mdsc, session)) + return; /* decode */ end = msg->front.iov_base + msg->front.iov_len; @@ -4195,18 +4440,22 @@ void ceph_handle_caps(struct ceph_mds_session *session, ceph_decode_64_safe(&p, end, extra_info.nsubdirs, bad); } + if (msg_version >= 12) { + if (parse_fscrypt_fields(&p, end, &extra_info)) + goto bad; + } + /* lookup ino */ inode = ceph_find_inode(mdsc->fsc->sb, vino); - dout(" op %s ino %llx.%llx inode %p\n", ceph_cap_op_name(op), vino.ino, - vino.snap, inode); + doutc(cl, " op %s ino %llx.%llx inode %p\n", ceph_cap_op_name(op), + vino.ino, vino.snap, inode); mutex_lock(&session->s_mutex); - inc_session_sequence(session); - dout(" mds%d seq %lld cap seq %u\n", session->s_mds, session->s_seq, - (unsigned)seq); + doutc(cl, " mds%d seq %lld cap seq %u\n", session->s_mds, + session->s_seq, (unsigned)seq); if (!inode) { - dout(" i don't have ino %llx\n", vino.ino); + doutc(cl, " i don't have ino %llx\n", vino.ino); switch (op) { case CEPH_CAP_OP_IMPORT: @@ -4261,9 +4510,9 @@ void ceph_handle_caps(struct ceph_mds_session *session, spin_lock(&ci->i_ceph_lock); cap = __get_cap_for_mds(ceph_inode(inode), session->s_mds); if (!cap) { - dout(" no cap on %p ino %llx.%llx from mds%d\n", - inode, ceph_ino(inode), ceph_snap(inode), - session->s_mds); + doutc(cl, " no cap on %p ino %llx.%llx from mds%d\n", + inode, ceph_ino(inode), ceph_snap(inode), + session->s_mds); spin_unlock(&ci->i_ceph_lock); switch (op) { case CEPH_CAP_OP_REVOKE: @@ -4292,7 +4541,8 @@ void ceph_handle_caps(struct ceph_mds_session *session, break; case CEPH_CAP_OP_TRUNC: - queue_trunc = handle_cap_trunc(inode, h, session); + queue_trunc = handle_cap_trunc(inode, h, session, + &extra_info); spin_unlock(&ci->i_ceph_lock); if (queue_trunc) ceph_queue_vmtruncate(inode); @@ -4300,8 +4550,8 @@ void ceph_handle_caps(struct ceph_mds_session *session, default: spin_unlock(&ci->i_ceph_lock); - pr_err("ceph_handle_caps: unknown cap op %d %s\n", op, - ceph_cap_op_name(op)); + pr_err_client(cl, "unknown cap op %d %s\n", op, + ceph_cap_op_name(op)); } done: @@ -4309,12 +4559,15 @@ done: done_unlocked: iput(inode); out: + ceph_dec_mds_stopping_blocker(mdsc); + ceph_put_string(extra_info.pool_ns); /* Defer closing the sessions after s_mutex lock being released */ if (close_sessions) ceph_mdsc_close_sessions(mdsc); + kfree(extra_info.fscrypt_auth); return; flush_cap_releases: @@ -4339,7 +4592,7 @@ flush_cap_releases: goto done; bad: - pr_err("ceph_handle_caps: corrupt message\n"); + pr_err_client(cl, "corrupt message\n"); ceph_msg_dump(msg); goto out; } @@ -4353,6 +4606,7 @@ bad: */ unsigned long ceph_check_delayed_caps(struct ceph_mds_client *mdsc) { + struct ceph_client *cl = mdsc->fsc->client; struct inode *inode; struct ceph_inode_info *ci; struct ceph_mount_options *opt = mdsc->fsc->mount_options; @@ -4360,14 +4614,14 @@ unsigned long ceph_check_delayed_caps(struct ceph_mds_client *mdsc) unsigned long loop_start = jiffies; unsigned long delay = 0; - dout("check_delayed_caps\n"); + doutc(cl, "begin\n"); spin_lock(&mdsc->cap_delay_lock); while (!list_empty(&mdsc->cap_delay_list)) { ci = list_first_entry(&mdsc->cap_delay_list, struct ceph_inode_info, i_cap_delay_list); if (time_before(loop_start, ci->i_hold_caps_max - delay_max)) { - dout("%s caps added recently. Exiting loop", __func__); + doutc(cl, "caps added recently. Exiting loop"); delay = ci->i_hold_caps_max; break; } @@ -4379,13 +4633,15 @@ unsigned long ceph_check_delayed_caps(struct ceph_mds_client *mdsc) inode = igrab(&ci->netfs.inode); if (inode) { spin_unlock(&mdsc->cap_delay_lock); - dout("check_delayed_caps on %p\n", inode); + doutc(cl, "on %p %llx.%llx\n", inode, + ceph_vinop(inode)); ceph_check_caps(ci, 0); iput(inode); spin_lock(&mdsc->cap_delay_lock); } } spin_unlock(&mdsc->cap_delay_lock); + doutc(cl, "done\n"); return delay; } @@ -4396,17 +4652,18 @@ unsigned long ceph_check_delayed_caps(struct ceph_mds_client *mdsc) static void flush_dirty_session_caps(struct ceph_mds_session *s) { struct ceph_mds_client *mdsc = s->s_mdsc; + struct ceph_client *cl = mdsc->fsc->client; struct ceph_inode_info *ci; struct inode *inode; - dout("flush_dirty_caps\n"); + doutc(cl, "begin\n"); spin_lock(&mdsc->cap_dirty_lock); while (!list_empty(&s->s_cap_dirty)) { ci = list_first_entry(&s->s_cap_dirty, struct ceph_inode_info, i_dirty_item); inode = &ci->netfs.inode; ihold(inode); - dout("flush_dirty_caps %llx.%llx\n", ceph_vinop(inode)); + doutc(cl, "%p %llx.%llx\n", inode, ceph_vinop(inode)); spin_unlock(&mdsc->cap_dirty_lock); ceph_wait_on_async_create(inode); ceph_check_caps(ci, CHECK_CAPS_FLUSH); @@ -4414,7 +4671,7 @@ static void flush_dirty_session_caps(struct ceph_mds_session *s) spin_lock(&mdsc->cap_dirty_lock); } spin_unlock(&mdsc->cap_dirty_lock); - dout("flush_dirty_caps done\n"); + doutc(cl, "done\n"); } void ceph_flush_dirty_caps(struct ceph_mds_client *mdsc) @@ -4519,7 +4776,7 @@ int ceph_drop_caps_for_unlink(struct inode *inode) if (__ceph_caps_dirty(ci)) { struct ceph_mds_client *mdsc = - ceph_inode_to_client(inode)->mdsc; + ceph_inode_to_fs_client(inode)->mdsc; __cap_delay_requeue_front(mdsc, ci); } } @@ -4539,6 +4796,7 @@ int ceph_encode_inode_release(void **p, struct inode *inode, int mds, int drop, int unless, int force) { struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_client *cl = ceph_inode_to_client(inode); struct ceph_cap *cap; struct ceph_mds_request_release *rel = *p; int used, dirty; @@ -4548,9 +4806,9 @@ int ceph_encode_inode_release(void **p, struct inode *inode, used = __ceph_caps_used(ci); dirty = __ceph_caps_dirty(ci); - dout("encode_inode_release %p mds%d used|dirty %s drop %s unless %s\n", - inode, mds, ceph_cap_string(used|dirty), ceph_cap_string(drop), - ceph_cap_string(unless)); + doutc(cl, "%p %llx.%llx mds%d used|dirty %s drop %s unless %s\n", + inode, ceph_vinop(inode), mds, ceph_cap_string(used|dirty), + ceph_cap_string(drop), ceph_cap_string(unless)); /* only drop unused, clean caps */ drop &= ~(used | dirty); @@ -4572,12 +4830,13 @@ int ceph_encode_inode_release(void **p, struct inode *inode, if (force || (cap->issued & drop)) { if (cap->issued & drop) { int wanted = __ceph_caps_wanted(ci); - dout("encode_inode_release %p cap %p " - "%s -> %s, wanted %s -> %s\n", inode, cap, - ceph_cap_string(cap->issued), - ceph_cap_string(cap->issued & ~drop), - ceph_cap_string(cap->mds_wanted), - ceph_cap_string(wanted)); + doutc(cl, "%p %llx.%llx cap %p %s -> %s, " + "wanted %s -> %s\n", inode, + ceph_vinop(inode), cap, + ceph_cap_string(cap->issued), + ceph_cap_string(cap->issued & ~drop), + ceph_cap_string(cap->mds_wanted), + ceph_cap_string(wanted)); cap->issued &= ~drop; cap->implemented &= ~drop; @@ -4586,9 +4845,9 @@ int ceph_encode_inode_release(void **p, struct inode *inode, !(wanted & CEPH_CAP_ANY_FILE_WR)) ci->i_requested_max_size = 0; } else { - dout("encode_inode_release %p cap %p %s" - " (force)\n", inode, cap, - ceph_cap_string(cap->issued)); + doutc(cl, "%p %llx.%llx cap %p %s (force)\n", + inode, ceph_vinop(inode), cap, + ceph_cap_string(cap->issued)); } rel->ino = cpu_to_le64(ceph_ino(inode)); @@ -4603,14 +4862,27 @@ int ceph_encode_inode_release(void **p, struct inode *inode, *p += sizeof(*rel); ret = 1; } else { - dout("encode_inode_release %p cap %p %s (noop)\n", - inode, cap, ceph_cap_string(cap->issued)); + doutc(cl, "%p %llx.%llx cap %p %s (noop)\n", + inode, ceph_vinop(inode), cap, + ceph_cap_string(cap->issued)); } } spin_unlock(&ci->i_ceph_lock); return ret; } +/** + * ceph_encode_dentry_release - encode a dentry release into an outgoing request + * @p: outgoing request buffer + * @dentry: dentry to release + * @dir: dir to release it from + * @mds: mds that we're speaking to + * @drop: caps being dropped + * @unless: unless we have these caps + * + * Encode a dentry release into an outgoing request buffer. Returns 1 if the + * thing was released, or a negative error code otherwise. + */ int ceph_encode_dentry_release(void **p, struct dentry *dentry, struct inode *dir, int mds, int drop, int unless) @@ -4618,6 +4890,7 @@ int ceph_encode_dentry_release(void **p, struct dentry *dentry, struct dentry *parent = NULL; struct ceph_mds_request_release *rel = *p; struct ceph_dentry_info *di = ceph_dentry(dentry); + struct ceph_client *cl; int force = 0; int ret; @@ -4639,29 +4912,44 @@ int ceph_encode_dentry_release(void **p, struct dentry *dentry, ret = ceph_encode_inode_release(p, dir, mds, drop, unless, force); dput(parent); + cl = ceph_inode_to_client(dir); spin_lock(&dentry->d_lock); if (ret && di->lease_session && di->lease_session->s_mds == mds) { - dout("encode_dentry_release %p mds%d seq %d\n", - dentry, mds, (int)di->lease_seq); - rel->dname_len = cpu_to_le32(dentry->d_name.len); - memcpy(*p, dentry->d_name.name, dentry->d_name.len); - *p += dentry->d_name.len; + doutc(cl, "%p mds%d seq %d\n", dentry, mds, + (int)di->lease_seq); rel->dname_seq = cpu_to_le32(di->lease_seq); __ceph_mdsc_drop_dentry_lease(dentry); + spin_unlock(&dentry->d_lock); + if (IS_ENCRYPTED(dir) && fscrypt_has_encryption_key(dir)) { + int ret2 = ceph_encode_encrypted_fname(dir, dentry, *p); + + if (ret2 < 0) + return ret2; + + rel->dname_len = cpu_to_le32(ret2); + *p += ret2; + } else { + rel->dname_len = cpu_to_le32(dentry->d_name.len); + memcpy(*p, dentry->d_name.name, dentry->d_name.len); + *p += dentry->d_name.len; + } + } else { + spin_unlock(&dentry->d_lock); } - spin_unlock(&dentry->d_lock); return ret; } static int remove_capsnaps(struct ceph_mds_client *mdsc, struct inode *inode) { struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_client *cl = mdsc->fsc->client; struct ceph_cap_snap *capsnap; int capsnap_release = 0; lockdep_assert_held(&ci->i_ceph_lock); - dout("removing capsnaps, ci is %p, inode is %p\n", ci, inode); + doutc(cl, "removing capsnaps, ci is %p, %p %llx.%llx\n", + ci, inode, ceph_vinop(inode)); while (!list_empty(&ci->i_cap_snaps)) { capsnap = list_first_entry(&ci->i_cap_snaps, @@ -4678,8 +4966,9 @@ static int remove_capsnaps(struct ceph_mds_client *mdsc, struct inode *inode) int ceph_purge_inode_cap(struct inode *inode, struct ceph_cap *cap, bool *invalidate) { - struct ceph_fs_client *fsc = ceph_inode_to_client(inode); + struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode); struct ceph_mds_client *mdsc = fsc->mdsc; + struct ceph_client *cl = fsc->client; struct ceph_inode_info *ci = ceph_inode(inode); bool is_auth; bool dirty_dropped = false; @@ -4687,8 +4976,8 @@ int ceph_purge_inode_cap(struct inode *inode, struct ceph_cap *cap, bool *invali lockdep_assert_held(&ci->i_ceph_lock); - dout("removing cap %p, ci is %p, inode is %p\n", - cap, ci, &ci->netfs.inode); + doutc(cl, "removing cap %p, ci is %p, %p %llx.%llx\n", + cap, ci, inode, ceph_vinop(inode)); is_auth = (cap == ci->i_auth_cap); __ceph_remove_cap(cap, false); @@ -4715,19 +5004,19 @@ int ceph_purge_inode_cap(struct inode *inode, struct ceph_cap *cap, bool *invali } if (!list_empty(&ci->i_dirty_item)) { - pr_warn_ratelimited( - " dropping dirty %s state for %p %lld\n", + pr_warn_ratelimited_client(cl, + " dropping dirty %s state for %p %llx.%llx\n", ceph_cap_string(ci->i_dirty_caps), - inode, ceph_ino(inode)); + inode, ceph_vinop(inode)); ci->i_dirty_caps = 0; list_del_init(&ci->i_dirty_item); dirty_dropped = true; } if (!list_empty(&ci->i_flushing_item)) { - pr_warn_ratelimited( - " dropping dirty+flushing %s state for %p %lld\n", + pr_warn_ratelimited_client(cl, + " dropping dirty+flushing %s state for %p %llx.%llx\n", ceph_cap_string(ci->i_flushing_caps), - inode, ceph_ino(inode)); + inode, ceph_vinop(inode)); ci->i_flushing_caps = 0; list_del_init(&ci->i_flushing_item); mdsc->num_cap_flushing--; @@ -4750,8 +5039,9 @@ int ceph_purge_inode_cap(struct inode *inode, struct ceph_cap *cap, bool *invali if (atomic_read(&ci->i_filelock_ref) > 0) { /* make further file lock syscall return -EIO */ ci->i_ceph_flags |= CEPH_I_ERROR_FILELOCK; - pr_warn_ratelimited(" dropping file locks for %p %lld\n", - inode, ceph_ino(inode)); + pr_warn_ratelimited_client(cl, + " dropping file locks for %p %llx.%llx\n", + inode, ceph_vinop(inode)); } if (!ci->i_dirty_caps && ci->i_prealloc_cap_flush) { diff --git a/fs/ceph/crypto.c b/fs/ceph/crypto.c new file mode 100644 index 000000000000..3b3c4d8d401e --- /dev/null +++ b/fs/ceph/crypto.c @@ -0,0 +1,685 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * The base64 encode/decode code was copied from fscrypt: + * Copyright (C) 2015, Google, Inc. + * Copyright (C) 2015, Motorola Mobility + * Written by Uday Savagaonkar, 2014. + * Modified by Jaegeuk Kim, 2015. + */ +#include <linux/ceph/ceph_debug.h> +#include <linux/xattr.h> +#include <linux/fscrypt.h> +#include <linux/ceph/striper.h> + +#include "super.h" +#include "mds_client.h" +#include "crypto.h" + +/* + * The base64url encoding used by fscrypt includes the '_' character, which may + * cause problems in snapshot names (which can not start with '_'). Thus, we + * used the base64 encoding defined for IMAP mailbox names (RFC 3501) instead, + * which replaces '-' and '_' by '+' and ','. + */ +static const char base64_table[65] = + "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+,"; + +int ceph_base64_encode(const u8 *src, int srclen, char *dst) +{ + u32 ac = 0; + int bits = 0; + int i; + char *cp = dst; + + for (i = 0; i < srclen; i++) { + ac = (ac << 8) | src[i]; + bits += 8; + do { + bits -= 6; + *cp++ = base64_table[(ac >> bits) & 0x3f]; + } while (bits >= 6); + } + if (bits) + *cp++ = base64_table[(ac << (6 - bits)) & 0x3f]; + return cp - dst; +} + +int ceph_base64_decode(const char *src, int srclen, u8 *dst) +{ + u32 ac = 0; + int bits = 0; + int i; + u8 *bp = dst; + + for (i = 0; i < srclen; i++) { + const char *p = strchr(base64_table, src[i]); + + if (p == NULL || src[i] == 0) + return -1; + ac = (ac << 6) | (p - base64_table); + bits += 6; + if (bits >= 8) { + bits -= 8; + *bp++ = (u8)(ac >> bits); + } + } + if (ac & ((1 << bits) - 1)) + return -1; + return bp - dst; +} + +static int ceph_crypt_get_context(struct inode *inode, void *ctx, size_t len) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_fscrypt_auth *cfa = (struct ceph_fscrypt_auth *)ci->fscrypt_auth; + u32 ctxlen; + + /* Non existent or too short? */ + if (!cfa || (ci->fscrypt_auth_len < (offsetof(struct ceph_fscrypt_auth, cfa_blob) + 1))) + return -ENOBUFS; + + /* Some format we don't recognize? */ + if (le32_to_cpu(cfa->cfa_version) != CEPH_FSCRYPT_AUTH_VERSION) + return -ENOBUFS; + + ctxlen = le32_to_cpu(cfa->cfa_blob_len); + if (len < ctxlen) + return -ERANGE; + + memcpy(ctx, cfa->cfa_blob, ctxlen); + return ctxlen; +} + +static int ceph_crypt_set_context(struct inode *inode, const void *ctx, + size_t len, void *fs_data) +{ + int ret; + struct iattr attr = { }; + struct ceph_iattr cia = { }; + struct ceph_fscrypt_auth *cfa; + + WARN_ON_ONCE(fs_data); + + if (len > FSCRYPT_SET_CONTEXT_MAX_SIZE) + return -EINVAL; + + cfa = kzalloc(sizeof(*cfa), GFP_KERNEL); + if (!cfa) + return -ENOMEM; + + cfa->cfa_version = cpu_to_le32(CEPH_FSCRYPT_AUTH_VERSION); + cfa->cfa_blob_len = cpu_to_le32(len); + memcpy(cfa->cfa_blob, ctx, len); + + cia.fscrypt_auth = cfa; + + ret = __ceph_setattr(&nop_mnt_idmap, inode, &attr, &cia); + if (ret == 0) + inode_set_flags(inode, S_ENCRYPTED, S_ENCRYPTED); + kfree(cia.fscrypt_auth); + return ret; +} + +static bool ceph_crypt_empty_dir(struct inode *inode) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + + return ci->i_rsubdirs + ci->i_rfiles == 1; +} + +static const union fscrypt_policy *ceph_get_dummy_policy(struct super_block *sb) +{ + return ceph_sb_to_fs_client(sb)->fsc_dummy_enc_policy.policy; +} + +static struct fscrypt_operations ceph_fscrypt_ops = { + .needs_bounce_pages = 1, + .get_context = ceph_crypt_get_context, + .set_context = ceph_crypt_set_context, + .get_dummy_policy = ceph_get_dummy_policy, + .empty_dir = ceph_crypt_empty_dir, +}; + +void ceph_fscrypt_set_ops(struct super_block *sb) +{ + fscrypt_set_ops(sb, &ceph_fscrypt_ops); +} + +void ceph_fscrypt_free_dummy_policy(struct ceph_fs_client *fsc) +{ + fscrypt_free_dummy_policy(&fsc->fsc_dummy_enc_policy); +} + +int ceph_fscrypt_prepare_context(struct inode *dir, struct inode *inode, + struct ceph_acl_sec_ctx *as) +{ + int ret, ctxsize; + bool encrypted = false; + struct ceph_inode_info *ci = ceph_inode(inode); + + ret = fscrypt_prepare_new_inode(dir, inode, &encrypted); + if (ret) + return ret; + if (!encrypted) + return 0; + + as->fscrypt_auth = kzalloc(sizeof(*as->fscrypt_auth), GFP_KERNEL); + if (!as->fscrypt_auth) + return -ENOMEM; + + ctxsize = fscrypt_context_for_new_inode(as->fscrypt_auth->cfa_blob, + inode); + if (ctxsize < 0) + return ctxsize; + + as->fscrypt_auth->cfa_version = cpu_to_le32(CEPH_FSCRYPT_AUTH_VERSION); + as->fscrypt_auth->cfa_blob_len = cpu_to_le32(ctxsize); + + WARN_ON_ONCE(ci->fscrypt_auth); + kfree(ci->fscrypt_auth); + ci->fscrypt_auth_len = ceph_fscrypt_auth_len(as->fscrypt_auth); + ci->fscrypt_auth = kmemdup(as->fscrypt_auth, ci->fscrypt_auth_len, + GFP_KERNEL); + if (!ci->fscrypt_auth) + return -ENOMEM; + + inode->i_flags |= S_ENCRYPTED; + + return 0; +} + +void ceph_fscrypt_as_ctx_to_req(struct ceph_mds_request *req, + struct ceph_acl_sec_ctx *as) +{ + swap(req->r_fscrypt_auth, as->fscrypt_auth); +} + +/* + * User-created snapshots can't start with '_'. Snapshots that start with this + * character are special (hint: there aren't real snapshots) and use the + * following format: + * + * _<SNAPSHOT-NAME>_<INODE-NUMBER> + * + * where: + * - <SNAPSHOT-NAME> - the real snapshot name that may need to be decrypted, + * - <INODE-NUMBER> - the inode number (in decimal) for the actual snapshot + * + * This function parses these snapshot names and returns the inode + * <INODE-NUMBER>. 'name_len' will also bet set with the <SNAPSHOT-NAME> + * length. + */ +static struct inode *parse_longname(const struct inode *parent, + const char *name, int *name_len) +{ + struct ceph_client *cl = ceph_inode_to_client(parent); + struct inode *dir = NULL; + struct ceph_vino vino = { .snap = CEPH_NOSNAP }; + char *inode_number; + char *name_end; + int orig_len = *name_len; + int ret = -EIO; + + /* Skip initial '_' */ + name++; + name_end = strrchr(name, '_'); + if (!name_end) { + doutc(cl, "failed to parse long snapshot name: %s\n", name); + return ERR_PTR(-EIO); + } + *name_len = (name_end - name); + if (*name_len <= 0) { + pr_err_client(cl, "failed to parse long snapshot name\n"); + return ERR_PTR(-EIO); + } + + /* Get the inode number */ + inode_number = kmemdup_nul(name_end + 1, + orig_len - *name_len - 2, + GFP_KERNEL); + if (!inode_number) + return ERR_PTR(-ENOMEM); + ret = kstrtou64(inode_number, 10, &vino.ino); + if (ret) { + doutc(cl, "failed to parse inode number: %s\n", name); + dir = ERR_PTR(ret); + goto out; + } + + /* And finally the inode */ + dir = ceph_find_inode(parent->i_sb, vino); + if (!dir) { + /* This can happen if we're not mounting cephfs on the root */ + dir = ceph_get_inode(parent->i_sb, vino, NULL); + if (IS_ERR(dir)) + doutc(cl, "can't find inode %s (%s)\n", inode_number, name); + } + +out: + kfree(inode_number); + return dir; +} + +int ceph_encode_encrypted_dname(struct inode *parent, struct qstr *d_name, + char *buf) +{ + struct ceph_client *cl = ceph_inode_to_client(parent); + struct inode *dir = parent; + struct qstr iname; + u32 len; + int name_len; + int elen; + int ret; + u8 *cryptbuf = NULL; + + iname.name = d_name->name; + name_len = d_name->len; + + /* Handle the special case of snapshot names that start with '_' */ + if ((ceph_snap(dir) == CEPH_SNAPDIR) && (name_len > 0) && + (iname.name[0] == '_')) { + dir = parse_longname(parent, iname.name, &name_len); + if (IS_ERR(dir)) + return PTR_ERR(dir); + iname.name++; /* skip initial '_' */ + } + iname.len = name_len; + + if (!fscrypt_has_encryption_key(dir)) { + memcpy(buf, d_name->name, d_name->len); + elen = d_name->len; + goto out; + } + + /* + * Convert cleartext d_name to ciphertext. If result is longer than + * CEPH_NOHASH_NAME_MAX, sha256 the remaining bytes + * + * See: fscrypt_setup_filename + */ + if (!fscrypt_fname_encrypted_size(dir, iname.len, NAME_MAX, &len)) { + elen = -ENAMETOOLONG; + goto out; + } + + /* Allocate a buffer appropriate to hold the result */ + cryptbuf = kmalloc(len > CEPH_NOHASH_NAME_MAX ? NAME_MAX : len, + GFP_KERNEL); + if (!cryptbuf) { + elen = -ENOMEM; + goto out; + } + + ret = fscrypt_fname_encrypt(dir, &iname, cryptbuf, len); + if (ret) { + elen = ret; + goto out; + } + + /* hash the end if the name is long enough */ + if (len > CEPH_NOHASH_NAME_MAX) { + u8 hash[SHA256_DIGEST_SIZE]; + u8 *extra = cryptbuf + CEPH_NOHASH_NAME_MAX; + + /* + * hash the extra bytes and overwrite crypttext beyond that + * point with it + */ + sha256(extra, len - CEPH_NOHASH_NAME_MAX, hash); + memcpy(extra, hash, SHA256_DIGEST_SIZE); + len = CEPH_NOHASH_NAME_MAX + SHA256_DIGEST_SIZE; + } + + /* base64 encode the encrypted name */ + elen = ceph_base64_encode(cryptbuf, len, buf); + doutc(cl, "base64-encoded ciphertext name = %.*s\n", elen, buf); + + /* To understand the 240 limit, see CEPH_NOHASH_NAME_MAX comments */ + WARN_ON(elen > 240); + if ((elen > 0) && (dir != parent)) { + char tmp_buf[NAME_MAX]; + + elen = snprintf(tmp_buf, sizeof(tmp_buf), "_%.*s_%ld", + elen, buf, dir->i_ino); + memcpy(buf, tmp_buf, elen); + } + +out: + kfree(cryptbuf); + if (dir != parent) { + if ((dir->i_state & I_NEW)) + discard_new_inode(dir); + else + iput(dir); + } + return elen; +} + +int ceph_encode_encrypted_fname(struct inode *parent, struct dentry *dentry, + char *buf) +{ + WARN_ON_ONCE(!fscrypt_has_encryption_key(parent)); + + return ceph_encode_encrypted_dname(parent, &dentry->d_name, buf); +} + +/** + * ceph_fname_to_usr - convert a filename for userland presentation + * @fname: ceph_fname to be converted + * @tname: temporary name buffer to use for conversion (may be NULL) + * @oname: where converted name should be placed + * @is_nokey: set to true if key wasn't available during conversion (may be NULL) + * + * Given a filename (usually from the MDS), format it for presentation to + * userland. If @parent is not encrypted, just pass it back as-is. + * + * Otherwise, base64 decode the string, and then ask fscrypt to format it + * for userland presentation. + * + * Returns 0 on success or negative error code on error. + */ +int ceph_fname_to_usr(const struct ceph_fname *fname, struct fscrypt_str *tname, + struct fscrypt_str *oname, bool *is_nokey) +{ + struct inode *dir = fname->dir; + struct fscrypt_str _tname = FSTR_INIT(NULL, 0); + struct fscrypt_str iname; + char *name = fname->name; + int name_len = fname->name_len; + int ret; + + /* Sanity check that the resulting name will fit in the buffer */ + if (fname->name_len > NAME_MAX || fname->ctext_len > NAME_MAX) + return -EIO; + + /* Handle the special case of snapshot names that start with '_' */ + if ((ceph_snap(dir) == CEPH_SNAPDIR) && (name_len > 0) && + (name[0] == '_')) { + dir = parse_longname(dir, name, &name_len); + if (IS_ERR(dir)) + return PTR_ERR(dir); + name++; /* skip initial '_' */ + } + + if (!IS_ENCRYPTED(dir)) { + oname->name = fname->name; + oname->len = fname->name_len; + ret = 0; + goto out_inode; + } + + ret = ceph_fscrypt_prepare_readdir(dir); + if (ret) + goto out_inode; + + /* + * Use the raw dentry name as sent by the MDS instead of + * generating a nokey name via fscrypt. + */ + if (!fscrypt_has_encryption_key(dir)) { + if (fname->no_copy) + oname->name = fname->name; + else + memcpy(oname->name, fname->name, fname->name_len); + oname->len = fname->name_len; + if (is_nokey) + *is_nokey = true; + ret = 0; + goto out_inode; + } + + if (fname->ctext_len == 0) { + int declen; + + if (!tname) { + ret = fscrypt_fname_alloc_buffer(NAME_MAX, &_tname); + if (ret) + goto out_inode; + tname = &_tname; + } + + declen = ceph_base64_decode(name, name_len, tname->name); + if (declen <= 0) { + ret = -EIO; + goto out; + } + iname.name = tname->name; + iname.len = declen; + } else { + iname.name = fname->ctext; + iname.len = fname->ctext_len; + } + + ret = fscrypt_fname_disk_to_usr(dir, 0, 0, &iname, oname); + if (!ret && (dir != fname->dir)) { + char tmp_buf[CEPH_BASE64_CHARS(NAME_MAX)]; + + name_len = snprintf(tmp_buf, sizeof(tmp_buf), "_%.*s_%ld", + oname->len, oname->name, dir->i_ino); + memcpy(oname->name, tmp_buf, name_len); + oname->len = name_len; + } + +out: + fscrypt_fname_free_buffer(&_tname); +out_inode: + if (dir != fname->dir) { + if ((dir->i_state & I_NEW)) + discard_new_inode(dir); + else + iput(dir); + } + return ret; +} + +/** + * ceph_fscrypt_prepare_readdir - simple __fscrypt_prepare_readdir() wrapper + * @dir: directory inode for readdir prep + * + * Simple wrapper around __fscrypt_prepare_readdir() that will mark directory as + * non-complete if this call results in having the directory unlocked. + * + * Returns: + * 1 - if directory was locked and key is now loaded (i.e. dir is unlocked) + * 0 - if directory is still locked + * < 0 - if __fscrypt_prepare_readdir() fails + */ +int ceph_fscrypt_prepare_readdir(struct inode *dir) +{ + bool had_key = fscrypt_has_encryption_key(dir); + int err; + + if (!IS_ENCRYPTED(dir)) + return 0; + + err = __fscrypt_prepare_readdir(dir); + if (err) + return err; + if (!had_key && fscrypt_has_encryption_key(dir)) { + /* directory just got unlocked, mark it as not complete */ + ceph_dir_clear_complete(dir); + return 1; + } + return 0; +} + +int ceph_fscrypt_decrypt_block_inplace(const struct inode *inode, + struct page *page, unsigned int len, + unsigned int offs, u64 lblk_num) +{ + struct ceph_client *cl = ceph_inode_to_client(inode); + + doutc(cl, "%p %llx.%llx len %u offs %u blk %llu\n", inode, + ceph_vinop(inode), len, offs, lblk_num); + return fscrypt_decrypt_block_inplace(inode, page, len, offs, lblk_num); +} + +int ceph_fscrypt_encrypt_block_inplace(const struct inode *inode, + struct page *page, unsigned int len, + unsigned int offs, u64 lblk_num, + gfp_t gfp_flags) +{ + struct ceph_client *cl = ceph_inode_to_client(inode); + + doutc(cl, "%p %llx.%llx len %u offs %u blk %llu\n", inode, + ceph_vinop(inode), len, offs, lblk_num); + return fscrypt_encrypt_block_inplace(inode, page, len, offs, lblk_num, + gfp_flags); +} + +/** + * ceph_fscrypt_decrypt_pages - decrypt an array of pages + * @inode: pointer to inode associated with these pages + * @page: pointer to page array + * @off: offset into the file that the read data starts + * @len: max length to decrypt + * + * Decrypt an array of fscrypt'ed pages and return the amount of + * data decrypted. Any data in the page prior to the start of the + * first complete block in the read is ignored. Any incomplete + * crypto blocks at the end of the array are ignored (and should + * probably be zeroed by the caller). + * + * Returns the length of the decrypted data or a negative errno. + */ +int ceph_fscrypt_decrypt_pages(struct inode *inode, struct page **page, + u64 off, int len) +{ + int i, num_blocks; + u64 baseblk = off >> CEPH_FSCRYPT_BLOCK_SHIFT; + int ret = 0; + + /* + * We can't deal with partial blocks on an encrypted file, so mask off + * the last bit. + */ + num_blocks = ceph_fscrypt_blocks(off, len & CEPH_FSCRYPT_BLOCK_MASK); + + /* Decrypt each block */ + for (i = 0; i < num_blocks; ++i) { + int blkoff = i << CEPH_FSCRYPT_BLOCK_SHIFT; + int pgidx = blkoff >> PAGE_SHIFT; + unsigned int pgoffs = offset_in_page(blkoff); + int fret; + + fret = ceph_fscrypt_decrypt_block_inplace(inode, page[pgidx], + CEPH_FSCRYPT_BLOCK_SIZE, pgoffs, + baseblk + i); + if (fret < 0) { + if (ret == 0) + ret = fret; + break; + } + ret += CEPH_FSCRYPT_BLOCK_SIZE; + } + return ret; +} + +/** + * ceph_fscrypt_decrypt_extents: decrypt received extents in given buffer + * @inode: inode associated with pages being decrypted + * @page: pointer to page array + * @off: offset into the file that the data in page[0] starts + * @map: pointer to extent array + * @ext_cnt: length of extent array + * + * Given an extent map and a page array, decrypt the received data in-place, + * skipping holes. Returns the offset into buffer of end of last decrypted + * block. + */ +int ceph_fscrypt_decrypt_extents(struct inode *inode, struct page **page, + u64 off, struct ceph_sparse_extent *map, + u32 ext_cnt) +{ + struct ceph_client *cl = ceph_inode_to_client(inode); + int i, ret = 0; + struct ceph_inode_info *ci = ceph_inode(inode); + u64 objno, objoff; + u32 xlen; + + /* Nothing to do for empty array */ + if (ext_cnt == 0) { + doutc(cl, "%p %llx.%llx empty array, ret 0\n", inode, + ceph_vinop(inode)); + return 0; + } + + ceph_calc_file_object_mapping(&ci->i_layout, off, map[0].len, + &objno, &objoff, &xlen); + + for (i = 0; i < ext_cnt; ++i) { + struct ceph_sparse_extent *ext = &map[i]; + int pgsoff = ext->off - objoff; + int pgidx = pgsoff >> PAGE_SHIFT; + int fret; + + if ((ext->off | ext->len) & ~CEPH_FSCRYPT_BLOCK_MASK) { + pr_warn_client(cl, + "%p %llx.%llx bad encrypted sparse extent " + "idx %d off %llx len %llx\n", + inode, ceph_vinop(inode), i, ext->off, + ext->len); + return -EIO; + } + fret = ceph_fscrypt_decrypt_pages(inode, &page[pgidx], + off + pgsoff, ext->len); + doutc(cl, "%p %llx.%llx [%d] 0x%llx~0x%llx fret %d\n", inode, + ceph_vinop(inode), i, ext->off, ext->len, fret); + if (fret < 0) { + if (ret == 0) + ret = fret; + break; + } + ret = pgsoff + fret; + } + doutc(cl, "ret %d\n", ret); + return ret; +} + +/** + * ceph_fscrypt_encrypt_pages - encrypt an array of pages + * @inode: pointer to inode associated with these pages + * @page: pointer to page array + * @off: offset into the file that the data starts + * @len: max length to encrypt + * @gfp: gfp flags to use for allocation + * + * Decrypt an array of cleartext pages and return the amount of + * data encrypted. Any data in the page prior to the start of the + * first complete block in the read is ignored. Any incomplete + * crypto blocks at the end of the array are ignored. + * + * Returns the length of the encrypted data or a negative errno. + */ +int ceph_fscrypt_encrypt_pages(struct inode *inode, struct page **page, u64 off, + int len, gfp_t gfp) +{ + int i, num_blocks; + u64 baseblk = off >> CEPH_FSCRYPT_BLOCK_SHIFT; + int ret = 0; + + /* + * We can't deal with partial blocks on an encrypted file, so mask off + * the last bit. + */ + num_blocks = ceph_fscrypt_blocks(off, len & CEPH_FSCRYPT_BLOCK_MASK); + + /* Encrypt each block */ + for (i = 0; i < num_blocks; ++i) { + int blkoff = i << CEPH_FSCRYPT_BLOCK_SHIFT; + int pgidx = blkoff >> PAGE_SHIFT; + unsigned int pgoffs = offset_in_page(blkoff); + int fret; + + fret = ceph_fscrypt_encrypt_block_inplace(inode, page[pgidx], + CEPH_FSCRYPT_BLOCK_SIZE, pgoffs, + baseblk + i, gfp); + if (fret < 0) { + if (ret == 0) + ret = fret; + break; + } + ret += CEPH_FSCRYPT_BLOCK_SIZE; + } + return ret; +} diff --git a/fs/ceph/crypto.h b/fs/ceph/crypto.h new file mode 100644 index 000000000000..47e0c319fc68 --- /dev/null +++ b/fs/ceph/crypto.h @@ -0,0 +1,288 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Ceph fscrypt functionality + */ + +#ifndef _CEPH_CRYPTO_H +#define _CEPH_CRYPTO_H + +#include <crypto/sha2.h> +#include <linux/fscrypt.h> + +#define CEPH_FSCRYPT_BLOCK_SHIFT 12 +#define CEPH_FSCRYPT_BLOCK_SIZE (_AC(1, UL) << CEPH_FSCRYPT_BLOCK_SHIFT) +#define CEPH_FSCRYPT_BLOCK_MASK (~(CEPH_FSCRYPT_BLOCK_SIZE-1)) + +struct ceph_fs_client; +struct ceph_acl_sec_ctx; +struct ceph_mds_request; + +struct ceph_fname { + struct inode *dir; + char *name; // b64 encoded, possibly hashed + unsigned char *ctext; // binary crypttext (if any) + u32 name_len; // length of name buffer + u32 ctext_len; // length of crypttext + bool no_copy; +}; + +/* + * Header for the crypted file when truncating the size, this + * will be sent to MDS, and the MDS will update the encrypted + * last block and then truncate the size. + */ +struct ceph_fscrypt_truncate_size_header { + __u8 ver; + __u8 compat; + + /* + * It will be sizeof(assert_ver + file_offset + block_size) + * if the last block is empty when it's located in a file + * hole. Or the data_len will plus CEPH_FSCRYPT_BLOCK_SIZE. + */ + __le32 data_len; + + __le64 change_attr; + __le64 file_offset; + __le32 block_size; +} __packed; + +struct ceph_fscrypt_auth { + __le32 cfa_version; + __le32 cfa_blob_len; + u8 cfa_blob[FSCRYPT_SET_CONTEXT_MAX_SIZE]; +} __packed; + +#define CEPH_FSCRYPT_AUTH_VERSION 1 +static inline u32 ceph_fscrypt_auth_len(struct ceph_fscrypt_auth *fa) +{ + u32 ctxsize = le32_to_cpu(fa->cfa_blob_len); + + return offsetof(struct ceph_fscrypt_auth, cfa_blob) + ctxsize; +} + +#ifdef CONFIG_FS_ENCRYPTION +/* + * We want to encrypt filenames when creating them, but the encrypted + * versions of those names may have illegal characters in them. To mitigate + * that, we base64 encode them, but that gives us a result that can exceed + * NAME_MAX. + * + * Follow a similar scheme to fscrypt itself, and cap the filename to a + * smaller size. If the ciphertext name is longer than the value below, then + * sha256 hash the remaining bytes. + * + * For the fscrypt_nokey_name struct the dirhash[2] member is useless in ceph + * so the corresponding struct will be: + * + * struct fscrypt_ceph_nokey_name { + * u8 bytes[157]; + * u8 sha256[SHA256_DIGEST_SIZE]; + * }; // 180 bytes => 240 bytes base64-encoded, which is <= NAME_MAX (255) + * + * (240 bytes is the maximum size allowed for snapshot names to take into + * account the format: '_<SNAPSHOT-NAME>_<INODE-NUMBER>'.) + * + * Note that for long names that end up having their tail portion hashed, we + * must also store the full encrypted name (in the dentry's alternate_name + * field). + */ +#define CEPH_NOHASH_NAME_MAX (180 - SHA256_DIGEST_SIZE) + +#define CEPH_BASE64_CHARS(nbytes) DIV_ROUND_UP((nbytes) * 4, 3) + +int ceph_base64_encode(const u8 *src, int srclen, char *dst); +int ceph_base64_decode(const char *src, int srclen, u8 *dst); + +void ceph_fscrypt_set_ops(struct super_block *sb); + +void ceph_fscrypt_free_dummy_policy(struct ceph_fs_client *fsc); + +int ceph_fscrypt_prepare_context(struct inode *dir, struct inode *inode, + struct ceph_acl_sec_ctx *as); +void ceph_fscrypt_as_ctx_to_req(struct ceph_mds_request *req, + struct ceph_acl_sec_ctx *as); +int ceph_encode_encrypted_dname(struct inode *parent, struct qstr *d_name, + char *buf); +int ceph_encode_encrypted_fname(struct inode *parent, struct dentry *dentry, + char *buf); + +static inline int ceph_fname_alloc_buffer(struct inode *parent, + struct fscrypt_str *fname) +{ + if (!IS_ENCRYPTED(parent)) + return 0; + return fscrypt_fname_alloc_buffer(NAME_MAX, fname); +} + +static inline void ceph_fname_free_buffer(struct inode *parent, + struct fscrypt_str *fname) +{ + if (IS_ENCRYPTED(parent)) + fscrypt_fname_free_buffer(fname); +} + +int ceph_fname_to_usr(const struct ceph_fname *fname, struct fscrypt_str *tname, + struct fscrypt_str *oname, bool *is_nokey); +int ceph_fscrypt_prepare_readdir(struct inode *dir); + +static inline unsigned int ceph_fscrypt_blocks(u64 off, u64 len) +{ + /* crypto blocks cannot span more than one page */ + BUILD_BUG_ON(CEPH_FSCRYPT_BLOCK_SHIFT > PAGE_SHIFT); + + return ((off+len+CEPH_FSCRYPT_BLOCK_SIZE-1) >> CEPH_FSCRYPT_BLOCK_SHIFT) - + (off >> CEPH_FSCRYPT_BLOCK_SHIFT); +} + +/* + * If we have an encrypted inode then we must adjust the offset and + * range of the on-the-wire read to cover an entire encryption block. + * The copy will be done using the original offset and length, after + * we've decrypted the result. + */ +static inline void ceph_fscrypt_adjust_off_and_len(struct inode *inode, + u64 *off, u64 *len) +{ + if (IS_ENCRYPTED(inode)) { + *len = ceph_fscrypt_blocks(*off, *len) * CEPH_FSCRYPT_BLOCK_SIZE; + *off &= CEPH_FSCRYPT_BLOCK_MASK; + } +} + +int ceph_fscrypt_decrypt_block_inplace(const struct inode *inode, + struct page *page, unsigned int len, + unsigned int offs, u64 lblk_num); +int ceph_fscrypt_encrypt_block_inplace(const struct inode *inode, + struct page *page, unsigned int len, + unsigned int offs, u64 lblk_num, + gfp_t gfp_flags); +int ceph_fscrypt_decrypt_pages(struct inode *inode, struct page **page, + u64 off, int len); +int ceph_fscrypt_decrypt_extents(struct inode *inode, struct page **page, + u64 off, struct ceph_sparse_extent *map, + u32 ext_cnt); +int ceph_fscrypt_encrypt_pages(struct inode *inode, struct page **page, u64 off, + int len, gfp_t gfp); + +static inline struct page *ceph_fscrypt_pagecache_page(struct page *page) +{ + return fscrypt_is_bounce_page(page) ? fscrypt_pagecache_page(page) : page; +} + +#else /* CONFIG_FS_ENCRYPTION */ + +static inline void ceph_fscrypt_set_ops(struct super_block *sb) +{ +} + +static inline void ceph_fscrypt_free_dummy_policy(struct ceph_fs_client *fsc) +{ +} + +static inline int ceph_fscrypt_prepare_context(struct inode *dir, + struct inode *inode, + struct ceph_acl_sec_ctx *as) +{ + if (IS_ENCRYPTED(dir)) + return -EOPNOTSUPP; + return 0; +} + +static inline void ceph_fscrypt_as_ctx_to_req(struct ceph_mds_request *req, + struct ceph_acl_sec_ctx *as_ctx) +{ +} + +static inline int ceph_encode_encrypted_dname(struct inode *parent, + struct qstr *d_name, char *buf) +{ + memcpy(buf, d_name->name, d_name->len); + return d_name->len; +} + +static inline int ceph_encode_encrypted_fname(struct inode *parent, + struct dentry *dentry, char *buf) +{ + return -EOPNOTSUPP; +} + +static inline int ceph_fname_alloc_buffer(struct inode *parent, + struct fscrypt_str *fname) +{ + return 0; +} + +static inline void ceph_fname_free_buffer(struct inode *parent, + struct fscrypt_str *fname) +{ +} + +static inline int ceph_fname_to_usr(const struct ceph_fname *fname, + struct fscrypt_str *tname, + struct fscrypt_str *oname, bool *is_nokey) +{ + oname->name = fname->name; + oname->len = fname->name_len; + return 0; +} + +static inline int ceph_fscrypt_prepare_readdir(struct inode *dir) +{ + return 0; +} + +static inline void ceph_fscrypt_adjust_off_and_len(struct inode *inode, + u64 *off, u64 *len) +{ +} + +static inline int ceph_fscrypt_decrypt_block_inplace(const struct inode *inode, + struct page *page, unsigned int len, + unsigned int offs, u64 lblk_num) +{ + return 0; +} + +static inline int ceph_fscrypt_encrypt_block_inplace(const struct inode *inode, + struct page *page, unsigned int len, + unsigned int offs, u64 lblk_num, + gfp_t gfp_flags) +{ + return 0; +} + +static inline int ceph_fscrypt_decrypt_pages(struct inode *inode, + struct page **page, u64 off, + int len) +{ + return 0; +} + +static inline int ceph_fscrypt_decrypt_extents(struct inode *inode, + struct page **page, u64 off, + struct ceph_sparse_extent *map, + u32 ext_cnt) +{ + return 0; +} + +static inline int ceph_fscrypt_encrypt_pages(struct inode *inode, + struct page **page, u64 off, + int len, gfp_t gfp) +{ + return 0; +} + +static inline struct page *ceph_fscrypt_pagecache_page(struct page *page) +{ + return page; +} +#endif /* CONFIG_FS_ENCRYPTION */ + +static inline loff_t ceph_fscrypt_page_offset(struct page *page) +{ + return page_offset(ceph_fscrypt_pagecache_page(page)); +} + +#endif /* _CEPH_CRYPTO_H */ diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c index 3904333fa6c3..24c08078f5aa 100644 --- a/fs/ceph/debugfs.c +++ b/fs/ceph/debugfs.c @@ -81,7 +81,7 @@ static int mdsc_show(struct seq_file *s, void *p) if (req->r_inode) { seq_printf(s, " #%llx", ceph_ino(req->r_inode)); } else if (req->r_dentry) { - path = ceph_mdsc_build_path(req->r_dentry, &pathlen, + path = ceph_mdsc_build_path(mdsc, req->r_dentry, &pathlen, &pathbase, 0); if (IS_ERR(path)) path = NULL; @@ -100,7 +100,7 @@ static int mdsc_show(struct seq_file *s, void *p) } if (req->r_old_dentry) { - path = ceph_mdsc_build_path(req->r_old_dentry, &pathlen, + path = ceph_mdsc_build_path(mdsc, req->r_old_dentry, &pathlen, &pathbase, 0); if (IS_ERR(path)) path = NULL; @@ -398,7 +398,7 @@ DEFINE_SIMPLE_ATTRIBUTE(congestion_kb_fops, congestion_kb_get, void ceph_fs_debugfs_cleanup(struct ceph_fs_client *fsc) { - dout("ceph_fs_debugfs_cleanup\n"); + doutc(fsc->client, "begin\n"); debugfs_remove(fsc->debugfs_bdi); debugfs_remove(fsc->debugfs_congestion_kb); debugfs_remove(fsc->debugfs_mdsmap); @@ -407,13 +407,14 @@ void ceph_fs_debugfs_cleanup(struct ceph_fs_client *fsc) debugfs_remove(fsc->debugfs_status); debugfs_remove(fsc->debugfs_mdsc); debugfs_remove_recursive(fsc->debugfs_metrics_dir); + doutc(fsc->client, "done\n"); } void ceph_fs_debugfs_init(struct ceph_fs_client *fsc) { char name[100]; - dout("ceph_fs_debugfs_init\n"); + doutc(fsc->client, "begin\n"); fsc->debugfs_congestion_kb = debugfs_create_file("writeback_congestion_kb", 0600, @@ -469,6 +470,7 @@ void ceph_fs_debugfs_init(struct ceph_fs_client *fsc) &metrics_size_fops); debugfs_create_file("caps", 0400, fsc->debugfs_metrics_dir, fsc, &metrics_caps_fops); + doutc(fsc->client, "done\n"); } diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index bdcffb04513f..91709934c8b1 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -9,6 +9,7 @@ #include "super.h" #include "mds_client.h" +#include "crypto.h" /* * Directory operations: readdir, lookup, create, link, unlink, @@ -108,7 +109,9 @@ static int fpos_cmp(loff_t l, loff_t r) * regardless of what dir changes take place on the * server. */ -static int note_last_dentry(struct ceph_dir_file_info *dfi, const char *name, +static int note_last_dentry(struct ceph_fs_client *fsc, + struct ceph_dir_file_info *dfi, + const char *name, int len, unsigned next_offset) { char *buf = kmalloc(len+1, GFP_KERNEL); @@ -119,7 +122,7 @@ static int note_last_dentry(struct ceph_dir_file_info *dfi, const char *name, memcpy(dfi->last_name, name, len); dfi->last_name[len] = 0; dfi->next_offset = next_offset; - dout("note_last_dentry '%s'\n", dfi->last_name); + doutc(fsc->client, "'%s'\n", dfi->last_name); return 0; } @@ -129,6 +132,7 @@ __dcache_find_get_entry(struct dentry *parent, u64 idx, struct ceph_readdir_cache_control *cache_ctl) { struct inode *dir = d_inode(parent); + struct ceph_client *cl = ceph_inode_to_client(dir); struct dentry *dentry; unsigned idx_mask = (PAGE_SIZE / sizeof(struct dentry *)) - 1; loff_t ptr_pos = idx * sizeof(struct dentry *); @@ -141,7 +145,7 @@ __dcache_find_get_entry(struct dentry *parent, u64 idx, ceph_readdir_cache_release(cache_ctl); cache_ctl->page = find_lock_page(&dir->i_data, ptr_pgoff); if (!cache_ctl->page) { - dout(" page %lu not found\n", ptr_pgoff); + doutc(cl, " page %lu not found\n", ptr_pgoff); return ERR_PTR(-EAGAIN); } /* reading/filling the cache are serialized by @@ -184,13 +188,16 @@ static int __dcache_readdir(struct file *file, struct dir_context *ctx, struct ceph_dir_file_info *dfi = file->private_data; struct dentry *parent = file->f_path.dentry; struct inode *dir = d_inode(parent); + struct ceph_fs_client *fsc = ceph_inode_to_fs_client(dir); + struct ceph_client *cl = ceph_inode_to_client(dir); struct dentry *dentry, *last = NULL; struct ceph_dentry_info *di; struct ceph_readdir_cache_control cache_ctl = {}; u64 idx = 0; int err = 0; - dout("__dcache_readdir %p v%u at %llx\n", dir, (unsigned)shared_gen, ctx->pos); + doutc(cl, "%p %llx.%llx v%u at %llx\n", dir, ceph_vinop(dir), + (unsigned)shared_gen, ctx->pos); /* search start position */ if (ctx->pos > 2) { @@ -220,7 +227,8 @@ static int __dcache_readdir(struct file *file, struct dir_context *ctx, dput(dentry); } - dout("__dcache_readdir %p cache idx %llu\n", dir, idx); + doutc(cl, "%p %llx.%llx cache idx %llu\n", dir, + ceph_vinop(dir), idx); } @@ -241,7 +249,9 @@ static int __dcache_readdir(struct file *file, struct dir_context *ctx, di = ceph_dentry(dentry); if (d_unhashed(dentry) || d_really_is_negative(dentry) || - di->lease_shared_gen != shared_gen) { + di->lease_shared_gen != shared_gen || + ((dentry->d_flags & DCACHE_NOKEY_NAME) && + fscrypt_has_encryption_key(dir))) { spin_unlock(&dentry->d_lock); dput(dentry); err = -EAGAIN; @@ -254,8 +264,8 @@ static int __dcache_readdir(struct file *file, struct dir_context *ctx, spin_unlock(&dentry->d_lock); if (emit_dentry) { - dout(" %llx dentry %p %pd %p\n", di->offset, - dentry, dentry, d_inode(dentry)); + doutc(cl, " %llx dentry %p %pd %p\n", di->offset, + dentry, dentry, d_inode(dentry)); ctx->pos = di->offset; if (!dir_emit(ctx, dentry->d_name.name, dentry->d_name.len, ceph_present_inode(d_inode(dentry)), @@ -278,7 +288,8 @@ out: if (last) { int ret; di = ceph_dentry(last); - ret = note_last_dentry(dfi, last->d_name.name, last->d_name.len, + ret = note_last_dentry(fsc, dfi, last->d_name.name, + last->d_name.len, fpos_off(di->offset) + 1); if (ret < 0) err = ret; @@ -307,20 +318,23 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx) struct ceph_dir_file_info *dfi = file->private_data; struct inode *inode = file_inode(file); struct ceph_inode_info *ci = ceph_inode(inode); - struct ceph_fs_client *fsc = ceph_inode_to_client(inode); + struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode); struct ceph_mds_client *mdsc = fsc->mdsc; + struct ceph_client *cl = fsc->client; int i; int err; unsigned frag = -1; struct ceph_mds_reply_info_parsed *rinfo; - dout("readdir %p file %p pos %llx\n", inode, file, ctx->pos); + doutc(cl, "%p %llx.%llx file %p pos %llx\n", inode, + ceph_vinop(inode), file, ctx->pos); if (dfi->file_info.flags & CEPH_F_ATEND) return 0; /* always start with . and .. */ if (ctx->pos == 0) { - dout("readdir off 0 -> '.'\n"); + doutc(cl, "%p %llx.%llx off 0 -> '.'\n", inode, + ceph_vinop(inode)); if (!dir_emit(ctx, ".", 1, ceph_present_inode(inode), inode->i_mode >> 12)) return 0; @@ -334,12 +348,17 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx) ino = ceph_present_inode(dentry->d_parent->d_inode); spin_unlock(&dentry->d_lock); - dout("readdir off 1 -> '..'\n"); + doutc(cl, "%p %llx.%llx off 1 -> '..'\n", inode, + ceph_vinop(inode)); if (!dir_emit(ctx, "..", 2, ino, inode->i_mode >> 12)) return 0; ctx->pos = 2; } + err = ceph_fscrypt_prepare_readdir(inode); + if (err < 0) + return err; + spin_lock(&ci->i_ceph_lock); /* request Fx cap. if have Fx, we don't need to release Fs cap * for later create/unlink. */ @@ -384,11 +403,12 @@ more: frag = fpos_frag(ctx->pos); } - dout("readdir fetching %llx.%llx frag %x offset '%s'\n", - ceph_vinop(inode), frag, dfi->last_name); + doutc(cl, "fetching %p %llx.%llx frag %x offset '%s'\n", + inode, ceph_vinop(inode), frag, dfi->last_name); req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS); if (IS_ERR(req)) return PTR_ERR(req); + err = ceph_alloc_readdir_reply_buffer(req, inode); if (err) { ceph_mdsc_put_request(req); @@ -402,11 +422,21 @@ more: req->r_inode_drop = CEPH_CAP_FILE_EXCL; } if (dfi->last_name) { - req->r_path2 = kstrdup(dfi->last_name, GFP_KERNEL); + struct qstr d_name = { .name = dfi->last_name, + .len = strlen(dfi->last_name) }; + + req->r_path2 = kzalloc(NAME_MAX + 1, GFP_KERNEL); if (!req->r_path2) { ceph_mdsc_put_request(req); return -ENOMEM; } + + err = ceph_encode_encrypted_dname(inode, &d_name, + req->r_path2); + if (err < 0) { + ceph_mdsc_put_request(req); + return err; + } } else if (is_hash_order(ctx->pos)) { req->r_args.readdir.offset_hash = cpu_to_le32(fpos_hash(ctx->pos)); @@ -428,12 +458,12 @@ more: ceph_mdsc_put_request(req); return err; } - dout("readdir got and parsed readdir result=%d on " - "frag %x, end=%d, complete=%d, hash_order=%d\n", - err, frag, - (int)req->r_reply_info.dir_end, - (int)req->r_reply_info.dir_complete, - (int)req->r_reply_info.hash_order); + doutc(cl, "%p %llx.%llx got and parsed readdir result=%d" + "on frag %x, end=%d, complete=%d, hash_order=%d\n", + inode, ceph_vinop(inode), err, frag, + (int)req->r_reply_info.dir_end, + (int)req->r_reply_info.dir_complete, + (int)req->r_reply_info.hash_order); rinfo = &req->r_reply_info; if (le32_to_cpu(rinfo->dir_dir->frag) != frag) { @@ -463,7 +493,8 @@ more: dfi->dir_ordered_count = req->r_dir_ordered_cnt; } } else { - dout("readdir !did_prepopulate\n"); + doutc(cl, "%p %llx.%llx !did_prepopulate\n", inode, + ceph_vinop(inode)); /* disable readdir cache */ dfi->readdir_cache_idx = -1; /* preclude from marking dir complete */ @@ -476,8 +507,8 @@ more: rinfo->dir_entries + (rinfo->dir_nr-1); unsigned next_offset = req->r_reply_info.dir_end ? 2 : (fpos_off(rde->offset) + 1); - err = note_last_dentry(dfi, rde->name, rde->name_len, - next_offset); + err = note_last_dentry(fsc, dfi, rde->name, + rde->name_len, next_offset); if (err) { ceph_mdsc_put_request(dfi->last_readdir); dfi->last_readdir = NULL; @@ -490,9 +521,9 @@ more: } rinfo = &dfi->last_readdir->r_reply_info; - dout("readdir frag %x num %d pos %llx chunk first %llx\n", - dfi->frag, rinfo->dir_nr, ctx->pos, - rinfo->dir_nr ? rinfo->dir_entries[0].offset : 0LL); + doutc(cl, "%p %llx.%llx frag %x num %d pos %llx chunk first %llx\n", + inode, ceph_vinop(inode), dfi->frag, rinfo->dir_nr, ctx->pos, + rinfo->dir_nr ? rinfo->dir_entries[0].offset : 0LL); i = 0; /* search start position */ @@ -511,14 +542,20 @@ more: for (; i < rinfo->dir_nr; i++) { struct ceph_mds_reply_dir_entry *rde = rinfo->dir_entries + i; - BUG_ON(rde->offset < ctx->pos); + if (rde->offset < ctx->pos) { + pr_warn_client(cl, + "%p %llx.%llx rde->offset 0x%llx ctx->pos 0x%llx\n", + inode, ceph_vinop(inode), rde->offset, ctx->pos); + return -EIO; + } - ctx->pos = rde->offset; - dout("readdir (%d/%d) -> %llx '%.*s' %p\n", - i, rinfo->dir_nr, ctx->pos, - rde->name_len, rde->name, &rde->inode.in); + if (WARN_ON_ONCE(!rde->inode.in)) + return -EIO; - BUG_ON(!rde->inode.in); + ctx->pos = rde->offset; + doutc(cl, "%p %llx.%llx (%d/%d) -> %llx '%.*s' %p\n", inode, + ceph_vinop(inode), i, rinfo->dir_nr, ctx->pos, + rde->name_len, rde->name, &rde->inode.in); if (!dir_emit(ctx, rde->name, rde->name_len, ceph_present_ino(inode->i_sb, le64_to_cpu(rde->inode.in->ino)), @@ -529,9 +566,11 @@ more: * doesn't have enough memory, etc. So for next readdir * it will continue. */ - dout("filldir stopping us...\n"); + doutc(cl, "filldir stopping us...\n"); return 0; } + + /* Reset the lengths to their original allocated vals */ ctx->pos++; } @@ -558,7 +597,8 @@ more: kfree(dfi->last_name); dfi->last_name = NULL; } - dout("readdir next frag is %x\n", frag); + doutc(cl, "%p %llx.%llx next frag is %x\n", inode, + ceph_vinop(inode), frag); goto more; } dfi->file_info.flags |= CEPH_F_ATEND; @@ -573,21 +613,23 @@ more: spin_lock(&ci->i_ceph_lock); if (dfi->dir_ordered_count == atomic64_read(&ci->i_ordered_count)) { - dout(" marking %p complete and ordered\n", inode); + doutc(cl, " marking %p %llx.%llx complete and ordered\n", + inode, ceph_vinop(inode)); /* use i_size to track number of entries in * readdir cache */ BUG_ON(dfi->readdir_cache_idx < 0); i_size_write(inode, dfi->readdir_cache_idx * sizeof(struct dentry*)); } else { - dout(" marking %p complete\n", inode); + doutc(cl, " marking %llx.%llx complete\n", + ceph_vinop(inode)); } __ceph_dir_set_complete(ci, dfi->dir_release_count, dfi->dir_ordered_count); spin_unlock(&ci->i_ceph_lock); } - - dout("readdir %p file %p done.\n", inode, file); + doutc(cl, "%p %llx.%llx file %p done.\n", inode, ceph_vinop(inode), + file); return 0; } @@ -633,6 +675,7 @@ static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int whence) { struct ceph_dir_file_info *dfi = file->private_data; struct inode *inode = file->f_mapping->host; + struct ceph_client *cl = ceph_inode_to_client(inode); loff_t retval; inode_lock(inode); @@ -652,7 +695,8 @@ static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int whence) if (offset >= 0) { if (need_reset_readdir(dfi, offset)) { - dout("dir_llseek dropping %p content\n", file); + doutc(cl, "%p %llx.%llx dropping %p content\n", + inode, ceph_vinop(inode), file); reset_readdir(dfi); } else if (is_hash_order(offset) && offset > file->f_pos) { /* for hash offset, we don't know if a forward seek @@ -679,8 +723,9 @@ out: struct dentry *ceph_handle_snapdir(struct ceph_mds_request *req, struct dentry *dentry) { - struct ceph_fs_client *fsc = ceph_sb_to_client(dentry->d_sb); + struct ceph_fs_client *fsc = ceph_sb_to_fs_client(dentry->d_sb); struct inode *parent = d_inode(dentry->d_parent); /* we hold i_rwsem */ + struct ceph_client *cl = ceph_inode_to_client(parent); /* .snap dir? */ if (ceph_snap(parent) == CEPH_NOSNAP && @@ -689,8 +734,9 @@ struct dentry *ceph_handle_snapdir(struct ceph_mds_request *req, struct inode *inode = ceph_get_snapdir(parent); res = d_splice_alias(inode, dentry); - dout("ENOENT on snapdir %p '%pd', linking to snapdir %p. Spliced dentry %p\n", - dentry, dentry, inode, res); + doutc(cl, "ENOENT on snapdir %p '%pd', linking to " + "snapdir %p %llx.%llx. Spliced dentry %p\n", + dentry, dentry, inode, ceph_vinop(inode), res); if (res) dentry = res; } @@ -711,12 +757,15 @@ struct dentry *ceph_handle_snapdir(struct ceph_mds_request *req, struct dentry *ceph_finish_lookup(struct ceph_mds_request *req, struct dentry *dentry, int err) { + struct ceph_client *cl = req->r_mdsc->fsc->client; + if (err == -ENOENT) { /* no trace? */ err = 0; if (!req->r_reply_info.head->is_dentry) { - dout("ENOENT and no trace, dentry %p inode %p\n", - dentry, d_inode(dentry)); + doutc(cl, + "ENOENT and no trace, dentry %p inode %llx.%llx\n", + dentry, ceph_vinop(d_inode(dentry))); if (d_really_is_positive(dentry)) { d_drop(dentry); err = -ENOENT; @@ -747,26 +796,40 @@ static bool is_root_ceph_dentry(struct inode *inode, struct dentry *dentry) static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) { - struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb); + struct ceph_fs_client *fsc = ceph_sb_to_fs_client(dir->i_sb); struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(dir->i_sb); + struct ceph_client *cl = fsc->client; struct ceph_mds_request *req; int op; int mask; int err; - dout("lookup %p dentry %p '%pd'\n", - dir, dentry, dentry); + doutc(cl, "%p %llx.%llx/'%pd' dentry %p\n", dir, ceph_vinop(dir), + dentry, dentry); if (dentry->d_name.len > NAME_MAX) return ERR_PTR(-ENAMETOOLONG); + if (IS_ENCRYPTED(dir)) { + bool had_key = fscrypt_has_encryption_key(dir); + + err = fscrypt_prepare_lookup_partial(dir, dentry); + if (err < 0) + return ERR_PTR(err); + + /* mark directory as incomplete if it has been unlocked */ + if (!had_key && fscrypt_has_encryption_key(dir)) + ceph_dir_clear_complete(dir); + } + /* can we conclude ENOENT locally? */ if (d_really_is_negative(dentry)) { struct ceph_inode_info *ci = ceph_inode(dir); struct ceph_dentry_info *di = ceph_dentry(dentry); spin_lock(&ci->i_ceph_lock); - dout(" dir %p flags are 0x%lx\n", dir, ci->i_ceph_flags); + doutc(cl, " dir %llx.%llx flags are 0x%lx\n", + ceph_vinop(dir), ci->i_ceph_flags); if (strncmp(dentry->d_name.name, fsc->mount_options->snapdir_name, dentry->d_name.len) && @@ -776,7 +839,8 @@ static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry, __ceph_caps_issued_mask_metric(ci, CEPH_CAP_FILE_SHARED, 1)) { __ceph_touch_fmode(ci, mdsc, CEPH_FILE_MODE_RD); spin_unlock(&ci->i_ceph_lock); - dout(" dir %p complete, -ENOENT\n", dir); + doutc(cl, " dir %llx.%llx complete, -ENOENT\n", + ceph_vinop(dir)); d_add(dentry, NULL); di->lease_shared_gen = atomic_read(&ci->i_shared_gen); return NULL; @@ -814,7 +878,7 @@ static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry, } dentry = ceph_finish_lookup(req, dentry, err); ceph_mdsc_put_request(req); /* will dput(dentry) */ - dout("lookup result=%p\n", dentry); + doutc(cl, "result=%p\n", dentry); return dentry; } @@ -849,6 +913,7 @@ static int ceph_mknod(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, dev_t rdev) { struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(dir->i_sb); + struct ceph_client *cl = mdsc->fsc->client; struct ceph_mds_request *req; struct ceph_acl_sec_ctx as_ctx = {}; int err; @@ -865,37 +930,42 @@ static int ceph_mknod(struct mnt_idmap *idmap, struct inode *dir, goto out; } - err = ceph_pre_init_acls(dir, &mode, &as_ctx); - if (err < 0) - goto out; - err = ceph_security_init_secctx(dentry, mode, &as_ctx); - if (err < 0) - goto out; - - dout("mknod in dir %p dentry %p mode 0%ho rdev %d\n", - dir, dentry, mode, rdev); + doutc(cl, "%p %llx.%llx/'%pd' dentry %p mode 0%ho rdev %d\n", + dir, ceph_vinop(dir), dentry, dentry, mode, rdev); req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_MKNOD, USE_AUTH_MDS); if (IS_ERR(req)) { err = PTR_ERR(req); goto out; } + + req->r_new_inode = ceph_new_inode(dir, dentry, &mode, &as_ctx); + if (IS_ERR(req->r_new_inode)) { + err = PTR_ERR(req->r_new_inode); + req->r_new_inode = NULL; + goto out_req; + } + + if (S_ISREG(mode) && IS_ENCRYPTED(dir)) + set_bit(CEPH_MDS_R_FSCRYPT_FILE, &req->r_req_flags); + req->r_dentry = dget(dentry); req->r_num_caps = 2; req->r_parent = dir; ihold(dir); set_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags); + req->r_mnt_idmap = mnt_idmap_get(idmap); req->r_args.mknod.mode = cpu_to_le32(mode); req->r_args.mknod.rdev = cpu_to_le32(rdev); req->r_dentry_drop = CEPH_CAP_FILE_SHARED | CEPH_CAP_AUTH_EXCL | CEPH_CAP_XATTR_EXCL; req->r_dentry_unless = CEPH_CAP_FILE_EXCL; - if (as_ctx.pagelist) { - req->r_pagelist = as_ctx.pagelist; - as_ctx.pagelist = NULL; - } + + ceph_as_ctx_to_req(req, &as_ctx); + err = ceph_mdsc_do_request(mdsc, dir, req); if (!err && !req->r_reply_info.head->is_dentry) err = ceph_handle_notrace_create(dir, dentry); +out_req: ceph_mdsc_put_request(req); out: if (!err) @@ -912,12 +982,51 @@ static int ceph_create(struct mnt_idmap *idmap, struct inode *dir, return ceph_mknod(idmap, dir, dentry, mode, 0); } +#if IS_ENABLED(CONFIG_FS_ENCRYPTION) +static int prep_encrypted_symlink_target(struct ceph_mds_request *req, + const char *dest) +{ + int err; + int len = strlen(dest); + struct fscrypt_str osd_link = FSTR_INIT(NULL, 0); + + err = fscrypt_prepare_symlink(req->r_parent, dest, len, PATH_MAX, + &osd_link); + if (err) + goto out; + + err = fscrypt_encrypt_symlink(req->r_new_inode, dest, len, &osd_link); + if (err) + goto out; + + req->r_path2 = kmalloc(CEPH_BASE64_CHARS(osd_link.len) + 1, GFP_KERNEL); + if (!req->r_path2) { + err = -ENOMEM; + goto out; + } + + len = ceph_base64_encode(osd_link.name, osd_link.len, req->r_path2); + req->r_path2[len] = '\0'; +out: + fscrypt_fname_free_buffer(&osd_link); + return err; +} +#else +static int prep_encrypted_symlink_target(struct ceph_mds_request *req, + const char *dest) +{ + return -EOPNOTSUPP; +} +#endif + static int ceph_symlink(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, const char *dest) { struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(dir->i_sb); + struct ceph_client *cl = mdsc->fsc->client; struct ceph_mds_request *req; struct ceph_acl_sec_ctx as_ctx = {}; + umode_t mode = S_IFLNK | 0777; int err; if (ceph_snap(dir) != CEPH_NOSNAP) @@ -932,38 +1041,50 @@ static int ceph_symlink(struct mnt_idmap *idmap, struct inode *dir, goto out; } - err = ceph_security_init_secctx(dentry, S_IFLNK | 0777, &as_ctx); - if (err < 0) - goto out; - - dout("symlink in dir %p dentry %p to '%s'\n", dir, dentry, dest); + doutc(cl, "%p %llx.%llx/'%pd' to '%s'\n", dir, ceph_vinop(dir), dentry, + dest); req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_SYMLINK, USE_AUTH_MDS); if (IS_ERR(req)) { err = PTR_ERR(req); goto out; } - req->r_path2 = kstrdup(dest, GFP_KERNEL); - if (!req->r_path2) { - err = -ENOMEM; - ceph_mdsc_put_request(req); - goto out; + + req->r_new_inode = ceph_new_inode(dir, dentry, &mode, &as_ctx); + if (IS_ERR(req->r_new_inode)) { + err = PTR_ERR(req->r_new_inode); + req->r_new_inode = NULL; + goto out_req; } + req->r_parent = dir; ihold(dir); + if (IS_ENCRYPTED(req->r_new_inode)) { + err = prep_encrypted_symlink_target(req, dest); + if (err) + goto out_req; + } else { + req->r_path2 = kstrdup(dest, GFP_KERNEL); + if (!req->r_path2) { + err = -ENOMEM; + goto out_req; + } + } + set_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags); + req->r_mnt_idmap = mnt_idmap_get(idmap); req->r_dentry = dget(dentry); req->r_num_caps = 2; req->r_dentry_drop = CEPH_CAP_FILE_SHARED | CEPH_CAP_AUTH_EXCL | CEPH_CAP_XATTR_EXCL; req->r_dentry_unless = CEPH_CAP_FILE_EXCL; - if (as_ctx.pagelist) { - req->r_pagelist = as_ctx.pagelist; - as_ctx.pagelist = NULL; - } + + ceph_as_ctx_to_req(req, &as_ctx); + err = ceph_mdsc_do_request(mdsc, dir, req); if (!err && !req->r_reply_info.head->is_dentry) err = ceph_handle_notrace_create(dir, dentry); +out_req: ceph_mdsc_put_request(req); out: if (err) @@ -976,6 +1097,7 @@ static int ceph_mkdir(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode) { struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(dir->i_sb); + struct ceph_client *cl = mdsc->fsc->client; struct ceph_mds_request *req; struct ceph_acl_sec_ctx as_ctx = {}; int err; @@ -988,10 +1110,11 @@ static int ceph_mkdir(struct mnt_idmap *idmap, struct inode *dir, if (ceph_snap(dir) == CEPH_SNAPDIR) { /* mkdir .snap/foo is a MKSNAP */ op = CEPH_MDS_OP_MKSNAP; - dout("mksnap dir %p snap '%pd' dn %p\n", dir, - dentry, dentry); + doutc(cl, "mksnap %llx.%llx/'%pd' dentry %p\n", + ceph_vinop(dir), dentry, dentry); } else if (ceph_snap(dir) == CEPH_NOSNAP) { - dout("mkdir dir %p dn %p mode 0%ho\n", dir, dentry, mode); + doutc(cl, "mkdir %llx.%llx/'%pd' dentry %p mode 0%ho\n", + ceph_vinop(dir), dentry, dentry, mode); op = CEPH_MDS_OP_MKDIR; } else { err = -EROFS; @@ -1003,14 +1126,12 @@ static int ceph_mkdir(struct mnt_idmap *idmap, struct inode *dir, err = -EDQUOT; goto out; } - - mode |= S_IFDIR; - err = ceph_pre_init_acls(dir, &mode, &as_ctx); - if (err < 0) - goto out; - err = ceph_security_init_secctx(dentry, mode, &as_ctx); - if (err < 0) + if ((op == CEPH_MDS_OP_MKSNAP) && IS_ENCRYPTED(dir) && + !fscrypt_has_encryption_key(dir)) { + err = -ENOKEY; goto out; + } + req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS); if (IS_ERR(req)) { @@ -1018,24 +1139,34 @@ static int ceph_mkdir(struct mnt_idmap *idmap, struct inode *dir, goto out; } + mode |= S_IFDIR; + req->r_new_inode = ceph_new_inode(dir, dentry, &mode, &as_ctx); + if (IS_ERR(req->r_new_inode)) { + err = PTR_ERR(req->r_new_inode); + req->r_new_inode = NULL; + goto out_req; + } + req->r_dentry = dget(dentry); req->r_num_caps = 2; req->r_parent = dir; ihold(dir); set_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags); + if (op == CEPH_MDS_OP_MKDIR) + req->r_mnt_idmap = mnt_idmap_get(idmap); req->r_args.mkdir.mode = cpu_to_le32(mode); req->r_dentry_drop = CEPH_CAP_FILE_SHARED | CEPH_CAP_AUTH_EXCL | CEPH_CAP_XATTR_EXCL; req->r_dentry_unless = CEPH_CAP_FILE_EXCL; - if (as_ctx.pagelist) { - req->r_pagelist = as_ctx.pagelist; - as_ctx.pagelist = NULL; - } + + ceph_as_ctx_to_req(req, &as_ctx); + err = ceph_mdsc_do_request(mdsc, dir, req); if (!err && !req->r_reply_info.head->is_target && !req->r_reply_info.head->is_dentry) err = ceph_handle_notrace_create(dir, dentry); +out_req: ceph_mdsc_put_request(req); out: if (!err) @@ -1050,6 +1181,7 @@ static int ceph_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry) { struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(dir->i_sb); + struct ceph_client *cl = mdsc->fsc->client; struct ceph_mds_request *req; int err; @@ -1063,8 +1195,12 @@ static int ceph_link(struct dentry *old_dentry, struct inode *dir, if (ceph_snap(dir) != CEPH_NOSNAP) return -EROFS; - dout("link in dir %p %llx.%llx old_dentry %p:'%pd' dentry %p:'%pd'\n", - dir, ceph_vinop(dir), old_dentry, old_dentry, dentry, dentry); + err = fscrypt_prepare_link(old_dentry, dir, dentry); + if (err) + return err; + + doutc(cl, "%p %llx.%llx/'%pd' to '%pd'\n", dir, ceph_vinop(dir), + old_dentry, dentry); req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_LINK, USE_AUTH_MDS); if (IS_ERR(req)) { d_drop(dentry); @@ -1101,14 +1237,16 @@ static void ceph_async_unlink_cb(struct ceph_mds_client *mdsc, struct ceph_mds_request *req) { struct dentry *dentry = req->r_dentry; - struct ceph_fs_client *fsc = ceph_sb_to_client(dentry->d_sb); + struct ceph_fs_client *fsc = ceph_sb_to_fs_client(dentry->d_sb); + struct ceph_client *cl = fsc->client; struct ceph_dentry_info *di = ceph_dentry(dentry); int result = req->r_err ? req->r_err : le32_to_cpu(req->r_reply_info.head->result); if (!test_bit(CEPH_DENTRY_ASYNC_UNLINK_BIT, &di->flags)) - pr_warn("%s dentry %p:%pd async unlink bit is not set\n", - __func__, dentry, dentry); + pr_warn_client(cl, + "dentry %p:%pd async unlink bit is not set\n", + dentry, dentry); spin_lock(&fsc->async_unlink_conflict_lock); hash_del_rcu(&di->hnode); @@ -1128,7 +1266,7 @@ static void ceph_async_unlink_cb(struct ceph_mds_client *mdsc, if (result) { int pathlen = 0; u64 base = 0; - char *path = ceph_mdsc_build_path(dentry, &pathlen, + char *path = ceph_mdsc_build_path(mdsc, dentry, &pathlen, &base, 0); /* mark error on parent + clear complete */ @@ -1142,8 +1280,8 @@ static void ceph_async_unlink_cb(struct ceph_mds_client *mdsc, /* mark inode itself for an error (since metadata is bogus) */ mapping_set_error(req->r_old_inode->i_mapping, result); - pr_warn("async unlink failure path=(%llx)%s result=%d!\n", - base, IS_ERR(path) ? "<<bad>>" : path, result); + pr_warn_client(cl, "failure path=(%llx)%s result=%d!\n", + base, IS_ERR(path) ? "<<bad>>" : path, result); ceph_mdsc_free_path(path, pathlen); } out: @@ -1192,7 +1330,8 @@ static int get_caps_for_async_unlink(struct inode *dir, struct dentry *dentry) */ static int ceph_unlink(struct inode *dir, struct dentry *dentry) { - struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb); + struct ceph_fs_client *fsc = ceph_sb_to_fs_client(dir->i_sb); + struct ceph_client *cl = fsc->client; struct ceph_mds_client *mdsc = fsc->mdsc; struct inode *inode = d_inode(dentry); struct ceph_mds_request *req; @@ -1202,11 +1341,12 @@ static int ceph_unlink(struct inode *dir, struct dentry *dentry) if (ceph_snap(dir) == CEPH_SNAPDIR) { /* rmdir .snap/foo is RMSNAP */ - dout("rmsnap dir %p '%pd' dn %p\n", dir, dentry, dentry); + doutc(cl, "rmsnap %llx.%llx/'%pd' dn\n", ceph_vinop(dir), + dentry); op = CEPH_MDS_OP_RMSNAP; } else if (ceph_snap(dir) == CEPH_NOSNAP) { - dout("unlink/rmdir dir %p dn %p inode %p\n", - dir, dentry, inode); + doutc(cl, "unlink/rmdir %llx.%llx/'%pd' inode %llx.%llx\n", + ceph_vinop(dir), dentry, ceph_vinop(inode)); op = d_is_dir(dentry) ? CEPH_MDS_OP_RMDIR : CEPH_MDS_OP_UNLINK; } else @@ -1229,9 +1369,9 @@ retry: (req->r_dir_caps = get_caps_for_async_unlink(dir, dentry))) { struct ceph_dentry_info *di = ceph_dentry(dentry); - dout("async unlink on %llu/%.*s caps=%s", ceph_ino(dir), - dentry->d_name.len, dentry->d_name.name, - ceph_cap_string(req->r_dir_caps)); + doutc(cl, "async unlink on %llx.%llx/'%pd' caps=%s", + ceph_vinop(dir), dentry, + ceph_cap_string(req->r_dir_caps)); set_bit(CEPH_MDS_R_ASYNC, &req->r_req_flags); req->r_callback = ceph_async_unlink_cb; req->r_old_inode = d_inode(dentry); @@ -1286,6 +1426,7 @@ static int ceph_rename(struct mnt_idmap *idmap, struct inode *old_dir, struct dentry *new_dentry, unsigned int flags) { struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(old_dir->i_sb); + struct ceph_client *cl = mdsc->fsc->client; struct ceph_mds_request *req; int op = CEPH_MDS_OP_RENAME; int err; @@ -1310,8 +1451,14 @@ static int ceph_rename(struct mnt_idmap *idmap, struct inode *old_dir, if (err) return err; - dout("rename dir %p dentry %p to dir %p dentry %p\n", - old_dir, old_dentry, new_dir, new_dentry); + err = fscrypt_prepare_rename(old_dir, old_dentry, new_dir, new_dentry, + flags); + if (err) + return err; + + doutc(cl, "%llx.%llx/'%pd' to %llx.%llx/'%pd'\n", + ceph_vinop(old_dir), old_dentry, ceph_vinop(new_dir), + new_dentry); req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS); if (IS_ERR(req)) return PTR_ERR(req); @@ -1356,9 +1503,10 @@ static int ceph_rename(struct mnt_idmap *idmap, struct inode *old_dir, void __ceph_dentry_lease_touch(struct ceph_dentry_info *di) { struct dentry *dn = di->dentry; - struct ceph_mds_client *mdsc; + struct ceph_mds_client *mdsc = ceph_sb_to_fs_client(dn->d_sb)->mdsc; + struct ceph_client *cl = mdsc->fsc->client; - dout("dentry_lease_touch %p %p '%pd'\n", di, dn, dn); + doutc(cl, "%p %p '%pd'\n", di, dn, dn); di->flags |= CEPH_DENTRY_LEASE_LIST; if (di->flags & CEPH_DENTRY_SHRINK_LIST) { @@ -1366,7 +1514,6 @@ void __ceph_dentry_lease_touch(struct ceph_dentry_info *di) return; } - mdsc = ceph_sb_to_client(dn->d_sb)->mdsc; spin_lock(&mdsc->dentry_list_lock); list_move_tail(&di->lease_list, &mdsc->dentry_leases); spin_unlock(&mdsc->dentry_list_lock); @@ -1390,10 +1537,10 @@ static void __dentry_dir_lease_touch(struct ceph_mds_client* mdsc, void __ceph_dentry_dir_lease_touch(struct ceph_dentry_info *di) { struct dentry *dn = di->dentry; - struct ceph_mds_client *mdsc; + struct ceph_mds_client *mdsc = ceph_sb_to_fs_client(dn->d_sb)->mdsc; + struct ceph_client *cl = mdsc->fsc->client; - dout("dentry_dir_lease_touch %p %p '%pd' (offset 0x%llx)\n", - di, dn, dn, di->offset); + doutc(cl, "%p %p '%pd' (offset 0x%llx)\n", di, dn, dn, di->offset); if (!list_empty(&di->lease_list)) { if (di->flags & CEPH_DENTRY_LEASE_LIST) { @@ -1413,7 +1560,6 @@ void __ceph_dentry_dir_lease_touch(struct ceph_dentry_info *di) return; } - mdsc = ceph_sb_to_client(dn->d_sb)->mdsc; spin_lock(&mdsc->dentry_list_lock); __dentry_dir_lease_touch(mdsc, di), spin_unlock(&mdsc->dentry_list_lock); @@ -1427,7 +1573,7 @@ static void __dentry_lease_unlist(struct ceph_dentry_info *di) if (list_empty(&di->lease_list)) return; - mdsc = ceph_sb_to_client(di->dentry->d_sb)->mdsc; + mdsc = ceph_sb_to_fs_client(di->dentry->d_sb)->mdsc; spin_lock(&mdsc->dentry_list_lock); list_del_init(&di->lease_list); spin_unlock(&mdsc->dentry_list_lock); @@ -1654,6 +1800,8 @@ static int dentry_lease_is_valid(struct dentry *dentry, unsigned int flags) { struct ceph_dentry_info *di; struct ceph_mds_session *session = NULL; + struct ceph_mds_client *mdsc = ceph_sb_to_fs_client(dentry->d_sb)->mdsc; + struct ceph_client *cl = mdsc->fsc->client; u32 seq = 0; int valid = 0; @@ -1686,7 +1834,7 @@ static int dentry_lease_is_valid(struct dentry *dentry, unsigned int flags) CEPH_MDS_LEASE_RENEW, seq); ceph_put_mds_session(session); } - dout("dentry_lease_is_valid - dentry %p = %d\n", dentry, valid); + doutc(cl, "dentry %p = %d\n", dentry, valid); return valid; } @@ -1729,6 +1877,7 @@ static int dir_lease_is_valid(struct inode *dir, struct dentry *dentry, struct ceph_mds_client *mdsc) { struct ceph_inode_info *ci = ceph_inode(dir); + struct ceph_client *cl = mdsc->fsc->client; int valid; int shared_gen; @@ -1750,8 +1899,9 @@ static int dir_lease_is_valid(struct inode *dir, struct dentry *dentry, valid = 0; spin_unlock(&dentry->d_lock); } - dout("dir_lease_is_valid dir %p v%u dentry %p = %d\n", - dir, (unsigned)atomic_read(&ci->i_shared_gen), dentry, valid); + doutc(cl, "dir %p %llx.%llx v%u dentry %p '%pd' = %d\n", dir, + ceph_vinop(dir), (unsigned)atomic_read(&ci->i_shared_gen), + dentry, dentry, valid); return valid; } @@ -1760,10 +1910,15 @@ static int dir_lease_is_valid(struct inode *dir, struct dentry *dentry, */ static int ceph_d_revalidate(struct dentry *dentry, unsigned int flags) { + struct ceph_mds_client *mdsc = ceph_sb_to_fs_client(dentry->d_sb)->mdsc; + struct ceph_client *cl = mdsc->fsc->client; int valid = 0; struct dentry *parent; struct inode *dir, *inode; - struct ceph_mds_client *mdsc; + + valid = fscrypt_d_revalidate(dentry, flags); + if (valid <= 0) + return valid; if (flags & LOOKUP_RCU) { parent = READ_ONCE(dentry->d_parent); @@ -1777,15 +1932,16 @@ static int ceph_d_revalidate(struct dentry *dentry, unsigned int flags) inode = d_inode(dentry); } - dout("d_revalidate %p '%pd' inode %p offset 0x%llx\n", dentry, - dentry, inode, ceph_dentry(dentry)->offset); + doutc(cl, "%p '%pd' inode %p offset 0x%llx nokey %d\n", + dentry, dentry, inode, ceph_dentry(dentry)->offset, + !!(dentry->d_flags & DCACHE_NOKEY_NAME)); - mdsc = ceph_sb_to_client(dir->i_sb)->mdsc; + mdsc = ceph_sb_to_fs_client(dir->i_sb)->mdsc; /* always trust cached snapped dentries, snapdir dentry */ if (ceph_snap(dir) != CEPH_NOSNAP) { - dout("d_revalidate %p '%pd' inode %p is SNAPPED\n", dentry, - dentry, inode); + doutc(cl, "%p '%pd' inode %p is SNAPPED\n", dentry, + dentry, inode); valid = 1; } else if (inode && ceph_snap(inode) == CEPH_SNAPDIR) { valid = 1; @@ -1840,14 +1996,14 @@ static int ceph_d_revalidate(struct dentry *dentry, unsigned int flags) break; } ceph_mdsc_put_request(req); - dout("d_revalidate %p lookup result=%d\n", - dentry, err); + doutc(cl, "%p '%pd', lookup result=%d\n", dentry, + dentry, err); } } else { percpu_counter_inc(&mdsc->metric.d_lease_hit); } - dout("d_revalidate %p %s\n", dentry, valid ? "valid" : "invalid"); + doutc(cl, "%p '%pd' %s\n", dentry, dentry, valid ? "valid" : "invalid"); if (!valid) ceph_dir_clear_complete(dir); @@ -1887,9 +2043,9 @@ static int ceph_d_delete(const struct dentry *dentry) static void ceph_d_release(struct dentry *dentry) { struct ceph_dentry_info *di = ceph_dentry(dentry); - struct ceph_fs_client *fsc = ceph_sb_to_client(dentry->d_sb); + struct ceph_fs_client *fsc = ceph_sb_to_fs_client(dentry->d_sb); - dout("d_release %p\n", dentry); + doutc(fsc->client, "dentry %p '%pd'\n", dentry, dentry); atomic64_dec(&fsc->mdsc->metric.total_dentries); @@ -1910,10 +2066,12 @@ static void ceph_d_release(struct dentry *dentry) */ static void ceph_d_prune(struct dentry *dentry) { + struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(dentry->d_sb); + struct ceph_client *cl = mdsc->fsc->client; struct ceph_inode_info *dir_ci; struct ceph_dentry_info *di; - dout("ceph_d_prune %pd %p\n", dentry, dentry); + doutc(cl, "dentry %p '%pd'\n", dentry, dentry); /* do we have a valid parent? */ if (IS_ROOT(dentry)) @@ -1956,7 +2114,7 @@ static ssize_t ceph_read_dir(struct file *file, char __user *buf, size_t size, int left; const int bufsize = 1024; - if (!ceph_test_mount_opt(ceph_sb_to_client(inode->i_sb), DIRSTAT)) + if (!ceph_test_mount_opt(ceph_sb_to_fs_client(inode->i_sb), DIRSTAT)) return -EISDIR; if (!dfi->dir_info) { diff --git a/fs/ceph/export.c b/fs/ceph/export.c index f780e4e0d062..726af69d4d62 100644 --- a/fs/ceph/export.c +++ b/fs/ceph/export.c @@ -7,6 +7,7 @@ #include "super.h" #include "mds_client.h" +#include "crypto.h" /* * Basic fh @@ -35,6 +36,7 @@ struct ceph_nfs_snapfh { static int ceph_encode_snapfh(struct inode *inode, u32 *rawfh, int *max_len, struct inode *parent_inode) { + struct ceph_client *cl = ceph_inode_to_client(inode); static const int snap_handle_length = sizeof(struct ceph_nfs_snapfh) >> 2; struct ceph_nfs_snapfh *sfh = (void *)rawfh; @@ -78,13 +80,14 @@ static int ceph_encode_snapfh(struct inode *inode, u32 *rawfh, int *max_len, *max_len = snap_handle_length; ret = FILEID_BTRFS_WITH_PARENT; out: - dout("encode_snapfh %llx.%llx ret=%d\n", ceph_vinop(inode), ret); + doutc(cl, "%p %llx.%llx ret=%d\n", inode, ceph_vinop(inode), ret); return ret; } static int ceph_encode_fh(struct inode *inode, u32 *rawfh, int *max_len, struct inode *parent_inode) { + struct ceph_client *cl = ceph_inode_to_client(inode); static const int handle_length = sizeof(struct ceph_nfs_fh) >> 2; static const int connected_handle_length = @@ -104,15 +107,15 @@ static int ceph_encode_fh(struct inode *inode, u32 *rawfh, int *max_len, if (parent_inode) { struct ceph_nfs_confh *cfh = (void *)rawfh; - dout("encode_fh %llx with parent %llx\n", - ceph_ino(inode), ceph_ino(parent_inode)); + doutc(cl, "%p %llx.%llx with parent %p %llx.%llx\n", inode, + ceph_vinop(inode), parent_inode, ceph_vinop(parent_inode)); cfh->ino = ceph_ino(inode); cfh->parent_ino = ceph_ino(parent_inode); *max_len = connected_handle_length; type = FILEID_INO32_GEN_PARENT; } else { struct ceph_nfs_fh *fh = (void *)rawfh; - dout("encode_fh %llx\n", ceph_ino(inode)); + doutc(cl, "%p %llx.%llx\n", inode, ceph_vinop(inode)); fh->ino = ceph_ino(inode); *max_len = handle_length; type = FILEID_INO32_GEN; @@ -122,7 +125,7 @@ static int ceph_encode_fh(struct inode *inode, u32 *rawfh, int *max_len, static struct inode *__lookup_inode(struct super_block *sb, u64 ino) { - struct ceph_mds_client *mdsc = ceph_sb_to_client(sb)->mdsc; + struct ceph_mds_client *mdsc = ceph_sb_to_fs_client(sb)->mdsc; struct inode *inode; struct ceph_vino vino; int err; @@ -204,7 +207,8 @@ static struct dentry *__snapfh_to_dentry(struct super_block *sb, struct ceph_nfs_snapfh *sfh, bool want_parent) { - struct ceph_mds_client *mdsc = ceph_sb_to_client(sb)->mdsc; + struct ceph_mds_client *mdsc = ceph_sb_to_fs_client(sb)->mdsc; + struct ceph_client *cl = mdsc->fsc->client; struct ceph_mds_request *req; struct inode *inode; struct ceph_vino vino; @@ -277,11 +281,10 @@ static struct dentry *__snapfh_to_dentry(struct super_block *sb, ceph_mdsc_put_request(req); if (want_parent) { - dout("snapfh_to_parent %llx.%llx\n err=%d\n", - vino.ino, vino.snap, err); + doutc(cl, "%llx.%llx\n err=%d\n", vino.ino, vino.snap, err); } else { - dout("snapfh_to_dentry %llx.%llx parent %llx hash %x err=%d", - vino.ino, vino.snap, sfh->parent_ino, sfh->hash, err); + doutc(cl, "%llx.%llx parent %llx hash %x err=%d", vino.ino, + vino.snap, sfh->parent_ino, sfh->hash, err); } if (IS_ERR(inode)) return ERR_CAST(inode); @@ -296,6 +299,7 @@ static struct dentry *ceph_fh_to_dentry(struct super_block *sb, struct fid *fid, int fh_len, int fh_type) { + struct ceph_fs_client *fsc = ceph_sb_to_fs_client(sb); struct ceph_nfs_fh *fh = (void *)fid->raw; if (fh_type == FILEID_BTRFS_WITH_PARENT) { @@ -309,14 +313,14 @@ static struct dentry *ceph_fh_to_dentry(struct super_block *sb, if (fh_len < sizeof(*fh) / 4) return NULL; - dout("fh_to_dentry %llx\n", fh->ino); + doutc(fsc->client, "%llx\n", fh->ino); return __fh_to_dentry(sb, fh->ino); } static struct dentry *__get_parent(struct super_block *sb, struct dentry *child, u64 ino) { - struct ceph_mds_client *mdsc = ceph_sb_to_client(sb)->mdsc; + struct ceph_mds_client *mdsc = ceph_sb_to_fs_client(sb)->mdsc; struct ceph_mds_request *req; struct inode *inode; int mask; @@ -362,6 +366,7 @@ static struct dentry *__get_parent(struct super_block *sb, static struct dentry *ceph_get_parent(struct dentry *child) { struct inode *inode = d_inode(child); + struct ceph_client *cl = ceph_inode_to_client(inode); struct dentry *dn; if (ceph_snap(inode) != CEPH_NOSNAP) { @@ -401,8 +406,8 @@ static struct dentry *ceph_get_parent(struct dentry *child) dn = __get_parent(child->d_sb, child, 0); } out: - dout("get_parent %p ino %llx.%llx err=%ld\n", - child, ceph_vinop(inode), (long)PTR_ERR_OR_ZERO(dn)); + doutc(cl, "child %p %p %llx.%llx err=%ld\n", child, inode, + ceph_vinop(inode), (long)PTR_ERR_OR_ZERO(dn)); return dn; } @@ -413,6 +418,7 @@ static struct dentry *ceph_fh_to_parent(struct super_block *sb, struct fid *fid, int fh_len, int fh_type) { + struct ceph_fs_client *fsc = ceph_sb_to_fs_client(sb); struct ceph_nfs_confh *cfh = (void *)fid->raw; struct dentry *dentry; @@ -426,7 +432,7 @@ static struct dentry *ceph_fh_to_parent(struct super_block *sb, if (fh_len < sizeof(*cfh) / 4) return NULL; - dout("fh_to_parent %llx\n", cfh->parent_ino); + doutc(fsc->client, "%llx\n", cfh->parent_ino); dentry = __get_parent(sb, NULL, cfh->ino); if (unlikely(dentry == ERR_PTR(-ENOENT))) dentry = __fh_to_dentry(sb, cfh->parent_ino); @@ -438,7 +444,7 @@ static int __get_snap_name(struct dentry *parent, char *name, { struct inode *inode = d_inode(child); struct inode *dir = d_inode(parent); - struct ceph_fs_client *fsc = ceph_inode_to_client(inode); + struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode); struct ceph_mds_request *req = NULL; char *last_name = NULL; unsigned next_offset = 2; @@ -525,8 +531,8 @@ out: if (req) ceph_mdsc_put_request(req); kfree(last_name); - dout("get_snap_name %p ino %llx.%llx err=%d\n", - child, ceph_vinop(inode), err); + doutc(fsc->client, "child dentry %p %p %llx.%llx err=%d\n", child, + inode, ceph_vinop(inode), err); return err; } @@ -535,42 +541,61 @@ static int ceph_get_name(struct dentry *parent, char *name, { struct ceph_mds_client *mdsc; struct ceph_mds_request *req; + struct inode *dir = d_inode(parent); struct inode *inode = d_inode(child); + struct ceph_mds_reply_info_parsed *rinfo; int err; if (ceph_snap(inode) != CEPH_NOSNAP) return __get_snap_name(parent, name, child); - mdsc = ceph_inode_to_client(inode)->mdsc; + mdsc = ceph_inode_to_fs_client(inode)->mdsc; req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_LOOKUPNAME, USE_ANY_MDS); if (IS_ERR(req)) return PTR_ERR(req); - inode_lock(d_inode(parent)); - + inode_lock(dir); req->r_inode = inode; ihold(inode); req->r_ino2 = ceph_vino(d_inode(parent)); - req->r_parent = d_inode(parent); - ihold(req->r_parent); + req->r_parent = dir; + ihold(dir); set_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags); req->r_num_caps = 2; err = ceph_mdsc_do_request(mdsc, NULL, req); + inode_unlock(dir); - inode_unlock(d_inode(parent)); + if (err) + goto out; - if (!err) { - struct ceph_mds_reply_info_parsed *rinfo = &req->r_reply_info; + rinfo = &req->r_reply_info; + if (!IS_ENCRYPTED(dir)) { memcpy(name, rinfo->dname, rinfo->dname_len); name[rinfo->dname_len] = 0; - dout("get_name %p ino %llx.%llx name %s\n", - child, ceph_vinop(inode), name); } else { - dout("get_name %p ino %llx.%llx err %d\n", - child, ceph_vinop(inode), err); - } + struct fscrypt_str oname = FSTR_INIT(NULL, 0); + struct ceph_fname fname = { .dir = dir, + .name = rinfo->dname, + .ctext = rinfo->altname, + .name_len = rinfo->dname_len, + .ctext_len = rinfo->altname_len }; + + err = ceph_fname_alloc_buffer(dir, &oname); + if (err < 0) + goto out; + err = ceph_fname_to_usr(&fname, NULL, &oname, NULL); + if (!err) { + memcpy(name, oname.name, oname.len); + name[oname.len] = 0; + } + ceph_fname_free_buffer(dir, &oname); + } +out: + doutc(mdsc->fsc->client, "child dentry %p %p %llx.%llx err %d %s%s\n", + child, inode, ceph_vinop(inode), err, err ? "" : "name ", + err ? "" : name); ceph_mdsc_put_request(req); return err; } diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 63efe5389783..3b5aae29e944 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -19,8 +19,9 @@ #include "io.h" #include "metric.h" -static __le32 ceph_flags_sys2wire(u32 flags) +static __le32 ceph_flags_sys2wire(struct ceph_mds_client *mdsc, u32 flags) { + struct ceph_client *cl = mdsc->fsc->client; u32 wire_flags = 0; switch (flags & O_ACCMODE) { @@ -48,7 +49,7 @@ static __le32 ceph_flags_sys2wire(u32 flags) #undef ceph_sys2wire if (flags) - dout("unused open flags: %x\n", flags); + doutc(cl, "unused open flags: %x\n", flags); return cpu_to_le32(wire_flags); } @@ -189,7 +190,7 @@ prepare_open_request(struct super_block *sb, int flags, int create_mode) if (IS_ERR(req)) goto out; req->r_fmode = ceph_flags_to_mode(flags); - req->r_args.open.flags = ceph_flags_sys2wire(flags); + req->r_args.open.flags = ceph_flags_sys2wire(mdsc, flags); req->r_args.open.mode = cpu_to_le32(create_mode); out: return req; @@ -200,12 +201,13 @@ static int ceph_init_file_info(struct inode *inode, struct file *file, { struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_mount_options *opt = - ceph_inode_to_client(&ci->netfs.inode)->mount_options; + ceph_inode_to_fs_client(&ci->netfs.inode)->mount_options; + struct ceph_client *cl = ceph_inode_to_client(inode); struct ceph_file_info *fi; int ret; - dout("%s %p %p 0%o (%s)\n", __func__, inode, file, - inode->i_mode, isdir ? "dir" : "regular"); + doutc(cl, "%p %llx.%llx %p 0%o (%s)\n", inode, ceph_vinop(inode), + file, inode->i_mode, isdir ? "dir" : "regular"); BUG_ON(inode->i_fop->release != ceph_release); if (isdir) { @@ -234,7 +236,7 @@ static int ceph_init_file_info(struct inode *inode, struct file *file, spin_lock_init(&fi->rw_contexts_lock); INIT_LIST_HEAD(&fi->rw_contexts); - fi->filp_gen = READ_ONCE(ceph_inode_to_client(inode)->filp_gen); + fi->filp_gen = READ_ONCE(ceph_inode_to_fs_client(inode)->filp_gen); if ((file->f_mode & FMODE_WRITE) && ceph_has_inline_data(ci)) { ret = ceph_uninline_data(file); @@ -259,6 +261,7 @@ error: */ static int ceph_init_file(struct inode *inode, struct file *file, int fmode) { + struct ceph_client *cl = ceph_inode_to_client(inode); int ret = 0; switch (inode->i_mode & S_IFMT) { @@ -271,13 +274,13 @@ static int ceph_init_file(struct inode *inode, struct file *file, int fmode) break; case S_IFLNK: - dout("init_file %p %p 0%o (symlink)\n", inode, file, - inode->i_mode); + doutc(cl, "%p %llx.%llx %p 0%o (symlink)\n", inode, + ceph_vinop(inode), file, inode->i_mode); break; default: - dout("init_file %p %p 0%o (special)\n", inode, file, - inode->i_mode); + doutc(cl, "%p %llx.%llx %p 0%o (special)\n", inode, + ceph_vinop(inode), file, inode->i_mode); /* * we need to drop the open ref now, since we don't * have .release set to ceph_release. @@ -296,6 +299,7 @@ static int ceph_init_file(struct inode *inode, struct file *file, int fmode) int ceph_renew_caps(struct inode *inode, int fmode) { struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb); + struct ceph_client *cl = mdsc->fsc->client; struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_mds_request *req; int err, flags, wanted; @@ -307,8 +311,9 @@ int ceph_renew_caps(struct inode *inode, int fmode) (!(wanted & CEPH_CAP_ANY_WR) || ci->i_auth_cap)) { int issued = __ceph_caps_issued(ci, NULL); spin_unlock(&ci->i_ceph_lock); - dout("renew caps %p want %s issued %s updating mds_wanted\n", - inode, ceph_cap_string(wanted), ceph_cap_string(issued)); + doutc(cl, "%p %llx.%llx want %s issued %s updating mds_wanted\n", + inode, ceph_vinop(inode), ceph_cap_string(wanted), + ceph_cap_string(issued)); ceph_check_caps(ci, 0); return 0; } @@ -339,7 +344,8 @@ int ceph_renew_caps(struct inode *inode, int fmode) err = ceph_mdsc_do_request(mdsc, NULL, req); ceph_mdsc_put_request(req); out: - dout("renew caps %p open result=%d\n", inode, err); + doutc(cl, "%p %llx.%llx open result=%d\n", inode, ceph_vinop(inode), + err); return err < 0 ? err : 0; } @@ -352,7 +358,8 @@ out: int ceph_open(struct inode *inode, struct file *file) { struct ceph_inode_info *ci = ceph_inode(inode); - struct ceph_fs_client *fsc = ceph_sb_to_client(inode->i_sb); + struct ceph_fs_client *fsc = ceph_sb_to_fs_client(inode->i_sb); + struct ceph_client *cl = fsc->client; struct ceph_mds_client *mdsc = fsc->mdsc; struct ceph_mds_request *req; struct ceph_file_info *fi = file->private_data; @@ -360,17 +367,22 @@ int ceph_open(struct inode *inode, struct file *file) int flags, fmode, wanted; if (fi) { - dout("open file %p is already opened\n", file); + doutc(cl, "file %p is already opened\n", file); return 0; } /* filter out O_CREAT|O_EXCL; vfs did that already. yuck. */ flags = file->f_flags & ~(O_CREAT|O_EXCL); - if (S_ISDIR(inode->i_mode)) + if (S_ISDIR(inode->i_mode)) { flags = O_DIRECTORY; /* mds likes to know */ + } else if (S_ISREG(inode->i_mode)) { + err = fscrypt_file_open(inode, file); + if (err) + return err; + } - dout("open inode %p ino %llx.%llx file %p flags %d (%d)\n", inode, - ceph_vinop(inode), file, flags, file->f_flags); + doutc(cl, "%p %llx.%llx file %p flags %d (%d)\n", inode, + ceph_vinop(inode), file, flags, file->f_flags); fmode = ceph_flags_to_mode(flags); wanted = ceph_caps_for_mode(fmode); @@ -394,9 +406,9 @@ int ceph_open(struct inode *inode, struct file *file) int mds_wanted = __ceph_caps_mds_wanted(ci, true); int issued = __ceph_caps_issued(ci, NULL); - dout("open %p fmode %d want %s issued %s using existing\n", - inode, fmode, ceph_cap_string(wanted), - ceph_cap_string(issued)); + doutc(cl, "open %p fmode %d want %s issued %s using existing\n", + inode, fmode, ceph_cap_string(wanted), + ceph_cap_string(issued)); __ceph_touch_fmode(ci, mdsc, fmode); spin_unlock(&ci->i_ceph_lock); @@ -416,7 +428,7 @@ int ceph_open(struct inode *inode, struct file *file) spin_unlock(&ci->i_ceph_lock); - dout("open fmode %d wants %s\n", fmode, ceph_cap_string(wanted)); + doutc(cl, "open fmode %d wants %s\n", fmode, ceph_cap_string(wanted)); req = prepare_open_request(inode->i_sb, flags, 0); if (IS_ERR(req)) { err = PTR_ERR(req); @@ -430,7 +442,7 @@ int ceph_open(struct inode *inode, struct file *file) if (!err) err = ceph_init_file(inode, file, req->r_fmode); ceph_mdsc_put_request(req); - dout("open result=%d on %llx.%llx\n", err, ceph_vinop(inode)); + doutc(cl, "open result=%d on %llx.%llx\n", err, ceph_vinop(inode)); out: return err; } @@ -510,6 +522,7 @@ no_async: static void restore_deleg_ino(struct inode *dir, u64 ino) { + struct ceph_client *cl = ceph_inode_to_client(dir); struct ceph_inode_info *ci = ceph_inode(dir); struct ceph_mds_session *s = NULL; @@ -520,7 +533,8 @@ static void restore_deleg_ino(struct inode *dir, u64 ino) if (s) { int err = ceph_restore_deleg_ino(s, ino); if (err) - pr_warn("ceph: unable to restore delegated ino 0x%llx to session: %d\n", + pr_warn_client(cl, + "unable to restore delegated ino 0x%llx to session: %d\n", ino, err); ceph_put_mds_session(s); } @@ -552,6 +566,7 @@ static void wake_async_create_waiters(struct inode *inode, static void ceph_async_create_cb(struct ceph_mds_client *mdsc, struct ceph_mds_request *req) { + struct ceph_client *cl = mdsc->fsc->client; struct dentry *dentry = req->r_dentry; struct inode *dinode = d_inode(dentry); struct inode *tinode = req->r_target_inode; @@ -569,10 +584,11 @@ static void ceph_async_create_cb(struct ceph_mds_client *mdsc, if (result) { int pathlen = 0; u64 base = 0; - char *path = ceph_mdsc_build_path(req->r_dentry, &pathlen, + char *path = ceph_mdsc_build_path(mdsc, req->r_dentry, &pathlen, &base, 0); - pr_warn("async create failure path=(%llx)%s result=%d!\n", + pr_warn_client(cl, + "async create failure path=(%llx)%s result=%d!\n", base, IS_ERR(path) ? "<<bad>>" : path, result); ceph_mdsc_free_path(path, pathlen); @@ -591,20 +607,22 @@ static void ceph_async_create_cb(struct ceph_mds_client *mdsc, u64 ino = ceph_vino(tinode).ino; if (req->r_deleg_ino != ino) - pr_warn("%s: inode number mismatch! err=%d deleg_ino=0x%llx target=0x%llx\n", - __func__, req->r_err, req->r_deleg_ino, ino); + pr_warn_client(cl, + "inode number mismatch! err=%d deleg_ino=0x%llx target=0x%llx\n", + req->r_err, req->r_deleg_ino, ino); mapping_set_error(tinode->i_mapping, result); wake_async_create_waiters(tinode, req->r_session); } else if (!result) { - pr_warn("%s: no req->r_target_inode for 0x%llx\n", __func__, - req->r_deleg_ino); + pr_warn_client(cl, "no req->r_target_inode for 0x%llx\n", + req->r_deleg_ino); } out: ceph_mdsc_release_dir_caps(req); } -static int ceph_finish_async_create(struct inode *dir, struct dentry *dentry, +static int ceph_finish_async_create(struct inode *dir, struct inode *inode, + struct dentry *dentry, struct file *file, umode_t mode, struct ceph_mds_request *req, struct ceph_acl_sec_ctx *as_ctx, @@ -616,19 +634,15 @@ static int ceph_finish_async_create(struct inode *dir, struct dentry *dentry, struct ceph_mds_reply_info_in iinfo = { .in = &in }; struct ceph_inode_info *ci = ceph_inode(dir); struct ceph_dentry_info *di = ceph_dentry(dentry); - struct inode *inode; struct timespec64 now; struct ceph_string *pool_ns; struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(dir->i_sb); + struct ceph_client *cl = mdsc->fsc->client; struct ceph_vino vino = { .ino = req->r_deleg_ino, .snap = CEPH_NOSNAP }; ktime_get_real_ts64(&now); - inode = ceph_get_inode(dentry->d_sb, vino); - if (IS_ERR(inode)) - return PTR_ERR(inode); - iinfo.inline_version = CEPH_INLINE_NONE; iinfo.change_attr = 1; ceph_encode_timespec64(&iinfo.btime, &now); @@ -654,7 +668,9 @@ static int ceph_finish_async_create(struct inode *dir, struct dentry *dentry, in.truncate_seq = cpu_to_le32(1); in.truncate_size = cpu_to_le64(-1ULL); in.xattr_version = cpu_to_le64(1); - in.uid = cpu_to_le32(from_kuid(&init_user_ns, current_fsuid())); + in.uid = cpu_to_le32(from_kuid(&init_user_ns, + mapped_fsuid(req->r_mnt_idmap, + &init_user_ns))); if (dir->i_mode & S_ISGID) { in.gid = cpu_to_le32(from_kgid(&init_user_ns, dir->i_gid)); @@ -662,7 +678,9 @@ static int ceph_finish_async_create(struct inode *dir, struct dentry *dentry, if (S_ISDIR(mode)) mode |= S_ISGID; } else { - in.gid = cpu_to_le32(from_kgid(&init_user_ns, current_fsgid())); + in.gid = cpu_to_le32(from_kgid(&init_user_ns, + mapped_fsgid(req->r_mnt_idmap, + &init_user_ns))); } in.mode = cpu_to_le32((u32)mode); @@ -682,17 +700,16 @@ static int ceph_finish_async_create(struct inode *dir, struct dentry *dentry, req->r_fmode, NULL); up_read(&mdsc->snap_rwsem); if (ret) { - dout("%s failed to fill inode: %d\n", __func__, ret); + doutc(cl, "failed to fill inode: %d\n", ret); ceph_dir_clear_complete(dir); if (!d_unhashed(dentry)) d_drop(dentry); - if (inode->i_state & I_NEW) - discard_new_inode(inode); + discard_new_inode(inode); } else { struct dentry *dn; - dout("%s d_adding new inode 0x%llx to 0x%llx/%s\n", __func__, - vino.ino, ceph_ino(dir), dentry->d_name.name); + doutc(cl, "d_adding new inode 0x%llx to 0x%llx/%s\n", + vino.ino, ceph_ino(dir), dentry->d_name.name); ceph_dir_clear_ordered(dir); ceph_init_inode_acls(inode, as_ctx); if (inode->i_state & I_NEW) { @@ -730,18 +747,21 @@ static int ceph_finish_async_create(struct inode *dir, struct dentry *dentry, int ceph_atomic_open(struct inode *dir, struct dentry *dentry, struct file *file, unsigned flags, umode_t mode) { - struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb); + struct mnt_idmap *idmap = file_mnt_idmap(file); + struct ceph_fs_client *fsc = ceph_sb_to_fs_client(dir->i_sb); + struct ceph_client *cl = fsc->client; struct ceph_mds_client *mdsc = fsc->mdsc; struct ceph_mds_request *req; + struct inode *new_inode = NULL; struct dentry *dn; struct ceph_acl_sec_ctx as_ctx = {}; bool try_async = ceph_test_mount_opt(fsc, ASYNC_DIROPS); int mask; int err; - dout("atomic_open %p dentry %p '%pd' %s flags %d mode 0%o\n", - dir, dentry, dentry, - d_unhashed(dentry) ? "unhashed" : "hashed", flags, mode); + doutc(cl, "%p %llx.%llx dentry %p '%pd' %s flags %d mode 0%o\n", + dir, ceph_vinop(dir), dentry, dentry, + d_unhashed(dentry) ? "unhashed" : "hashed", flags, mode); if (dentry->d_name.len > NAME_MAX) return -ENAMETOOLONG; @@ -755,15 +775,16 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry, */ flags &= ~O_TRUNC; +retry: if (flags & O_CREAT) { if (ceph_quota_is_max_files_exceeded(dir)) return -EDQUOT; - err = ceph_pre_init_acls(dir, &mode, &as_ctx); - if (err < 0) - return err; - err = ceph_security_init_secctx(dentry, mode, &as_ctx); - if (err < 0) + + new_inode = ceph_new_inode(dir, dentry, &mode, &as_ctx); + if (IS_ERR(new_inode)) { + err = PTR_ERR(new_inode); goto out_ctx; + } /* Async create can't handle more than a page of xattrs */ if (as_ctx.pagelist && !list_is_singular(&as_ctx.pagelist->head)) @@ -772,7 +793,7 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry, /* If it's not being looked up, it's negative */ return -ENOENT; } -retry: + /* do the open */ req = prepare_open_request(dir->i_sb, flags, mode); if (IS_ERR(req)) { @@ -786,7 +807,15 @@ retry: mask |= CEPH_CAP_XATTR_SHARED; req->r_args.open.mask = cpu_to_le32(mask); req->r_parent = dir; + if (req->r_op == CEPH_MDS_OP_CREATE) + req->r_mnt_idmap = mnt_idmap_get(idmap); ihold(dir); + if (IS_ENCRYPTED(dir)) { + set_bit(CEPH_MDS_R_FSCRYPT_FILE, &req->r_req_flags); + err = fscrypt_prepare_lookup_partial(dir, dentry); + if (err < 0) + goto out_req; + } if (flags & O_CREAT) { struct ceph_file_layout lo; @@ -794,32 +823,47 @@ retry: req->r_dentry_drop = CEPH_CAP_FILE_SHARED | CEPH_CAP_AUTH_EXCL | CEPH_CAP_XATTR_EXCL; req->r_dentry_unless = CEPH_CAP_FILE_EXCL; - if (as_ctx.pagelist) { - req->r_pagelist = as_ctx.pagelist; - as_ctx.pagelist = NULL; - } - if (try_async && - (req->r_dir_caps = - try_prep_async_create(dir, dentry, &lo, - &req->r_deleg_ino))) { + + ceph_as_ctx_to_req(req, &as_ctx); + + if (try_async && (req->r_dir_caps = + try_prep_async_create(dir, dentry, &lo, + &req->r_deleg_ino))) { + struct ceph_vino vino = { .ino = req->r_deleg_ino, + .snap = CEPH_NOSNAP }; struct ceph_dentry_info *di = ceph_dentry(dentry); set_bit(CEPH_MDS_R_ASYNC, &req->r_req_flags); req->r_args.open.flags |= cpu_to_le32(CEPH_O_EXCL); req->r_callback = ceph_async_create_cb; + /* Hash inode before RPC */ + new_inode = ceph_get_inode(dir->i_sb, vino, new_inode); + if (IS_ERR(new_inode)) { + err = PTR_ERR(new_inode); + new_inode = NULL; + goto out_req; + } + WARN_ON_ONCE(!(new_inode->i_state & I_NEW)); + spin_lock(&dentry->d_lock); di->flags |= CEPH_DENTRY_ASYNC_CREATE; spin_unlock(&dentry->d_lock); err = ceph_mdsc_submit_request(mdsc, dir, req); if (!err) { - err = ceph_finish_async_create(dir, dentry, - file, mode, req, - &as_ctx, &lo); + err = ceph_finish_async_create(dir, new_inode, + dentry, file, + mode, req, + &as_ctx, &lo); + new_inode = NULL; } else if (err == -EJUKEBOX) { restore_deleg_ino(dir, req->r_deleg_ino); ceph_mdsc_put_request(req); + discard_new_inode(new_inode); + ceph_release_acl_sec_ctx(&as_ctx); + memset(&as_ctx, 0, sizeof(as_ctx)); + new_inode = NULL; try_async = false; ceph_put_string(rcu_dereference_raw(lo.pool_ns)); goto retry; @@ -830,6 +874,8 @@ retry: } set_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags); + req->r_new_inode = new_inode; + new_inode = NULL; err = ceph_mdsc_do_request(mdsc, (flags & O_CREAT) ? dir : NULL, req); if (err == -ENOENT) { dentry = ceph_handle_snapdir(req, dentry); @@ -855,10 +901,18 @@ retry: goto out_req; if (dn || d_really_is_negative(dentry) || d_is_symlink(dentry)) { /* make vfs retry on splice, ENOENT, or symlink */ - dout("atomic_open finish_no_open on dn %p\n", dn); + doutc(cl, "finish_no_open on dn %p\n", dn); err = finish_no_open(file, dn); } else { - dout("atomic_open finish_open on dn %p\n", dn); + if (IS_ENCRYPTED(dir) && + !fscrypt_has_permitted_context(dir, d_inode(dentry))) { + pr_warn_client(cl, + "Inconsistent encryption context (parent %llx:%llx child %llx:%llx)\n", + ceph_vinop(dir), ceph_vinop(d_inode(dentry))); + goto out_req; + } + + doutc(cl, "finish_open on dn %p\n", dn); if (req->r_op == CEPH_MDS_OP_CREATE && req->r_reply_info.has_create_ino) { struct inode *newino = d_inode(dentry); @@ -870,19 +924,22 @@ retry: } out_req: ceph_mdsc_put_request(req); + iput(new_inode); out_ctx: ceph_release_acl_sec_ctx(&as_ctx); - dout("atomic_open result=%d\n", err); + doutc(cl, "result=%d\n", err); return err; } int ceph_release(struct inode *inode, struct file *file) { + struct ceph_client *cl = ceph_inode_to_client(inode); struct ceph_inode_info *ci = ceph_inode(inode); if (S_ISDIR(inode->i_mode)) { struct ceph_dir_file_info *dfi = file->private_data; - dout("release inode %p dir file %p\n", inode, file); + doutc(cl, "%p %llx.%llx dir file %p\n", inode, + ceph_vinop(inode), file); WARN_ON(!list_empty(&dfi->file_info.rw_contexts)); ceph_put_fmode(ci, dfi->file_info.fmode, 1); @@ -894,7 +951,8 @@ int ceph_release(struct inode *inode, struct file *file) kmem_cache_free(ceph_dir_file_cachep, dfi); } else { struct ceph_file_info *fi = file->private_data; - dout("release inode %p regular file %p\n", inode, file); + doutc(cl, "%p %llx.%llx regular file %p\n", inode, + ceph_vinop(inode), file); WARN_ON(!list_empty(&fi->rw_contexts)); ceph_fscache_unuse_cookie(inode, file->f_mode & FMODE_WRITE); @@ -924,21 +982,26 @@ enum { * If we get a short result from the OSD, check against i_size; we need to * only return a short read to the caller if we hit EOF. */ -static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *to, - int *retry_op) +ssize_t __ceph_sync_read(struct inode *inode, loff_t *ki_pos, + struct iov_iter *to, int *retry_op, + u64 *last_objver) { - struct file *file = iocb->ki_filp; - struct inode *inode = file_inode(file); struct ceph_inode_info *ci = ceph_inode(inode); - struct ceph_fs_client *fsc = ceph_inode_to_client(inode); + struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode); + struct ceph_client *cl = fsc->client; struct ceph_osd_client *osdc = &fsc->client->osdc; ssize_t ret; - u64 off = iocb->ki_pos; + u64 off = *ki_pos; u64 len = iov_iter_count(to); u64 i_size = i_size_read(inode); + bool sparse = IS_ENCRYPTED(inode) || ceph_test_mount_opt(fsc, SPARSEREAD); + u64 objver = 0; - dout("sync_read on file %p %llu~%u %s\n", file, off, (unsigned)len, - (file->f_flags & O_DIRECT) ? "O_DIRECT" : ""); + doutc(cl, "on inode %p %llx.%llx %llx~%llx\n", inode, + ceph_vinop(inode), *ki_pos, len); + + if (ceph_inode_is_shutdown(inode)) + return -EIO; if (!len) return 0; @@ -962,10 +1025,21 @@ static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *to, bool more; int idx; size_t left; + struct ceph_osd_req_op *op; + u64 read_off = off; + u64 read_len = len; + + /* determine new offset/length if encrypted */ + ceph_fscrypt_adjust_off_and_len(inode, &read_off, &read_len); + + doutc(cl, "orig %llu~%llu reading %llu~%llu", off, len, + read_off, read_len); req = ceph_osdc_new_request(osdc, &ci->i_layout, - ci->i_vino, off, &len, 0, 1, - CEPH_OSD_OP_READ, CEPH_OSD_FLAG_READ, + ci->i_vino, read_off, &read_len, 0, 1, + sparse ? CEPH_OSD_OP_SPARSE_READ : + CEPH_OSD_OP_READ, + CEPH_OSD_FLAG_READ, NULL, ci->i_truncate_seq, ci->i_truncate_size, false); if (IS_ERR(req)) { @@ -973,10 +1047,13 @@ static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *to, break; } + /* adjust len downward if the request truncated the len */ + if (off + len > read_off + read_len) + len = read_off + read_len - off; more = len < iov_iter_count(to); - num_pages = calc_pages_for(off, len); - page_off = off & ~PAGE_MASK; + num_pages = calc_pages_for(read_off, read_len); + page_off = offset_in_page(off); pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL); if (IS_ERR(pages)) { ceph_osdc_put_request(req); @@ -984,29 +1061,75 @@ static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *to, break; } - osd_req_op_extent_osd_data_pages(req, 0, pages, len, page_off, + osd_req_op_extent_osd_data_pages(req, 0, pages, read_len, + offset_in_page(read_off), false, false); + + op = &req->r_ops[0]; + if (sparse) { + ret = ceph_alloc_sparse_ext_map(op); + if (ret) { + ceph_osdc_put_request(req); + break; + } + } + ceph_osdc_start_request(osdc, req); ret = ceph_osdc_wait_request(osdc, req); ceph_update_read_metrics(&fsc->mdsc->metric, req->r_start_latency, req->r_end_latency, - len, ret); + read_len, ret); - ceph_osdc_put_request(req); + if (ret > 0) + objver = req->r_version; i_size = i_size_read(inode); - dout("sync_read %llu~%llu got %zd i_size %llu%s\n", - off, len, ret, i_size, (more ? " MORE" : "")); + doutc(cl, "%llu~%llu got %zd i_size %llu%s\n", off, len, + ret, i_size, (more ? " MORE" : "")); - if (ret == -ENOENT) + /* Fix it to go to end of extent map */ + if (sparse && ret >= 0) + ret = ceph_sparse_ext_map_end(op); + else if (ret == -ENOENT) ret = 0; + + if (ret > 0 && IS_ENCRYPTED(inode)) { + int fret; + + fret = ceph_fscrypt_decrypt_extents(inode, pages, + read_off, op->extent.sparse_ext, + op->extent.sparse_ext_cnt); + if (fret < 0) { + ret = fret; + ceph_osdc_put_request(req); + break; + } + + /* account for any partial block at the beginning */ + fret -= (off - read_off); + + /* + * Short read after big offset adjustment? + * Nothing is usable, just call it a zero + * len read. + */ + fret = max(fret, 0); + + /* account for partial block at the end */ + ret = min_t(ssize_t, fret, len); + } + + ceph_osdc_put_request(req); + + /* Short read but not EOF? Zero out the remainder. */ if (ret >= 0 && ret < len && (off + ret < i_size)) { int zlen = min(len - ret, i_size - off - ret); int zoff = page_off + ret; - dout("sync_read zero gap %llu~%llu\n", - off + ret, off + ret + zlen); + + doutc(cl, "zero gap %llu~%llu\n", off + ret, + off + ret + zlen); ceph_zero_page_vector_range(zoff, zlen, pages); ret += zlen; } @@ -1014,15 +1137,16 @@ static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *to, idx = 0; left = ret > 0 ? ret : 0; while (left > 0) { - size_t len, copied; - page_off = off & ~PAGE_MASK; - len = min_t(size_t, left, PAGE_SIZE - page_off); + size_t plen, copied; + + plen = min_t(size_t, left, PAGE_SIZE - page_off); SetPageUptodate(pages[idx]); copied = copy_page_to_iter(pages[idx++], - page_off, len, to); + page_off, plen, to); off += copied; left -= copied; - if (copied < len) { + page_off = 0; + if (copied < plen) { ret = -EFAULT; break; } @@ -1039,21 +1163,39 @@ static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *to, break; } - if (off > iocb->ki_pos) { - if (off >= i_size) { - *retry_op = CHECK_EOF; - ret = i_size - iocb->ki_pos; - iocb->ki_pos = i_size; - } else { - ret = off - iocb->ki_pos; - iocb->ki_pos = off; + if (ret > 0) { + if (off > *ki_pos) { + if (off >= i_size) { + *retry_op = CHECK_EOF; + ret = i_size - *ki_pos; + *ki_pos = i_size; + } else { + ret = off - *ki_pos; + *ki_pos = off; + } } - } - dout("sync_read result %zd retry_op %d\n", ret, *retry_op); + if (last_objver) + *last_objver = objver; + } + doutc(cl, "result %zd retry_op %d\n", ret, *retry_op); return ret; } +static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *to, + int *retry_op) +{ + struct file *file = iocb->ki_filp; + struct inode *inode = file_inode(file); + struct ceph_client *cl = ceph_inode_to_client(inode); + + doutc(cl, "on file %p %llx~%zx %s\n", file, iocb->ki_pos, + iov_iter_count(to), + (file->f_flags & O_DIRECT) ? "O_DIRECT" : ""); + + return __ceph_sync_read(inode, &iocb->ki_pos, to, retry_op, NULL); +} + struct ceph_aio_request { struct kiocb *iocb; size_t total_len; @@ -1077,6 +1219,7 @@ static void ceph_aio_retry_work(struct work_struct *work); static void ceph_aio_complete(struct inode *inode, struct ceph_aio_request *aio_req) { + struct ceph_client *cl = ceph_inode_to_client(inode); struct ceph_inode_info *ci = ceph_inode(inode); int ret; @@ -1090,7 +1233,7 @@ static void ceph_aio_complete(struct inode *inode, if (!ret) ret = aio_req->total_len; - dout("ceph_aio_complete %p rc %d\n", inode, ret); + doutc(cl, "%p %llx.%llx rc %d\n", inode, ceph_vinop(inode), ret); if (ret >= 0 && aio_req->write) { int dirty; @@ -1125,13 +1268,17 @@ static void ceph_aio_complete_req(struct ceph_osd_request *req) struct inode *inode = req->r_inode; struct ceph_aio_request *aio_req = req->r_priv; struct ceph_osd_data *osd_data = osd_req_op_extent_osd_data(req, 0); + struct ceph_osd_req_op *op = &req->r_ops[0]; struct ceph_client_metric *metric = &ceph_sb_to_mdsc(inode->i_sb)->metric; unsigned int len = osd_data->bvec_pos.iter.bi_size; + bool sparse = (op->op == CEPH_OSD_OP_SPARSE_READ); + struct ceph_client *cl = ceph_inode_to_client(inode); BUG_ON(osd_data->type != CEPH_OSD_DATA_TYPE_BVECS); BUG_ON(!osd_data->num_bvecs); - dout("ceph_aio_complete_req %p rc %d bytes %u\n", inode, rc, len); + doutc(cl, "req %p inode %p %llx.%llx, rc %d bytes %u\n", req, + inode, ceph_vinop(inode), rc, len); if (rc == -EOLDSNAPC) { struct ceph_aio_work *aio_work; @@ -1141,12 +1288,14 @@ static void ceph_aio_complete_req(struct ceph_osd_request *req) if (aio_work) { INIT_WORK(&aio_work->work, ceph_aio_retry_work); aio_work->req = req; - queue_work(ceph_inode_to_client(inode)->inode_wq, + queue_work(ceph_inode_to_fs_client(inode)->inode_wq, &aio_work->work); return; } rc = -ENOMEM; } else if (!aio_req->write) { + if (sparse && rc >= 0) + rc = ceph_sparse_ext_map_end(op); if (rc == -ENOENT) rc = 0; if (rc >= 0 && len > rc) { @@ -1269,7 +1418,8 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter, struct file *file = iocb->ki_filp; struct inode *inode = file_inode(file); struct ceph_inode_info *ci = ceph_inode(inode); - struct ceph_fs_client *fsc = ceph_inode_to_client(inode); + struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode); + struct ceph_client *cl = fsc->client; struct ceph_client_metric *metric = &fsc->mdsc->metric; struct ceph_vino vino; struct ceph_osd_request *req; @@ -1283,13 +1433,14 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter, loff_t pos = iocb->ki_pos; bool write = iov_iter_rw(iter) == WRITE; bool should_dirty = !write && user_backed_iter(iter); + bool sparse = ceph_test_mount_opt(fsc, SPARSEREAD); if (write && ceph_snap(file_inode(file)) != CEPH_NOSNAP) return -EROFS; - dout("sync_direct_%s on file %p %lld~%u snapc %p seq %lld\n", - (write ? "write" : "read"), file, pos, (unsigned)count, - snapc, snapc ? snapc->seq : 0); + doutc(cl, "sync_direct_%s on file %p %lld~%u snapc %p seq %lld\n", + (write ? "write" : "read"), file, pos, (unsigned)count, + snapc, snapc ? snapc->seq : 0); if (write) { int ret2; @@ -1300,7 +1451,8 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter, pos >> PAGE_SHIFT, (pos + count - 1) >> PAGE_SHIFT); if (ret2 < 0) - dout("invalidate_inode_pages2_range returned %d\n", ret2); + doutc(cl, "invalidate_inode_pages2_range returned %d\n", + ret2); flags = /* CEPH_OSD_FLAG_ORDERSNAP | */ CEPH_OSD_FLAG_WRITE; } else { @@ -1310,6 +1462,8 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter, while (iov_iter_count(iter) > 0) { u64 size = iov_iter_count(iter); ssize_t len; + struct ceph_osd_req_op *op; + int readop = sparse ? CEPH_OSD_OP_SPARSE_READ : CEPH_OSD_OP_READ; if (write) size = min_t(u64, size, fsc->mount_options->wsize); @@ -1320,8 +1474,7 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter, req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, vino, pos, &size, 0, 1, - write ? CEPH_OSD_OP_WRITE : - CEPH_OSD_OP_READ, + write ? CEPH_OSD_OP_WRITE : readop, flags, snapc, ci->i_truncate_seq, ci->i_truncate_size, @@ -1372,6 +1525,14 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter, } osd_req_op_extent_osd_data_bvecs(req, 0, bvecs, num_pages, len); + op = &req->r_ops[0]; + if (sparse) { + ret = ceph_alloc_sparse_ext_map(op); + if (ret) { + ceph_osdc_put_request(req); + break; + } + } if (aio_req) { aio_req->total_len += len; @@ -1399,8 +1560,11 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter, size = i_size_read(inode); if (!write) { - if (ret == -ENOENT) + if (sparse && ret >= 0) + ret = ceph_sparse_ext_map_end(op); + else if (ret == -ENOENT) ret = 0; + if (ret >= 0 && ret < len && pos + ret < size) { struct iov_iter i; int zlen = min_t(size_t, len - ret, @@ -1480,14 +1644,14 @@ ceph_sync_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos, struct file *file = iocb->ki_filp; struct inode *inode = file_inode(file); struct ceph_inode_info *ci = ceph_inode(inode); - struct ceph_fs_client *fsc = ceph_inode_to_client(inode); - struct ceph_vino vino; + struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode); + struct ceph_client *cl = fsc->client; + struct ceph_osd_client *osdc = &fsc->client->osdc; struct ceph_osd_request *req; struct page **pages; u64 len; int num_pages; int written = 0; - int flags; int ret; bool check_caps = false; struct timespec64 mtime = current_time(inode); @@ -1496,8 +1660,8 @@ ceph_sync_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos, if (ceph_snap(file_inode(file)) != CEPH_NOSNAP) return -EROFS; - dout("sync_write on file %p %lld~%u snapc %p seq %lld\n", - file, pos, (unsigned)count, snapc, snapc->seq); + doutc(cl, "on file %p %lld~%u snapc %p seq %lld\n", file, pos, + (unsigned)count, snapc, snapc->seq); ret = filemap_write_and_wait_range(inode->i_mapping, pos, pos + count - 1); @@ -1505,79 +1669,350 @@ ceph_sync_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos, return ret; ceph_fscache_invalidate(inode, false); - ret = invalidate_inode_pages2_range(inode->i_mapping, - pos >> PAGE_SHIFT, - (pos + count - 1) >> PAGE_SHIFT); - if (ret < 0) - dout("invalidate_inode_pages2_range returned %d\n", ret); - - flags = /* CEPH_OSD_FLAG_ORDERSNAP | */ CEPH_OSD_FLAG_WRITE; while ((len = iov_iter_count(from)) > 0) { size_t left; int n; - - vino = ceph_vino(inode); - req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, - vino, pos, &len, 0, 1, - CEPH_OSD_OP_WRITE, flags, snapc, - ci->i_truncate_seq, - ci->i_truncate_size, - false); - if (IS_ERR(req)) { - ret = PTR_ERR(req); - break; - } + u64 write_pos = pos; + u64 write_len = len; + u64 objnum, objoff; + u32 xlen; + u64 assert_ver = 0; + bool rmw; + bool first, last; + struct iov_iter saved_iter = *from; + size_t off; + + ceph_fscrypt_adjust_off_and_len(inode, &write_pos, &write_len); + + /* clamp the length to the end of first object */ + ceph_calc_file_object_mapping(&ci->i_layout, write_pos, + write_len, &objnum, &objoff, + &xlen); + write_len = xlen; + + /* adjust len downward if it goes beyond current object */ + if (pos + len > write_pos + write_len) + len = write_pos + write_len - pos; /* - * write from beginning of first page, - * regardless of io alignment + * If we had to adjust the length or position to align with a + * crypto block, then we must do a read/modify/write cycle. We + * use a version assertion to redrive the thing if something + * changes in between. */ - num_pages = (len + PAGE_SIZE - 1) >> PAGE_SHIFT; + first = pos != write_pos; + last = (pos + len) != (write_pos + write_len); + rmw = first || last; + doutc(cl, "ino %llx %lld~%llu adjusted %lld~%llu -- %srmw\n", + ci->i_vino.ino, pos, len, write_pos, write_len, + rmw ? "" : "no "); + + /* + * The data is emplaced into the page as it would be if it were + * in an array of pagecache pages. + */ + num_pages = calc_pages_for(write_pos, write_len); pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL); if (IS_ERR(pages)) { ret = PTR_ERR(pages); - goto out; + break; + } + + /* Do we need to preload the pages? */ + if (rmw) { + u64 first_pos = write_pos; + u64 last_pos = (write_pos + write_len) - CEPH_FSCRYPT_BLOCK_SIZE; + u64 read_len = CEPH_FSCRYPT_BLOCK_SIZE; + struct ceph_osd_req_op *op; + + /* We should only need to do this for encrypted inodes */ + WARN_ON_ONCE(!IS_ENCRYPTED(inode)); + + /* No need to do two reads if first and last blocks are same */ + if (first && last_pos == first_pos) + last = false; + + /* + * Allocate a read request for one or two extents, + * depending on how the request was aligned. + */ + req = ceph_osdc_new_request(osdc, &ci->i_layout, + ci->i_vino, first ? first_pos : last_pos, + &read_len, 0, (first && last) ? 2 : 1, + CEPH_OSD_OP_SPARSE_READ, CEPH_OSD_FLAG_READ, + NULL, ci->i_truncate_seq, + ci->i_truncate_size, false); + if (IS_ERR(req)) { + ceph_release_page_vector(pages, num_pages); + ret = PTR_ERR(req); + break; + } + + /* Something is misaligned! */ + if (read_len != CEPH_FSCRYPT_BLOCK_SIZE) { + ceph_osdc_put_request(req); + ceph_release_page_vector(pages, num_pages); + ret = -EIO; + break; + } + + /* Add extent for first block? */ + op = &req->r_ops[0]; + + if (first) { + osd_req_op_extent_osd_data_pages(req, 0, pages, + CEPH_FSCRYPT_BLOCK_SIZE, + offset_in_page(first_pos), + false, false); + /* We only expect a single extent here */ + ret = __ceph_alloc_sparse_ext_map(op, 1); + if (ret) { + ceph_osdc_put_request(req); + ceph_release_page_vector(pages, num_pages); + break; + } + } + + /* Add extent for last block */ + if (last) { + /* Init the other extent if first extent has been used */ + if (first) { + op = &req->r_ops[1]; + osd_req_op_extent_init(req, 1, + CEPH_OSD_OP_SPARSE_READ, + last_pos, CEPH_FSCRYPT_BLOCK_SIZE, + ci->i_truncate_size, + ci->i_truncate_seq); + } + + ret = __ceph_alloc_sparse_ext_map(op, 1); + if (ret) { + ceph_osdc_put_request(req); + ceph_release_page_vector(pages, num_pages); + break; + } + + osd_req_op_extent_osd_data_pages(req, first ? 1 : 0, + &pages[num_pages - 1], + CEPH_FSCRYPT_BLOCK_SIZE, + offset_in_page(last_pos), + false, false); + } + + ceph_osdc_start_request(osdc, req); + ret = ceph_osdc_wait_request(osdc, req); + + /* FIXME: length field is wrong if there are 2 extents */ + ceph_update_read_metrics(&fsc->mdsc->metric, + req->r_start_latency, + req->r_end_latency, + read_len, ret); + + /* Ok if object is not already present */ + if (ret == -ENOENT) { + /* + * If there is no object, then we can't assert + * on its version. Set it to 0, and we'll use an + * exclusive create instead. + */ + ceph_osdc_put_request(req); + ret = 0; + + /* + * zero out the soon-to-be uncopied parts of the + * first and last pages. + */ + if (first) + zero_user_segment(pages[0], 0, + offset_in_page(first_pos)); + if (last) + zero_user_segment(pages[num_pages - 1], + offset_in_page(last_pos), + PAGE_SIZE); + } else { + if (ret < 0) { + ceph_osdc_put_request(req); + ceph_release_page_vector(pages, num_pages); + break; + } + + op = &req->r_ops[0]; + if (op->extent.sparse_ext_cnt == 0) { + if (first) + zero_user_segment(pages[0], 0, + offset_in_page(first_pos)); + else + zero_user_segment(pages[num_pages - 1], + offset_in_page(last_pos), + PAGE_SIZE); + } else if (op->extent.sparse_ext_cnt != 1 || + ceph_sparse_ext_map_end(op) != + CEPH_FSCRYPT_BLOCK_SIZE) { + ret = -EIO; + ceph_osdc_put_request(req); + ceph_release_page_vector(pages, num_pages); + break; + } + + if (first && last) { + op = &req->r_ops[1]; + if (op->extent.sparse_ext_cnt == 0) { + zero_user_segment(pages[num_pages - 1], + offset_in_page(last_pos), + PAGE_SIZE); + } else if (op->extent.sparse_ext_cnt != 1 || + ceph_sparse_ext_map_end(op) != + CEPH_FSCRYPT_BLOCK_SIZE) { + ret = -EIO; + ceph_osdc_put_request(req); + ceph_release_page_vector(pages, num_pages); + break; + } + } + + /* Grab assert version. It must be non-zero. */ + assert_ver = req->r_version; + WARN_ON_ONCE(ret > 0 && assert_ver == 0); + + ceph_osdc_put_request(req); + if (first) { + ret = ceph_fscrypt_decrypt_block_inplace(inode, + pages[0], CEPH_FSCRYPT_BLOCK_SIZE, + offset_in_page(first_pos), + first_pos >> CEPH_FSCRYPT_BLOCK_SHIFT); + if (ret < 0) { + ceph_release_page_vector(pages, num_pages); + break; + } + } + if (last) { + ret = ceph_fscrypt_decrypt_block_inplace(inode, + pages[num_pages - 1], + CEPH_FSCRYPT_BLOCK_SIZE, + offset_in_page(last_pos), + last_pos >> CEPH_FSCRYPT_BLOCK_SHIFT); + if (ret < 0) { + ceph_release_page_vector(pages, num_pages); + break; + } + } + } } left = len; + off = offset_in_page(pos); for (n = 0; n < num_pages; n++) { - size_t plen = min_t(size_t, left, PAGE_SIZE); - ret = copy_page_from_iter(pages[n], 0, plen, from); + size_t plen = min_t(size_t, left, PAGE_SIZE - off); + + /* copy the data */ + ret = copy_page_from_iter(pages[n], off, plen, from); if (ret != plen) { ret = -EFAULT; break; } + off = 0; left -= ret; } - if (ret < 0) { + doutc(cl, "write failed with %d\n", ret); ceph_release_page_vector(pages, num_pages); - goto out; + break; } - req->r_inode = inode; + if (IS_ENCRYPTED(inode)) { + ret = ceph_fscrypt_encrypt_pages(inode, pages, + write_pos, write_len, + GFP_KERNEL); + if (ret < 0) { + doutc(cl, "encryption failed with %d\n", ret); + ceph_release_page_vector(pages, num_pages); + break; + } + } - osd_req_op_extent_osd_data_pages(req, 0, pages, len, 0, - false, true); + req = ceph_osdc_new_request(osdc, &ci->i_layout, + ci->i_vino, write_pos, &write_len, + rmw ? 1 : 0, rmw ? 2 : 1, + CEPH_OSD_OP_WRITE, + CEPH_OSD_FLAG_WRITE, + snapc, ci->i_truncate_seq, + ci->i_truncate_size, false); + if (IS_ERR(req)) { + ret = PTR_ERR(req); + ceph_release_page_vector(pages, num_pages); + break; + } + doutc(cl, "write op %lld~%llu\n", write_pos, write_len); + osd_req_op_extent_osd_data_pages(req, rmw ? 1 : 0, pages, write_len, + offset_in_page(write_pos), false, + true); + req->r_inode = inode; req->r_mtime = mtime; - ceph_osdc_start_request(&fsc->client->osdc, req); - ret = ceph_osdc_wait_request(&fsc->client->osdc, req); + + /* Set up the assertion */ + if (rmw) { + /* + * Set up the assertion. If we don't have a version + * number, then the object doesn't exist yet. Use an + * exclusive create instead of a version assertion in + * that case. + */ + if (assert_ver) { + osd_req_op_init(req, 0, CEPH_OSD_OP_ASSERT_VER, 0); + req->r_ops[0].assert_ver.ver = assert_ver; + } else { + osd_req_op_init(req, 0, CEPH_OSD_OP_CREATE, + CEPH_OSD_OP_FLAG_EXCL); + } + } + + ceph_osdc_start_request(osdc, req); + ret = ceph_osdc_wait_request(osdc, req); ceph_update_write_metrics(&fsc->mdsc->metric, req->r_start_latency, req->r_end_latency, len, ret); -out: ceph_osdc_put_request(req); if (ret != 0) { + doutc(cl, "osd write returned %d\n", ret); + /* Version changed! Must re-do the rmw cycle */ + if ((assert_ver && (ret == -ERANGE || ret == -EOVERFLOW)) || + (!assert_ver && ret == -EEXIST)) { + /* We should only ever see this on a rmw */ + WARN_ON_ONCE(!rmw); + + /* The version should never go backward */ + WARN_ON_ONCE(ret == -EOVERFLOW); + + *from = saved_iter; + + /* FIXME: limit number of times we loop? */ + continue; + } ceph_set_error_write(ci); break; } ceph_clear_error_write(ci); + + /* + * We successfully wrote to a range of the file. Declare + * that region of the pagecache invalid. + */ + ret = invalidate_inode_pages2_range( + inode->i_mapping, + pos >> PAGE_SHIFT, + (pos + len - 1) >> PAGE_SHIFT); + if (ret < 0) { + doutc(cl, "invalidate_inode_pages2_range returned %d\n", + ret); + ret = 0; + } pos += len; written += len; + doutc(cl, "written %d\n", written); if (pos > i_size_read(inode)) { check_caps = ceph_inode_set_size(inode, pos); if (check_caps) @@ -1591,6 +2026,7 @@ out: ret = written; iocb->ki_pos = pos; } + doutc(cl, "returning %d\n", ret); return ret; } @@ -1609,13 +2045,14 @@ static ssize_t ceph_read_iter(struct kiocb *iocb, struct iov_iter *to) struct inode *inode = file_inode(filp); struct ceph_inode_info *ci = ceph_inode(inode); bool direct_lock = iocb->ki_flags & IOCB_DIRECT; + struct ceph_client *cl = ceph_inode_to_client(inode); ssize_t ret; int want = 0, got = 0; int retry_op = 0, read = 0; again: - dout("aio_read %p %llx.%llx %llu~%u trying to get caps on %p\n", - inode, ceph_vinop(inode), iocb->ki_pos, (unsigned)len, inode); + doutc(cl, "%llu~%u trying to get caps on %p %llx.%llx\n", + iocb->ki_pos, (unsigned)len, inode, ceph_vinop(inode)); if (ceph_inode_is_shutdown(inode)) return -ESTALE; @@ -1643,12 +2080,14 @@ again: (iocb->ki_flags & IOCB_DIRECT) || (fi->flags & CEPH_F_SYNC)) { - dout("aio_sync_read %p %llx.%llx %llu~%u got cap refs on %s\n", - inode, ceph_vinop(inode), iocb->ki_pos, (unsigned)len, - ceph_cap_string(got)); + doutc(cl, "sync %p %llx.%llx %llu~%u got cap refs on %s\n", + inode, ceph_vinop(inode), iocb->ki_pos, (unsigned)len, + ceph_cap_string(got)); if (!ceph_has_inline_data(ci)) { - if (!retry_op && (iocb->ki_flags & IOCB_DIRECT)) { + if (!retry_op && + (iocb->ki_flags & IOCB_DIRECT) && + !IS_ENCRYPTED(inode)) { ret = ceph_direct_read_write(iocb, to, NULL, NULL); if (ret >= 0 && ret < len) @@ -1661,16 +2100,16 @@ again: } } else { CEPH_DEFINE_RW_CONTEXT(rw_ctx, got); - dout("aio_read %p %llx.%llx %llu~%u got cap refs on %s\n", - inode, ceph_vinop(inode), iocb->ki_pos, (unsigned)len, - ceph_cap_string(got)); + doutc(cl, "async %p %llx.%llx %llu~%u got cap refs on %s\n", + inode, ceph_vinop(inode), iocb->ki_pos, (unsigned)len, + ceph_cap_string(got)); ceph_add_rw_context(fi, &rw_ctx); ret = generic_file_read_iter(iocb, to); ceph_del_rw_context(fi, &rw_ctx); } - dout("aio_read %p %llx.%llx dropping cap refs on %s = %d\n", - inode, ceph_vinop(inode), ceph_cap_string(got), (int)ret); + doutc(cl, "%p %llx.%llx dropping cap refs on %s = %d\n", + inode, ceph_vinop(inode), ceph_cap_string(got), (int)ret); ceph_put_cap_refs(ci, got); if (direct_lock) @@ -1730,8 +2169,8 @@ again: /* hit EOF or hole? */ if (retry_op == CHECK_EOF && iocb->ki_pos < i_size && ret < len) { - dout("sync_read hit hole, ppos %lld < size %lld" - ", reading more\n", iocb->ki_pos, i_size); + doutc(cl, "hit hole, ppos %lld < size %lld, reading more\n", + iocb->ki_pos, i_size); read += ret; len -= ret; @@ -1825,7 +2264,8 @@ static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from) struct ceph_file_info *fi = file->private_data; struct inode *inode = file_inode(file); struct ceph_inode_info *ci = ceph_inode(inode); - struct ceph_fs_client *fsc = ceph_inode_to_client(inode); + struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode); + struct ceph_client *cl = fsc->client; struct ceph_osd_client *osdc = &fsc->client->osdc; struct ceph_cap_flush *prealloc_cf; ssize_t count, written = 0; @@ -1893,8 +2333,9 @@ retry_snap: if (err) goto out; - dout("aio_write %p %llx.%llx %llu~%zd getting caps. i_size %llu\n", - inode, ceph_vinop(inode), pos, count, i_size_read(inode)); + doutc(cl, "%p %llx.%llx %llu~%zd getting caps. i_size %llu\n", + inode, ceph_vinop(inode), pos, count, + i_size_read(inode)); if (!(fi->flags & CEPH_F_SYNC) && !direct_lock) want |= CEPH_CAP_FILE_BUFFER; if (fi->fmode & CEPH_FILE_MODE_LAZY) @@ -1910,8 +2351,8 @@ retry_snap: inode_inc_iversion_raw(inode); - dout("aio_write %p %llx.%llx %llu~%zd got cap refs on %s\n", - inode, ceph_vinop(inode), pos, count, ceph_cap_string(got)); + doutc(cl, "%p %llx.%llx %llu~%zd got cap refs on %s\n", + inode, ceph_vinop(inode), pos, count, ceph_cap_string(got)); if ((got & (CEPH_CAP_FILE_BUFFER|CEPH_CAP_FILE_LAZYIO)) == 0 || (iocb->ki_flags & IOCB_DIRECT) || (fi->flags & CEPH_F_SYNC) || @@ -1934,7 +2375,7 @@ retry_snap: /* we might need to revert back to that point */ data = *from; - if (iocb->ki_flags & IOCB_DIRECT) + if ((iocb->ki_flags & IOCB_DIRECT) && !IS_ENCRYPTED(inode)) written = ceph_direct_read_write(iocb, &data, snapc, &prealloc_cf); else @@ -1971,14 +2412,14 @@ retry_snap: ceph_check_caps(ci, CHECK_CAPS_FLUSH); } - dout("aio_write %p %llx.%llx %llu~%u dropping cap refs on %s\n", - inode, ceph_vinop(inode), pos, (unsigned)count, - ceph_cap_string(got)); + doutc(cl, "%p %llx.%llx %llu~%u dropping cap refs on %s\n", + inode, ceph_vinop(inode), pos, (unsigned)count, + ceph_cap_string(got)); ceph_put_cap_refs(ci, got); if (written == -EOLDSNAPC) { - dout("aio_write %p %llx.%llx %llu~%u" "got EOLDSNAPC, retrying\n", - inode, ceph_vinop(inode), pos, (unsigned)count); + doutc(cl, "%p %llx.%llx %llu~%u" "got EOLDSNAPC, retrying\n", + inode, ceph_vinop(inode), pos, (unsigned)count); goto retry_snap; } @@ -2059,7 +2500,7 @@ static int ceph_zero_partial_object(struct inode *inode, loff_t offset, loff_t *length) { struct ceph_inode_info *ci = ceph_inode(inode); - struct ceph_fs_client *fsc = ceph_inode_to_client(inode); + struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode); struct ceph_osd_request *req; int ret = 0; loff_t zero = 0; @@ -2086,7 +2527,7 @@ static int ceph_zero_partial_object(struct inode *inode, goto out; } - req->r_mtime = inode->i_mtime; + req->r_mtime = inode_get_mtime(inode); ceph_osdc_start_request(&fsc->client->osdc, req); ret = ceph_osdc_wait_request(&fsc->client->osdc, req); if (ret == -ENOENT) @@ -2150,14 +2591,15 @@ static long ceph_fallocate(struct file *file, int mode, struct inode *inode = file_inode(file); struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_cap_flush *prealloc_cf; + struct ceph_client *cl = ceph_inode_to_client(inode); int want, got = 0; int dirty; int ret = 0; loff_t endoff = 0; loff_t size; - dout("%s %p %llx.%llx mode %x, offset %llu length %llu\n", __func__, - inode, ceph_vinop(inode), mode, offset, length); + doutc(cl, "%p %llx.%llx mode %x, offset %llu length %llu\n", + inode, ceph_vinop(inode), mode, offset, length); if (mode != (FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE)) return -EOPNOTSUPP; @@ -2165,6 +2607,9 @@ static long ceph_fallocate(struct file *file, int mode, if (!S_ISREG(inode->i_mode)) return -EOPNOTSUPP; + if (IS_ENCRYPTED(inode)) + return -EOPNOTSUPP; + prealloc_cf = ceph_alloc_cap_flush(); if (!prealloc_cf) return -ENOMEM; @@ -2283,6 +2728,7 @@ static void put_rd_wr_caps(struct ceph_inode_info *src_ci, int src_got, static int is_file_size_ok(struct inode *src_inode, struct inode *dst_inode, loff_t src_off, loff_t dst_off, size_t len) { + struct ceph_client *cl = ceph_inode_to_client(src_inode); loff_t size, endoff; size = i_size_read(src_inode); @@ -2293,8 +2739,8 @@ static int is_file_size_ok(struct inode *src_inode, struct inode *dst_inode, * inode. */ if (src_off + len > size) { - dout("Copy beyond EOF (%llu + %zu > %llu)\n", - src_off, len, size); + doutc(cl, "Copy beyond EOF (%llu + %zu > %llu)\n", src_off, + len, size); return -EOPNOTSUPP; } size = i_size_read(dst_inode); @@ -2370,6 +2816,7 @@ static ssize_t ceph_do_objects_copy(struct ceph_inode_info *src_ci, u64 *src_off u64 src_objnum, src_objoff, dst_objnum, dst_objoff; u32 src_objlen, dst_objlen; u32 object_size = src_ci->i_layout.object_size; + struct ceph_client *cl = fsc->client; int ret; src_oloc.pool = src_ci->i_layout.pool_id; @@ -2411,9 +2858,10 @@ static ssize_t ceph_do_objects_copy(struct ceph_inode_info *src_ci, u64 *src_off if (ret) { if (ret == -EOPNOTSUPP) { fsc->have_copy_from2 = false; - pr_notice("OSDs don't support copy-from2; disabling copy offload\n"); + pr_notice_client(cl, + "OSDs don't support copy-from2; disabling copy offload\n"); } - dout("ceph_osdc_copy_from returned %d\n", ret); + doutc(cl, "returned %d\n", ret); if (!bytes) bytes = ret; goto out; @@ -2439,7 +2887,8 @@ static ssize_t __ceph_copy_file_range(struct file *src_file, loff_t src_off, struct ceph_inode_info *src_ci = ceph_inode(src_inode); struct ceph_inode_info *dst_ci = ceph_inode(dst_inode); struct ceph_cap_flush *prealloc_cf; - struct ceph_fs_client *src_fsc = ceph_inode_to_client(src_inode); + struct ceph_fs_client *src_fsc = ceph_inode_to_fs_client(src_inode); + struct ceph_client *cl = src_fsc->client; loff_t size; ssize_t ret = -EIO, bytes; u64 src_objnum, dst_objnum, src_objoff, dst_objoff; @@ -2447,7 +2896,7 @@ static ssize_t __ceph_copy_file_range(struct file *src_file, loff_t src_off, int src_got = 0, dst_got = 0, err, dirty; if (src_inode->i_sb != dst_inode->i_sb) { - struct ceph_fs_client *dst_fsc = ceph_inode_to_client(dst_inode); + struct ceph_fs_client *dst_fsc = ceph_inode_to_fs_client(dst_inode); if (ceph_fsid_compare(&src_fsc->client->fsid, &dst_fsc->client->fsid)) { @@ -2482,10 +2931,14 @@ static ssize_t __ceph_copy_file_range(struct file *src_file, loff_t src_off, (src_ci->i_layout.stripe_count != 1) || (dst_ci->i_layout.stripe_count != 1) || (src_ci->i_layout.object_size != dst_ci->i_layout.object_size)) { - dout("Invalid src/dst files layout\n"); + doutc(cl, "Invalid src/dst files layout\n"); return -EOPNOTSUPP; } + /* Every encrypted inode gets its own key, so we can't offload them */ + if (IS_ENCRYPTED(src_inode) || IS_ENCRYPTED(dst_inode)) + return -EOPNOTSUPP; + if (len < src_ci->i_layout.object_size) return -EOPNOTSUPP; /* no remote copy will be done */ @@ -2496,12 +2949,12 @@ static ssize_t __ceph_copy_file_range(struct file *src_file, loff_t src_off, /* Start by sync'ing the source and destination files */ ret = file_write_and_wait_range(src_file, src_off, (src_off + len)); if (ret < 0) { - dout("failed to write src file (%zd)\n", ret); + doutc(cl, "failed to write src file (%zd)\n", ret); goto out; } ret = file_write_and_wait_range(dst_file, dst_off, (dst_off + len)); if (ret < 0) { - dout("failed to write dst file (%zd)\n", ret); + doutc(cl, "failed to write dst file (%zd)\n", ret); goto out; } @@ -2513,7 +2966,7 @@ static ssize_t __ceph_copy_file_range(struct file *src_file, loff_t src_off, err = get_rd_wr_caps(src_file, &src_got, dst_file, (dst_off + len), &dst_got); if (err < 0) { - dout("get_rd_wr_caps returned %d\n", err); + doutc(cl, "get_rd_wr_caps returned %d\n", err); ret = -EOPNOTSUPP; goto out; } @@ -2528,7 +2981,8 @@ static ssize_t __ceph_copy_file_range(struct file *src_file, loff_t src_off, dst_off >> PAGE_SHIFT, (dst_off + len) >> PAGE_SHIFT); if (ret < 0) { - dout("Failed to invalidate inode pages (%zd)\n", ret); + doutc(cl, "Failed to invalidate inode pages (%zd)\n", + ret); ret = 0; /* XXX */ } ceph_calc_file_object_mapping(&src_ci->i_layout, src_off, @@ -2549,7 +3003,7 @@ static ssize_t __ceph_copy_file_range(struct file *src_file, loff_t src_off, * starting at the src_off */ if (src_objoff) { - dout("Initial partial copy of %u bytes\n", src_objlen); + doutc(cl, "Initial partial copy of %u bytes\n", src_objlen); /* * we need to temporarily drop all caps as we'll be calling @@ -2559,8 +3013,8 @@ static ssize_t __ceph_copy_file_range(struct file *src_file, loff_t src_off, ret = do_splice_direct(src_file, &src_off, dst_file, &dst_off, src_objlen, flags); /* Abort on short copies or on error */ - if (ret < src_objlen) { - dout("Failed partial copy (%zd)\n", ret); + if (ret < (long)src_objlen) { + doutc(cl, "Failed partial copy (%zd)\n", ret); goto out; } len -= ret; @@ -2582,7 +3036,7 @@ static ssize_t __ceph_copy_file_range(struct file *src_file, loff_t src_off, ret = bytes; goto out_caps; } - dout("Copied %zu bytes out of %zu\n", bytes, len); + doutc(cl, "Copied %zu bytes out of %zu\n", bytes, len); len -= bytes; ret += bytes; @@ -2610,13 +3064,13 @@ out_caps: * there were errors in remote object copies (len >= object_size). */ if (len && (len < src_ci->i_layout.object_size)) { - dout("Final partial copy of %zu bytes\n", len); + doutc(cl, "Final partial copy of %zu bytes\n", len); bytes = do_splice_direct(src_file, &src_off, dst_file, &dst_off, len, flags); if (bytes > 0) ret += bytes; else - dout("Failed partial copy (%zd)\n", bytes); + doutc(cl, "Failed partial copy (%zd)\n", bytes); } out: diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index 8e5f41d45283..0679240f06db 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -14,10 +14,12 @@ #include <linux/random.h> #include <linux/sort.h> #include <linux/iversion.h> +#include <linux/fscrypt.h> #include "super.h" #include "mds_client.h" #include "cache.h" +#include "crypto.h" #include <linux/ceph/decode.h> /* @@ -33,6 +35,7 @@ */ static const struct inode_operations ceph_symlink_iops; +static const struct inode_operations ceph_encrypted_symlink_iops; static void ceph_inode_work(struct work_struct *work); @@ -52,20 +55,105 @@ static int ceph_set_ino_cb(struct inode *inode, void *data) return 0; } -struct inode *ceph_get_inode(struct super_block *sb, struct ceph_vino vino) +/** + * ceph_new_inode - allocate a new inode in advance of an expected create + * @dir: parent directory for new inode + * @dentry: dentry that may eventually point to new inode + * @mode: mode of new inode + * @as_ctx: pointer to inherited security context + * + * Allocate a new inode in advance of an operation to create a new inode. + * This allocates the inode and sets up the acl_sec_ctx with appropriate + * info for the new inode. + * + * Returns a pointer to the new inode or an ERR_PTR. + */ +struct inode *ceph_new_inode(struct inode *dir, struct dentry *dentry, + umode_t *mode, struct ceph_acl_sec_ctx *as_ctx) { + int err; + struct inode *inode; + + inode = new_inode(dir->i_sb); + if (!inode) + return ERR_PTR(-ENOMEM); + + if (!S_ISLNK(*mode)) { + err = ceph_pre_init_acls(dir, mode, as_ctx); + if (err < 0) + goto out_err; + } + + inode->i_state = 0; + inode->i_mode = *mode; + + err = ceph_security_init_secctx(dentry, *mode, as_ctx); + if (err < 0) + goto out_err; + + /* + * We'll skip setting fscrypt context for snapshots, leaving that for + * the handle_reply(). + */ + if (ceph_snap(dir) != CEPH_SNAPDIR) { + err = ceph_fscrypt_prepare_context(dir, inode, as_ctx); + if (err) + goto out_err; + } + + return inode; +out_err: + iput(inode); + return ERR_PTR(err); +} + +void ceph_as_ctx_to_req(struct ceph_mds_request *req, + struct ceph_acl_sec_ctx *as_ctx) +{ + if (as_ctx->pagelist) { + req->r_pagelist = as_ctx->pagelist; + as_ctx->pagelist = NULL; + } + ceph_fscrypt_as_ctx_to_req(req, as_ctx); +} + +/** + * ceph_get_inode - find or create/hash a new inode + * @sb: superblock to search and allocate in + * @vino: vino to search for + * @newino: optional new inode to insert if one isn't found (may be NULL) + * + * Search for or insert a new inode into the hash for the given vino, and + * return a reference to it. If new is non-NULL, its reference is consumed. + */ +struct inode *ceph_get_inode(struct super_block *sb, struct ceph_vino vino, + struct inode *newino) +{ + struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(sb); + struct ceph_client *cl = mdsc->fsc->client; struct inode *inode; if (ceph_vino_is_reserved(vino)) return ERR_PTR(-EREMOTEIO); - inode = iget5_locked(sb, (unsigned long)vino.ino, ceph_ino_compare, - ceph_set_ino_cb, &vino); - if (!inode) + if (newino) { + inode = inode_insert5(newino, (unsigned long)vino.ino, + ceph_ino_compare, ceph_set_ino_cb, &vino); + if (inode != newino) + iput(newino); + } else { + inode = iget5_locked(sb, (unsigned long)vino.ino, + ceph_ino_compare, ceph_set_ino_cb, &vino); + } + + if (!inode) { + doutc(cl, "no inode found for %llx.%llx\n", vino.ino, vino.snap); return ERR_PTR(-ENOMEM); + } - dout("get_inode on %llu=%llx.%llx got %p new %d\n", ceph_present_inode(inode), - ceph_vinop(inode), inode, !!(inode->i_state & I_NEW)); + doutc(cl, "on %llx=%llx.%llx got %p new %d\n", + ceph_present_inode(inode), ceph_vinop(inode), inode, + !!(inode->i_state & I_NEW)); return inode; } @@ -74,37 +162,57 @@ struct inode *ceph_get_inode(struct super_block *sb, struct ceph_vino vino) */ struct inode *ceph_get_snapdir(struct inode *parent) { + struct ceph_client *cl = ceph_inode_to_client(parent); struct ceph_vino vino = { .ino = ceph_ino(parent), .snap = CEPH_SNAPDIR, }; - struct inode *inode = ceph_get_inode(parent->i_sb, vino); + struct inode *inode = ceph_get_inode(parent->i_sb, vino, NULL); struct ceph_inode_info *ci = ceph_inode(inode); + int ret = -ENOTDIR; if (IS_ERR(inode)) return inode; if (!S_ISDIR(parent->i_mode)) { - pr_warn_once("bad snapdir parent type (mode=0%o)\n", - parent->i_mode); + pr_warn_once_client(cl, "bad snapdir parent type (mode=0%o)\n", + parent->i_mode); goto err; } if (!(inode->i_state & I_NEW) && !S_ISDIR(inode->i_mode)) { - pr_warn_once("bad snapdir inode type (mode=0%o)\n", - inode->i_mode); + pr_warn_once_client(cl, "bad snapdir inode type (mode=0%o)\n", + inode->i_mode); goto err; } inode->i_mode = parent->i_mode; inode->i_uid = parent->i_uid; inode->i_gid = parent->i_gid; - inode->i_mtime = parent->i_mtime; - inode->i_ctime = parent->i_ctime; - inode->i_atime = parent->i_atime; + inode_set_mtime_to_ts(inode, inode_get_mtime(parent)); + inode_set_ctime_to_ts(inode, inode_get_ctime(parent)); + inode_set_atime_to_ts(inode, inode_get_atime(parent)); ci->i_rbytes = 0; ci->i_btime = ceph_inode(parent)->i_btime; +#ifdef CONFIG_FS_ENCRYPTION + /* if encrypted, just borrow fscrypt_auth from parent */ + if (IS_ENCRYPTED(parent)) { + struct ceph_inode_info *pci = ceph_inode(parent); + + ci->fscrypt_auth = kmemdup(pci->fscrypt_auth, + pci->fscrypt_auth_len, + GFP_KERNEL); + if (ci->fscrypt_auth) { + inode->i_flags |= S_ENCRYPTED; + ci->fscrypt_auth_len = pci->fscrypt_auth_len; + } else { + doutc(cl, "Failed to alloc snapdir fscrypt_auth\n"); + ret = -ENOMEM; + goto err; + } + } +#endif if (inode->i_state & I_NEW) { inode->i_op = &ceph_snapdir_iops; inode->i_fop = &ceph_snapdir_fops; @@ -118,7 +226,7 @@ err: discard_new_inode(inode); else iput(inode); - return ERR_PTR(-ENOTDIR); + return ERR_PTR(ret); } const struct inode_operations ceph_file_iops = { @@ -145,6 +253,8 @@ const struct inode_operations ceph_file_iops = { static struct ceph_inode_frag *__get_or_create_frag(struct ceph_inode_info *ci, u32 f) { + struct inode *inode = &ci->netfs.inode; + struct ceph_client *cl = ceph_inode_to_client(inode); struct rb_node **p; struct rb_node *parent = NULL; struct ceph_inode_frag *frag; @@ -175,8 +285,7 @@ static struct ceph_inode_frag *__get_or_create_frag(struct ceph_inode_info *ci, rb_link_node(&frag->node, parent, p); rb_insert_color(&frag->node, &ci->i_fragtree); - dout("get_or_create_frag added %llx.%llx frag %x\n", - ceph_vinop(&ci->netfs.inode), f); + doutc(cl, "added %p %llx.%llx frag %x\n", inode, ceph_vinop(inode), f); return frag; } @@ -209,6 +318,7 @@ struct ceph_inode_frag *__ceph_find_frag(struct ceph_inode_info *ci, u32 f) static u32 __ceph_choose_frag(struct ceph_inode_info *ci, u32 v, struct ceph_inode_frag *pfrag, int *found) { + struct ceph_client *cl = ceph_inode_to_client(&ci->netfs.inode); u32 t = ceph_frag_make(0, 0); struct ceph_inode_frag *frag; unsigned nway, i; @@ -232,8 +342,8 @@ static u32 __ceph_choose_frag(struct ceph_inode_info *ci, u32 v, /* choose child */ nway = 1 << frag->split_by; - dout("choose_frag(%x) %x splits by %d (%d ways)\n", v, t, - frag->split_by, nway); + doutc(cl, "frag(%x) %x splits by %d (%d ways)\n", v, t, + frag->split_by, nway); for (i = 0; i < nway; i++) { n = ceph_frag_make_child(t, frag->split_by, i); if (ceph_frag_contains_value(n, v)) { @@ -243,7 +353,7 @@ static u32 __ceph_choose_frag(struct ceph_inode_info *ci, u32 v, } BUG_ON(i == nway); } - dout("choose_frag(%x) = %x\n", v, t); + doutc(cl, "frag(%x) = %x\n", v, t); return t; } @@ -267,6 +377,7 @@ static int ceph_fill_dirfrag(struct inode *inode, struct ceph_mds_reply_dirfrag *dirinfo) { struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_client *cl = ceph_inode_to_client(inode); struct ceph_inode_frag *frag; u32 id = le32_to_cpu(dirinfo->frag); int mds = le32_to_cpu(dirinfo->auth); @@ -291,14 +402,14 @@ static int ceph_fill_dirfrag(struct inode *inode, goto out; if (frag->split_by == 0) { /* tree leaf, remove */ - dout("fill_dirfrag removed %llx.%llx frag %x" - " (no ref)\n", ceph_vinop(inode), id); + doutc(cl, "removed %p %llx.%llx frag %x (no ref)\n", + inode, ceph_vinop(inode), id); rb_erase(&frag->node, &ci->i_fragtree); kfree(frag); } else { /* tree branch, keep and clear */ - dout("fill_dirfrag cleared %llx.%llx frag %x" - " referral\n", ceph_vinop(inode), id); + doutc(cl, "cleared %p %llx.%llx frag %x referral\n", + inode, ceph_vinop(inode), id); frag->mds = -1; frag->ndist = 0; } @@ -311,8 +422,9 @@ static int ceph_fill_dirfrag(struct inode *inode, if (IS_ERR(frag)) { /* this is not the end of the world; we can continue with bad/inaccurate delegation info */ - pr_err("fill_dirfrag ENOMEM on mds ref %llx.%llx fg %x\n", - ceph_vinop(inode), le32_to_cpu(dirinfo->frag)); + pr_err_client(cl, "ENOMEM on mds ref %p %llx.%llx fg %x\n", + inode, ceph_vinop(inode), + le32_to_cpu(dirinfo->frag)); err = -ENOMEM; goto out; } @@ -321,8 +433,8 @@ static int ceph_fill_dirfrag(struct inode *inode, frag->ndist = min_t(u32, ndist, CEPH_MAX_DIRFRAG_REP); for (i = 0; i < frag->ndist; i++) frag->dist[i] = le32_to_cpu(dirinfo->dist[i]); - dout("fill_dirfrag %llx.%llx frag %x ndist=%d\n", - ceph_vinop(inode), frag->frag, frag->ndist); + doutc(cl, "%p %llx.%llx frag %x ndist=%d\n", inode, + ceph_vinop(inode), frag->frag, frag->ndist); out: mutex_unlock(&ci->i_fragtree_mutex); @@ -350,6 +462,7 @@ static int ceph_fill_fragtree(struct inode *inode, struct ceph_frag_tree_head *fragtree, struct ceph_mds_reply_dirfrag *dirinfo) { + struct ceph_client *cl = ceph_inode_to_client(inode); struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_inode_frag *frag, *prev_frag = NULL; struct rb_node *rb_node; @@ -385,15 +498,15 @@ static int ceph_fill_fragtree(struct inode *inode, frag_tree_split_cmp, NULL); } - dout("fill_fragtree %llx.%llx\n", ceph_vinop(inode)); + doutc(cl, "%p %llx.%llx\n", inode, ceph_vinop(inode)); rb_node = rb_first(&ci->i_fragtree); for (i = 0; i < nsplits; i++) { id = le32_to_cpu(fragtree->splits[i].frag); split_by = le32_to_cpu(fragtree->splits[i].by); if (split_by == 0 || ceph_frag_bits(id) + split_by > 24) { - pr_err("fill_fragtree %llx.%llx invalid split %d/%u, " - "frag %x split by %d\n", ceph_vinop(inode), - i, nsplits, id, split_by); + pr_err_client(cl, "%p %llx.%llx invalid split %d/%u, " + "frag %x split by %d\n", inode, + ceph_vinop(inode), i, nsplits, id, split_by); continue; } frag = NULL; @@ -425,7 +538,7 @@ static int ceph_fill_fragtree(struct inode *inode, if (frag->split_by == 0) ci->i_fragtree_nsplits++; frag->split_by = split_by; - dout(" frag %x split by %d\n", frag->frag, frag->split_by); + doutc(cl, " frag %x split by %d\n", frag->frag, frag->split_by); prev_frag = frag; } while (rb_node) { @@ -450,6 +563,7 @@ out_unlock: */ struct inode *ceph_alloc_inode(struct super_block *sb) { + struct ceph_fs_client *fsc = ceph_sb_to_fs_client(sb); struct ceph_inode_info *ci; int i; @@ -457,7 +571,7 @@ struct inode *ceph_alloc_inode(struct super_block *sb) if (!ci) return NULL; - dout("alloc_inode %p\n", &ci->netfs.inode); + doutc(fsc->client, "%p\n", &ci->netfs.inode); /* Set parameters for the netfs library */ netfs_inode_init(&ci->netfs, &ceph_netfs_ops); @@ -517,6 +631,7 @@ struct inode *ceph_alloc_inode(struct super_block *sb) ci->i_truncate_seq = 0; ci->i_truncate_size = 0; ci->i_truncate_pending = 0; + ci->i_truncate_pagecache_size = 0; ci->i_max_size = 0; ci->i_reported_size = 0; @@ -547,6 +662,10 @@ struct inode *ceph_alloc_inode(struct super_block *sb) INIT_WORK(&ci->i_work, ceph_inode_work); ci->i_work_mask = 0; memset(&ci->i_btime, '\0', sizeof(ci->i_btime)); +#ifdef CONFIG_FS_ENCRYPTION + ci->fscrypt_auth = NULL; + ci->fscrypt_auth_len = 0; +#endif return &ci->netfs.inode; } @@ -555,6 +674,10 @@ void ceph_free_inode(struct inode *inode) struct ceph_inode_info *ci = ceph_inode(inode); kfree(ci->i_symlink); +#ifdef CONFIG_FS_ENCRYPTION + kfree(ci->fscrypt_auth); +#endif + fscrypt_free_inode(inode); kmem_cache_free(ceph_inode_cachep, ci); } @@ -562,10 +685,11 @@ void ceph_evict_inode(struct inode *inode) { struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb); + struct ceph_client *cl = ceph_inode_to_client(inode); struct ceph_inode_frag *frag; struct rb_node *n; - dout("evict_inode %p ino %llx.%llx\n", inode, ceph_vinop(inode)); + doutc(cl, "%p ino %llx.%llx\n", inode, ceph_vinop(inode)); percpu_counter_dec(&mdsc->metric.total_inodes); @@ -575,6 +699,7 @@ void ceph_evict_inode(struct inode *inode) clear_inode(inode); ceph_fscache_unregister_inode_cookie(ci); + fscrypt_put_encryption_info(inode); __ceph_remove_caps(ci); @@ -587,8 +712,8 @@ void ceph_evict_inode(struct inode *inode) */ if (ci->i_snap_realm) { if (ceph_snap(inode) == CEPH_NOSNAP) { - dout(" dropping residual ref to snap realm %p\n", - ci->i_snap_realm); + doutc(cl, " dropping residual ref to snap realm %p\n", + ci->i_snap_realm); ceph_change_snap_realm(inode, NULL); } else { ceph_put_snapid_map(mdsc, ci->i_snapid_map); @@ -629,15 +754,16 @@ static inline blkcnt_t calc_inode_blocks(u64 size) int ceph_fill_file_size(struct inode *inode, int issued, u32 truncate_seq, u64 truncate_size, u64 size) { + struct ceph_client *cl = ceph_inode_to_client(inode); struct ceph_inode_info *ci = ceph_inode(inode); int queue_trunc = 0; loff_t isize = i_size_read(inode); if (ceph_seq_cmp(truncate_seq, ci->i_truncate_seq) > 0 || (truncate_seq == ci->i_truncate_seq && size > isize)) { - dout("size %lld -> %llu\n", isize, size); + doutc(cl, "size %lld -> %llu\n", isize, size); if (size > 0 && S_ISDIR(inode->i_mode)) { - pr_err("fill_file_size non-zero size for directory\n"); + pr_err_client(cl, "non-zero size for directory\n"); size = 0; } i_size_write(inode, size); @@ -650,14 +776,12 @@ int ceph_fill_file_size(struct inode *inode, int issued, ceph_fscache_update(inode); ci->i_reported_size = size; if (truncate_seq != ci->i_truncate_seq) { - dout("truncate_seq %u -> %u\n", - ci->i_truncate_seq, truncate_seq); + doutc(cl, "truncate_seq %u -> %u\n", + ci->i_truncate_seq, truncate_seq); ci->i_truncate_seq = truncate_seq; /* the MDS should have revoked these caps */ - WARN_ON_ONCE(issued & (CEPH_CAP_FILE_EXCL | - CEPH_CAP_FILE_RD | - CEPH_CAP_FILE_WR | + WARN_ON_ONCE(issued & (CEPH_CAP_FILE_RD | CEPH_CAP_FILE_LAZYIO)); /* * If we hold relevant caps, or in the case where we're @@ -674,11 +798,27 @@ int ceph_fill_file_size(struct inode *inode, int issued, } } } - if (ceph_seq_cmp(truncate_seq, ci->i_truncate_seq) >= 0 && - ci->i_truncate_size != truncate_size) { - dout("truncate_size %lld -> %llu\n", ci->i_truncate_size, - truncate_size); + + /* + * It's possible that the new sizes of the two consecutive + * size truncations will be in the same fscrypt last block, + * and we need to truncate the corresponding page caches + * anyway. + */ + if (ceph_seq_cmp(truncate_seq, ci->i_truncate_seq) >= 0) { + doutc(cl, "truncate_size %lld -> %llu, encrypted %d\n", + ci->i_truncate_size, truncate_size, + !!IS_ENCRYPTED(inode)); + ci->i_truncate_size = truncate_size; + + if (IS_ENCRYPTED(inode)) { + doutc(cl, "truncate_pagecache_size %lld -> %llu\n", + ci->i_truncate_pagecache_size, size); + ci->i_truncate_pagecache_size = size; + } else { + ci->i_truncate_pagecache_size = truncate_size; + } } return queue_trunc; } @@ -687,7 +827,9 @@ void ceph_fill_file_time(struct inode *inode, int issued, u64 time_warp_seq, struct timespec64 *ctime, struct timespec64 *mtime, struct timespec64 *atime) { + struct ceph_client *cl = ceph_inode_to_client(inode); struct ceph_inode_info *ci = ceph_inode(inode); + struct timespec64 ictime = inode_get_ctime(inode); int warn = 0; if (issued & (CEPH_CAP_FILE_EXCL| @@ -696,39 +838,41 @@ void ceph_fill_file_time(struct inode *inode, int issued, CEPH_CAP_AUTH_EXCL| CEPH_CAP_XATTR_EXCL)) { if (ci->i_version == 0 || - timespec64_compare(ctime, &inode->i_ctime) > 0) { - dout("ctime %lld.%09ld -> %lld.%09ld inc w/ cap\n", - inode->i_ctime.tv_sec, inode->i_ctime.tv_nsec, + timespec64_compare(ctime, &ictime) > 0) { + doutc(cl, "ctime %lld.%09ld -> %lld.%09ld inc w/ cap\n", + ictime.tv_sec, ictime.tv_nsec, ctime->tv_sec, ctime->tv_nsec); - inode->i_ctime = *ctime; + inode_set_ctime_to_ts(inode, *ctime); } if (ci->i_version == 0 || ceph_seq_cmp(time_warp_seq, ci->i_time_warp_seq) > 0) { /* the MDS did a utimes() */ - dout("mtime %lld.%09ld -> %lld.%09ld " - "tw %d -> %d\n", - inode->i_mtime.tv_sec, inode->i_mtime.tv_nsec, + doutc(cl, "mtime %lld.%09ld -> %lld.%09ld tw %d -> %d\n", + inode_get_mtime_sec(inode), + inode_get_mtime_nsec(inode), mtime->tv_sec, mtime->tv_nsec, ci->i_time_warp_seq, (int)time_warp_seq); - inode->i_mtime = *mtime; - inode->i_atime = *atime; + inode_set_mtime_to_ts(inode, *mtime); + inode_set_atime_to_ts(inode, *atime); ci->i_time_warp_seq = time_warp_seq; } else if (time_warp_seq == ci->i_time_warp_seq) { + struct timespec64 ts; + /* nobody did utimes(); take the max */ - if (timespec64_compare(mtime, &inode->i_mtime) > 0) { - dout("mtime %lld.%09ld -> %lld.%09ld inc\n", - inode->i_mtime.tv_sec, - inode->i_mtime.tv_nsec, + ts = inode_get_mtime(inode); + if (timespec64_compare(mtime, &ts) > 0) { + doutc(cl, "mtime %lld.%09ld -> %lld.%09ld inc\n", + ts.tv_sec, ts.tv_nsec, mtime->tv_sec, mtime->tv_nsec); - inode->i_mtime = *mtime; + inode_set_mtime_to_ts(inode, *mtime); } - if (timespec64_compare(atime, &inode->i_atime) > 0) { - dout("atime %lld.%09ld -> %lld.%09ld inc\n", - inode->i_atime.tv_sec, - inode->i_atime.tv_nsec, + ts = inode_get_atime(inode); + if (timespec64_compare(atime, &ts) > 0) { + doutc(cl, "atime %lld.%09ld -> %lld.%09ld inc\n", + ts.tv_sec, ts.tv_nsec, atime->tv_sec, atime->tv_nsec); - inode->i_atime = *atime; + inode_set_atime_to_ts(inode, *atime); } } else if (issued & CEPH_CAP_FILE_EXCL) { /* we did a utimes(); ignore mds values */ @@ -738,19 +882,53 @@ void ceph_fill_file_time(struct inode *inode, int issued, } else { /* we have no write|excl caps; whatever the MDS says is true */ if (ceph_seq_cmp(time_warp_seq, ci->i_time_warp_seq) >= 0) { - inode->i_ctime = *ctime; - inode->i_mtime = *mtime; - inode->i_atime = *atime; + inode_set_ctime_to_ts(inode, *ctime); + inode_set_mtime_to_ts(inode, *mtime); + inode_set_atime_to_ts(inode, *atime); ci->i_time_warp_seq = time_warp_seq; } else { warn = 1; } } if (warn) /* time_warp_seq shouldn't go backwards */ - dout("%p mds time_warp_seq %llu < %u\n", - inode, time_warp_seq, ci->i_time_warp_seq); + doutc(cl, "%p mds time_warp_seq %llu < %u\n", inode, + time_warp_seq, ci->i_time_warp_seq); } +#if IS_ENABLED(CONFIG_FS_ENCRYPTION) +static int decode_encrypted_symlink(struct ceph_mds_client *mdsc, + const char *encsym, + int enclen, u8 **decsym) +{ + struct ceph_client *cl = mdsc->fsc->client; + int declen; + u8 *sym; + + sym = kmalloc(enclen + 1, GFP_NOFS); + if (!sym) + return -ENOMEM; + + declen = ceph_base64_decode(encsym, enclen, sym); + if (declen < 0) { + pr_err_client(cl, + "can't decode symlink (%d). Content: %.*s\n", + declen, enclen, encsym); + kfree(sym); + return -EIO; + } + sym[declen + 1] = '\0'; + *decsym = sym; + return declen; +} +#else +static int decode_encrypted_symlink(struct ceph_mds_client *mdsc, + const char *encsym, + int symlen, u8 **decsym) +{ + return -EOPNOTSUPP; +} +#endif + /* * Populate an inode based on info from mds. May be called on new or * existing inodes. @@ -762,6 +940,7 @@ int ceph_fill_inode(struct inode *inode, struct page *locked_page, struct ceph_cap_reservation *caps_reservation) { struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb); + struct ceph_client *cl = mdsc->fsc->client; struct ceph_mds_reply_inode *info = iinfo->in; struct ceph_inode_info *ci = ceph_inode(inode); int issued, new_issued, info_caps; @@ -780,25 +959,26 @@ int ceph_fill_inode(struct inode *inode, struct page *locked_page, lockdep_assert_held(&mdsc->snap_rwsem); - dout("%s %p ino %llx.%llx v %llu had %llu\n", __func__, - inode, ceph_vinop(inode), le64_to_cpu(info->version), - ci->i_version); + doutc(cl, "%p ino %llx.%llx v %llu had %llu\n", inode, ceph_vinop(inode), + le64_to_cpu(info->version), ci->i_version); /* Once I_NEW is cleared, we can't change type or dev numbers */ if (inode->i_state & I_NEW) { inode->i_mode = mode; } else { if (inode_wrong_type(inode, mode)) { - pr_warn_once("inode type changed! (ino %llx.%llx is 0%o, mds says 0%o)\n", - ceph_vinop(inode), inode->i_mode, mode); + pr_warn_once_client(cl, + "inode type changed! (ino %llx.%llx is 0%o, mds says 0%o)\n", + ceph_vinop(inode), inode->i_mode, mode); return -ESTALE; } if ((S_ISCHR(mode) || S_ISBLK(mode)) && inode->i_rdev != rdev) { - pr_warn_once("dev inode rdev changed! (ino %llx.%llx is %u:%u, mds says %u:%u)\n", - ceph_vinop(inode), MAJOR(inode->i_rdev), - MINOR(inode->i_rdev), MAJOR(rdev), - MINOR(rdev)); + pr_warn_once_client(cl, + "dev inode rdev changed! (ino %llx.%llx is %u:%u, mds says %u:%u)\n", + ceph_vinop(inode), MAJOR(inode->i_rdev), + MINOR(inode->i_rdev), MAJOR(rdev), + MINOR(rdev)); return -ESTALE; } } @@ -820,8 +1000,8 @@ int ceph_fill_inode(struct inode *inode, struct page *locked_page, if (iinfo->xattr_len > 4) { xattr_blob = ceph_buffer_new(iinfo->xattr_len, GFP_NOFS); if (!xattr_blob) - pr_err("%s ENOMEM xattr blob %d bytes\n", __func__, - iinfo->xattr_len); + pr_err_client(cl, "ENOMEM xattr blob %d bytes\n", + iinfo->xattr_len); } if (iinfo->pool_ns_len > 0) @@ -856,27 +1036,42 @@ int ceph_fill_inode(struct inode *inode, struct page *locked_page, issued |= __ceph_caps_dirty(ci); new_issued = ~issued & info_caps; - /* directories have fl_stripe_unit set to zero */ - if (le32_to_cpu(info->layout.fl_stripe_unit)) - inode->i_blkbits = - fls(le32_to_cpu(info->layout.fl_stripe_unit)) - 1; - else - inode->i_blkbits = CEPH_BLOCK_SHIFT; - __ceph_update_quota(ci, iinfo->max_bytes, iinfo->max_files); +#ifdef CONFIG_FS_ENCRYPTION + if (iinfo->fscrypt_auth_len && + ((inode->i_state & I_NEW) || (ci->fscrypt_auth_len == 0))) { + kfree(ci->fscrypt_auth); + ci->fscrypt_auth_len = iinfo->fscrypt_auth_len; + ci->fscrypt_auth = iinfo->fscrypt_auth; + iinfo->fscrypt_auth = NULL; + iinfo->fscrypt_auth_len = 0; + inode_set_flags(inode, S_ENCRYPTED, S_ENCRYPTED); + } +#endif + if ((new_version || (new_issued & CEPH_CAP_AUTH_SHARED)) && (issued & CEPH_CAP_AUTH_EXCL) == 0) { inode->i_mode = mode; inode->i_uid = make_kuid(&init_user_ns, le32_to_cpu(info->uid)); inode->i_gid = make_kgid(&init_user_ns, le32_to_cpu(info->gid)); - dout("%p mode 0%o uid.gid %d.%d\n", inode, inode->i_mode, - from_kuid(&init_user_ns, inode->i_uid), - from_kgid(&init_user_ns, inode->i_gid)); + doutc(cl, "%p %llx.%llx mode 0%o uid.gid %d.%d\n", inode, + ceph_vinop(inode), inode->i_mode, + from_kuid(&init_user_ns, inode->i_uid), + from_kgid(&init_user_ns, inode->i_gid)); ceph_decode_timespec64(&ci->i_btime, &iinfo->btime); ceph_decode_timespec64(&ci->i_snap_btime, &iinfo->snap_btime); } + /* directories have fl_stripe_unit set to zero */ + if (IS_ENCRYPTED(inode)) + inode->i_blkbits = CEPH_FSCRYPT_BLOCK_SHIFT; + else if (le32_to_cpu(info->layout.fl_stripe_unit)) + inode->i_blkbits = + fls(le32_to_cpu(info->layout.fl_stripe_unit)) - 1; + else + inode->i_blkbits = CEPH_BLOCK_SHIFT; + if ((new_version || (new_issued & CEPH_CAP_LINK_SHARED)) && (issued & CEPH_CAP_LINK_EXCL) == 0) set_nlink(inode, le32_to_cpu(info->nlink)); @@ -898,6 +1093,7 @@ int ceph_fill_inode(struct inode *inode, struct page *locked_page, if (new_version || (new_issued & (CEPH_CAP_ANY_FILE_RD | CEPH_CAP_ANY_FILE_WR))) { + u64 size = le64_to_cpu(info->size); s64 old_pool = ci->i_layout.pool_id; struct ceph_string *old_ns; @@ -911,15 +1107,28 @@ int ceph_fill_inode(struct inode *inode, struct page *locked_page, pool_ns = old_ns; + if (IS_ENCRYPTED(inode) && size && + iinfo->fscrypt_file_len == sizeof(__le64)) { + u64 fsize = __le64_to_cpu(*(__le64 *)iinfo->fscrypt_file); + + if (size == round_up(fsize, CEPH_FSCRYPT_BLOCK_SIZE)) { + size = fsize; + } else { + pr_warn_client(cl, + "fscrypt size mismatch: size=%llu fscrypt_file=%llu, discarding fscrypt_file size.\n", + info->size, size); + } + } + queue_trunc = ceph_fill_file_size(inode, issued, le32_to_cpu(info->truncate_seq), le64_to_cpu(info->truncate_size), - le64_to_cpu(info->size)); + size); /* only update max_size on auth cap */ if ((info->cap.flags & CEPH_CAP_FLAG_AUTH) && ci->i_max_size != le64_to_cpu(info->max_size)) { - dout("max_size %lld -> %llu\n", ci->i_max_size, - le64_to_cpu(info->max_size)); + doutc(cl, "max_size %lld -> %llu\n", + ci->i_max_size, le64_to_cpu(info->max_size)); ci->i_max_size = le64_to_cpu(info->max_size); } } @@ -974,26 +1183,45 @@ int ceph_fill_inode(struct inode *inode, struct page *locked_page, inode->i_fop = &ceph_file_fops; break; case S_IFLNK: - inode->i_op = &ceph_symlink_iops; if (!ci->i_symlink) { u32 symlen = iinfo->symlink_len; char *sym; spin_unlock(&ci->i_ceph_lock); - if (symlen != i_size_read(inode)) { - pr_err("%s %llx.%llx BAD symlink " - "size %lld\n", __func__, - ceph_vinop(inode), - i_size_read(inode)); + if (IS_ENCRYPTED(inode)) { + if (symlen != i_size_read(inode)) + pr_err_client(cl, + "%p %llx.%llx BAD symlink size %lld\n", + inode, ceph_vinop(inode), + i_size_read(inode)); + + err = decode_encrypted_symlink(mdsc, iinfo->symlink, + symlen, (u8 **)&sym); + if (err < 0) { + pr_err_client(cl, + "decoding encrypted symlink failed: %d\n", + err); + goto out; + } + symlen = err; i_size_write(inode, symlen); inode->i_blocks = calc_inode_blocks(symlen); - } + } else { + if (symlen != i_size_read(inode)) { + pr_err_client(cl, + "%p %llx.%llx BAD symlink size %lld\n", + inode, ceph_vinop(inode), + i_size_read(inode)); + i_size_write(inode, symlen); + inode->i_blocks = calc_inode_blocks(symlen); + } - err = -ENOMEM; - sym = kstrndup(iinfo->symlink, symlen, GFP_NOFS); - if (!sym) - goto out; + err = -ENOMEM; + sym = kstrndup(iinfo->symlink, symlen, GFP_NOFS); + if (!sym) + goto out; + } spin_lock(&ci->i_ceph_lock); if (!ci->i_symlink) @@ -1001,15 +1229,25 @@ int ceph_fill_inode(struct inode *inode, struct page *locked_page, else kfree(sym); /* lost a race */ } - inode->i_link = ci->i_symlink; + + if (IS_ENCRYPTED(inode)) { + /* + * Encrypted symlinks need to be decrypted before we can + * cache their targets in i_link. Don't touch it here. + */ + inode->i_op = &ceph_encrypted_symlink_iops; + } else { + inode->i_link = ci->i_symlink; + inode->i_op = &ceph_symlink_iops; + } break; case S_IFDIR: inode->i_op = &ceph_dir_iops; inode->i_fop = &ceph_dir_fops; break; default: - pr_err("%s %llx.%llx BAD mode 0%o\n", __func__, - ceph_vinop(inode), inode->i_mode); + pr_err_client(cl, "%p %llx.%llx BAD mode 0%o\n", inode, + ceph_vinop(inode), inode->i_mode); } /* were we issued a capability? */ @@ -1030,7 +1268,8 @@ int ceph_fill_inode(struct inode *inode, struct page *locked_page, (info_caps & CEPH_CAP_FILE_SHARED) && (issued & CEPH_CAP_FILE_EXCL) == 0 && !__ceph_dir_is_complete(ci)) { - dout(" marking %p complete (empty)\n", inode); + doutc(cl, " marking %p complete (empty)\n", + inode); i_size_write(inode, 0); __ceph_dir_set_complete(ci, atomic64_read(&ci->i_release_count), @@ -1039,8 +1278,8 @@ int ceph_fill_inode(struct inode *inode, struct page *locked_page, wake = true; } else { - dout(" %p got snap_caps %s\n", inode, - ceph_cap_string(info_caps)); + doutc(cl, " %p got snap_caps %s\n", inode, + ceph_cap_string(info_caps)); ci->i_snap_caps |= info_caps; } } @@ -1056,8 +1295,8 @@ int ceph_fill_inode(struct inode *inode, struct page *locked_page, if (cap_fmode >= 0) { if (!info_caps) - pr_warn("mds issued no caps on %llx.%llx\n", - ceph_vinop(inode)); + pr_warn_client(cl, "mds issued no caps on %llx.%llx\n", + ceph_vinop(inode)); __ceph_touch_fmode(ci, mdsc, cap_fmode); } @@ -1103,14 +1342,14 @@ static void __update_dentry_lease(struct inode *dir, struct dentry *dentry, unsigned long from_time, struct ceph_mds_session **old_lease_session) { + struct ceph_client *cl = ceph_inode_to_client(dir); struct ceph_dentry_info *di = ceph_dentry(dentry); unsigned mask = le16_to_cpu(lease->mask); long unsigned duration = le32_to_cpu(lease->duration_ms); long unsigned ttl = from_time + (duration * HZ) / 1000; long unsigned half_ttl = from_time + (duration * HZ / 2) / 1000; - dout("update_dentry_lease %p duration %lu ms ttl %lu\n", - dentry, duration, ttl); + doutc(cl, "%p duration %lu ms ttl %lu\n", dentry, duration, ttl); /* only track leases on regular dentries */ if (ceph_snap(dir) != CEPH_NOSNAP) @@ -1211,6 +1450,7 @@ out_unlock: */ static int splice_dentry(struct dentry **pdn, struct inode *in) { + struct ceph_client *cl = ceph_inode_to_client(in); struct dentry *dn = *pdn; struct dentry *realdn; @@ -1242,23 +1482,21 @@ static int splice_dentry(struct dentry **pdn, struct inode *in) d_drop(dn); realdn = d_splice_alias(in, dn); if (IS_ERR(realdn)) { - pr_err("splice_dentry error %ld %p inode %p ino %llx.%llx\n", - PTR_ERR(realdn), dn, in, ceph_vinop(in)); + pr_err_client(cl, "error %ld %p inode %p ino %llx.%llx\n", + PTR_ERR(realdn), dn, in, ceph_vinop(in)); return PTR_ERR(realdn); } if (realdn) { - dout("dn %p (%d) spliced with %p (%d) " - "inode %p ino %llx.%llx\n", - dn, d_count(dn), - realdn, d_count(realdn), - d_inode(realdn), ceph_vinop(d_inode(realdn))); + doutc(cl, "dn %p (%d) spliced with %p (%d) inode %p ino %llx.%llx\n", + dn, d_count(dn), realdn, d_count(realdn), + d_inode(realdn), ceph_vinop(d_inode(realdn))); dput(dn); *pdn = realdn; } else { BUG_ON(!ceph_dentry(dn)); - dout("dn %p attached to %p ino %llx.%llx\n", - dn, d_inode(dn), ceph_vinop(d_inode(dn))); + doutc(cl, "dn %p attached to %p ino %llx.%llx\n", dn, + d_inode(dn), ceph_vinop(d_inode(dn))); } return 0; } @@ -1280,14 +1518,15 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req) struct ceph_mds_reply_info_parsed *rinfo = &req->r_reply_info; struct inode *in = NULL; struct ceph_vino tvino, dvino; - struct ceph_fs_client *fsc = ceph_sb_to_client(sb); + struct ceph_fs_client *fsc = ceph_sb_to_fs_client(sb); + struct ceph_client *cl = fsc->client; int err = 0; - dout("fill_trace %p is_dentry %d is_target %d\n", req, - rinfo->head->is_dentry, rinfo->head->is_target); + doutc(cl, "%p is_dentry %d is_target %d\n", req, + rinfo->head->is_dentry, rinfo->head->is_target); if (!rinfo->head->is_target && !rinfo->head->is_dentry) { - dout("fill_trace reply is empty!\n"); + doutc(cl, "reply is empty!\n"); if (rinfo->head->result == 0 && req->r_parent) ceph_invalidate_dir_request(req); return 0; @@ -1309,8 +1548,15 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req) if (dir && req->r_op == CEPH_MDS_OP_LOOKUPNAME && test_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags) && !test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags)) { + bool is_nokey = false; struct qstr dname; struct dentry *dn, *parent; + struct fscrypt_str oname = FSTR_INIT(NULL, 0); + struct ceph_fname fname = { .dir = dir, + .name = rinfo->dname, + .ctext = rinfo->altname, + .name_len = rinfo->dname_len, + .ctext_len = rinfo->altname_len }; BUG_ON(!rinfo->head->is_target); BUG_ON(req->r_dentry); @@ -1318,36 +1564,55 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req) parent = d_find_any_alias(dir); BUG_ON(!parent); - dname.name = rinfo->dname; - dname.len = rinfo->dname_len; + err = ceph_fname_alloc_buffer(dir, &oname); + if (err < 0) { + dput(parent); + goto done; + } + + err = ceph_fname_to_usr(&fname, NULL, &oname, &is_nokey); + if (err < 0) { + dput(parent); + ceph_fname_free_buffer(dir, &oname); + goto done; + } + dname.name = oname.name; + dname.len = oname.len; dname.hash = full_name_hash(parent, dname.name, dname.len); tvino.ino = le64_to_cpu(rinfo->targeti.in->ino); tvino.snap = le64_to_cpu(rinfo->targeti.in->snapid); retry_lookup: dn = d_lookup(parent, &dname); - dout("d_lookup on parent=%p name=%.*s got %p\n", - parent, dname.len, dname.name, dn); + doutc(cl, "d_lookup on parent=%p name=%.*s got %p\n", + parent, dname.len, dname.name, dn); if (!dn) { dn = d_alloc(parent, &dname); - dout("d_alloc %p '%.*s' = %p\n", parent, - dname.len, dname.name, dn); + doutc(cl, "d_alloc %p '%.*s' = %p\n", parent, + dname.len, dname.name, dn); if (!dn) { dput(parent); + ceph_fname_free_buffer(dir, &oname); err = -ENOMEM; goto done; } + if (is_nokey) { + spin_lock(&dn->d_lock); + dn->d_flags |= DCACHE_NOKEY_NAME; + spin_unlock(&dn->d_lock); + } err = 0; } else if (d_really_is_positive(dn) && (ceph_ino(d_inode(dn)) != tvino.ino || ceph_snap(d_inode(dn)) != tvino.snap)) { - dout(" dn %p points to wrong inode %p\n", - dn, d_inode(dn)); + doutc(cl, " dn %p points to wrong inode %p\n", + dn, d_inode(dn)); ceph_dir_clear_ordered(dir); d_delete(dn); dput(dn); goto retry_lookup; } + ceph_fname_free_buffer(dir, &oname); req->r_dentry = dn; dput(parent); @@ -1366,8 +1631,8 @@ retry_lookup: rinfo->head->result == 0) ? req->r_fmode : -1, &req->r_caps_reservation); if (err < 0) { - pr_err("ceph_fill_inode badness %p %llx.%llx\n", - in, ceph_vinop(in)); + pr_err_client(cl, "badness %p %llx.%llx\n", in, + ceph_vinop(in)); req->r_target_inode = NULL; if (in->i_state & I_NEW) discard_new_inode(in); @@ -1417,36 +1682,32 @@ retry_lookup: have_lease = have_dir_cap || le32_to_cpu(rinfo->dlease->duration_ms); if (!have_lease) - dout("fill_trace no dentry lease or dir cap\n"); + doutc(cl, "no dentry lease or dir cap\n"); /* rename? */ if (req->r_old_dentry && req->r_op == CEPH_MDS_OP_RENAME) { struct inode *olddir = req->r_old_dentry_dir; BUG_ON(!olddir); - dout(" src %p '%pd' dst %p '%pd'\n", - req->r_old_dentry, - req->r_old_dentry, - dn, dn); - dout("fill_trace doing d_move %p -> %p\n", - req->r_old_dentry, dn); + doutc(cl, " src %p '%pd' dst %p '%pd'\n", + req->r_old_dentry, req->r_old_dentry, dn, dn); + doutc(cl, "doing d_move %p -> %p\n", req->r_old_dentry, dn); /* d_move screws up sibling dentries' offsets */ ceph_dir_clear_ordered(dir); ceph_dir_clear_ordered(olddir); d_move(req->r_old_dentry, dn); - dout(" src %p '%pd' dst %p '%pd'\n", - req->r_old_dentry, - req->r_old_dentry, - dn, dn); + doutc(cl, " src %p '%pd' dst %p '%pd'\n", + req->r_old_dentry, req->r_old_dentry, dn, dn); /* ensure target dentry is invalidated, despite rehashing bug in vfs_rename_dir */ ceph_invalidate_dentry_lease(dn); - dout("dn %p gets new offset %lld\n", req->r_old_dentry, - ceph_dentry(req->r_old_dentry)->offset); + doutc(cl, "dn %p gets new offset %lld\n", + req->r_old_dentry, + ceph_dentry(req->r_old_dentry)->offset); /* swap r_dentry and r_old_dentry in case that * splice_dentry() gets called later. This is safe @@ -1458,9 +1719,9 @@ retry_lookup: /* null dentry? */ if (!rinfo->head->is_target) { - dout("fill_trace null dentry\n"); + doutc(cl, "null dentry\n"); if (d_really_is_positive(dn)) { - dout("d_delete %p\n", dn); + doutc(cl, "d_delete %p\n", dn); ceph_dir_clear_ordered(dir); d_delete(dn); } else if (have_lease) { @@ -1484,9 +1745,9 @@ retry_lookup: goto done; dn = req->r_dentry; /* may have spliced */ } else if (d_really_is_positive(dn) && d_inode(dn) != in) { - dout(" %p links to %p %llx.%llx, not %llx.%llx\n", - dn, d_inode(dn), ceph_vinop(d_inode(dn)), - ceph_vinop(in)); + doutc(cl, " %p links to %p %llx.%llx, not %llx.%llx\n", + dn, d_inode(dn), ceph_vinop(d_inode(dn)), + ceph_vinop(in)); d_invalidate(dn); have_lease = false; } @@ -1496,7 +1757,7 @@ retry_lookup: rinfo->dlease, session, req->r_request_started); } - dout(" final dn %p\n", dn); + doutc(cl, " final dn %p\n", dn); } else if ((req->r_op == CEPH_MDS_OP_LOOKUPSNAP || req->r_op == CEPH_MDS_OP_MKSNAP) && test_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags) && @@ -1507,7 +1768,8 @@ retry_lookup: BUG_ON(!dir); BUG_ON(ceph_snap(dir) != CEPH_SNAPDIR); BUG_ON(!req->r_dentry); - dout(" linking snapped dir %p to dn %p\n", in, req->r_dentry); + doutc(cl, " linking snapped dir %p to dn %p\n", in, + req->r_dentry); ceph_dir_clear_ordered(dir); ihold(in); err = splice_dentry(&req->r_dentry, in); @@ -1529,7 +1791,7 @@ retry_lookup: &dvino, ptvino); } done: - dout("fill_trace done err=%d\n", err); + doutc(cl, "done err=%d\n", err); return err; } @@ -1540,6 +1802,7 @@ static int readdir_prepopulate_inodes_only(struct ceph_mds_request *req, struct ceph_mds_session *session) { struct ceph_mds_reply_info_parsed *rinfo = &req->r_reply_info; + struct ceph_client *cl = session->s_mdsc->fsc->client; int i, err = 0; for (i = 0; i < rinfo->dir_nr; i++) { @@ -1551,17 +1814,17 @@ static int readdir_prepopulate_inodes_only(struct ceph_mds_request *req, vino.ino = le64_to_cpu(rde->inode.in->ino); vino.snap = le64_to_cpu(rde->inode.in->snapid); - in = ceph_get_inode(req->r_dentry->d_sb, vino); + in = ceph_get_inode(req->r_dentry->d_sb, vino, NULL); if (IS_ERR(in)) { err = PTR_ERR(in); - dout("new_inode badness got %d\n", err); + doutc(cl, "badness got %d\n", err); continue; } rc = ceph_fill_inode(in, NULL, &rde->inode, NULL, session, -1, &req->r_caps_reservation); if (rc < 0) { - pr_err("ceph_fill_inode badness on %p got %d\n", - in, rc); + pr_err_client(cl, "inode badness on %p got %d\n", in, + rc); err = rc; if (in->i_state & I_NEW) { ihold(in); @@ -1590,6 +1853,7 @@ static int fill_readdir_cache(struct inode *dir, struct dentry *dn, struct ceph_readdir_cache_control *ctl, struct ceph_mds_request *req) { + struct ceph_client *cl = ceph_inode_to_client(dir); struct ceph_inode_info *ci = ceph_inode(dir); unsigned nsize = PAGE_SIZE / sizeof(struct dentry*); unsigned idx = ctl->index % nsize; @@ -1615,11 +1879,11 @@ static int fill_readdir_cache(struct inode *dir, struct dentry *dn, if (req->r_dir_release_cnt == atomic64_read(&ci->i_release_count) && req->r_dir_ordered_cnt == atomic64_read(&ci->i_ordered_count)) { - dout("readdir cache dn %p idx %d\n", dn, ctl->index); + doutc(cl, "dn %p idx %d\n", dn, ctl->index); ctl->dentries[idx] = dn; ctl->index++; } else { - dout("disable readdir cache\n"); + doutc(cl, "disable readdir cache\n"); ctl->index = -1; } return 0; @@ -1629,8 +1893,10 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req, struct ceph_mds_session *session) { struct dentry *parent = req->r_dentry; - struct ceph_inode_info *ci = ceph_inode(d_inode(parent)); + struct inode *inode = d_inode(parent); + struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_mds_reply_info_parsed *rinfo = &req->r_reply_info; + struct ceph_client *cl = session->s_mdsc->fsc->client; struct qstr dname; struct dentry *dn; struct inode *in; @@ -1658,19 +1924,18 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req, if (rinfo->dir_dir && le32_to_cpu(rinfo->dir_dir->frag) != frag) { - dout("readdir_prepopulate got new frag %x -> %x\n", - frag, le32_to_cpu(rinfo->dir_dir->frag)); + doutc(cl, "got new frag %x -> %x\n", frag, + le32_to_cpu(rinfo->dir_dir->frag)); frag = le32_to_cpu(rinfo->dir_dir->frag); if (!rinfo->hash_order) req->r_readdir_offset = 2; } if (le32_to_cpu(rinfo->head->op) == CEPH_MDS_OP_LSSNAP) { - dout("readdir_prepopulate %d items under SNAPDIR dn %p\n", - rinfo->dir_nr, parent); + doutc(cl, "%d items under SNAPDIR dn %p\n", + rinfo->dir_nr, parent); } else { - dout("readdir_prepopulate %d items under dn %p\n", - rinfo->dir_nr, parent); + doutc(cl, "%d items under dn %p\n", rinfo->dir_nr, parent); if (rinfo->dir_dir) ceph_fill_dirfrag(d_inode(parent), rinfo->dir_dir); @@ -1703,9 +1968,7 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req, tvino.snap = le64_to_cpu(rde->inode.in->snapid); if (rinfo->hash_order) { - u32 hash = ceph_str_hash(ci->i_dir_layout.dl_dir_hash, - rde->name, rde->name_len); - hash = ceph_frag_value(hash); + u32 hash = ceph_frag_value(rde->raw_hash); if (hash != last_hash) fpos_offset = 2; last_hash = hash; @@ -1716,24 +1979,29 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req, retry_lookup: dn = d_lookup(parent, &dname); - dout("d_lookup on parent=%p name=%.*s got %p\n", - parent, dname.len, dname.name, dn); + doutc(cl, "d_lookup on parent=%p name=%.*s got %p\n", + parent, dname.len, dname.name, dn); if (!dn) { dn = d_alloc(parent, &dname); - dout("d_alloc %p '%.*s' = %p\n", parent, - dname.len, dname.name, dn); + doutc(cl, "d_alloc %p '%.*s' = %p\n", parent, + dname.len, dname.name, dn); if (!dn) { - dout("d_alloc badness\n"); + doutc(cl, "d_alloc badness\n"); err = -ENOMEM; goto out; } + if (rde->is_nokey) { + spin_lock(&dn->d_lock); + dn->d_flags |= DCACHE_NOKEY_NAME; + spin_unlock(&dn->d_lock); + } } else if (d_really_is_positive(dn) && (ceph_ino(d_inode(dn)) != tvino.ino || ceph_snap(d_inode(dn)) != tvino.snap)) { struct ceph_dentry_info *di = ceph_dentry(dn); - dout(" dn %p points to wrong inode %p\n", - dn, d_inode(dn)); + doutc(cl, " dn %p points to wrong inode %p\n", + dn, d_inode(dn)); spin_lock(&dn->d_lock); if (di->offset > 0 && @@ -1753,9 +2021,9 @@ retry_lookup: if (d_really_is_positive(dn)) { in = d_inode(dn); } else { - in = ceph_get_inode(parent->d_sb, tvino); + in = ceph_get_inode(parent->d_sb, tvino, NULL); if (IS_ERR(in)) { - dout("new_inode badness\n"); + doutc(cl, "new_inode badness\n"); d_drop(dn); dput(dn); err = PTR_ERR(in); @@ -1766,7 +2034,8 @@ retry_lookup: ret = ceph_fill_inode(in, NULL, &rde->inode, NULL, session, -1, &req->r_caps_reservation); if (ret < 0) { - pr_err("ceph_fill_inode badness on %p\n", in); + pr_err_client(cl, "badness on %p %llx.%llx\n", in, + ceph_vinop(in)); if (d_really_is_negative(dn)) { if (in->i_state & I_NEW) { ihold(in); @@ -1783,8 +2052,8 @@ retry_lookup: if (d_really_is_negative(dn)) { if (ceph_security_xattr_deadlock(in)) { - dout(" skip splicing dn %p to inode %p" - " (security xattr deadlock)\n", dn, in); + doutc(cl, " skip splicing dn %p to inode %p" + " (security xattr deadlock)\n", dn, in); iput(in); skipped++; goto next_item; @@ -1816,17 +2085,18 @@ out: req->r_readdir_cache_idx = cache_ctl.index; } ceph_readdir_cache_release(&cache_ctl); - dout("readdir_prepopulate done\n"); + doutc(cl, "done\n"); return err; } bool ceph_inode_set_size(struct inode *inode, loff_t size) { + struct ceph_client *cl = ceph_inode_to_client(inode); struct ceph_inode_info *ci = ceph_inode(inode); bool ret; spin_lock(&ci->i_ceph_lock); - dout("set_size %p %llu -> %llu\n", inode, i_size_read(inode), size); + doutc(cl, "set_size %p %llu -> %llu\n", inode, i_size_read(inode), size); i_size_write(inode, size); ceph_fscache_update(inode); inode->i_blocks = calc_inode_blocks(size); @@ -1840,22 +2110,25 @@ bool ceph_inode_set_size(struct inode *inode, loff_t size) void ceph_queue_inode_work(struct inode *inode, int work_bit) { - struct ceph_fs_client *fsc = ceph_inode_to_client(inode); + struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode); + struct ceph_client *cl = fsc->client; struct ceph_inode_info *ci = ceph_inode(inode); set_bit(work_bit, &ci->i_work_mask); ihold(inode); if (queue_work(fsc->inode_wq, &ci->i_work)) { - dout("queue_inode_work %p, mask=%lx\n", inode, ci->i_work_mask); + doutc(cl, "%p %llx.%llx mask=%lx\n", inode, + ceph_vinop(inode), ci->i_work_mask); } else { - dout("queue_inode_work %p already queued, mask=%lx\n", - inode, ci->i_work_mask); + doutc(cl, "%p %llx.%llx already queued, mask=%lx\n", + inode, ceph_vinop(inode), ci->i_work_mask); iput(inode); } } static void ceph_do_invalidate_pages(struct inode *inode) { + struct ceph_client *cl = ceph_inode_to_client(inode); struct ceph_inode_info *ci = ceph_inode(inode); u32 orig_gen; int check = 0; @@ -1865,8 +2138,9 @@ static void ceph_do_invalidate_pages(struct inode *inode) mutex_lock(&ci->i_truncate_mutex); if (ceph_inode_is_shutdown(inode)) { - pr_warn_ratelimited("%s: inode %llx.%llx is shut down\n", - __func__, ceph_vinop(inode)); + pr_warn_ratelimited_client(cl, + "%p %llx.%llx is shut down\n", inode, + ceph_vinop(inode)); mapping_set_error(inode->i_mapping, -EIO); truncate_pagecache(inode, 0); mutex_unlock(&ci->i_truncate_mutex); @@ -1874,8 +2148,8 @@ static void ceph_do_invalidate_pages(struct inode *inode) } spin_lock(&ci->i_ceph_lock); - dout("invalidate_pages %p gen %d revoking %d\n", inode, - ci->i_rdcache_gen, ci->i_rdcache_revoking); + doutc(cl, "%p %llx.%llx gen %d revoking %d\n", inode, + ceph_vinop(inode), ci->i_rdcache_gen, ci->i_rdcache_revoking); if (ci->i_rdcache_revoking != ci->i_rdcache_gen) { if (__ceph_caps_revoking_other(ci, NULL, CEPH_CAP_FILE_CACHE)) check = 1; @@ -1887,21 +2161,21 @@ static void ceph_do_invalidate_pages(struct inode *inode) spin_unlock(&ci->i_ceph_lock); if (invalidate_inode_pages2(inode->i_mapping) < 0) { - pr_err("invalidate_inode_pages2 %llx.%llx failed\n", - ceph_vinop(inode)); + pr_err_client(cl, "invalidate_inode_pages2 %llx.%llx failed\n", + ceph_vinop(inode)); } spin_lock(&ci->i_ceph_lock); if (orig_gen == ci->i_rdcache_gen && orig_gen == ci->i_rdcache_revoking) { - dout("invalidate_pages %p gen %d successful\n", inode, - ci->i_rdcache_gen); + doutc(cl, "%p %llx.%llx gen %d successful\n", inode, + ceph_vinop(inode), ci->i_rdcache_gen); ci->i_rdcache_revoking--; check = 1; } else { - dout("invalidate_pages %p gen %d raced, now %d revoking %d\n", - inode, orig_gen, ci->i_rdcache_gen, - ci->i_rdcache_revoking); + doutc(cl, "%p %llx.%llx gen %d raced, now %d revoking %d\n", + inode, ceph_vinop(inode), orig_gen, ci->i_rdcache_gen, + ci->i_rdcache_revoking); if (__ceph_caps_revoking_other(ci, NULL, CEPH_CAP_FILE_CACHE)) check = 1; } @@ -1918,6 +2192,7 @@ out: */ void __ceph_do_pending_vmtruncate(struct inode *inode) { + struct ceph_client *cl = ceph_inode_to_client(inode); struct ceph_inode_info *ci = ceph_inode(inode); u64 to; int wrbuffer_refs, finish = 0; @@ -1926,7 +2201,8 @@ void __ceph_do_pending_vmtruncate(struct inode *inode) retry: spin_lock(&ci->i_ceph_lock); if (ci->i_truncate_pending == 0) { - dout("__do_pending_vmtruncate %p none pending\n", inode); + doutc(cl, "%p %llx.%llx none pending\n", inode, + ceph_vinop(inode)); spin_unlock(&ci->i_ceph_lock); mutex_unlock(&ci->i_truncate_mutex); return; @@ -1938,8 +2214,8 @@ retry: */ if (ci->i_wrbuffer_ref_head < ci->i_wrbuffer_ref) { spin_unlock(&ci->i_ceph_lock); - dout("__do_pending_vmtruncate %p flushing snaps first\n", - inode); + doutc(cl, "%p %llx.%llx flushing snaps first\n", inode, + ceph_vinop(inode)); filemap_write_and_wait_range(&inode->i_data, 0, inode->i_sb->s_maxbytes); goto retry; @@ -1948,17 +2224,17 @@ retry: /* there should be no reader or writer */ WARN_ON_ONCE(ci->i_rd_ref || ci->i_wr_ref); - to = ci->i_truncate_size; + to = ci->i_truncate_pagecache_size; wrbuffer_refs = ci->i_wrbuffer_ref; - dout("__do_pending_vmtruncate %p (%d) to %lld\n", inode, - ci->i_truncate_pending, to); + doutc(cl, "%p %llx.%llx (%d) to %lld\n", inode, ceph_vinop(inode), + ci->i_truncate_pending, to); spin_unlock(&ci->i_ceph_lock); ceph_fscache_resize(inode, to); truncate_pagecache(inode, to); spin_lock(&ci->i_ceph_lock); - if (to == ci->i_truncate_size) { + if (to == ci->i_truncate_pagecache_size) { ci->i_truncate_pending = 0; finish = 1; } @@ -1979,9 +2255,10 @@ static void ceph_inode_work(struct work_struct *work) struct ceph_inode_info *ci = container_of(work, struct ceph_inode_info, i_work); struct inode *inode = &ci->netfs.inode; + struct ceph_client *cl = ceph_inode_to_client(inode); if (test_and_clear_bit(CEPH_I_WORK_WRITEBACK, &ci->i_work_mask)) { - dout("writeback %p\n", inode); + doutc(cl, "writeback %p %llx.%llx\n", inode, ceph_vinop(inode)); filemap_fdatawrite(&inode->i_data); } if (test_and_clear_bit(CEPH_I_WORK_INVALIDATE_PAGES, &ci->i_work_mask)) @@ -1999,6 +2276,32 @@ static void ceph_inode_work(struct work_struct *work) iput(inode); } +static const char *ceph_encrypted_get_link(struct dentry *dentry, + struct inode *inode, + struct delayed_call *done) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + + if (!dentry) + return ERR_PTR(-ECHILD); + + return fscrypt_get_symlink(inode, ci->i_symlink, i_size_read(inode), + done); +} + +static int ceph_encrypted_symlink_getattr(struct mnt_idmap *idmap, + const struct path *path, + struct kstat *stat, u32 request_mask, + unsigned int query_flags) +{ + int ret; + + ret = ceph_getattr(idmap, path, stat, request_mask, query_flags); + if (ret) + return ret; + return fscrypt_symlink_getattr(path, stat); +} + /* * symlinks */ @@ -2009,20 +2312,174 @@ static const struct inode_operations ceph_symlink_iops = { .listxattr = ceph_listxattr, }; -int __ceph_setattr(struct inode *inode, struct iattr *attr) +static const struct inode_operations ceph_encrypted_symlink_iops = { + .get_link = ceph_encrypted_get_link, + .setattr = ceph_setattr, + .getattr = ceph_encrypted_symlink_getattr, + .listxattr = ceph_listxattr, +}; + +/* + * Transfer the encrypted last block to the MDS and the MDS + * will help update it when truncating a smaller size. + * + * We don't support a PAGE_SIZE that is smaller than the + * CEPH_FSCRYPT_BLOCK_SIZE. + */ +static int fill_fscrypt_truncate(struct inode *inode, + struct ceph_mds_request *req, + struct iattr *attr) +{ + struct ceph_client *cl = ceph_inode_to_client(inode); + struct ceph_inode_info *ci = ceph_inode(inode); + int boff = attr->ia_size % CEPH_FSCRYPT_BLOCK_SIZE; + loff_t pos, orig_pos = round_down(attr->ia_size, + CEPH_FSCRYPT_BLOCK_SIZE); + u64 block = orig_pos >> CEPH_FSCRYPT_BLOCK_SHIFT; + struct ceph_pagelist *pagelist = NULL; + struct kvec iov = {0}; + struct iov_iter iter; + struct page *page = NULL; + struct ceph_fscrypt_truncate_size_header header; + int retry_op = 0; + int len = CEPH_FSCRYPT_BLOCK_SIZE; + loff_t i_size = i_size_read(inode); + int got, ret, issued; + u64 objver; + + ret = __ceph_get_caps(inode, NULL, CEPH_CAP_FILE_RD, 0, -1, &got); + if (ret < 0) + return ret; + + issued = __ceph_caps_issued(ci, NULL); + + doutc(cl, "size %lld -> %lld got cap refs on %s, issued %s\n", + i_size, attr->ia_size, ceph_cap_string(got), + ceph_cap_string(issued)); + + /* Try to writeback the dirty pagecaches */ + if (issued & (CEPH_CAP_FILE_BUFFER)) { + loff_t lend = orig_pos + CEPH_FSCRYPT_BLOCK_SHIFT - 1; + + ret = filemap_write_and_wait_range(inode->i_mapping, + orig_pos, lend); + if (ret < 0) + goto out; + } + + page = __page_cache_alloc(GFP_KERNEL); + if (page == NULL) { + ret = -ENOMEM; + goto out; + } + + pagelist = ceph_pagelist_alloc(GFP_KERNEL); + if (!pagelist) { + ret = -ENOMEM; + goto out; + } + + iov.iov_base = kmap_local_page(page); + iov.iov_len = len; + iov_iter_kvec(&iter, READ, &iov, 1, len); + + pos = orig_pos; + ret = __ceph_sync_read(inode, &pos, &iter, &retry_op, &objver); + if (ret < 0) + goto out; + + /* Insert the header first */ + header.ver = 1; + header.compat = 1; + header.change_attr = cpu_to_le64(inode_peek_iversion_raw(inode)); + + /* + * Always set the block_size to CEPH_FSCRYPT_BLOCK_SIZE, + * because in MDS it may need this to do the truncate. + */ + header.block_size = cpu_to_le32(CEPH_FSCRYPT_BLOCK_SIZE); + + /* + * If we hit a hole here, we should just skip filling + * the fscrypt for the request, because once the fscrypt + * is enabled, the file will be split into many blocks + * with the size of CEPH_FSCRYPT_BLOCK_SIZE, if there + * has a hole, the hole size should be multiple of block + * size. + * + * If the Rados object doesn't exist, it will be set to 0. + */ + if (!objver) { + doutc(cl, "hit hole, ppos %lld < size %lld\n", pos, i_size); + + header.data_len = cpu_to_le32(8 + 8 + 4); + header.file_offset = 0; + ret = 0; + } else { + header.data_len = cpu_to_le32(8 + 8 + 4 + CEPH_FSCRYPT_BLOCK_SIZE); + header.file_offset = cpu_to_le64(orig_pos); + + doutc(cl, "encrypt block boff/bsize %d/%lu\n", boff, + CEPH_FSCRYPT_BLOCK_SIZE); + + /* truncate and zero out the extra contents for the last block */ + memset(iov.iov_base + boff, 0, PAGE_SIZE - boff); + + /* encrypt the last block */ + ret = ceph_fscrypt_encrypt_block_inplace(inode, page, + CEPH_FSCRYPT_BLOCK_SIZE, + 0, block, + GFP_KERNEL); + if (ret) + goto out; + } + + /* Insert the header */ + ret = ceph_pagelist_append(pagelist, &header, sizeof(header)); + if (ret) + goto out; + + if (header.block_size) { + /* Append the last block contents to pagelist */ + ret = ceph_pagelist_append(pagelist, iov.iov_base, + CEPH_FSCRYPT_BLOCK_SIZE); + if (ret) + goto out; + } + req->r_pagelist = pagelist; +out: + doutc(cl, "%p %llx.%llx size dropping cap refs on %s\n", inode, + ceph_vinop(inode), ceph_cap_string(got)); + ceph_put_cap_refs(ci, got); + if (iov.iov_base) + kunmap_local(iov.iov_base); + if (page) + __free_pages(page, 0); + if (ret && pagelist) + ceph_pagelist_release(pagelist); + return ret; +} + +int __ceph_setattr(struct mnt_idmap *idmap, struct inode *inode, + struct iattr *attr, struct ceph_iattr *cia) { struct ceph_inode_info *ci = ceph_inode(inode); unsigned int ia_valid = attr->ia_valid; struct ceph_mds_request *req; - struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc; + struct ceph_mds_client *mdsc = ceph_sb_to_fs_client(inode->i_sb)->mdsc; + struct ceph_client *cl = ceph_inode_to_client(inode); struct ceph_cap_flush *prealloc_cf; + loff_t isize = i_size_read(inode); int issued; int release = 0, dirtied = 0; int mask = 0; int err = 0; int inode_dirty_flags = 0; bool lock_snap_rwsem = false; + bool fill_fscrypt; + int truncate_retry = 20; /* The RMW will take around 50ms */ +retry: prealloc_cf = ceph_alloc_cap_flush(); if (!prealloc_cf) return -ENOMEM; @@ -2034,6 +2491,7 @@ int __ceph_setattr(struct inode *inode, struct iattr *attr) return PTR_ERR(req); } + fill_fscrypt = false; spin_lock(&ci->i_ceph_lock); issued = __ceph_caps_issued(ci, NULL); @@ -2048,41 +2506,85 @@ int __ceph_setattr(struct inode *inode, struct iattr *attr) } } - dout("setattr %p issued %s\n", inode, ceph_cap_string(issued)); + doutc(cl, "%p %llx.%llx issued %s\n", inode, ceph_vinop(inode), + ceph_cap_string(issued)); +#if IS_ENABLED(CONFIG_FS_ENCRYPTION) + if (cia && cia->fscrypt_auth) { + u32 len = ceph_fscrypt_auth_len(cia->fscrypt_auth); + + if (len > sizeof(*cia->fscrypt_auth)) { + err = -EINVAL; + spin_unlock(&ci->i_ceph_lock); + goto out; + } + + doutc(cl, "%p %llx.%llx fscrypt_auth len %u to %u)\n", inode, + ceph_vinop(inode), ci->fscrypt_auth_len, len); + + /* It should never be re-set once set */ + WARN_ON_ONCE(ci->fscrypt_auth); + + if (issued & CEPH_CAP_AUTH_EXCL) { + dirtied |= CEPH_CAP_AUTH_EXCL; + kfree(ci->fscrypt_auth); + ci->fscrypt_auth = (u8 *)cia->fscrypt_auth; + ci->fscrypt_auth_len = len; + } else if ((issued & CEPH_CAP_AUTH_SHARED) == 0 || + ci->fscrypt_auth_len != len || + memcmp(ci->fscrypt_auth, cia->fscrypt_auth, len)) { + req->r_fscrypt_auth = cia->fscrypt_auth; + mask |= CEPH_SETATTR_FSCRYPT_AUTH; + release |= CEPH_CAP_AUTH_SHARED; + } + cia->fscrypt_auth = NULL; + } +#else + if (cia && cia->fscrypt_auth) { + err = -EINVAL; + spin_unlock(&ci->i_ceph_lock); + goto out; + } +#endif /* CONFIG_FS_ENCRYPTION */ if (ia_valid & ATTR_UID) { - dout("setattr %p uid %d -> %d\n", inode, - from_kuid(&init_user_ns, inode->i_uid), - from_kuid(&init_user_ns, attr->ia_uid)); + kuid_t fsuid = from_vfsuid(idmap, i_user_ns(inode), attr->ia_vfsuid); + + doutc(cl, "%p %llx.%llx uid %d -> %d\n", inode, + ceph_vinop(inode), + from_kuid(&init_user_ns, inode->i_uid), + from_kuid(&init_user_ns, attr->ia_uid)); if (issued & CEPH_CAP_AUTH_EXCL) { - inode->i_uid = attr->ia_uid; + inode->i_uid = fsuid; dirtied |= CEPH_CAP_AUTH_EXCL; } else if ((issued & CEPH_CAP_AUTH_SHARED) == 0 || - !uid_eq(attr->ia_uid, inode->i_uid)) { + !uid_eq(fsuid, inode->i_uid)) { req->r_args.setattr.uid = cpu_to_le32( - from_kuid(&init_user_ns, attr->ia_uid)); + from_kuid(&init_user_ns, fsuid)); mask |= CEPH_SETATTR_UID; release |= CEPH_CAP_AUTH_SHARED; } } if (ia_valid & ATTR_GID) { - dout("setattr %p gid %d -> %d\n", inode, - from_kgid(&init_user_ns, inode->i_gid), - from_kgid(&init_user_ns, attr->ia_gid)); + kgid_t fsgid = from_vfsgid(idmap, i_user_ns(inode), attr->ia_vfsgid); + + doutc(cl, "%p %llx.%llx gid %d -> %d\n", inode, + ceph_vinop(inode), + from_kgid(&init_user_ns, inode->i_gid), + from_kgid(&init_user_ns, attr->ia_gid)); if (issued & CEPH_CAP_AUTH_EXCL) { - inode->i_gid = attr->ia_gid; + inode->i_gid = fsgid; dirtied |= CEPH_CAP_AUTH_EXCL; } else if ((issued & CEPH_CAP_AUTH_SHARED) == 0 || - !gid_eq(attr->ia_gid, inode->i_gid)) { + !gid_eq(fsgid, inode->i_gid)) { req->r_args.setattr.gid = cpu_to_le32( - from_kgid(&init_user_ns, attr->ia_gid)); + from_kgid(&init_user_ns, fsgid)); mask |= CEPH_SETATTR_GID; release |= CEPH_CAP_AUTH_SHARED; } } if (ia_valid & ATTR_MODE) { - dout("setattr %p mode 0%o -> 0%o\n", inode, inode->i_mode, - attr->ia_mode); + doutc(cl, "%p %llx.%llx mode 0%o -> 0%o\n", inode, + ceph_vinop(inode), inode->i_mode, attr->ia_mode); if (issued & CEPH_CAP_AUTH_EXCL) { inode->i_mode = attr->ia_mode; dirtied |= CEPH_CAP_AUTH_EXCL; @@ -2096,20 +2598,23 @@ int __ceph_setattr(struct inode *inode, struct iattr *attr) } if (ia_valid & ATTR_ATIME) { - dout("setattr %p atime %lld.%ld -> %lld.%ld\n", inode, - inode->i_atime.tv_sec, inode->i_atime.tv_nsec, - attr->ia_atime.tv_sec, attr->ia_atime.tv_nsec); + struct timespec64 atime = inode_get_atime(inode); + + doutc(cl, "%p %llx.%llx atime %lld.%09ld -> %lld.%09ld\n", + inode, ceph_vinop(inode), + atime.tv_sec, atime.tv_nsec, + attr->ia_atime.tv_sec, attr->ia_atime.tv_nsec); if (issued & CEPH_CAP_FILE_EXCL) { ci->i_time_warp_seq++; - inode->i_atime = attr->ia_atime; + inode_set_atime_to_ts(inode, attr->ia_atime); dirtied |= CEPH_CAP_FILE_EXCL; } else if ((issued & CEPH_CAP_FILE_WR) && - timespec64_compare(&inode->i_atime, - &attr->ia_atime) < 0) { - inode->i_atime = attr->ia_atime; + timespec64_compare(&atime, + &attr->ia_atime) < 0) { + inode_set_atime_to_ts(inode, attr->ia_atime); dirtied |= CEPH_CAP_FILE_WR; } else if ((issued & CEPH_CAP_FILE_SHARED) == 0 || - !timespec64_equal(&inode->i_atime, &attr->ia_atime)) { + !timespec64_equal(&atime, &attr->ia_atime)) { ceph_encode_timespec64(&req->r_args.setattr.atime, &attr->ia_atime); mask |= CEPH_SETATTR_ATIME; @@ -2118,10 +2623,28 @@ int __ceph_setattr(struct inode *inode, struct iattr *attr) } } if (ia_valid & ATTR_SIZE) { - loff_t isize = i_size_read(inode); - - dout("setattr %p size %lld -> %lld\n", inode, isize, attr->ia_size); - if ((issued & CEPH_CAP_FILE_EXCL) && attr->ia_size >= isize) { + doutc(cl, "%p %llx.%llx size %lld -> %lld\n", inode, + ceph_vinop(inode), isize, attr->ia_size); + /* + * Only when the new size is smaller and not aligned to + * CEPH_FSCRYPT_BLOCK_SIZE will the RMW is needed. + */ + if (IS_ENCRYPTED(inode) && attr->ia_size < isize && + (attr->ia_size % CEPH_FSCRYPT_BLOCK_SIZE)) { + mask |= CEPH_SETATTR_SIZE; + release |= CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_EXCL | + CEPH_CAP_FILE_RD | CEPH_CAP_FILE_WR; + set_bit(CEPH_MDS_R_FSCRYPT_FILE, &req->r_req_flags); + mask |= CEPH_SETATTR_FSCRYPT_FILE; + req->r_args.setattr.size = + cpu_to_le64(round_up(attr->ia_size, + CEPH_FSCRYPT_BLOCK_SIZE)); + req->r_args.setattr.old_size = + cpu_to_le64(round_up(isize, + CEPH_FSCRYPT_BLOCK_SIZE)); + req->r_fscrypt_file = attr->ia_size; + fill_fscrypt = true; + } else if ((issued & CEPH_CAP_FILE_EXCL) && attr->ia_size >= isize) { if (attr->ia_size > isize) { i_size_write(inode, attr->ia_size); inode->i_blocks = calc_inode_blocks(attr->ia_size); @@ -2131,28 +2654,43 @@ int __ceph_setattr(struct inode *inode, struct iattr *attr) } } else if ((issued & CEPH_CAP_FILE_SHARED) == 0 || attr->ia_size != isize) { - req->r_args.setattr.size = cpu_to_le64(attr->ia_size); - req->r_args.setattr.old_size = cpu_to_le64(isize); mask |= CEPH_SETATTR_SIZE; release |= CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_EXCL | CEPH_CAP_FILE_RD | CEPH_CAP_FILE_WR; + if (IS_ENCRYPTED(inode) && attr->ia_size) { + set_bit(CEPH_MDS_R_FSCRYPT_FILE, &req->r_req_flags); + mask |= CEPH_SETATTR_FSCRYPT_FILE; + req->r_args.setattr.size = + cpu_to_le64(round_up(attr->ia_size, + CEPH_FSCRYPT_BLOCK_SIZE)); + req->r_args.setattr.old_size = + cpu_to_le64(round_up(isize, + CEPH_FSCRYPT_BLOCK_SIZE)); + req->r_fscrypt_file = attr->ia_size; + } else { + req->r_args.setattr.size = cpu_to_le64(attr->ia_size); + req->r_args.setattr.old_size = cpu_to_le64(isize); + req->r_fscrypt_file = 0; + } } } if (ia_valid & ATTR_MTIME) { - dout("setattr %p mtime %lld.%ld -> %lld.%ld\n", inode, - inode->i_mtime.tv_sec, inode->i_mtime.tv_nsec, - attr->ia_mtime.tv_sec, attr->ia_mtime.tv_nsec); + struct timespec64 mtime = inode_get_mtime(inode); + + doutc(cl, "%p %llx.%llx mtime %lld.%09ld -> %lld.%09ld\n", + inode, ceph_vinop(inode), + mtime.tv_sec, mtime.tv_nsec, + attr->ia_mtime.tv_sec, attr->ia_mtime.tv_nsec); if (issued & CEPH_CAP_FILE_EXCL) { ci->i_time_warp_seq++; - inode->i_mtime = attr->ia_mtime; + inode_set_mtime_to_ts(inode, attr->ia_mtime); dirtied |= CEPH_CAP_FILE_EXCL; } else if ((issued & CEPH_CAP_FILE_WR) && - timespec64_compare(&inode->i_mtime, - &attr->ia_mtime) < 0) { - inode->i_mtime = attr->ia_mtime; + timespec64_compare(&mtime, &attr->ia_mtime) < 0) { + inode_set_mtime_to_ts(inode, attr->ia_mtime); dirtied |= CEPH_CAP_FILE_WR; } else if ((issued & CEPH_CAP_FILE_SHARED) == 0 || - !timespec64_equal(&inode->i_mtime, &attr->ia_mtime)) { + !timespec64_equal(&mtime, &attr->ia_mtime)) { ceph_encode_timespec64(&req->r_args.setattr.mtime, &attr->ia_mtime); mask |= CEPH_SETATTR_MTIME; @@ -2165,10 +2703,12 @@ int __ceph_setattr(struct inode *inode, struct iattr *attr) if (ia_valid & ATTR_CTIME) { bool only = (ia_valid & (ATTR_SIZE|ATTR_MTIME|ATTR_ATIME| ATTR_MODE|ATTR_UID|ATTR_GID)) == 0; - dout("setattr %p ctime %lld.%ld -> %lld.%ld (%s)\n", inode, - inode->i_ctime.tv_sec, inode->i_ctime.tv_nsec, - attr->ia_ctime.tv_sec, attr->ia_ctime.tv_nsec, - only ? "ctime only" : "ignored"); + doutc(cl, "%p %llx.%llx ctime %lld.%09ld -> %lld.%09ld (%s)\n", + inode, ceph_vinop(inode), + inode_get_ctime_sec(inode), + inode_get_ctime_nsec(inode), + attr->ia_ctime.tv_sec, attr->ia_ctime.tv_nsec, + only ? "ctime only" : "ignored"); if (only) { /* * if kernel wants to dirty ctime but nothing else, @@ -2186,19 +2726,22 @@ int __ceph_setattr(struct inode *inode, struct iattr *attr) } } if (ia_valid & ATTR_FILE) - dout("setattr %p ATTR_FILE ... hrm!\n", inode); + doutc(cl, "%p %llx.%llx ATTR_FILE ... hrm!\n", inode, + ceph_vinop(inode)); if (dirtied) { inode_dirty_flags = __ceph_mark_dirty_caps(ci, dirtied, &prealloc_cf); - inode->i_ctime = attr->ia_ctime; + inode_set_ctime_to_ts(inode, attr->ia_ctime); inode_inc_iversion_raw(inode); } release &= issued; spin_unlock(&ci->i_ceph_lock); - if (lock_snap_rwsem) + if (lock_snap_rwsem) { up_read(&mdsc->snap_rwsem); + lock_snap_rwsem = false; + } if (inode_dirty_flags) __mark_inode_dirty(inode, inode_dirty_flags); @@ -2210,10 +2753,32 @@ int __ceph_setattr(struct inode *inode, struct iattr *attr) req->r_args.setattr.mask = cpu_to_le32(mask); req->r_num_caps = 1; req->r_stamp = attr->ia_ctime; + if (fill_fscrypt) { + err = fill_fscrypt_truncate(inode, req, attr); + if (err) + goto out; + } + + /* + * The truncate request will return -EAGAIN when the + * last block has been updated just before the MDS + * successfully gets the xlock for the FILE lock. To + * avoid corrupting the file contents we need to retry + * it. + */ err = ceph_mdsc_do_request(mdsc, NULL, req); + if (err == -EAGAIN && truncate_retry--) { + doutc(cl, "%p %llx.%llx result=%d (%s locally, %d remote), retry it!\n", + inode, ceph_vinop(inode), err, + ceph_cap_string(dirtied), mask); + ceph_mdsc_put_request(req); + ceph_free_cap_flush(prealloc_cf); + goto retry; + } } - dout("setattr %p result=%d (%s locally, %d remote)\n", inode, err, - ceph_cap_string(dirtied), mask); +out: + doutc(cl, "%p %llx.%llx result=%d (%s locally, %d remote)\n", inode, + ceph_vinop(inode), err, ceph_cap_string(dirtied), mask); ceph_mdsc_put_request(req); ceph_free_cap_flush(prealloc_cf); @@ -2231,7 +2796,7 @@ int ceph_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *attr) { struct inode *inode = d_inode(dentry); - struct ceph_fs_client *fsc = ceph_inode_to_client(inode); + struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode); int err; if (ceph_snap(inode) != CEPH_NOSNAP) @@ -2240,7 +2805,11 @@ int ceph_setattr(struct mnt_idmap *idmap, struct dentry *dentry, if (ceph_inode_is_shutdown(inode)) return -ESTALE; - err = setattr_prepare(&nop_mnt_idmap, dentry, attr); + err = fscrypt_prepare_setattr(dentry, attr); + if (err) + return err; + + err = setattr_prepare(idmap, dentry, attr); if (err != 0) return err; @@ -2252,10 +2821,10 @@ int ceph_setattr(struct mnt_idmap *idmap, struct dentry *dentry, ceph_quota_is_max_bytes_exceeded(inode, attr->ia_size)) return -EDQUOT; - err = __ceph_setattr(inode, attr); + err = __ceph_setattr(idmap, inode, attr, NULL); if (err >= 0 && (attr->ia_valid & ATTR_MODE)) - err = posix_acl_chmod(&nop_mnt_idmap, dentry, attr->ia_mode); + err = posix_acl_chmod(idmap, dentry, attr->ia_mode); return err; } @@ -2297,19 +2866,21 @@ int ceph_try_to_choose_auth_mds(struct inode *inode, int mask) int __ceph_do_getattr(struct inode *inode, struct page *locked_page, int mask, bool force) { - struct ceph_fs_client *fsc = ceph_sb_to_client(inode->i_sb); + struct ceph_fs_client *fsc = ceph_sb_to_fs_client(inode->i_sb); + struct ceph_client *cl = fsc->client; struct ceph_mds_client *mdsc = fsc->mdsc; struct ceph_mds_request *req; int mode; int err; if (ceph_snap(inode) == CEPH_SNAPDIR) { - dout("do_getattr inode %p SNAPDIR\n", inode); + doutc(cl, "inode %p %llx.%llx SNAPDIR\n", inode, + ceph_vinop(inode)); return 0; } - dout("do_getattr inode %p mask %s mode 0%o\n", - inode, ceph_cap_string(mask), inode->i_mode); + doutc(cl, "inode %p %llx.%llx mask %s mode 0%o\n", inode, + ceph_vinop(inode), ceph_cap_string(mask), inode->i_mode); if (!force && ceph_caps_issued_mask_metric(ceph_inode(inode), mask, 1)) return 0; @@ -2336,14 +2907,15 @@ int __ceph_do_getattr(struct inode *inode, struct page *locked_page, } } ceph_mdsc_put_request(req); - dout("do_getattr result=%d\n", err); + doutc(cl, "result=%d\n", err); return err; } int ceph_do_getvxattr(struct inode *inode, const char *name, void *value, size_t size) { - struct ceph_fs_client *fsc = ceph_sb_to_client(inode->i_sb); + struct ceph_fs_client *fsc = ceph_sb_to_fs_client(inode->i_sb); + struct ceph_client *cl = fsc->client; struct ceph_mds_client *mdsc = fsc->mdsc; struct ceph_mds_request *req; int mode = USE_AUTH_MDS; @@ -2373,7 +2945,7 @@ int ceph_do_getvxattr(struct inode *inode, const char *name, void *value, xattr_value = req->r_reply_info.xattr_info.xattr_value; xattr_value_len = req->r_reply_info.xattr_info.xattr_value_len; - dout("do_getvxattr xattr_value_len:%zu, size:%zu\n", xattr_value_len, size); + doutc(cl, "xattr_value_len:%zu, size:%zu\n", xattr_value_len, size); err = (int)xattr_value_len; if (size == 0) @@ -2388,7 +2960,7 @@ int ceph_do_getvxattr(struct inode *inode, const char *name, void *value, put: ceph_mdsc_put_request(req); out: - dout("do_getvxattr result=%d\n", err); + doutc(cl, "result=%d\n", err); return err; } @@ -2408,7 +2980,7 @@ int ceph_permission(struct mnt_idmap *idmap, struct inode *inode, err = ceph_do_getattr(inode, CEPH_CAP_AUTH_SHARED, false); if (!err) - err = generic_permission(&nop_mnt_idmap, inode, mask); + err = generic_permission(idmap, inode, mask); return err; } @@ -2465,7 +3037,7 @@ int ceph_getattr(struct mnt_idmap *idmap, const struct path *path, return err; } - generic_fillattr(&nop_mnt_idmap, inode, stat); + generic_fillattr(idmap, request_mask, inode, stat); stat->ino = ceph_present_inode(inode); /* @@ -2488,7 +3060,7 @@ int ceph_getattr(struct mnt_idmap *idmap, const struct path *path, stat->dev = ci->i_snapid_map ? ci->i_snapid_map->dev : 0; if (S_ISDIR(inode->i_mode)) { - if (ceph_test_mount_opt(ceph_sb_to_client(sb), RBYTES)) { + if (ceph_test_mount_opt(ceph_sb_to_fs_client(sb), RBYTES)) { stat->size = ci->i_rbytes; } else if (ceph_snap(inode) == CEPH_SNAPDIR) { struct ceph_inode_info *pci; @@ -2523,8 +3095,12 @@ int ceph_getattr(struct mnt_idmap *idmap, const struct path *path, stat->nlink = 1 + 1 + ci->i_subdirs; } - stat->attributes_mask |= STATX_ATTR_CHANGE_MONOTONIC; stat->attributes |= STATX_ATTR_CHANGE_MONOTONIC; + if (IS_ENCRYPTED(inode)) + stat->attributes |= STATX_ATTR_ENCRYPTED; + stat->attributes_mask |= (STATX_ATTR_CHANGE_MONOTONIC | + STATX_ATTR_ENCRYPTED); + stat->result_mask = request_mask & valid_mask; return err; } diff --git a/fs/ceph/ioctl.c b/fs/ceph/ioctl.c index deac817647eb..e861de3c79b9 100644 --- a/fs/ceph/ioctl.c +++ b/fs/ceph/ioctl.c @@ -6,6 +6,7 @@ #include "mds_client.h" #include "ioctl.h" #include <linux/ceph/striper.h> +#include <linux/fscrypt.h> /* * ioctls @@ -64,7 +65,7 @@ static long __validate_layout(struct ceph_mds_client *mdsc, static long ceph_ioctl_set_layout(struct file *file, void __user *arg) { struct inode *inode = file_inode(file); - struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc; + struct ceph_mds_client *mdsc = ceph_sb_to_fs_client(inode->i_sb)->mdsc; struct ceph_mds_request *req; struct ceph_ioctl_layout l; struct ceph_inode_info *ci = ceph_inode(file_inode(file)); @@ -139,7 +140,7 @@ static long ceph_ioctl_set_layout_policy (struct file *file, void __user *arg) struct ceph_mds_request *req; struct ceph_ioctl_layout l; int err; - struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc; + struct ceph_mds_client *mdsc = ceph_sb_to_fs_client(inode->i_sb)->mdsc; /* copy and validate */ if (copy_from_user(&l, arg, sizeof(l))) @@ -182,7 +183,7 @@ static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg) struct inode *inode = file_inode(file); struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_osd_client *osdc = - &ceph_sb_to_client(inode->i_sb)->client->osdc; + &ceph_sb_to_fs_client(inode->i_sb)->client->osdc; struct ceph_object_locator oloc; CEPH_DEFINE_OID_ONSTACK(oid); u32 xlen; @@ -243,7 +244,8 @@ static long ceph_ioctl_lazyio(struct file *file) struct ceph_file_info *fi = file->private_data; struct inode *inode = file_inode(file); struct ceph_inode_info *ci = ceph_inode(inode); - struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc; + struct ceph_mds_client *mdsc = ceph_inode_to_fs_client(inode)->mdsc; + struct ceph_client *cl = mdsc->fsc->client; if ((fi->fmode & CEPH_FILE_MODE_LAZY) == 0) { spin_lock(&ci->i_ceph_lock); @@ -251,11 +253,13 @@ static long ceph_ioctl_lazyio(struct file *file) ci->i_nr_by_mode[ffs(CEPH_FILE_MODE_LAZY)]++; __ceph_touch_fmode(ci, mdsc, fi->fmode); spin_unlock(&ci->i_ceph_lock); - dout("ioctl_layzio: file %p marked lazy\n", file); + doutc(cl, "file %p %p %llx.%llx marked lazy\n", file, inode, + ceph_vinop(inode)); ceph_check_caps(ci, 0); } else { - dout("ioctl_layzio: file %p already lazy\n", file); + doutc(cl, "file %p %p %llx.%llx already lazy\n", file, inode, + ceph_vinop(inode)); } return 0; } @@ -268,9 +272,98 @@ static long ceph_ioctl_syncio(struct file *file) return 0; } +static int vet_mds_for_fscrypt(struct file *file) +{ + int i, ret = -EOPNOTSUPP; + struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(file_inode(file)->i_sb); + + mutex_lock(&mdsc->mutex); + for (i = 0; i < mdsc->max_sessions; i++) { + struct ceph_mds_session *s = mdsc->sessions[i]; + + if (!s) + continue; + if (test_bit(CEPHFS_FEATURE_ALTERNATE_NAME, &s->s_features)) + ret = 0; + break; + } + mutex_unlock(&mdsc->mutex); + return ret; +} + +static long ceph_set_encryption_policy(struct file *file, unsigned long arg) +{ + int ret, got = 0; + struct inode *inode = file_inode(file); + struct ceph_inode_info *ci = ceph_inode(inode); + + /* encrypted directories can't have striped layout */ + if (ci->i_layout.stripe_count > 1) + return -EINVAL; + + ret = vet_mds_for_fscrypt(file); + if (ret) + return ret; + + /* + * Ensure we hold these caps so that we _know_ that the rstats check + * in the empty_dir check is reliable. + */ + ret = ceph_get_caps(file, CEPH_CAP_FILE_SHARED, 0, -1, &got); + if (ret) + return ret; + + ret = fscrypt_ioctl_set_policy(file, (const void __user *)arg); + if (got) + ceph_put_cap_refs(ci, got); + + return ret; +} + +static const char *ceph_ioctl_cmd_name(const unsigned int cmd) +{ + switch (cmd) { + case CEPH_IOC_GET_LAYOUT: + return "get_layout"; + case CEPH_IOC_SET_LAYOUT: + return "set_layout"; + case CEPH_IOC_SET_LAYOUT_POLICY: + return "set_layout_policy"; + case CEPH_IOC_GET_DATALOC: + return "get_dataloc"; + case CEPH_IOC_LAZYIO: + return "lazyio"; + case CEPH_IOC_SYNCIO: + return "syncio"; + case FS_IOC_SET_ENCRYPTION_POLICY: + return "set_encryption_policy"; + case FS_IOC_GET_ENCRYPTION_POLICY: + return "get_encryption_policy"; + case FS_IOC_GET_ENCRYPTION_POLICY_EX: + return "get_encryption_policy_ex"; + case FS_IOC_ADD_ENCRYPTION_KEY: + return "add_encryption_key"; + case FS_IOC_REMOVE_ENCRYPTION_KEY: + return "remove_encryption_key"; + case FS_IOC_REMOVE_ENCRYPTION_KEY_ALL_USERS: + return "remove_encryption_key_all_users"; + case FS_IOC_GET_ENCRYPTION_KEY_STATUS: + return "get_encryption_key_status"; + case FS_IOC_GET_ENCRYPTION_NONCE: + return "get_encryption_nonce"; + default: + return "unknown"; + } +} + long ceph_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { - dout("ioctl file %p cmd %u arg %lu\n", file, cmd, arg); + struct inode *inode = file_inode(file); + struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode); + int ret; + + doutc(fsc->client, "file %p %p %llx.%llx cmd %s arg %lu\n", file, + inode, ceph_vinop(inode), ceph_ioctl_cmd_name(cmd), arg); switch (cmd) { case CEPH_IOC_GET_LAYOUT: return ceph_ioctl_get_layout(file, (void __user *)arg); @@ -289,6 +382,43 @@ long ceph_ioctl(struct file *file, unsigned int cmd, unsigned long arg) case CEPH_IOC_SYNCIO: return ceph_ioctl_syncio(file); + + case FS_IOC_SET_ENCRYPTION_POLICY: + return ceph_set_encryption_policy(file, arg); + + case FS_IOC_GET_ENCRYPTION_POLICY: + ret = vet_mds_for_fscrypt(file); + if (ret) + return ret; + return fscrypt_ioctl_get_policy(file, (void __user *)arg); + + case FS_IOC_GET_ENCRYPTION_POLICY_EX: + ret = vet_mds_for_fscrypt(file); + if (ret) + return ret; + return fscrypt_ioctl_get_policy_ex(file, (void __user *)arg); + + case FS_IOC_ADD_ENCRYPTION_KEY: + ret = vet_mds_for_fscrypt(file); + if (ret) + return ret; + return fscrypt_ioctl_add_key(file, (void __user *)arg); + + case FS_IOC_REMOVE_ENCRYPTION_KEY: + return fscrypt_ioctl_remove_key(file, (void __user *)arg); + + case FS_IOC_REMOVE_ENCRYPTION_KEY_ALL_USERS: + return fscrypt_ioctl_remove_key_all_users(file, + (void __user *)arg); + + case FS_IOC_GET_ENCRYPTION_KEY_STATUS: + return fscrypt_ioctl_get_key_status(file, (void __user *)arg); + + case FS_IOC_GET_ENCRYPTION_NONCE: + ret = vet_mds_for_fscrypt(file); + if (ret) + return ret; + return fscrypt_ioctl_get_nonce(file, (void __user *)arg); } return -ENOTTY; diff --git a/fs/ceph/locks.c b/fs/ceph/locks.c index cb51c7e9c8e2..e07ad29ff8b9 100644 --- a/fs/ceph/locks.c +++ b/fs/ceph/locks.c @@ -77,6 +77,7 @@ static int ceph_lock_message(u8 lock_type, u16 operation, struct inode *inode, int cmd, u8 wait, struct file_lock *fl) { struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb); + struct ceph_client *cl = mdsc->fsc->client; struct ceph_mds_request *req; int err; u64 length = 0; @@ -111,10 +112,10 @@ static int ceph_lock_message(u8 lock_type, u16 operation, struct inode *inode, owner = secure_addr(fl->fl_owner); - dout("ceph_lock_message: rule: %d, op: %d, owner: %llx, pid: %llu, " - "start: %llu, length: %llu, wait: %d, type: %d\n", (int)lock_type, - (int)operation, owner, (u64)fl->fl_pid, fl->fl_start, length, - wait, fl->fl_type); + doutc(cl, "rule: %d, op: %d, owner: %llx, pid: %llu, " + "start: %llu, length: %llu, wait: %d, type: %d\n", + (int)lock_type, (int)operation, owner, (u64)fl->fl_pid, + fl->fl_start, length, wait, fl->fl_type); req->r_args.filelock_change.rule = lock_type; req->r_args.filelock_change.type = cmd; @@ -147,16 +148,17 @@ static int ceph_lock_message(u8 lock_type, u16 operation, struct inode *inode, } ceph_mdsc_put_request(req); - dout("ceph_lock_message: rule: %d, op: %d, pid: %llu, start: %llu, " - "length: %llu, wait: %d, type: %d, err code %d\n", (int)lock_type, - (int)operation, (u64)fl->fl_pid, fl->fl_start, - length, wait, fl->fl_type, err); + doutc(cl, "rule: %d, op: %d, pid: %llu, start: %llu, " + "length: %llu, wait: %d, type: %d, err code %d\n", + (int)lock_type, (int)operation, (u64)fl->fl_pid, + fl->fl_start, length, wait, fl->fl_type, err); return err; } static int ceph_lock_wait_for_completion(struct ceph_mds_client *mdsc, struct ceph_mds_request *req) { + struct ceph_client *cl = mdsc->fsc->client; struct ceph_mds_request *intr_req; struct inode *inode = req->r_inode; int err, lock_type; @@ -174,8 +176,7 @@ static int ceph_lock_wait_for_completion(struct ceph_mds_client *mdsc, if (!err) return 0; - dout("ceph_lock_wait_for_completion: request %llu was interrupted\n", - req->r_tid); + doutc(cl, "request %llu was interrupted\n", req->r_tid); mutex_lock(&mdsc->mutex); if (test_bit(CEPH_MDS_R_GOT_RESULT, &req->r_req_flags)) { @@ -246,6 +247,7 @@ int ceph_lock(struct file *file, int cmd, struct file_lock *fl) { struct inode *inode = file_inode(file); struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_client *cl = ceph_inode_to_client(inode); int err = 0; u16 op = CEPH_MDS_OP_SETFILELOCK; u8 wait = 0; @@ -257,7 +259,7 @@ int ceph_lock(struct file *file, int cmd, struct file_lock *fl) if (ceph_inode_is_shutdown(inode)) return -ESTALE; - dout("ceph_lock, fl_owner: %p\n", fl->fl_owner); + doutc(cl, "fl_owner: %p\n", fl->fl_owner); /* set wait bit as appropriate, then make command as Ceph expects it*/ if (IS_GETLK(cmd)) @@ -292,7 +294,7 @@ int ceph_lock(struct file *file, int cmd, struct file_lock *fl) err = ceph_lock_message(CEPH_LOCK_FCNTL, op, inode, lock_cmd, wait, fl); if (!err) { if (op == CEPH_MDS_OP_SETFILELOCK && F_UNLCK != fl->fl_type) { - dout("mds locked, locking locally\n"); + doutc(cl, "locking locally\n"); err = posix_lock_file(file, fl, NULL); if (err) { /* undo! This should only happen if @@ -300,8 +302,8 @@ int ceph_lock(struct file *file, int cmd, struct file_lock *fl) * deadlock. */ ceph_lock_message(CEPH_LOCK_FCNTL, op, inode, CEPH_LOCK_UNLOCK, 0, fl); - dout("got %d on posix_lock_file, undid lock\n", - err); + doutc(cl, "got %d on posix_lock_file, undid lock\n", + err); } } } @@ -312,6 +314,7 @@ int ceph_flock(struct file *file, int cmd, struct file_lock *fl) { struct inode *inode = file_inode(file); struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_client *cl = ceph_inode_to_client(inode); int err = 0; u8 wait = 0; u8 lock_cmd; @@ -322,7 +325,7 @@ int ceph_flock(struct file *file, int cmd, struct file_lock *fl) if (ceph_inode_is_shutdown(inode)) return -ESTALE; - dout("ceph_flock, fl_file: %p\n", fl->fl_file); + doutc(cl, "fl_file: %p\n", fl->fl_file); spin_lock(&ci->i_ceph_lock); if (ci->i_ceph_flags & CEPH_I_ERROR_FILELOCK) { @@ -359,7 +362,8 @@ int ceph_flock(struct file *file, int cmd, struct file_lock *fl) ceph_lock_message(CEPH_LOCK_FLOCK, CEPH_MDS_OP_SETFILELOCK, inode, CEPH_LOCK_UNLOCK, 0, fl); - dout("got %d on locks_lock_file_wait, undid lock\n", err); + doutc(cl, "got %d on locks_lock_file_wait, undid lock\n", + err); } } return err; @@ -371,6 +375,7 @@ int ceph_flock(struct file *file, int cmd, struct file_lock *fl) */ void ceph_count_locks(struct inode *inode, int *fcntl_count, int *flock_count) { + struct ceph_client *cl = ceph_inode_to_client(inode); struct file_lock *lock; struct file_lock_context *ctx; @@ -386,17 +391,20 @@ void ceph_count_locks(struct inode *inode, int *fcntl_count, int *flock_count) ++(*flock_count); spin_unlock(&ctx->flc_lock); } - dout("counted %d flock locks and %d fcntl locks\n", - *flock_count, *fcntl_count); + doutc(cl, "counted %d flock locks and %d fcntl locks\n", + *flock_count, *fcntl_count); } /* * Given a pointer to a lock, convert it to a ceph filelock */ -static int lock_to_ceph_filelock(struct file_lock *lock, +static int lock_to_ceph_filelock(struct inode *inode, + struct file_lock *lock, struct ceph_filelock *cephlock) { + struct ceph_client *cl = ceph_inode_to_client(inode); int err = 0; + cephlock->start = cpu_to_le64(lock->fl_start); cephlock->length = cpu_to_le64(lock->fl_end - lock->fl_start + 1); cephlock->client = cpu_to_le64(0); @@ -414,7 +422,7 @@ static int lock_to_ceph_filelock(struct file_lock *lock, cephlock->type = CEPH_LOCK_UNLOCK; break; default: - dout("Have unknown lock type %d\n", lock->fl_type); + doutc(cl, "Have unknown lock type %d\n", lock->fl_type); err = -EINVAL; } @@ -432,13 +440,14 @@ int ceph_encode_locks_to_buffer(struct inode *inode, { struct file_lock *lock; struct file_lock_context *ctx = locks_inode_context(inode); + struct ceph_client *cl = ceph_inode_to_client(inode); int err = 0; int seen_fcntl = 0; int seen_flock = 0; int l = 0; - dout("encoding %d flock and %d fcntl locks\n", num_flock_locks, - num_fcntl_locks); + doutc(cl, "encoding %d flock and %d fcntl locks\n", num_flock_locks, + num_fcntl_locks); if (!ctx) return 0; @@ -450,7 +459,7 @@ int ceph_encode_locks_to_buffer(struct inode *inode, err = -ENOSPC; goto fail; } - err = lock_to_ceph_filelock(lock, &flocks[l]); + err = lock_to_ceph_filelock(inode, lock, &flocks[l]); if (err) goto fail; ++l; @@ -461,7 +470,7 @@ int ceph_encode_locks_to_buffer(struct inode *inode, err = -ENOSPC; goto fail; } - err = lock_to_ceph_filelock(lock, &flocks[l]); + err = lock_to_ceph_filelock(inode, lock, &flocks[l]); if (err) goto fail; ++l; diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 5fb367b1d4b0..d95eb525519a 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -12,9 +12,11 @@ #include <linux/bits.h> #include <linux/ktime.h> #include <linux/bitmap.h> +#include <linux/mnt_idmapping.h> #include "super.h" #include "mds_client.h" +#include "crypto.h" #include <linux/ceph/ceph_features.h> #include <linux/ceph/messenger.h> @@ -184,8 +186,54 @@ static int parse_reply_info_in(void **p, void *end, info->rsnaps = 0; } + if (struct_v >= 5) { + u32 alen; + + ceph_decode_32_safe(p, end, alen, bad); + + while (alen--) { + u32 len; + + /* key */ + ceph_decode_32_safe(p, end, len, bad); + ceph_decode_skip_n(p, end, len, bad); + /* value */ + ceph_decode_32_safe(p, end, len, bad); + ceph_decode_skip_n(p, end, len, bad); + } + } + + /* fscrypt flag -- ignore */ + if (struct_v >= 6) + ceph_decode_skip_8(p, end, bad); + + info->fscrypt_auth = NULL; + info->fscrypt_auth_len = 0; + info->fscrypt_file = NULL; + info->fscrypt_file_len = 0; + if (struct_v >= 7) { + ceph_decode_32_safe(p, end, info->fscrypt_auth_len, bad); + if (info->fscrypt_auth_len) { + info->fscrypt_auth = kmalloc(info->fscrypt_auth_len, + GFP_KERNEL); + if (!info->fscrypt_auth) + return -ENOMEM; + ceph_decode_copy_safe(p, end, info->fscrypt_auth, + info->fscrypt_auth_len, bad); + } + ceph_decode_32_safe(p, end, info->fscrypt_file_len, bad); + if (info->fscrypt_file_len) { + info->fscrypt_file = kmalloc(info->fscrypt_file_len, + GFP_KERNEL); + if (!info->fscrypt_file) + return -ENOMEM; + ceph_decode_copy_safe(p, end, info->fscrypt_file, + info->fscrypt_file_len, bad); + } + } *p = end; } else { + /* legacy (unversioned) struct */ if (features & CEPH_FEATURE_MDS_INLINE_DATA) { ceph_decode_64_safe(p, end, info->inline_version, bad); ceph_decode_32_safe(p, end, info->inline_len, bad); @@ -263,27 +311,47 @@ bad: static int parse_reply_info_lease(void **p, void *end, struct ceph_mds_reply_lease **lease, - u64 features) + u64 features, u32 *altname_len, u8 **altname) { + u8 struct_v; + u32 struct_len; + void *lend; + if (features == (u64)-1) { - u8 struct_v, struct_compat; - u32 struct_len; + u8 struct_compat; + ceph_decode_8_safe(p, end, struct_v, bad); ceph_decode_8_safe(p, end, struct_compat, bad); + /* struct_v is expected to be >= 1. we only understand * encoding whose struct_compat == 1. */ if (!struct_v || struct_compat != 1) goto bad; + ceph_decode_32_safe(p, end, struct_len, bad); - ceph_decode_need(p, end, struct_len, bad); - end = *p + struct_len; + } else { + struct_len = sizeof(**lease); + *altname_len = 0; + *altname = NULL; } - ceph_decode_need(p, end, sizeof(**lease), bad); + lend = *p + struct_len; + ceph_decode_need(p, end, struct_len, bad); *lease = *p; *p += sizeof(**lease); - if (features == (u64)-1) - *p = end; + + if (features == (u64)-1) { + if (struct_v >= 2) { + ceph_decode_32_safe(p, end, *altname_len, bad); + ceph_decode_need(p, end, *altname_len, bad); + *altname = *p; + *p += *altname_len; + } else { + *altname = NULL; + *altname_len = 0; + } + } + *p = lend; return 0; bad: return -EIO; @@ -313,7 +381,8 @@ static int parse_reply_info_trace(void **p, void *end, info->dname = *p; *p += info->dname_len; - err = parse_reply_info_lease(p, end, &info->dlease, features); + err = parse_reply_info_lease(p, end, &info->dlease, features, + &info->altname_len, &info->altname); if (err < 0) goto out_bad; } @@ -339,9 +408,11 @@ out_bad: * parse readdir results */ static int parse_reply_info_readdir(void **p, void *end, - struct ceph_mds_reply_info_parsed *info, - u64 features) + struct ceph_mds_request *req, + u64 features) { + struct ceph_mds_reply_info_parsed *info = &req->r_reply_info; + struct ceph_client *cl = req->r_mdsc->fsc->client; u32 num, i = 0; int err; @@ -364,25 +435,94 @@ static int parse_reply_info_readdir(void **p, void *end, BUG_ON(!info->dir_entries); if ((unsigned long)(info->dir_entries + num) > (unsigned long)info->dir_entries + info->dir_buf_size) { - pr_err("dir contents are larger than expected\n"); + pr_err_client(cl, "dir contents are larger than expected\n"); WARN_ON(1); goto bad; } info->dir_nr = num; while (num) { + struct inode *inode = d_inode(req->r_dentry); + struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_mds_reply_dir_entry *rde = info->dir_entries + i; + struct fscrypt_str tname = FSTR_INIT(NULL, 0); + struct fscrypt_str oname = FSTR_INIT(NULL, 0); + struct ceph_fname fname; + u32 altname_len, _name_len; + u8 *altname, *_name; + /* dentry */ - ceph_decode_32_safe(p, end, rde->name_len, bad); - ceph_decode_need(p, end, rde->name_len, bad); - rde->name = *p; - *p += rde->name_len; - dout("parsed dir dname '%.*s'\n", rde->name_len, rde->name); + ceph_decode_32_safe(p, end, _name_len, bad); + ceph_decode_need(p, end, _name_len, bad); + _name = *p; + *p += _name_len; + doutc(cl, "parsed dir dname '%.*s'\n", _name_len, _name); + + if (info->hash_order) + rde->raw_hash = ceph_str_hash(ci->i_dir_layout.dl_dir_hash, + _name, _name_len); /* dentry lease */ - err = parse_reply_info_lease(p, end, &rde->lease, features); + err = parse_reply_info_lease(p, end, &rde->lease, features, + &altname_len, &altname); if (err) goto out_bad; + + /* + * Try to dencrypt the dentry names and update them + * in the ceph_mds_reply_dir_entry struct. + */ + fname.dir = inode; + fname.name = _name; + fname.name_len = _name_len; + fname.ctext = altname; + fname.ctext_len = altname_len; + /* + * The _name_len maybe larger than altname_len, such as + * when the human readable name length is in range of + * (CEPH_NOHASH_NAME_MAX, CEPH_NOHASH_NAME_MAX + SHA256_DIGEST_SIZE), + * then the copy in ceph_fname_to_usr will corrupt the + * data if there has no encryption key. + * + * Just set the no_copy flag and then if there has no + * encryption key the oname.name will be assigned to + * _name always. + */ + fname.no_copy = true; + if (altname_len == 0) { + /* + * Set tname to _name, and this will be used + * to do the base64_decode in-place. It's + * safe because the decoded string should + * always be shorter, which is 3/4 of origin + * string. + */ + tname.name = _name; + + /* + * Set oname to _name too, and this will be + * used to do the dencryption in-place. + */ + oname.name = _name; + oname.len = _name_len; + } else { + /* + * This will do the decryption only in-place + * from altname cryptext directly. + */ + oname.name = altname; + oname.len = altname_len; + } + rde->is_nokey = false; + err = ceph_fname_to_usr(&fname, &tname, &oname, &rde->is_nokey); + if (err) { + pr_err_client(cl, "unable to decode %.*s, got %d\n", + _name_len, _name, err); + goto out_bad; + } + rde->name = oname.name; + rde->name_len = oname.len; + /* inode */ err = parse_reply_info_in(p, end, &rde->inode, features); if (err < 0) @@ -401,7 +541,7 @@ done: bad: err = -EIO; out_bad: - pr_err("problem parsing dir contents %d\n", err); + pr_err_client(cl, "problem parsing dir contents %d\n", err); return err; } @@ -432,10 +572,11 @@ bad: static int ceph_parse_deleg_inos(void **p, void *end, struct ceph_mds_session *s) { + struct ceph_client *cl = s->s_mdsc->fsc->client; u32 sets; ceph_decode_32_safe(p, end, sets, bad); - dout("got %u sets of delegated inodes\n", sets); + doutc(cl, "got %u sets of delegated inodes\n", sets); while (sets--) { u64 start, len; @@ -444,8 +585,9 @@ static int ceph_parse_deleg_inos(void **p, void *end, /* Don't accept a delegation of system inodes */ if (start < CEPH_INO_SYSTEM_BASE) { - pr_warn_ratelimited("ceph: ignoring reserved inode range delegation (start=0x%llx len=0x%llx)\n", - start, len); + pr_warn_ratelimited_client(cl, + "ignoring reserved inode range delegation (start=0x%llx len=0x%llx)\n", + start, len); continue; } while (len--) { @@ -453,10 +595,10 @@ static int ceph_parse_deleg_inos(void **p, void *end, DELEGATED_INO_AVAILABLE, GFP_KERNEL); if (!err) { - dout("added delegated inode 0x%llx\n", - start - 1); + doutc(cl, "added delegated inode 0x%llx\n", start - 1); } else if (err == -EBUSY) { - pr_warn("MDS delegated inode 0x%llx more than once.\n", + pr_warn_client(cl, + "MDS delegated inode 0x%llx more than once.\n", start - 1); } else { return err; @@ -581,15 +723,16 @@ bad: * parse extra results */ static int parse_reply_info_extra(void **p, void *end, - struct ceph_mds_reply_info_parsed *info, + struct ceph_mds_request *req, u64 features, struct ceph_mds_session *s) { + struct ceph_mds_reply_info_parsed *info = &req->r_reply_info; u32 op = le32_to_cpu(info->head->op); if (op == CEPH_MDS_OP_GETFILELOCK) return parse_reply_info_filelock(p, end, info, features); else if (op == CEPH_MDS_OP_READDIR || op == CEPH_MDS_OP_LSSNAP) - return parse_reply_info_readdir(p, end, info, features); + return parse_reply_info_readdir(p, end, req, features); else if (op == CEPH_MDS_OP_CREATE) return parse_reply_info_create(p, end, info, features, s); else if (op == CEPH_MDS_OP_GETVXATTR) @@ -602,9 +745,10 @@ static int parse_reply_info_extra(void **p, void *end, * parse entire mds reply */ static int parse_reply_info(struct ceph_mds_session *s, struct ceph_msg *msg, - struct ceph_mds_reply_info_parsed *info, - u64 features) + struct ceph_mds_request *req, u64 features) { + struct ceph_mds_reply_info_parsed *info = &req->r_reply_info; + struct ceph_client *cl = s->s_mdsc->fsc->client; void *p, *end; u32 len; int err; @@ -626,7 +770,7 @@ static int parse_reply_info(struct ceph_mds_session *s, struct ceph_msg *msg, ceph_decode_32_safe(&p, end, len, bad); if (len > 0) { ceph_decode_need(&p, end, len, bad); - err = parse_reply_info_extra(&p, p+len, info, features, s); + err = parse_reply_info_extra(&p, p+len, req, features, s); if (err < 0) goto out_bad; } @@ -644,15 +788,28 @@ static int parse_reply_info(struct ceph_mds_session *s, struct ceph_msg *msg, bad: err = -EIO; out_bad: - pr_err("mds parse_reply err %d\n", err); + pr_err_client(cl, "mds parse_reply err %d\n", err); ceph_msg_dump(msg); return err; } static void destroy_reply_info(struct ceph_mds_reply_info_parsed *info) { + int i; + + kfree(info->diri.fscrypt_auth); + kfree(info->diri.fscrypt_file); + kfree(info->targeti.fscrypt_auth); + kfree(info->targeti.fscrypt_file); if (!info->dir_entries) return; + + for (i = 0; i < info->dir_nr; i++) { + struct ceph_mds_reply_dir_entry *rde = info->dir_entries + i; + + kfree(rde->inode.fscrypt_auth); + kfree(rde->inode.fscrypt_file); + } free_pages((unsigned long)info->dir_entries, get_order(info->dir_buf_size)); } @@ -678,7 +835,8 @@ static void destroy_reply_info(struct ceph_mds_reply_info_parsed *info) */ int ceph_wait_on_conflict_unlink(struct dentry *dentry) { - struct ceph_fs_client *fsc = ceph_sb_to_client(dentry->d_sb); + struct ceph_fs_client *fsc = ceph_sb_to_fs_client(dentry->d_sb); + struct ceph_client *cl = fsc->client; struct dentry *pdentry = dentry->d_parent; struct dentry *udentry, *found = NULL; struct ceph_dentry_info *di; @@ -703,14 +861,14 @@ int ceph_wait_on_conflict_unlink(struct dentry *dentry) goto next; if (!test_bit(CEPH_DENTRY_ASYNC_UNLINK_BIT, &di->flags)) - pr_warn("%s dentry %p:%pd async unlink bit is not set\n", - __func__, dentry, dentry); + pr_warn_client(cl, "dentry %p:%pd async unlink bit is not set\n", + dentry, dentry); if (!d_same_name(udentry, pdentry, &dname)) goto next; + found = dget_dlock(udentry); spin_unlock(&udentry->d_lock); - found = dget(udentry); break; next: spin_unlock(&udentry->d_lock); @@ -720,8 +878,8 @@ next: if (likely(!found)) return 0; - dout("%s dentry %p:%pd conflict with old %p:%pd\n", __func__, - dentry, dentry, found, found); + doutc(cl, "dentry %p:%pd conflict with old %p:%pd\n", dentry, dentry, + found, found); err = wait_on_bit(&di->flags, CEPH_DENTRY_ASYNC_UNLINK_BIT, TASK_KILLABLE); @@ -805,6 +963,7 @@ static int __verify_registered_session(struct ceph_mds_client *mdsc, static struct ceph_mds_session *register_session(struct ceph_mds_client *mdsc, int mds) { + struct ceph_client *cl = mdsc->fsc->client; struct ceph_mds_session *s; if (READ_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_FENCE_IO) @@ -821,7 +980,7 @@ static struct ceph_mds_session *register_session(struct ceph_mds_client *mdsc, int newmax = 1 << get_count_order(mds + 1); struct ceph_mds_session **sa; - dout("%s: realloc to %d\n", __func__, newmax); + doutc(cl, "realloc to %d\n", newmax); sa = kcalloc(newmax, sizeof(void *), GFP_NOFS); if (!sa) goto fail_realloc; @@ -834,7 +993,7 @@ static struct ceph_mds_session *register_session(struct ceph_mds_client *mdsc, mdsc->max_sessions = newmax; } - dout("%s: mds%d\n", __func__, mds); + doutc(cl, "mds%d\n", mds); s->s_mdsc = mdsc; s->s_mds = mds; s->s_state = CEPH_MDS_SESSION_NEW; @@ -877,7 +1036,7 @@ fail_realloc: static void __unregister_session(struct ceph_mds_client *mdsc, struct ceph_mds_session *s) { - dout("__unregister_session mds%d %p\n", s->s_mds, s); + doutc(mdsc->fsc->client, "mds%d %p\n", s->s_mds, s); BUG_ON(mdsc->sessions[s->s_mds] != s); mdsc->sessions[s->s_mds] = NULL; ceph_con_close(&s->s_con); @@ -945,6 +1104,7 @@ void ceph_mdsc_release_request(struct kref *kref) iput(req->r_parent); } iput(req->r_target_inode); + iput(req->r_new_inode); if (req->r_dentry) dput(req->r_dentry); if (req->r_old_dentry) @@ -963,8 +1123,12 @@ void ceph_mdsc_release_request(struct kref *kref) kfree(req->r_path1); kfree(req->r_path2); put_cred(req->r_cred); + if (req->r_mnt_idmap) + mnt_idmap_put(req->r_mnt_idmap); if (req->r_pagelist) ceph_pagelist_release(req->r_pagelist); + kfree(req->r_fscrypt_auth); + kfree(req->r_altname); put_request_session(req); ceph_unreserve_caps(req->r_mdsc, &req->r_caps_reservation); WARN_ON_ONCE(!list_empty(&req->r_wait)); @@ -1000,6 +1164,7 @@ static void __register_request(struct ceph_mds_client *mdsc, struct ceph_mds_request *req, struct inode *dir) { + struct ceph_client *cl = mdsc->fsc->client; int ret = 0; req->r_tid = ++mdsc->last_tid; @@ -1007,18 +1172,20 @@ static void __register_request(struct ceph_mds_client *mdsc, ret = ceph_reserve_caps(mdsc, &req->r_caps_reservation, req->r_num_caps); if (ret < 0) { - pr_err("__register_request %p " - "failed to reserve caps: %d\n", req, ret); + pr_err_client(cl, "%p failed to reserve caps: %d\n", + req, ret); /* set req->r_err to fail early from __do_request */ req->r_err = ret; return; } } - dout("__register_request %p tid %lld\n", req, req->r_tid); + doutc(cl, "%p tid %lld\n", req, req->r_tid); ceph_mdsc_get_request(req); insert_request(&mdsc->request_tree, req); req->r_cred = get_current_cred(); + if (!req->r_mnt_idmap) + req->r_mnt_idmap = &nop_mnt_idmap; if (mdsc->oldest_tid == 0 && req->r_op != CEPH_MDS_OP_SETFILELOCK) mdsc->oldest_tid = req->r_tid; @@ -1037,7 +1204,7 @@ static void __register_request(struct ceph_mds_client *mdsc, static void __unregister_request(struct ceph_mds_client *mdsc, struct ceph_mds_request *req) { - dout("__unregister_request %p tid %lld\n", req, req->r_tid); + doutc(mdsc->fsc->client, "%p tid %lld\n", req, req->r_tid); /* Never leave an unregistered request on an unsafe list! */ list_del_init(&req->r_unsafe_item); @@ -1123,6 +1290,7 @@ static int __choose_mds(struct ceph_mds_client *mdsc, int mds = -1; u32 hash = req->r_direct_hash; bool is_hash = test_bit(CEPH_MDS_R_DIRECT_IS_HASH, &req->r_req_flags); + struct ceph_client *cl = mdsc->fsc->client; if (random) *random = false; @@ -1134,8 +1302,7 @@ static int __choose_mds(struct ceph_mds_client *mdsc, if (req->r_resend_mds >= 0 && (__have_session(mdsc, req->r_resend_mds) || ceph_mdsmap_get_state(mdsc->mdsmap, req->r_resend_mds) > 0)) { - dout("%s using resend_mds mds%d\n", __func__, - req->r_resend_mds); + doutc(cl, "using resend_mds mds%d\n", req->r_resend_mds); return req->r_resend_mds; } @@ -1152,7 +1319,8 @@ static int __choose_mds(struct ceph_mds_client *mdsc, rcu_read_lock(); inode = get_nonsnap_parent(req->r_dentry); rcu_read_unlock(); - dout("%s using snapdir's parent %p\n", __func__, inode); + doutc(cl, "using snapdir's parent %p %llx.%llx\n", + inode, ceph_vinop(inode)); } } else if (req->r_dentry) { /* ignore race with rename; old or new d_parent is okay */ @@ -1172,7 +1340,8 @@ static int __choose_mds(struct ceph_mds_client *mdsc, /* direct snapped/virtual snapdir requests * based on parent dir inode */ inode = get_nonsnap_parent(parent); - dout("%s using nonsnap parent %p\n", __func__, inode); + doutc(cl, "using nonsnap parent %p %llx.%llx\n", + inode, ceph_vinop(inode)); } else { /* dentry target */ inode = d_inode(req->r_dentry); @@ -1188,10 +1357,11 @@ static int __choose_mds(struct ceph_mds_client *mdsc, rcu_read_unlock(); } - dout("%s %p is_hash=%d (0x%x) mode %d\n", __func__, inode, (int)is_hash, - hash, mode); if (!inode) goto random; + + doutc(cl, "%p %llx.%llx is_hash=%d (0x%x) mode %d\n", inode, + ceph_vinop(inode), (int)is_hash, hash, mode); ci = ceph_inode(inode); if (is_hash && S_ISDIR(inode->i_mode)) { @@ -1207,9 +1377,9 @@ static int __choose_mds(struct ceph_mds_client *mdsc, get_random_bytes(&r, 1); r %= frag.ndist; mds = frag.dist[r]; - dout("%s %p %llx.%llx frag %u mds%d (%d/%d)\n", - __func__, inode, ceph_vinop(inode), - frag.frag, mds, (int)r, frag.ndist); + doutc(cl, "%p %llx.%llx frag %u mds%d (%d/%d)\n", + inode, ceph_vinop(inode), frag.frag, + mds, (int)r, frag.ndist); if (ceph_mdsmap_get_state(mdsc->mdsmap, mds) >= CEPH_MDS_STATE_ACTIVE && !ceph_mdsmap_is_laggy(mdsc->mdsmap, mds)) @@ -1222,9 +1392,8 @@ static int __choose_mds(struct ceph_mds_client *mdsc, if (frag.mds >= 0) { /* choose auth mds */ mds = frag.mds; - dout("%s %p %llx.%llx frag %u mds%d (auth)\n", - __func__, inode, ceph_vinop(inode), - frag.frag, mds); + doutc(cl, "%p %llx.%llx frag %u mds%d (auth)\n", + inode, ceph_vinop(inode), frag.frag, mds); if (ceph_mdsmap_get_state(mdsc->mdsmap, mds) >= CEPH_MDS_STATE_ACTIVE) { if (!ceph_mdsmap_is_laggy(mdsc->mdsmap, @@ -1248,9 +1417,9 @@ static int __choose_mds(struct ceph_mds_client *mdsc, goto random; } mds = cap->session->s_mds; - dout("%s %p %llx.%llx mds%d (%scap %p)\n", __func__, - inode, ceph_vinop(inode), mds, - cap == ci->i_auth_cap ? "auth " : "", cap); + doutc(cl, "%p %llx.%llx mds%d (%scap %p)\n", inode, + ceph_vinop(inode), mds, + cap == ci->i_auth_cap ? "auth " : "", cap); spin_unlock(&ci->i_ceph_lock); out: iput(inode); @@ -1261,7 +1430,7 @@ random: *random = true; mds = ceph_mdsmap_get_random_mds(mdsc->mdsmap); - dout("%s chose random mds%d\n", __func__, mds); + doutc(cl, "chose random mds%d\n", mds); return mds; } @@ -1374,6 +1543,7 @@ static struct ceph_msg *create_session_open_msg(struct ceph_mds_client *mdsc, u6 int metadata_key_count = 0; struct ceph_options *opt = mdsc->fsc->client->options; struct ceph_mount_options *fsopt = mdsc->fsc->mount_options; + struct ceph_client *cl = mdsc->fsc->client; size_t size, count; void *p, *end; int ret; @@ -1412,7 +1582,7 @@ static struct ceph_msg *create_session_open_msg(struct ceph_mds_client *mdsc, u6 msg = ceph_msg_new(CEPH_MSG_CLIENT_SESSION, sizeof(*h) + extra_bytes, GFP_NOFS, false); if (!msg) { - pr_err("ENOMEM creating session open msg\n"); + pr_err_client(cl, "ENOMEM creating session open msg\n"); return ERR_PTR(-ENOMEM); } p = msg->front.iov_base; @@ -1452,14 +1622,14 @@ static struct ceph_msg *create_session_open_msg(struct ceph_mds_client *mdsc, u6 ret = encode_supported_features(&p, end); if (ret) { - pr_err("encode_supported_features failed!\n"); + pr_err_client(cl, "encode_supported_features failed!\n"); ceph_msg_put(msg); return ERR_PTR(ret); } ret = encode_metric_spec(&p, end); if (ret) { - pr_err("encode_metric_spec failed!\n"); + pr_err_client(cl, "encode_metric_spec failed!\n"); ceph_msg_put(msg); return ERR_PTR(ret); } @@ -1487,8 +1657,8 @@ static int __open_session(struct ceph_mds_client *mdsc, /* wait for mds to go active? */ mstate = ceph_mdsmap_get_state(mdsc->mdsmap, mds); - dout("open_session to mds%d (%s)\n", mds, - ceph_mds_state_name(mstate)); + doutc(mdsc->fsc->client, "open_session to mds%d (%s)\n", mds, + ceph_mds_state_name(mstate)); session->s_state = CEPH_MDS_SESSION_OPENING; session->s_renew_requested = jiffies; @@ -1531,8 +1701,9 @@ struct ceph_mds_session * ceph_mdsc_open_export_target_session(struct ceph_mds_client *mdsc, int target) { struct ceph_mds_session *session; + struct ceph_client *cl = mdsc->fsc->client; - dout("open_export_target_session to mds%d\n", target); + doutc(cl, "to mds%d\n", target); mutex_lock(&mdsc->mutex); session = __open_export_target_session(mdsc, target); @@ -1547,13 +1718,14 @@ static void __open_export_target_sessions(struct ceph_mds_client *mdsc, struct ceph_mds_info *mi; struct ceph_mds_session *ts; int i, mds = session->s_mds; + struct ceph_client *cl = mdsc->fsc->client; if (mds >= mdsc->mdsmap->possible_max_rank) return; mi = &mdsc->mdsmap->m_info[mds]; - dout("open_export_target_sessions for mds%d (%d targets)\n", - session->s_mds, mi->num_export_targets); + doutc(cl, "for mds%d (%d targets)\n", session->s_mds, + mi->num_export_targets); for (i = 0; i < mi->num_export_targets; i++) { ts = __open_export_target_session(mdsc, mi->export_targets[i]); @@ -1576,11 +1748,13 @@ void ceph_mdsc_open_export_target_sessions(struct ceph_mds_client *mdsc, static void detach_cap_releases(struct ceph_mds_session *session, struct list_head *target) { + struct ceph_client *cl = session->s_mdsc->fsc->client; + lockdep_assert_held(&session->s_cap_lock); list_splice_init(&session->s_cap_releases, target); session->s_num_cap_releases = 0; - dout("dispose_cap_releases mds%d\n", session->s_mds); + doutc(cl, "mds%d\n", session->s_mds); } static void dispose_cap_releases(struct ceph_mds_client *mdsc, @@ -1598,16 +1772,17 @@ static void dispose_cap_releases(struct ceph_mds_client *mdsc, static void cleanup_session_requests(struct ceph_mds_client *mdsc, struct ceph_mds_session *session) { + struct ceph_client *cl = mdsc->fsc->client; struct ceph_mds_request *req; struct rb_node *p; - dout("cleanup_session_requests mds%d\n", session->s_mds); + doutc(cl, "mds%d\n", session->s_mds); mutex_lock(&mdsc->mutex); while (!list_empty(&session->s_unsafe)) { req = list_first_entry(&session->s_unsafe, struct ceph_mds_request, r_unsafe_item); - pr_warn_ratelimited(" dropping unsafe request %llu\n", - req->r_tid); + pr_warn_ratelimited_client(cl, " dropping unsafe request %llu\n", + req->r_tid); if (req->r_target_inode) mapping_set_error(req->r_target_inode->i_mapping, -EIO); if (req->r_unsafe_dir) @@ -1636,13 +1811,14 @@ int ceph_iterate_session_caps(struct ceph_mds_session *session, int (*cb)(struct inode *, int mds, void *), void *arg) { + struct ceph_client *cl = session->s_mdsc->fsc->client; struct list_head *p; struct ceph_cap *cap; struct inode *inode, *last_inode = NULL; struct ceph_cap *old_cap = NULL; int ret; - dout("iterate_session_caps %p mds%d\n", session, session->s_mds); + doutc(cl, "%p mds%d\n", session, session->s_mds); spin_lock(&session->s_cap_lock); p = session->s_caps.next; while (p != &session->s_caps) { @@ -1673,8 +1849,7 @@ int ceph_iterate_session_caps(struct ceph_mds_session *session, spin_lock(&session->s_cap_lock); p = p->next; if (!cap->ci) { - dout("iterate_session_caps finishing cap %p removal\n", - cap); + doutc(cl, "finishing cap %p removal\n", cap); BUG_ON(cap->session != session); cap->session = NULL; list_del_init(&cap->session_caps); @@ -1703,6 +1878,7 @@ out: static int remove_session_caps_cb(struct inode *inode, int mds, void *arg) { struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_client *cl = ceph_inode_to_client(inode); bool invalidate = false; struct ceph_cap *cap; int iputs = 0; @@ -1710,8 +1886,8 @@ static int remove_session_caps_cb(struct inode *inode, int mds, void *arg) spin_lock(&ci->i_ceph_lock); cap = __get_cap_for_mds(ci, mds); if (cap) { - dout(" removing cap %p, ci is %p, inode is %p\n", - cap, ci, &ci->netfs.inode); + doutc(cl, " removing cap %p, ci is %p, inode is %p\n", + cap, ci, &ci->netfs.inode); iputs = ceph_purge_inode_cap(inode, cap, &invalidate); } @@ -1735,7 +1911,7 @@ static void remove_session_caps(struct ceph_mds_session *session) struct super_block *sb = fsc->sb; LIST_HEAD(dispose); - dout("remove_session_caps on %p\n", session); + doutc(fsc->client, "on %p\n", session); ceph_iterate_session_caps(session, remove_session_caps_cb, fsc); wake_up_all(&fsc->mdsc->cap_flushing_wq); @@ -1816,7 +1992,9 @@ static int wake_up_session_cb(struct inode *inode, int mds, void *arg) static void wake_up_session_caps(struct ceph_mds_session *session, int ev) { - dout("wake_up_session_caps %p mds%d\n", session, session->s_mds); + struct ceph_client *cl = session->s_mdsc->fsc->client; + + doutc(cl, "session %p mds%d\n", session, session->s_mds); ceph_iterate_session_caps(session, wake_up_session_cb, (void *)(unsigned long)ev); } @@ -1830,25 +2008,26 @@ static void wake_up_session_caps(struct ceph_mds_session *session, int ev) static int send_renew_caps(struct ceph_mds_client *mdsc, struct ceph_mds_session *session) { + struct ceph_client *cl = mdsc->fsc->client; struct ceph_msg *msg; int state; if (time_after_eq(jiffies, session->s_cap_ttl) && time_after_eq(session->s_cap_ttl, session->s_renew_requested)) - pr_info("mds%d caps stale\n", session->s_mds); + pr_info_client(cl, "mds%d caps stale\n", session->s_mds); session->s_renew_requested = jiffies; /* do not try to renew caps until a recovering mds has reconnected * with its clients. */ state = ceph_mdsmap_get_state(mdsc->mdsmap, session->s_mds); if (state < CEPH_MDS_STATE_RECONNECT) { - dout("send_renew_caps ignoring mds%d (%s)\n", - session->s_mds, ceph_mds_state_name(state)); + doutc(cl, "ignoring mds%d (%s)\n", session->s_mds, + ceph_mds_state_name(state)); return 0; } - dout("send_renew_caps to mds%d (%s)\n", session->s_mds, - ceph_mds_state_name(state)); + doutc(cl, "to mds%d (%s)\n", session->s_mds, + ceph_mds_state_name(state)); msg = ceph_create_session_msg(CEPH_SESSION_REQUEST_RENEWCAPS, ++session->s_renew_seq); if (!msg) @@ -1860,10 +2039,11 @@ static int send_renew_caps(struct ceph_mds_client *mdsc, static int send_flushmsg_ack(struct ceph_mds_client *mdsc, struct ceph_mds_session *session, u64 seq) { + struct ceph_client *cl = mdsc->fsc->client; struct ceph_msg *msg; - dout("send_flushmsg_ack to mds%d (%s)s seq %lld\n", - session->s_mds, ceph_session_state_name(session->s_state), seq); + doutc(cl, "to mds%d (%s)s seq %lld\n", session->s_mds, + ceph_session_state_name(session->s_state), seq); msg = ceph_create_session_msg(CEPH_SESSION_FLUSHMSG_ACK, seq); if (!msg) return -ENOMEM; @@ -1880,6 +2060,7 @@ static int send_flushmsg_ack(struct ceph_mds_client *mdsc, static void renewed_caps(struct ceph_mds_client *mdsc, struct ceph_mds_session *session, int is_renew) { + struct ceph_client *cl = mdsc->fsc->client; int was_stale; int wake = 0; @@ -1891,15 +2072,17 @@ static void renewed_caps(struct ceph_mds_client *mdsc, if (was_stale) { if (time_before(jiffies, session->s_cap_ttl)) { - pr_info("mds%d caps renewed\n", session->s_mds); + pr_info_client(cl, "mds%d caps renewed\n", + session->s_mds); wake = 1; } else { - pr_info("mds%d caps still stale\n", session->s_mds); + pr_info_client(cl, "mds%d caps still stale\n", + session->s_mds); } } - dout("renewed_caps mds%d ttl now %lu, was %s, now %s\n", - session->s_mds, session->s_cap_ttl, was_stale ? "stale" : "fresh", - time_before(jiffies, session->s_cap_ttl) ? "stale" : "fresh"); + doutc(cl, "mds%d ttl now %lu, was %s, now %s\n", session->s_mds, + session->s_cap_ttl, was_stale ? "stale" : "fresh", + time_before(jiffies, session->s_cap_ttl) ? "stale" : "fresh"); spin_unlock(&session->s_cap_lock); if (wake) @@ -1911,11 +2094,11 @@ static void renewed_caps(struct ceph_mds_client *mdsc, */ static int request_close_session(struct ceph_mds_session *session) { + struct ceph_client *cl = session->s_mdsc->fsc->client; struct ceph_msg *msg; - dout("request_close_session mds%d state %s seq %lld\n", - session->s_mds, ceph_session_state_name(session->s_state), - session->s_seq); + doutc(cl, "mds%d state %s seq %lld\n", session->s_mds, + ceph_session_state_name(session->s_state), session->s_seq); msg = ceph_create_session_msg(CEPH_SESSION_REQUEST_CLOSE, session->s_seq); if (!msg) @@ -1971,6 +2154,8 @@ out: */ static int trim_caps_cb(struct inode *inode, int mds, void *arg) { + struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb); + struct ceph_client *cl = mdsc->fsc->client; int *remaining = arg; struct ceph_inode_info *ci = ceph_inode(inode); int used, wanted, oissued, mine; @@ -1990,9 +2175,10 @@ static int trim_caps_cb(struct inode *inode, int mds, void *arg) wanted = __ceph_caps_file_wanted(ci); oissued = __ceph_caps_issued_other(ci, cap); - dout("trim_caps_cb %p cap %p mine %s oissued %s used %s wanted %s\n", - inode, cap, ceph_cap_string(mine), ceph_cap_string(oissued), - ceph_cap_string(used), ceph_cap_string(wanted)); + doutc(cl, "%p %llx.%llx cap %p mine %s oissued %s used %s wanted %s\n", + inode, ceph_vinop(inode), cap, ceph_cap_string(mine), + ceph_cap_string(oissued), ceph_cap_string(used), + ceph_cap_string(wanted)); if (cap == ci->i_auth_cap) { if (ci->i_dirty_caps || ci->i_flushing_caps || !list_empty(&ci->i_cap_snaps)) @@ -2018,7 +2204,7 @@ static int trim_caps_cb(struct inode *inode, int mds, void *arg) if (oissued) { /* we aren't the only cap.. just remove us */ - ceph_remove_cap(cap, true); + ceph_remove_cap(mdsc, cap, true); (*remaining)--; } else { struct dentry *dentry; @@ -2032,8 +2218,8 @@ static int trim_caps_cb(struct inode *inode, int mds, void *arg) count = atomic_read(&inode->i_count); if (count == 1) (*remaining)--; - dout("trim_caps_cb %p cap %p pruned, count now %d\n", - inode, cap, count); + doutc(cl, "%p %llx.%llx cap %p pruned, count now %d\n", + inode, ceph_vinop(inode), cap, count); } else { dput(dentry); } @@ -2052,17 +2238,18 @@ int ceph_trim_caps(struct ceph_mds_client *mdsc, struct ceph_mds_session *session, int max_caps) { + struct ceph_client *cl = mdsc->fsc->client; int trim_caps = session->s_nr_caps - max_caps; - dout("trim_caps mds%d start: %d / %d, trim %d\n", - session->s_mds, session->s_nr_caps, max_caps, trim_caps); + doutc(cl, "mds%d start: %d / %d, trim %d\n", session->s_mds, + session->s_nr_caps, max_caps, trim_caps); if (trim_caps > 0) { int remaining = trim_caps; ceph_iterate_session_caps(session, trim_caps_cb, &remaining); - dout("trim_caps mds%d done: %d / %d, trimmed %d\n", - session->s_mds, session->s_nr_caps, max_caps, - trim_caps - remaining); + doutc(cl, "mds%d done: %d / %d, trimmed %d\n", + session->s_mds, session->s_nr_caps, max_caps, + trim_caps - remaining); } ceph_flush_cap_releases(mdsc, session); @@ -2072,6 +2259,7 @@ int ceph_trim_caps(struct ceph_mds_client *mdsc, static int check_caps_flush(struct ceph_mds_client *mdsc, u64 want_flush_tid) { + struct ceph_client *cl = mdsc->fsc->client; int ret = 1; spin_lock(&mdsc->cap_dirty_lock); @@ -2080,8 +2268,8 @@ static int check_caps_flush(struct ceph_mds_client *mdsc, list_first_entry(&mdsc->cap_flush_list, struct ceph_cap_flush, g_list); if (cf->tid <= want_flush_tid) { - dout("check_caps_flush still flushing tid " - "%llu <= %llu\n", cf->tid, want_flush_tid); + doutc(cl, "still flushing tid %llu <= %llu\n", + cf->tid, want_flush_tid); ret = 0; } } @@ -2097,12 +2285,14 @@ static int check_caps_flush(struct ceph_mds_client *mdsc, static void wait_caps_flush(struct ceph_mds_client *mdsc, u64 want_flush_tid) { - dout("check_caps_flush want %llu\n", want_flush_tid); + struct ceph_client *cl = mdsc->fsc->client; + + doutc(cl, "want %llu\n", want_flush_tid); wait_event(mdsc->cap_flushing_wq, check_caps_flush(mdsc, want_flush_tid)); - dout("check_caps_flush ok, flushed thru %llu\n", want_flush_tid); + doutc(cl, "ok, flushed thru %llu\n", want_flush_tid); } /* @@ -2111,6 +2301,7 @@ static void wait_caps_flush(struct ceph_mds_client *mdsc, static void ceph_send_cap_releases(struct ceph_mds_client *mdsc, struct ceph_mds_session *session) { + struct ceph_client *cl = mdsc->fsc->client; struct ceph_msg *msg = NULL; struct ceph_mds_cap_release *head; struct ceph_mds_cap_item *item; @@ -2169,7 +2360,7 @@ again: msg->front.iov_len += sizeof(*cap_barrier); msg->hdr.front_len = cpu_to_le32(msg->front.iov_len); - dout("send_cap_releases mds%d %p\n", session->s_mds, msg); + doutc(cl, "mds%d %p\n", session->s_mds, msg); ceph_con_send(&session->s_con, msg); msg = NULL; } @@ -2189,13 +2380,13 @@ again: msg->front.iov_len += sizeof(*cap_barrier); msg->hdr.front_len = cpu_to_le32(msg->front.iov_len); - dout("send_cap_releases mds%d %p\n", session->s_mds, msg); + doutc(cl, "mds%d %p\n", session->s_mds, msg); ceph_con_send(&session->s_con, msg); } return; out_err: - pr_err("send_cap_releases mds%d, failed to allocate message\n", - session->s_mds); + pr_err_client(cl, "mds%d, failed to allocate message\n", + session->s_mds); spin_lock(&session->s_cap_lock); list_splice(&tmp_list, &session->s_cap_releases); session->s_num_cap_releases += num_cap_releases; @@ -2218,16 +2409,17 @@ static void ceph_cap_release_work(struct work_struct *work) void ceph_flush_cap_releases(struct ceph_mds_client *mdsc, struct ceph_mds_session *session) { + struct ceph_client *cl = mdsc->fsc->client; if (mdsc->stopping) return; ceph_get_mds_session(session); if (queue_work(mdsc->fsc->cap_wq, &session->s_cap_release_work)) { - dout("cap release work queued\n"); + doutc(cl, "cap release work queued\n"); } else { ceph_put_mds_session(session); - dout("failed to queue cap release work\n"); + doutc(cl, "failed to queue cap release work\n"); } } @@ -2255,13 +2447,14 @@ static void ceph_cap_reclaim_work(struct work_struct *work) void ceph_queue_cap_reclaim_work(struct ceph_mds_client *mdsc) { + struct ceph_client *cl = mdsc->fsc->client; if (mdsc->stopping) return; if (queue_work(mdsc->fsc->cap_wq, &mdsc->cap_reclaim_work)) { - dout("caps reclaim work queued\n"); + doutc(cl, "caps reclaim work queued\n"); } else { - dout("failed to queue caps release work\n"); + doutc(cl, "failed to queue caps release work\n"); } } @@ -2373,20 +2566,92 @@ static inline u64 __get_oldest_tid(struct ceph_mds_client *mdsc) return mdsc->oldest_tid; } -/* - * Build a dentry's path. Allocate on heap; caller must kfree. Based - * on build_path_from_dentry in fs/cifs/dir.c. +#if IS_ENABLED(CONFIG_FS_ENCRYPTION) +static u8 *get_fscrypt_altname(const struct ceph_mds_request *req, u32 *plen) +{ + struct inode *dir = req->r_parent; + struct dentry *dentry = req->r_dentry; + u8 *cryptbuf = NULL; + u32 len = 0; + int ret = 0; + + /* only encode if we have parent and dentry */ + if (!dir || !dentry) + goto success; + + /* No-op unless this is encrypted */ + if (!IS_ENCRYPTED(dir)) + goto success; + + ret = ceph_fscrypt_prepare_readdir(dir); + if (ret < 0) + return ERR_PTR(ret); + + /* No key? Just ignore it. */ + if (!fscrypt_has_encryption_key(dir)) + goto success; + + if (!fscrypt_fname_encrypted_size(dir, dentry->d_name.len, NAME_MAX, + &len)) { + WARN_ON_ONCE(1); + return ERR_PTR(-ENAMETOOLONG); + } + + /* No need to append altname if name is short enough */ + if (len <= CEPH_NOHASH_NAME_MAX) { + len = 0; + goto success; + } + + cryptbuf = kmalloc(len, GFP_KERNEL); + if (!cryptbuf) + return ERR_PTR(-ENOMEM); + + ret = fscrypt_fname_encrypt(dir, &dentry->d_name, cryptbuf, len); + if (ret) { + kfree(cryptbuf); + return ERR_PTR(ret); + } +success: + *plen = len; + return cryptbuf; +} +#else +static u8 *get_fscrypt_altname(const struct ceph_mds_request *req, u32 *plen) +{ + *plen = 0; + return NULL; +} +#endif + +/** + * ceph_mdsc_build_path - build a path string to a given dentry + * @mdsc: mds client + * @dentry: dentry to which path should be built + * @plen: returned length of string + * @pbase: returned base inode number + * @for_wire: is this path going to be sent to the MDS? * - * If @stop_on_nosnap, generate path relative to the first non-snapped - * inode. + * Build a string that represents the path to the dentry. This is mostly called + * for two different purposes: + * + * 1) we need to build a path string to send to the MDS (for_wire == true) + * 2) we need a path string for local presentation (e.g. debugfs) + * (for_wire == false) + * + * The path is built in reverse, starting with the dentry. Walk back up toward + * the root, building the path until the first non-snapped inode is reached + * (for_wire) or the root inode is reached (!for_wire). * * Encode hidden .snap dirs as a double /, i.e. * foo/.snap/bar -> foo//bar */ -char *ceph_mdsc_build_path(struct dentry *dentry, int *plen, u64 *pbase, - int stop_on_nosnap) +char *ceph_mdsc_build_path(struct ceph_mds_client *mdsc, struct dentry *dentry, + int *plen, u64 *pbase, int for_wire) { - struct dentry *temp; + struct ceph_client *cl = mdsc->fsc->client; + struct dentry *cur; + struct inode *inode; char *path; int pos; unsigned seq; @@ -2403,34 +2668,71 @@ retry: path[pos] = '\0'; seq = read_seqbegin(&rename_lock); - rcu_read_lock(); - temp = dentry; + cur = dget(dentry); for (;;) { - struct inode *inode; + struct dentry *parent; - spin_lock(&temp->d_lock); - inode = d_inode(temp); + spin_lock(&cur->d_lock); + inode = d_inode(cur); if (inode && ceph_snap(inode) == CEPH_SNAPDIR) { - dout("build_path path+%d: %p SNAPDIR\n", - pos, temp); - } else if (stop_on_nosnap && inode && dentry != temp && + doutc(cl, "path+%d: %p SNAPDIR\n", pos, cur); + spin_unlock(&cur->d_lock); + parent = dget_parent(cur); + } else if (for_wire && inode && dentry != cur && ceph_snap(inode) == CEPH_NOSNAP) { - spin_unlock(&temp->d_lock); + spin_unlock(&cur->d_lock); pos++; /* get rid of any prepended '/' */ break; + } else if (!for_wire || !IS_ENCRYPTED(d_inode(cur->d_parent))) { + pos -= cur->d_name.len; + if (pos < 0) { + spin_unlock(&cur->d_lock); + break; + } + memcpy(path + pos, cur->d_name.name, cur->d_name.len); + spin_unlock(&cur->d_lock); + parent = dget_parent(cur); } else { - pos -= temp->d_name.len; + int len, ret; + char buf[NAME_MAX]; + + /* + * Proactively copy name into buf, in case we need to + * present it as-is. + */ + memcpy(buf, cur->d_name.name, cur->d_name.len); + len = cur->d_name.len; + spin_unlock(&cur->d_lock); + parent = dget_parent(cur); + + ret = ceph_fscrypt_prepare_readdir(d_inode(parent)); + if (ret < 0) { + dput(parent); + dput(cur); + return ERR_PTR(ret); + } + + if (fscrypt_has_encryption_key(d_inode(parent))) { + len = ceph_encode_encrypted_fname(d_inode(parent), + cur, buf); + if (len < 0) { + dput(parent); + dput(cur); + return ERR_PTR(len); + } + } + pos -= len; if (pos < 0) { - spin_unlock(&temp->d_lock); + dput(parent); break; } - memcpy(path + pos, temp->d_name.name, temp->d_name.len); + memcpy(path + pos, buf, len); } - spin_unlock(&temp->d_lock); - temp = READ_ONCE(temp->d_parent); + dput(cur); + cur = parent; /* Are we at the root? */ - if (IS_ROOT(temp)) + if (IS_ROOT(cur)) break; /* Are we out of buffer? */ @@ -2439,8 +2741,9 @@ retry: path[pos] = '/'; } - base = ceph_ino(d_inode(temp)); - rcu_read_unlock(); + inode = d_inode(cur); + base = inode ? ceph_ino(inode) : 0; + dput(cur); if (read_seqretry(&rename_lock, seq)) goto retry; @@ -2450,28 +2753,29 @@ retry: * A rename didn't occur, but somehow we didn't end up where * we thought we would. Throw a warning and try again. */ - pr_warn("build_path did not end path lookup where " - "expected, pos is %d\n", pos); + pr_warn_client(cl, "did not end path lookup where expected (pos = %d)\n", + pos); goto retry; } *pbase = base; *plen = PATH_MAX - 1 - pos; - dout("build_path on %p %d built %llx '%.*s'\n", - dentry, d_count(dentry), base, *plen, path + pos); + doutc(cl, "on %p %d built %llx '%.*s'\n", dentry, d_count(dentry), + base, *plen, path + pos); return path + pos; } -static int build_dentry_path(struct dentry *dentry, struct inode *dir, - const char **ppath, int *ppathlen, u64 *pino, - bool *pfreepath, bool parent_locked) +static int build_dentry_path(struct ceph_mds_client *mdsc, struct dentry *dentry, + struct inode *dir, const char **ppath, int *ppathlen, + u64 *pino, bool *pfreepath, bool parent_locked) { char *path; rcu_read_lock(); if (!dir) dir = d_inode_rcu(dentry->d_parent); - if (dir && parent_locked && ceph_snap(dir) == CEPH_NOSNAP) { + if (dir && parent_locked && ceph_snap(dir) == CEPH_NOSNAP && + !IS_ENCRYPTED(dir)) { *pino = ceph_ino(dir); rcu_read_unlock(); *ppath = dentry->d_name.name; @@ -2479,7 +2783,7 @@ static int build_dentry_path(struct dentry *dentry, struct inode *dir, return 0; } rcu_read_unlock(); - path = ceph_mdsc_build_path(dentry, ppathlen, pino, 1); + path = ceph_mdsc_build_path(mdsc, dentry, ppathlen, pino, 1); if (IS_ERR(path)) return PTR_ERR(path); *ppath = path; @@ -2491,6 +2795,7 @@ static int build_inode_path(struct inode *inode, const char **ppath, int *ppathlen, u64 *pino, bool *pfreepath) { + struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb); struct dentry *dentry; char *path; @@ -2500,7 +2805,7 @@ static int build_inode_path(struct inode *inode, return 0; } dentry = d_find_alias(inode); - path = ceph_mdsc_build_path(dentry, ppathlen, pino, 1); + path = ceph_mdsc_build_path(mdsc, dentry, ppathlen, pino, 1); dput(dentry); if (IS_ERR(path)) return PTR_ERR(path); @@ -2513,34 +2818,35 @@ static int build_inode_path(struct inode *inode, * request arguments may be specified via an inode *, a dentry *, or * an explicit ino+path. */ -static int set_request_path_attr(struct inode *rinode, struct dentry *rdentry, - struct inode *rdiri, const char *rpath, - u64 rino, const char **ppath, int *pathlen, - u64 *ino, bool *freepath, bool parent_locked) +static int set_request_path_attr(struct ceph_mds_client *mdsc, struct inode *rinode, + struct dentry *rdentry, struct inode *rdiri, + const char *rpath, u64 rino, const char **ppath, + int *pathlen, u64 *ino, bool *freepath, + bool parent_locked) { + struct ceph_client *cl = mdsc->fsc->client; int r = 0; if (rinode) { r = build_inode_path(rinode, ppath, pathlen, ino, freepath); - dout(" inode %p %llx.%llx\n", rinode, ceph_ino(rinode), - ceph_snap(rinode)); + doutc(cl, " inode %p %llx.%llx\n", rinode, ceph_ino(rinode), + ceph_snap(rinode)); } else if (rdentry) { - r = build_dentry_path(rdentry, rdiri, ppath, pathlen, ino, + r = build_dentry_path(mdsc, rdentry, rdiri, ppath, pathlen, ino, freepath, parent_locked); - dout(" dentry %p %llx/%.*s\n", rdentry, *ino, *pathlen, - *ppath); + doutc(cl, " dentry %p %llx/%.*s\n", rdentry, *ino, *pathlen, *ppath); } else if (rpath || rino) { *ino = rino; *ppath = rpath; *pathlen = rpath ? strlen(rpath) : 0; - dout(" path %.*s\n", *pathlen, rpath); + doutc(cl, " path %.*s\n", *pathlen, rpath); } return r; } -static void encode_timestamp_and_gids(void **p, - const struct ceph_mds_request *req) +static void encode_mclientrequest_tail(void **p, + const struct ceph_mds_request *req) { struct ceph_timespec ts; int i; @@ -2548,11 +2854,54 @@ static void encode_timestamp_and_gids(void **p, ceph_encode_timespec64(&ts, &req->r_stamp); ceph_encode_copy(p, &ts, sizeof(ts)); - /* gid_list */ + /* v4: gid_list */ ceph_encode_32(p, req->r_cred->group_info->ngroups); for (i = 0; i < req->r_cred->group_info->ngroups; i++) ceph_encode_64(p, from_kgid(&init_user_ns, req->r_cred->group_info->gid[i])); + + /* v5: altname */ + ceph_encode_32(p, req->r_altname_len); + ceph_encode_copy(p, req->r_altname, req->r_altname_len); + + /* v6: fscrypt_auth and fscrypt_file */ + if (req->r_fscrypt_auth) { + u32 authlen = ceph_fscrypt_auth_len(req->r_fscrypt_auth); + + ceph_encode_32(p, authlen); + ceph_encode_copy(p, req->r_fscrypt_auth, authlen); + } else { + ceph_encode_32(p, 0); + } + if (test_bit(CEPH_MDS_R_FSCRYPT_FILE, &req->r_req_flags)) { + ceph_encode_32(p, sizeof(__le64)); + ceph_encode_64(p, req->r_fscrypt_file); + } else { + ceph_encode_32(p, 0); + } +} + +static inline u16 mds_supported_head_version(struct ceph_mds_session *session) +{ + if (!test_bit(CEPHFS_FEATURE_32BITS_RETRY_FWD, &session->s_features)) + return 1; + + if (!test_bit(CEPHFS_FEATURE_HAS_OWNER_UIDGID, &session->s_features)) + return 2; + + return CEPH_MDS_REQUEST_HEAD_VERSION; +} + +static struct ceph_mds_request_head_legacy * +find_legacy_request_head(void *p, u64 features) +{ + bool legacy = !(features & CEPH_FEATURE_FS_BTIME); + struct ceph_mds_request_head_old *ohead; + + if (legacy) + return (struct ceph_mds_request_head_legacy *)p; + ohead = (struct ceph_mds_request_head_old *)p; + return (struct ceph_mds_request_head_legacy *)&ohead->oldest_client_tid; } /* @@ -2564,8 +2913,9 @@ static struct ceph_msg *create_request_message(struct ceph_mds_session *session, { int mds = session->s_mds; struct ceph_mds_client *mdsc = session->s_mdsc; + struct ceph_client *cl = mdsc->fsc->client; struct ceph_msg *msg; - struct ceph_mds_request_head_old *head; + struct ceph_mds_request_head_legacy *lhead; const char *path1 = NULL; const char *path2 = NULL; u64 ino1 = 0, ino2 = 0; @@ -2577,8 +2927,11 @@ static struct ceph_msg *create_request_message(struct ceph_mds_session *session, void *p, *end; int ret; bool legacy = !(session->s_con.peer_features & CEPH_FEATURE_FS_BTIME); + u16 request_head_version = mds_supported_head_version(session); + kuid_t caller_fsuid = req->r_cred->fsuid; + kgid_t caller_fsgid = req->r_cred->fsgid; - ret = set_request_path_attr(req->r_inode, req->r_dentry, + ret = set_request_path_attr(mdsc, req->r_inode, req->r_dentry, req->r_parent, req->r_path1, req->r_ino1.ino, &path1, &pathlen1, &ino1, &freepath1, test_bit(CEPH_MDS_R_PARENT_LOCKED, @@ -2592,7 +2945,7 @@ static struct ceph_msg *create_request_message(struct ceph_mds_session *session, if (req->r_old_dentry && !(req->r_old_dentry->d_flags & DCACHE_DISCONNECTED)) old_dentry = req->r_old_dentry; - ret = set_request_path_attr(NULL, old_dentry, + ret = set_request_path_attr(mdsc, NULL, old_dentry, req->r_old_dentry_dir, req->r_path2, req->r_ino2.ino, &path2, &pathlen2, &ino2, &freepath2, true); @@ -2601,12 +2954,34 @@ static struct ceph_msg *create_request_message(struct ceph_mds_session *session, goto out_free1; } - len = legacy ? sizeof(*head) : sizeof(struct ceph_mds_request_head); - len += pathlen1 + pathlen2 + 2*(1 + sizeof(u32) + sizeof(u64)) + - sizeof(struct ceph_timespec); - len += sizeof(u32) + (sizeof(u64) * req->r_cred->group_info->ngroups); + req->r_altname = get_fscrypt_altname(req, &req->r_altname_len); + if (IS_ERR(req->r_altname)) { + msg = ERR_CAST(req->r_altname); + req->r_altname = NULL; + goto out_free2; + } - /* calculate (max) length for cap releases */ + /* + * For old cephs without supporting the 32bit retry/fwd feature + * it will copy the raw memories directly when decoding the + * requests. While new cephs will decode the head depending the + * version member, so we need to make sure it will be compatible + * with them both. + */ + if (legacy) + len = sizeof(struct ceph_mds_request_head_legacy); + else if (request_head_version == 1) + len = sizeof(struct ceph_mds_request_head_old); + else if (request_head_version == 2) + len = offsetofend(struct ceph_mds_request_head, ext_num_fwd); + else + len = sizeof(struct ceph_mds_request_head); + + /* filepaths */ + len += 2 * (1 + sizeof(u32) + sizeof(u64)); + len += pathlen1 + pathlen2; + + /* cap releases */ len += sizeof(struct ceph_mds_request_release) * (!!req->r_inode_drop + !!req->r_dentry_drop + !!req->r_old_inode_drop + !!req->r_old_dentry_drop); @@ -2616,6 +2991,27 @@ static struct ceph_msg *create_request_message(struct ceph_mds_session *session, if (req->r_old_dentry_drop) len += pathlen2; + /* MClientRequest tail */ + + /* req->r_stamp */ + len += sizeof(struct ceph_timespec); + + /* gid list */ + len += sizeof(u32) + (sizeof(u64) * req->r_cred->group_info->ngroups); + + /* alternate name */ + len += sizeof(u32) + req->r_altname_len; + + /* fscrypt_auth */ + len += sizeof(u32); // fscrypt_auth + if (req->r_fscrypt_auth) + len += ceph_fscrypt_auth_len(req->r_fscrypt_auth); + + /* fscrypt_file */ + len += sizeof(u32); + if (test_bit(CEPH_MDS_R_FSCRYPT_FILE, &req->r_req_flags)) + len += sizeof(__le64); + msg = ceph_msg_new2(CEPH_MSG_CLIENT_REQUEST, len, 1, GFP_NOFS, false); if (!msg) { msg = ERR_PTR(-ENOMEM); @@ -2624,33 +3020,87 @@ static struct ceph_msg *create_request_message(struct ceph_mds_session *session, msg->hdr.tid = cpu_to_le64(req->r_tid); + lhead = find_legacy_request_head(msg->front.iov_base, + session->s_con.peer_features); + + if ((req->r_mnt_idmap != &nop_mnt_idmap) && + !test_bit(CEPHFS_FEATURE_HAS_OWNER_UIDGID, &session->s_features)) { + WARN_ON_ONCE(!IS_CEPH_MDS_OP_NEWINODE(req->r_op)); + + if (enable_unsafe_idmap) { + pr_warn_once_client(cl, + "idmapped mount is used and CEPHFS_FEATURE_HAS_OWNER_UIDGID" + " is not supported by MDS. UID/GID-based restrictions may" + " not work properly.\n"); + + caller_fsuid = from_vfsuid(req->r_mnt_idmap, &init_user_ns, + VFSUIDT_INIT(req->r_cred->fsuid)); + caller_fsgid = from_vfsgid(req->r_mnt_idmap, &init_user_ns, + VFSGIDT_INIT(req->r_cred->fsgid)); + } else { + pr_err_ratelimited_client(cl, + "idmapped mount is used and CEPHFS_FEATURE_HAS_OWNER_UIDGID" + " is not supported by MDS. Fail request with -EIO.\n"); + + ret = -EIO; + goto out_err; + } + } + /* - * The old ceph_mds_request_head didn't contain a version field, and + * The ceph_mds_request_head_legacy didn't contain a version field, and * one was added when we moved the message version from 3->4. */ if (legacy) { msg->hdr.version = cpu_to_le16(3); - head = msg->front.iov_base; - p = msg->front.iov_base + sizeof(*head); - } else { - struct ceph_mds_request_head *new_head = msg->front.iov_base; + p = msg->front.iov_base + sizeof(*lhead); + } else if (request_head_version == 1) { + struct ceph_mds_request_head_old *ohead = msg->front.iov_base; msg->hdr.version = cpu_to_le16(4); - new_head->version = cpu_to_le16(CEPH_MDS_REQUEST_HEAD_VERSION); - head = (struct ceph_mds_request_head_old *)&new_head->oldest_client_tid; - p = msg->front.iov_base + sizeof(*new_head); + ohead->version = cpu_to_le16(1); + p = msg->front.iov_base + sizeof(*ohead); + } else if (request_head_version == 2) { + struct ceph_mds_request_head *nhead = msg->front.iov_base; + + msg->hdr.version = cpu_to_le16(6); + nhead->version = cpu_to_le16(2); + + p = msg->front.iov_base + offsetofend(struct ceph_mds_request_head, ext_num_fwd); + } else { + struct ceph_mds_request_head *nhead = msg->front.iov_base; + kuid_t owner_fsuid; + kgid_t owner_fsgid; + + msg->hdr.version = cpu_to_le16(6); + nhead->version = cpu_to_le16(CEPH_MDS_REQUEST_HEAD_VERSION); + nhead->struct_len = cpu_to_le32(sizeof(struct ceph_mds_request_head)); + + if (IS_CEPH_MDS_OP_NEWINODE(req->r_op)) { + owner_fsuid = from_vfsuid(req->r_mnt_idmap, &init_user_ns, + VFSUIDT_INIT(req->r_cred->fsuid)); + owner_fsgid = from_vfsgid(req->r_mnt_idmap, &init_user_ns, + VFSGIDT_INIT(req->r_cred->fsgid)); + nhead->owner_uid = cpu_to_le32(from_kuid(&init_user_ns, owner_fsuid)); + nhead->owner_gid = cpu_to_le32(from_kgid(&init_user_ns, owner_fsgid)); + } else { + nhead->owner_uid = cpu_to_le32(-1); + nhead->owner_gid = cpu_to_le32(-1); + } + + p = msg->front.iov_base + sizeof(*nhead); } end = msg->front.iov_base + msg->front.iov_len; - head->mdsmap_epoch = cpu_to_le32(mdsc->mdsmap->m_epoch); - head->op = cpu_to_le32(req->r_op); - head->caller_uid = cpu_to_le32(from_kuid(&init_user_ns, - req->r_cred->fsuid)); - head->caller_gid = cpu_to_le32(from_kgid(&init_user_ns, - req->r_cred->fsgid)); - head->ino = cpu_to_le64(req->r_deleg_ino); - head->args = req->r_args; + lhead->mdsmap_epoch = cpu_to_le32(mdsc->mdsmap->m_epoch); + lhead->op = cpu_to_le32(req->r_op); + lhead->caller_uid = cpu_to_le32(from_kuid(&init_user_ns, + caller_fsuid)); + lhead->caller_gid = cpu_to_le32(from_kgid(&init_user_ns, + caller_fsgid)); + lhead->ino = cpu_to_le64(req->r_deleg_ino); + lhead->args = req->r_args; ceph_encode_filepath(&p, end, ino1, path1); ceph_encode_filepath(&p, end, ino2, path2); @@ -2665,15 +3115,23 @@ static struct ceph_msg *create_request_message(struct ceph_mds_session *session, req->r_inode ? req->r_inode : d_inode(req->r_dentry), mds, req->r_inode_drop, req->r_inode_unless, req->r_op == CEPH_MDS_OP_READDIR); - if (req->r_dentry_drop) - releases += ceph_encode_dentry_release(&p, req->r_dentry, + if (req->r_dentry_drop) { + ret = ceph_encode_dentry_release(&p, req->r_dentry, req->r_parent, mds, req->r_dentry_drop, req->r_dentry_unless); - if (req->r_old_dentry_drop) - releases += ceph_encode_dentry_release(&p, req->r_old_dentry, + if (ret < 0) + goto out_err; + releases += ret; + } + if (req->r_old_dentry_drop) { + ret = ceph_encode_dentry_release(&p, req->r_old_dentry, req->r_old_dentry_dir, mds, req->r_old_dentry_drop, req->r_old_dentry_unless); + if (ret < 0) + goto out_err; + releases += ret; + } if (req->r_old_inode_drop) releases += ceph_encode_inode_release(&p, d_inode(req->r_old_dentry), @@ -2684,9 +3142,9 @@ static struct ceph_msg *create_request_message(struct ceph_mds_session *session, p = msg->front.iov_base + req->r_request_release_offset; } - head->num_releases = cpu_to_le16(releases); + lhead->num_releases = cpu_to_le16(releases); - encode_timestamp_and_gids(&p, req); + encode_mclientrequest_tail(&p, req); if (WARN_ON_ONCE(p > end)) { ceph_msg_put(msg); @@ -2715,6 +3173,10 @@ out_free1: ceph_mdsc_free_path((char *)path1, pathlen1); out: return msg; +out_err: + ceph_msg_put(msg); + msg = ERR_PTR(ret); + goto out_free2; } /* @@ -2731,18 +3193,6 @@ static void complete_request(struct ceph_mds_client *mdsc, complete_all(&req->r_completion); } -static struct ceph_mds_request_head_old * -find_old_request_head(void *p, u64 features) -{ - bool legacy = !(features & CEPH_FEATURE_FS_BTIME); - struct ceph_mds_request_head *new_head; - - if (legacy) - return (struct ceph_mds_request_head_old *)p; - new_head = (struct ceph_mds_request_head *)p; - return (struct ceph_mds_request_head_old *)&new_head->oldest_client_tid; -} - /* * called under mdsc->mutex */ @@ -2752,29 +3202,29 @@ static int __prepare_send_request(struct ceph_mds_session *session, { int mds = session->s_mds; struct ceph_mds_client *mdsc = session->s_mdsc; - struct ceph_mds_request_head_old *rhead; + struct ceph_client *cl = mdsc->fsc->client; + struct ceph_mds_request_head_legacy *lhead; + struct ceph_mds_request_head *nhead; struct ceph_msg *msg; - int flags = 0, max_retry; + int flags = 0, old_max_retry; + bool old_version = !test_bit(CEPHFS_FEATURE_32BITS_RETRY_FWD, + &session->s_features); /* - * The type of 'r_attempts' in kernel 'ceph_mds_request' - * is 'int', while in 'ceph_mds_request_head' the type of - * 'num_retry' is '__u8'. So in case the request retries - * exceeding 256 times, the MDS will receive a incorrect - * retry seq. - * - * In this case it's ususally a bug in MDS and continue - * retrying the request makes no sense. - * - * In future this could be fixed in ceph code, so avoid - * using the hardcode here. + * Avoid inifinite retrying after overflow. The client will + * increase the retry count and if the MDS is old version, + * so we limit to retry at most 256 times. */ - max_retry = sizeof_field(struct ceph_mds_request_head, num_retry); - max_retry = 1 << (max_retry * BITS_PER_BYTE); - if (req->r_attempts >= max_retry) { - pr_warn_ratelimited("%s request tid %llu seq overflow\n", - __func__, req->r_tid); - return -EMULTIHOP; + if (req->r_attempts) { + old_max_retry = sizeof_field(struct ceph_mds_request_head_old, + num_retry); + old_max_retry = 1 << (old_max_retry * BITS_PER_BYTE); + if ((old_version && req->r_attempts >= old_max_retry) || + ((uint32_t)req->r_attempts >= U32_MAX)) { + pr_warn_ratelimited_client(cl, "request tid %llu seq overflow\n", + req->r_tid); + return -EMULTIHOP; + } } req->r_attempts++; @@ -2787,8 +3237,8 @@ static int __prepare_send_request(struct ceph_mds_session *session, else req->r_sent_on_mseq = -1; } - dout("%s %p tid %lld %s (attempt %d)\n", __func__, req, - req->r_tid, ceph_mds_op_name(req->r_op), req->r_attempts); + doutc(cl, "%p tid %lld %s (attempt %d)\n", req, req->r_tid, + ceph_mds_op_name(req->r_op), req->r_attempts); if (test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags)) { void *p; @@ -2800,23 +3250,27 @@ static int __prepare_send_request(struct ceph_mds_session *session, * d_move mangles the src name. */ msg = req->r_request; - rhead = find_old_request_head(msg->front.iov_base, - session->s_con.peer_features); + lhead = find_legacy_request_head(msg->front.iov_base, + session->s_con.peer_features); - flags = le32_to_cpu(rhead->flags); + flags = le32_to_cpu(lhead->flags); flags |= CEPH_MDS_FLAG_REPLAY; - rhead->flags = cpu_to_le32(flags); + lhead->flags = cpu_to_le32(flags); if (req->r_target_inode) - rhead->ino = cpu_to_le64(ceph_ino(req->r_target_inode)); + lhead->ino = cpu_to_le64(ceph_ino(req->r_target_inode)); - rhead->num_retry = req->r_attempts - 1; + lhead->num_retry = req->r_attempts - 1; + if (!old_version) { + nhead = (struct ceph_mds_request_head*)msg->front.iov_base; + nhead->ext_num_retry = cpu_to_le32(req->r_attempts - 1); + } /* remove cap/dentry releases from message */ - rhead->num_releases = 0; + lhead->num_releases = 0; p = msg->front.iov_base + req->r_request_release_offset; - encode_timestamp_and_gids(&p, req); + encode_mclientrequest_tail(&p, req); msg->front.iov_len = p - msg->front.iov_base; msg->hdr.front_len = cpu_to_le32(msg->front.iov_len); @@ -2834,20 +3288,25 @@ static int __prepare_send_request(struct ceph_mds_session *session, } req->r_request = msg; - rhead = find_old_request_head(msg->front.iov_base, - session->s_con.peer_features); - rhead->oldest_client_tid = cpu_to_le64(__get_oldest_tid(mdsc)); + lhead = find_legacy_request_head(msg->front.iov_base, + session->s_con.peer_features); + lhead->oldest_client_tid = cpu_to_le64(__get_oldest_tid(mdsc)); if (test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags)) flags |= CEPH_MDS_FLAG_REPLAY; if (test_bit(CEPH_MDS_R_ASYNC, &req->r_req_flags)) flags |= CEPH_MDS_FLAG_ASYNC; if (req->r_parent) flags |= CEPH_MDS_FLAG_WANT_DENTRY; - rhead->flags = cpu_to_le32(flags); - rhead->num_fwd = req->r_num_fwd; - rhead->num_retry = req->r_attempts - 1; + lhead->flags = cpu_to_le32(flags); + lhead->num_fwd = req->r_num_fwd; + lhead->num_retry = req->r_attempts - 1; + if (!old_version) { + nhead = (struct ceph_mds_request_head*)msg->front.iov_base; + nhead->ext_num_fwd = cpu_to_le32(req->r_num_fwd); + nhead->ext_num_retry = cpu_to_le32(req->r_attempts - 1); + } - dout(" r_parent = %p\n", req->r_parent); + doutc(cl, " r_parent = %p\n", req->r_parent); return 0; } @@ -2875,6 +3334,7 @@ static int __send_request(struct ceph_mds_session *session, static void __do_request(struct ceph_mds_client *mdsc, struct ceph_mds_request *req) { + struct ceph_client *cl = mdsc->fsc->client; struct ceph_mds_session *session = NULL; int mds = -1; int err = 0; @@ -2887,29 +3347,29 @@ static void __do_request(struct ceph_mds_client *mdsc, } if (READ_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_FENCE_IO) { - dout("do_request metadata corrupted\n"); + doutc(cl, "metadata corrupted\n"); err = -EIO; goto finish; } if (req->r_timeout && time_after_eq(jiffies, req->r_started + req->r_timeout)) { - dout("do_request timed out\n"); + doutc(cl, "timed out\n"); err = -ETIMEDOUT; goto finish; } if (READ_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) { - dout("do_request forced umount\n"); + doutc(cl, "forced umount\n"); err = -EIO; goto finish; } if (READ_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_MOUNTING) { if (mdsc->mdsmap_err) { err = mdsc->mdsmap_err; - dout("do_request mdsmap err %d\n", err); + doutc(cl, "mdsmap err %d\n", err); goto finish; } if (mdsc->mdsmap->m_epoch == 0) { - dout("do_request no mdsmap, waiting for map\n"); + doutc(cl, "no mdsmap, waiting for map\n"); list_add(&req->r_wait, &mdsc->waiting_for_map); return; } @@ -2930,7 +3390,7 @@ static void __do_request(struct ceph_mds_client *mdsc, err = -EJUKEBOX; goto finish; } - dout("do_request no mds or not active, waiting for map\n"); + doutc(cl, "no mds or not active, waiting for map\n"); list_add(&req->r_wait, &mdsc->waiting_for_map); return; } @@ -2946,8 +3406,8 @@ static void __do_request(struct ceph_mds_client *mdsc, } req->r_session = ceph_get_mds_session(session); - dout("do_request mds%d session %p state %s\n", mds, session, - ceph_session_state_name(session->s_state)); + doutc(cl, "mds%d session %p state %s\n", mds, session, + ceph_session_state_name(session->s_state)); /* * The old ceph will crash the MDSs when see unknown OPs @@ -3038,8 +3498,8 @@ static void __do_request(struct ceph_mds_client *mdsc, spin_lock(&ci->i_ceph_lock); cap = ci->i_auth_cap; if (ci->i_ceph_flags & CEPH_I_ASYNC_CREATE && mds != cap->mds) { - dout("do_request session changed for auth cap %d -> %d\n", - cap->session->s_mds, session->s_mds); + doutc(cl, "session changed for auth cap %d -> %d\n", + cap->session->s_mds, session->s_mds); /* Remove the auth cap from old session */ spin_lock(&cap->session->s_cap_lock); @@ -3066,7 +3526,7 @@ out_session: ceph_put_mds_session(session); finish: if (err) { - dout("__do_request early error %d\n", err); + doutc(cl, "early error %d\n", err); req->r_err = err; complete_request(mdsc, req); __unregister_request(mdsc, req); @@ -3080,6 +3540,7 @@ finish: static void __wake_requests(struct ceph_mds_client *mdsc, struct list_head *head) { + struct ceph_client *cl = mdsc->fsc->client; struct ceph_mds_request *req; LIST_HEAD(tmp_list); @@ -3089,7 +3550,8 @@ static void __wake_requests(struct ceph_mds_client *mdsc, req = list_entry(tmp_list.next, struct ceph_mds_request, r_wait); list_del_init(&req->r_wait); - dout(" wake request %p tid %llu\n", req, req->r_tid); + doutc(cl, " wake request %p tid %llu\n", req, + req->r_tid); __do_request(mdsc, req); } } @@ -3100,10 +3562,11 @@ static void __wake_requests(struct ceph_mds_client *mdsc, */ static void kick_requests(struct ceph_mds_client *mdsc, int mds) { + struct ceph_client *cl = mdsc->fsc->client; struct ceph_mds_request *req; struct rb_node *p = rb_first(&mdsc->request_tree); - dout("kick_requests mds%d\n", mds); + doutc(cl, "kick_requests mds%d\n", mds); while (p) { req = rb_entry(p, struct ceph_mds_request, r_node); p = rb_next(p); @@ -3113,7 +3576,7 @@ static void kick_requests(struct ceph_mds_client *mdsc, int mds) continue; /* only new requests */ if (req->r_session && req->r_session->s_mds == mds) { - dout(" kicking tid %llu\n", req->r_tid); + doutc(cl, " kicking tid %llu\n", req->r_tid); list_del_init(&req->r_wait); __do_request(mdsc, req); } @@ -3123,6 +3586,7 @@ static void kick_requests(struct ceph_mds_client *mdsc, int mds) int ceph_mdsc_submit_request(struct ceph_mds_client *mdsc, struct inode *dir, struct ceph_mds_request *req) { + struct ceph_client *cl = mdsc->fsc->client; int err = 0; /* take CAP_PIN refs for r_inode, r_parent, r_old_dentry */ @@ -3144,8 +3608,7 @@ int ceph_mdsc_submit_request(struct ceph_mds_client *mdsc, struct inode *dir, if (req->r_inode) { err = ceph_wait_on_async_create(req->r_inode); if (err) { - dout("%s: wait for async create returned: %d\n", - __func__, err); + doutc(cl, "wait for async create returned: %d\n", err); return err; } } @@ -3153,13 +3616,12 @@ int ceph_mdsc_submit_request(struct ceph_mds_client *mdsc, struct inode *dir, if (!err && req->r_old_inode) { err = ceph_wait_on_async_create(req->r_old_inode); if (err) { - dout("%s: wait for async create returned: %d\n", - __func__, err); + doutc(cl, "wait for async create returned: %d\n", err); return err; } } - dout("submit_request on %p for inode %p\n", req, dir); + doutc(cl, "submit_request on %p for inode %p\n", req, dir); mutex_lock(&mdsc->mutex); __register_request(mdsc, req, dir); __do_request(mdsc, req); @@ -3172,10 +3634,11 @@ int ceph_mdsc_wait_request(struct ceph_mds_client *mdsc, struct ceph_mds_request *req, ceph_mds_request_wait_callback_t wait_func) { + struct ceph_client *cl = mdsc->fsc->client; int err; /* wait */ - dout("do_request waiting\n"); + doutc(cl, "do_request waiting\n"); if (wait_func) { err = wait_func(mdsc, req); } else { @@ -3189,14 +3652,14 @@ int ceph_mdsc_wait_request(struct ceph_mds_client *mdsc, else err = timeleft; /* killed */ } - dout("do_request waited, got %d\n", err); + doutc(cl, "do_request waited, got %d\n", err); mutex_lock(&mdsc->mutex); /* only abort if we didn't race with a real reply */ if (test_bit(CEPH_MDS_R_GOT_RESULT, &req->r_req_flags)) { err = le32_to_cpu(req->r_reply_info.head->result); } else if (err < 0) { - dout("aborted request %lld with %d\n", req->r_tid, err); + doutc(cl, "aborted request %lld with %d\n", req->r_tid, err); /* * ensure we aren't running concurrently with @@ -3227,15 +3690,16 @@ int ceph_mdsc_do_request(struct ceph_mds_client *mdsc, struct inode *dir, struct ceph_mds_request *req) { + struct ceph_client *cl = mdsc->fsc->client; int err; - dout("do_request on %p\n", req); + doutc(cl, "do_request on %p\n", req); /* issue */ err = ceph_mdsc_submit_request(mdsc, dir, req); if (!err) err = ceph_mdsc_wait_request(mdsc, req, NULL); - dout("do_request %p done, result %d\n", req, err); + doutc(cl, "do_request %p done, result %d\n", req, err); return err; } @@ -3247,8 +3711,10 @@ void ceph_invalidate_dir_request(struct ceph_mds_request *req) { struct inode *dir = req->r_parent; struct inode *old_dir = req->r_old_dentry_dir; + struct ceph_client *cl = req->r_mdsc->fsc->client; - dout("invalidate_dir_request %p %p (complete, lease(s))\n", dir, old_dir); + doutc(cl, "invalidate_dir_request %p %p (complete, lease(s))\n", + dir, old_dir); ceph_dir_clear_complete(dir); if (old_dir) @@ -3269,6 +3735,7 @@ void ceph_invalidate_dir_request(struct ceph_mds_request *req) static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg) { struct ceph_mds_client *mdsc = session->s_mdsc; + struct ceph_client *cl = mdsc->fsc->client; struct ceph_mds_request *req; struct ceph_mds_reply_head *head = msg->front.iov_base; struct ceph_mds_reply_info_parsed *rinfo; /* parsed reply info */ @@ -3279,7 +3746,7 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg) bool close_sessions = false; if (msg->front.iov_len < sizeof(*head)) { - pr_err("mdsc_handle_reply got corrupt (short) reply\n"); + pr_err_client(cl, "got corrupt (short) reply\n"); ceph_msg_dump(msg); return; } @@ -3289,17 +3756,17 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg) mutex_lock(&mdsc->mutex); req = lookup_get_request(mdsc, tid); if (!req) { - dout("handle_reply on unknown tid %llu\n", tid); + doutc(cl, "on unknown tid %llu\n", tid); mutex_unlock(&mdsc->mutex); return; } - dout("handle_reply %p\n", req); + doutc(cl, "handle_reply %p\n", req); /* correct session? */ if (req->r_session != session) { - pr_err("mdsc_handle_reply got %llu on session mds%d" - " not mds%d\n", tid, session->s_mds, - req->r_session ? req->r_session->s_mds : -1); + pr_err_client(cl, "got %llu on session mds%d not mds%d\n", + tid, session->s_mds, + req->r_session ? req->r_session->s_mds : -1); mutex_unlock(&mdsc->mutex); goto out; } @@ -3307,14 +3774,14 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg) /* dup? */ if ((test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags) && !head->safe) || (test_bit(CEPH_MDS_R_GOT_SAFE, &req->r_req_flags) && head->safe)) { - pr_warn("got a dup %s reply on %llu from mds%d\n", - head->safe ? "safe" : "unsafe", tid, mds); + pr_warn_client(cl, "got a dup %s reply on %llu from mds%d\n", + head->safe ? "safe" : "unsafe", tid, mds); mutex_unlock(&mdsc->mutex); goto out; } if (test_bit(CEPH_MDS_R_GOT_SAFE, &req->r_req_flags)) { - pr_warn("got unsafe after safe on %llu from mds%d\n", - tid, mds); + pr_warn_client(cl, "got unsafe after safe on %llu from mds%d\n", + tid, mds); mutex_unlock(&mdsc->mutex); goto out; } @@ -3337,7 +3804,7 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg) * response. And even if it did, there is nothing * useful we could do with a revised return value. */ - dout("got safe reply %llu, mds%d\n", tid, mds); + doutc(cl, "got safe reply %llu, mds%d\n", tid, mds); mutex_unlock(&mdsc->mutex); goto out; @@ -3347,23 +3814,36 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg) list_add_tail(&req->r_unsafe_item, &req->r_session->s_unsafe); } - dout("handle_reply tid %lld result %d\n", tid, result); - rinfo = &req->r_reply_info; + doutc(cl, "tid %lld result %d\n", tid, result); if (test_bit(CEPHFS_FEATURE_REPLY_ENCODING, &session->s_features)) - err = parse_reply_info(session, msg, rinfo, (u64)-1); + err = parse_reply_info(session, msg, req, (u64)-1); else - err = parse_reply_info(session, msg, rinfo, session->s_con.peer_features); + err = parse_reply_info(session, msg, req, + session->s_con.peer_features); mutex_unlock(&mdsc->mutex); /* Must find target inode outside of mutexes to avoid deadlocks */ + rinfo = &req->r_reply_info; if ((err >= 0) && rinfo->head->is_target) { - struct inode *in; + struct inode *in = xchg(&req->r_new_inode, NULL); struct ceph_vino tvino = { .ino = le64_to_cpu(rinfo->targeti.in->ino), .snap = le64_to_cpu(rinfo->targeti.in->snapid) }; - in = ceph_get_inode(mdsc->fsc->sb, tvino); + /* + * If we ended up opening an existing inode, discard + * r_new_inode + */ + if (req->r_op == CEPH_MDS_OP_CREATE && + !req->r_reply_info.has_create_ino) { + /* This should never happen on an async create */ + WARN_ON_ONCE(req->r_deleg_ino); + iput(in); + in = NULL; + } + + in = ceph_get_inode(mdsc->fsc->sb, tvino, in); if (IS_ERR(in)) { err = PTR_ERR(in); mutex_lock(&session->s_mutex); @@ -3374,7 +3854,8 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg) mutex_lock(&session->s_mutex); if (err < 0) { - pr_err("mdsc_handle_reply got corrupt reply mds%d(tid:%lld)\n", mds, tid); + pr_err_client(cl, "got corrupt reply mds%d(tid:%lld)\n", + mds, tid); ceph_msg_dump(msg); goto out_err; } @@ -3406,7 +3887,7 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg) if (err == 0) { if (result == 0 && (req->r_op == CEPH_MDS_OP_READDIR || req->r_op == CEPH_MDS_OP_LSSNAP)) - ceph_readdir_prepopulate(req, req->r_session); + err = ceph_readdir_prepopulate(req, req->r_session); } current->journal_info = NULL; mutex_unlock(&req->r_fill_mutex); @@ -3438,7 +3919,7 @@ out_err: set_bit(CEPH_MDS_R_GOT_RESULT, &req->r_req_flags); } } else { - dout("reply arrived after request %lld was aborted\n", tid); + doutc(cl, "reply arrived after request %lld was aborted\n", tid); } mutex_unlock(&mdsc->mutex); @@ -3467,6 +3948,7 @@ static void handle_forward(struct ceph_mds_client *mdsc, struct ceph_mds_session *session, struct ceph_msg *msg) { + struct ceph_client *cl = mdsc->fsc->client; struct ceph_mds_request *req; u64 tid = le64_to_cpu(msg->hdr.tid); u32 next_mds; @@ -3484,43 +3966,32 @@ static void handle_forward(struct ceph_mds_client *mdsc, req = lookup_get_request(mdsc, tid); if (!req) { mutex_unlock(&mdsc->mutex); - dout("forward tid %llu to mds%d - req dne\n", tid, next_mds); + doutc(cl, "forward tid %llu to mds%d - req dne\n", tid, next_mds); return; /* dup reply? */ } if (test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags)) { - dout("forward tid %llu aborted, unregistering\n", tid); + doutc(cl, "forward tid %llu aborted, unregistering\n", tid); __unregister_request(mdsc, req); - } else if (fwd_seq <= req->r_num_fwd) { + } else if (fwd_seq <= req->r_num_fwd || (uint32_t)fwd_seq >= U32_MAX) { /* - * The type of 'num_fwd' in ceph 'MClientRequestForward' - * is 'int32_t', while in 'ceph_mds_request_head' the - * type is '__u8'. So in case the request bounces between - * MDSes exceeding 256 times, the client will get stuck. - * - * In this case it's ususally a bug in MDS and continue - * bouncing the request makes no sense. + * Avoid inifinite retrying after overflow. * - * In future this could be fixed in ceph code, so avoid - * using the hardcode here. + * The MDS will increase the fwd count and in client side + * if the num_fwd is less than the one saved in request + * that means the MDS is an old version and overflowed of + * 8 bits. */ - int max = sizeof_field(struct ceph_mds_request_head, num_fwd); - max = 1 << (max * BITS_PER_BYTE); - if (req->r_num_fwd >= max) { - mutex_lock(&req->r_fill_mutex); - req->r_err = -EMULTIHOP; - set_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags); - mutex_unlock(&req->r_fill_mutex); - aborted = true; - pr_warn_ratelimited("forward tid %llu seq overflow\n", - tid); - } else { - dout("forward tid %llu to mds%d - old seq %d <= %d\n", - tid, next_mds, req->r_num_fwd, fwd_seq); - } + mutex_lock(&req->r_fill_mutex); + req->r_err = -EMULTIHOP; + set_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags); + mutex_unlock(&req->r_fill_mutex); + aborted = true; + pr_warn_ratelimited_client(cl, "forward tid %llu seq overflow\n", + tid); } else { /* resend. forward race not possible; mds would drop */ - dout("forward tid %llu to mds%d (we resend)\n", tid, next_mds); + doutc(cl, "forward tid %llu to mds%d (we resend)\n", tid, next_mds); BUG_ON(req->r_err); BUG_ON(test_bit(CEPH_MDS_R_GOT_RESULT, &req->r_req_flags)); req->r_attempts = 0; @@ -3538,7 +4009,7 @@ static void handle_forward(struct ceph_mds_client *mdsc, return; bad: - pr_err("mdsc_handle_forward decode error err=%d\n", err); + pr_err_client(cl, "decode error err=%d\n", err); ceph_msg_dump(msg); } @@ -3577,6 +4048,7 @@ static void handle_session(struct ceph_mds_session *session, struct ceph_msg *msg) { struct ceph_mds_client *mdsc = session->s_mdsc; + struct ceph_client *cl = mdsc->fsc->client; int mds = session->s_mds; int msg_version = le16_to_cpu(msg->hdr.version); void *p = msg->front.iov_base; @@ -3624,7 +4096,8 @@ static void handle_session(struct ceph_mds_session *session, /* version >= 5, flags */ ceph_decode_32_safe(&p, end, flags, bad); if (flags & CEPH_SESSION_BLOCKLISTED) { - pr_warn("mds%d session blocklisted\n", session->s_mds); + pr_warn_client(cl, "mds%d session blocklisted\n", + session->s_mds); blocklisted = true; } } @@ -3640,22 +4113,24 @@ static void handle_session(struct ceph_mds_session *session, mutex_lock(&session->s_mutex); - dout("handle_session mds%d %s %p state %s seq %llu\n", - mds, ceph_session_op_name(op), session, - ceph_session_state_name(session->s_state), seq); + doutc(cl, "mds%d %s %p state %s seq %llu\n", mds, + ceph_session_op_name(op), session, + ceph_session_state_name(session->s_state), seq); if (session->s_state == CEPH_MDS_SESSION_HUNG) { session->s_state = CEPH_MDS_SESSION_OPEN; - pr_info("mds%d came back\n", session->s_mds); + pr_info_client(cl, "mds%d came back\n", session->s_mds); } switch (op) { case CEPH_SESSION_OPEN: if (session->s_state == CEPH_MDS_SESSION_RECONNECTING) - pr_info("mds%d reconnect success\n", session->s_mds); + pr_info_client(cl, "mds%d reconnect success\n", + session->s_mds); if (session->s_state == CEPH_MDS_SESSION_OPEN) { - pr_notice("mds%d is already opened\n", session->s_mds); + pr_notice_client(cl, "mds%d is already opened\n", + session->s_mds); } else { session->s_state = CEPH_MDS_SESSION_OPEN; session->s_features = features; @@ -3685,7 +4160,8 @@ static void handle_session(struct ceph_mds_session *session, case CEPH_SESSION_CLOSE: if (session->s_state == CEPH_MDS_SESSION_RECONNECTING) - pr_info("mds%d reconnect denied\n", session->s_mds); + pr_info_client(cl, "mds%d reconnect denied\n", + session->s_mds); session->s_state = CEPH_MDS_SESSION_CLOSED; cleanup_session_requests(mdsc, session); remove_session_caps(session); @@ -3694,8 +4170,8 @@ static void handle_session(struct ceph_mds_session *session, break; case CEPH_SESSION_STALE: - pr_info("mds%d caps went stale, renewing\n", - session->s_mds); + pr_info_client(cl, "mds%d caps went stale, renewing\n", + session->s_mds); atomic_inc(&session->s_cap_gen); session->s_cap_ttl = jiffies - 1; send_renew_caps(mdsc, session); @@ -3716,7 +4192,7 @@ static void handle_session(struct ceph_mds_session *session, break; case CEPH_SESSION_FORCE_RO: - dout("force_session_readonly %p\n", session); + doutc(cl, "force_session_readonly %p\n", session); spin_lock(&session->s_cap_lock); session->s_readonly = true; spin_unlock(&session->s_cap_lock); @@ -3725,7 +4201,8 @@ static void handle_session(struct ceph_mds_session *session, case CEPH_SESSION_REJECT: WARN_ON(session->s_state != CEPH_MDS_SESSION_OPENING); - pr_info("mds%d rejected session\n", session->s_mds); + pr_info_client(cl, "mds%d rejected session\n", + session->s_mds); session->s_state = CEPH_MDS_SESSION_REJECTED; cleanup_session_requests(mdsc, session); remove_session_caps(session); @@ -3735,7 +4212,7 @@ static void handle_session(struct ceph_mds_session *session, break; default: - pr_err("mdsc_handle_session bad op %d mds%d\n", op, mds); + pr_err_client(cl, "bad op %d mds%d\n", op, mds); WARN_ON(1); } @@ -3752,30 +4229,32 @@ static void handle_session(struct ceph_mds_session *session, return; bad: - pr_err("mdsc_handle_session corrupt message mds%d len %d\n", mds, - (int)msg->front.iov_len); + pr_err_client(cl, "corrupt message mds%d len %d\n", mds, + (int)msg->front.iov_len); ceph_msg_dump(msg); return; } void ceph_mdsc_release_dir_caps(struct ceph_mds_request *req) { + struct ceph_client *cl = req->r_mdsc->fsc->client; int dcaps; dcaps = xchg(&req->r_dir_caps, 0); if (dcaps) { - dout("releasing r_dir_caps=%s\n", ceph_cap_string(dcaps)); + doutc(cl, "releasing r_dir_caps=%s\n", ceph_cap_string(dcaps)); ceph_put_cap_refs(ceph_inode(req->r_parent), dcaps); } } void ceph_mdsc_release_dir_caps_no_check(struct ceph_mds_request *req) { + struct ceph_client *cl = req->r_mdsc->fsc->client; int dcaps; dcaps = xchg(&req->r_dir_caps, 0); if (dcaps) { - dout("releasing r_dir_caps=%s\n", ceph_cap_string(dcaps)); + doutc(cl, "releasing r_dir_caps=%s\n", ceph_cap_string(dcaps)); ceph_put_cap_refs_no_check_caps(ceph_inode(req->r_parent), dcaps); } @@ -3790,7 +4269,7 @@ static void replay_unsafe_requests(struct ceph_mds_client *mdsc, struct ceph_mds_request *req, *nreq; struct rb_node *p; - dout("replay_unsafe_requests mds%d\n", session->s_mds); + doutc(mdsc->fsc->client, "mds%d\n", session->s_mds); mutex_lock(&mdsc->mutex); list_for_each_entry_safe(req, nreq, &session->s_unsafe, r_unsafe_item) @@ -3934,6 +4413,8 @@ out_unlock: */ static int reconnect_caps_cb(struct inode *inode, int mds, void *arg) { + struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb); + struct ceph_client *cl = ceph_inode_to_client(inode); union { struct ceph_mds_cap_reconnect v2; struct ceph_mds_cap_reconnect_v1 v1; @@ -3951,7 +4432,7 @@ static int reconnect_caps_cb(struct inode *inode, int mds, void *arg) dentry = d_find_primary(inode); if (dentry) { /* set pathbase to parent dir when msg_version >= 2 */ - path = ceph_mdsc_build_path(dentry, &pathlen, &pathbase, + path = ceph_mdsc_build_path(mdsc, dentry, &pathlen, &pathbase, recon_state->msg_version >= 2); dput(dentry); if (IS_ERR(path)) { @@ -3970,9 +4451,9 @@ static int reconnect_caps_cb(struct inode *inode, int mds, void *arg) err = 0; goto out_err; } - dout(" adding %p ino %llx.%llx cap %p %lld %s\n", - inode, ceph_vinop(inode), cap, cap->cap_id, - ceph_cap_string(cap->issued)); + doutc(cl, " adding %p ino %llx.%llx cap %p %lld %s\n", inode, + ceph_vinop(inode), cap, cap->cap_id, + ceph_cap_string(cap->issued)); cap->seq = 0; /* reset cap seq */ cap->issue_seq = 0; /* and issue_seq */ @@ -3997,12 +4478,16 @@ static int reconnect_caps_cb(struct inode *inode, int mds, void *arg) rec.v2.flock_len = (__force __le32) ((ci->i_ceph_flags & CEPH_I_ERROR_FILELOCK) ? 0 : 1); } else { + struct timespec64 ts; + rec.v1.cap_id = cpu_to_le64(cap->cap_id); rec.v1.wanted = cpu_to_le32(__ceph_caps_wanted(ci)); rec.v1.issued = cpu_to_le32(cap->issued); rec.v1.size = cpu_to_le64(i_size_read(inode)); - ceph_encode_timespec64(&rec.v1.mtime, &inode->i_mtime); - ceph_encode_timespec64(&rec.v1.atime, &inode->i_atime); + ts = inode_get_mtime(inode); + ceph_encode_timespec64(&rec.v1.mtime, &ts); + ts = inode_get_atime(inode); + ceph_encode_timespec64(&rec.v1.atime, &ts); rec.v1.snaprealm = cpu_to_le64(ci->i_snap_realm->ino); rec.v1.pathbase = cpu_to_le64(pathbase); } @@ -4122,6 +4607,7 @@ static int encode_snap_realms(struct ceph_mds_client *mdsc, { struct rb_node *p; struct ceph_pagelist *pagelist = recon_state->pagelist; + struct ceph_client *cl = mdsc->fsc->client; int err = 0; if (recon_state->msg_version >= 4) { @@ -4160,8 +4646,8 @@ static int encode_snap_realms(struct ceph_mds_client *mdsc, ceph_pagelist_encode_32(pagelist, sizeof(sr_rec)); } - dout(" adding snap realm %llx seq %lld parent %llx\n", - realm->ino, realm->seq, realm->parent_ino); + doutc(cl, " adding snap realm %llx seq %lld parent %llx\n", + realm->ino, realm->seq, realm->parent_ino); sr_rec.ino = cpu_to_le64(realm->ino); sr_rec.seq = cpu_to_le64(realm->seq); sr_rec.parent = cpu_to_le64(realm->parent_ino); @@ -4190,6 +4676,7 @@ fail: static void send_mds_reconnect(struct ceph_mds_client *mdsc, struct ceph_mds_session *session) { + struct ceph_client *cl = mdsc->fsc->client; struct ceph_msg *reply; int mds = session->s_mds; int err = -ENOMEM; @@ -4198,7 +4685,7 @@ static void send_mds_reconnect(struct ceph_mds_client *mdsc, }; LIST_HEAD(dispose); - pr_info("mds%d reconnect start\n", mds); + pr_info_client(cl, "mds%d reconnect start\n", mds); recon_state.pagelist = ceph_pagelist_alloc(GFP_NOFS); if (!recon_state.pagelist) @@ -4214,8 +4701,8 @@ static void send_mds_reconnect(struct ceph_mds_client *mdsc, session->s_state = CEPH_MDS_SESSION_RECONNECTING; session->s_seq = 0; - dout("session %p state %s\n", session, - ceph_session_state_name(session->s_state)); + doutc(cl, "session %p state %s\n", session, + ceph_session_state_name(session->s_state)); atomic_inc(&session->s_cap_gen); @@ -4349,7 +4836,8 @@ fail: fail_nomsg: ceph_pagelist_release(recon_state.pagelist); fail_nopagelist: - pr_err("error %d preparing reconnect for mds%d\n", err, mds); + pr_err_client(cl, "error %d preparing reconnect for mds%d\n", + err, mds); return; } @@ -4368,9 +4856,9 @@ static void check_new_map(struct ceph_mds_client *mdsc, int oldstate, newstate; struct ceph_mds_session *s; unsigned long targets[DIV_ROUND_UP(CEPH_MAX_MDS, sizeof(unsigned long))] = {0}; + struct ceph_client *cl = mdsc->fsc->client; - dout("check_new_map new %u old %u\n", - newmap->m_epoch, oldmap->m_epoch); + doutc(cl, "new %u old %u\n", newmap->m_epoch, oldmap->m_epoch); if (newmap->m_info) { for (i = 0; i < newmap->possible_max_rank; i++) { @@ -4386,12 +4874,12 @@ static void check_new_map(struct ceph_mds_client *mdsc, oldstate = ceph_mdsmap_get_state(oldmap, i); newstate = ceph_mdsmap_get_state(newmap, i); - dout("check_new_map mds%d state %s%s -> %s%s (session %s)\n", - i, ceph_mds_state_name(oldstate), - ceph_mdsmap_is_laggy(oldmap, i) ? " (laggy)" : "", - ceph_mds_state_name(newstate), - ceph_mdsmap_is_laggy(newmap, i) ? " (laggy)" : "", - ceph_session_state_name(s->s_state)); + doutc(cl, "mds%d state %s%s -> %s%s (session %s)\n", + i, ceph_mds_state_name(oldstate), + ceph_mdsmap_is_laggy(oldmap, i) ? " (laggy)" : "", + ceph_mds_state_name(newstate), + ceph_mdsmap_is_laggy(newmap, i) ? " (laggy)" : "", + ceph_session_state_name(s->s_state)); if (i >= newmap->possible_max_rank) { /* force close session for stopped mds */ @@ -4444,7 +4932,8 @@ static void check_new_map(struct ceph_mds_client *mdsc, newstate >= CEPH_MDS_STATE_ACTIVE) { if (oldstate != CEPH_MDS_STATE_CREATING && oldstate != CEPH_MDS_STATE_STARTING) - pr_info("mds%d recovery completed\n", s->s_mds); + pr_info_client(cl, "mds%d recovery completed\n", + s->s_mds); kick_requests(mdsc, i); mutex_unlock(&mdsc->mutex); mutex_lock(&s->s_mutex); @@ -4488,12 +4977,13 @@ static void check_new_map(struct ceph_mds_client *mdsc, s = __open_export_target_session(mdsc, i); if (IS_ERR(s)) { err = PTR_ERR(s); - pr_err("failed to open export target session, err %d\n", - err); + pr_err_client(cl, + "failed to open export target session, err %d\n", + err); continue; } } - dout("send reconnect to export target mds.%d\n", i); + doutc(cl, "send reconnect to export target mds.%d\n", i); mutex_unlock(&mdsc->mutex); send_mds_reconnect(mdsc, s); ceph_put_mds_session(s); @@ -4509,8 +4999,7 @@ static void check_new_map(struct ceph_mds_client *mdsc, if (s->s_state == CEPH_MDS_SESSION_OPEN || s->s_state == CEPH_MDS_SESSION_HUNG || s->s_state == CEPH_MDS_SESSION_CLOSING) { - dout(" connecting to export targets of laggy mds%d\n", - i); + doutc(cl, " connecting to export targets of laggy mds%d\n", i); __open_export_target_sessions(mdsc, s); } } @@ -4537,6 +5026,7 @@ static void handle_lease(struct ceph_mds_client *mdsc, struct ceph_mds_session *session, struct ceph_msg *msg) { + struct ceph_client *cl = mdsc->fsc->client; struct super_block *sb = mdsc->fsc->sb; struct inode *inode; struct dentry *parent, *dentry; @@ -4548,7 +5038,10 @@ static void handle_lease(struct ceph_mds_client *mdsc, struct qstr dname; int release = 0; - dout("handle_lease from mds%d\n", mds); + doutc(cl, "from mds%d\n", mds); + + if (!ceph_inc_mds_stopping_blocker(mdsc, session)) + return; /* decode */ if (msg->front.iov_len < sizeof(*h) + sizeof(u32)) @@ -4563,22 +5056,19 @@ static void handle_lease(struct ceph_mds_client *mdsc, /* lookup inode */ inode = ceph_find_inode(sb, vino); - dout("handle_lease %s, ino %llx %p %.*s\n", - ceph_lease_op_name(h->action), vino.ino, inode, - dname.len, dname.name); + doutc(cl, "%s, ino %llx %p %.*s\n", ceph_lease_op_name(h->action), + vino.ino, inode, dname.len, dname.name); mutex_lock(&session->s_mutex); - inc_session_sequence(session); - if (!inode) { - dout("handle_lease no inode %llx\n", vino.ino); + doutc(cl, "no inode %llx\n", vino.ino); goto release; } /* dentry */ parent = d_find_alias(inode); if (!parent) { - dout("no parent dentry on inode %p\n", inode); + doutc(cl, "no parent dentry on inode %p\n", inode); WARN_ON(1); goto release; /* hrm... */ } @@ -4631,10 +5121,14 @@ release: out: mutex_unlock(&session->s_mutex); iput(inode); + + ceph_dec_mds_stopping_blocker(mdsc); return; bad: - pr_err("corrupt lease message\n"); + ceph_dec_mds_stopping_blocker(mdsc); + + pr_err_client(cl, "corrupt lease message\n"); ceph_msg_dump(msg); } @@ -4642,13 +5136,14 @@ void ceph_mdsc_lease_send_msg(struct ceph_mds_session *session, struct dentry *dentry, char action, u32 seq) { + struct ceph_client *cl = session->s_mdsc->fsc->client; struct ceph_msg *msg; struct ceph_mds_lease *lease; struct inode *dir; int len = sizeof(*lease) + sizeof(u32) + NAME_MAX; - dout("lease_send_msg identry %p %s to mds%d\n", - dentry, ceph_lease_op_name(action), session->s_mds); + doutc(cl, "identry %p %s to mds%d\n", dentry, ceph_lease_op_name(action), + session->s_mds); msg = ceph_msg_new(CEPH_MSG_CLIENT_LEASE, len, GFP_NOFS, false); if (!msg) @@ -4681,6 +5176,7 @@ static void lock_unlock_session(struct ceph_mds_session *s) static void maybe_recover_session(struct ceph_mds_client *mdsc) { + struct ceph_client *cl = mdsc->fsc->client; struct ceph_fs_client *fsc = mdsc->fsc; if (!ceph_test_mount_opt(fsc, CLEANRECOVER)) @@ -4692,17 +5188,19 @@ static void maybe_recover_session(struct ceph_mds_client *mdsc) if (!READ_ONCE(fsc->blocklisted)) return; - pr_info("auto reconnect after blocklisted\n"); + pr_info_client(cl, "auto reconnect after blocklisted\n"); ceph_force_reconnect(fsc->sb); } bool check_session_state(struct ceph_mds_session *s) { + struct ceph_client *cl = s->s_mdsc->fsc->client; + switch (s->s_state) { case CEPH_MDS_SESSION_OPEN: if (s->s_ttl && time_after(jiffies, s->s_ttl)) { s->s_state = CEPH_MDS_SESSION_HUNG; - pr_info("mds%d hung\n", s->s_mds); + pr_info_client(cl, "mds%d hung\n", s->s_mds); } break; case CEPH_MDS_SESSION_CLOSING: @@ -4722,6 +5220,8 @@ bool check_session_state(struct ceph_mds_session *s) */ void inc_session_sequence(struct ceph_mds_session *s) { + struct ceph_client *cl = s->s_mdsc->fsc->client; + lockdep_assert_held(&s->s_mutex); s->s_seq++; @@ -4729,11 +5229,11 @@ void inc_session_sequence(struct ceph_mds_session *s) if (s->s_state == CEPH_MDS_SESSION_CLOSING) { int ret; - dout("resending session close request for mds%d\n", s->s_mds); + doutc(cl, "resending session close request for mds%d\n", s->s_mds); ret = request_close_session(s); if (ret < 0) - pr_err("unable to close session to mds%d: %d\n", - s->s_mds, ret); + pr_err_client(cl, "unable to close session to mds%d: %d\n", + s->s_mds, ret); } } @@ -4762,7 +5262,7 @@ static void delayed_work(struct work_struct *work) int renew_caps; int i; - dout("mdsc delayed_work\n"); + doutc(mdsc->fsc->client, "mdsc delayed_work\n"); if (mdsc->stopping >= CEPH_MDSC_STOPPING_FLUSHED) return; @@ -4829,6 +5329,9 @@ int ceph_mdsc_init(struct ceph_fs_client *fsc) } init_completion(&mdsc->safe_umount_waiters); + spin_lock_init(&mdsc->stopping_lock); + atomic_set(&mdsc->stopping_blockers, 0); + init_completion(&mdsc->stopping_waiter); init_waitqueue_head(&mdsc->session_close_wq); INIT_LIST_HEAD(&mdsc->waiting_for_map); mdsc->quotarealms_inodes = RB_ROOT; @@ -4888,6 +5391,7 @@ err_mdsc: */ static void wait_requests(struct ceph_mds_client *mdsc) { + struct ceph_client *cl = mdsc->fsc->client; struct ceph_options *opts = mdsc->fsc->client->options; struct ceph_mds_request *req; @@ -4895,25 +5399,25 @@ static void wait_requests(struct ceph_mds_client *mdsc) if (__get_oldest_req(mdsc)) { mutex_unlock(&mdsc->mutex); - dout("wait_requests waiting for requests\n"); + doutc(cl, "waiting for requests\n"); wait_for_completion_timeout(&mdsc->safe_umount_waiters, ceph_timeout_jiffies(opts->mount_timeout)); /* tear down remaining requests */ mutex_lock(&mdsc->mutex); while ((req = __get_oldest_req(mdsc))) { - dout("wait_requests timed out on tid %llu\n", - req->r_tid); + doutc(cl, "timed out on tid %llu\n", req->r_tid); list_del_init(&req->r_wait); __unregister_request(mdsc, req); } } mutex_unlock(&mdsc->mutex); - dout("wait_requests done\n"); + doutc(cl, "done\n"); } void send_flush_mdlog(struct ceph_mds_session *s) { + struct ceph_client *cl = s->s_mdsc->fsc->client; struct ceph_msg *msg; /* @@ -4923,13 +5427,13 @@ void send_flush_mdlog(struct ceph_mds_session *s) return; mutex_lock(&s->s_mutex); - dout("request mdlog flush to mds%d (%s)s seq %lld\n", s->s_mds, - ceph_session_state_name(s->s_state), s->s_seq); + doutc(cl, "request mdlog flush to mds%d (%s)s seq %lld\n", + s->s_mds, ceph_session_state_name(s->s_state), s->s_seq); msg = ceph_create_session_msg(CEPH_SESSION_REQUEST_FLUSH_MDLOG, s->s_seq); if (!msg) { - pr_err("failed to request mdlog flush to mds%d (%s) seq %lld\n", - s->s_mds, ceph_session_state_name(s->s_state), s->s_seq); + pr_err_client(cl, "failed to request mdlog flush to mds%d (%s) seq %lld\n", + s->s_mds, ceph_session_state_name(s->s_state), s->s_seq); } else { ceph_con_send(&s->s_con, msg); } @@ -4942,7 +5446,7 @@ void send_flush_mdlog(struct ceph_mds_session *s) */ void ceph_mdsc_pre_umount(struct ceph_mds_client *mdsc) { - dout("pre_umount\n"); + doutc(mdsc->fsc->client, "begin\n"); mdsc->stopping = CEPH_MDSC_STOPPING_BEGIN; ceph_mdsc_iterate_sessions(mdsc, send_flush_mdlog, true); @@ -4957,6 +5461,7 @@ void ceph_mdsc_pre_umount(struct ceph_mds_client *mdsc) ceph_msgr_flush(); ceph_cleanup_quotarealms_inodes(mdsc); + doutc(mdsc->fsc->client, "done\n"); } /* @@ -4965,12 +5470,13 @@ void ceph_mdsc_pre_umount(struct ceph_mds_client *mdsc) static void flush_mdlog_and_wait_mdsc_unsafe_requests(struct ceph_mds_client *mdsc, u64 want_tid) { + struct ceph_client *cl = mdsc->fsc->client; struct ceph_mds_request *req = NULL, *nextreq; struct ceph_mds_session *last_session = NULL; struct rb_node *n; mutex_lock(&mdsc->mutex); - dout("%s want %lld\n", __func__, want_tid); + doutc(cl, "want %lld\n", want_tid); restart: req = __get_oldest_req(mdsc); while (req && req->r_tid <= want_tid) { @@ -5004,8 +5510,8 @@ restart: } else { ceph_put_mds_session(s); } - dout("%s wait on %llu (want %llu)\n", __func__, - req->r_tid, want_tid); + doutc(cl, "wait on %llu (want %llu)\n", + req->r_tid, want_tid); wait_for_completion(&req->r_safe_completion); mutex_lock(&mdsc->mutex); @@ -5023,17 +5529,18 @@ restart: } mutex_unlock(&mdsc->mutex); ceph_put_mds_session(last_session); - dout("%s done\n", __func__); + doutc(cl, "done\n"); } void ceph_mdsc_sync(struct ceph_mds_client *mdsc) { + struct ceph_client *cl = mdsc->fsc->client; u64 want_tid, want_flush; if (READ_ONCE(mdsc->fsc->mount_state) >= CEPH_MOUNT_SHUTDOWN) return; - dout("sync\n"); + doutc(cl, "sync\n"); mutex_lock(&mdsc->mutex); want_tid = mdsc->last_tid; mutex_unlock(&mdsc->mutex); @@ -5049,8 +5556,7 @@ void ceph_mdsc_sync(struct ceph_mds_client *mdsc) } spin_unlock(&mdsc->cap_dirty_lock); - dout("sync want tid %lld flush_seq %lld\n", - want_tid, want_flush); + doutc(cl, "sync want tid %lld flush_seq %lld\n", want_tid, want_flush); flush_mdlog_and_wait_mdsc_unsafe_requests(mdsc, want_tid); wait_caps_flush(mdsc, want_flush); @@ -5072,11 +5578,12 @@ static bool done_closing_sessions(struct ceph_mds_client *mdsc, int skipped) void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc) { struct ceph_options *opts = mdsc->fsc->client->options; + struct ceph_client *cl = mdsc->fsc->client; struct ceph_mds_session *session; int i; int skipped = 0; - dout("close_sessions\n"); + doutc(cl, "begin\n"); /* close sessions */ mutex_lock(&mdsc->mutex); @@ -5094,7 +5601,7 @@ void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc) } mutex_unlock(&mdsc->mutex); - dout("waiting for sessions to close\n"); + doutc(cl, "waiting for sessions to close\n"); wait_event_timeout(mdsc->session_close_wq, done_closing_sessions(mdsc, skipped), ceph_timeout_jiffies(opts->mount_timeout)); @@ -5122,7 +5629,7 @@ void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc) cancel_work_sync(&mdsc->cap_reclaim_work); cancel_delayed_work_sync(&mdsc->delayed_work); /* cancel timer */ - dout("stopped\n"); + doutc(cl, "done\n"); } void ceph_mdsc_force_umount(struct ceph_mds_client *mdsc) @@ -5130,7 +5637,7 @@ void ceph_mdsc_force_umount(struct ceph_mds_client *mdsc) struct ceph_mds_session *session; int mds; - dout("force umount\n"); + doutc(mdsc->fsc->client, "force umount\n"); mutex_lock(&mdsc->mutex); for (mds = 0; mds < mdsc->max_sessions; mds++) { @@ -5161,7 +5668,7 @@ void ceph_mdsc_force_umount(struct ceph_mds_client *mdsc) static void ceph_mdsc_stop(struct ceph_mds_client *mdsc) { - dout("stop\n"); + doutc(mdsc->fsc->client, "stop\n"); /* * Make sure the delayed work stopped before releasing * the resources. @@ -5182,7 +5689,7 @@ static void ceph_mdsc_stop(struct ceph_mds_client *mdsc) void ceph_mdsc_destroy(struct ceph_fs_client *fsc) { struct ceph_mds_client *mdsc = fsc->mdsc; - dout("mdsc_destroy %p\n", mdsc); + doutc(fsc->client, "%p\n", mdsc); if (!mdsc) return; @@ -5196,12 +5703,13 @@ void ceph_mdsc_destroy(struct ceph_fs_client *fsc) fsc->mdsc = NULL; kfree(mdsc); - dout("mdsc_destroy %p done\n", mdsc); + doutc(fsc->client, "%p done\n", mdsc); } void ceph_mdsc_handle_fsmap(struct ceph_mds_client *mdsc, struct ceph_msg *msg) { struct ceph_fs_client *fsc = mdsc->fsc; + struct ceph_client *cl = fsc->client; const char *mds_namespace = fsc->mount_options->mds_namespace; void *p = msg->front.iov_base; void *end = p + msg->front.iov_len; @@ -5213,7 +5721,7 @@ void ceph_mdsc_handle_fsmap(struct ceph_mds_client *mdsc, struct ceph_msg *msg) ceph_decode_need(&p, end, sizeof(u32), bad); epoch = ceph_decode_32(&p); - dout("handle_fsmap epoch %u\n", epoch); + doutc(cl, "epoch %u\n", epoch); /* struct_v, struct_cv, map_len, epoch, legacy_client_fscid */ ceph_decode_skip_n(&p, end, 2 + sizeof(u32) * 3, bad); @@ -5258,7 +5766,8 @@ void ceph_mdsc_handle_fsmap(struct ceph_mds_client *mdsc, struct ceph_msg *msg) return; bad: - pr_err("error decoding fsmap %d. Shutting down mount.\n", err); + pr_err_client(cl, "error decoding fsmap %d. Shutting down mount.\n", + err); ceph_umount_begin(mdsc->fsc->sb); ceph_msg_dump(msg); err_out: @@ -5273,6 +5782,7 @@ err_out: */ void ceph_mdsc_handle_mdsmap(struct ceph_mds_client *mdsc, struct ceph_msg *msg) { + struct ceph_client *cl = mdsc->fsc->client; u32 epoch; u32 maplen; void *p = msg->front.iov_base; @@ -5287,18 +5797,17 @@ void ceph_mdsc_handle_mdsmap(struct ceph_mds_client *mdsc, struct ceph_msg *msg) return; epoch = ceph_decode_32(&p); maplen = ceph_decode_32(&p); - dout("handle_map epoch %u len %d\n", epoch, (int)maplen); + doutc(cl, "epoch %u len %d\n", epoch, (int)maplen); /* do we need it? */ mutex_lock(&mdsc->mutex); if (mdsc->mdsmap && epoch <= mdsc->mdsmap->m_epoch) { - dout("handle_map epoch %u <= our %u\n", - epoch, mdsc->mdsmap->m_epoch); + doutc(cl, "epoch %u <= our %u\n", epoch, mdsc->mdsmap->m_epoch); mutex_unlock(&mdsc->mutex); return; } - newmap = ceph_mdsmap_decode(&p, end, ceph_msgr2(mdsc->fsc->client)); + newmap = ceph_mdsmap_decode(mdsc, &p, end, ceph_msgr2(mdsc->fsc->client)); if (IS_ERR(newmap)) { err = PTR_ERR(newmap); goto bad_unlock; @@ -5327,7 +5836,8 @@ void ceph_mdsc_handle_mdsmap(struct ceph_mds_client *mdsc, struct ceph_msg *msg) bad_unlock: mutex_unlock(&mdsc->mutex); bad: - pr_err("error decoding mdsmap %d. Shutting down mount.\n", err); + pr_err_client(cl, "error decoding mdsmap %d. Shutting down mount.\n", + err); ceph_umount_begin(mdsc->fsc->sb); ceph_msg_dump(msg); return; @@ -5358,7 +5868,8 @@ static void mds_peer_reset(struct ceph_connection *con) struct ceph_mds_session *s = con->private; struct ceph_mds_client *mdsc = s->s_mdsc; - pr_warn("mds%d closed our session\n", s->s_mds); + pr_warn_client(mdsc->fsc->client, "mds%d closed our session\n", + s->s_mds); if (READ_ONCE(mdsc->fsc->mount_state) != CEPH_MOUNT_FENCE_IO) send_mds_reconnect(mdsc, s); } @@ -5367,6 +5878,7 @@ static void mds_dispatch(struct ceph_connection *con, struct ceph_msg *msg) { struct ceph_mds_session *s = con->private; struct ceph_mds_client *mdsc = s->s_mdsc; + struct ceph_client *cl = mdsc->fsc->client; int type = le16_to_cpu(msg->hdr.type); mutex_lock(&mdsc->mutex); @@ -5406,8 +5918,8 @@ static void mds_dispatch(struct ceph_connection *con, struct ceph_msg *msg) break; default: - pr_err("received unknown message type %d %s\n", type, - ceph_msg_type_name(type)); + pr_err_client(cl, "received unknown message type %d %s\n", + type, ceph_msg_type_name(type)); } out: ceph_msg_put(msg); diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h index 86d2965e68a1..2e6ddaa13d72 100644 --- a/fs/ceph/mds_client.h +++ b/fs/ceph/mds_client.h @@ -14,9 +14,9 @@ #include <linux/ceph/types.h> #include <linux/ceph/messenger.h> -#include <linux/ceph/mdsmap.h> #include <linux/ceph/auth.h> +#include "mdsmap.h" #include "metric.h" #include "super.h" @@ -32,8 +32,11 @@ enum ceph_feature_type { CEPHFS_FEATURE_ALTERNATE_NAME, CEPHFS_FEATURE_NOTIFY_SESSION_STATE, CEPHFS_FEATURE_OP_GETVXATTR, + CEPHFS_FEATURE_32BITS_RETRY_FWD, + CEPHFS_FEATURE_NEW_SNAPREALM_INFO, + CEPHFS_FEATURE_HAS_OWNER_UIDGID, - CEPHFS_FEATURE_MAX = CEPHFS_FEATURE_OP_GETVXATTR, + CEPHFS_FEATURE_MAX = CEPHFS_FEATURE_HAS_OWNER_UIDGID, }; #define CEPHFS_FEATURES_CLIENT_SUPPORTED { \ @@ -44,8 +47,11 @@ enum ceph_feature_type { CEPHFS_FEATURE_MULTI_RECONNECT, \ CEPHFS_FEATURE_DELEG_INO, \ CEPHFS_FEATURE_METRIC_COLLECT, \ + CEPHFS_FEATURE_ALTERNATE_NAME, \ CEPHFS_FEATURE_NOTIFY_SESSION_STATE, \ CEPHFS_FEATURE_OP_GETVXATTR, \ + CEPHFS_FEATURE_32BITS_RETRY_FWD, \ + CEPHFS_FEATURE_HAS_OWNER_UIDGID, \ } /* @@ -86,13 +92,19 @@ struct ceph_mds_reply_info_in { s32 dir_pin; struct ceph_timespec btime; struct ceph_timespec snap_btime; + u8 *fscrypt_auth; + u8 *fscrypt_file; + u32 fscrypt_auth_len; + u32 fscrypt_file_len; u64 rsnaps; u64 change_attr; }; struct ceph_mds_reply_dir_entry { + bool is_nokey; char *name; u32 name_len; + u32 raw_hash; struct ceph_mds_reply_lease *lease; struct ceph_mds_reply_info_in inode; loff_t offset; @@ -116,7 +128,9 @@ struct ceph_mds_reply_info_parsed { struct ceph_mds_reply_info_in diri, targeti; struct ceph_mds_reply_dirfrag *dirfrag; char *dname; + u8 *altname; u32 dname_len; + u32 altname_len; struct ceph_mds_reply_lease *dlease; struct ceph_mds_reply_xattr xattr_info; @@ -263,6 +277,7 @@ struct ceph_mds_request { struct inode *r_parent; /* parent dir inode */ struct inode *r_target_inode; /* resulting inode */ + struct inode *r_new_inode; /* new inode (for creates) */ #define CEPH_MDS_R_DIRECT_IS_HASH (1) /* r_direct_hash is valid */ #define CEPH_MDS_R_ABORTED (2) /* call was aborted */ @@ -272,14 +287,23 @@ struct ceph_mds_request { #define CEPH_MDS_R_DID_PREPOPULATE (6) /* prepopulated readdir */ #define CEPH_MDS_R_PARENT_LOCKED (7) /* is r_parent->i_rwsem wlocked? */ #define CEPH_MDS_R_ASYNC (8) /* async request */ +#define CEPH_MDS_R_FSCRYPT_FILE (9) /* must marshal fscrypt_file field */ unsigned long r_req_flags; struct mutex r_fill_mutex; union ceph_mds_request_args r_args; + + struct ceph_fscrypt_auth *r_fscrypt_auth; + u64 r_fscrypt_file; + + u8 *r_altname; /* fscrypt binary crypttext for long filenames */ + u32 r_altname_len; /* length of r_altname */ + int r_fmode; /* file mode, if expecting cap */ int r_request_release_offset; const struct cred *r_cred; + struct mnt_idmap *r_mnt_idmap; struct timespec64 r_stamp; /* for choosing which mds to send this request to */ @@ -381,8 +405,9 @@ struct cap_wait { }; enum { - CEPH_MDSC_STOPPING_BEGIN = 1, - CEPH_MDSC_STOPPING_FLUSHED = 2, + CEPH_MDSC_STOPPING_BEGIN = 1, + CEPH_MDSC_STOPPING_FLUSHING = 2, + CEPH_MDSC_STOPPING_FLUSHED = 3, }; /* @@ -401,7 +426,11 @@ struct ceph_mds_client { struct ceph_mds_session **sessions; /* NULL for mds if no session */ atomic_t num_sessions; int max_sessions; /* len of sessions array */ - int stopping; /* true if shutting down */ + + spinlock_t stopping_lock; /* protect snap_empty */ + int stopping; /* the stage of shutting down */ + atomic_t stopping_blockers; + struct completion stopping_waiter; atomic64_t quotarealms_count; /* # realms with quota */ /* @@ -556,8 +585,9 @@ static inline void ceph_mdsc_free_path(char *path, int len) __putname(path - (PATH_MAX - 1 - len)); } -extern char *ceph_mdsc_build_path(struct dentry *dentry, int *plen, u64 *base, - int stop_on_nosnap); +extern char *ceph_mdsc_build_path(struct ceph_mds_client *mdsc, + struct dentry *dentry, int *plen, u64 *base, + int for_wire); extern void __ceph_mdsc_drop_dentry_lease(struct dentry *dentry); extern void ceph_mdsc_lease_send_msg(struct ceph_mds_session *session, @@ -589,4 +619,6 @@ static inline int ceph_wait_on_async_create(struct inode *inode) extern int ceph_wait_on_conflict_unlink(struct dentry *dentry); extern u64 ceph_get_deleg_ino(struct ceph_mds_session *session); extern int ceph_restore_deleg_ino(struct ceph_mds_session *session, u64 ino); + +extern bool enable_unsafe_idmap; #endif diff --git a/fs/ceph/mdsmap.c b/fs/ceph/mdsmap.c index 7dac21ee6ce7..fae97c25ce58 100644 --- a/fs/ceph/mdsmap.c +++ b/fs/ceph/mdsmap.c @@ -7,10 +7,11 @@ #include <linux/slab.h> #include <linux/types.h> -#include <linux/ceph/mdsmap.h> #include <linux/ceph/messenger.h> #include <linux/ceph/decode.h> +#include "mdsmap.h" +#include "mds_client.h" #include "super.h" #define CEPH_MDS_IS_READY(i, ignore_laggy) \ @@ -114,8 +115,10 @@ bad: * Ignore any fields we don't care about (there are quite a few of * them). */ -struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end, bool msgr2) +struct ceph_mdsmap *ceph_mdsmap_decode(struct ceph_mds_client *mdsc, void **p, + void *end, bool msgr2) { + struct ceph_client *cl = mdsc->fsc->client; struct ceph_mdsmap *m; const void *start = *p; int i, j, n; @@ -233,20 +236,18 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end, bool msgr2) *p = info_end; } - dout("mdsmap_decode %d/%d %lld mds%d.%d %s %s%s\n", - i+1, n, global_id, mds, inc, - ceph_pr_addr(&addr), - ceph_mds_state_name(state), - laggy ? "(laggy)" : ""); + doutc(cl, "%d/%d %lld mds%d.%d %s %s%s\n", i+1, n, global_id, + mds, inc, ceph_pr_addr(&addr), + ceph_mds_state_name(state), laggy ? "(laggy)" : ""); if (mds < 0 || mds >= m->possible_max_rank) { - pr_warn("mdsmap_decode got incorrect mds(%d)\n", mds); + pr_warn_client(cl, "got incorrect mds(%d)\n", mds); continue; } if (state <= 0) { - dout("mdsmap_decode got incorrect state(%s)\n", - ceph_mds_state_name(state)); + doutc(cl, "got incorrect state(%s)\n", + ceph_mds_state_name(state)); continue; } @@ -385,16 +386,16 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end, bool msgr2) m->m_max_xattr_size = 0; } bad_ext: - dout("mdsmap_decode m_enabled: %d, m_damaged: %d, m_num_laggy: %d\n", - !!m->m_enabled, !!m->m_damaged, m->m_num_laggy); + doutc(cl, "m_enabled: %d, m_damaged: %d, m_num_laggy: %d\n", + !!m->m_enabled, !!m->m_damaged, m->m_num_laggy); *p = end; - dout("mdsmap_decode success epoch %u\n", m->m_epoch); + doutc(cl, "success epoch %u\n", m->m_epoch); return m; nomem: err = -ENOMEM; goto out_err; corrupt: - pr_err("corrupt mdsmap\n"); + pr_err_client(cl, "corrupt mdsmap\n"); print_hex_dump(KERN_DEBUG, "mdsmap: ", DUMP_PREFIX_OFFSET, 16, 1, start, end - start, true); diff --git a/fs/ceph/mdsmap.h b/fs/ceph/mdsmap.h new file mode 100644 index 000000000000..89f1931f1ba6 --- /dev/null +++ b/fs/ceph/mdsmap.h @@ -0,0 +1,75 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _FS_CEPH_MDSMAP_H +#define _FS_CEPH_MDSMAP_H + +#include <linux/bug.h> +#include <linux/ceph/types.h> + +struct ceph_mds_client; + +/* + * mds map - describe servers in the mds cluster. + * + * we limit fields to those the client actually xcares about + */ +struct ceph_mds_info { + u64 global_id; + struct ceph_entity_addr addr; + s32 state; + int num_export_targets; + bool laggy; + u32 *export_targets; +}; + +struct ceph_mdsmap { + u32 m_epoch, m_client_epoch, m_last_failure; + u32 m_root; + u32 m_session_timeout; /* seconds */ + u32 m_session_autoclose; /* seconds */ + u64 m_max_file_size; + u64 m_max_xattr_size; /* maximum size for xattrs blob */ + u32 m_max_mds; /* expected up:active mds number */ + u32 m_num_active_mds; /* actual up:active mds number */ + u32 possible_max_rank; /* possible max rank index */ + struct ceph_mds_info *m_info; + + /* which object pools file data can be stored in */ + int m_num_data_pg_pools; + u64 *m_data_pg_pools; + u64 m_cas_pg_pool; + + bool m_enabled; + bool m_damaged; + int m_num_laggy; +}; + +static inline struct ceph_entity_addr * +ceph_mdsmap_get_addr(struct ceph_mdsmap *m, int w) +{ + if (w >= m->possible_max_rank) + return NULL; + return &m->m_info[w].addr; +} + +static inline int ceph_mdsmap_get_state(struct ceph_mdsmap *m, int w) +{ + BUG_ON(w < 0); + if (w >= m->possible_max_rank) + return CEPH_MDS_STATE_DNE; + return m->m_info[w].state; +} + +static inline bool ceph_mdsmap_is_laggy(struct ceph_mdsmap *m, int w) +{ + if (w >= 0 && w < m->possible_max_rank) + return m->m_info[w].laggy; + return false; +} + +extern int ceph_mdsmap_get_random_mds(struct ceph_mdsmap *m); +struct ceph_mdsmap *ceph_mdsmap_decode(struct ceph_mds_client *mdsc, void **p, + void *end, bool msgr2); +extern void ceph_mdsmap_destroy(struct ceph_mdsmap *m); +extern bool ceph_mdsmap_is_cluster_available(struct ceph_mdsmap *m); + +#endif diff --git a/fs/ceph/metric.c b/fs/ceph/metric.c index 6d3584f16f9a..871c1090e520 100644 --- a/fs/ceph/metric.c +++ b/fs/ceph/metric.c @@ -31,6 +31,7 @@ static bool ceph_mdsc_send_metrics(struct ceph_mds_client *mdsc, struct ceph_client_metric *m = &mdsc->metric; u64 nr_caps = atomic64_read(&m->total_caps); u32 header_len = sizeof(struct ceph_metric_header); + struct ceph_client *cl = mdsc->fsc->client; struct ceph_msg *msg; s64 sum; s32 items = 0; @@ -51,8 +52,8 @@ static bool ceph_mdsc_send_metrics(struct ceph_mds_client *mdsc, msg = ceph_msg_new(CEPH_MSG_CLIENT_METRICS, len, GFP_NOFS, true); if (!msg) { - pr_err("send metrics to mds%d, failed to allocate message\n", - s->s_mds); + pr_err_client(cl, "to mds%d, failed to allocate message\n", + s->s_mds); return false; } diff --git a/fs/ceph/quota.c b/fs/ceph/quota.c index 64592adfe48f..9d36c3532de1 100644 --- a/fs/ceph/quota.c +++ b/fs/ceph/quota.c @@ -43,29 +43,28 @@ void ceph_handle_quota(struct ceph_mds_client *mdsc, { struct super_block *sb = mdsc->fsc->sb; struct ceph_mds_quota *h = msg->front.iov_base; + struct ceph_client *cl = mdsc->fsc->client; struct ceph_vino vino; struct inode *inode; struct ceph_inode_info *ci; + if (!ceph_inc_mds_stopping_blocker(mdsc, session)) + return; + if (msg->front.iov_len < sizeof(*h)) { - pr_err("%s corrupt message mds%d len %d\n", __func__, - session->s_mds, (int)msg->front.iov_len); + pr_err_client(cl, "corrupt message mds%d len %d\n", + session->s_mds, (int)msg->front.iov_len); ceph_msg_dump(msg); - return; + goto out; } - /* increment msg sequence number */ - mutex_lock(&session->s_mutex); - inc_session_sequence(session); - mutex_unlock(&session->s_mutex); - /* lookup inode */ vino.ino = le64_to_cpu(h->ino); vino.snap = CEPH_NOSNAP; inode = ceph_find_inode(sb, vino); if (!inode) { - pr_warn("Failed to find inode %llu\n", vino.ino); - return; + pr_warn_client(cl, "failed to find inode %llx\n", vino.ino); + goto out; } ci = ceph_inode(inode); @@ -78,6 +77,8 @@ void ceph_handle_quota(struct ceph_mds_client *mdsc, spin_unlock(&ci->i_ceph_lock); iput(inode); +out: + ceph_dec_mds_stopping_blocker(mdsc); } static struct ceph_quotarealm_inode * @@ -85,6 +86,7 @@ find_quotarealm_inode(struct ceph_mds_client *mdsc, u64 ino) { struct ceph_quotarealm_inode *qri = NULL; struct rb_node **node, *parent = NULL; + struct ceph_client *cl = mdsc->fsc->client; mutex_lock(&mdsc->quotarealms_inodes_mutex); node = &(mdsc->quotarealms_inodes.rb_node); @@ -110,7 +112,7 @@ find_quotarealm_inode(struct ceph_mds_client *mdsc, u64 ino) rb_link_node(&qri->node, parent, node); rb_insert_color(&qri->node, &mdsc->quotarealms_inodes); } else - pr_warn("Failed to alloc quotarealms_inode\n"); + pr_warn_client(cl, "Failed to alloc quotarealms_inode\n"); } mutex_unlock(&mdsc->quotarealms_inodes_mutex); @@ -129,6 +131,7 @@ static struct inode *lookup_quotarealm_inode(struct ceph_mds_client *mdsc, struct super_block *sb, struct ceph_snap_realm *realm) { + struct ceph_client *cl = mdsc->fsc->client; struct ceph_quotarealm_inode *qri; struct inode *in; @@ -161,8 +164,8 @@ static struct inode *lookup_quotarealm_inode(struct ceph_mds_client *mdsc, } if (IS_ERR(in)) { - dout("Can't lookup inode %llx (err: %ld)\n", - realm->ino, PTR_ERR(in)); + doutc(cl, "Can't lookup inode %llx (err: %ld)\n", realm->ino, + PTR_ERR(in)); qri->timeout = jiffies + msecs_to_jiffies(60 * 1000); /* XXX */ } else { qri->timeout = 0; @@ -213,6 +216,7 @@ static struct ceph_snap_realm *get_quota_realm(struct ceph_mds_client *mdsc, enum quota_get_realm which_quota, bool retry) { + struct ceph_client *cl = mdsc->fsc->client; struct ceph_inode_info *ci = NULL; struct ceph_snap_realm *realm, *next; struct inode *in; @@ -226,8 +230,9 @@ restart: if (realm) ceph_get_snap_realm(mdsc, realm); else - pr_err_ratelimited("get_quota_realm: ino (%llx.%llx) " - "null i_snap_realm\n", ceph_vinop(inode)); + pr_err_ratelimited_client(cl, + "%p %llx.%llx null i_snap_realm\n", + inode, ceph_vinop(inode)); while (realm) { bool has_inode; @@ -317,6 +322,7 @@ static bool check_quota_exceeded(struct inode *inode, enum quota_check_op op, loff_t delta) { struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb); + struct ceph_client *cl = mdsc->fsc->client; struct ceph_inode_info *ci; struct ceph_snap_realm *realm, *next; struct inode *in; @@ -332,8 +338,9 @@ restart: if (realm) ceph_get_snap_realm(mdsc, realm); else - pr_err_ratelimited("check_quota_exceeded: ino (%llx.%llx) " - "null i_snap_realm\n", ceph_vinop(inode)); + pr_err_ratelimited_client(cl, + "%p %llx.%llx null i_snap_realm\n", + inode, ceph_vinop(inode)); while (realm) { bool has_inode; @@ -383,7 +390,7 @@ restart: break; default: /* Shouldn't happen */ - pr_warn("Invalid quota check op (%d)\n", op); + pr_warn_client(cl, "Invalid quota check op (%d)\n", op); exceeded = true; /* Just break the loop */ } iput(in); diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c index 343d738448dc..c65f2b202b2b 100644 --- a/fs/ceph/snap.c +++ b/fs/ceph/snap.c @@ -138,7 +138,7 @@ static struct ceph_snap_realm *ceph_create_snap_realm( __insert_snap_realm(&mdsc->snap_realms, realm); mdsc->num_snap_realms++; - dout("%s %llx %p\n", __func__, realm->ino, realm); + doutc(mdsc->fsc->client, "%llx %p\n", realm->ino, realm); return realm; } @@ -150,6 +150,7 @@ static struct ceph_snap_realm *ceph_create_snap_realm( static struct ceph_snap_realm *__lookup_snap_realm(struct ceph_mds_client *mdsc, u64 ino) { + struct ceph_client *cl = mdsc->fsc->client; struct rb_node *n = mdsc->snap_realms.rb_node; struct ceph_snap_realm *r; @@ -162,7 +163,7 @@ static struct ceph_snap_realm *__lookup_snap_realm(struct ceph_mds_client *mdsc, else if (ino > r->ino) n = n->rb_right; else { - dout("%s %llx %p\n", __func__, r->ino, r); + doutc(cl, "%llx %p\n", r->ino, r); return r; } } @@ -188,9 +189,10 @@ static void __put_snap_realm(struct ceph_mds_client *mdsc, static void __destroy_snap_realm(struct ceph_mds_client *mdsc, struct ceph_snap_realm *realm) { + struct ceph_client *cl = mdsc->fsc->client; lockdep_assert_held_write(&mdsc->snap_rwsem); - dout("%s %p %llx\n", __func__, realm, realm->ino); + doutc(cl, "%p %llx\n", realm, realm->ino); rb_erase(&realm->node, &mdsc->snap_realms); mdsc->num_snap_realms--; @@ -290,6 +292,7 @@ static int adjust_snap_realm_parent(struct ceph_mds_client *mdsc, struct ceph_snap_realm *realm, u64 parentino) { + struct ceph_client *cl = mdsc->fsc->client; struct ceph_snap_realm *parent; lockdep_assert_held_write(&mdsc->snap_rwsem); @@ -303,8 +306,8 @@ static int adjust_snap_realm_parent(struct ceph_mds_client *mdsc, if (IS_ERR(parent)) return PTR_ERR(parent); } - dout("%s %llx %p: %llx %p -> %llx %p\n", __func__, realm->ino, - realm, realm->parent_ino, realm->parent, parentino, parent); + doutc(cl, "%llx %p: %llx %p -> %llx %p\n", realm->ino, realm, + realm->parent_ino, realm->parent, parentino, parent); if (realm->parent) { list_del_init(&realm->child_item); ceph_put_snap_realm(mdsc, realm->parent); @@ -329,10 +332,12 @@ static int cmpu64_rev(const void *a, const void *b) /* * build the snap context for a given realm. */ -static int build_snap_context(struct ceph_snap_realm *realm, +static int build_snap_context(struct ceph_mds_client *mdsc, + struct ceph_snap_realm *realm, struct list_head *realm_queue, struct list_head *dirty_realms) { + struct ceph_client *cl = mdsc->fsc->client; struct ceph_snap_realm *parent = realm->parent; struct ceph_snap_context *snapc; int err = 0; @@ -360,10 +365,10 @@ static int build_snap_context(struct ceph_snap_realm *realm, realm->cached_context->seq == realm->seq && (!parent || realm->cached_context->seq >= parent->cached_context->seq)) { - dout("%s %llx %p: %p seq %lld (%u snaps) (unchanged)\n", - __func__, realm->ino, realm, realm->cached_context, - realm->cached_context->seq, - (unsigned int)realm->cached_context->num_snaps); + doutc(cl, "%llx %p: %p seq %lld (%u snaps) (unchanged)\n", + realm->ino, realm, realm->cached_context, + realm->cached_context->seq, + (unsigned int)realm->cached_context->num_snaps); return 0; } @@ -400,8 +405,8 @@ static int build_snap_context(struct ceph_snap_realm *realm, sort(snapc->snaps, num, sizeof(u64), cmpu64_rev, NULL); snapc->num_snaps = num; - dout("%s %llx %p: %p seq %lld (%u snaps)\n", __func__, realm->ino, - realm, snapc, snapc->seq, (unsigned int) snapc->num_snaps); + doutc(cl, "%llx %p: %p seq %lld (%u snaps)\n", realm->ino, realm, + snapc, snapc->seq, (unsigned int) snapc->num_snaps); ceph_put_snap_context(realm->cached_context); realm->cached_context = snapc; @@ -418,16 +423,18 @@ fail: ceph_put_snap_context(realm->cached_context); realm->cached_context = NULL; } - pr_err("%s %llx %p fail %d\n", __func__, realm->ino, realm, err); + pr_err_client(cl, "%llx %p fail %d\n", realm->ino, realm, err); return err; } /* * rebuild snap context for the given realm and all of its children. */ -static void rebuild_snap_realms(struct ceph_snap_realm *realm, +static void rebuild_snap_realms(struct ceph_mds_client *mdsc, + struct ceph_snap_realm *realm, struct list_head *dirty_realms) { + struct ceph_client *cl = mdsc->fsc->client; LIST_HEAD(realm_queue); int last = 0; bool skip = false; @@ -451,9 +458,10 @@ static void rebuild_snap_realms(struct ceph_snap_realm *realm, continue; } - last = build_snap_context(_realm, &realm_queue, dirty_realms); - dout("%s %llx %p, %s\n", __func__, _realm->ino, _realm, - last > 0 ? "is deferred" : !last ? "succeeded" : "failed"); + last = build_snap_context(mdsc, _realm, &realm_queue, + dirty_realms); + doutc(cl, "%llx %p, %s\n", realm->ino, realm, + last > 0 ? "is deferred" : !last ? "succeeded" : "failed"); /* is any child in the list ? */ list_for_each_entry(child, &_realm->children, child_item) { @@ -523,6 +531,7 @@ static void ceph_queue_cap_snap(struct ceph_inode_info *ci, struct ceph_cap_snap **pcapsnap) { struct inode *inode = &ci->netfs.inode; + struct ceph_client *cl = ceph_inode_to_client(inode); struct ceph_snap_context *old_snapc, *new_snapc; struct ceph_cap_snap *capsnap = *pcapsnap; struct ceph_buffer *old_blob = NULL; @@ -548,14 +557,14 @@ static void ceph_queue_cap_snap(struct ceph_inode_info *ci, as no new writes are allowed to start when pending, so any writes in progress now were started before the previous cap_snap. lucky us. */ - dout("%s %p %llx.%llx already pending\n", - __func__, inode, ceph_vinop(inode)); + doutc(cl, "%p %llx.%llx already pending\n", inode, + ceph_vinop(inode)); goto update_snapc; } if (ci->i_wrbuffer_ref_head == 0 && !(dirty & (CEPH_CAP_ANY_EXCL|CEPH_CAP_FILE_WR))) { - dout("%s %p %llx.%llx nothing dirty|writing\n", - __func__, inode, ceph_vinop(inode)); + doutc(cl, "%p %llx.%llx nothing dirty|writing\n", inode, + ceph_vinop(inode)); goto update_snapc; } @@ -575,15 +584,15 @@ static void ceph_queue_cap_snap(struct ceph_inode_info *ci, } else { if (!(used & CEPH_CAP_FILE_WR) && ci->i_wrbuffer_ref_head == 0) { - dout("%s %p %llx.%llx no new_snap|dirty_page|writing\n", - __func__, inode, ceph_vinop(inode)); + doutc(cl, "%p %llx.%llx no new_snap|dirty_page|writing\n", + inode, ceph_vinop(inode)); goto update_snapc; } } - dout("%s %p %llx.%llx cap_snap %p queuing under %p %s %s\n", - __func__, inode, ceph_vinop(inode), capsnap, old_snapc, - ceph_cap_string(dirty), capsnap->need_flush ? "" : "no_flush"); + doutc(cl, "%p %llx.%llx cap_snap %p queuing under %p %s %s\n", + inode, ceph_vinop(inode), capsnap, old_snapc, + ceph_cap_string(dirty), capsnap->need_flush ? "" : "no_flush"); ihold(inode); capsnap->follows = old_snapc->seq; @@ -615,9 +624,9 @@ static void ceph_queue_cap_snap(struct ceph_inode_info *ci, list_add_tail(&capsnap->ci_item, &ci->i_cap_snaps); if (used & CEPH_CAP_FILE_WR) { - dout("%s %p %llx.%llx cap_snap %p snapc %p seq %llu used WR," - " now pending\n", __func__, inode, ceph_vinop(inode), - capsnap, old_snapc, old_snapc->seq); + doutc(cl, "%p %llx.%llx cap_snap %p snapc %p seq %llu used WR," + " now pending\n", inode, ceph_vinop(inode), capsnap, + old_snapc, old_snapc->seq); capsnap->writing = 1; } else { /* note mtime, size NOW. */ @@ -634,7 +643,7 @@ update_snapc: ci->i_head_snapc = NULL; } else { ci->i_head_snapc = ceph_get_snap_context(new_snapc); - dout(" new snapc is %p\n", new_snapc); + doutc(cl, " new snapc is %p\n", new_snapc); } spin_unlock(&ci->i_ceph_lock); @@ -655,23 +664,25 @@ int __ceph_finish_cap_snap(struct ceph_inode_info *ci, { struct inode *inode = &ci->netfs.inode; struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb); + struct ceph_client *cl = mdsc->fsc->client; BUG_ON(capsnap->writing); capsnap->size = i_size_read(inode); - capsnap->mtime = inode->i_mtime; - capsnap->atime = inode->i_atime; - capsnap->ctime = inode->i_ctime; + capsnap->mtime = inode_get_mtime(inode); + capsnap->atime = inode_get_atime(inode); + capsnap->ctime = inode_get_ctime(inode); capsnap->btime = ci->i_btime; capsnap->change_attr = inode_peek_iversion_raw(inode); capsnap->time_warp_seq = ci->i_time_warp_seq; capsnap->truncate_size = ci->i_truncate_size; capsnap->truncate_seq = ci->i_truncate_seq; if (capsnap->dirty_pages) { - dout("%s %p %llx.%llx cap_snap %p snapc %p %llu %s s=%llu " - "still has %d dirty pages\n", __func__, inode, - ceph_vinop(inode), capsnap, capsnap->context, - capsnap->context->seq, ceph_cap_string(capsnap->dirty), - capsnap->size, capsnap->dirty_pages); + doutc(cl, "%p %llx.%llx cap_snap %p snapc %p %llu %s " + "s=%llu still has %d dirty pages\n", inode, + ceph_vinop(inode), capsnap, capsnap->context, + capsnap->context->seq, + ceph_cap_string(capsnap->dirty), + capsnap->size, capsnap->dirty_pages); return 0; } @@ -680,20 +691,20 @@ int __ceph_finish_cap_snap(struct ceph_inode_info *ci, * And trigger to flush the buffer immediately. */ if (ci->i_wrbuffer_ref) { - dout("%s %p %llx.%llx cap_snap %p snapc %p %llu %s s=%llu " - "used WRBUFFER, delaying\n", __func__, inode, - ceph_vinop(inode), capsnap, capsnap->context, - capsnap->context->seq, ceph_cap_string(capsnap->dirty), - capsnap->size); + doutc(cl, "%p %llx.%llx cap_snap %p snapc %p %llu %s " + "s=%llu used WRBUFFER, delaying\n", inode, + ceph_vinop(inode), capsnap, capsnap->context, + capsnap->context->seq, ceph_cap_string(capsnap->dirty), + capsnap->size); ceph_queue_writeback(inode); return 0; } ci->i_ceph_flags |= CEPH_I_FLUSH_SNAPS; - dout("%s %p %llx.%llx cap_snap %p snapc %p %llu %s s=%llu\n", - __func__, inode, ceph_vinop(inode), capsnap, capsnap->context, - capsnap->context->seq, ceph_cap_string(capsnap->dirty), - capsnap->size); + doutc(cl, "%p %llx.%llx cap_snap %p snapc %p %llu %s s=%llu\n", + inode, ceph_vinop(inode), capsnap, capsnap->context, + capsnap->context->seq, ceph_cap_string(capsnap->dirty), + capsnap->size); spin_lock(&mdsc->snap_flush_lock); if (list_empty(&ci->i_snap_flush_item)) { @@ -708,13 +719,15 @@ int __ceph_finish_cap_snap(struct ceph_inode_info *ci, * Queue cap_snaps for snap writeback for this realm and its children. * Called under snap_rwsem, so realm topology won't change. */ -static void queue_realm_cap_snaps(struct ceph_snap_realm *realm) +static void queue_realm_cap_snaps(struct ceph_mds_client *mdsc, + struct ceph_snap_realm *realm) { + struct ceph_client *cl = mdsc->fsc->client; struct ceph_inode_info *ci; struct inode *lastinode = NULL; struct ceph_cap_snap *capsnap = NULL; - dout("%s %p %llx inode\n", __func__, realm, realm->ino); + doutc(cl, "%p %llx inode\n", realm, realm->ino); spin_lock(&realm->inodes_with_caps_lock); list_for_each_entry(ci, &realm->inodes_with_caps, i_snap_realm_item) { @@ -733,8 +746,9 @@ static void queue_realm_cap_snaps(struct ceph_snap_realm *realm) if (!capsnap) { capsnap = kmem_cache_zalloc(ceph_cap_snap_cachep, GFP_NOFS); if (!capsnap) { - pr_err("ENOMEM allocating ceph_cap_snap on %p\n", - inode); + pr_err_client(cl, + "ENOMEM allocating ceph_cap_snap on %p\n", + inode); return; } } @@ -752,7 +766,7 @@ static void queue_realm_cap_snaps(struct ceph_snap_realm *realm) if (capsnap) kmem_cache_free(ceph_cap_snap_cachep, capsnap); - dout("%s %p %llx done\n", __func__, realm, realm->ino); + doutc(cl, "%p %llx done\n", realm, realm->ino); } /* @@ -766,6 +780,7 @@ int ceph_update_snap_trace(struct ceph_mds_client *mdsc, void *p, void *e, bool deletion, struct ceph_snap_realm **realm_ret) { + struct ceph_client *cl = mdsc->fsc->client; struct ceph_mds_snap_realm *ri; /* encoded */ __le64 *snaps; /* encoded */ __le64 *prior_parent_snaps; /* encoded */ @@ -780,7 +795,7 @@ int ceph_update_snap_trace(struct ceph_mds_client *mdsc, lockdep_assert_held_write(&mdsc->snap_rwsem); - dout("%s deletion=%d\n", __func__, deletion); + doutc(cl, "deletion=%d\n", deletion); more: realm = NULL; rebuild_snapcs = 0; @@ -810,8 +825,8 @@ more: rebuild_snapcs += err; if (le64_to_cpu(ri->seq) > realm->seq) { - dout("%s updating %llx %p %lld -> %lld\n", __func__, - realm->ino, realm, realm->seq, le64_to_cpu(ri->seq)); + doutc(cl, "updating %llx %p %lld -> %lld\n", realm->ino, + realm, realm->seq, le64_to_cpu(ri->seq)); /* update realm parameters, snap lists */ realm->seq = le64_to_cpu(ri->seq); realm->created = le64_to_cpu(ri->created); @@ -834,16 +849,16 @@ more: rebuild_snapcs = 1; } else if (!realm->cached_context) { - dout("%s %llx %p seq %lld new\n", __func__, - realm->ino, realm, realm->seq); + doutc(cl, "%llx %p seq %lld new\n", realm->ino, realm, + realm->seq); rebuild_snapcs = 1; } else { - dout("%s %llx %p seq %lld unchanged\n", __func__, - realm->ino, realm, realm->seq); + doutc(cl, "%llx %p seq %lld unchanged\n", realm->ino, realm, + realm->seq); } - dout("done with %llx %p, rebuild_snapcs=%d, %p %p\n", realm->ino, - realm, rebuild_snapcs, p, e); + doutc(cl, "done with %llx %p, rebuild_snapcs=%d, %p %p\n", realm->ino, + realm, rebuild_snapcs, p, e); /* * this will always track the uppest parent realm from which @@ -855,7 +870,7 @@ more: /* rebuild_snapcs when we reach the _end_ (root) of the trace */ if (realm_to_rebuild && p >= e) - rebuild_snap_realms(realm_to_rebuild, &dirty_realms); + rebuild_snap_realms(mdsc, realm_to_rebuild, &dirty_realms); if (!first_realm) first_realm = realm; @@ -873,7 +888,7 @@ more: realm = list_first_entry(&dirty_realms, struct ceph_snap_realm, dirty_item); list_del_init(&realm->dirty_item); - queue_realm_cap_snaps(realm); + queue_realm_cap_snaps(mdsc, realm); } if (realm_ret) @@ -891,7 +906,7 @@ fail: ceph_put_snap_realm(mdsc, realm); if (first_realm) ceph_put_snap_realm(mdsc, first_realm); - pr_err("%s error %d\n", __func__, err); + pr_err_client(cl, "error %d\n", err); /* * When receiving a corrupted snap trace we don't know what @@ -905,11 +920,12 @@ fail: WRITE_ONCE(mdsc->fsc->mount_state, CEPH_MOUNT_FENCE_IO); ret = ceph_monc_blocklist_add(&client->monc, &client->msgr.inst.addr); if (ret) - pr_err("%s failed to blocklist %s: %d\n", __func__, - ceph_pr_addr(&client->msgr.inst.addr), ret); + pr_err_client(cl, "failed to blocklist %s: %d\n", + ceph_pr_addr(&client->msgr.inst.addr), ret); - WARN(1, "%s: %s%sdo remount to continue%s", - __func__, ret ? "" : ceph_pr_addr(&client->msgr.inst.addr), + WARN(1, "[client.%lld] %s %s%sdo remount to continue%s", + client->monc.auth->global_id, __func__, + ret ? "" : ceph_pr_addr(&client->msgr.inst.addr), ret ? "" : " was blocklisted, ", err == -EIO ? " after corrupted snaptrace is fixed" : ""); @@ -925,11 +941,12 @@ fail: */ static void flush_snaps(struct ceph_mds_client *mdsc) { + struct ceph_client *cl = mdsc->fsc->client; struct ceph_inode_info *ci; struct inode *inode; struct ceph_mds_session *session = NULL; - dout("%s\n", __func__); + doutc(cl, "begin\n"); spin_lock(&mdsc->snap_flush_lock); while (!list_empty(&mdsc->snap_flush_list)) { ci = list_first_entry(&mdsc->snap_flush_list, @@ -944,7 +961,7 @@ static void flush_snaps(struct ceph_mds_client *mdsc) spin_unlock(&mdsc->snap_flush_lock); ceph_put_mds_session(session); - dout("%s done\n", __func__); + doutc(cl, "done\n"); } /** @@ -960,7 +977,7 @@ static void flush_snaps(struct ceph_mds_client *mdsc) void ceph_change_snap_realm(struct inode *inode, struct ceph_snap_realm *realm) { struct ceph_inode_info *ci = ceph_inode(inode); - struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc; + struct ceph_mds_client *mdsc = ceph_inode_to_fs_client(inode)->mdsc; struct ceph_snap_realm *oldrealm = ci->i_snap_realm; lockdep_assert_held(&ci->i_ceph_lock); @@ -1000,6 +1017,7 @@ void ceph_handle_snap(struct ceph_mds_client *mdsc, struct ceph_mds_session *session, struct ceph_msg *msg) { + struct ceph_client *cl = mdsc->fsc->client; struct super_block *sb = mdsc->fsc->sb; int mds = session->s_mds; u64 split; @@ -1015,6 +1033,9 @@ void ceph_handle_snap(struct ceph_mds_client *mdsc, int locked_rwsem = 0; bool close_sessions = false; + if (!ceph_inc_mds_stopping_blocker(mdsc, session)) + return; + /* decode */ if (msg->front.iov_len < sizeof(*h)) goto bad; @@ -1027,12 +1048,8 @@ void ceph_handle_snap(struct ceph_mds_client *mdsc, trace_len = le32_to_cpu(h->trace_len); p += sizeof(*h); - dout("%s from mds%d op %s split %llx tracelen %d\n", __func__, - mds, ceph_snap_op_name(op), split, trace_len); - - mutex_lock(&session->s_mutex); - inc_session_sequence(session); - mutex_unlock(&session->s_mutex); + doutc(cl, "from mds%d op %s split %llx tracelen %d\n", mds, + ceph_snap_op_name(op), split, trace_len); down_write(&mdsc->snap_rwsem); locked_rwsem = 1; @@ -1063,7 +1080,7 @@ void ceph_handle_snap(struct ceph_mds_client *mdsc, goto out; } - dout("splitting snap_realm %llx %p\n", realm->ino, realm); + doutc(cl, "splitting snap_realm %llx %p\n", realm->ino, realm); for (i = 0; i < num_split_inos; i++) { struct ceph_vino vino = { .ino = le64_to_cpu(split_inos[i]), @@ -1088,13 +1105,13 @@ void ceph_handle_snap(struct ceph_mds_client *mdsc, */ if (ci->i_snap_realm->created > le64_to_cpu(ri->created)) { - dout(" leaving %p %llx.%llx in newer realm %llx %p\n", - inode, ceph_vinop(inode), ci->i_snap_realm->ino, - ci->i_snap_realm); + doutc(cl, " leaving %p %llx.%llx in newer realm %llx %p\n", + inode, ceph_vinop(inode), ci->i_snap_realm->ino, + ci->i_snap_realm); goto skip_inode; } - dout(" will move %p %llx.%llx to split realm %llx %p\n", - inode, ceph_vinop(inode), realm->ino, realm); + doutc(cl, " will move %p %llx.%llx to split realm %llx %p\n", + inode, ceph_vinop(inode), realm->ino, realm); ceph_get_snap_realm(mdsc, realm); ceph_change_snap_realm(inode, realm); @@ -1151,15 +1168,18 @@ skip_inode: up_write(&mdsc->snap_rwsem); flush_snaps(mdsc); + ceph_dec_mds_stopping_blocker(mdsc); return; bad: - pr_err("%s corrupt snap message from mds%d\n", __func__, mds); + pr_err_client(cl, "corrupt snap message from mds%d\n", mds); ceph_msg_dump(msg); out: if (locked_rwsem) up_write(&mdsc->snap_rwsem); + ceph_dec_mds_stopping_blocker(mdsc); + if (close_sessions) ceph_mdsc_close_sessions(mdsc); return; @@ -1168,6 +1188,7 @@ out: struct ceph_snapid_map* ceph_get_snapid_map(struct ceph_mds_client *mdsc, u64 snap) { + struct ceph_client *cl = mdsc->fsc->client; struct ceph_snapid_map *sm, *exist; struct rb_node **p, *parent; int ret; @@ -1190,8 +1211,8 @@ struct ceph_snapid_map* ceph_get_snapid_map(struct ceph_mds_client *mdsc, } spin_unlock(&mdsc->snapid_map_lock); if (exist) { - dout("%s found snapid map %llx -> %x\n", __func__, - exist->snap, exist->dev); + doutc(cl, "found snapid map %llx -> %x\n", exist->snap, + exist->dev); return exist; } @@ -1235,13 +1256,12 @@ struct ceph_snapid_map* ceph_get_snapid_map(struct ceph_mds_client *mdsc, if (exist) { free_anon_bdev(sm->dev); kfree(sm); - dout("%s found snapid map %llx -> %x\n", __func__, - exist->snap, exist->dev); + doutc(cl, "found snapid map %llx -> %x\n", exist->snap, + exist->dev); return exist; } - dout("%s create snapid map %llx -> %x\n", __func__, - sm->snap, sm->dev); + doutc(cl, "create snapid map %llx -> %x\n", sm->snap, sm->dev); return sm; } @@ -1266,6 +1286,7 @@ void ceph_put_snapid_map(struct ceph_mds_client* mdsc, void ceph_trim_snapid_map(struct ceph_mds_client *mdsc) { + struct ceph_client *cl = mdsc->fsc->client; struct ceph_snapid_map *sm; unsigned long now; LIST_HEAD(to_free); @@ -1287,7 +1308,7 @@ void ceph_trim_snapid_map(struct ceph_mds_client *mdsc) while (!list_empty(&to_free)) { sm = list_first_entry(&to_free, struct ceph_snapid_map, lru); list_del(&sm->lru); - dout("trim snapid map %llx -> %x\n", sm->snap, sm->dev); + doutc(cl, "trim snapid map %llx -> %x\n", sm->snap, sm->dev); free_anon_bdev(sm->dev); kfree(sm); } @@ -1295,6 +1316,7 @@ void ceph_trim_snapid_map(struct ceph_mds_client *mdsc) void ceph_cleanup_snapid_map(struct ceph_mds_client *mdsc) { + struct ceph_client *cl = mdsc->fsc->client; struct ceph_snapid_map *sm; struct rb_node *p; LIST_HEAD(to_free); @@ -1313,8 +1335,8 @@ void ceph_cleanup_snapid_map(struct ceph_mds_client *mdsc) list_del(&sm->lru); free_anon_bdev(sm->dev); if (WARN_ON_ONCE(atomic_read(&sm->ref))) { - pr_err("snapid map %llx -> %x still in use\n", - sm->snap, sm->dev); + pr_err_client(cl, "snapid map %llx -> %x still in use\n", + sm->snap, sm->dev); } kfree(sm); } diff --git a/fs/ceph/super.c b/fs/ceph/super.c index a5f52013314d..5ec102f6b1ac 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -20,6 +20,7 @@ #include "super.h" #include "mds_client.h" #include "cache.h" +#include "crypto.h" #include <linux/ceph/ceph_features.h> #include <linux/ceph/decode.h> @@ -43,27 +44,29 @@ static LIST_HEAD(ceph_fsc_list); */ static void ceph_put_super(struct super_block *s) { - struct ceph_fs_client *fsc = ceph_sb_to_client(s); + struct ceph_fs_client *fsc = ceph_sb_to_fs_client(s); - dout("put_super\n"); + doutc(fsc->client, "begin\n"); + ceph_fscrypt_free_dummy_policy(fsc); ceph_mdsc_close_sessions(fsc->mdsc); + doutc(fsc->client, "done\n"); } static int ceph_statfs(struct dentry *dentry, struct kstatfs *buf) { - struct ceph_fs_client *fsc = ceph_inode_to_client(d_inode(dentry)); + struct ceph_fs_client *fsc = ceph_inode_to_fs_client(d_inode(dentry)); struct ceph_mon_client *monc = &fsc->client->monc; struct ceph_statfs st; int i, err; u64 data_pool; + doutc(fsc->client, "begin\n"); if (fsc->mdsc->mdsmap->m_num_data_pg_pools == 1) { data_pool = fsc->mdsc->mdsmap->m_data_pg_pools[0]; } else { data_pool = CEPH_NOPOOL; } - dout("statfs\n"); err = ceph_monc_do_statfs(monc, data_pool, &st); if (err < 0) return err; @@ -111,24 +114,26 @@ static int ceph_statfs(struct dentry *dentry, struct kstatfs *buf) /* fold the fs_cluster_id into the upper bits */ buf->f_fsid.val[1] = monc->fs_cluster_id; + doutc(fsc->client, "done\n"); return 0; } static int ceph_sync_fs(struct super_block *sb, int wait) { - struct ceph_fs_client *fsc = ceph_sb_to_client(sb); + struct ceph_fs_client *fsc = ceph_sb_to_fs_client(sb); + struct ceph_client *cl = fsc->client; if (!wait) { - dout("sync_fs (non-blocking)\n"); + doutc(cl, "(non-blocking)\n"); ceph_flush_dirty_caps(fsc->mdsc); - dout("sync_fs (non-blocking) done\n"); + doutc(cl, "(non-blocking) done\n"); return 0; } - dout("sync_fs (blocking)\n"); + doutc(cl, "(blocking)\n"); ceph_osdc_sync(&fsc->client->osdc); ceph_mdsc_sync(fsc->mdsc); - dout("sync_fs (blocking) done\n"); + doutc(cl, "(blocking) done\n"); return 0; } @@ -151,6 +156,7 @@ enum { Opt_recover_session, Opt_source, Opt_mon_addr, + Opt_test_dummy_encryption, /* string args above */ Opt_dirstat, Opt_rbytes, @@ -165,6 +171,7 @@ enum { Opt_copyfrom, Opt_wsync, Opt_pagecache, + Opt_sparseread, }; enum ceph_recover_session_mode { @@ -192,6 +199,7 @@ static const struct fs_parameter_spec ceph_mount_parameters[] = { fsparam_string ("fsc", Opt_fscache), // fsc=... fsparam_flag_no ("ino32", Opt_ino32), fsparam_string ("mds_namespace", Opt_mds_namespace), + fsparam_string ("mon_addr", Opt_mon_addr), fsparam_flag_no ("poolperm", Opt_poolperm), fsparam_flag_no ("quotadf", Opt_quotadf), fsparam_u32 ("rasize", Opt_rasize), @@ -203,10 +211,12 @@ static const struct fs_parameter_spec ceph_mount_parameters[] = { fsparam_u32 ("rsize", Opt_rsize), fsparam_string ("snapdirname", Opt_snapdirname), fsparam_string ("source", Opt_source), - fsparam_string ("mon_addr", Opt_mon_addr), + fsparam_flag ("test_dummy_encryption", Opt_test_dummy_encryption), + fsparam_string ("test_dummy_encryption", Opt_test_dummy_encryption), fsparam_u32 ("wsize", Opt_wsize), fsparam_flag_no ("wsync", Opt_wsync), fsparam_flag_no ("pagecache", Opt_pagecache), + fsparam_flag_no ("sparseread", Opt_sparseread), {} }; @@ -334,7 +344,7 @@ static int ceph_parse_source(struct fs_parameter *param, struct fs_context *fc) char *dev_name = param->string, *dev_name_end; int ret; - dout("%s '%s'\n", __func__, dev_name); + dout("'%s'\n", dev_name); if (!dev_name || !*dev_name) return invalfc(fc, "Empty source"); @@ -406,7 +416,7 @@ static int ceph_parse_mount_param(struct fs_context *fc, return ret; token = fs_parse(fc, ceph_mount_parameters, param, &result); - dout("%s fs_parse '%s' token %d\n", __func__, param->key, token); + dout("%s: fs_parse '%s' token %d\n",__func__, param->key, token); if (token < 0) return token; @@ -576,6 +586,29 @@ static int ceph_parse_mount_param(struct fs_context *fc, else fsopt->flags &= ~CEPH_MOUNT_OPT_NOPAGECACHE; break; + case Opt_sparseread: + if (result.negated) + fsopt->flags &= ~CEPH_MOUNT_OPT_SPARSEREAD; + else + fsopt->flags |= CEPH_MOUNT_OPT_SPARSEREAD; + break; + case Opt_test_dummy_encryption: +#ifdef CONFIG_FS_ENCRYPTION + fscrypt_free_dummy_policy(&fsopt->dummy_enc_policy); + ret = fscrypt_parse_test_dummy_encryption(param, + &fsopt->dummy_enc_policy); + if (ret == -EINVAL) { + warnfc(fc, "Value of option \"%s\" is unrecognized", + param->key); + } else if (ret == -EEXIST) { + warnfc(fc, "Conflicting test_dummy_encryption options"); + ret = -EINVAL; + } +#else + warnfc(fc, + "FS encryption not supported: test_dummy_encryption mount option ignored"); +#endif + break; default: BUG(); } @@ -596,6 +629,7 @@ static void destroy_mount_options(struct ceph_mount_options *args) kfree(args->server_path); kfree(args->fscache_uniq); kfree(args->mon_addr); + fscrypt_free_dummy_policy(&args->dummy_enc_policy); kfree(args); } @@ -653,7 +687,7 @@ static int compare_mount_options(struct ceph_mount_options *new_fsopt, */ static int ceph_show_options(struct seq_file *m, struct dentry *root) { - struct ceph_fs_client *fsc = ceph_sb_to_client(root->d_sb); + struct ceph_fs_client *fsc = ceph_sb_to_fs_client(root->d_sb); struct ceph_mount_options *fsopt = fsc->mount_options; size_t pos; int ret; @@ -710,9 +744,12 @@ static int ceph_show_options(struct seq_file *m, struct dentry *root) if (!(fsopt->flags & CEPH_MOUNT_OPT_ASYNC_DIROPS)) seq_puts(m, ",wsync"); - if (fsopt->flags & CEPH_MOUNT_OPT_NOPAGECACHE) seq_puts(m, ",nopagecache"); + if (fsopt->flags & CEPH_MOUNT_OPT_SPARSEREAD) + seq_puts(m, ",sparseread"); + + fscrypt_show_test_dummy_encryption(m, ',', root->d_sb); if (fsopt->wsize != CEPH_MAX_WRITE_SIZE) seq_printf(m, ",wsize=%u", fsopt->wsize); @@ -847,7 +884,7 @@ static void flush_fs_workqueues(struct ceph_fs_client *fsc) static void destroy_fs_client(struct ceph_fs_client *fsc) { - dout("destroy_fs_client %p\n", fsc); + doutc(fsc->client, "%p\n", fsc); spin_lock(&ceph_fsc_lock); list_del(&fsc->metric_wakeup); @@ -862,7 +899,7 @@ static void destroy_fs_client(struct ceph_fs_client *fsc) ceph_destroy_client(fsc->client); kfree(fsc); - dout("destroy_fs_client %p done\n", fsc); + dout("%s: %p done\n", __func__, fsc); } /* @@ -981,9 +1018,9 @@ static void __ceph_umount_begin(struct ceph_fs_client *fsc) */ void ceph_umount_begin(struct super_block *sb) { - struct ceph_fs_client *fsc = ceph_sb_to_client(sb); + struct ceph_fs_client *fsc = ceph_sb_to_fs_client(sb); - dout("ceph_umount_begin - starting forced umount\n"); + doutc(fsc->client, "starting forced umount\n"); if (!fsc) return; fsc->mount_state = CEPH_MOUNT_SHUTDOWN; @@ -1011,13 +1048,14 @@ static struct dentry *open_root_dentry(struct ceph_fs_client *fsc, const char *path, unsigned long started) { + struct ceph_client *cl = fsc->client; struct ceph_mds_client *mdsc = fsc->mdsc; struct ceph_mds_request *req = NULL; int err; struct dentry *root; /* open dir */ - dout("open_root_inode opening '%s'\n", path); + doutc(cl, "opening '%s'\n", path); req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_GETATTR, USE_ANY_MDS); if (IS_ERR(req)) return ERR_CAST(req); @@ -1037,13 +1075,13 @@ static struct dentry *open_root_dentry(struct ceph_fs_client *fsc, if (err == 0) { struct inode *inode = req->r_target_inode; req->r_target_inode = NULL; - dout("open_root_inode success\n"); + doutc(cl, "success\n"); root = d_make_root(inode); if (!root) { root = ERR_PTR(-ENOMEM); goto out; } - dout("open_root_inode success, root dentry is %p\n", root); + doutc(cl, "success, root dentry is %p\n", root); } else { root = ERR_PTR(err); } @@ -1052,17 +1090,62 @@ out: return root; } +#ifdef CONFIG_FS_ENCRYPTION +static int ceph_apply_test_dummy_encryption(struct super_block *sb, + struct fs_context *fc, + struct ceph_mount_options *fsopt) +{ + struct ceph_fs_client *fsc = sb->s_fs_info; + + if (!fscrypt_is_dummy_policy_set(&fsopt->dummy_enc_policy)) + return 0; + + /* No changing encryption context on remount. */ + if (fc->purpose == FS_CONTEXT_FOR_RECONFIGURE && + !fscrypt_is_dummy_policy_set(&fsc->fsc_dummy_enc_policy)) { + if (fscrypt_dummy_policies_equal(&fsopt->dummy_enc_policy, + &fsc->fsc_dummy_enc_policy)) + return 0; + errorfc(fc, "Can't set test_dummy_encryption on remount"); + return -EINVAL; + } + + /* Also make sure fsopt doesn't contain a conflicting value. */ + if (fscrypt_is_dummy_policy_set(&fsc->fsc_dummy_enc_policy)) { + if (fscrypt_dummy_policies_equal(&fsopt->dummy_enc_policy, + &fsc->fsc_dummy_enc_policy)) + return 0; + errorfc(fc, "Conflicting test_dummy_encryption options"); + return -EINVAL; + } + + fsc->fsc_dummy_enc_policy = fsopt->dummy_enc_policy; + memset(&fsopt->dummy_enc_policy, 0, sizeof(fsopt->dummy_enc_policy)); + + warnfc(fc, "test_dummy_encryption mode enabled"); + return 0; +} +#else +static int ceph_apply_test_dummy_encryption(struct super_block *sb, + struct fs_context *fc, + struct ceph_mount_options *fsopt) +{ + return 0; +} +#endif + /* * mount: join the ceph cluster, and open root directory. */ static struct dentry *ceph_real_mount(struct ceph_fs_client *fsc, struct fs_context *fc) { + struct ceph_client *cl = fsc->client; int err; unsigned long started = jiffies; /* note the start time */ struct dentry *root; - dout("mount start %p\n", fsc); + doutc(cl, "mount start %p\n", fsc); mutex_lock(&fsc->client->mount_mutex); if (!fsc->sb->s_root) { @@ -1080,7 +1163,12 @@ static struct dentry *ceph_real_mount(struct ceph_fs_client *fsc, goto out; } - dout("mount opening path '%s'\n", path); + err = ceph_apply_test_dummy_encryption(fsc->sb, fc, + fsc->mount_options); + if (err) + goto out; + + doutc(cl, "mount opening path '%s'\n", path); ceph_fs_debugfs_init(fsc); @@ -1095,21 +1183,23 @@ static struct dentry *ceph_real_mount(struct ceph_fs_client *fsc, } fsc->mount_state = CEPH_MOUNT_MOUNTED; - dout("mount success\n"); + doutc(cl, "mount success\n"); mutex_unlock(&fsc->client->mount_mutex); return root; out: mutex_unlock(&fsc->client->mount_mutex); + ceph_fscrypt_free_dummy_policy(fsc); return ERR_PTR(err); } static int ceph_set_super(struct super_block *s, struct fs_context *fc) { struct ceph_fs_client *fsc = s->s_fs_info; + struct ceph_client *cl = fsc->client; int ret; - dout("set_super %p\n", s); + doutc(cl, "%p\n", s); s->s_maxbytes = MAX_LFS_FILESIZE; @@ -1126,6 +1216,8 @@ static int ceph_set_super(struct super_block *s, struct fs_context *fc) s->s_time_max = U32_MAX; s->s_flags |= SB_NODIRATIME | SB_NOATIME; + ceph_fscrypt_set_ops(s); + ret = set_anon_super_fc(s, fc); if (ret != 0) fsc->sb = NULL; @@ -1140,31 +1232,32 @@ static int ceph_compare_super(struct super_block *sb, struct fs_context *fc) struct ceph_fs_client *new = fc->s_fs_info; struct ceph_mount_options *fsopt = new->mount_options; struct ceph_options *opt = new->client->options; - struct ceph_fs_client *fsc = ceph_sb_to_client(sb); + struct ceph_fs_client *fsc = ceph_sb_to_fs_client(sb); + struct ceph_client *cl = fsc->client; - dout("ceph_compare_super %p\n", sb); + doutc(cl, "%p\n", sb); if (compare_mount_options(fsopt, opt, fsc)) { - dout("monitor(s)/mount options don't match\n"); + doutc(cl, "monitor(s)/mount options don't match\n"); return 0; } if ((opt->flags & CEPH_OPT_FSID) && ceph_fsid_compare(&opt->fsid, &fsc->client->fsid)) { - dout("fsid doesn't match\n"); + doutc(cl, "fsid doesn't match\n"); return 0; } if (fc->sb_flags != (sb->s_flags & ~SB_BORN)) { - dout("flags differ\n"); + doutc(cl, "flags differ\n"); return 0; } if (fsc->blocklisted && !ceph_test_mount_opt(fsc, CLEANRECOVER)) { - dout("client is blocklisted (and CLEANRECOVER is not set)\n"); + doutc(cl, "client is blocklisted (and CLEANRECOVER is not set)\n"); return 0; } if (fsc->mount_state == CEPH_MOUNT_SHUTDOWN) { - dout("client has been forcibly unmounted\n"); + doutc(cl, "client has been forcibly unmounted\n"); return 0; } @@ -1236,9 +1329,9 @@ static int ceph_get_tree(struct fs_context *fc) goto out; } - if (ceph_sb_to_client(sb) != fsc) { + if (ceph_sb_to_fs_client(sb) != fsc) { destroy_fs_client(fsc); - fsc = ceph_sb_to_client(sb); + fsc = ceph_sb_to_fs_client(sb); dout("get_sb got existing client %p\n", fsc); } else { dout("get_sb using new client %p\n", fsc); @@ -1252,8 +1345,9 @@ static int ceph_get_tree(struct fs_context *fc) err = PTR_ERR(res); goto out_splat; } - dout("root %p inode %p ino %llx.%llx\n", res, - d_inode(res), ceph_vinop(d_inode(res))); + + doutc(fsc->client, "root %p inode %p ino %llx.%llx\n", res, + d_inode(res), ceph_vinop(d_inode(res))); fc->root = fsc->sb->s_root; return 0; @@ -1287,23 +1381,35 @@ static void ceph_free_fc(struct fs_context *fc) static int ceph_reconfigure_fc(struct fs_context *fc) { + int err; struct ceph_parse_opts_ctx *pctx = fc->fs_private; struct ceph_mount_options *fsopt = pctx->opts; - struct ceph_fs_client *fsc = ceph_sb_to_client(fc->root->d_sb); + struct super_block *sb = fc->root->d_sb; + struct ceph_fs_client *fsc = ceph_sb_to_fs_client(sb); + + err = ceph_apply_test_dummy_encryption(sb, fc, fsopt); + if (err) + return err; if (fsopt->flags & CEPH_MOUNT_OPT_ASYNC_DIROPS) ceph_set_mount_opt(fsc, ASYNC_DIROPS); else ceph_clear_mount_opt(fsc, ASYNC_DIROPS); + if (fsopt->flags & CEPH_MOUNT_OPT_SPARSEREAD) + ceph_set_mount_opt(fsc, SPARSEREAD); + else + ceph_clear_mount_opt(fsc, SPARSEREAD); + if (strcmp_null(fsc->mount_options->mon_addr, fsopt->mon_addr)) { kfree(fsc->mount_options->mon_addr); fsc->mount_options->mon_addr = fsopt->mon_addr; fsopt->mon_addr = NULL; - pr_notice("ceph: monitor addresses recorded, but not used for reconnection"); + pr_notice_client(fsc->client, + "monitor addresses recorded, but not used for reconnection"); } - sync_filesystem(fc->root->d_sb); + sync_filesystem(sb); return 0; } @@ -1365,25 +1471,102 @@ nomem: return -ENOMEM; } +/* + * Return true if it successfully increases the blocker counter, + * or false if the mdsc is in stopping and flushed state. + */ +static bool __inc_stopping_blocker(struct ceph_mds_client *mdsc) +{ + spin_lock(&mdsc->stopping_lock); + if (mdsc->stopping >= CEPH_MDSC_STOPPING_FLUSHING) { + spin_unlock(&mdsc->stopping_lock); + return false; + } + atomic_inc(&mdsc->stopping_blockers); + spin_unlock(&mdsc->stopping_lock); + return true; +} + +static void __dec_stopping_blocker(struct ceph_mds_client *mdsc) +{ + spin_lock(&mdsc->stopping_lock); + if (!atomic_dec_return(&mdsc->stopping_blockers) && + mdsc->stopping >= CEPH_MDSC_STOPPING_FLUSHING) + complete_all(&mdsc->stopping_waiter); + spin_unlock(&mdsc->stopping_lock); +} + +/* For metadata IO requests */ +bool ceph_inc_mds_stopping_blocker(struct ceph_mds_client *mdsc, + struct ceph_mds_session *session) +{ + mutex_lock(&session->s_mutex); + inc_session_sequence(session); + mutex_unlock(&session->s_mutex); + + return __inc_stopping_blocker(mdsc); +} + +void ceph_dec_mds_stopping_blocker(struct ceph_mds_client *mdsc) +{ + __dec_stopping_blocker(mdsc); +} + +/* For data IO requests */ +bool ceph_inc_osd_stopping_blocker(struct ceph_mds_client *mdsc) +{ + return __inc_stopping_blocker(mdsc); +} + +void ceph_dec_osd_stopping_blocker(struct ceph_mds_client *mdsc) +{ + __dec_stopping_blocker(mdsc); +} + static void ceph_kill_sb(struct super_block *s) { - struct ceph_fs_client *fsc = ceph_sb_to_client(s); + struct ceph_fs_client *fsc = ceph_sb_to_fs_client(s); + struct ceph_client *cl = fsc->client; + struct ceph_mds_client *mdsc = fsc->mdsc; + bool wait; - dout("kill_sb %p\n", s); + doutc(cl, "%p\n", s); - ceph_mdsc_pre_umount(fsc->mdsc); + ceph_mdsc_pre_umount(mdsc); flush_fs_workqueues(fsc); /* * Though the kill_anon_super() will finally trigger the - * sync_filesystem() anyway, we still need to do it here - * and then bump the stage of shutdown to stop the work - * queue as earlier as possible. + * sync_filesystem() anyway, we still need to do it here and + * then bump the stage of shutdown. This will allow us to + * drop any further message, which will increase the inodes' + * i_count reference counters but makes no sense any more, + * from MDSs. + * + * Without this when evicting the inodes it may fail in the + * kill_anon_super(), which will trigger a warning when + * destroying the fscrypt keyring and then possibly trigger + * a further crash in ceph module when the iput() tries to + * evict the inodes later. */ sync_filesystem(s); - fsc->mdsc->stopping = CEPH_MDSC_STOPPING_FLUSHED; + spin_lock(&mdsc->stopping_lock); + mdsc->stopping = CEPH_MDSC_STOPPING_FLUSHING; + wait = !!atomic_read(&mdsc->stopping_blockers); + spin_unlock(&mdsc->stopping_lock); + + if (wait && atomic_read(&mdsc->stopping_blockers)) { + long timeleft = wait_for_completion_killable_timeout( + &mdsc->stopping_waiter, + fsc->client->options->mount_timeout); + if (!timeleft) /* timed out */ + pr_warn_client(cl, "umount timed out, %ld\n", timeleft); + else if (timeleft < 0) /* killed */ + pr_warn_client(cl, "umount was killed, %ld\n", timeleft); + } + mdsc->stopping = CEPH_MDSC_STOPPING_FLUSHED; kill_anon_super(s); fsc->client->extra_mon_dispatch = NULL; @@ -1399,13 +1582,13 @@ static struct file_system_type ceph_fs_type = { .name = "ceph", .init_fs_context = ceph_init_fs_context, .kill_sb = ceph_kill_sb, - .fs_flags = FS_RENAME_DOES_D_MOVE, + .fs_flags = FS_RENAME_DOES_D_MOVE | FS_ALLOW_IDMAP, }; MODULE_ALIAS_FS("ceph"); int ceph_force_reconnect(struct super_block *sb) { - struct ceph_fs_client *fsc = ceph_sb_to_client(sb); + struct ceph_fs_client *fsc = ceph_sb_to_fs_client(sb); int err = 0; fsc->mount_state = CEPH_MOUNT_RECOVER; @@ -1498,6 +1681,11 @@ static const struct kernel_param_ops param_ops_mount_syntax = { module_param_cb(mount_syntax_v1, ¶m_ops_mount_syntax, &mount_support, 0444); module_param_cb(mount_syntax_v2, ¶m_ops_mount_syntax, &mount_support, 0444); +bool enable_unsafe_idmap = false; +module_param(enable_unsafe_idmap, bool, 0644); +MODULE_PARM_DESC(enable_unsafe_idmap, + "Allow to use idmapped mounts with MDS without CEPHFS_FEATURE_HAS_OWNER_UIDGID"); + module_init(init_ceph); module_exit(exit_ceph); diff --git a/fs/ceph/super.h b/fs/ceph/super.h index 3bfddf34d488..fe0f64a0acb2 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -22,6 +22,7 @@ #include <linux/hashtable.h> #include <linux/ceph/libceph.h> +#include "crypto.h" /* large granularity for statfs utilization stats to facilitate * large volume sizes on 32-bit machines. */ @@ -42,6 +43,7 @@ #define CEPH_MOUNT_OPT_NOCOPYFROM (1<<14) /* don't use RADOS 'copy-from' op */ #define CEPH_MOUNT_OPT_ASYNC_DIROPS (1<<15) /* allow async directory ops */ #define CEPH_MOUNT_OPT_NOPAGECACHE (1<<16) /* bypass pagecache altogether */ +#define CEPH_MOUNT_OPT_SPARSEREAD (1<<17) /* always do sparse reads */ #define CEPH_MOUNT_OPT_DEFAULT \ (CEPH_MOUNT_OPT_DCACHE | \ @@ -98,6 +100,7 @@ struct ceph_mount_options { char *server_path; /* default NULL (means "/") */ char *fscache_uniq; /* default NULL */ char *mon_addr; + struct fscrypt_dummy_policy dummy_enc_policy; }; /* mount state */ @@ -154,9 +157,11 @@ struct ceph_fs_client { #ifdef CONFIG_CEPH_FSCACHE struct fscache_volume *fscache; #endif +#ifdef CONFIG_FS_ENCRYPTION + struct fscrypt_dummy_policy fsc_dummy_enc_policy; +#endif }; - /* * File i/o capability. This tracks shared state with the metadata * server that allows us to cache or writeback attributes or to read @@ -419,6 +424,11 @@ struct ceph_inode_info { u32 i_truncate_seq; /* last truncate to smaller size */ u64 i_truncate_size; /* and the size we last truncated down to */ int i_truncate_pending; /* still need to call vmtruncate */ + /* + * For none fscrypt case it equals to i_truncate_size or it will + * equals to fscrypt_file_size + */ + u64 i_truncate_pagecache_size; u64 i_max_size; /* max file size authorized by mds */ u64 i_reported_size; /* (max_)size reported to or requested of mds */ @@ -449,6 +459,13 @@ struct ceph_inode_info { struct work_struct i_work; unsigned long i_work_mask; + +#ifdef CONFIG_FS_ENCRYPTION + u32 fscrypt_auth_len; + u32 fscrypt_file_len; + u8 *fscrypt_auth; + u8 *fscrypt_file; +#endif }; struct ceph_netfs_request_data { @@ -471,13 +488,13 @@ ceph_inode(const struct inode *inode) } static inline struct ceph_fs_client * -ceph_inode_to_client(const struct inode *inode) +ceph_inode_to_fs_client(const struct inode *inode) { return (struct ceph_fs_client *)inode->i_sb->s_fs_info; } static inline struct ceph_fs_client * -ceph_sb_to_client(const struct super_block *sb) +ceph_sb_to_fs_client(const struct super_block *sb) { return (struct ceph_fs_client *)sb->s_fs_info; } @@ -485,7 +502,13 @@ ceph_sb_to_client(const struct super_block *sb) static inline struct ceph_mds_client * ceph_sb_to_mdsc(const struct super_block *sb) { - return (struct ceph_mds_client *)ceph_sb_to_client(sb)->mdsc; + return (struct ceph_mds_client *)ceph_sb_to_fs_client(sb)->mdsc; +} + +static inline struct ceph_client * +ceph_inode_to_client(const struct inode *inode) +{ + return (struct ceph_client *)ceph_inode_to_fs_client(inode)->client; } static inline struct ceph_vino @@ -541,7 +564,7 @@ static inline u64 ceph_snap(struct inode *inode) */ static inline u64 ceph_present_ino(struct super_block *sb, u64 ino) { - if (unlikely(ceph_test_mount_opt(ceph_sb_to_client(sb), INO32))) + if (unlikely(ceph_test_mount_opt(ceph_sb_to_fs_client(sb), INO32))) return ceph_ino_to_ino32(ino); return ino; } @@ -998,6 +1021,7 @@ static inline bool __ceph_have_pending_cap_snap(struct ceph_inode_info *ci) /* inode.c */ struct ceph_mds_reply_info_in; struct ceph_mds_reply_dirfrag; +struct ceph_acl_sec_ctx; extern const struct inode_operations ceph_file_iops; @@ -1005,8 +1029,14 @@ extern struct inode *ceph_alloc_inode(struct super_block *sb); extern void ceph_evict_inode(struct inode *inode); extern void ceph_free_inode(struct inode *inode); +struct inode *ceph_new_inode(struct inode *dir, struct dentry *dentry, + umode_t *mode, struct ceph_acl_sec_ctx *as_ctx); +void ceph_as_ctx_to_req(struct ceph_mds_request *req, + struct ceph_acl_sec_ctx *as_ctx); + extern struct inode *ceph_get_inode(struct super_block *sb, - struct ceph_vino vino); + struct ceph_vino vino, + struct inode *newino); extern struct inode *ceph_get_snapdir(struct inode *parent); extern int ceph_fill_file_size(struct inode *inode, int issued, u32 truncate_seq, u64 truncate_size, u64 size); @@ -1065,7 +1095,13 @@ static inline int ceph_do_getattr(struct inode *inode, int mask, bool force) } extern int ceph_permission(struct mnt_idmap *idmap, struct inode *inode, int mask); -extern int __ceph_setattr(struct inode *inode, struct iattr *attr); + +struct ceph_iattr { + struct ceph_fscrypt_auth *fscrypt_auth; +}; + +extern int __ceph_setattr(struct mnt_idmap *idmap, struct inode *inode, + struct iattr *attr, struct ceph_iattr *cia); extern int ceph_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *attr); extern int ceph_getattr(struct mnt_idmap *idmap, @@ -1076,7 +1112,7 @@ void ceph_inode_shutdown(struct inode *inode); static inline bool ceph_inode_is_shutdown(struct inode *inode) { unsigned long flags = READ_ONCE(ceph_inode(inode)->i_ceph_flags); - struct ceph_fs_client *fsc = ceph_inode_to_client(inode); + struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode); int state = READ_ONCE(fsc->mount_state); return (flags & CEPH_I_SHUTDOWN) || state >= CEPH_MOUNT_SHUTDOWN; @@ -1089,7 +1125,7 @@ ssize_t __ceph_getxattr(struct inode *, const char *, void *, size_t); extern ssize_t ceph_listxattr(struct dentry *, char *, size_t); extern struct ceph_buffer *__ceph_build_xattrs_blob(struct ceph_inode_info *ci); extern void __ceph_destroy_xattrs(struct ceph_inode_info *ci); -extern const struct xattr_handler *ceph_xattr_handlers[]; +extern const struct xattr_handler * const ceph_xattr_handlers[]; struct ceph_acl_sec_ctx { #ifdef CONFIG_CEPH_FS_POSIX_ACL @@ -1100,6 +1136,9 @@ struct ceph_acl_sec_ctx { void *sec_ctx; u32 sec_ctxlen; #endif +#ifdef CONFIG_FS_ENCRYPTION + struct ceph_fscrypt_auth *fscrypt_auth; +#endif struct ceph_pagelist *pagelist; }; @@ -1190,7 +1229,8 @@ extern void ceph_add_cap(struct inode *inode, unsigned cap, unsigned seq, u64 realmino, int flags, struct ceph_cap **new_cap); extern void __ceph_remove_cap(struct ceph_cap *cap, bool queue_release); -extern void ceph_remove_cap(struct ceph_cap *cap, bool queue_release); +extern void ceph_remove_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap, + bool queue_release); extern void __ceph_remove_caps(struct ceph_inode_info *ci); extern void ceph_put_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap); @@ -1237,6 +1277,8 @@ extern int ceph_encode_dentry_release(void **p, struct dentry *dn, struct inode *dir, int mds, int drop, int unless); +extern int __ceph_get_caps(struct inode *inode, struct ceph_file_info *fi, + int need, int want, loff_t endoff, int *got); extern int ceph_get_caps(struct file *filp, int need, int want, loff_t endoff, int *got); extern int ceph_try_get_caps(struct inode *inode, @@ -1272,6 +1314,9 @@ extern int ceph_renew_caps(struct inode *inode, int fmode); extern int ceph_open(struct inode *inode, struct file *file); extern int ceph_atomic_open(struct inode *dir, struct dentry *dentry, struct file *file, unsigned flags, umode_t mode); +extern ssize_t __ceph_sync_read(struct inode *inode, loff_t *ki_pos, + struct iov_iter *to, int *retry_op, + u64 *last_objver); extern int ceph_release(struct inode *inode, struct file *filp); extern void ceph_fill_inline_data(struct inode *inode, struct page *locked_page, char *data, size_t len); @@ -1375,4 +1420,9 @@ extern bool ceph_quota_update_statfs(struct ceph_fs_client *fsc, struct kstatfs *buf); extern void ceph_cleanup_quotarealms_inodes(struct ceph_mds_client *mdsc); +bool ceph_inc_mds_stopping_blocker(struct ceph_mds_client *mdsc, + struct ceph_mds_session *session); +void ceph_dec_mds_stopping_blocker(struct ceph_mds_client *mdsc); +bool ceph_inc_osd_stopping_blocker(struct ceph_mds_client *mdsc); +void ceph_dec_osd_stopping_blocker(struct ceph_mds_client *mdsc); #endif /* _FS_CEPH_SUPER_H */ diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c index 806183959c47..e066a556eccb 100644 --- a/fs/ceph/xattr.c +++ b/fs/ceph/xattr.c @@ -57,7 +57,8 @@ static bool ceph_vxattrcb_layout_exists(struct ceph_inode_info *ci) static ssize_t ceph_vxattrcb_layout(struct ceph_inode_info *ci, char *val, size_t size) { - struct ceph_fs_client *fsc = ceph_sb_to_client(ci->netfs.inode.i_sb); + struct ceph_fs_client *fsc = ceph_sb_to_fs_client(ci->netfs.inode.i_sb); + struct ceph_client *cl = fsc->client; struct ceph_osd_client *osdc = &fsc->client->osdc; struct ceph_string *pool_ns; s64 pool = ci->i_layout.pool_id; @@ -69,7 +70,7 @@ static ssize_t ceph_vxattrcb_layout(struct ceph_inode_info *ci, char *val, pool_ns = ceph_try_get_string(ci->i_layout.pool_ns); - dout("ceph_vxattrcb_layout %p\n", &ci->netfs.inode); + doutc(cl, "%p\n", &ci->netfs.inode); down_read(&osdc->lock); pool_name = ceph_pg_pool_name_by_id(osdc->osdmap, pool); if (pool_name) { @@ -161,7 +162,7 @@ static ssize_t ceph_vxattrcb_layout_pool(struct ceph_inode_info *ci, char *val, size_t size) { ssize_t ret; - struct ceph_fs_client *fsc = ceph_sb_to_client(ci->netfs.inode.i_sb); + struct ceph_fs_client *fsc = ceph_sb_to_fs_client(ci->netfs.inode.i_sb); struct ceph_osd_client *osdc = &fsc->client->osdc; s64 pool = ci->i_layout.pool_id; const char *pool_name; @@ -313,7 +314,7 @@ static ssize_t ceph_vxattrcb_snap_btime(struct ceph_inode_info *ci, char *val, static ssize_t ceph_vxattrcb_cluster_fsid(struct ceph_inode_info *ci, char *val, size_t size) { - struct ceph_fs_client *fsc = ceph_sb_to_client(ci->netfs.inode.i_sb); + struct ceph_fs_client *fsc = ceph_sb_to_fs_client(ci->netfs.inode.i_sb); return ceph_fmt_xattr(val, size, "%pU", &fsc->client->fsid); } @@ -321,7 +322,7 @@ static ssize_t ceph_vxattrcb_cluster_fsid(struct ceph_inode_info *ci, static ssize_t ceph_vxattrcb_client_id(struct ceph_inode_info *ci, char *val, size_t size) { - struct ceph_fs_client *fsc = ceph_sb_to_client(ci->netfs.inode.i_sb); + struct ceph_fs_client *fsc = ceph_sb_to_fs_client(ci->netfs.inode.i_sb); return ceph_fmt_xattr(val, size, "client%lld", ceph_client_gid(fsc->client)); @@ -352,6 +353,24 @@ static ssize_t ceph_vxattrcb_auth_mds(struct ceph_inode_info *ci, return ret; } +#if IS_ENABLED(CONFIG_FS_ENCRYPTION) +static bool ceph_vxattrcb_fscrypt_auth_exists(struct ceph_inode_info *ci) +{ + return ci->fscrypt_auth_len; +} + +static ssize_t ceph_vxattrcb_fscrypt_auth(struct ceph_inode_info *ci, + char *val, size_t size) +{ + if (size) { + if (size < ci->fscrypt_auth_len) + return -ERANGE; + memcpy(val, ci->fscrypt_auth, ci->fscrypt_auth_len); + } + return ci->fscrypt_auth_len; +} +#endif /* CONFIG_FS_ENCRYPTION */ + #define CEPH_XATTR_NAME(_type, _name) XATTR_CEPH_PREFIX #_type "." #_name #define CEPH_XATTR_NAME2(_type, _name, _name2) \ XATTR_CEPH_PREFIX #_type "." #_name "." #_name2 @@ -500,6 +519,15 @@ static struct ceph_vxattr ceph_common_vxattrs[] = { .exists_cb = NULL, .flags = VXATTR_FLAG_READONLY, }, +#if IS_ENABLED(CONFIG_FS_ENCRYPTION) + { + .name = "ceph.fscrypt.auth", + .name_size = sizeof("ceph.fscrypt.auth"), + .getxattr_cb = ceph_vxattrcb_fscrypt_auth, + .exists_cb = ceph_vxattrcb_fscrypt_auth_exists, + .flags = VXATTR_FLAG_READONLY, + }, +#endif /* CONFIG_FS_ENCRYPTION */ { .name = NULL, 0 } /* Required table terminator */ }; @@ -543,6 +571,8 @@ static int __set_xattr(struct ceph_inode_info *ci, int flags, int update_xattr, struct ceph_inode_xattr **newxattr) { + struct inode *inode = &ci->netfs.inode; + struct ceph_client *cl = ceph_inode_to_client(inode); struct rb_node **p; struct rb_node *parent = NULL; struct ceph_inode_xattr *xattr = NULL; @@ -599,7 +629,7 @@ static int __set_xattr(struct ceph_inode_info *ci, xattr->should_free_name = update_xattr; ci->i_xattrs.count++; - dout("%s count=%d\n", __func__, ci->i_xattrs.count); + doutc(cl, "count=%d\n", ci->i_xattrs.count); } else { kfree(*newxattr); *newxattr = NULL; @@ -627,13 +657,13 @@ static int __set_xattr(struct ceph_inode_info *ci, if (new) { rb_link_node(&xattr->node, parent, p); rb_insert_color(&xattr->node, &ci->i_xattrs.index); - dout("%s p=%p\n", __func__, p); + doutc(cl, "p=%p\n", p); } - dout("%s added %llx.%llx xattr %p %.*s=%.*s%s\n", __func__, - ceph_vinop(&ci->netfs.inode), xattr, name_len, name, - min(val_len, MAX_XATTR_VAL_PRINT_LEN), val, - val_len > MAX_XATTR_VAL_PRINT_LEN ? "..." : ""); + doutc(cl, "added %p %llx.%llx xattr %p %.*s=%.*s%s\n", inode, + ceph_vinop(inode), xattr, name_len, name, min(val_len, + MAX_XATTR_VAL_PRINT_LEN), val, + val_len > MAX_XATTR_VAL_PRINT_LEN ? "..." : ""); return 0; } @@ -641,6 +671,7 @@ static int __set_xattr(struct ceph_inode_info *ci, static struct ceph_inode_xattr *__get_xattr(struct ceph_inode_info *ci, const char *name) { + struct ceph_client *cl = ceph_inode_to_client(&ci->netfs.inode); struct rb_node **p; struct rb_node *parent = NULL; struct ceph_inode_xattr *xattr = NULL; @@ -661,13 +692,13 @@ static struct ceph_inode_xattr *__get_xattr(struct ceph_inode_info *ci, else { int len = min(xattr->val_len, MAX_XATTR_VAL_PRINT_LEN); - dout("%s %s: found %.*s%s\n", __func__, name, len, - xattr->val, xattr->val_len > len ? "..." : ""); + doutc(cl, "%s found %.*s%s\n", name, len, xattr->val, + xattr->val_len > len ? "..." : ""); return xattr; } } - dout("%s %s: not found\n", __func__, name); + doutc(cl, "%s not found\n", name); return NULL; } @@ -708,19 +739,20 @@ static int __remove_xattr(struct ceph_inode_info *ci, static char *__copy_xattr_names(struct ceph_inode_info *ci, char *dest) { + struct ceph_client *cl = ceph_inode_to_client(&ci->netfs.inode); struct rb_node *p; struct ceph_inode_xattr *xattr = NULL; p = rb_first(&ci->i_xattrs.index); - dout("__copy_xattr_names count=%d\n", ci->i_xattrs.count); + doutc(cl, "count=%d\n", ci->i_xattrs.count); while (p) { xattr = rb_entry(p, struct ceph_inode_xattr, node); memcpy(dest, xattr->name, xattr->name_len); dest[xattr->name_len] = '\0'; - dout("dest=%s %p (%s) (%d/%d)\n", dest, xattr, xattr->name, - xattr->name_len, ci->i_xattrs.names_size); + doutc(cl, "dest=%s %p (%s) (%d/%d)\n", dest, xattr, xattr->name, + xattr->name_len, ci->i_xattrs.names_size); dest += xattr->name_len + 1; p = rb_next(p); @@ -731,19 +763,19 @@ static char *__copy_xattr_names(struct ceph_inode_info *ci, void __ceph_destroy_xattrs(struct ceph_inode_info *ci) { + struct ceph_client *cl = ceph_inode_to_client(&ci->netfs.inode); struct rb_node *p, *tmp; struct ceph_inode_xattr *xattr = NULL; p = rb_first(&ci->i_xattrs.index); - dout("__ceph_destroy_xattrs p=%p\n", p); + doutc(cl, "p=%p\n", p); while (p) { xattr = rb_entry(p, struct ceph_inode_xattr, node); tmp = p; p = rb_next(tmp); - dout("__ceph_destroy_xattrs next p=%p (%.*s)\n", p, - xattr->name_len, xattr->name); + doutc(cl, "next p=%p (%.*s)\n", p, xattr->name_len, xattr->name); rb_erase(tmp, &ci->i_xattrs.index); __free_xattr(xattr); @@ -760,6 +792,7 @@ static int __build_xattrs(struct inode *inode) __releases(ci->i_ceph_lock) __acquires(ci->i_ceph_lock) { + struct ceph_client *cl = ceph_inode_to_client(inode); u32 namelen; u32 numattr = 0; void *p, *end; @@ -771,8 +804,8 @@ static int __build_xattrs(struct inode *inode) int err = 0; int i; - dout("__build_xattrs() len=%d\n", - ci->i_xattrs.blob ? (int)ci->i_xattrs.blob->vec.iov_len : 0); + doutc(cl, "len=%d\n", + ci->i_xattrs.blob ? (int)ci->i_xattrs.blob->vec.iov_len : 0); if (ci->i_xattrs.index_version >= ci->i_xattrs.version) return 0; /* already built */ @@ -847,6 +880,8 @@ bad: static int __get_required_blob_size(struct ceph_inode_info *ci, int name_size, int val_size) { + struct ceph_client *cl = ceph_inode_to_client(&ci->netfs.inode); + /* * 4 bytes for the length, and additional 4 bytes per each xattr name, * 4 bytes per each value @@ -854,9 +889,8 @@ static int __get_required_blob_size(struct ceph_inode_info *ci, int name_size, int size = 4 + ci->i_xattrs.count*(4 + 4) + ci->i_xattrs.names_size + ci->i_xattrs.vals_size; - dout("__get_required_blob_size c=%d names.size=%d vals.size=%d\n", - ci->i_xattrs.count, ci->i_xattrs.names_size, - ci->i_xattrs.vals_size); + doutc(cl, "c=%d names.size=%d vals.size=%d\n", ci->i_xattrs.count, + ci->i_xattrs.names_size, ci->i_xattrs.vals_size); if (name_size) size += 4 + 4 + name_size + val_size; @@ -872,12 +906,14 @@ static int __get_required_blob_size(struct ceph_inode_info *ci, int name_size, */ struct ceph_buffer *__ceph_build_xattrs_blob(struct ceph_inode_info *ci) { + struct inode *inode = &ci->netfs.inode; + struct ceph_client *cl = ceph_inode_to_client(inode); struct rb_node *p; struct ceph_inode_xattr *xattr = NULL; struct ceph_buffer *old_blob = NULL; void *dest; - dout("__build_xattrs_blob %p\n", &ci->netfs.inode); + doutc(cl, "%p %llx.%llx\n", inode, ceph_vinop(inode)); if (ci->i_xattrs.dirty) { int need = __get_required_blob_size(ci, 0, 0); @@ -935,6 +971,7 @@ static inline int __get_request_mask(struct inode *in) { ssize_t __ceph_getxattr(struct inode *inode, const char *name, void *value, size_t size) { + struct ceph_client *cl = ceph_inode_to_client(inode); struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_inode_xattr *xattr; struct ceph_vxattr *vxattr; @@ -973,8 +1010,9 @@ handle_non_vxattrs: req_mask = __get_request_mask(inode); spin_lock(&ci->i_ceph_lock); - dout("getxattr %p name '%s' ver=%lld index_ver=%lld\n", inode, name, - ci->i_xattrs.version, ci->i_xattrs.index_version); + doutc(cl, "%p %llx.%llx name '%s' ver=%lld index_ver=%lld\n", inode, + ceph_vinop(inode), name, ci->i_xattrs.version, + ci->i_xattrs.index_version); if (ci->i_xattrs.version == 0 || !((req_mask & CEPH_CAP_XATTR_SHARED) || @@ -983,8 +1021,9 @@ handle_non_vxattrs: /* security module gets xattr while filling trace */ if (current->journal_info) { - pr_warn_ratelimited("sync getxattr %p " - "during filling trace\n", inode); + pr_warn_ratelimited_client(cl, + "sync %p %llx.%llx during filling trace\n", + inode, ceph_vinop(inode)); return -EBUSY; } @@ -1026,14 +1065,16 @@ out: ssize_t ceph_listxattr(struct dentry *dentry, char *names, size_t size) { struct inode *inode = d_inode(dentry); + struct ceph_client *cl = ceph_inode_to_client(inode); struct ceph_inode_info *ci = ceph_inode(inode); bool len_only = (size == 0); u32 namelen; int err; spin_lock(&ci->i_ceph_lock); - dout("listxattr %p ver=%lld index_ver=%lld\n", inode, - ci->i_xattrs.version, ci->i_xattrs.index_version); + doutc(cl, "%p %llx.%llx ver=%lld index_ver=%lld\n", inode, + ceph_vinop(inode), ci->i_xattrs.version, + ci->i_xattrs.index_version); if (ci->i_xattrs.version == 0 || !__ceph_caps_issued_mask_metric(ci, CEPH_CAP_XATTR_SHARED, 1)) { @@ -1067,7 +1108,8 @@ out: static int ceph_sync_setxattr(struct inode *inode, const char *name, const char *value, size_t size, int flags) { - struct ceph_fs_client *fsc = ceph_sb_to_client(inode->i_sb); + struct ceph_fs_client *fsc = ceph_sb_to_fs_client(inode->i_sb); + struct ceph_client *cl = ceph_inode_to_client(inode); struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_mds_request *req; struct ceph_mds_client *mdsc = fsc->mdsc; @@ -1092,7 +1134,7 @@ static int ceph_sync_setxattr(struct inode *inode, const char *name, flags |= CEPH_XATTR_REMOVE; } - dout("setxattr value size: %zu\n", size); + doutc(cl, "name %s value size %zu\n", name, size); /* do request */ req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS); @@ -1121,10 +1163,10 @@ static int ceph_sync_setxattr(struct inode *inode, const char *name, req->r_num_caps = 1; req->r_inode_drop = CEPH_CAP_XATTR_SHARED; - dout("xattr.ver (before): %lld\n", ci->i_xattrs.version); + doutc(cl, "xattr.ver (before): %lld\n", ci->i_xattrs.version); err = ceph_mdsc_do_request(mdsc, NULL, req); ceph_mdsc_put_request(req); - dout("xattr.ver (after): %lld\n", ci->i_xattrs.version); + doutc(cl, "xattr.ver (after): %lld\n", ci->i_xattrs.version); out: if (pagelist) @@ -1135,9 +1177,10 @@ out: int __ceph_setxattr(struct inode *inode, const char *name, const void *value, size_t size, int flags) { + struct ceph_client *cl = ceph_inode_to_client(inode); struct ceph_vxattr *vxattr; struct ceph_inode_info *ci = ceph_inode(inode); - struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc; + struct ceph_mds_client *mdsc = ceph_sb_to_fs_client(inode->i_sb)->mdsc; struct ceph_cap_flush *prealloc_cf = NULL; struct ceph_buffer *old_blob = NULL; int issued; @@ -1193,9 +1236,9 @@ retry: required_blob_size = __get_required_blob_size(ci, name_len, val_len); if ((ci->i_xattrs.version == 0) || !(issued & CEPH_CAP_XATTR_EXCL) || (required_blob_size > mdsc->mdsmap->m_max_xattr_size)) { - dout("%s do sync setxattr: version: %llu size: %d max: %llu\n", - __func__, ci->i_xattrs.version, required_blob_size, - mdsc->mdsmap->m_max_xattr_size); + doutc(cl, "sync version: %llu size: %d max: %llu\n", + ci->i_xattrs.version, required_blob_size, + mdsc->mdsmap->m_max_xattr_size); goto do_sync; } @@ -1209,8 +1252,8 @@ retry: } } - dout("setxattr %p name '%s' issued %s\n", inode, name, - ceph_cap_string(issued)); + doutc(cl, "%p %llx.%llx name '%s' issued %s\n", inode, + ceph_vinop(inode), name, ceph_cap_string(issued)); __build_xattrs(inode); if (!ci->i_xattrs.prealloc_blob || @@ -1219,7 +1262,8 @@ retry: spin_unlock(&ci->i_ceph_lock); ceph_buffer_put(old_blob); /* Shouldn't be required */ - dout(" pre-allocating new blob size=%d\n", required_blob_size); + doutc(cl, " pre-allocating new blob size=%d\n", + required_blob_size); blob = ceph_buffer_new(required_blob_size, GFP_NOFS); if (!blob) goto do_sync_unlocked; @@ -1238,7 +1282,7 @@ retry: dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_XATTR_EXCL, &prealloc_cf); ci->i_xattrs.dirty = true; - inode->i_ctime = current_time(inode); + inode_set_ctime_current(inode); } spin_unlock(&ci->i_ceph_lock); @@ -1258,8 +1302,9 @@ do_sync_unlocked: /* security module set xattr while filling trace */ if (current->journal_info) { - pr_warn_ratelimited("sync setxattr %p " - "during filling trace\n", inode); + pr_warn_ratelimited_client(cl, + "sync %p %llx.%llx during filling trace\n", + inode, ceph_vinop(inode)); err = -EBUSY; } else { err = ceph_sync_setxattr(inode, name, value, size, flags); @@ -1408,6 +1453,9 @@ void ceph_release_acl_sec_ctx(struct ceph_acl_sec_ctx *as_ctx) #ifdef CONFIG_CEPH_FS_SECURITY_LABEL security_release_secctx(as_ctx->sec_ctx, as_ctx->sec_ctxlen); #endif +#ifdef CONFIG_FS_ENCRYPTION + kfree(as_ctx->fscrypt_auth); +#endif if (as_ctx->pagelist) ceph_pagelist_release(as_ctx->pagelist); } @@ -1416,7 +1464,7 @@ void ceph_release_acl_sec_ctx(struct ceph_acl_sec_ctx *as_ctx) * List of handlers for synthetic system.* attributes. Other * attributes are handled directly. */ -const struct xattr_handler *ceph_xattr_handlers[] = { +const struct xattr_handler * const ceph_xattr_handlers[] = { &ceph_other_xattr_handler, NULL, }; diff --git a/fs/char_dev.c b/fs/char_dev.c index 950b6919fb87..57cc096c498a 100644 --- a/fs/char_dev.c +++ b/fs/char_dev.c @@ -25,7 +25,7 @@ #include "internal.h" -static struct kobj_map *cdev_map; +static struct kobj_map *cdev_map __ro_after_init; static DEFINE_MUTEX(chrdevs_lock); @@ -350,7 +350,7 @@ static struct kobject *cdev_get(struct cdev *p) struct module *owner = p->owner; struct kobject *kobj; - if (owner && !try_module_get(owner)) + if (!try_module_get(owner)) return NULL; kobj = kobject_get_unless_zero(&p->kobj); if (!kobj) diff --git a/fs/coda/coda_linux.c b/fs/coda/coda_linux.c index 903ca8fa4b9b..1d2dac95f86a 100644 --- a/fs/coda/coda_linux.c +++ b/fs/coda/coda_linux.c @@ -123,11 +123,14 @@ void coda_vattr_to_iattr(struct inode *inode, struct coda_vattr *attr) if (attr->va_size != -1) inode->i_blocks = (attr->va_size + 511) >> 9; if (attr->va_atime.tv_sec != -1) - inode->i_atime = coda_to_timespec64(attr->va_atime); + inode_set_atime_to_ts(inode, + coda_to_timespec64(attr->va_atime)); if (attr->va_mtime.tv_sec != -1) - inode->i_mtime = coda_to_timespec64(attr->va_mtime); + inode_set_mtime_to_ts(inode, + coda_to_timespec64(attr->va_mtime)); if (attr->va_ctime.tv_sec != -1) - inode->i_ctime = coda_to_timespec64(attr->va_ctime); + inode_set_ctime_to_ts(inode, + coda_to_timespec64(attr->va_ctime)); } diff --git a/fs/coda/dir.c b/fs/coda/dir.c index 1b960de2bf39..4e552ba7bd43 100644 --- a/fs/coda/dir.c +++ b/fs/coda/dir.c @@ -111,7 +111,7 @@ static inline void coda_dir_update_mtime(struct inode *dir) /* optimistically we can also act as if our nose bleeds. The * granularity of the mtime is coarse anyways so we might actually be * right most of the time. Note: we only do this for directories. */ - dir->i_mtime = dir->i_ctime = current_time(dir); + inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir)); #endif } diff --git a/fs/coda/file.c b/fs/coda/file.c index 12b26bd13564..16acc58311ea 100644 --- a/fs/coda/file.c +++ b/fs/coda/file.c @@ -84,7 +84,7 @@ coda_file_write_iter(struct kiocb *iocb, struct iov_iter *to) ret = vfs_iter_write(cfi->cfi_container, to, &iocb->ki_pos, 0); coda_inode->i_size = file_inode(host_file)->i_size; coda_inode->i_blocks = (coda_inode->i_size + 511) >> 9; - coda_inode->i_mtime = coda_inode->i_ctime = current_time(coda_inode); + inode_set_mtime_to_ts(coda_inode, inode_set_ctime_current(coda_inode)); inode_unlock(coda_inode); file_end_write(host_file); diff --git a/fs/coda/inode.c b/fs/coda/inode.c index d661e6cf17ac..0c7c2528791e 100644 --- a/fs/coda/inode.c +++ b/fs/coda/inode.c @@ -256,7 +256,8 @@ int coda_getattr(struct mnt_idmap *idmap, const struct path *path, { int err = coda_revalidate_inode(d_inode(path->dentry)); if (!err) - generic_fillattr(&nop_mnt_idmap, d_inode(path->dentry), stat); + generic_fillattr(&nop_mnt_idmap, request_mask, + d_inode(path->dentry), stat); return err; } @@ -269,7 +270,7 @@ int coda_setattr(struct mnt_idmap *idmap, struct dentry *de, memset(&vattr, 0, sizeof(vattr)); - inode->i_ctime = current_time(inode); + inode_set_ctime_current(inode); coda_iattr_to_vattr(iattr, &vattr); vattr.va_type = C_VNON; /* cannot set type */ diff --git a/fs/configfs/inode.c b/fs/configfs/inode.c index 1c15edbe70ff..dcc22f593e43 100644 --- a/fs/configfs/inode.c +++ b/fs/configfs/inode.c @@ -88,8 +88,7 @@ int configfs_setattr(struct mnt_idmap *idmap, struct dentry *dentry, static inline void set_default_inode_attr(struct inode * inode, umode_t mode) { inode->i_mode = mode; - inode->i_atime = inode->i_mtime = - inode->i_ctime = current_time(inode); + simple_inode_init_ts(inode); } static inline void set_inode_attr(struct inode * inode, struct iattr * iattr) @@ -97,9 +96,9 @@ static inline void set_inode_attr(struct inode * inode, struct iattr * iattr) inode->i_mode = iattr->ia_mode; inode->i_uid = iattr->ia_uid; inode->i_gid = iattr->ia_gid; - inode->i_atime = iattr->ia_atime; - inode->i_mtime = iattr->ia_mtime; - inode->i_ctime = iattr->ia_ctime; + inode_set_atime_to_ts(inode, iattr->ia_atime); + inode_set_mtime_to_ts(inode, iattr->ia_mtime); + inode_set_ctime_to_ts(inode, iattr->ia_ctime); } struct inode *configfs_new_inode(umode_t mode, struct configfs_dirent *sd, @@ -172,7 +171,7 @@ struct inode *configfs_create(struct dentry *dentry, umode_t mode) return ERR_PTR(-ENOMEM); p_inode = d_inode(dentry->d_parent); - p_inode->i_mtime = p_inode->i_ctime = current_time(p_inode); + inode_set_mtime_to_ts(p_inode, inode_set_ctime_current(p_inode)); configfs_set_inode_lock_class(sd, inode); return inode; } diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c index 27c6597aa1be..60dbfa0f8805 100644 --- a/fs/cramfs/inode.c +++ b/fs/cramfs/inode.c @@ -133,7 +133,8 @@ static struct inode *get_cramfs_inode(struct super_block *sb, } /* Struct copy intentional */ - inode->i_mtime = inode->i_atime = inode->i_ctime = zerotime; + inode_set_mtime_to_ts(inode, + inode_set_atime_to_ts(inode, inode_set_ctime_to_ts(inode, zerotime))); /* inode->i_nlink is left 1 - arguably wrong for directories, but it's the best we can do without reading the directory contents. 1 yields the right result in GNU find, even @@ -485,12 +486,16 @@ static void cramfs_kill_sb(struct super_block *sb) { struct cramfs_sb_info *sbi = CRAMFS_SB(sb); + generic_shutdown_super(sb); + if (IS_ENABLED(CONFIG_CRAMFS_MTD) && sb->s_mtd) { if (sbi && sbi->mtd_point_size) mtd_unpoint(sb->s_mtd, 0, sbi->mtd_point_size); - kill_mtd_super(sb); + put_mtd_device(sb->s_mtd); + sb->s_mtd = NULL; } else if (IS_ENABLED(CONFIG_CRAMFS_BLOCKDEV) && sb->s_bdev) { - kill_block_super(sb); + sync_blockdev(sb->s_bdev); + bdev_release(sb->s_bdev_handle); } kfree(sbi); } diff --git a/fs/crypto/bio.c b/fs/crypto/bio.c index 62e1a3dd8357..0ad8c30b8fa5 100644 --- a/fs/crypto/bio.c +++ b/fs/crypto/bio.c @@ -111,10 +111,14 @@ out: int fscrypt_zeroout_range(const struct inode *inode, pgoff_t lblk, sector_t pblk, unsigned int len) { - const unsigned int blockbits = inode->i_blkbits; - const unsigned int blocksize = 1 << blockbits; - const unsigned int blocks_per_page_bits = PAGE_SHIFT - blockbits; - const unsigned int blocks_per_page = 1 << blocks_per_page_bits; + const struct fscrypt_inode_info *ci = inode->i_crypt_info; + const unsigned int du_bits = ci->ci_data_unit_bits; + const unsigned int du_size = 1U << du_bits; + const unsigned int du_per_page_bits = PAGE_SHIFT - du_bits; + const unsigned int du_per_page = 1U << du_per_page_bits; + u64 du_index = (u64)lblk << (inode->i_blkbits - du_bits); + u64 du_remaining = (u64)len << (inode->i_blkbits - du_bits); + sector_t sector = pblk << (inode->i_blkbits - SECTOR_SHIFT); struct page *pages[16]; /* write up to 16 pages at a time */ unsigned int nr_pages; unsigned int i; @@ -130,8 +134,8 @@ int fscrypt_zeroout_range(const struct inode *inode, pgoff_t lblk, len); BUILD_BUG_ON(ARRAY_SIZE(pages) > BIO_MAX_VECS); - nr_pages = min_t(unsigned int, ARRAY_SIZE(pages), - (len + blocks_per_page - 1) >> blocks_per_page_bits); + nr_pages = min_t(u64, ARRAY_SIZE(pages), + (du_remaining + du_per_page - 1) >> du_per_page_bits); /* * We need at least one page for ciphertext. Allocate the first one @@ -154,21 +158,22 @@ int fscrypt_zeroout_range(const struct inode *inode, pgoff_t lblk, bio = bio_alloc(inode->i_sb->s_bdev, nr_pages, REQ_OP_WRITE, GFP_NOFS); do { - bio->bi_iter.bi_sector = pblk << (blockbits - 9); + bio->bi_iter.bi_sector = sector; i = 0; offset = 0; do { - err = fscrypt_crypt_block(inode, FS_ENCRYPT, lblk, - ZERO_PAGE(0), pages[i], - blocksize, offset, GFP_NOFS); + err = fscrypt_crypt_data_unit(ci, FS_ENCRYPT, du_index, + ZERO_PAGE(0), pages[i], + du_size, offset, + GFP_NOFS); if (err) goto out; - lblk++; - pblk++; - len--; - offset += blocksize; - if (offset == PAGE_SIZE || len == 0) { + du_index++; + sector += 1U << (du_bits - SECTOR_SHIFT); + du_remaining--; + offset += du_size; + if (offset == PAGE_SIZE || du_remaining == 0) { ret = bio_add_page(bio, pages[i++], offset, 0); if (WARN_ON_ONCE(ret != offset)) { err = -EIO; @@ -176,13 +181,13 @@ int fscrypt_zeroout_range(const struct inode *inode, pgoff_t lblk, } offset = 0; } - } while (i != nr_pages && len != 0); + } while (i != nr_pages && du_remaining != 0); err = submit_bio_wait(bio); if (err) goto out; bio_reset(bio, inode->i_sb->s_bdev, REQ_OP_WRITE); - } while (len != 0); + } while (du_remaining != 0); err = 0; out: bio_put(bio); diff --git a/fs/crypto/crypto.c b/fs/crypto/crypto.c index 6a837e4b80dc..328470d40dec 100644 --- a/fs/crypto/crypto.c +++ b/fs/crypto/crypto.c @@ -39,7 +39,7 @@ static mempool_t *fscrypt_bounce_page_pool = NULL; static struct workqueue_struct *fscrypt_read_workqueue; static DEFINE_MUTEX(fscrypt_init_mutex); -struct kmem_cache *fscrypt_info_cachep; +struct kmem_cache *fscrypt_inode_info_cachep; void fscrypt_enqueue_decrypt_work(struct work_struct *work) { @@ -49,6 +49,13 @@ EXPORT_SYMBOL(fscrypt_enqueue_decrypt_work); struct page *fscrypt_alloc_bounce_page(gfp_t gfp_flags) { + if (WARN_ON_ONCE(!fscrypt_bounce_page_pool)) { + /* + * Oops, the filesystem called a function that uses the bounce + * page pool, but it didn't set needs_bounce_pages. + */ + return NULL; + } return mempool_alloc(fscrypt_bounce_page_pool, gfp_flags); } @@ -70,44 +77,44 @@ void fscrypt_free_bounce_page(struct page *bounce_page) EXPORT_SYMBOL(fscrypt_free_bounce_page); /* - * Generate the IV for the given logical block number within the given file. - * For filenames encryption, lblk_num == 0. + * Generate the IV for the given data unit index within the given file. + * For filenames encryption, index == 0. * * Keep this in sync with fscrypt_limit_io_blocks(). fscrypt_limit_io_blocks() * needs to know about any IV generation methods where the low bits of IV don't - * simply contain the lblk_num (e.g., IV_INO_LBLK_32). + * simply contain the data unit index (e.g., IV_INO_LBLK_32). */ -void fscrypt_generate_iv(union fscrypt_iv *iv, u64 lblk_num, - const struct fscrypt_info *ci) +void fscrypt_generate_iv(union fscrypt_iv *iv, u64 index, + const struct fscrypt_inode_info *ci) { u8 flags = fscrypt_policy_flags(&ci->ci_policy); memset(iv, 0, ci->ci_mode->ivsize); if (flags & FSCRYPT_POLICY_FLAG_IV_INO_LBLK_64) { - WARN_ON_ONCE(lblk_num > U32_MAX); + WARN_ON_ONCE(index > U32_MAX); WARN_ON_ONCE(ci->ci_inode->i_ino > U32_MAX); - lblk_num |= (u64)ci->ci_inode->i_ino << 32; + index |= (u64)ci->ci_inode->i_ino << 32; } else if (flags & FSCRYPT_POLICY_FLAG_IV_INO_LBLK_32) { - WARN_ON_ONCE(lblk_num > U32_MAX); - lblk_num = (u32)(ci->ci_hashed_ino + lblk_num); + WARN_ON_ONCE(index > U32_MAX); + index = (u32)(ci->ci_hashed_ino + index); } else if (flags & FSCRYPT_POLICY_FLAG_DIRECT_KEY) { memcpy(iv->nonce, ci->ci_nonce, FSCRYPT_FILE_NONCE_SIZE); } - iv->lblk_num = cpu_to_le64(lblk_num); + iv->index = cpu_to_le64(index); } -/* Encrypt or decrypt a single filesystem block of file contents */ -int fscrypt_crypt_block(const struct inode *inode, fscrypt_direction_t rw, - u64 lblk_num, struct page *src_page, - struct page *dest_page, unsigned int len, - unsigned int offs, gfp_t gfp_flags) +/* Encrypt or decrypt a single "data unit" of file contents. */ +int fscrypt_crypt_data_unit(const struct fscrypt_inode_info *ci, + fscrypt_direction_t rw, u64 index, + struct page *src_page, struct page *dest_page, + unsigned int len, unsigned int offs, + gfp_t gfp_flags) { union fscrypt_iv iv; struct skcipher_request *req = NULL; DECLARE_CRYPTO_WAIT(wait); struct scatterlist dst, src; - struct fscrypt_info *ci = inode->i_crypt_info; struct crypto_skcipher *tfm = ci->ci_enc_key.tfm; int res = 0; @@ -116,7 +123,7 @@ int fscrypt_crypt_block(const struct inode *inode, fscrypt_direction_t rw, if (WARN_ON_ONCE(len % FSCRYPT_CONTENTS_ALIGNMENT != 0)) return -EINVAL; - fscrypt_generate_iv(&iv, lblk_num, ci); + fscrypt_generate_iv(&iv, index, ci); req = skcipher_request_alloc(tfm, gfp_flags); if (!req) @@ -137,28 +144,29 @@ int fscrypt_crypt_block(const struct inode *inode, fscrypt_direction_t rw, res = crypto_wait_req(crypto_skcipher_encrypt(req), &wait); skcipher_request_free(req); if (res) { - fscrypt_err(inode, "%scryption failed for block %llu: %d", - (rw == FS_DECRYPT ? "De" : "En"), lblk_num, res); + fscrypt_err(ci->ci_inode, + "%scryption failed for data unit %llu: %d", + (rw == FS_DECRYPT ? "De" : "En"), index, res); return res; } return 0; } /** - * fscrypt_encrypt_pagecache_blocks() - Encrypt filesystem blocks from a - * pagecache page - * @page: The locked pagecache page containing the block(s) to encrypt - * @len: Total size of the block(s) to encrypt. Must be a nonzero - * multiple of the filesystem's block size. - * @offs: Byte offset within @page of the first block to encrypt. Must be - * a multiple of the filesystem's block size. - * @gfp_flags: Memory allocation flags. See details below. + * fscrypt_encrypt_pagecache_blocks() - Encrypt data from a pagecache page + * @page: the locked pagecache page containing the data to encrypt + * @len: size of the data to encrypt, in bytes + * @offs: offset within @page of the data to encrypt, in bytes + * @gfp_flags: memory allocation flags; see details below + * + * This allocates a new bounce page and encrypts the given data into it. The + * length and offset of the data must be aligned to the file's crypto data unit + * size. Alignment to the filesystem block size fulfills this requirement, as + * the filesystem block size is always a multiple of the data unit size. * - * A new bounce page is allocated, and the specified block(s) are encrypted into - * it. In the bounce page, the ciphertext block(s) will be located at the same - * offsets at which the plaintext block(s) were located in the source page; any - * other parts of the bounce page will be left uninitialized. However, normally - * blocksize == PAGE_SIZE and the whole page is encrypted at once. + * In the bounce page, the ciphertext data will be located at the same offset at + * which the plaintext data was located in the source page. Any other parts of + * the bounce page will be left uninitialized. * * This is for use by the filesystem's ->writepages() method. * @@ -176,28 +184,29 @@ struct page *fscrypt_encrypt_pagecache_blocks(struct page *page, { const struct inode *inode = page->mapping->host; - const unsigned int blockbits = inode->i_blkbits; - const unsigned int blocksize = 1 << blockbits; + const struct fscrypt_inode_info *ci = inode->i_crypt_info; + const unsigned int du_bits = ci->ci_data_unit_bits; + const unsigned int du_size = 1U << du_bits; struct page *ciphertext_page; - u64 lblk_num = ((u64)page->index << (PAGE_SHIFT - blockbits)) + - (offs >> blockbits); + u64 index = ((u64)page->index << (PAGE_SHIFT - du_bits)) + + (offs >> du_bits); unsigned int i; int err; if (WARN_ON_ONCE(!PageLocked(page))) return ERR_PTR(-EINVAL); - if (WARN_ON_ONCE(len <= 0 || !IS_ALIGNED(len | offs, blocksize))) + if (WARN_ON_ONCE(len <= 0 || !IS_ALIGNED(len | offs, du_size))) return ERR_PTR(-EINVAL); ciphertext_page = fscrypt_alloc_bounce_page(gfp_flags); if (!ciphertext_page) return ERR_PTR(-ENOMEM); - for (i = offs; i < offs + len; i += blocksize, lblk_num++) { - err = fscrypt_crypt_block(inode, FS_ENCRYPT, lblk_num, - page, ciphertext_page, - blocksize, i, gfp_flags); + for (i = offs; i < offs + len; i += du_size, index++) { + err = fscrypt_crypt_data_unit(ci, FS_ENCRYPT, index, + page, ciphertext_page, + du_size, i, gfp_flags); if (err) { fscrypt_free_bounce_page(ciphertext_page); return ERR_PTR(err); @@ -224,30 +233,33 @@ EXPORT_SYMBOL(fscrypt_encrypt_pagecache_blocks); * arbitrary page, not necessarily in the original pagecache page. The @inode * and @lblk_num must be specified, as they can't be determined from @page. * + * This is not compatible with fscrypt_operations::supports_subblock_data_units. + * * Return: 0 on success; -errno on failure */ int fscrypt_encrypt_block_inplace(const struct inode *inode, struct page *page, unsigned int len, unsigned int offs, u64 lblk_num, gfp_t gfp_flags) { - return fscrypt_crypt_block(inode, FS_ENCRYPT, lblk_num, page, page, - len, offs, gfp_flags); + if (WARN_ON_ONCE(inode->i_sb->s_cop->supports_subblock_data_units)) + return -EOPNOTSUPP; + return fscrypt_crypt_data_unit(inode->i_crypt_info, FS_ENCRYPT, + lblk_num, page, page, len, offs, + gfp_flags); } EXPORT_SYMBOL(fscrypt_encrypt_block_inplace); /** - * fscrypt_decrypt_pagecache_blocks() - Decrypt filesystem blocks in a - * pagecache folio - * @folio: The locked pagecache folio containing the block(s) to decrypt - * @len: Total size of the block(s) to decrypt. Must be a nonzero - * multiple of the filesystem's block size. - * @offs: Byte offset within @folio of the first block to decrypt. Must be - * a multiple of the filesystem's block size. + * fscrypt_decrypt_pagecache_blocks() - Decrypt data from a pagecache folio + * @folio: the pagecache folio containing the data to decrypt + * @len: size of the data to decrypt, in bytes + * @offs: offset within @folio of the data to decrypt, in bytes * - * The specified block(s) are decrypted in-place within the pagecache folio, - * which must still be locked and not uptodate. - * - * This is for use by the filesystem's ->readahead() method. + * Decrypt data that has just been read from an encrypted file. The data must + * be located in a pagecache folio that is still locked and not yet uptodate. + * The length and offset of the data must be aligned to the file's crypto data + * unit size. Alignment to the filesystem block size fulfills this requirement, + * as the filesystem block size is always a multiple of the data unit size. * * Return: 0 on success; -errno on failure */ @@ -255,25 +267,26 @@ int fscrypt_decrypt_pagecache_blocks(struct folio *folio, size_t len, size_t offs) { const struct inode *inode = folio->mapping->host; - const unsigned int blockbits = inode->i_blkbits; - const unsigned int blocksize = 1 << blockbits; - u64 lblk_num = ((u64)folio->index << (PAGE_SHIFT - blockbits)) + - (offs >> blockbits); + const struct fscrypt_inode_info *ci = inode->i_crypt_info; + const unsigned int du_bits = ci->ci_data_unit_bits; + const unsigned int du_size = 1U << du_bits; + u64 index = ((u64)folio->index << (PAGE_SHIFT - du_bits)) + + (offs >> du_bits); size_t i; int err; if (WARN_ON_ONCE(!folio_test_locked(folio))) return -EINVAL; - if (WARN_ON_ONCE(len <= 0 || !IS_ALIGNED(len | offs, blocksize))) + if (WARN_ON_ONCE(len <= 0 || !IS_ALIGNED(len | offs, du_size))) return -EINVAL; - for (i = offs; i < offs + len; i += blocksize, lblk_num++) { + for (i = offs; i < offs + len; i += du_size, index++) { struct page *page = folio_page(folio, i >> PAGE_SHIFT); - err = fscrypt_crypt_block(inode, FS_DECRYPT, lblk_num, page, - page, blocksize, i & ~PAGE_MASK, - GFP_NOFS); + err = fscrypt_crypt_data_unit(ci, FS_DECRYPT, index, page, + page, du_size, i & ~PAGE_MASK, + GFP_NOFS); if (err) return err; } @@ -295,14 +308,19 @@ EXPORT_SYMBOL(fscrypt_decrypt_pagecache_blocks); * arbitrary page, not necessarily in the original pagecache page. The @inode * and @lblk_num must be specified, as they can't be determined from @page. * + * This is not compatible with fscrypt_operations::supports_subblock_data_units. + * * Return: 0 on success; -errno on failure */ int fscrypt_decrypt_block_inplace(const struct inode *inode, struct page *page, unsigned int len, unsigned int offs, u64 lblk_num) { - return fscrypt_crypt_block(inode, FS_DECRYPT, lblk_num, page, page, - len, offs, GFP_NOFS); + if (WARN_ON_ONCE(inode->i_sb->s_cop->supports_subblock_data_units)) + return -EOPNOTSUPP; + return fscrypt_crypt_data_unit(inode->i_crypt_info, FS_DECRYPT, + lblk_num, page, page, len, offs, + GFP_NOFS); } EXPORT_SYMBOL(fscrypt_decrypt_block_inplace); @@ -325,7 +343,7 @@ int fscrypt_initialize(struct super_block *sb) return 0; /* No need to allocate a bounce page pool if this FS won't use it. */ - if (sb->s_cop->flags & FS_CFLG_OWN_PAGES) + if (!sb->s_cop->needs_bounce_pages) return 0; mutex_lock(&fscrypt_init_mutex); @@ -391,18 +409,19 @@ static int __init fscrypt_init(void) if (!fscrypt_read_workqueue) goto fail; - fscrypt_info_cachep = KMEM_CACHE(fscrypt_info, SLAB_RECLAIM_ACCOUNT); - if (!fscrypt_info_cachep) + fscrypt_inode_info_cachep = KMEM_CACHE(fscrypt_inode_info, + SLAB_RECLAIM_ACCOUNT); + if (!fscrypt_inode_info_cachep) goto fail_free_queue; err = fscrypt_init_keyring(); if (err) - goto fail_free_info; + goto fail_free_inode_info; return 0; -fail_free_info: - kmem_cache_destroy(fscrypt_info_cachep); +fail_free_inode_info: + kmem_cache_destroy(fscrypt_inode_info_cachep); fail_free_queue: destroy_workqueue(fscrypt_read_workqueue); fail: diff --git a/fs/crypto/fname.c b/fs/crypto/fname.c index 6eae3f12ad50..7b3fc189593a 100644 --- a/fs/crypto/fname.c +++ b/fs/crypto/fname.c @@ -100,7 +100,7 @@ int fscrypt_fname_encrypt(const struct inode *inode, const struct qstr *iname, { struct skcipher_request *req = NULL; DECLARE_CRYPTO_WAIT(wait); - const struct fscrypt_info *ci = inode->i_crypt_info; + const struct fscrypt_inode_info *ci = inode->i_crypt_info; struct crypto_skcipher *tfm = ci->ci_enc_key.tfm; union fscrypt_iv iv; struct scatterlist sg; @@ -157,7 +157,7 @@ static int fname_decrypt(const struct inode *inode, struct skcipher_request *req = NULL; DECLARE_CRYPTO_WAIT(wait); struct scatterlist src_sg, dst_sg; - const struct fscrypt_info *ci = inode->i_crypt_info; + const struct fscrypt_inode_info *ci = inode->i_crypt_info; struct crypto_skcipher *tfm = ci->ci_enc_key.tfm; union fscrypt_iv iv; int res; @@ -568,7 +568,7 @@ EXPORT_SYMBOL_GPL(fscrypt_match_name); */ u64 fscrypt_fname_siphash(const struct inode *dir, const struct qstr *name) { - const struct fscrypt_info *ci = dir->i_crypt_info; + const struct fscrypt_inode_info *ci = dir->i_crypt_info; WARN_ON_ONCE(!ci->ci_dirhash_key_initialized); diff --git a/fs/crypto/fscrypt_private.h b/fs/crypto/fscrypt_private.h index 2d63da48635a..1892356cf924 100644 --- a/fs/crypto/fscrypt_private.h +++ b/fs/crypto/fscrypt_private.h @@ -47,7 +47,8 @@ struct fscrypt_context_v2 { u8 contents_encryption_mode; u8 filenames_encryption_mode; u8 flags; - u8 __reserved[4]; + u8 log2_data_unit_size; + u8 __reserved[3]; u8 master_key_identifier[FSCRYPT_KEY_IDENTIFIER_SIZE]; u8 nonce[FSCRYPT_FILE_NONCE_SIZE]; }; @@ -165,6 +166,26 @@ fscrypt_policy_flags(const union fscrypt_policy *policy) BUG(); } +static inline int +fscrypt_policy_v2_du_bits(const struct fscrypt_policy_v2 *policy, + const struct inode *inode) +{ + return policy->log2_data_unit_size ?: inode->i_blkbits; +} + +static inline int +fscrypt_policy_du_bits(const union fscrypt_policy *policy, + const struct inode *inode) +{ + switch (policy->version) { + case FSCRYPT_POLICY_V1: + return inode->i_blkbits; + case FSCRYPT_POLICY_V2: + return fscrypt_policy_v2_du_bits(&policy->v2, inode); + } + BUG(); +} + /* * For encrypted symlinks, the ciphertext length is stored at the beginning * of the string in little-endian format. @@ -189,18 +210,18 @@ struct fscrypt_prepared_key { }; /* - * fscrypt_info - the "encryption key" for an inode + * fscrypt_inode_info - the "encryption key" for an inode * * When an encrypted file's key is made available, an instance of this struct is * allocated and stored in ->i_crypt_info. Once created, it remains until the * inode is evicted. */ -struct fscrypt_info { +struct fscrypt_inode_info { /* The key in a form prepared for actual encryption/decryption */ struct fscrypt_prepared_key ci_enc_key; - /* True if ci_enc_key should be freed when this fscrypt_info is freed */ + /* True if ci_enc_key should be freed when this struct is freed */ bool ci_owns_key; #ifdef CONFIG_FS_ENCRYPTION_INLINE_CRYPT @@ -212,6 +233,16 @@ struct fscrypt_info { #endif /* + * log2 of the data unit size (granularity of contents encryption) of + * this file. This is computable from ci_policy and ci_inode but is + * cached here for efficiency. Only used for regular files. + */ + u8 ci_data_unit_bits; + + /* Cached value: log2 of number of data units per FS block */ + u8 ci_data_units_per_block_bits; + + /* * Encryption mode used for this inode. It corresponds to either the * contents or filenames encryption mode, depending on the inode type. */ @@ -263,12 +294,13 @@ typedef enum { } fscrypt_direction_t; /* crypto.c */ -extern struct kmem_cache *fscrypt_info_cachep; +extern struct kmem_cache *fscrypt_inode_info_cachep; int fscrypt_initialize(struct super_block *sb); -int fscrypt_crypt_block(const struct inode *inode, fscrypt_direction_t rw, - u64 lblk_num, struct page *src_page, - struct page *dest_page, unsigned int len, - unsigned int offs, gfp_t gfp_flags); +int fscrypt_crypt_data_unit(const struct fscrypt_inode_info *ci, + fscrypt_direction_t rw, u64 index, + struct page *src_page, struct page *dest_page, + unsigned int len, unsigned int offs, + gfp_t gfp_flags); struct page *fscrypt_alloc_bounce_page(gfp_t gfp_flags); void __printf(3, 4) __cold @@ -283,8 +315,8 @@ fscrypt_msg(const struct inode *inode, const char *level, const char *fmt, ...); union fscrypt_iv { struct { - /* logical block number within the file */ - __le64 lblk_num; + /* zero-based index of data unit within the file */ + __le64 index; /* per-file nonce; only set in DIRECT_KEY mode */ u8 nonce[FSCRYPT_FILE_NONCE_SIZE]; @@ -293,8 +325,18 @@ union fscrypt_iv { __le64 dun[FSCRYPT_MAX_IV_SIZE / sizeof(__le64)]; }; -void fscrypt_generate_iv(union fscrypt_iv *iv, u64 lblk_num, - const struct fscrypt_info *ci); +void fscrypt_generate_iv(union fscrypt_iv *iv, u64 index, + const struct fscrypt_inode_info *ci); + +/* + * Return the number of bits used by the maximum file data unit index that is + * possible on the given filesystem, using the given log2 data unit size. + */ +static inline int +fscrypt_max_file_dun_bits(const struct super_block *sb, int du_bits) +{ + return fls64(sb->s_maxbytes - 1) - du_bits; +} /* fname.c */ bool __fscrypt_fname_encrypted_size(const union fscrypt_policy *policy, @@ -332,17 +374,17 @@ void fscrypt_destroy_hkdf(struct fscrypt_hkdf *hkdf); /* inline_crypt.c */ #ifdef CONFIG_FS_ENCRYPTION_INLINE_CRYPT -int fscrypt_select_encryption_impl(struct fscrypt_info *ci); +int fscrypt_select_encryption_impl(struct fscrypt_inode_info *ci); static inline bool -fscrypt_using_inline_encryption(const struct fscrypt_info *ci) +fscrypt_using_inline_encryption(const struct fscrypt_inode_info *ci) { return ci->ci_inlinecrypt; } int fscrypt_prepare_inline_crypt_key(struct fscrypt_prepared_key *prep_key, const u8 *raw_key, - const struct fscrypt_info *ci); + const struct fscrypt_inode_info *ci); void fscrypt_destroy_inline_crypt_key(struct super_block *sb, struct fscrypt_prepared_key *prep_key); @@ -353,7 +395,7 @@ void fscrypt_destroy_inline_crypt_key(struct super_block *sb, */ static inline bool fscrypt_is_key_prepared(struct fscrypt_prepared_key *prep_key, - const struct fscrypt_info *ci) + const struct fscrypt_inode_info *ci) { /* * The two smp_load_acquire()'s here pair with the smp_store_release()'s @@ -370,13 +412,13 @@ fscrypt_is_key_prepared(struct fscrypt_prepared_key *prep_key, #else /* CONFIG_FS_ENCRYPTION_INLINE_CRYPT */ -static inline int fscrypt_select_encryption_impl(struct fscrypt_info *ci) +static inline int fscrypt_select_encryption_impl(struct fscrypt_inode_info *ci) { return 0; } static inline bool -fscrypt_using_inline_encryption(const struct fscrypt_info *ci) +fscrypt_using_inline_encryption(const struct fscrypt_inode_info *ci) { return false; } @@ -384,7 +426,7 @@ fscrypt_using_inline_encryption(const struct fscrypt_info *ci) static inline int fscrypt_prepare_inline_crypt_key(struct fscrypt_prepared_key *prep_key, const u8 *raw_key, - const struct fscrypt_info *ci) + const struct fscrypt_inode_info *ci) { WARN_ON_ONCE(1); return -EOPNOTSUPP; @@ -398,7 +440,7 @@ fscrypt_destroy_inline_crypt_key(struct super_block *sb, static inline bool fscrypt_is_key_prepared(struct fscrypt_prepared_key *prep_key, - const struct fscrypt_info *ci) + const struct fscrypt_inode_info *ci) { return smp_load_acquire(&prep_key->tfm) != NULL; } @@ -433,8 +475,28 @@ struct fscrypt_master_key_secret { * fscrypt_master_key - an in-use master key * * This represents a master encryption key which has been added to the - * filesystem and can be used to "unlock" the encrypted files which were - * encrypted with it. + * filesystem. There are three high-level states that a key can be in: + * + * FSCRYPT_KEY_STATUS_PRESENT + * Key is fully usable; it can be used to unlock inodes that are encrypted + * with it (this includes being able to create new inodes). ->mk_present + * indicates whether the key is in this state. ->mk_secret exists, the key + * is in the keyring, and ->mk_active_refs > 0 due to ->mk_present. + * + * FSCRYPT_KEY_STATUS_INCOMPLETELY_REMOVED + * Removal of this key has been initiated, but some inodes that were + * unlocked with it are still in-use. Like ABSENT, ->mk_secret is wiped, + * and the key can no longer be used to unlock inodes. Unlike ABSENT, the + * key is still in the keyring; ->mk_decrypted_inodes is nonempty; and + * ->mk_active_refs > 0, being equal to the size of ->mk_decrypted_inodes. + * + * This state transitions to ABSENT if ->mk_decrypted_inodes becomes empty, + * or to PRESENT if FS_IOC_ADD_ENCRYPTION_KEY is called again for this key. + * + * FSCRYPT_KEY_STATUS_ABSENT + * Key is fully removed. The key is no longer in the keyring, + * ->mk_decrypted_inodes is empty, ->mk_active_refs == 0, ->mk_secret is + * wiped, and the key can no longer be used to unlock inodes. */ struct fscrypt_master_key { @@ -444,7 +506,7 @@ struct fscrypt_master_key { */ struct hlist_node mk_node; - /* Semaphore that protects ->mk_secret and ->mk_users */ + /* Semaphore that protects ->mk_secret, ->mk_users, and ->mk_present */ struct rw_semaphore mk_sem; /* @@ -454,8 +516,8 @@ struct fscrypt_master_key { * ->mk_direct_keys) that have been prepared continue to exist. * A structural ref only guarantees that the struct continues to exist. * - * There is one active ref associated with ->mk_secret being present, - * and one active ref for each inode in ->mk_decrypted_inodes. + * There is one active ref associated with ->mk_present being true, and + * one active ref for each inode in ->mk_decrypted_inodes. * * There is one structural ref associated with the active refcount being * nonzero. Finding a key in the keyring also takes a structural ref, @@ -467,17 +529,10 @@ struct fscrypt_master_key { struct rcu_head mk_rcu_head; /* - * The secret key material. After FS_IOC_REMOVE_ENCRYPTION_KEY is - * executed, this is wiped and no new inodes can be unlocked with this - * key; however, there may still be inodes in ->mk_decrypted_inodes - * which could not be evicted. As long as some inodes still remain, - * FS_IOC_REMOVE_ENCRYPTION_KEY can be retried, or - * FS_IOC_ADD_ENCRYPTION_KEY can add the secret again. + * The secret key material. Wiped as soon as it is no longer needed; + * for details, see the fscrypt_master_key struct comment. * - * While ->mk_secret is present, one ref in ->mk_active_refs is held. - * - * Locking: protected by ->mk_sem. The manipulation of ->mk_active_refs - * associated with this field is protected by ->mk_sem as well. + * Locking: protected by ->mk_sem. */ struct fscrypt_master_key_secret mk_secret; @@ -500,7 +555,7 @@ struct fscrypt_master_key { * * Locking: protected by ->mk_sem. (We don't just rely on the keyrings * subsystem semaphore ->mk_users->sem, as we need support for atomic - * search+insert along with proper synchronization with ->mk_secret.) + * search+insert along with proper synchronization with other fields.) */ struct key *mk_users; @@ -523,20 +578,17 @@ struct fscrypt_master_key { siphash_key_t mk_ino_hash_key; bool mk_ino_hash_key_initialized; -} __randomize_layout; - -static inline bool -is_master_key_secret_present(const struct fscrypt_master_key_secret *secret) -{ /* - * The READ_ONCE() is only necessary for fscrypt_drop_inode(). - * fscrypt_drop_inode() runs in atomic context, so it can't take the key - * semaphore and thus 'secret' can change concurrently which would be a - * data race. But fscrypt_drop_inode() only need to know whether the - * secret *was* present at the time of check, so READ_ONCE() suffices. + * Whether this key is in the "present" state, i.e. fully usable. For + * details, see the fscrypt_master_key struct comment. + * + * Locking: protected by ->mk_sem, but can be read locklessly using + * READ_ONCE(). Writers must use WRITE_ONCE() when concurrent readers + * are possible. */ - return READ_ONCE(secret->size) != 0; -} + bool mk_present; + +} __randomize_layout; static inline const char *master_key_spec_type( const struct fscrypt_key_specifier *spec) @@ -598,17 +650,18 @@ struct fscrypt_mode { extern struct fscrypt_mode fscrypt_modes[]; int fscrypt_prepare_key(struct fscrypt_prepared_key *prep_key, - const u8 *raw_key, const struct fscrypt_info *ci); + const u8 *raw_key, const struct fscrypt_inode_info *ci); void fscrypt_destroy_prepared_key(struct super_block *sb, struct fscrypt_prepared_key *prep_key); -int fscrypt_set_per_file_enc_key(struct fscrypt_info *ci, const u8 *raw_key); +int fscrypt_set_per_file_enc_key(struct fscrypt_inode_info *ci, + const u8 *raw_key); -int fscrypt_derive_dirhash_key(struct fscrypt_info *ci, +int fscrypt_derive_dirhash_key(struct fscrypt_inode_info *ci, const struct fscrypt_master_key *mk); -void fscrypt_hash_inode_number(struct fscrypt_info *ci, +void fscrypt_hash_inode_number(struct fscrypt_inode_info *ci, const struct fscrypt_master_key *mk); int fscrypt_get_encryption_info(struct inode *inode, bool allow_unsupported); @@ -643,10 +696,11 @@ static inline int fscrypt_require_key(struct inode *inode) void fscrypt_put_direct_key(struct fscrypt_direct_key *dk); -int fscrypt_setup_v1_file_key(struct fscrypt_info *ci, +int fscrypt_setup_v1_file_key(struct fscrypt_inode_info *ci, const u8 *raw_master_key); -int fscrypt_setup_v1_file_key_via_subscribed_keyrings(struct fscrypt_info *ci); +int fscrypt_setup_v1_file_key_via_subscribed_keyrings( + struct fscrypt_inode_info *ci); /* policy.c */ diff --git a/fs/crypto/hooks.c b/fs/crypto/hooks.c index 6238dbcadcad..52504dd478d3 100644 --- a/fs/crypto/hooks.c +++ b/fs/crypto/hooks.c @@ -169,7 +169,7 @@ EXPORT_SYMBOL_GPL(__fscrypt_prepare_setattr); int fscrypt_prepare_setflags(struct inode *inode, unsigned int oldflags, unsigned int flags) { - struct fscrypt_info *ci; + struct fscrypt_inode_info *ci; struct fscrypt_master_key *mk; int err; @@ -187,7 +187,7 @@ int fscrypt_prepare_setflags(struct inode *inode, return -EINVAL; mk = ci->ci_master_key; down_read(&mk->mk_sem); - if (is_master_key_secret_present(&mk->mk_secret)) + if (mk->mk_present) err = fscrypt_derive_dirhash_key(ci, mk); else err = -ENOKEY; diff --git a/fs/crypto/inline_crypt.c b/fs/crypto/inline_crypt.c index 8bfb3ce86476..b4002aea7cdb 100644 --- a/fs/crypto/inline_crypt.c +++ b/fs/crypto/inline_crypt.c @@ -39,11 +39,11 @@ static struct block_device **fscrypt_get_devices(struct super_block *sb, return devs; } -static unsigned int fscrypt_get_dun_bytes(const struct fscrypt_info *ci) +static unsigned int fscrypt_get_dun_bytes(const struct fscrypt_inode_info *ci) { - struct super_block *sb = ci->ci_inode->i_sb; + const struct super_block *sb = ci->ci_inode->i_sb; unsigned int flags = fscrypt_policy_flags(&ci->ci_policy); - int ino_bits = 64, lblk_bits = 64; + int dun_bits; if (flags & FSCRYPT_POLICY_FLAG_DIRECT_KEY) return offsetofend(union fscrypt_iv, nonce); @@ -54,10 +54,9 @@ static unsigned int fscrypt_get_dun_bytes(const struct fscrypt_info *ci) if (flags & FSCRYPT_POLICY_FLAG_IV_INO_LBLK_32) return sizeof(__le32); - /* Default case: IVs are just the file logical block number */ - if (sb->s_cop->get_ino_and_lblk_bits) - sb->s_cop->get_ino_and_lblk_bits(sb, &ino_bits, &lblk_bits); - return DIV_ROUND_UP(lblk_bits, 8); + /* Default case: IVs are just the file data unit index */ + dun_bits = fscrypt_max_file_dun_bits(sb, ci->ci_data_unit_bits); + return DIV_ROUND_UP(dun_bits, 8); } /* @@ -90,7 +89,7 @@ static void fscrypt_log_blk_crypto_impl(struct fscrypt_mode *mode, } /* Enable inline encryption for this file if supported. */ -int fscrypt_select_encryption_impl(struct fscrypt_info *ci) +int fscrypt_select_encryption_impl(struct fscrypt_inode_info *ci) { const struct inode *inode = ci->ci_inode; struct super_block *sb = inode->i_sb; @@ -129,7 +128,7 @@ int fscrypt_select_encryption_impl(struct fscrypt_info *ci) * crypto configuration that the file would use. */ crypto_cfg.crypto_mode = ci->ci_mode->blk_crypto_mode; - crypto_cfg.data_unit_size = sb->s_blocksize; + crypto_cfg.data_unit_size = 1U << ci->ci_data_unit_bits; crypto_cfg.dun_bytes = fscrypt_get_dun_bytes(ci); devs = fscrypt_get_devices(sb, &num_devs); @@ -152,7 +151,7 @@ out_free_devs: int fscrypt_prepare_inline_crypt_key(struct fscrypt_prepared_key *prep_key, const u8 *raw_key, - const struct fscrypt_info *ci) + const struct fscrypt_inode_info *ci) { const struct inode *inode = ci->ci_inode; struct super_block *sb = inode->i_sb; @@ -168,7 +167,8 @@ int fscrypt_prepare_inline_crypt_key(struct fscrypt_prepared_key *prep_key, return -ENOMEM; err = blk_crypto_init_key(blk_key, raw_key, crypto_mode, - fscrypt_get_dun_bytes(ci), sb->s_blocksize); + fscrypt_get_dun_bytes(ci), + 1U << ci->ci_data_unit_bits); if (err) { fscrypt_err(inode, "error %d initializing blk-crypto key", err); goto fail; @@ -232,13 +232,15 @@ bool __fscrypt_inode_uses_inline_crypto(const struct inode *inode) } EXPORT_SYMBOL_GPL(__fscrypt_inode_uses_inline_crypto); -static void fscrypt_generate_dun(const struct fscrypt_info *ci, u64 lblk_num, +static void fscrypt_generate_dun(const struct fscrypt_inode_info *ci, + u64 lblk_num, u64 dun[BLK_CRYPTO_DUN_ARRAY_SIZE]) { + u64 index = lblk_num << ci->ci_data_units_per_block_bits; union fscrypt_iv iv; int i; - fscrypt_generate_iv(&iv, lblk_num, ci); + fscrypt_generate_iv(&iv, index, ci); BUILD_BUG_ON(FSCRYPT_MAX_IV_SIZE > BLK_CRYPTO_MAX_IV_SIZE); memset(dun, 0, BLK_CRYPTO_MAX_IV_SIZE); @@ -265,7 +267,7 @@ static void fscrypt_generate_dun(const struct fscrypt_info *ci, u64 lblk_num, void fscrypt_set_bio_crypt_ctx(struct bio *bio, const struct inode *inode, u64 first_lblk, gfp_t gfp_mask) { - const struct fscrypt_info *ci; + const struct fscrypt_inode_info *ci; u64 dun[BLK_CRYPTO_DUN_ARRAY_SIZE]; if (!fscrypt_inode_uses_inline_crypto(inode)) @@ -456,7 +458,7 @@ EXPORT_SYMBOL_GPL(fscrypt_dio_supported); */ u64 fscrypt_limit_io_blocks(const struct inode *inode, u64 lblk, u64 nr_blocks) { - const struct fscrypt_info *ci; + const struct fscrypt_inode_info *ci; u32 dun; if (!fscrypt_inode_uses_inline_crypto(inode)) diff --git a/fs/crypto/keyring.c b/fs/crypto/keyring.c index 7cbb1fd872ac..f34a9b0b9e92 100644 --- a/fs/crypto/keyring.c +++ b/fs/crypto/keyring.c @@ -99,10 +99,10 @@ void fscrypt_put_master_key_activeref(struct super_block *sb, spin_unlock(&sb->s_master_keys->lock); /* - * ->mk_active_refs == 0 implies that ->mk_secret is not present and - * that ->mk_decrypted_inodes is empty. + * ->mk_active_refs == 0 implies that ->mk_present is false and + * ->mk_decrypted_inodes is empty. */ - WARN_ON_ONCE(is_master_key_secret_present(&mk->mk_secret)); + WARN_ON_ONCE(mk->mk_present); WARN_ON_ONCE(!list_empty(&mk->mk_decrypted_inodes)); for (i = 0; i <= FSCRYPT_MODE_MAX; i++) { @@ -121,6 +121,18 @@ void fscrypt_put_master_key_activeref(struct super_block *sb, fscrypt_put_master_key(mk); } +/* + * This transitions the key state from present to incompletely removed, and then + * potentially to absent (depending on whether inodes remain). + */ +static void fscrypt_initiate_key_removal(struct super_block *sb, + struct fscrypt_master_key *mk) +{ + WRITE_ONCE(mk->mk_present, false); + wipe_master_key_secret(&mk->mk_secret); + fscrypt_put_master_key_activeref(sb, mk); +} + static inline bool valid_key_spec(const struct fscrypt_key_specifier *spec) { if (spec->__reserved) @@ -234,14 +246,13 @@ void fscrypt_destroy_keyring(struct super_block *sb) * evicted, every key remaining in the keyring should * have an empty inode list, and should only still be in * the keyring due to the single active ref associated - * with ->mk_secret. There should be no structural refs - * beyond the one associated with the active ref. + * with ->mk_present. There should be no structural + * refs beyond the one associated with the active ref. */ WARN_ON_ONCE(refcount_read(&mk->mk_active_refs) != 1); WARN_ON_ONCE(refcount_read(&mk->mk_struct_refs) != 1); - WARN_ON_ONCE(!is_master_key_secret_present(&mk->mk_secret)); - wipe_master_key_secret(&mk->mk_secret); - fscrypt_put_master_key_activeref(sb, mk); + WARN_ON_ONCE(!mk->mk_present); + fscrypt_initiate_key_removal(sb, mk); } } kfree_sensitive(keyring); @@ -439,7 +450,8 @@ static int add_new_master_key(struct super_block *sb, } move_master_key_secret(&mk->mk_secret, secret); - refcount_set(&mk->mk_active_refs, 1); /* ->mk_secret is present */ + mk->mk_present = true; + refcount_set(&mk->mk_active_refs, 1); /* ->mk_present is true */ spin_lock(&keyring->lock); hlist_add_head_rcu(&mk->mk_node, @@ -478,11 +490,18 @@ static int add_existing_master_key(struct fscrypt_master_key *mk, return err; } - /* Re-add the secret if needed. */ - if (!is_master_key_secret_present(&mk->mk_secret)) { - if (!refcount_inc_not_zero(&mk->mk_active_refs)) + /* If the key is incompletely removed, make it present again. */ + if (!mk->mk_present) { + if (!refcount_inc_not_zero(&mk->mk_active_refs)) { + /* + * Raced with the last active ref being dropped, so the + * key has become, or is about to become, "absent". + * Therefore, we need to allocate a new key struct. + */ return KEY_DEAD; + } move_master_key_secret(&mk->mk_secret, secret); + WRITE_ONCE(mk->mk_present, true); } return 0; @@ -506,8 +525,8 @@ static int do_add_master_key(struct super_block *sb, err = add_new_master_key(sb, secret, mk_spec); } else { /* - * Found the key in ->s_master_keys. Re-add the secret if - * needed, and add the user to ->mk_users if needed. + * Found the key in ->s_master_keys. Add the user to ->mk_users + * if needed, and make the key "present" again if possible. */ down_write(&mk->mk_sem); err = add_existing_master_key(mk, secret); @@ -867,7 +886,7 @@ static void shrink_dcache_inode(struct inode *inode) static void evict_dentries_for_decrypted_inodes(struct fscrypt_master_key *mk) { - struct fscrypt_info *ci; + struct fscrypt_inode_info *ci; struct inode *inode; struct inode *toput_inode = NULL; @@ -917,7 +936,7 @@ static int check_for_busy_inodes(struct super_block *sb, /* select an example file to show for debugging purposes */ struct inode *inode = list_first_entry(&mk->mk_decrypted_inodes, - struct fscrypt_info, + struct fscrypt_inode_info, ci_master_key_link)->ci_inode; ino = inode->i_ino; } @@ -989,9 +1008,8 @@ static int try_to_lock_encrypted_files(struct super_block *sb, * * If all inodes were evicted, then we unlink the fscrypt_master_key from the * keyring. Otherwise it remains in the keyring in the "incompletely removed" - * state (without the actual secret key) where it tracks the list of remaining - * inodes. Userspace can execute the ioctl again later to retry eviction, or - * alternatively can re-add the secret key again. + * state where it tracks the list of remaining inodes. Userspace can execute + * the ioctl again later to retry eviction, or alternatively can re-add the key. * * For more details, see the "Removing keys" section of * Documentation/filesystems/fscrypt.rst. @@ -1053,11 +1071,10 @@ static int do_remove_key(struct file *filp, void __user *_uarg, bool all_users) } } - /* No user claims remaining. Go ahead and wipe the secret. */ + /* No user claims remaining. Initiate removal of the key. */ err = -ENOKEY; - if (is_master_key_secret_present(&mk->mk_secret)) { - wipe_master_key_secret(&mk->mk_secret); - fscrypt_put_master_key_activeref(sb, mk); + if (mk->mk_present) { + fscrypt_initiate_key_removal(sb, mk); err = 0; } inodes_remain = refcount_read(&mk->mk_active_refs) > 0; @@ -1074,9 +1091,9 @@ static int do_remove_key(struct file *filp, void __user *_uarg, bool all_users) } /* * We return 0 if we successfully did something: removed a claim to the - * key, wiped the secret, or tried locking the files again. Users need - * to check the informational status flags if they care whether the key - * has been fully removed including all files locked. + * key, initiated removal of the key, or tried locking the files again. + * Users need to check the informational status flags if they care + * whether the key has been fully removed including all files locked. */ out_put_key: fscrypt_put_master_key(mk); @@ -1103,12 +1120,11 @@ EXPORT_SYMBOL_GPL(fscrypt_ioctl_remove_key_all_users); * Retrieve the status of an fscrypt master encryption key. * * We set ->status to indicate whether the key is absent, present, or - * incompletely removed. "Incompletely removed" means that the master key - * secret has been removed, but some files which had been unlocked with it are - * still in use. This field allows applications to easily determine the state - * of an encrypted directory without using a hack such as trying to open a - * regular file in it (which can confuse the "incompletely removed" state with - * absent or present). + * incompletely removed. (For an explanation of what these statuses mean and + * how they are represented internally, see struct fscrypt_master_key.) This + * field allows applications to easily determine the status of an encrypted + * directory without using a hack such as trying to open a regular file in it + * (which can confuse the "incompletely removed" status with absent or present). * * In addition, for v2 policy keys we allow applications to determine, via * ->status_flags and ->user_count, whether the key has been added by the @@ -1150,7 +1166,7 @@ int fscrypt_ioctl_get_key_status(struct file *filp, void __user *uarg) } down_read(&mk->mk_sem); - if (!is_master_key_secret_present(&mk->mk_secret)) { + if (!mk->mk_present) { arg.status = refcount_read(&mk->mk_active_refs) > 0 ? FSCRYPT_KEY_STATUS_INCOMPLETELY_REMOVED : FSCRYPT_KEY_STATUS_ABSENT /* raced with full removal */; diff --git a/fs/crypto/keysetup.c b/fs/crypto/keysetup.c index 361f41ef46c7..d71f7c799e79 100644 --- a/fs/crypto/keysetup.c +++ b/fs/crypto/keysetup.c @@ -148,7 +148,7 @@ err_free_tfm: * and IV generation method (@ci->ci_policy.flags). */ int fscrypt_prepare_key(struct fscrypt_prepared_key *prep_key, - const u8 *raw_key, const struct fscrypt_info *ci) + const u8 *raw_key, const struct fscrypt_inode_info *ci) { struct crypto_skcipher *tfm; @@ -178,13 +178,14 @@ void fscrypt_destroy_prepared_key(struct super_block *sb, } /* Given a per-file encryption key, set up the file's crypto transform object */ -int fscrypt_set_per_file_enc_key(struct fscrypt_info *ci, const u8 *raw_key) +int fscrypt_set_per_file_enc_key(struct fscrypt_inode_info *ci, + const u8 *raw_key) { ci->ci_owns_key = true; return fscrypt_prepare_key(&ci->ci_enc_key, raw_key, ci); } -static int setup_per_mode_enc_key(struct fscrypt_info *ci, +static int setup_per_mode_enc_key(struct fscrypt_inode_info *ci, struct fscrypt_master_key *mk, struct fscrypt_prepared_key *keys, u8 hkdf_context, bool include_fs_uuid) @@ -265,7 +266,7 @@ static int fscrypt_derive_siphash_key(const struct fscrypt_master_key *mk, return 0; } -int fscrypt_derive_dirhash_key(struct fscrypt_info *ci, +int fscrypt_derive_dirhash_key(struct fscrypt_inode_info *ci, const struct fscrypt_master_key *mk) { int err; @@ -279,7 +280,7 @@ int fscrypt_derive_dirhash_key(struct fscrypt_info *ci, return 0; } -void fscrypt_hash_inode_number(struct fscrypt_info *ci, +void fscrypt_hash_inode_number(struct fscrypt_inode_info *ci, const struct fscrypt_master_key *mk) { WARN_ON_ONCE(ci->ci_inode->i_ino == 0); @@ -289,7 +290,7 @@ void fscrypt_hash_inode_number(struct fscrypt_info *ci, &mk->mk_ino_hash_key); } -static int fscrypt_setup_iv_ino_lblk_32_key(struct fscrypt_info *ci, +static int fscrypt_setup_iv_ino_lblk_32_key(struct fscrypt_inode_info *ci, struct fscrypt_master_key *mk) { int err; @@ -329,7 +330,7 @@ unlock: return 0; } -static int fscrypt_setup_v2_file_key(struct fscrypt_info *ci, +static int fscrypt_setup_v2_file_key(struct fscrypt_inode_info *ci, struct fscrypt_master_key *mk, bool need_dirhash_key) { @@ -404,7 +405,7 @@ static int fscrypt_setup_v2_file_key(struct fscrypt_info *ci, * still allow 512-bit master keys if the user chooses to use them, though.) */ static bool fscrypt_valid_master_key_size(const struct fscrypt_master_key *mk, - const struct fscrypt_info *ci) + const struct fscrypt_inode_info *ci) { unsigned int min_keysize; @@ -430,11 +431,12 @@ static bool fscrypt_valid_master_key_size(const struct fscrypt_master_key *mk, * * If the master key is found in the filesystem-level keyring, then it is * returned in *mk_ret with its semaphore read-locked. This is needed to ensure - * that only one task links the fscrypt_info into ->mk_decrypted_inodes (as - * multiple tasks may race to create an fscrypt_info for the same inode), and to - * synchronize the master key being removed with a new inode starting to use it. + * that only one task links the fscrypt_inode_info into ->mk_decrypted_inodes + * (as multiple tasks may race to create an fscrypt_inode_info for the same + * inode), and to synchronize the master key being removed with a new inode + * starting to use it. */ -static int setup_file_encryption_key(struct fscrypt_info *ci, +static int setup_file_encryption_key(struct fscrypt_inode_info *ci, bool need_dirhash_key, struct fscrypt_master_key **mk_ret) { @@ -484,8 +486,8 @@ static int setup_file_encryption_key(struct fscrypt_info *ci, } down_read(&mk->mk_sem); - /* Has the secret been removed (via FS_IOC_REMOVE_ENCRYPTION_KEY)? */ - if (!is_master_key_secret_present(&mk->mk_secret)) { + if (!mk->mk_present) { + /* FS_IOC_REMOVE_ENCRYPTION_KEY has been executed on this key */ err = -ENOKEY; goto out_release_key; } @@ -519,7 +521,7 @@ out_release_key: return err; } -static void put_crypt_info(struct fscrypt_info *ci) +static void put_crypt_info(struct fscrypt_inode_info *ci) { struct fscrypt_master_key *mk; @@ -537,8 +539,8 @@ static void put_crypt_info(struct fscrypt_info *ci) /* * Remove this inode from the list of inodes that were unlocked * with the master key. In addition, if we're removing the last - * inode from a master key struct that already had its secret - * removed, then complete the full removal of the struct. + * inode from an incompletely removed key, then complete the + * full removal of the key. */ spin_lock(&mk->mk_decrypted_inodes_lock); list_del(&ci->ci_master_key_link); @@ -546,7 +548,7 @@ static void put_crypt_info(struct fscrypt_info *ci) fscrypt_put_master_key_activeref(ci->ci_inode->i_sb, mk); } memzero_explicit(ci, sizeof(*ci)); - kmem_cache_free(fscrypt_info_cachep, ci); + kmem_cache_free(fscrypt_inode_info_cachep, ci); } static int @@ -555,7 +557,7 @@ fscrypt_setup_encryption_info(struct inode *inode, const u8 nonce[FSCRYPT_FILE_NONCE_SIZE], bool need_dirhash_key) { - struct fscrypt_info *crypt_info; + struct fscrypt_inode_info *crypt_info; struct fscrypt_mode *mode; struct fscrypt_master_key *mk = NULL; int res; @@ -564,7 +566,7 @@ fscrypt_setup_encryption_info(struct inode *inode, if (res) return res; - crypt_info = kmem_cache_zalloc(fscrypt_info_cachep, GFP_KERNEL); + crypt_info = kmem_cache_zalloc(fscrypt_inode_info_cachep, GFP_KERNEL); if (!crypt_info) return -ENOMEM; @@ -580,6 +582,11 @@ fscrypt_setup_encryption_info(struct inode *inode, WARN_ON_ONCE(mode->ivsize > FSCRYPT_MAX_IV_SIZE); crypt_info->ci_mode = mode; + crypt_info->ci_data_unit_bits = + fscrypt_policy_du_bits(&crypt_info->ci_policy, inode); + crypt_info->ci_data_units_per_block_bits = + inode->i_blkbits - crypt_info->ci_data_unit_bits; + res = setup_file_encryption_key(crypt_info, need_dirhash_key, &mk); if (res) goto out; @@ -587,8 +594,8 @@ fscrypt_setup_encryption_info(struct inode *inode, /* * For existing inodes, multiple tasks may race to set ->i_crypt_info. * So use cmpxchg_release(). This pairs with the smp_load_acquire() in - * fscrypt_get_info(). I.e., here we publish ->i_crypt_info with a - * RELEASE barrier so that other tasks can ACQUIRE it. + * fscrypt_get_inode_info(). I.e., here we publish ->i_crypt_info with + * a RELEASE barrier so that other tasks can ACQUIRE it. */ if (cmpxchg_release(&inode->i_crypt_info, NULL, crypt_info) == NULL) { /* @@ -735,8 +742,8 @@ EXPORT_SYMBOL_GPL(fscrypt_prepare_new_inode); * fscrypt_put_encryption_info() - free most of an inode's fscrypt data * @inode: an inode being evicted * - * Free the inode's fscrypt_info. Filesystems must call this when the inode is - * being evicted. An RCU grace period need not have elapsed yet. + * Free the inode's fscrypt_inode_info. Filesystems must call this when the + * inode is being evicted. An RCU grace period need not have elapsed yet. */ void fscrypt_put_encryption_info(struct inode *inode) { @@ -773,7 +780,7 @@ EXPORT_SYMBOL(fscrypt_free_inode); */ int fscrypt_drop_inode(struct inode *inode) { - const struct fscrypt_info *ci = fscrypt_get_info(inode); + const struct fscrypt_inode_info *ci = fscrypt_get_inode_info(inode); /* * If ci is NULL, then the inode doesn't have an encryption key set up @@ -794,13 +801,14 @@ int fscrypt_drop_inode(struct inode *inode) return 0; /* - * Note: since we aren't holding the key semaphore, the result here can + * We can't take ->mk_sem here, since this runs in atomic context. + * Therefore, ->mk_present can change concurrently, and our result may * immediately become outdated. But there's no correctness problem with * unnecessarily evicting. Nor is there a correctness problem with not * evicting while iput() is racing with the key being removed, since * then the thread removing the key will either evict the inode itself * or will correctly detect that it wasn't evicted due to the race. */ - return !is_master_key_secret_present(&ci->ci_master_key->mk_secret); + return !READ_ONCE(ci->ci_master_key->mk_present); } EXPORT_SYMBOL_GPL(fscrypt_drop_inode); diff --git a/fs/crypto/keysetup_v1.c b/fs/crypto/keysetup_v1.c index 75dabd9b27f9..cf3b58ec32cc 100644 --- a/fs/crypto/keysetup_v1.c +++ b/fs/crypto/keysetup_v1.c @@ -20,8 +20,8 @@ * managed alongside the master keys in the filesystem-level keyring) */ -#include <crypto/algapi.h> #include <crypto/skcipher.h> +#include <crypto/utils.h> #include <keys/user-type.h> #include <linux/hashtable.h> #include <linux/scatterlist.h> @@ -178,7 +178,8 @@ void fscrypt_put_direct_key(struct fscrypt_direct_key *dk) */ static struct fscrypt_direct_key * find_or_insert_direct_key(struct fscrypt_direct_key *to_insert, - const u8 *raw_key, const struct fscrypt_info *ci) + const u8 *raw_key, + const struct fscrypt_inode_info *ci) { unsigned long hash_key; struct fscrypt_direct_key *dk; @@ -218,7 +219,7 @@ find_or_insert_direct_key(struct fscrypt_direct_key *to_insert, /* Prepare to encrypt directly using the master key in the given mode */ static struct fscrypt_direct_key * -fscrypt_get_direct_key(const struct fscrypt_info *ci, const u8 *raw_key) +fscrypt_get_direct_key(const struct fscrypt_inode_info *ci, const u8 *raw_key) { struct fscrypt_direct_key *dk; int err; @@ -250,7 +251,7 @@ err_free_dk: } /* v1 policy, DIRECT_KEY: use the master key directly */ -static int setup_v1_file_key_direct(struct fscrypt_info *ci, +static int setup_v1_file_key_direct(struct fscrypt_inode_info *ci, const u8 *raw_master_key) { struct fscrypt_direct_key *dk; @@ -264,7 +265,7 @@ static int setup_v1_file_key_direct(struct fscrypt_info *ci, } /* v1 policy, !DIRECT_KEY: derive the file's encryption key */ -static int setup_v1_file_key_derived(struct fscrypt_info *ci, +static int setup_v1_file_key_derived(struct fscrypt_inode_info *ci, const u8 *raw_master_key) { u8 *derived_key; @@ -289,7 +290,8 @@ out: return err; } -int fscrypt_setup_v1_file_key(struct fscrypt_info *ci, const u8 *raw_master_key) +int fscrypt_setup_v1_file_key(struct fscrypt_inode_info *ci, + const u8 *raw_master_key) { if (ci->ci_policy.v1.flags & FSCRYPT_POLICY_FLAG_DIRECT_KEY) return setup_v1_file_key_direct(ci, raw_master_key); @@ -297,8 +299,10 @@ int fscrypt_setup_v1_file_key(struct fscrypt_info *ci, const u8 *raw_master_key) return setup_v1_file_key_derived(ci, raw_master_key); } -int fscrypt_setup_v1_file_key_via_subscribed_keyrings(struct fscrypt_info *ci) +int +fscrypt_setup_v1_file_key_via_subscribed_keyrings(struct fscrypt_inode_info *ci) { + const struct super_block *sb = ci->ci_inode->i_sb; struct key *key; const struct fscrypt_key *payload; int err; @@ -306,8 +310,8 @@ int fscrypt_setup_v1_file_key_via_subscribed_keyrings(struct fscrypt_info *ci) key = find_and_lock_process_key(FSCRYPT_KEY_DESC_PREFIX, ci->ci_policy.v1.master_key_descriptor, ci->ci_mode->keysize, &payload); - if (key == ERR_PTR(-ENOKEY) && ci->ci_inode->i_sb->s_cop->key_prefix) { - key = find_and_lock_process_key(ci->ci_inode->i_sb->s_cop->key_prefix, + if (key == ERR_PTR(-ENOKEY) && sb->s_cop->legacy_key_prefix) { + key = find_and_lock_process_key(sb->s_cop->legacy_key_prefix, ci->ci_policy.v1.master_key_descriptor, ci->ci_mode->keysize, &payload); } diff --git a/fs/crypto/policy.c b/fs/crypto/policy.c index f4456ecb3f87..701259991277 100644 --- a/fs/crypto/policy.c +++ b/fs/crypto/policy.c @@ -118,12 +118,11 @@ static bool supported_direct_key_modes(const struct inode *inode, } static bool supported_iv_ino_lblk_policy(const struct fscrypt_policy_v2 *policy, - const struct inode *inode, - const char *type, - int max_ino_bits, int max_lblk_bits) + const struct inode *inode) { + const char *type = (policy->flags & FSCRYPT_POLICY_FLAG_IV_INO_LBLK_64) + ? "IV_INO_LBLK_64" : "IV_INO_LBLK_32"; struct super_block *sb = inode->i_sb; - int ino_bits = 64, lblk_bits = 64; /* * IV_INO_LBLK_* exist only because of hardware limitations, and @@ -150,17 +149,29 @@ static bool supported_iv_ino_lblk_policy(const struct fscrypt_policy_v2 *policy, type, sb->s_id); return false; } - if (sb->s_cop->get_ino_and_lblk_bits) - sb->s_cop->get_ino_and_lblk_bits(sb, &ino_bits, &lblk_bits); - if (ino_bits > max_ino_bits) { + + /* + * IV_INO_LBLK_64 and IV_INO_LBLK_32 both require that inode numbers fit + * in 32 bits. In principle, IV_INO_LBLK_32 could support longer inode + * numbers because it hashes the inode number; however, currently the + * inode number is gotten from inode::i_ino which is 'unsigned long'. + * So for now the implementation limit is 32 bits. + */ + if (!sb->s_cop->has_32bit_inodes) { fscrypt_warn(inode, "Can't use %s policy on filesystem '%s' because its inode numbers are too long", type, sb->s_id); return false; } - if (lblk_bits > max_lblk_bits) { + + /* + * IV_INO_LBLK_64 and IV_INO_LBLK_32 both require that file data unit + * indices fit in 32 bits. + */ + if (fscrypt_max_file_dun_bits(sb, + fscrypt_policy_v2_du_bits(policy, inode)) > 32) { fscrypt_warn(inode, - "Can't use %s policy on filesystem '%s' because its block numbers are too long", + "Can't use %s policy on filesystem '%s' because its maximum file size is too large", type, sb->s_id); return false; } @@ -233,25 +244,39 @@ static bool fscrypt_supported_v2_policy(const struct fscrypt_policy_v2 *policy, return false; } + if (policy->log2_data_unit_size) { + if (!inode->i_sb->s_cop->supports_subblock_data_units) { + fscrypt_warn(inode, + "Filesystem does not support configuring crypto data unit size"); + return false; + } + if (policy->log2_data_unit_size > inode->i_blkbits || + policy->log2_data_unit_size < SECTOR_SHIFT /* 9 */) { + fscrypt_warn(inode, + "Unsupported log2_data_unit_size in encryption policy: %d", + policy->log2_data_unit_size); + return false; + } + if (policy->log2_data_unit_size != inode->i_blkbits && + (policy->flags & FSCRYPT_POLICY_FLAG_IV_INO_LBLK_32)) { + /* + * Not safe to enable yet, as we need to ensure that DUN + * wraparound can only occur on a FS block boundary. + */ + fscrypt_warn(inode, + "Sub-block data units not yet supported with IV_INO_LBLK_32"); + return false; + } + } + if ((policy->flags & FSCRYPT_POLICY_FLAG_DIRECT_KEY) && !supported_direct_key_modes(inode, policy->contents_encryption_mode, policy->filenames_encryption_mode)) return false; - if ((policy->flags & FSCRYPT_POLICY_FLAG_IV_INO_LBLK_64) && - !supported_iv_ino_lblk_policy(policy, inode, "IV_INO_LBLK_64", - 32, 32)) - return false; - - /* - * IV_INO_LBLK_32 hashes the inode number, so in principle it can - * support any ino_bits. However, currently the inode number is gotten - * from inode::i_ino which is 'unsigned long'. So for now the - * implementation limit is 32 bits. - */ - if ((policy->flags & FSCRYPT_POLICY_FLAG_IV_INO_LBLK_32) && - !supported_iv_ino_lblk_policy(policy, inode, "IV_INO_LBLK_32", - 32, 32)) + if ((policy->flags & (FSCRYPT_POLICY_FLAG_IV_INO_LBLK_64 | + FSCRYPT_POLICY_FLAG_IV_INO_LBLK_32)) && + !supported_iv_ino_lblk_policy(policy, inode)) return false; if (memchr_inv(policy->__reserved, 0, sizeof(policy->__reserved))) { @@ -330,6 +355,7 @@ static int fscrypt_new_context(union fscrypt_context *ctx_u, ctx->filenames_encryption_mode = policy->filenames_encryption_mode; ctx->flags = policy->flags; + ctx->log2_data_unit_size = policy->log2_data_unit_size; memcpy(ctx->master_key_identifier, policy->master_key_identifier, sizeof(ctx->master_key_identifier)); @@ -390,6 +416,7 @@ int fscrypt_policy_from_context(union fscrypt_policy *policy_u, policy->filenames_encryption_mode = ctx->filenames_encryption_mode; policy->flags = ctx->flags; + policy->log2_data_unit_size = ctx->log2_data_unit_size; memcpy(policy->__reserved, ctx->__reserved, sizeof(policy->__reserved)); memcpy(policy->master_key_identifier, @@ -405,11 +432,11 @@ int fscrypt_policy_from_context(union fscrypt_policy *policy_u, /* Retrieve an inode's encryption policy */ static int fscrypt_get_policy(struct inode *inode, union fscrypt_policy *policy) { - const struct fscrypt_info *ci; + const struct fscrypt_inode_info *ci; union fscrypt_context ctx; int ret; - ci = fscrypt_get_info(inode); + ci = fscrypt_get_inode_info(inode); if (ci) { /* key available, use the cached policy */ *policy = ci->ci_policy; @@ -647,7 +674,7 @@ int fscrypt_has_permitted_context(struct inode *parent, struct inode *child) /* * Both parent and child are encrypted, so verify they use the same - * encryption policy. Compare the fscrypt_info structs if the keys are + * encryption policy. Compare the cached policies if the keys are * available, otherwise retrieve and compare the fscrypt_contexts. * * Note that the fscrypt_context retrieval will be required frequently @@ -717,7 +744,7 @@ const union fscrypt_policy *fscrypt_policy_to_inherit(struct inode *dir) */ int fscrypt_context_for_new_inode(void *ctx, struct inode *inode) { - struct fscrypt_info *ci = inode->i_crypt_info; + struct fscrypt_inode_info *ci = inode->i_crypt_info; BUILD_BUG_ON(sizeof(union fscrypt_context) != FSCRYPT_SET_CONTEXT_MAX_SIZE); @@ -742,7 +769,7 @@ EXPORT_SYMBOL_GPL(fscrypt_context_for_new_inode); */ int fscrypt_set_context(struct inode *inode, void *fs_data) { - struct fscrypt_info *ci = inode->i_crypt_info; + struct fscrypt_inode_info *ci = inode->i_crypt_info; union fscrypt_context ctx; int ctxsize; @@ -30,17 +30,6 @@ #define CREATE_TRACE_POINTS #include <trace/events/fs_dax.h> -static inline unsigned int pe_order(enum page_entry_size pe_size) -{ - if (pe_size == PE_SIZE_PTE) - return PAGE_SHIFT - PAGE_SHIFT; - if (pe_size == PE_SIZE_PMD) - return PMD_SHIFT - PAGE_SHIFT; - if (pe_size == PE_SIZE_PUD) - return PUD_SHIFT - PAGE_SHIFT; - return ~0; -} - /* We choose 4096 entries - same as per-zone page wait tables */ #define DAX_WAIT_TABLE_BITS 12 #define DAX_WAIT_TABLE_ENTRIES (1 << DAX_WAIT_TABLE_BITS) @@ -49,9 +38,6 @@ static inline unsigned int pe_order(enum page_entry_size pe_size) #define PG_PMD_COLOUR ((PMD_SIZE >> PAGE_SHIFT) - 1) #define PG_PMD_NR (PMD_SIZE >> PAGE_SHIFT) -/* The order of a PMD entry */ -#define PMD_ORDER (PMD_SHIFT - PAGE_SHIFT) - static wait_queue_head_t wait_table[DAX_WAIT_TABLE_ENTRIES]; static int __init init_dax_wait_table(void) @@ -426,23 +412,23 @@ static struct page *dax_busy_page(void *entry) return NULL; } -/* - * dax_lock_page - Lock the DAX entry corresponding to a page - * @page: The page whose entry we want to lock +/** + * dax_lock_folio - Lock the DAX entry corresponding to a folio + * @folio: The folio whose entry we want to lock * * Context: Process context. - * Return: A cookie to pass to dax_unlock_page() or 0 if the entry could + * Return: A cookie to pass to dax_unlock_folio() or 0 if the entry could * not be locked. */ -dax_entry_t dax_lock_page(struct page *page) +dax_entry_t dax_lock_folio(struct folio *folio) { XA_STATE(xas, NULL, 0); void *entry; - /* Ensure page->mapping isn't freed while we look at it */ + /* Ensure folio->mapping isn't freed while we look at it */ rcu_read_lock(); for (;;) { - struct address_space *mapping = READ_ONCE(page->mapping); + struct address_space *mapping = READ_ONCE(folio->mapping); entry = NULL; if (!mapping || !dax_mapping(mapping)) @@ -461,11 +447,11 @@ dax_entry_t dax_lock_page(struct page *page) xas.xa = &mapping->i_pages; xas_lock_irq(&xas); - if (mapping != page->mapping) { + if (mapping != folio->mapping) { xas_unlock_irq(&xas); continue; } - xas_set(&xas, page->index); + xas_set(&xas, folio->index); entry = xas_load(&xas); if (dax_is_locked(entry)) { rcu_read_unlock(); @@ -481,10 +467,10 @@ dax_entry_t dax_lock_page(struct page *page) return (dax_entry_t)entry; } -void dax_unlock_page(struct page *page, dax_entry_t cookie) +void dax_unlock_folio(struct folio *folio, dax_entry_t cookie) { - struct address_space *mapping = page->mapping; - XA_STATE(xas, &mapping->i_pages, page->index); + struct address_space *mapping = folio->mapping; + XA_STATE(xas, &mapping->i_pages, folio->index); if (S_ISCHR(mapping->host->i_mode)) return; @@ -1908,7 +1894,7 @@ static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp, /** * dax_iomap_fault - handle a page fault on a DAX file * @vmf: The description of the fault - * @pe_size: Size of the page to fault in + * @order: Order of the page to fault in * @pfnp: PFN to insert for synchronous faults if fsync is required * @iomap_errp: Storage for detailed error code in case of error * @ops: Iomap ops passed from the file system @@ -1918,17 +1904,15 @@ static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp, * has done all the necessary locking for page fault to proceed * successfully. */ -vm_fault_t dax_iomap_fault(struct vm_fault *vmf, enum page_entry_size pe_size, +vm_fault_t dax_iomap_fault(struct vm_fault *vmf, unsigned int order, pfn_t *pfnp, int *iomap_errp, const struct iomap_ops *ops) { - switch (pe_size) { - case PE_SIZE_PTE: + if (order == 0) return dax_iomap_pte_fault(vmf, pfnp, iomap_errp, ops); - case PE_SIZE_PMD: + else if (order == PMD_ORDER) return dax_iomap_pmd_fault(vmf, pfnp, ops); - default: + else return VM_FAULT_FALLBACK; - } } EXPORT_SYMBOL_GPL(dax_iomap_fault); @@ -1979,19 +1963,18 @@ dax_insert_pfn_mkwrite(struct vm_fault *vmf, pfn_t pfn, unsigned int order) /** * dax_finish_sync_fault - finish synchronous page fault * @vmf: The description of the fault - * @pe_size: Size of entry to be inserted + * @order: Order of entry to be inserted * @pfn: PFN to insert * * This function ensures that the file range touched by the page fault is * stored persistently on the media and handles inserting of appropriate page * table entry. */ -vm_fault_t dax_finish_sync_fault(struct vm_fault *vmf, - enum page_entry_size pe_size, pfn_t pfn) +vm_fault_t dax_finish_sync_fault(struct vm_fault *vmf, unsigned int order, + pfn_t pfn) { int err; loff_t start = ((loff_t)vmf->pgoff) << PAGE_SHIFT; - unsigned int order = pe_order(pe_size); size_t len = PAGE_SIZE << order; err = vfs_fsync_range(vmf->vma->vm_file, start, start + len - 1, 1); diff --git a/fs/dcache.c b/fs/dcache.c index 52e6d5fdab6b..c82ae731df9a 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -78,7 +78,7 @@ __cacheline_aligned_in_smp DEFINE_SEQLOCK(rename_lock); EXPORT_SYMBOL(rename_lock); -static struct kmem_cache *dentry_cache __read_mostly; +static struct kmem_cache *dentry_cache __ro_after_init; const struct qstr empty_name = QSTR_INIT("", 0); EXPORT_SYMBOL(empty_name); @@ -96,9 +96,9 @@ EXPORT_SYMBOL(dotdot_name); * information, yet avoid using a prime hash-size or similar. */ -static unsigned int d_hash_shift __read_mostly; +static unsigned int d_hash_shift __ro_after_init; -static struct hlist_bl_head *dentry_hashtable __read_mostly; +static struct hlist_bl_head *dentry_hashtable __ro_after_init; static inline struct hlist_bl_head *d_hash(unsigned int hash) { @@ -1664,7 +1664,7 @@ static enum d_walk_ret umount_check(void *_data, struct dentry *dentry) if (dentry == _data && dentry->d_lockref.count == 1) return D_WALK_CONTINUE; - printk(KERN_ERR "BUG: Dentry %p{i=%lx,n=%pd} " + WARN(1, "BUG: Dentry %p{i=%lx,n=%pd} " " still in use (%d) [unmount of %s %s]\n", dentry, dentry->d_inode ? @@ -1673,7 +1673,6 @@ static enum d_walk_ret umount_check(void *_data, struct dentry *dentry) dentry->d_lockref.count, dentry->d_sb->s_type->name, dentry->d_sb->s_id); - WARN_ON(1); return D_WALK_CONTINUE; } @@ -3247,13 +3246,10 @@ void d_genocide(struct dentry *parent) d_walk(parent, parent, d_genocide_kill); } -EXPORT_SYMBOL(d_genocide); - -void d_tmpfile(struct file *file, struct inode *inode) +void d_mark_tmpfile(struct file *file, struct inode *inode) { struct dentry *dentry = file->f_path.dentry; - inode_dec_link_count(inode); BUG_ON(dentry->d_name.name != dentry->d_iname || !hlist_unhashed(&dentry->d_u.d_alias) || !d_unlinked(dentry)); @@ -3263,6 +3259,15 @@ void d_tmpfile(struct file *file, struct inode *inode) (unsigned long long)inode->i_ino); spin_unlock(&dentry->d_lock); spin_unlock(&dentry->d_parent->d_lock); +} +EXPORT_SYMBOL(d_mark_tmpfile); + +void d_tmpfile(struct file *file, struct inode *inode) +{ + struct dentry *dentry = file->f_path.dentry; + + inode_dec_link_count(inode); + d_mark_tmpfile(file, inode); d_instantiate(dentry, inode); } EXPORT_SYMBOL(d_tmpfile); @@ -3327,7 +3332,7 @@ static void __init dcache_init(void) } /* SLAB cache for __getname() consumers */ -struct kmem_cache *names_cachep __read_mostly; +struct kmem_cache *names_cachep __ro_after_init; EXPORT_SYMBOL(names_cachep); void __init vfs_caches_init_early(void) diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c index b7711888dd17..5063434be0fc 100644 --- a/fs/debugfs/file.c +++ b/fs/debugfs/file.c @@ -84,6 +84,14 @@ int debugfs_file_get(struct dentry *dentry) struct debugfs_fsdata *fsd; void *d_fsd; + /* + * This could only happen if some debugfs user erroneously calls + * debugfs_file_get() on a dentry that isn't even a file, let + * them know about it. + */ + if (WARN_ON(!d_is_reg(dentry))) + return -EINVAL; + d_fsd = READ_ONCE(dentry->d_fsdata); if (!((unsigned long)d_fsd & DEBUGFS_FSDATA_IS_REAL_FOPS_BIT)) { fsd = d_fsd; @@ -100,6 +108,8 @@ int debugfs_file_get(struct dentry *dentry) kfree(fsd); fsd = READ_ONCE(dentry->d_fsdata); } + INIT_LIST_HEAD(&fsd->cancellations); + mutex_init(&fsd->cancellations_mtx); } /* @@ -138,6 +148,86 @@ void debugfs_file_put(struct dentry *dentry) } EXPORT_SYMBOL_GPL(debugfs_file_put); +/** + * debugfs_enter_cancellation - enter a debugfs cancellation + * @file: the file being accessed + * @cancellation: the cancellation object, the cancel callback + * inside of it must be initialized + * + * When a debugfs file is removed it needs to wait for all active + * operations to complete. However, the operation itself may need + * to wait for hardware or completion of some asynchronous process + * or similar. As such, it may need to be cancelled to avoid long + * waits or even deadlocks. + * + * This function can be used inside a debugfs handler that may + * need to be cancelled. As soon as this function is called, the + * cancellation's 'cancel' callback may be called, at which point + * the caller should proceed to call debugfs_leave_cancellation() + * and leave the debugfs handler function as soon as possible. + * Note that the 'cancel' callback is only ever called in the + * context of some kind of debugfs_remove(). + * + * This function must be paired with debugfs_leave_cancellation(). + */ +void debugfs_enter_cancellation(struct file *file, + struct debugfs_cancellation *cancellation) +{ + struct debugfs_fsdata *fsd; + struct dentry *dentry = F_DENTRY(file); + + INIT_LIST_HEAD(&cancellation->list); + + if (WARN_ON(!d_is_reg(dentry))) + return; + + if (WARN_ON(!cancellation->cancel)) + return; + + fsd = READ_ONCE(dentry->d_fsdata); + if (WARN_ON(!fsd || + ((unsigned long)fsd & DEBUGFS_FSDATA_IS_REAL_FOPS_BIT))) + return; + + mutex_lock(&fsd->cancellations_mtx); + list_add(&cancellation->list, &fsd->cancellations); + mutex_unlock(&fsd->cancellations_mtx); + + /* if we're already removing wake it up to cancel */ + if (d_unlinked(dentry)) + complete(&fsd->active_users_drained); +} +EXPORT_SYMBOL_GPL(debugfs_enter_cancellation); + +/** + * debugfs_leave_cancellation - leave cancellation section + * @file: the file being accessed + * @cancellation: the cancellation previously registered with + * debugfs_enter_cancellation() + * + * See the documentation of debugfs_enter_cancellation(). + */ +void debugfs_leave_cancellation(struct file *file, + struct debugfs_cancellation *cancellation) +{ + struct debugfs_fsdata *fsd; + struct dentry *dentry = F_DENTRY(file); + + if (WARN_ON(!d_is_reg(dentry))) + return; + + fsd = READ_ONCE(dentry->d_fsdata); + if (WARN_ON(!fsd || + ((unsigned long)fsd & DEBUGFS_FSDATA_IS_REAL_FOPS_BIT))) + return; + + mutex_lock(&fsd->cancellations_mtx); + if (!list_empty(&cancellation->list)) + list_del(&cancellation->list); + mutex_unlock(&fsd->cancellations_mtx); +} +EXPORT_SYMBOL_GPL(debugfs_leave_cancellation); + /* * Only permit access to world-readable files when the kernel is locked down. * We also need to exclude any file that has ways to write or alter it as root @@ -904,8 +994,52 @@ EXPORT_SYMBOL_GPL(debugfs_create_str); static ssize_t debugfs_write_file_str(struct file *file, const char __user *user_buf, size_t count, loff_t *ppos) { - /* This is really only for read-only strings */ - return -EINVAL; + struct dentry *dentry = F_DENTRY(file); + char *old, *new = NULL; + int pos = *ppos; + int r; + + r = debugfs_file_get(dentry); + if (unlikely(r)) + return r; + + old = *(char **)file->private_data; + + /* only allow strict concatenation */ + r = -EINVAL; + if (pos && pos != strlen(old)) + goto error; + + r = -E2BIG; + if (pos + count + 1 > PAGE_SIZE) + goto error; + + r = -ENOMEM; + new = kmalloc(pos + count + 1, GFP_KERNEL); + if (!new) + goto error; + + if (pos) + memcpy(new, old, pos); + + r = -EFAULT; + if (copy_from_user(new + pos, user_buf, count)) + goto error; + + new[pos + count] = '\0'; + strim(new); + + rcu_assign_pointer(*(char __rcu **)file->private_data, new); + synchronize_rcu(); + kfree(old); + + debugfs_file_put(dentry); + return count; + +error: + kfree(new); + debugfs_file_put(dentry); + return r; } static const struct file_operations fops_str = { diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c index 3f81f73c241a..034a617cb1a5 100644 --- a/fs/debugfs/inode.c +++ b/fs/debugfs/inode.c @@ -72,8 +72,7 @@ static struct inode *debugfs_get_inode(struct super_block *sb) struct inode *inode = new_inode(sb); if (inode) { inode->i_ino = get_next_ino(); - inode->i_atime = inode->i_mtime = - inode->i_ctime = current_time(inode); + simple_inode_init_ts(inode); } return inode; } @@ -237,17 +236,25 @@ static const struct super_operations debugfs_super_operations = { static void debugfs_release_dentry(struct dentry *dentry) { - void *fsd = dentry->d_fsdata; + struct debugfs_fsdata *fsd = dentry->d_fsdata; - if (!((unsigned long)fsd & DEBUGFS_FSDATA_IS_REAL_FOPS_BIT)) - kfree(dentry->d_fsdata); + if ((unsigned long)fsd & DEBUGFS_FSDATA_IS_REAL_FOPS_BIT) + return; + + /* check it wasn't a dir (no fsdata) or automount (no real_fops) */ + if (fsd && fsd->real_fops) { + WARN_ON(!list_empty(&fsd->cancellations)); + mutex_destroy(&fsd->cancellations_mtx); + } + + kfree(fsd); } static struct vfsmount *debugfs_automount(struct path *path) { - debugfs_automount_t f; - f = (debugfs_automount_t)path->dentry->d_fsdata; - return f(path->dentry, d_inode(path->dentry)->i_private); + struct debugfs_fsdata *fsd = path->dentry->d_fsdata; + + return fsd->automount(path->dentry, d_inode(path->dentry)->i_private); } static const struct dentry_operations debugfs_dops = { @@ -635,13 +642,23 @@ struct dentry *debugfs_create_automount(const char *name, void *data) { struct dentry *dentry = start_creating(name, parent); + struct debugfs_fsdata *fsd; struct inode *inode; if (IS_ERR(dentry)) return dentry; + fsd = kzalloc(sizeof(*fsd), GFP_KERNEL); + if (!fsd) { + failed_creating(dentry); + return ERR_PTR(-ENOMEM); + } + + fsd->automount = f; + if (!(debugfs_allow & DEBUGFS_ALLOW_API)) { failed_creating(dentry); + kfree(fsd); return ERR_PTR(-EPERM); } @@ -649,13 +666,14 @@ struct dentry *debugfs_create_automount(const char *name, if (unlikely(!inode)) { pr_err("out of free dentries, can not create automount '%s'\n", name); + kfree(fsd); return failed_creating(dentry); } make_empty_dir_inode(inode); inode->i_flags |= S_AUTOMOUNT; inode->i_private = data; - dentry->d_fsdata = (void *)f; + dentry->d_fsdata = fsd; /* directory inodes start off with i_nlink == 2 (for "." entry) */ inc_nlink(inode); d_instantiate(dentry, inode); @@ -732,8 +750,37 @@ static void __debugfs_file_removed(struct dentry *dentry) fsd = READ_ONCE(dentry->d_fsdata); if ((unsigned long)fsd & DEBUGFS_FSDATA_IS_REAL_FOPS_BIT) return; - if (!refcount_dec_and_test(&fsd->active_users)) + + /* if we hit zero, just wait for all to finish */ + if (!refcount_dec_and_test(&fsd->active_users)) { wait_for_completion(&fsd->active_users_drained); + return; + } + + /* if we didn't hit zero, try to cancel any we can */ + while (refcount_read(&fsd->active_users)) { + struct debugfs_cancellation *c; + + /* + * Lock the cancellations. Note that the cancellations + * structs are meant to be on the stack, so we need to + * ensure we either use them here or don't touch them, + * and debugfs_leave_cancellation() will wait for this + * to be finished processing before exiting one. It may + * of course win and remove the cancellation, but then + * chances are we never even got into this bit, we only + * do if the refcount isn't zero already. + */ + mutex_lock(&fsd->cancellations_mtx); + while ((c = list_first_entry_or_null(&fsd->cancellations, + typeof(*c), list))) { + list_del_init(&c->list); + c->cancel(dentry, c->cancel_data); + } + mutex_unlock(&fsd->cancellations_mtx); + + wait_for_completion(&fsd->active_users_drained); + } } static void remove_one(struct dentry *victim) diff --git a/fs/debugfs/internal.h b/fs/debugfs/internal.h index 92af8ae31313..dae80c2a469e 100644 --- a/fs/debugfs/internal.h +++ b/fs/debugfs/internal.h @@ -7,6 +7,7 @@ #ifndef _DEBUGFS_INTERNAL_H_ #define _DEBUGFS_INTERNAL_H_ +#include <linux/list.h> struct file_operations; @@ -17,8 +18,18 @@ extern const struct file_operations debugfs_full_proxy_file_operations; struct debugfs_fsdata { const struct file_operations *real_fops; - refcount_t active_users; - struct completion active_users_drained; + union { + /* automount_fn is used when real_fops is NULL */ + debugfs_automount_t automount; + struct { + refcount_t active_users; + struct completion active_users_drained; + + /* protect cancellations */ + struct mutex cancellations_mtx; + struct list_head cancellations; + }; + }; }; /* diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c index fe3db0eda8e4..c830261aa883 100644 --- a/fs/devpts/inode.c +++ b/fs/devpts/inode.c @@ -338,7 +338,7 @@ static int mknod_ptmx(struct super_block *sb) } inode->i_ino = 2; - inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode); + simple_inode_init_ts(inode); mode = S_IFCHR|opts->ptmxmode; init_special_inode(inode, mode, MKDEV(TTYAUX_MAJOR, 2)); @@ -451,7 +451,7 @@ devpts_fill_super(struct super_block *s, void *data, int silent) if (!inode) goto fail; inode->i_ino = 1; - inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode); + simple_inode_init_ts(inode); inode->i_mode = S_IFDIR | S_IRUGO | S_IXUGO | S_IWUSR; inode->i_op = &simple_dir_inode_operations; inode->i_fop = &simple_dir_operations; @@ -534,12 +534,12 @@ void devpts_kill_index(struct pts_fs_info *fsi, int idx) /** * devpts_pty_new -- create a new inode in /dev/pts/ - * @ptmx_inode: inode of the master - * @device: major+minor of the node to be created + * @fsi: Filesystem info for this instance. * @index: used as a name of the node * @priv: what's given back by devpts_get_priv * - * The created inode is returned. Remove it from /dev/pts/ by devpts_pty_kill. + * The dentry for the created inode is returned. + * Remove it from /dev/pts/ with devpts_pty_kill(). */ struct dentry *devpts_pty_new(struct pts_fs_info *fsi, int index, void *priv) { @@ -560,7 +560,7 @@ struct dentry *devpts_pty_new(struct pts_fs_info *fsi, int index, void *priv) inode->i_ino = index + 3; inode->i_uid = opts->setuid ? opts->uid : current_fsuid(); inode->i_gid = opts->setgid ? opts->gid : current_fsgid(); - inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode); + simple_inode_init_ts(inode); init_special_inode(inode, S_IFCHR|opts->mode, MKDEV(UNIX98_PTY_SLAVE_MAJOR, index)); sprintf(s, "%d", index); @@ -580,7 +580,7 @@ struct dentry *devpts_pty_new(struct pts_fs_info *fsi, int index, void *priv) /** * devpts_get_priv -- get private data for a slave - * @pts_inode: inode of the slave + * @dentry: dentry of the slave * * Returns whatever was passed as priv in devpts_pty_new for a given inode. */ @@ -593,7 +593,7 @@ void *devpts_get_priv(struct dentry *dentry) /** * devpts_pty_kill -- remove inode form /dev/pts/ - * @inode: inode of the slave to be removed + * @dentry: dentry of the slave to be removed * * This is an inverse operation of devpts_pty_new. */ diff --git a/fs/direct-io.c b/fs/direct-io.c index 7bc494ee56b9..20533266ade6 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -151,7 +151,7 @@ struct dio { }; } ____cacheline_aligned_in_smp; -static struct kmem_cache *dio_cache __read_mostly; +static struct kmem_cache *dio_cache __ro_after_init; /* * How many pages are in the queue? diff --git a/fs/dlm/config.c b/fs/dlm/config.c index 2beceff024e3..e55e0a2cd2e8 100644 --- a/fs/dlm/config.c +++ b/fs/dlm/config.c @@ -664,7 +664,7 @@ static ssize_t comm_addr_store(struct config_item *item, const char *buf, memcpy(addr, buf, len); - rv = dlm_lowcomms_addr(cm->nodeid, addr, len); + rv = dlm_midcomms_addr(cm->nodeid, addr, len); if (rv) { kfree(addr); return rv; diff --git a/fs/dlm/debug_fs.c b/fs/dlm/debug_fs.c index a1aca41c49d0..42f332f46359 100644 --- a/fs/dlm/debug_fs.c +++ b/fs/dlm/debug_fs.c @@ -18,6 +18,7 @@ #include "dlm_internal.h" #include "midcomms.h" #include "lock.h" +#include "ast.h" #define DLM_DEBUG_BUF_LEN 4096 static char debug_buf[DLM_DEBUG_BUF_LEN]; @@ -365,6 +366,52 @@ static void print_format4(struct dlm_rsb *r, struct seq_file *s) unlock_rsb(r); } +static void print_format5_lock(struct seq_file *s, struct dlm_lkb *lkb) +{ + struct dlm_callback *cb; + + /* lkb_id lkb_flags mode flags sb_status sb_flags */ + + spin_lock(&lkb->lkb_cb_lock); + list_for_each_entry(cb, &lkb->lkb_callbacks, list) { + seq_printf(s, "%x %x %d %x %d %x\n", + lkb->lkb_id, + dlm_iflags_val(lkb), + cb->mode, + cb->flags, + cb->sb_status, + cb->sb_flags); + } + spin_unlock(&lkb->lkb_cb_lock); +} + +static void print_format5(struct dlm_rsb *r, struct seq_file *s) +{ + struct dlm_lkb *lkb; + + lock_rsb(r); + + list_for_each_entry(lkb, &r->res_grantqueue, lkb_statequeue) { + print_format5_lock(s, lkb); + if (seq_has_overflowed(s)) + goto out; + } + + list_for_each_entry(lkb, &r->res_convertqueue, lkb_statequeue) { + print_format5_lock(s, lkb); + if (seq_has_overflowed(s)) + goto out; + } + + list_for_each_entry(lkb, &r->res_waitqueue, lkb_statequeue) { + print_format5_lock(s, lkb); + if (seq_has_overflowed(s)) + goto out; + } + out: + unlock_rsb(r); +} + struct rsbtbl_iter { struct dlm_rsb *rsb; unsigned bucket; @@ -408,6 +455,13 @@ static int table_seq_show(struct seq_file *seq, void *iter_ptr) } print_format4(ri->rsb, seq); break; + case 5: + if (ri->header) { + seq_puts(seq, "lkb_id lkb_flags mode flags sb_status sb_flags\n"); + ri->header = 0; + } + print_format5(ri->rsb, seq); + break; } return 0; @@ -417,6 +471,7 @@ static const struct seq_operations format1_seq_ops; static const struct seq_operations format2_seq_ops; static const struct seq_operations format3_seq_ops; static const struct seq_operations format4_seq_ops; +static const struct seq_operations format5_seq_ops; static void *table_seq_start(struct seq_file *seq, loff_t *pos) { @@ -448,6 +503,8 @@ static void *table_seq_start(struct seq_file *seq, loff_t *pos) ri->format = 3; if (seq->op == &format4_seq_ops) ri->format = 4; + if (seq->op == &format5_seq_ops) + ri->format = 5; tree = toss ? &ls->ls_rsbtbl[bucket].toss : &ls->ls_rsbtbl[bucket].keep; @@ -602,10 +659,18 @@ static const struct seq_operations format4_seq_ops = { .show = table_seq_show, }; +static const struct seq_operations format5_seq_ops = { + .start = table_seq_start, + .next = table_seq_next, + .stop = table_seq_stop, + .show = table_seq_show, +}; + static const struct file_operations format1_fops; static const struct file_operations format2_fops; static const struct file_operations format3_fops; static const struct file_operations format4_fops; +static const struct file_operations format5_fops; static int table_open1(struct inode *inode, struct file *file) { @@ -683,7 +748,21 @@ static int table_open4(struct inode *inode, struct file *file) struct seq_file *seq; int ret; - ret = seq_open(file, &format4_seq_ops); + ret = seq_open(file, &format5_seq_ops); + if (ret) + return ret; + + seq = file->private_data; + seq->private = inode->i_private; /* the dlm_ls */ + return 0; +} + +static int table_open5(struct inode *inode, struct file *file) +{ + struct seq_file *seq; + int ret; + + ret = seq_open(file, &format5_seq_ops); if (ret) return ret; @@ -725,6 +804,14 @@ static const struct file_operations format4_fops = { .release = seq_release }; +static const struct file_operations format5_fops = { + .owner = THIS_MODULE, + .open = table_open5, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release +}; + /* * dump lkb's on the ls_waiters list */ @@ -793,6 +880,7 @@ void dlm_delete_debug_file(struct dlm_ls *ls) debugfs_remove(ls->ls_debug_locks_dentry); debugfs_remove(ls->ls_debug_all_dentry); debugfs_remove(ls->ls_debug_toss_dentry); + debugfs_remove(ls->ls_debug_queued_asts_dentry); } static int dlm_state_show(struct seq_file *file, void *offset) @@ -885,7 +973,8 @@ void dlm_delete_debug_comms_file(void *ctx) void dlm_create_debug_file(struct dlm_ls *ls) { - char name[DLM_LOCKSPACE_LEN + 8]; + /* Reserve enough space for the longest file name */ + char name[DLM_LOCKSPACE_LEN + sizeof("_queued_asts")]; /* format 1 */ @@ -897,8 +986,7 @@ void dlm_create_debug_file(struct dlm_ls *ls) /* format 2 */ - memset(name, 0, sizeof(name)); - snprintf(name, DLM_LOCKSPACE_LEN + 8, "%s_locks", ls->ls_name); + snprintf(name, sizeof(name), "%s_locks", ls->ls_name); ls->ls_debug_locks_dentry = debugfs_create_file(name, 0644, @@ -908,8 +996,7 @@ void dlm_create_debug_file(struct dlm_ls *ls) /* format 3 */ - memset(name, 0, sizeof(name)); - snprintf(name, DLM_LOCKSPACE_LEN + 8, "%s_all", ls->ls_name); + snprintf(name, sizeof(name), "%s_all", ls->ls_name); ls->ls_debug_all_dentry = debugfs_create_file(name, S_IFREG | S_IRUGO, @@ -919,8 +1006,7 @@ void dlm_create_debug_file(struct dlm_ls *ls) /* format 4 */ - memset(name, 0, sizeof(name)); - snprintf(name, DLM_LOCKSPACE_LEN + 8, "%s_toss", ls->ls_name); + snprintf(name, sizeof(name), "%s_toss", ls->ls_name); ls->ls_debug_toss_dentry = debugfs_create_file(name, S_IFREG | S_IRUGO, @@ -928,14 +1014,23 @@ void dlm_create_debug_file(struct dlm_ls *ls) ls, &format4_fops); - memset(name, 0, sizeof(name)); - snprintf(name, DLM_LOCKSPACE_LEN + 8, "%s_waiters", ls->ls_name); + snprintf(name, sizeof(name), "%s_waiters", ls->ls_name); ls->ls_debug_waiters_dentry = debugfs_create_file(name, 0644, dlm_root, ls, &waiters_fops); + + /* format 5 */ + + snprintf(name, sizeof(name), "%s_queued_asts", ls->ls_name); + + ls->ls_debug_queued_asts_dentry = debugfs_create_file(name, + 0644, + dlm_root, + ls, + &format5_fops); } void __init dlm_register_debugfs(void) diff --git a/fs/dlm/dir.c b/fs/dlm/dir.c index fb1981654bb2..f6acba4310a7 100644 --- a/fs/dlm/dir.c +++ b/fs/dlm/dir.c @@ -58,7 +58,7 @@ void dlm_recover_dir_nodeid(struct dlm_ls *ls) up_read(&ls->ls_root_sem); } -int dlm_recover_directory(struct dlm_ls *ls) +int dlm_recover_directory(struct dlm_ls *ls, uint64_t seq) { struct dlm_member *memb; char *b, *last_name = NULL; @@ -90,7 +90,7 @@ int dlm_recover_directory(struct dlm_ls *ls) } error = dlm_rcom_names(ls, memb->nodeid, - last_name, last_len); + last_name, last_len, seq); if (error) goto out_free; @@ -196,7 +196,8 @@ int dlm_recover_directory(struct dlm_ls *ls) return error; } -static struct dlm_rsb *find_rsb_root(struct dlm_ls *ls, char *name, int len) +static struct dlm_rsb *find_rsb_root(struct dlm_ls *ls, const char *name, + int len) { struct dlm_rsb *r; uint32_t hash, bucket; @@ -232,7 +233,7 @@ static struct dlm_rsb *find_rsb_root(struct dlm_ls *ls, char *name, int len) for rsb's we're master of and whose directory node matches the requesting node. inbuf is the rsb name last sent, inlen is the name's length */ -void dlm_copy_master_names(struct dlm_ls *ls, char *inbuf, int inlen, +void dlm_copy_master_names(struct dlm_ls *ls, const char *inbuf, int inlen, char *outbuf, int outlen, int nodeid) { struct list_head *list; @@ -245,9 +246,8 @@ void dlm_copy_master_names(struct dlm_ls *ls, char *inbuf, int inlen, if (inlen > 1) { r = find_rsb_root(ls, inbuf, inlen); if (!r) { - inbuf[inlen - 1] = '\0'; - log_error(ls, "copy_master_names from %d start %d %s", - nodeid, inlen, inbuf); + log_error(ls, "copy_master_names from %d start %d %.*s", + nodeid, inlen, inlen, inbuf); goto out; } list = r->res_root_list.next; diff --git a/fs/dlm/dir.h b/fs/dlm/dir.h index 03844d086be2..39ecb69d7ef3 100644 --- a/fs/dlm/dir.h +++ b/fs/dlm/dir.h @@ -15,9 +15,9 @@ int dlm_dir_nodeid(struct dlm_rsb *rsb); int dlm_hash2nodeid(struct dlm_ls *ls, uint32_t hash); void dlm_recover_dir_nodeid(struct dlm_ls *ls); -int dlm_recover_directory(struct dlm_ls *ls); -void dlm_copy_master_names(struct dlm_ls *ls, char *inbuf, int inlen, - char *outbuf, int outlen, int nodeid); +int dlm_recover_directory(struct dlm_ls *ls, uint64_t seq); +void dlm_copy_master_names(struct dlm_ls *ls, const char *inbuf, int inlen, + char *outbuf, int outlen, int nodeid); #endif /* __DIR_DOT_H__ */ diff --git a/fs/dlm/dlm_internal.h b/fs/dlm/dlm_internal.h index c8156770205e..dfc444dad329 100644 --- a/fs/dlm/dlm_internal.h +++ b/fs/dlm/dlm_internal.h @@ -598,6 +598,7 @@ struct dlm_ls { struct dentry *ls_debug_locks_dentry; /* debugfs */ struct dentry *ls_debug_all_dentry; /* debugfs */ struct dentry *ls_debug_toss_dentry; /* debugfs */ + struct dentry *ls_debug_queued_asts_dentry; /* debugfs */ wait_queue_head_t ls_uevent_wait; /* user part of join/leave */ int ls_uevent_result; diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c index f511a9d7d416..652c51fbbf76 100644 --- a/fs/dlm/lock.c +++ b/fs/dlm/lock.c @@ -86,8 +86,8 @@ static int send_remove(struct dlm_rsb *r); static int _request_lock(struct dlm_rsb *r, struct dlm_lkb *lkb); static int _cancel_lock(struct dlm_rsb *r, struct dlm_lkb *lkb); static void __receive_convert_reply(struct dlm_rsb *r, struct dlm_lkb *lkb, - struct dlm_message *ms, bool local); -static int receive_extralen(struct dlm_message *ms); + const struct dlm_message *ms, bool local); +static int receive_extralen(const struct dlm_message *ms); static void do_purge(struct dlm_ls *ls, int nodeid, int pid); static void toss_rsb(struct kref *kref); @@ -984,8 +984,8 @@ static void __dlm_master_lookup(struct dlm_ls *ls, struct dlm_rsb *r, int our_no * . dlm_master_lookup RECOVER_MASTER (fix_master 1, from_master 0) */ -int dlm_master_lookup(struct dlm_ls *ls, int from_nodeid, char *name, int len, - unsigned int flags, int *r_nodeid, int *result) +int dlm_master_lookup(struct dlm_ls *ls, int from_nodeid, const char *name, + int len, unsigned int flags, int *r_nodeid, int *result) { struct dlm_rsb *r = NULL; uint32_t hash, b; @@ -1106,7 +1106,7 @@ static void dlm_dump_rsb_hash(struct dlm_ls *ls, uint32_t hash) } } -void dlm_dump_rsb_name(struct dlm_ls *ls, char *name, int len) +void dlm_dump_rsb_name(struct dlm_ls *ls, const char *name, int len) { struct dlm_rsb *r = NULL; uint32_t hash, b; @@ -1459,7 +1459,7 @@ static int add_to_waiters(struct dlm_lkb *lkb, int mstype, int to_nodeid) set RESEND and dlm_recover_waiters_post() */ static int _remove_from_waiters(struct dlm_lkb *lkb, int mstype, - struct dlm_message *ms) + const struct dlm_message *ms) { struct dlm_ls *ls = lkb->lkb_resource->res_ls; int overlap_done = 0; @@ -1557,8 +1557,8 @@ static int remove_from_waiters(struct dlm_lkb *lkb, int mstype) /* Handles situations where we might be processing a "fake" or "local" reply in which we can't try to take waiters_mutex again. */ -static int remove_from_waiters_ms(struct dlm_lkb *lkb, struct dlm_message *ms, - bool local) +static int remove_from_waiters_ms(struct dlm_lkb *lkb, + const struct dlm_message *ms, bool local) { struct dlm_ls *ls = lkb->lkb_resource->res_ls; int error; @@ -1800,7 +1800,7 @@ static void set_lvb_unlock(struct dlm_rsb *r, struct dlm_lkb *lkb) /* lkb is process copy (pc) */ static void set_lvb_lock_pc(struct dlm_rsb *r, struct dlm_lkb *lkb, - struct dlm_message *ms) + const struct dlm_message *ms) { int b; @@ -1907,7 +1907,7 @@ static void grant_lock(struct dlm_rsb *r, struct dlm_lkb *lkb) } static void grant_lock_pc(struct dlm_rsb *r, struct dlm_lkb *lkb, - struct dlm_message *ms) + const struct dlm_message *ms) { set_lvb_lock_pc(r, lkb, ms); _grant_lock(r, lkb); @@ -1945,7 +1945,7 @@ static void munge_demoted(struct dlm_lkb *lkb) lkb->lkb_grmode = DLM_LOCK_NL; } -static void munge_altmode(struct dlm_lkb *lkb, struct dlm_message *ms) +static void munge_altmode(struct dlm_lkb *lkb, const struct dlm_message *ms) { if (ms->m_type != cpu_to_le32(DLM_MSG_REQUEST_REPLY) && ms->m_type != cpu_to_le32(DLM_MSG_GRANT)) { @@ -3641,8 +3641,9 @@ static int send_cancel_reply(struct dlm_rsb *r, struct dlm_lkb *lkb, int rv) return send_common_reply(r, lkb, DLM_MSG_CANCEL_REPLY, rv); } -static int send_lookup_reply(struct dlm_ls *ls, struct dlm_message *ms_in, - int ret_nodeid, int rv) +static int send_lookup_reply(struct dlm_ls *ls, + const struct dlm_message *ms_in, int ret_nodeid, + int rv) { struct dlm_rsb *r = &ls->ls_local_rsb; struct dlm_message *ms; @@ -3667,14 +3668,15 @@ static int send_lookup_reply(struct dlm_ls *ls, struct dlm_message *ms_in, of message, unlike the send side where we can safely send everything about the lkb for any type of message */ -static void receive_flags(struct dlm_lkb *lkb, struct dlm_message *ms) +static void receive_flags(struct dlm_lkb *lkb, const struct dlm_message *ms) { lkb->lkb_exflags = le32_to_cpu(ms->m_exflags); dlm_set_sbflags_val(lkb, le32_to_cpu(ms->m_sbflags)); dlm_set_dflags_val(lkb, le32_to_cpu(ms->m_flags)); } -static void receive_flags_reply(struct dlm_lkb *lkb, struct dlm_message *ms, +static void receive_flags_reply(struct dlm_lkb *lkb, + const struct dlm_message *ms, bool local) { if (local) @@ -3684,14 +3686,14 @@ static void receive_flags_reply(struct dlm_lkb *lkb, struct dlm_message *ms, dlm_set_dflags_val(lkb, le32_to_cpu(ms->m_flags)); } -static int receive_extralen(struct dlm_message *ms) +static int receive_extralen(const struct dlm_message *ms) { return (le16_to_cpu(ms->m_header.h_length) - sizeof(struct dlm_message)); } static int receive_lvb(struct dlm_ls *ls, struct dlm_lkb *lkb, - struct dlm_message *ms) + const struct dlm_message *ms) { int len; @@ -3719,7 +3721,7 @@ static void fake_astfn(void *astparam) } static int receive_request_args(struct dlm_ls *ls, struct dlm_lkb *lkb, - struct dlm_message *ms) + const struct dlm_message *ms) { lkb->lkb_nodeid = le32_to_cpu(ms->m_header.h_nodeid); lkb->lkb_ownpid = le32_to_cpu(ms->m_pid); @@ -3741,7 +3743,7 @@ static int receive_request_args(struct dlm_ls *ls, struct dlm_lkb *lkb, } static int receive_convert_args(struct dlm_ls *ls, struct dlm_lkb *lkb, - struct dlm_message *ms) + const struct dlm_message *ms) { if (lkb->lkb_status != DLM_LKSTS_GRANTED) return -EBUSY; @@ -3756,7 +3758,7 @@ static int receive_convert_args(struct dlm_ls *ls, struct dlm_lkb *lkb, } static int receive_unlock_args(struct dlm_ls *ls, struct dlm_lkb *lkb, - struct dlm_message *ms) + const struct dlm_message *ms) { if (receive_lvb(ls, lkb, ms)) return -ENOMEM; @@ -3766,7 +3768,7 @@ static int receive_unlock_args(struct dlm_ls *ls, struct dlm_lkb *lkb, /* We fill in the local-lkb fields with the info that send_xxxx_reply() uses to send a reply and that the remote end uses to process the reply. */ -static void setup_local_lkb(struct dlm_ls *ls, struct dlm_message *ms) +static void setup_local_lkb(struct dlm_ls *ls, const struct dlm_message *ms) { struct dlm_lkb *lkb = &ls->ls_local_lkb; lkb->lkb_nodeid = le32_to_cpu(ms->m_header.h_nodeid); @@ -3776,7 +3778,7 @@ static void setup_local_lkb(struct dlm_ls *ls, struct dlm_message *ms) /* This is called after the rsb is locked so that we can safely inspect fields in the lkb. */ -static int validate_message(struct dlm_lkb *lkb, struct dlm_message *ms) +static int validate_message(struct dlm_lkb *lkb, const struct dlm_message *ms) { int from = le32_to_cpu(ms->m_header.h_nodeid); int error = 0; @@ -3828,7 +3830,7 @@ out: return error; } -static int receive_request(struct dlm_ls *ls, struct dlm_message *ms) +static int receive_request(struct dlm_ls *ls, const struct dlm_message *ms) { struct dlm_lkb *lkb; struct dlm_rsb *r; @@ -3907,7 +3909,7 @@ static int receive_request(struct dlm_ls *ls, struct dlm_message *ms) return error; } -static int receive_convert(struct dlm_ls *ls, struct dlm_message *ms) +static int receive_convert(struct dlm_ls *ls, const struct dlm_message *ms) { struct dlm_lkb *lkb; struct dlm_rsb *r; @@ -3963,7 +3965,7 @@ static int receive_convert(struct dlm_ls *ls, struct dlm_message *ms) return error; } -static int receive_unlock(struct dlm_ls *ls, struct dlm_message *ms) +static int receive_unlock(struct dlm_ls *ls, const struct dlm_message *ms) { struct dlm_lkb *lkb; struct dlm_rsb *r; @@ -4015,7 +4017,7 @@ static int receive_unlock(struct dlm_ls *ls, struct dlm_message *ms) return error; } -static int receive_cancel(struct dlm_ls *ls, struct dlm_message *ms) +static int receive_cancel(struct dlm_ls *ls, const struct dlm_message *ms) { struct dlm_lkb *lkb; struct dlm_rsb *r; @@ -4051,7 +4053,7 @@ static int receive_cancel(struct dlm_ls *ls, struct dlm_message *ms) return error; } -static int receive_grant(struct dlm_ls *ls, struct dlm_message *ms) +static int receive_grant(struct dlm_ls *ls, const struct dlm_message *ms) { struct dlm_lkb *lkb; struct dlm_rsb *r; @@ -4082,7 +4084,7 @@ static int receive_grant(struct dlm_ls *ls, struct dlm_message *ms) return 0; } -static int receive_bast(struct dlm_ls *ls, struct dlm_message *ms) +static int receive_bast(struct dlm_ls *ls, const struct dlm_message *ms) { struct dlm_lkb *lkb; struct dlm_rsb *r; @@ -4110,7 +4112,7 @@ static int receive_bast(struct dlm_ls *ls, struct dlm_message *ms) return 0; } -static void receive_lookup(struct dlm_ls *ls, struct dlm_message *ms) +static void receive_lookup(struct dlm_ls *ls, const struct dlm_message *ms) { int len, error, ret_nodeid, from_nodeid, our_nodeid; @@ -4130,7 +4132,7 @@ static void receive_lookup(struct dlm_ls *ls, struct dlm_message *ms) send_lookup_reply(ls, ms, ret_nodeid, error); } -static void receive_remove(struct dlm_ls *ls, struct dlm_message *ms) +static void receive_remove(struct dlm_ls *ls, const struct dlm_message *ms) { char name[DLM_RESNAME_MAXLEN+1]; struct dlm_rsb *r; @@ -4218,12 +4220,13 @@ static void receive_remove(struct dlm_ls *ls, struct dlm_message *ms) } } -static void receive_purge(struct dlm_ls *ls, struct dlm_message *ms) +static void receive_purge(struct dlm_ls *ls, const struct dlm_message *ms) { do_purge(ls, le32_to_cpu(ms->m_nodeid), le32_to_cpu(ms->m_pid)); } -static int receive_request_reply(struct dlm_ls *ls, struct dlm_message *ms) +static int receive_request_reply(struct dlm_ls *ls, + const struct dlm_message *ms) { struct dlm_lkb *lkb; struct dlm_rsb *r; @@ -4345,7 +4348,7 @@ static int receive_request_reply(struct dlm_ls *ls, struct dlm_message *ms) } static void __receive_convert_reply(struct dlm_rsb *r, struct dlm_lkb *lkb, - struct dlm_message *ms, bool local) + const struct dlm_message *ms, bool local) { /* this is the value returned from do_convert() on the master */ switch (from_dlm_errno(le32_to_cpu(ms->m_result))) { @@ -4388,8 +4391,8 @@ static void __receive_convert_reply(struct dlm_rsb *r, struct dlm_lkb *lkb, } } -static void _receive_convert_reply(struct dlm_lkb *lkb, struct dlm_message *ms, - bool local) +static void _receive_convert_reply(struct dlm_lkb *lkb, + const struct dlm_message *ms, bool local) { struct dlm_rsb *r = lkb->lkb_resource; int error; @@ -4412,7 +4415,8 @@ static void _receive_convert_reply(struct dlm_lkb *lkb, struct dlm_message *ms, put_rsb(r); } -static int receive_convert_reply(struct dlm_ls *ls, struct dlm_message *ms) +static int receive_convert_reply(struct dlm_ls *ls, + const struct dlm_message *ms) { struct dlm_lkb *lkb; int error; @@ -4426,8 +4430,8 @@ static int receive_convert_reply(struct dlm_ls *ls, struct dlm_message *ms) return 0; } -static void _receive_unlock_reply(struct dlm_lkb *lkb, struct dlm_message *ms, - bool local) +static void _receive_unlock_reply(struct dlm_lkb *lkb, + const struct dlm_message *ms, bool local) { struct dlm_rsb *r = lkb->lkb_resource; int error; @@ -4463,7 +4467,8 @@ static void _receive_unlock_reply(struct dlm_lkb *lkb, struct dlm_message *ms, put_rsb(r); } -static int receive_unlock_reply(struct dlm_ls *ls, struct dlm_message *ms) +static int receive_unlock_reply(struct dlm_ls *ls, + const struct dlm_message *ms) { struct dlm_lkb *lkb; int error; @@ -4477,8 +4482,8 @@ static int receive_unlock_reply(struct dlm_ls *ls, struct dlm_message *ms) return 0; } -static void _receive_cancel_reply(struct dlm_lkb *lkb, struct dlm_message *ms, - bool local) +static void _receive_cancel_reply(struct dlm_lkb *lkb, + const struct dlm_message *ms, bool local) { struct dlm_rsb *r = lkb->lkb_resource; int error; @@ -4515,7 +4520,8 @@ static void _receive_cancel_reply(struct dlm_lkb *lkb, struct dlm_message *ms, put_rsb(r); } -static int receive_cancel_reply(struct dlm_ls *ls, struct dlm_message *ms) +static int receive_cancel_reply(struct dlm_ls *ls, + const struct dlm_message *ms) { struct dlm_lkb *lkb; int error; @@ -4529,7 +4535,8 @@ static int receive_cancel_reply(struct dlm_ls *ls, struct dlm_message *ms) return 0; } -static void receive_lookup_reply(struct dlm_ls *ls, struct dlm_message *ms) +static void receive_lookup_reply(struct dlm_ls *ls, + const struct dlm_message *ms) { struct dlm_lkb *lkb; struct dlm_rsb *r; @@ -4608,7 +4615,7 @@ static void receive_lookup_reply(struct dlm_ls *ls, struct dlm_message *ms) dlm_put_lkb(lkb); } -static void _receive_message(struct dlm_ls *ls, struct dlm_message *ms, +static void _receive_message(struct dlm_ls *ls, const struct dlm_message *ms, uint32_t saved_seq) { int error = 0, noent = 0; @@ -4744,7 +4751,7 @@ static void _receive_message(struct dlm_ls *ls, struct dlm_message *ms, requestqueue, to processing all the saved messages, to processing new messages as they arrive. */ -static void dlm_receive_message(struct dlm_ls *ls, struct dlm_message *ms, +static void dlm_receive_message(struct dlm_ls *ls, const struct dlm_message *ms, int nodeid) { if (dlm_locking_stopped(ls)) { @@ -4767,7 +4774,7 @@ static void dlm_receive_message(struct dlm_ls *ls, struct dlm_message *ms, /* This is called by dlm_recoverd to process messages that were saved on the requestqueue. */ -void dlm_receive_message_saved(struct dlm_ls *ls, struct dlm_message *ms, +void dlm_receive_message_saved(struct dlm_ls *ls, const struct dlm_message *ms, uint32_t saved_seq) { _receive_message(ls, ms, saved_seq); @@ -4778,9 +4785,9 @@ void dlm_receive_message_saved(struct dlm_ls *ls, struct dlm_message *ms, standard locking activity) or an RCOM (recovery message sent as part of lockspace recovery). */ -void dlm_receive_buffer(union dlm_packet *p, int nodeid) +void dlm_receive_buffer(const union dlm_packet *p, int nodeid) { - struct dlm_header *hd = &p->header; + const struct dlm_header *hd = &p->header; struct dlm_ls *ls; int type = 0; @@ -5334,7 +5341,7 @@ static struct dlm_lkb *search_remid(struct dlm_rsb *r, int nodeid, /* needs at least dlm_rcom + rcom_lock */ static int receive_rcom_lock_args(struct dlm_ls *ls, struct dlm_lkb *lkb, - struct dlm_rsb *r, struct dlm_rcom *rc) + struct dlm_rsb *r, const struct dlm_rcom *rc) { struct rcom_lock *rl = (struct rcom_lock *) rc->rc_buf; @@ -5384,7 +5391,8 @@ static int receive_rcom_lock_args(struct dlm_ls *ls, struct dlm_lkb *lkb, back the rcom_lock struct we got but with the remid field filled in. */ /* needs at least dlm_rcom + rcom_lock */ -int dlm_recover_master_copy(struct dlm_ls *ls, struct dlm_rcom *rc) +int dlm_recover_master_copy(struct dlm_ls *ls, const struct dlm_rcom *rc, + __le32 *rl_remid, __le32 *rl_result) { struct rcom_lock *rl = (struct rcom_lock *) rc->rc_buf; struct dlm_rsb *r; @@ -5393,6 +5401,9 @@ int dlm_recover_master_copy(struct dlm_ls *ls, struct dlm_rcom *rc) int from_nodeid = le32_to_cpu(rc->rc_header.h_nodeid); int error; + /* init rl_remid with rcom lock rl_remid */ + *rl_remid = rl->rl_remid; + if (rl->rl_parent_lkid) { error = -EOPNOTSUPP; goto out; @@ -5448,7 +5459,7 @@ int dlm_recover_master_copy(struct dlm_ls *ls, struct dlm_rcom *rc) out_remid: /* this is the new value returned to the lock holder for saving in its process-copy lkb */ - rl->rl_remid = cpu_to_le32(lkb->lkb_id); + *rl_remid = cpu_to_le32(lkb->lkb_id); lkb->lkb_recover_seq = ls->ls_recover_seq; @@ -5459,12 +5470,13 @@ int dlm_recover_master_copy(struct dlm_ls *ls, struct dlm_rcom *rc) if (error && error != -EEXIST) log_rinfo(ls, "dlm_recover_master_copy remote %d %x error %d", from_nodeid, remid, error); - rl->rl_result = cpu_to_le32(error); + *rl_result = cpu_to_le32(error); return error; } /* needs at least dlm_rcom + rcom_lock */ -int dlm_recover_process_copy(struct dlm_ls *ls, struct dlm_rcom *rc) +int dlm_recover_process_copy(struct dlm_ls *ls, const struct dlm_rcom *rc, + uint64_t seq) { struct rcom_lock *rl = (struct rcom_lock *) rc->rc_buf; struct dlm_rsb *r; @@ -5509,7 +5521,7 @@ int dlm_recover_process_copy(struct dlm_ls *ls, struct dlm_rcom *rc) lkid, le32_to_cpu(rc->rc_header.h_nodeid), remid, result); - dlm_send_rcom_lock(r, lkb); + dlm_send_rcom_lock(r, lkb, seq); goto out; case -EEXIST: case 0: diff --git a/fs/dlm/lock.h b/fs/dlm/lock.h index aa5ad44d902b..b54e2cbbe6e2 100644 --- a/fs/dlm/lock.h +++ b/fs/dlm/lock.h @@ -12,11 +12,11 @@ #define __LOCK_DOT_H__ void dlm_dump_rsb(struct dlm_rsb *r); -void dlm_dump_rsb_name(struct dlm_ls *ls, char *name, int len); +void dlm_dump_rsb_name(struct dlm_ls *ls, const char *name, int len); void dlm_print_lkb(struct dlm_lkb *lkb); -void dlm_receive_message_saved(struct dlm_ls *ls, struct dlm_message *ms, +void dlm_receive_message_saved(struct dlm_ls *ls, const struct dlm_message *ms, uint32_t saved_seq); -void dlm_receive_buffer(union dlm_packet *p, int nodeid); +void dlm_receive_buffer(const union dlm_packet *p, int nodeid); int dlm_modes_compat(int mode1, int mode2); void dlm_put_rsb(struct dlm_rsb *r); void dlm_hold_rsb(struct dlm_rsb *r); @@ -25,8 +25,8 @@ void dlm_scan_rsbs(struct dlm_ls *ls); int dlm_lock_recovery_try(struct dlm_ls *ls); void dlm_unlock_recovery(struct dlm_ls *ls); -int dlm_master_lookup(struct dlm_ls *ls, int nodeid, char *name, int len, - unsigned int flags, int *r_nodeid, int *result); +int dlm_master_lookup(struct dlm_ls *ls, int from_nodeid, const char *name, + int len, unsigned int flags, int *r_nodeid, int *result); int dlm_search_rsb_tree(struct rb_root *tree, const void *name, int len, struct dlm_rsb **r_ret); @@ -36,8 +36,10 @@ void dlm_purge_mstcpy_locks(struct dlm_rsb *r); void dlm_recover_grant(struct dlm_ls *ls); int dlm_recover_waiters_post(struct dlm_ls *ls); void dlm_recover_waiters_pre(struct dlm_ls *ls); -int dlm_recover_master_copy(struct dlm_ls *ls, struct dlm_rcom *rc); -int dlm_recover_process_copy(struct dlm_ls *ls, struct dlm_rcom *rc); +int dlm_recover_master_copy(struct dlm_ls *ls, const struct dlm_rcom *rc, + __le32 *rl_remid, __le32 *rl_result); +int dlm_recover_process_copy(struct dlm_ls *ls, const struct dlm_rcom *rc, + uint64_t seq); int dlm_user_request(struct dlm_ls *ls, struct dlm_user_args *ua, int mode, uint32_t flags, void *name, unsigned int namelen); diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c index 9f14ea9f6322..67f8dd8a05ef 100644 --- a/fs/dlm/lowcomms.c +++ b/fs/dlm/lowcomms.c @@ -63,6 +63,7 @@ #include "config.h" #define DLM_SHUTDOWN_WAIT_TIMEOUT msecs_to_jiffies(5000) +#define DLM_MAX_PROCESS_BUFFERS 24 #define NEEDED_RMEM (4*1024*1024) struct connection { @@ -194,6 +195,7 @@ static const struct dlm_proto_ops *dlm_proto_ops; #define DLM_IO_END 1 #define DLM_IO_EOF 2 #define DLM_IO_RESCHED 3 +#define DLM_IO_FLUSH 4 static void process_recv_sockets(struct work_struct *work); static void process_send_sockets(struct work_struct *work); @@ -202,6 +204,7 @@ static void process_dlm_messages(struct work_struct *work); static DECLARE_WORK(process_work, process_dlm_messages); static DEFINE_SPINLOCK(processqueue_lock); static bool process_dlm_messages_pending; +static atomic_t processqueue_count; static LIST_HEAD(processqueue); bool dlm_lowcomms_is_running(void) @@ -863,7 +866,6 @@ struct dlm_processed_nodes { static void process_dlm_messages(struct work_struct *work) { struct processqueue_entry *pentry; - LIST_HEAD(processed_nodes); spin_lock(&processqueue_lock); pentry = list_first_entry_or_null(&processqueue, @@ -875,6 +877,7 @@ static void process_dlm_messages(struct work_struct *work) } list_del(&pentry->list); + atomic_dec(&processqueue_count); spin_unlock(&processqueue_lock); for (;;) { @@ -892,6 +895,7 @@ static void process_dlm_messages(struct work_struct *work) } list_del(&pentry->list); + atomic_dec(&processqueue_count); spin_unlock(&processqueue_lock); } } @@ -963,6 +967,7 @@ again: con->rx_leftover); spin_lock(&processqueue_lock); + ret = atomic_inc_return(&processqueue_count); list_add_tail(&pentry->list, &processqueue); if (!process_dlm_messages_pending) { process_dlm_messages_pending = true; @@ -970,6 +975,9 @@ again: } spin_unlock(&processqueue_lock); + if (ret > DLM_MAX_PROCESS_BUFFERS) + return DLM_IO_FLUSH; + return DLM_IO_SUCCESS; } @@ -1504,6 +1512,9 @@ static void process_recv_sockets(struct work_struct *work) wake_up(&con->shutdown_wait); /* CF_RECV_PENDING cleared */ break; + case DLM_IO_FLUSH: + flush_workqueue(process_workqueue); + fallthrough; case DLM_IO_RESCHED: cond_resched(); queue_work(io_workqueue, &con->rwork); diff --git a/fs/dlm/member.c b/fs/dlm/member.c index 77d202e4a02a..be7909ead71b 100644 --- a/fs/dlm/member.c +++ b/fs/dlm/member.c @@ -18,7 +18,7 @@ #include "midcomms.h" #include "lowcomms.h" -int dlm_slots_version(struct dlm_header *h) +int dlm_slots_version(const struct dlm_header *h) { if ((le32_to_cpu(h->h_version) & 0x0000FFFF) < DLM_HEADER_SLOTS) return 0; @@ -393,14 +393,9 @@ static void remove_remote_member(int nodeid) dlm_midcomms_remove_member(nodeid); } -static void clear_members_cb(int nodeid) -{ - remove_remote_member(nodeid); -} - void dlm_clear_members(struct dlm_ls *ls) { - clear_memb_list(&ls->ls_nodes, clear_members_cb); + clear_memb_list(&ls->ls_nodes, remove_remote_member); ls->ls_num_nodes = 0; } @@ -454,7 +449,7 @@ static void make_member_array(struct dlm_ls *ls) /* send a status request to all members just to establish comms connections */ -static int ping_members(struct dlm_ls *ls) +static int ping_members(struct dlm_ls *ls, uint64_t seq) { struct dlm_member *memb; int error = 0; @@ -464,7 +459,7 @@ static int ping_members(struct dlm_ls *ls) error = -EINTR; break; } - error = dlm_rcom_status(ls, memb->nodeid, 0); + error = dlm_rcom_status(ls, memb->nodeid, 0, seq); if (error) break; } @@ -612,7 +607,7 @@ int dlm_recover_members(struct dlm_ls *ls, struct dlm_recover *rv, int *neg_out) make_member_array(ls); *neg_out = neg; - error = ping_members(ls); + error = ping_members(ls, rv->seq); log_rinfo(ls, "dlm_recover_members %d nodes", ls->ls_num_nodes); return error; } diff --git a/fs/dlm/member.h b/fs/dlm/member.h index 433b2fac9f4a..f61cfde46314 100644 --- a/fs/dlm/member.h +++ b/fs/dlm/member.h @@ -18,7 +18,7 @@ void dlm_clear_members_gone(struct dlm_ls *ls); int dlm_recover_members(struct dlm_ls *ls, struct dlm_recover *rv,int *neg_out); int dlm_is_removed(struct dlm_ls *ls, int nodeid); int dlm_is_member(struct dlm_ls *ls, int nodeid); -int dlm_slots_version(struct dlm_header *h); +int dlm_slots_version(const struct dlm_header *h); void dlm_slot_save(struct dlm_ls *ls, struct dlm_rcom *rc, struct dlm_member *memb); void dlm_slots_copy_out(struct dlm_ls *ls, struct dlm_rcom *rc); diff --git a/fs/dlm/midcomms.c b/fs/dlm/midcomms.c index e1a0df67b566..2247ebb61be1 100644 --- a/fs/dlm/midcomms.c +++ b/fs/dlm/midcomms.c @@ -330,18 +330,31 @@ static void midcomms_node_reset(struct midcomms_node *node) wake_up(&node->shutdown_wait); } -static struct midcomms_node *nodeid2node(int nodeid, gfp_t alloc) +static struct midcomms_node *nodeid2node(int nodeid) { - struct midcomms_node *node, *tmp; - int r = nodeid_hash(nodeid); + return __find_node(nodeid, nodeid_hash(nodeid)); +} + +int dlm_midcomms_addr(int nodeid, struct sockaddr_storage *addr, int len) +{ + int ret, idx, r = nodeid_hash(nodeid); + struct midcomms_node *node; + ret = dlm_lowcomms_addr(nodeid, addr, len); + if (ret) + return ret; + + idx = srcu_read_lock(&nodes_srcu); node = __find_node(nodeid, r); - if (node || !alloc) - return node; + if (node) { + srcu_read_unlock(&nodes_srcu, idx); + return 0; + } + srcu_read_unlock(&nodes_srcu, idx); - node = kmalloc(sizeof(*node), alloc); + node = kmalloc(sizeof(*node), GFP_NOFS); if (!node) - return NULL; + return -ENOMEM; node->nodeid = nodeid; spin_lock_init(&node->state_lock); @@ -353,21 +366,11 @@ static struct midcomms_node *nodeid2node(int nodeid, gfp_t alloc) midcomms_node_reset(node); spin_lock(&nodes_lock); - /* check again if there was somebody else - * earlier here to add the node - */ - tmp = __find_node(nodeid, r); - if (tmp) { - spin_unlock(&nodes_lock); - kfree(node); - return tmp; - } - hlist_add_head_rcu(&node->hlist, &node_hash[r]); spin_unlock(&nodes_lock); node->debugfs = dlm_create_debug_comms_file(nodeid, node); - return node; + return 0; } static int dlm_send_ack(int nodeid, uint32_t seq) @@ -499,7 +502,8 @@ static void dlm_pas_fin_ack_rcv(struct midcomms_node *node) spin_unlock(&node->state_lock); } -static void dlm_receive_buffer_3_2_trace(uint32_t seq, union dlm_packet *p) +static void dlm_receive_buffer_3_2_trace(uint32_t seq, + const union dlm_packet *p) { switch (p->header.h_cmd) { case DLM_MSG: @@ -513,7 +517,7 @@ static void dlm_receive_buffer_3_2_trace(uint32_t seq, union dlm_packet *p) } } -static void dlm_midcomms_receive_buffer(union dlm_packet *p, +static void dlm_midcomms_receive_buffer(const union dlm_packet *p, struct midcomms_node *node, uint32_t seq) { @@ -602,113 +606,8 @@ static void dlm_midcomms_receive_buffer(union dlm_packet *p, } } -static struct midcomms_node * -dlm_midcomms_recv_node_lookup(int nodeid, const union dlm_packet *p, - uint16_t msglen, int (*cb)(struct midcomms_node *node)) -{ - struct midcomms_node *node = NULL; - gfp_t allocation = 0; - int ret; - - switch (p->header.h_cmd) { - case DLM_RCOM: - if (msglen < sizeof(struct dlm_rcom)) { - log_print("rcom msg too small: %u, will skip this message from node %d", - msglen, nodeid); - return NULL; - } - - switch (p->rcom.rc_type) { - case cpu_to_le32(DLM_RCOM_NAMES): - fallthrough; - case cpu_to_le32(DLM_RCOM_NAMES_REPLY): - fallthrough; - case cpu_to_le32(DLM_RCOM_STATUS): - fallthrough; - case cpu_to_le32(DLM_RCOM_STATUS_REPLY): - node = nodeid2node(nodeid, 0); - if (node) { - spin_lock(&node->state_lock); - if (node->state != DLM_ESTABLISHED) - pr_debug("receive begin RCOM msg from node %d with state %s\n", - node->nodeid, dlm_state_str(node->state)); - - switch (node->state) { - case DLM_CLOSED: - node->state = DLM_ESTABLISHED; - pr_debug("switch node %d to state %s\n", - node->nodeid, dlm_state_str(node->state)); - break; - case DLM_ESTABLISHED: - break; - default: - spin_unlock(&node->state_lock); - return NULL; - } - spin_unlock(&node->state_lock); - } - - allocation = GFP_NOFS; - break; - default: - break; - } - - break; - default: - break; - } - - node = nodeid2node(nodeid, allocation); - if (!node) { - switch (p->header.h_cmd) { - case DLM_OPTS: - if (msglen < sizeof(struct dlm_opts)) { - log_print("opts msg too small: %u, will skip this message from node %d", - msglen, nodeid); - return NULL; - } - - log_print_ratelimited("received dlm opts message nextcmd %d from node %d in an invalid sequence", - p->opts.o_nextcmd, nodeid); - break; - default: - log_print_ratelimited("received dlm message cmd %d from node %d in an invalid sequence", - p->header.h_cmd, nodeid); - break; - } - - return NULL; - } - - ret = cb(node); - if (ret < 0) - return NULL; - - return node; -} - -static int dlm_midcomms_version_check_3_2(struct midcomms_node *node) -{ - switch (node->version) { - case DLM_VERSION_NOT_SET: - node->version = DLM_VERSION_3_2; - wake_up(&node->shutdown_wait); - log_print("version 0x%08x for node %d detected", DLM_VERSION_3_2, - node->nodeid); - break; - case DLM_VERSION_3_2: - break; - default: - log_print_ratelimited("version mismatch detected, assumed 0x%08x but node %d has 0x%08x", - DLM_VERSION_3_2, node->nodeid, node->version); - return -1; - } - - return 0; -} - -static int dlm_opts_check_msglen(union dlm_packet *p, uint16_t msglen, int nodeid) +static int dlm_opts_check_msglen(const union dlm_packet *p, uint16_t msglen, + int nodeid) { int len = msglen; @@ -757,7 +656,7 @@ static int dlm_opts_check_msglen(union dlm_packet *p, uint16_t msglen, int nodei return 0; } -static void dlm_midcomms_receive_buffer_3_2(union dlm_packet *p, int nodeid) +static void dlm_midcomms_receive_buffer_3_2(const union dlm_packet *p, int nodeid) { uint16_t msglen = le16_to_cpu(p->header.h_length); struct midcomms_node *node; @@ -765,10 +664,37 @@ static void dlm_midcomms_receive_buffer_3_2(union dlm_packet *p, int nodeid) int ret, idx; idx = srcu_read_lock(&nodes_srcu); - node = dlm_midcomms_recv_node_lookup(nodeid, p, msglen, - dlm_midcomms_version_check_3_2); - if (!node) + node = nodeid2node(nodeid); + if (WARN_ON_ONCE(!node)) + goto out; + + switch (node->version) { + case DLM_VERSION_NOT_SET: + node->version = DLM_VERSION_3_2; + wake_up(&node->shutdown_wait); + log_print("version 0x%08x for node %d detected", DLM_VERSION_3_2, + node->nodeid); + + spin_lock(&node->state_lock); + switch (node->state) { + case DLM_CLOSED: + node->state = DLM_ESTABLISHED; + pr_debug("switch node %d to state %s\n", + node->nodeid, dlm_state_str(node->state)); + break; + default: + break; + } + spin_unlock(&node->state_lock); + + break; + case DLM_VERSION_3_2: + break; + default: + log_print_ratelimited("version mismatch detected, assumed 0x%08x but node %d has 0x%08x", + DLM_VERSION_3_2, node->nodeid, node->version); goto out; + } switch (p->header.h_cmd) { case DLM_RCOM: @@ -858,8 +784,19 @@ out: srcu_read_unlock(&nodes_srcu, idx); } -static int dlm_midcomms_version_check_3_1(struct midcomms_node *node) +static void dlm_midcomms_receive_buffer_3_1(const union dlm_packet *p, int nodeid) { + uint16_t msglen = le16_to_cpu(p->header.h_length); + struct midcomms_node *node; + int idx; + + idx = srcu_read_lock(&nodes_srcu); + node = nodeid2node(nodeid); + if (WARN_ON_ONCE(!node)) { + srcu_read_unlock(&nodes_srcu, idx); + return; + } + switch (node->version) { case DLM_VERSION_NOT_SET: node->version = DLM_VERSION_3_1; @@ -872,22 +809,6 @@ static int dlm_midcomms_version_check_3_1(struct midcomms_node *node) default: log_print_ratelimited("version mismatch detected, assumed 0x%08x but node %d has 0x%08x", DLM_VERSION_3_1, node->nodeid, node->version); - return -1; - } - - return 0; -} - -static void dlm_midcomms_receive_buffer_3_1(union dlm_packet *p, int nodeid) -{ - uint16_t msglen = le16_to_cpu(p->header.h_length); - struct midcomms_node *node; - int idx; - - idx = srcu_read_lock(&nodes_srcu); - node = dlm_midcomms_recv_node_lookup(nodeid, p, msglen, - dlm_midcomms_version_check_3_1); - if (!node) { srcu_read_unlock(&nodes_srcu, idx); return; } @@ -977,10 +898,10 @@ int dlm_process_incoming_buffer(int nodeid, unsigned char *buf, int len) switch (hd->h_version) { case cpu_to_le32(DLM_VERSION_3_1): - dlm_midcomms_receive_buffer_3_1((union dlm_packet *)ptr, nodeid); + dlm_midcomms_receive_buffer_3_1((const union dlm_packet *)ptr, nodeid); break; case cpu_to_le32(DLM_VERSION_3_2): - dlm_midcomms_receive_buffer_3_2((union dlm_packet *)ptr, nodeid); + dlm_midcomms_receive_buffer_3_2((const union dlm_packet *)ptr, nodeid); break; default: log_print("received invalid version header: %u from node %d, will skip this message", @@ -1003,8 +924,8 @@ void dlm_midcomms_unack_msg_resend(int nodeid) int idx, ret; idx = srcu_read_lock(&nodes_srcu); - node = nodeid2node(nodeid, 0); - if (!node) { + node = nodeid2node(nodeid); + if (WARN_ON_ONCE(!node)) { srcu_read_unlock(&nodes_srcu, idx); return; } @@ -1090,11 +1011,9 @@ struct dlm_mhandle *dlm_midcomms_get_mhandle(int nodeid, int len, int idx; idx = srcu_read_lock(&nodes_srcu); - node = nodeid2node(nodeid, 0); - if (!node) { - WARN_ON_ONCE(1); + node = nodeid2node(nodeid); + if (WARN_ON_ONCE(!node)) goto err; - } /* this is a bug, however we going on and hope it will be resolved */ WARN_ON_ONCE(test_bit(DLM_NODE_FLAG_STOP_TX, &node->flags)); @@ -1119,15 +1038,15 @@ struct dlm_mhandle *dlm_midcomms_get_mhandle(int nodeid, int len, break; case DLM_VERSION_3_2: + /* send ack back if necessary */ + dlm_send_ack_threshold(node, DLM_SEND_ACK_BACK_MSG_THRESHOLD); + msg = dlm_midcomms_get_msg_3_2(mh, nodeid, len, allocation, ppc); if (!msg) { dlm_free_mhandle(mh); goto err; } - - /* send ack back if necessary */ - dlm_send_ack_threshold(node, DLM_SEND_ACK_BACK_MSG_THRESHOLD); break; default: dlm_free_mhandle(mh); @@ -1235,8 +1154,34 @@ void dlm_midcomms_init(void) dlm_lowcomms_init(); } +static void midcomms_node_release(struct rcu_head *rcu) +{ + struct midcomms_node *node = container_of(rcu, struct midcomms_node, rcu); + + WARN_ON_ONCE(atomic_read(&node->send_queue_cnt)); + dlm_send_queue_flush(node); + kfree(node); +} + void dlm_midcomms_exit(void) { + struct midcomms_node *node; + int i, idx; + + idx = srcu_read_lock(&nodes_srcu); + for (i = 0; i < CONN_HASH_SIZE; i++) { + hlist_for_each_entry_rcu(node, &node_hash[i], hlist) { + dlm_delete_debug_comms_file(node->debugfs); + + spin_lock(&nodes_lock); + hlist_del_rcu(&node->hlist); + spin_unlock(&nodes_lock); + + call_srcu(&nodes_srcu, &node->rcu, midcomms_node_release); + } + } + srcu_read_unlock(&nodes_srcu, idx); + dlm_lowcomms_exit(); } @@ -1277,8 +1222,8 @@ void dlm_midcomms_add_member(int nodeid) int idx; idx = srcu_read_lock(&nodes_srcu); - node = nodeid2node(nodeid, GFP_NOFS); - if (!node) { + node = nodeid2node(nodeid); + if (WARN_ON_ONCE(!node)) { srcu_read_unlock(&nodes_srcu, idx); return; } @@ -1322,13 +1267,24 @@ void dlm_midcomms_remove_member(int nodeid) int idx; idx = srcu_read_lock(&nodes_srcu); - node = nodeid2node(nodeid, 0); + node = nodeid2node(nodeid); + /* in case of dlm_midcomms_close() removes node */ if (!node) { srcu_read_unlock(&nodes_srcu, idx); return; } spin_lock(&node->state_lock); + /* case of dlm_midcomms_addr() created node but + * was not added before because dlm_midcomms_close() + * removed the node + */ + if (!node->users) { + spin_unlock(&node->state_lock); + srcu_read_unlock(&nodes_srcu, idx); + return; + } + node->users--; pr_debug("node %d users dec count %d\n", nodeid, node->users); @@ -1367,15 +1323,6 @@ void dlm_midcomms_remove_member(int nodeid) srcu_read_unlock(&nodes_srcu, idx); } -static void midcomms_node_release(struct rcu_head *rcu) -{ - struct midcomms_node *node = container_of(rcu, struct midcomms_node, rcu); - - WARN_ON_ONCE(atomic_read(&node->send_queue_cnt)); - dlm_send_queue_flush(node); - kfree(node); -} - void dlm_midcomms_version_wait(void) { struct midcomms_node *node; @@ -1438,7 +1385,7 @@ static void midcomms_shutdown(struct midcomms_node *node) node->state == DLM_CLOSED || test_bit(DLM_NODE_FLAG_CLOSE, &node->flags), DLM_SHUTDOWN_TIMEOUT); - if (!ret || test_bit(DLM_NODE_FLAG_CLOSE, &node->flags)) + if (!ret) pr_debug("active shutdown timed out for node %d with state %s\n", node->nodeid, dlm_state_str(node->state)); else @@ -1456,20 +1403,18 @@ void dlm_midcomms_shutdown(void) for (i = 0; i < CONN_HASH_SIZE; i++) { hlist_for_each_entry_rcu(node, &node_hash[i], hlist) { midcomms_shutdown(node); + } + } - dlm_delete_debug_comms_file(node->debugfs); - - spin_lock(&nodes_lock); - hlist_del_rcu(&node->hlist); - spin_unlock(&nodes_lock); + dlm_lowcomms_shutdown(); - call_srcu(&nodes_srcu, &node->rcu, midcomms_node_release); + for (i = 0; i < CONN_HASH_SIZE; i++) { + hlist_for_each_entry_rcu(node, &node_hash[i], hlist) { + midcomms_node_reset(node); } } srcu_read_unlock(&nodes_srcu, idx); mutex_unlock(&close_lock); - - dlm_lowcomms_shutdown(); } int dlm_midcomms_close(int nodeid) @@ -1479,7 +1424,7 @@ int dlm_midcomms_close(int nodeid) idx = srcu_read_lock(&nodes_srcu); /* Abort pending close/remove operation */ - node = nodeid2node(nodeid, 0); + node = nodeid2node(nodeid); if (node) { /* let shutdown waiters leave */ set_bit(DLM_NODE_FLAG_CLOSE, &node->flags); @@ -1489,20 +1434,32 @@ int dlm_midcomms_close(int nodeid) synchronize_srcu(&nodes_srcu); - idx = srcu_read_lock(&nodes_srcu); mutex_lock(&close_lock); - node = nodeid2node(nodeid, 0); + idx = srcu_read_lock(&nodes_srcu); + node = nodeid2node(nodeid); if (!node) { - mutex_unlock(&close_lock); srcu_read_unlock(&nodes_srcu, idx); + mutex_unlock(&close_lock); return dlm_lowcomms_close(nodeid); } ret = dlm_lowcomms_close(nodeid); - spin_lock(&node->state_lock); - midcomms_node_reset(node); - spin_unlock(&node->state_lock); + dlm_delete_debug_comms_file(node->debugfs); + + spin_lock(&nodes_lock); + hlist_del_rcu(&node->hlist); + spin_unlock(&nodes_lock); srcu_read_unlock(&nodes_srcu, idx); + + /* wait that all readers left until flush send queue */ + synchronize_srcu(&nodes_srcu); + + /* drop all pending dlm messages, this is fine as + * this function get called when the node is fenced + */ + dlm_send_queue_flush(node); + + call_srcu(&nodes_srcu, &node->rcu, midcomms_node_release); mutex_unlock(&close_lock); return ret; diff --git a/fs/dlm/midcomms.h b/fs/dlm/midcomms.h index 9f8c9605013d..e7246fb3ef57 100644 --- a/fs/dlm/midcomms.h +++ b/fs/dlm/midcomms.h @@ -20,6 +20,7 @@ struct dlm_mhandle *dlm_midcomms_get_mhandle(int nodeid, int len, gfp_t allocation, char **ppc); void dlm_midcomms_commit_mhandle(struct dlm_mhandle *mh, const void *name, int namelen); +int dlm_midcomms_addr(int nodeid, struct sockaddr_storage *addr, int len); void dlm_midcomms_version_wait(void); int dlm_midcomms_close(int nodeid); int dlm_midcomms_start(void); diff --git a/fs/dlm/plock.c b/fs/dlm/plock.c index 70a4752ed913..e6b4c1a21446 100644 --- a/fs/dlm/plock.c +++ b/fs/dlm/plock.c @@ -11,6 +11,8 @@ #include <linux/dlm_plock.h> #include <linux/slab.h> +#include <trace/events/dlm.h> + #include "dlm_internal.h" #include "lockspace.h" @@ -42,6 +44,27 @@ static inline void set_version(struct dlm_plock_info *info) info->version[2] = DLM_PLOCK_VERSION_PATCH; } +static struct plock_op *plock_lookup_waiter(const struct dlm_plock_info *info) +{ + struct plock_op *op = NULL, *iter; + + list_for_each_entry(iter, &recv_list, list) { + if (iter->info.fsid == info->fsid && + iter->info.number == info->number && + iter->info.owner == info->owner && + iter->info.pid == info->pid && + iter->info.start == info->start && + iter->info.end == info->end && + iter->info.ex == info->ex && + iter->info.wait) { + op = iter; + break; + } + } + + return op; +} + static int check_version(struct dlm_plock_info *info) { if ((DLM_PLOCK_VERSION_MAJOR != info->version[0]) || @@ -74,30 +97,26 @@ static void send_op(struct plock_op *op) wake_up(&send_wq); } -/* If a process was killed while waiting for the only plock on a file, - locks_remove_posix will not see any lock on the file so it won't - send an unlock-close to us to pass on to userspace to clean up the - abandoned waiter. So, we have to insert the unlock-close when the - lock call is interrupted. */ - -static void do_unlock_close(const struct dlm_plock_info *info) +static int do_lock_cancel(const struct dlm_plock_info *orig_info) { struct plock_op *op; + int rv; op = kzalloc(sizeof(*op), GFP_NOFS); if (!op) - return; + return -ENOMEM; + + op->info = *orig_info; + op->info.optype = DLM_PLOCK_OP_CANCEL; + op->info.wait = 0; - op->info.optype = DLM_PLOCK_OP_UNLOCK; - op->info.pid = info->pid; - op->info.fsid = info->fsid; - op->info.number = info->number; - op->info.start = 0; - op->info.end = OFFSET_MAX; - op->info.owner = info->owner; - - op->info.flags |= DLM_PLOCK_FL_CLOSE; send_op(op); + wait_event(recv_wq, (op->done != 0)); + + rv = op->info.rv; + + dlm_release_plock_op(op); + return rv; } int dlm_posix_lock(dlm_lockspace_t *lockspace, u64 number, struct file *file, @@ -156,7 +175,7 @@ int dlm_posix_lock(dlm_lockspace_t *lockspace, u64 number, struct file *file, send_op(op); if (op->info.wait) { - rv = wait_event_killable(recv_wq, (op->done != 0)); + rv = wait_event_interruptible(recv_wq, (op->done != 0)); if (rv == -ERESTARTSYS) { spin_lock(&ops_lock); /* recheck under ops_lock if we got a done != 0, @@ -166,17 +185,37 @@ int dlm_posix_lock(dlm_lockspace_t *lockspace, u64 number, struct file *file, spin_unlock(&ops_lock); goto do_lock_wait; } - list_del(&op->list); spin_unlock(&ops_lock); + rv = do_lock_cancel(&op->info); + switch (rv) { + case 0: + /* waiter was deleted in user space, answer will never come + * remove original request. The original request must be + * on recv_list because the answer of do_lock_cancel() + * synchronized it. + */ + spin_lock(&ops_lock); + list_del(&op->list); + spin_unlock(&ops_lock); + rv = -EINTR; + break; + case -ENOENT: + /* cancellation wasn't successful but op should be done */ + fallthrough; + default: + /* internal error doing cancel we need to wait */ + goto wait; + } + log_debug(ls, "%s: wait interrupted %x %llx pid %d", __func__, ls->ls_global_id, (unsigned long long)number, op->info.pid); - do_unlock_close(&op->info); dlm_release_plock_op(op); goto out; } } else { +wait: wait_event(recv_wq, (op->done != 0)); } @@ -240,8 +279,8 @@ static int dlm_plock_callback(struct plock_op *op) rv = notify(fl, 0); if (rv) { /* XXX: We need to cancel the fs lock here: */ - log_print("dlm_plock_callback: lock granted after lock request " - "failed; dangling lock!\n"); + log_print("%s: lock granted after lock request failed; dangling lock!", + __func__); goto out; } @@ -318,6 +357,75 @@ out: } EXPORT_SYMBOL_GPL(dlm_posix_unlock); +/* + * NOTE: This implementation can only handle async lock requests as nfs + * do it. It cannot handle cancellation of a pending lock request sitting + * in wait_event(), but for now only nfs is the only user local kernel + * user. + */ +int dlm_posix_cancel(dlm_lockspace_t *lockspace, u64 number, struct file *file, + struct file_lock *fl) +{ + struct dlm_plock_info info; + struct plock_op *op; + struct dlm_ls *ls; + int rv; + + /* this only works for async request for now and nfs is the only + * kernel user right now. + */ + if (WARN_ON_ONCE(!fl->fl_lmops || !fl->fl_lmops->lm_grant)) + return -EOPNOTSUPP; + + ls = dlm_find_lockspace_local(lockspace); + if (!ls) + return -EINVAL; + + memset(&info, 0, sizeof(info)); + info.pid = fl->fl_pid; + info.ex = (fl->fl_type == F_WRLCK); + info.fsid = ls->ls_global_id; + dlm_put_lockspace(ls); + info.number = number; + info.start = fl->fl_start; + info.end = fl->fl_end; + info.owner = (__u64)fl->fl_pid; + + rv = do_lock_cancel(&info); + switch (rv) { + case 0: + spin_lock(&ops_lock); + /* lock request to cancel must be on recv_list because + * do_lock_cancel() synchronizes it. + */ + op = plock_lookup_waiter(&info); + if (WARN_ON_ONCE(!op)) { + spin_unlock(&ops_lock); + rv = -ENOLCK; + break; + } + + list_del(&op->list); + spin_unlock(&ops_lock); + WARN_ON(op->info.optype != DLM_PLOCK_OP_LOCK); + op->data->callback(op->data->fl, -EINTR); + dlm_release_plock_op(op); + rv = -EINTR; + break; + case -ENOENT: + /* if cancel wasn't successful we probably were to late + * or it was a non-blocking lock request, so just unlock it. + */ + rv = dlm_posix_unlock(lockspace, number, file, fl); + break; + default: + break; + } + + return rv; +} +EXPORT_SYMBOL_GPL(dlm_posix_cancel); + int dlm_posix_get(dlm_lockspace_t *lockspace, u64 number, struct file *file, struct file_lock *fl) { @@ -403,6 +511,8 @@ static ssize_t dev_read(struct file *file, char __user *u, size_t count, if (!op) return -EAGAIN; + trace_dlm_plock_read(&info); + /* there is no need to get a reply from userspace for unlocks that were generated by the vfs cleaning up for a close (the process did not make an unlock call). */ @@ -430,6 +540,8 @@ static ssize_t dev_write(struct file *file, const char __user *u, size_t count, if (copy_from_user(&info, u, sizeof(info))) return -EFAULT; + trace_dlm_plock_write(&info); + if (check_version(&info)) return -EINVAL; @@ -441,22 +553,11 @@ static ssize_t dev_write(struct file *file, const char __user *u, size_t count, */ spin_lock(&ops_lock); if (info.wait) { - list_for_each_entry(iter, &recv_list, list) { - if (iter->info.fsid == info.fsid && - iter->info.number == info.number && - iter->info.owner == info.owner && - iter->info.pid == info.pid && - iter->info.start == info.start && - iter->info.end == info.end && - iter->info.ex == info.ex && - iter->info.wait) { - op = iter; - break; - } - } + op = plock_lookup_waiter(&info); } else { list_for_each_entry(iter, &recv_list, list) { - if (!iter->info.wait) { + if (!iter->info.wait && + iter->info.fsid == info.fsid) { op = iter; break; } @@ -468,8 +569,7 @@ static ssize_t dev_write(struct file *file, const char __user *u, size_t count, if (info.wait) WARN_ON(op->info.optype != DLM_PLOCK_OP_LOCK); else - WARN_ON(op->info.fsid != info.fsid || - op->info.number != info.number || + WARN_ON(op->info.number != info.number || op->info.owner != info.owner || op->info.optype != info.optype); @@ -534,5 +634,7 @@ int dlm_plock_init(void) void dlm_plock_exit(void) { misc_deregister(&plock_dev_misc); + WARN_ON(!list_empty(&send_list)); + WARN_ON(!list_empty(&recv_list)); } diff --git a/fs/dlm/rcom.c b/fs/dlm/rcom.c index f4afdf892f78..3b734aed26b5 100644 --- a/fs/dlm/rcom.c +++ b/fs/dlm/rcom.c @@ -28,7 +28,8 @@ static int rcom_response(struct dlm_ls *ls) } static void _create_rcom(struct dlm_ls *ls, int to_nodeid, int type, int len, - struct dlm_rcom **rc_ret, char *mb, int mb_len) + struct dlm_rcom **rc_ret, char *mb, int mb_len, + uint64_t seq) { struct dlm_rcom *rc; @@ -41,16 +42,14 @@ static void _create_rcom(struct dlm_ls *ls, int to_nodeid, int type, int len, rc->rc_header.h_cmd = DLM_RCOM; rc->rc_type = cpu_to_le32(type); - - spin_lock(&ls->ls_recover_lock); - rc->rc_seq = cpu_to_le64(ls->ls_recover_seq); - spin_unlock(&ls->ls_recover_lock); + rc->rc_seq = cpu_to_le64(seq); *rc_ret = rc; } static int create_rcom(struct dlm_ls *ls, int to_nodeid, int type, int len, - struct dlm_rcom **rc_ret, struct dlm_mhandle **mh_ret) + struct dlm_rcom **rc_ret, struct dlm_mhandle **mh_ret, + uint64_t seq) { int mb_len = sizeof(struct dlm_rcom) + len; struct dlm_mhandle *mh; @@ -63,14 +62,14 @@ static int create_rcom(struct dlm_ls *ls, int to_nodeid, int type, int len, return -ENOBUFS; } - _create_rcom(ls, to_nodeid, type, len, rc_ret, mb, mb_len); + _create_rcom(ls, to_nodeid, type, len, rc_ret, mb, mb_len, seq); *mh_ret = mh; return 0; } static int create_rcom_stateless(struct dlm_ls *ls, int to_nodeid, int type, int len, struct dlm_rcom **rc_ret, - struct dlm_msg **msg_ret) + struct dlm_msg **msg_ret, uint64_t seq) { int mb_len = sizeof(struct dlm_rcom) + len; struct dlm_msg *msg; @@ -84,7 +83,7 @@ static int create_rcom_stateless(struct dlm_ls *ls, int to_nodeid, int type, return -ENOBUFS; } - _create_rcom(ls, to_nodeid, type, len, rc_ret, mb, mb_len); + _create_rcom(ls, to_nodeid, type, len, rc_ret, mb, mb_len, seq); *msg_ret = msg; return 0; } @@ -170,7 +169,8 @@ static void disallow_sync_reply(struct dlm_ls *ls) * node's rcom_config. */ -int dlm_rcom_status(struct dlm_ls *ls, int nodeid, uint32_t status_flags) +int dlm_rcom_status(struct dlm_ls *ls, int nodeid, uint32_t status_flags, + uint64_t seq) { struct dlm_rcom *rc; struct dlm_msg *msg; @@ -186,7 +186,8 @@ int dlm_rcom_status(struct dlm_ls *ls, int nodeid, uint32_t status_flags) retry: error = create_rcom_stateless(ls, nodeid, DLM_RCOM_STATUS, - sizeof(struct rcom_status), &rc, &msg); + sizeof(struct rcom_status), &rc, &msg, + seq); if (error) goto out; @@ -220,7 +221,9 @@ retry: return error; } -static void receive_rcom_status(struct dlm_ls *ls, struct dlm_rcom *rc_in) +static void receive_rcom_status(struct dlm_ls *ls, + const struct dlm_rcom *rc_in, + uint64_t seq) { struct dlm_rcom *rc; struct rcom_status *rs; @@ -251,7 +254,7 @@ static void receive_rcom_status(struct dlm_ls *ls, struct dlm_rcom *rc_in) do_create: error = create_rcom_stateless(ls, nodeid, DLM_RCOM_STATUS_REPLY, - len, &rc, &msg); + len, &rc, &msg, seq); if (error) return; @@ -281,7 +284,7 @@ static void receive_rcom_status(struct dlm_ls *ls, struct dlm_rcom *rc_in) send_rcom_stateless(msg, rc); } -static void receive_sync_reply(struct dlm_ls *ls, struct dlm_rcom *rc_in) +static void receive_sync_reply(struct dlm_ls *ls, const struct dlm_rcom *rc_in) { spin_lock(&ls->ls_rcom_spin); if (!test_bit(LSFL_RCOM_WAIT, &ls->ls_flags) || @@ -302,17 +305,18 @@ static void receive_sync_reply(struct dlm_ls *ls, struct dlm_rcom *rc_in) spin_unlock(&ls->ls_rcom_spin); } -int dlm_rcom_names(struct dlm_ls *ls, int nodeid, char *last_name, int last_len) +int dlm_rcom_names(struct dlm_ls *ls, int nodeid, char *last_name, + int last_len, uint64_t seq) { + struct dlm_mhandle *mh; struct dlm_rcom *rc; - struct dlm_msg *msg; int error = 0; ls->ls_recover_nodeid = nodeid; retry: - error = create_rcom_stateless(ls, nodeid, DLM_RCOM_NAMES, last_len, - &rc, &msg); + error = create_rcom(ls, nodeid, DLM_RCOM_NAMES, last_len, + &rc, &mh, seq); if (error) goto out; memcpy(rc->rc_buf, last_name, last_len); @@ -320,7 +324,7 @@ retry: allow_sync_reply(ls, &rc->rc_id); memset(ls->ls_recover_buf, 0, DLM_MAX_SOCKET_BUFSIZE); - send_rcom_stateless(msg, rc); + send_rcom(mh, rc); error = dlm_wait_function(ls, &rcom_response); disallow_sync_reply(ls); @@ -330,19 +334,20 @@ retry: return error; } -static void receive_rcom_names(struct dlm_ls *ls, struct dlm_rcom *rc_in) +static void receive_rcom_names(struct dlm_ls *ls, const struct dlm_rcom *rc_in, + uint64_t seq) { + struct dlm_mhandle *mh; struct dlm_rcom *rc; int error, inlen, outlen, nodeid; - struct dlm_msg *msg; nodeid = le32_to_cpu(rc_in->rc_header.h_nodeid); inlen = le16_to_cpu(rc_in->rc_header.h_length) - sizeof(struct dlm_rcom); outlen = DLM_MAX_APP_BUFSIZE - sizeof(struct dlm_rcom); - error = create_rcom_stateless(ls, nodeid, DLM_RCOM_NAMES_REPLY, outlen, - &rc, &msg); + error = create_rcom(ls, nodeid, DLM_RCOM_NAMES_REPLY, outlen, + &rc, &mh, seq); if (error) return; rc->rc_id = rc_in->rc_id; @@ -350,10 +355,10 @@ static void receive_rcom_names(struct dlm_ls *ls, struct dlm_rcom *rc_in) dlm_copy_master_names(ls, rc_in->rc_buf, inlen, rc->rc_buf, outlen, nodeid); - send_rcom_stateless(msg, rc); + send_rcom(mh, rc); } -int dlm_send_rcom_lookup(struct dlm_rsb *r, int dir_nodeid) +int dlm_send_rcom_lookup(struct dlm_rsb *r, int dir_nodeid, uint64_t seq) { struct dlm_rcom *rc; struct dlm_mhandle *mh; @@ -361,7 +366,7 @@ int dlm_send_rcom_lookup(struct dlm_rsb *r, int dir_nodeid) int error; error = create_rcom(ls, dir_nodeid, DLM_RCOM_LOOKUP, r->res_length, - &rc, &mh); + &rc, &mh, seq); if (error) goto out; memcpy(rc->rc_buf, r->res_name, r->res_length); @@ -372,7 +377,8 @@ int dlm_send_rcom_lookup(struct dlm_rsb *r, int dir_nodeid) return error; } -static void receive_rcom_lookup(struct dlm_ls *ls, struct dlm_rcom *rc_in) +static void receive_rcom_lookup(struct dlm_ls *ls, + const struct dlm_rcom *rc_in, uint64_t seq) { struct dlm_rcom *rc; struct dlm_mhandle *mh; @@ -387,7 +393,8 @@ static void receive_rcom_lookup(struct dlm_ls *ls, struct dlm_rcom *rc_in) return; } - error = create_rcom(ls, nodeid, DLM_RCOM_LOOKUP_REPLY, 0, &rc, &mh); + error = create_rcom(ls, nodeid, DLM_RCOM_LOOKUP_REPLY, 0, &rc, &mh, + seq); if (error) return; @@ -402,7 +409,8 @@ static void receive_rcom_lookup(struct dlm_ls *ls, struct dlm_rcom *rc_in) send_rcom(mh, rc); } -static void receive_rcom_lookup_reply(struct dlm_ls *ls, struct dlm_rcom *rc_in) +static void receive_rcom_lookup_reply(struct dlm_ls *ls, + const struct dlm_rcom *rc_in) { dlm_recover_master_reply(ls, rc_in); } @@ -437,7 +445,7 @@ static void pack_rcom_lock(struct dlm_rsb *r, struct dlm_lkb *lkb, memcpy(rl->rl_lvb, lkb->lkb_lvbptr, r->res_ls->ls_lvblen); } -int dlm_send_rcom_lock(struct dlm_rsb *r, struct dlm_lkb *lkb) +int dlm_send_rcom_lock(struct dlm_rsb *r, struct dlm_lkb *lkb, uint64_t seq) { struct dlm_ls *ls = r->res_ls; struct dlm_rcom *rc; @@ -448,7 +456,8 @@ int dlm_send_rcom_lock(struct dlm_rsb *r, struct dlm_lkb *lkb) if (lkb->lkb_lvbptr) len += ls->ls_lvblen; - error = create_rcom(ls, r->res_nodeid, DLM_RCOM_LOCK, len, &rc, &mh); + error = create_rcom(ls, r->res_nodeid, DLM_RCOM_LOCK, len, &rc, &mh, + seq); if (error) goto out; @@ -462,23 +471,28 @@ int dlm_send_rcom_lock(struct dlm_rsb *r, struct dlm_lkb *lkb) } /* needs at least dlm_rcom + rcom_lock */ -static void receive_rcom_lock(struct dlm_ls *ls, struct dlm_rcom *rc_in) +static void receive_rcom_lock(struct dlm_ls *ls, const struct dlm_rcom *rc_in, + uint64_t seq) { + __le32 rl_remid, rl_result; + struct rcom_lock *rl; struct dlm_rcom *rc; struct dlm_mhandle *mh; int error, nodeid = le32_to_cpu(rc_in->rc_header.h_nodeid); - dlm_recover_master_copy(ls, rc_in); + dlm_recover_master_copy(ls, rc_in, &rl_remid, &rl_result); error = create_rcom(ls, nodeid, DLM_RCOM_LOCK_REPLY, - sizeof(struct rcom_lock), &rc, &mh); + sizeof(struct rcom_lock), &rc, &mh, seq); if (error) return; - /* We send back the same rcom_lock struct we received, but - dlm_recover_master_copy() has filled in rl_remid and rl_result */ - memcpy(rc->rc_buf, rc_in->rc_buf, sizeof(struct rcom_lock)); + rl = (struct rcom_lock *)rc->rc_buf; + /* set rl_remid and rl_result from dlm_recover_master_copy() */ + rl->rl_remid = rl_remid; + rl->rl_result = rl_result; + rc->rc_id = rc_in->rc_id; rc->rc_seq_reply = rc_in->rc_seq; @@ -488,7 +502,7 @@ static void receive_rcom_lock(struct dlm_ls *ls, struct dlm_rcom *rc_in) /* If the lockspace doesn't exist then still send a status message back; it's possible that it just doesn't have its global_id yet. */ -int dlm_send_ls_not_ready(int nodeid, struct dlm_rcom *rc_in) +int dlm_send_ls_not_ready(int nodeid, const struct dlm_rcom *rc_in) { struct dlm_rcom *rc; struct rcom_config *rf; @@ -566,7 +580,7 @@ int dlm_send_ls_not_ready(int nodeid, struct dlm_rcom *rc_in) /* Called by dlm_recv; corresponds to dlm_receive_message() but special recovery-only comms are sent through here. */ -void dlm_receive_rcom(struct dlm_ls *ls, struct dlm_rcom *rc, int nodeid) +void dlm_receive_rcom(struct dlm_ls *ls, const struct dlm_rcom *rc, int nodeid) { int lock_size = sizeof(struct dlm_rcom) + sizeof(struct rcom_lock); int stop, reply = 0, names = 0, lookup = 0, lock = 0; @@ -620,21 +634,21 @@ void dlm_receive_rcom(struct dlm_ls *ls, struct dlm_rcom *rc, int nodeid) switch (rc->rc_type) { case cpu_to_le32(DLM_RCOM_STATUS): - receive_rcom_status(ls, rc); + receive_rcom_status(ls, rc, seq); break; case cpu_to_le32(DLM_RCOM_NAMES): - receive_rcom_names(ls, rc); + receive_rcom_names(ls, rc, seq); break; case cpu_to_le32(DLM_RCOM_LOOKUP): - receive_rcom_lookup(ls, rc); + receive_rcom_lookup(ls, rc, seq); break; case cpu_to_le32(DLM_RCOM_LOCK): if (le16_to_cpu(rc->rc_header.h_length) < lock_size) goto Eshort; - receive_rcom_lock(ls, rc); + receive_rcom_lock(ls, rc, seq); break; case cpu_to_le32(DLM_RCOM_STATUS_REPLY): @@ -652,7 +666,7 @@ void dlm_receive_rcom(struct dlm_ls *ls, struct dlm_rcom *rc, int nodeid) case cpu_to_le32(DLM_RCOM_LOCK_REPLY): if (le16_to_cpu(rc->rc_header.h_length) < lock_size) goto Eshort; - dlm_recover_process_copy(ls, rc); + dlm_recover_process_copy(ls, rc, seq); break; default: diff --git a/fs/dlm/rcom.h b/fs/dlm/rcom.h index 454d3c4814ab..765926ae0020 100644 --- a/fs/dlm/rcom.h +++ b/fs/dlm/rcom.h @@ -12,12 +12,15 @@ #ifndef __RCOM_DOT_H__ #define __RCOM_DOT_H__ -int dlm_rcom_status(struct dlm_ls *ls, int nodeid, uint32_t status_flags); -int dlm_rcom_names(struct dlm_ls *ls, int nodeid, char *last_name,int last_len); -int dlm_send_rcom_lookup(struct dlm_rsb *r, int dir_nodeid); -int dlm_send_rcom_lock(struct dlm_rsb *r, struct dlm_lkb *lkb); -void dlm_receive_rcom(struct dlm_ls *ls, struct dlm_rcom *rc, int nodeid); -int dlm_send_ls_not_ready(int nodeid, struct dlm_rcom *rc_in); +int dlm_rcom_status(struct dlm_ls *ls, int nodeid, uint32_t status_flags, + uint64_t seq); +int dlm_rcom_names(struct dlm_ls *ls, int nodeid, char *last_name, + int last_len, uint64_t seq); +int dlm_send_rcom_lookup(struct dlm_rsb *r, int dir_nodeid, uint64_t seq); +int dlm_send_rcom_lock(struct dlm_rsb *r, struct dlm_lkb *lkb, uint64_t seq); +void dlm_receive_rcom(struct dlm_ls *ls, const struct dlm_rcom *rc, + int nodeid); +int dlm_send_ls_not_ready(int nodeid, const struct dlm_rcom *rc_in); #endif diff --git a/fs/dlm/recover.c b/fs/dlm/recover.c index 29d71a5018d4..53917c0aa3c0 100644 --- a/fs/dlm/recover.c +++ b/fs/dlm/recover.c @@ -93,7 +93,7 @@ void dlm_set_recover_status(struct dlm_ls *ls, uint32_t status) } static int wait_status_all(struct dlm_ls *ls, uint32_t wait_status, - int save_slots) + int save_slots, uint64_t seq) { struct dlm_rcom *rc = ls->ls_recover_buf; struct dlm_member *memb; @@ -107,7 +107,7 @@ static int wait_status_all(struct dlm_ls *ls, uint32_t wait_status, goto out; } - error = dlm_rcom_status(ls, memb->nodeid, 0); + error = dlm_rcom_status(ls, memb->nodeid, 0, seq); if (error) goto out; @@ -126,7 +126,7 @@ static int wait_status_all(struct dlm_ls *ls, uint32_t wait_status, } static int wait_status_low(struct dlm_ls *ls, uint32_t wait_status, - uint32_t status_flags) + uint32_t status_flags, uint64_t seq) { struct dlm_rcom *rc = ls->ls_recover_buf; int error = 0, delay = 0, nodeid = ls->ls_low_nodeid; @@ -137,7 +137,7 @@ static int wait_status_low(struct dlm_ls *ls, uint32_t wait_status, goto out; } - error = dlm_rcom_status(ls, nodeid, status_flags); + error = dlm_rcom_status(ls, nodeid, status_flags, seq); if (error) break; @@ -151,22 +151,22 @@ static int wait_status_low(struct dlm_ls *ls, uint32_t wait_status, return error; } -static int wait_status(struct dlm_ls *ls, uint32_t status) +static int wait_status(struct dlm_ls *ls, uint32_t status, uint64_t seq) { uint32_t status_all = status << 1; int error; if (ls->ls_low_nodeid == dlm_our_nodeid()) { - error = wait_status_all(ls, status, 0); + error = wait_status_all(ls, status, 0, seq); if (!error) dlm_set_recover_status(ls, status_all); } else - error = wait_status_low(ls, status_all, 0); + error = wait_status_low(ls, status_all, 0, seq); return error; } -int dlm_recover_members_wait(struct dlm_ls *ls) +int dlm_recover_members_wait(struct dlm_ls *ls, uint64_t seq) { struct dlm_member *memb; struct dlm_slot *slots; @@ -180,7 +180,7 @@ int dlm_recover_members_wait(struct dlm_ls *ls) } if (ls->ls_low_nodeid == dlm_our_nodeid()) { - error = wait_status_all(ls, DLM_RS_NODES, 1); + error = wait_status_all(ls, DLM_RS_NODES, 1, seq); if (error) goto out; @@ -199,7 +199,8 @@ int dlm_recover_members_wait(struct dlm_ls *ls) dlm_set_recover_status(ls, DLM_RS_NODES_ALL); } } else { - error = wait_status_low(ls, DLM_RS_NODES_ALL, DLM_RSF_NEED_SLOTS); + error = wait_status_low(ls, DLM_RS_NODES_ALL, + DLM_RSF_NEED_SLOTS, seq); if (error) goto out; @@ -209,19 +210,19 @@ int dlm_recover_members_wait(struct dlm_ls *ls) return error; } -int dlm_recover_directory_wait(struct dlm_ls *ls) +int dlm_recover_directory_wait(struct dlm_ls *ls, uint64_t seq) { - return wait_status(ls, DLM_RS_DIR); + return wait_status(ls, DLM_RS_DIR, seq); } -int dlm_recover_locks_wait(struct dlm_ls *ls) +int dlm_recover_locks_wait(struct dlm_ls *ls, uint64_t seq) { - return wait_status(ls, DLM_RS_LOCKS); + return wait_status(ls, DLM_RS_LOCKS, seq); } -int dlm_recover_done_wait(struct dlm_ls *ls) +int dlm_recover_done_wait(struct dlm_ls *ls, uint64_t seq) { - return wait_status(ls, DLM_RS_DONE); + return wait_status(ls, DLM_RS_DONE, seq); } /* @@ -441,7 +442,7 @@ static void set_new_master(struct dlm_rsb *r) * equals our_nodeid below). */ -static int recover_master(struct dlm_rsb *r, unsigned int *count) +static int recover_master(struct dlm_rsb *r, unsigned int *count, uint64_t seq) { struct dlm_ls *ls = r->res_ls; int our_nodeid, dir_nodeid; @@ -472,7 +473,7 @@ static int recover_master(struct dlm_rsb *r, unsigned int *count) error = 0; } else { recover_idr_add(r); - error = dlm_send_rcom_lookup(r, dir_nodeid); + error = dlm_send_rcom_lookup(r, dir_nodeid, seq); } (*count)++; @@ -520,7 +521,7 @@ static int recover_master_static(struct dlm_rsb *r, unsigned int *count) * the correct dir node. */ -int dlm_recover_masters(struct dlm_ls *ls) +int dlm_recover_masters(struct dlm_ls *ls, uint64_t seq) { struct dlm_rsb *r; unsigned int total = 0; @@ -542,7 +543,7 @@ int dlm_recover_masters(struct dlm_ls *ls) if (nodir) error = recover_master_static(r, &count); else - error = recover_master(r, &count); + error = recover_master(r, &count, seq); unlock_rsb(r); cond_resched(); total++; @@ -563,7 +564,7 @@ int dlm_recover_masters(struct dlm_ls *ls) return error; } -int dlm_recover_master_reply(struct dlm_ls *ls, struct dlm_rcom *rc) +int dlm_recover_master_reply(struct dlm_ls *ls, const struct dlm_rcom *rc) { struct dlm_rsb *r; int ret_nodeid, new_master; @@ -614,13 +615,14 @@ int dlm_recover_master_reply(struct dlm_ls *ls, struct dlm_rcom *rc) * an equal number of replies then recovery for the rsb is done */ -static int recover_locks_queue(struct dlm_rsb *r, struct list_head *head) +static int recover_locks_queue(struct dlm_rsb *r, struct list_head *head, + uint64_t seq) { struct dlm_lkb *lkb; int error = 0; list_for_each_entry(lkb, head, lkb_statequeue) { - error = dlm_send_rcom_lock(r, lkb); + error = dlm_send_rcom_lock(r, lkb, seq); if (error) break; r->res_recover_locks_count++; @@ -629,7 +631,7 @@ static int recover_locks_queue(struct dlm_rsb *r, struct list_head *head) return error; } -static int recover_locks(struct dlm_rsb *r) +static int recover_locks(struct dlm_rsb *r, uint64_t seq) { int error = 0; @@ -637,13 +639,13 @@ static int recover_locks(struct dlm_rsb *r) DLM_ASSERT(!r->res_recover_locks_count, dlm_dump_rsb(r);); - error = recover_locks_queue(r, &r->res_grantqueue); + error = recover_locks_queue(r, &r->res_grantqueue, seq); if (error) goto out; - error = recover_locks_queue(r, &r->res_convertqueue); + error = recover_locks_queue(r, &r->res_convertqueue, seq); if (error) goto out; - error = recover_locks_queue(r, &r->res_waitqueue); + error = recover_locks_queue(r, &r->res_waitqueue, seq); if (error) goto out; @@ -656,7 +658,7 @@ static int recover_locks(struct dlm_rsb *r) return error; } -int dlm_recover_locks(struct dlm_ls *ls) +int dlm_recover_locks(struct dlm_ls *ls, uint64_t seq) { struct dlm_rsb *r; int error, count = 0; @@ -677,7 +679,7 @@ int dlm_recover_locks(struct dlm_ls *ls) goto out; } - error = recover_locks(r); + error = recover_locks(r, seq); if (error) { up_read(&ls->ls_root_sem); goto out; diff --git a/fs/dlm/recover.h b/fs/dlm/recover.h index 235e0d25cd48..dbc51013ecad 100644 --- a/fs/dlm/recover.h +++ b/fs/dlm/recover.h @@ -15,13 +15,13 @@ int dlm_wait_function(struct dlm_ls *ls, int (*testfn) (struct dlm_ls *ls)); uint32_t dlm_recover_status(struct dlm_ls *ls); void dlm_set_recover_status(struct dlm_ls *ls, uint32_t status); -int dlm_recover_members_wait(struct dlm_ls *ls); -int dlm_recover_directory_wait(struct dlm_ls *ls); -int dlm_recover_locks_wait(struct dlm_ls *ls); -int dlm_recover_done_wait(struct dlm_ls *ls); -int dlm_recover_masters(struct dlm_ls *ls); -int dlm_recover_master_reply(struct dlm_ls *ls, struct dlm_rcom *rc); -int dlm_recover_locks(struct dlm_ls *ls); +int dlm_recover_members_wait(struct dlm_ls *ls, uint64_t seq); +int dlm_recover_directory_wait(struct dlm_ls *ls, uint64_t seq); +int dlm_recover_locks_wait(struct dlm_ls *ls, uint64_t seq); +int dlm_recover_done_wait(struct dlm_ls *ls, uint64_t seq); +int dlm_recover_masters(struct dlm_ls *ls, uint64_t seq); +int dlm_recover_master_reply(struct dlm_ls *ls, const struct dlm_rcom *rc); +int dlm_recover_locks(struct dlm_ls *ls, uint64_t seq); void dlm_recovered_lock(struct dlm_rsb *r); int dlm_create_root_list(struct dlm_ls *ls); void dlm_release_root_list(struct dlm_ls *ls); diff --git a/fs/dlm/recoverd.c b/fs/dlm/recoverd.c index 19da816cfb09..4d17491dea2f 100644 --- a/fs/dlm/recoverd.c +++ b/fs/dlm/recoverd.c @@ -90,7 +90,7 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv) dlm_set_recover_status(ls, DLM_RS_NODES); - error = dlm_recover_members_wait(ls); + error = dlm_recover_members_wait(ls, rv->seq); if (error) { log_rinfo(ls, "dlm_recover_members_wait error %d", error); goto fail; @@ -103,7 +103,7 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv) * nodes their master rsb names that hash to us. */ - error = dlm_recover_directory(ls); + error = dlm_recover_directory(ls, rv->seq); if (error) { log_rinfo(ls, "dlm_recover_directory error %d", error); goto fail; @@ -111,7 +111,7 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv) dlm_set_recover_status(ls, DLM_RS_DIR); - error = dlm_recover_directory_wait(ls); + error = dlm_recover_directory_wait(ls, rv->seq); if (error) { log_rinfo(ls, "dlm_recover_directory_wait error %d", error); goto fail; @@ -145,7 +145,7 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv) * departed nodes. */ - error = dlm_recover_masters(ls); + error = dlm_recover_masters(ls, rv->seq); if (error) { log_rinfo(ls, "dlm_recover_masters error %d", error); goto fail; @@ -155,7 +155,7 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv) * Send our locks on remastered rsb's to the new masters. */ - error = dlm_recover_locks(ls); + error = dlm_recover_locks(ls, rv->seq); if (error) { log_rinfo(ls, "dlm_recover_locks error %d", error); goto fail; @@ -163,7 +163,7 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv) dlm_set_recover_status(ls, DLM_RS_LOCKS); - error = dlm_recover_locks_wait(ls); + error = dlm_recover_locks_wait(ls, rv->seq); if (error) { log_rinfo(ls, "dlm_recover_locks_wait error %d", error); goto fail; @@ -187,7 +187,7 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv) */ dlm_set_recover_status(ls, DLM_RS_LOCKS); - error = dlm_recover_locks_wait(ls); + error = dlm_recover_locks_wait(ls, rv->seq); if (error) { log_rinfo(ls, "dlm_recover_locks_wait error %d", error); goto fail; @@ -206,7 +206,7 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv) dlm_set_recover_status(ls, DLM_RS_DONE); - error = dlm_recover_done_wait(ls); + error = dlm_recover_done_wait(ls, rv->seq); if (error) { log_rinfo(ls, "dlm_recover_done_wait error %d", error); goto fail; diff --git a/fs/dlm/requestqueue.c b/fs/dlm/requestqueue.c index 8be2893ad15b..892d6ca21e74 100644 --- a/fs/dlm/requestqueue.c +++ b/fs/dlm/requestqueue.c @@ -30,7 +30,8 @@ struct rq_entry { * lockspace is enabled on some while still suspended on others. */ -void dlm_add_requestqueue(struct dlm_ls *ls, int nodeid, struct dlm_message *ms) +void dlm_add_requestqueue(struct dlm_ls *ls, int nodeid, + const struct dlm_message *ms) { struct rq_entry *e; int length = le16_to_cpu(ms->m_header.h_length) - diff --git a/fs/dlm/requestqueue.h b/fs/dlm/requestqueue.h index 4e403469a845..42bfe23ceabe 100644 --- a/fs/dlm/requestqueue.h +++ b/fs/dlm/requestqueue.h @@ -11,7 +11,8 @@ #ifndef __REQUESTQUEUE_DOT_H__ #define __REQUESTQUEUE_DOT_H__ -void dlm_add_requestqueue(struct dlm_ls *ls, int nodeid, struct dlm_message *ms); +void dlm_add_requestqueue(struct dlm_ls *ls, int nodeid, + const struct dlm_message *ms); int dlm_process_requestqueue(struct dlm_ls *ls); void dlm_wait_requestqueue(struct dlm_ls *ls); void dlm_purge_requestqueue(struct dlm_ls *ls); diff --git a/fs/drop_caches.c b/fs/drop_caches.c index e619c31b6bd9..b9575957a7c2 100644 --- a/fs/drop_caches.c +++ b/fs/drop_caches.c @@ -10,6 +10,7 @@ #include <linux/writeback.h> #include <linux/sysctl.h> #include <linux/gfp.h> +#include <linux/swap.h> #include "internal.h" /* A global variable is a bit ugly, but it keeps the code simple */ @@ -59,6 +60,7 @@ int drop_caches_sysctl_handler(struct ctl_table *table, int write, static int stfu; if (sysctl_drop_caches & 1) { + lru_add_drain_all(); iterate_supers(drop_pagecache_sb, NULL); count_vm_event(DROP_PAGECACHE); } diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c index c16f0d660cb7..03bd55069d86 100644 --- a/fs/ecryptfs/crypto.c +++ b/fs/ecryptfs/crypto.c @@ -441,10 +441,10 @@ int ecryptfs_encrypt_page(struct page *page) } lower_offset = lower_offset_for_page(crypt_stat, page); - enc_extent_virt = kmap(enc_extent_page); + enc_extent_virt = kmap_local_page(enc_extent_page); rc = ecryptfs_write_lower(ecryptfs_inode, enc_extent_virt, lower_offset, PAGE_SIZE); - kunmap(enc_extent_page); + kunmap_local(enc_extent_virt); if (rc < 0) { ecryptfs_printk(KERN_ERR, "Error attempting to write lower page; rc = [%d]\n", @@ -490,10 +490,10 @@ int ecryptfs_decrypt_page(struct page *page) BUG_ON(!(crypt_stat->flags & ECRYPTFS_ENCRYPTED)); lower_offset = lower_offset_for_page(crypt_stat, page); - page_virt = kmap(page); + page_virt = kmap_local_page(page); rc = ecryptfs_read_lower(page_virt, lower_offset, PAGE_SIZE, ecryptfs_inode); - kunmap(page); + kunmap_local(page_virt); if (rc < 0) { ecryptfs_printk(KERN_ERR, "Error attempting to read lower page; rc = [%d]\n", diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h index f2ed0c0266cb..c586c5db18b5 100644 --- a/fs/ecryptfs/ecryptfs_kernel.h +++ b/fs/ecryptfs/ecryptfs_kernel.h @@ -702,6 +702,6 @@ int ecryptfs_set_f_namelen(long *namelen, long lower_namelen, int ecryptfs_derive_iv(char *iv, struct ecryptfs_crypt_stat *crypt_stat, loff_t offset); -extern const struct xattr_handler *ecryptfs_xattr_handlers[]; +extern const struct xattr_handler * const ecryptfs_xattr_handlers[]; #endif /* #ifndef ECRYPTFS_KERNEL_H */ diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c index 83274915ba6d..b0e8774c435a 100644 --- a/fs/ecryptfs/inode.c +++ b/fs/ecryptfs/inode.c @@ -148,7 +148,7 @@ static int ecryptfs_do_unlink(struct inode *dir, struct dentry *dentry, } fsstack_copy_attr_times(dir, lower_dir); set_nlink(inode, ecryptfs_inode_to_lower(inode)->i_nlink); - inode->i_ctime = dir->i_ctime; + inode_set_ctime_to_ts(inode, inode_get_ctime(dir)); out_unlock: dput(lower_dentry); inode_unlock(lower_dir); @@ -982,7 +982,7 @@ static int ecryptfs_getattr_link(struct mnt_idmap *idmap, mount_crypt_stat = &ecryptfs_superblock_to_private( dentry->d_sb)->mount_crypt_stat; - generic_fillattr(&nop_mnt_idmap, d_inode(dentry), stat); + generic_fillattr(&nop_mnt_idmap, request_mask, d_inode(dentry), stat); if (mount_crypt_stat->flags & ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES) { char *target; size_t targetsiz; @@ -998,6 +998,14 @@ static int ecryptfs_getattr_link(struct mnt_idmap *idmap, return rc; } +static int ecryptfs_do_getattr(const struct path *path, struct kstat *stat, + u32 request_mask, unsigned int flags) +{ + if (flags & AT_GETATTR_NOSEC) + return vfs_getattr_nosec(path, stat, request_mask, flags); + return vfs_getattr(path, stat, request_mask, flags); +} + static int ecryptfs_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int flags) @@ -1006,12 +1014,13 @@ static int ecryptfs_getattr(struct mnt_idmap *idmap, struct kstat lower_stat; int rc; - rc = vfs_getattr(ecryptfs_dentry_to_lower_path(dentry), &lower_stat, - request_mask, flags); + rc = ecryptfs_do_getattr(ecryptfs_dentry_to_lower_path(dentry), + &lower_stat, request_mask, flags); if (!rc) { fsstack_copy_attr_all(d_inode(dentry), ecryptfs_inode_to_lower(d_inode(dentry))); - generic_fillattr(&nop_mnt_idmap, d_inode(dentry), stat); + generic_fillattr(&nop_mnt_idmap, request_mask, + d_inode(dentry), stat); stat->blocks = lower_stat.blocks; } return rc; @@ -1209,7 +1218,7 @@ static const struct xattr_handler ecryptfs_xattr_handler = { .set = ecryptfs_xattr_set, }; -const struct xattr_handler *ecryptfs_xattr_handlers[] = { +const struct xattr_handler * const ecryptfs_xattr_handlers[] = { &ecryptfs_xattr_handler, NULL }; diff --git a/fs/ecryptfs/mmap.c b/fs/ecryptfs/mmap.c index 373c3e5747e6..e2483acc4366 100644 --- a/fs/ecryptfs/mmap.c +++ b/fs/ecryptfs/mmap.c @@ -125,7 +125,7 @@ ecryptfs_copy_up_encrypted_with_header(struct page *page, /* This is a header extent */ char *page_virt; - page_virt = kmap_atomic(page); + page_virt = kmap_local_page(page); memset(page_virt, 0, PAGE_SIZE); /* TODO: Support more than one header extent */ if (view_extent_num == 0) { @@ -138,7 +138,7 @@ ecryptfs_copy_up_encrypted_with_header(struct page *page, crypt_stat, &written); } - kunmap_atomic(page_virt); + kunmap_local(page_virt); flush_dcache_page(page); if (rc) { printk(KERN_ERR "%s: Error reading xattr " @@ -255,7 +255,6 @@ out: * @mapping: The eCryptfs object * @pos: The file offset at which to start writing * @len: Length of the write - * @flags: Various flags * @pagep: Pointer to return the page * @fsdata: Pointer to return fs data (unused) * diff --git a/fs/ecryptfs/read_write.c b/fs/ecryptfs/read_write.c index 60bdcaddcbe5..3458f153a588 100644 --- a/fs/ecryptfs/read_write.c +++ b/fs/ecryptfs/read_write.c @@ -64,11 +64,11 @@ int ecryptfs_write_lower_page_segment(struct inode *ecryptfs_inode, offset = ((((loff_t)page_for_lower->index) << PAGE_SHIFT) + offset_in_page); - virt = kmap(page_for_lower); + virt = kmap_local_page(page_for_lower); rc = ecryptfs_write_lower(ecryptfs_inode, virt, offset, size); if (rc > 0) rc = 0; - kunmap(page_for_lower); + kunmap_local(virt); return rc; } @@ -140,7 +140,7 @@ int ecryptfs_write(struct inode *ecryptfs_inode, char *data, loff_t offset, ecryptfs_page_idx, rc); goto out; } - ecryptfs_page_virt = kmap_atomic(ecryptfs_page); + ecryptfs_page_virt = kmap_local_page(ecryptfs_page); /* * pos: where we're now writing, offset: where the request was @@ -163,7 +163,7 @@ int ecryptfs_write(struct inode *ecryptfs_inode, char *data, loff_t offset, (data + data_offset), num_bytes); data_offset += num_bytes; } - kunmap_atomic(ecryptfs_page_virt); + kunmap_local(ecryptfs_page_virt); flush_dcache_page(ecryptfs_page); SetPageUptodate(ecryptfs_page); unlock_page(ecryptfs_page); @@ -253,11 +253,11 @@ int ecryptfs_read_lower_page_segment(struct page *page_for_ecryptfs, int rc; offset = ((((loff_t)page_index) << PAGE_SHIFT) + offset_in_page); - virt = kmap(page_for_ecryptfs); + virt = kmap_local_page(page_for_ecryptfs); rc = ecryptfs_read_lower(virt, offset, size, ecryptfs_inode); if (rc > 0) rc = 0; - kunmap(page_for_ecryptfs); + kunmap_local(virt); flush_dcache_page(page_for_ecryptfs); return rc; } diff --git a/fs/efivarfs/file.c b/fs/efivarfs/file.c index d57ee15874f9..7e9961639802 100644 --- a/fs/efivarfs/file.c +++ b/fs/efivarfs/file.c @@ -51,7 +51,7 @@ static ssize_t efivarfs_file_write(struct file *file, } else { inode_lock(inode); i_size_write(inode, datasize + sizeof(attributes)); - inode->i_mtime = current_time(inode); + inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); inode_unlock(inode); } diff --git a/fs/efivarfs/inode.c b/fs/efivarfs/inode.c index b973a2c03dde..91290fe4a70b 100644 --- a/fs/efivarfs/inode.c +++ b/fs/efivarfs/inode.c @@ -21,11 +21,15 @@ struct inode *efivarfs_get_inode(struct super_block *sb, dev_t dev, bool is_removable) { struct inode *inode = new_inode(sb); + struct efivarfs_fs_info *fsi = sb->s_fs_info; + struct efivarfs_mount_opts *opts = &fsi->mount_opts; if (inode) { + inode->i_uid = opts->uid; + inode->i_gid = opts->gid; inode->i_ino = get_next_ino(); inode->i_mode = mode; - inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode); + simple_inode_init_ts(inode); inode->i_flags = is_removable ? 0 : S_IMMUTABLE; switch (mode & S_IFMT) { case S_IFREG: diff --git a/fs/efivarfs/internal.h b/fs/efivarfs/internal.h index 8ebf3a6a8aa2..c66647f5c0bd 100644 --- a/fs/efivarfs/internal.h +++ b/fs/efivarfs/internal.h @@ -9,6 +9,15 @@ #include <linux/list.h> #include <linux/efi.h> +struct efivarfs_mount_opts { + kuid_t uid; + kgid_t gid; +}; + +struct efivarfs_fs_info { + struct efivarfs_mount_opts mount_opts; +}; + struct efi_variable { efi_char16_t VariableName[EFI_VAR_NAME_LEN/sizeof(efi_char16_t)]; efi_guid_t VendorGuid; diff --git a/fs/efivarfs/super.c b/fs/efivarfs/super.c index e028fafa04f3..77240953a92e 100644 --- a/fs/efivarfs/super.c +++ b/fs/efivarfs/super.c @@ -8,6 +8,7 @@ #include <linux/efi.h> #include <linux/fs.h> #include <linux/fs_context.h> +#include <linux/fs_parser.h> #include <linux/module.h> #include <linux/pagemap.h> #include <linux/ucs2_string.h> @@ -24,18 +25,40 @@ static void efivarfs_evict_inode(struct inode *inode) clear_inode(inode); } +static int efivarfs_show_options(struct seq_file *m, struct dentry *root) +{ + struct super_block *sb = root->d_sb; + struct efivarfs_fs_info *sbi = sb->s_fs_info; + struct efivarfs_mount_opts *opts = &sbi->mount_opts; + + if (!uid_eq(opts->uid, GLOBAL_ROOT_UID)) + seq_printf(m, ",uid=%u", + from_kuid_munged(&init_user_ns, opts->uid)); + if (!gid_eq(opts->gid, GLOBAL_ROOT_GID)) + seq_printf(m, ",gid=%u", + from_kgid_munged(&init_user_ns, opts->gid)); + return 0; +} + static int efivarfs_statfs(struct dentry *dentry, struct kstatfs *buf) { const u32 attr = EFI_VARIABLE_NON_VOLATILE | EFI_VARIABLE_BOOTSERVICE_ACCESS | EFI_VARIABLE_RUNTIME_ACCESS; u64 storage_space, remaining_space, max_variable_size; + u64 id = huge_encode_dev(dentry->d_sb->s_dev); efi_status_t status; - status = efivar_query_variable_info(attr, &storage_space, &remaining_space, - &max_variable_size); - if (status != EFI_SUCCESS) - return efi_status_to_err(status); + /* Some UEFI firmware does not implement QueryVariableInfo() */ + storage_space = remaining_space = 0; + if (efi_rt_services_supported(EFI_RT_SUPPORTED_QUERY_VARIABLE_INFO)) { + status = efivar_query_variable_info(attr, &storage_space, + &remaining_space, + &max_variable_size); + if (status != EFI_SUCCESS && status != EFI_UNSUPPORTED) + pr_warn_ratelimited("query_variable_info() failed: 0x%lx\n", + status); + } /* * This is not a normal filesystem, so no point in pretending it has a block @@ -47,6 +70,7 @@ static int efivarfs_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_blocks = storage_space; buf->f_bfree = remaining_space; buf->f_type = dentry->d_sb->s_magic; + buf->f_fsid = u64_to_fsid(id); /* * In f_bavail we declare the free space that the kernel will allow writing @@ -64,6 +88,7 @@ static const struct super_operations efivarfs_ops = { .statfs = efivarfs_statfs, .drop_inode = generic_delete_inode, .evict_inode = efivarfs_evict_inode, + .show_options = efivarfs_show_options, }; /* @@ -225,6 +250,45 @@ static int efivarfs_destroy(struct efivar_entry *entry, void *data) return 0; } +enum { + Opt_uid, Opt_gid, +}; + +static const struct fs_parameter_spec efivarfs_parameters[] = { + fsparam_u32("uid", Opt_uid), + fsparam_u32("gid", Opt_gid), + {}, +}; + +static int efivarfs_parse_param(struct fs_context *fc, struct fs_parameter *param) +{ + struct efivarfs_fs_info *sbi = fc->s_fs_info; + struct efivarfs_mount_opts *opts = &sbi->mount_opts; + struct fs_parse_result result; + int opt; + + opt = fs_parse(fc, efivarfs_parameters, param, &result); + if (opt < 0) + return opt; + + switch (opt) { + case Opt_uid: + opts->uid = make_kuid(current_user_ns(), result.uint_32); + if (!uid_valid(opts->uid)) + return -EINVAL; + break; + case Opt_gid: + opts->gid = make_kgid(current_user_ns(), result.uint_32); + if (!gid_valid(opts->gid)) + return -EINVAL; + break; + default: + return -EINVAL; + } + + return 0; +} + static int efivarfs_fill_super(struct super_block *sb, struct fs_context *fc) { struct inode *inode = NULL; @@ -271,10 +335,21 @@ static int efivarfs_get_tree(struct fs_context *fc) static const struct fs_context_operations efivarfs_context_ops = { .get_tree = efivarfs_get_tree, + .parse_param = efivarfs_parse_param, }; static int efivarfs_init_fs_context(struct fs_context *fc) { + struct efivarfs_fs_info *sfi; + + sfi = kzalloc(sizeof(*sfi), GFP_KERNEL); + if (!sfi) + return -ENOMEM; + + sfi->mount_opts.uid = GLOBAL_ROOT_UID; + sfi->mount_opts.gid = GLOBAL_ROOT_GID; + + fc->s_fs_info = sfi; fc->ops = &efivarfs_context_ops; return 0; } @@ -295,6 +370,7 @@ static struct file_system_type efivarfs_type = { .name = "efivarfs", .init_fs_context = efivarfs_init_fs_context, .kill_sb = efivarfs_kill_sb, + .parameters = efivarfs_parameters, }; static __init int efivarfs_init(void) diff --git a/fs/efs/Kconfig b/fs/efs/Kconfig index 2df1bac8b375..0833e533df9d 100644 --- a/fs/efs/Kconfig +++ b/fs/efs/Kconfig @@ -2,6 +2,7 @@ config EFS_FS tristate "EFS file system support (read only)" depends on BLOCK + select BUFFER_HEAD help EFS is an older file system used for non-ISO9660 CD-ROMs and hard disk partitions by SGI's IRIX operating system (IRIX 6.0 and newer diff --git a/fs/efs/efs.h b/fs/efs/efs.h index 13a4d9622633..918d2b9abb76 100644 --- a/fs/efs/efs.h +++ b/fs/efs/efs.h @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0 */ /* - * Copyright (c) 1999 Al Smith + * Copyright (c) 1999 Al Smith, <Al.Smith@aeschi.ch.eu.org> * * Portions derived from work (c) 1995,1996 Christian Vogelgsang. * Portions derived from IRIX header files (c) 1988 Silicon Graphics @@ -19,9 +19,6 @@ #define EFS_VERSION "1.0a" -static const char cprt[] = "EFS: "EFS_VERSION" - (c) 1999 Al Smith <Al.Smith@aeschi.ch.eu.org>"; - - /* 1 block is 512 bytes */ #define EFS_BLOCKSIZE_BITS 9 #define EFS_BLOCKSIZE (1 << EFS_BLOCKSIZE_BITS) diff --git a/fs/efs/inode.c b/fs/efs/inode.c index 3ba94bb005a6..7844ab24b813 100644 --- a/fs/efs/inode.c +++ b/fs/efs/inode.c @@ -103,10 +103,9 @@ struct inode *efs_iget(struct super_block *super, unsigned long ino) i_uid_write(inode, (uid_t)be16_to_cpu(efs_inode->di_uid)); i_gid_write(inode, (gid_t)be16_to_cpu(efs_inode->di_gid)); inode->i_size = be32_to_cpu(efs_inode->di_size); - inode->i_atime.tv_sec = be32_to_cpu(efs_inode->di_atime); - inode->i_mtime.tv_sec = be32_to_cpu(efs_inode->di_mtime); - inode->i_ctime.tv_sec = be32_to_cpu(efs_inode->di_ctime); - inode->i_atime.tv_nsec = inode->i_mtime.tv_nsec = inode->i_ctime.tv_nsec = 0; + inode_set_atime(inode, be32_to_cpu(efs_inode->di_atime), 0); + inode_set_mtime(inode, be32_to_cpu(efs_inode->di_mtime), 0); + inode_set_ctime(inode, be32_to_cpu(efs_inode->di_ctime), 0); /* this is the number of blocks in the file */ if (inode->i_size == 0) { diff --git a/fs/efs/super.c b/fs/efs/super.c index b287f47c165b..f17fdac76b2e 100644 --- a/fs/efs/super.c +++ b/fs/efs/super.c @@ -123,6 +123,7 @@ static const struct super_operations efs_superblock_operations = { }; static const struct export_operations efs_export_ops = { + .encode_fh = generic_encode_ino32_fh, .fh_to_dentry = efs_fh_to_dentry, .fh_to_parent = efs_fh_to_parent, .get_parent = efs_get_parent, diff --git a/fs/erofs/Kconfig b/fs/erofs/Kconfig index f259d92c9720..1d318f85232d 100644 --- a/fs/erofs/Kconfig +++ b/fs/erofs/Kconfig @@ -21,7 +21,7 @@ config EROFS_FS performance under extremely memory pressure without extra cost. See the documentation at <file:Documentation/filesystems/erofs.rst> - for more details. + and the web pages at <https://erofs.docs.kernel.org> for more details. If unsure, say N. @@ -38,6 +38,7 @@ config EROFS_FS_DEBUG config EROFS_FS_XATTR bool "EROFS extended attributes" depends on EROFS_FS + select XXHASH default y help Extended attributes are name:value pairs associated with inodes by @@ -90,12 +91,24 @@ config EROFS_FS_ZIP_LZMA select XZ_DEC_MICROLZMA help Saying Y here includes support for reading EROFS file systems - containing LZMA compressed data, specifically called microLZMA. it - gives better compression ratios than the LZ4 algorithm, at the + containing LZMA compressed data, specifically called microLZMA. It + gives better compression ratios than the default LZ4 format, at the expense of more CPU overhead. - LZMA support is an experimental feature for now and so most file - systems will be readable without selecting this option. + If unsure, say N. + +config EROFS_FS_ZIP_DEFLATE + bool "EROFS DEFLATE compressed data support" + depends on EROFS_FS_ZIP + select ZLIB_INFLATE + help + Saying Y here includes support for reading EROFS file systems + containing DEFLATE compressed data. It gives better compression + ratios than the default LZ4 format, while it costs more CPU + overhead. + + DEFLATE support is an experimental feature for now and so most + file systems will be readable without selecting this option. If unsure, say N. diff --git a/fs/erofs/Makefile b/fs/erofs/Makefile index a3a98fc3e481..994d0b9deddf 100644 --- a/fs/erofs/Makefile +++ b/fs/erofs/Makefile @@ -5,4 +5,5 @@ erofs-objs := super.o inode.o data.o namei.o dir.o utils.o sysfs.o erofs-$(CONFIG_EROFS_FS_XATTR) += xattr.o erofs-$(CONFIG_EROFS_FS_ZIP) += decompressor.o zmap.o zdata.o pcpubuf.o erofs-$(CONFIG_EROFS_FS_ZIP_LZMA) += decompressor_lzma.o +erofs-$(CONFIG_EROFS_FS_ZIP_DEFLATE) += decompressor_deflate.o erofs-$(CONFIG_EROFS_FS_ONDEMAND) += fscache.o diff --git a/fs/erofs/compress.h b/fs/erofs/compress.h index b1b846504027..279933e007d2 100644 --- a/fs/erofs/compress.h +++ b/fs/erofs/compress.h @@ -21,6 +21,8 @@ struct z_erofs_decompress_req { }; struct z_erofs_decompressor { + int (*config)(struct super_block *sb, struct erofs_super_block *dsb, + void *data, int size); int (*decompress)(struct z_erofs_decompress_req *rq, struct page **pagepool); char *name; @@ -92,6 +94,12 @@ int z_erofs_fixup_insize(struct z_erofs_decompress_req *rq, const char *padbuf, extern const struct z_erofs_decompressor erofs_decompressors[]; /* prototypes for specific algorithms */ +int z_erofs_load_lzma_config(struct super_block *sb, + struct erofs_super_block *dsb, void *data, int size); +int z_erofs_load_deflate_config(struct super_block *sb, + struct erofs_super_block *dsb, void *data, int size); int z_erofs_lzma_decompress(struct z_erofs_decompress_req *rq, struct page **pagepool); +int z_erofs_deflate_decompress(struct z_erofs_decompress_req *rq, + struct page **pagepool); #endif diff --git a/fs/erofs/data.c b/fs/erofs/data.c index db5e4b7636ec..c98aeda8abb2 100644 --- a/fs/erofs/data.c +++ b/fs/erofs/data.c @@ -5,9 +5,7 @@ * Copyright (C) 2021, Alibaba Cloud */ #include "internal.h" -#include <linux/prefetch.h> #include <linux/sched/mm.h> -#include <linux/dax.h> #include <trace/events/erofs.h> void erofs_unmap_metabuf(struct erofs_buf *buf) @@ -222,7 +220,7 @@ int erofs_map_dev(struct super_block *sb, struct erofs_map_dev *map) up_read(&devs->rwsem); return 0; } - map->m_bdev = dif->bdev; + map->m_bdev = dif->bdev_handle ? dif->bdev_handle->bdev : NULL; map->m_daxdev = dif->dax_dev; map->m_dax_part_off = dif->dax_part_off; map->m_fscache = dif->fscache; @@ -240,7 +238,8 @@ int erofs_map_dev(struct super_block *sb, struct erofs_map_dev *map) if (map->m_pa >= startoff && map->m_pa < startoff + length) { map->m_pa -= startoff; - map->m_bdev = dif->bdev; + map->m_bdev = dif->bdev_handle ? + dif->bdev_handle->bdev : NULL; map->m_daxdev = dif->dax_dev; map->m_dax_part_off = dif->dax_part_off; map->m_fscache = dif->fscache; @@ -413,14 +412,14 @@ const struct address_space_operations erofs_raw_access_aops = { #ifdef CONFIG_FS_DAX static vm_fault_t erofs_dax_huge_fault(struct vm_fault *vmf, - enum page_entry_size pe_size) + unsigned int order) { - return dax_iomap_fault(vmf, pe_size, NULL, NULL, &erofs_iomap_ops); + return dax_iomap_fault(vmf, order, NULL, NULL, &erofs_iomap_ops); } static vm_fault_t erofs_dax_fault(struct vm_fault *vmf) { - return erofs_dax_huge_fault(vmf, PE_SIZE_PTE); + return erofs_dax_huge_fault(vmf, 0); } static const struct vm_operations_struct erofs_dax_vm_ops = { diff --git a/fs/erofs/decompressor.c b/fs/erofs/decompressor.c index cfad1eac7fd9..021be5feb1bc 100644 --- a/fs/erofs/decompressor.c +++ b/fs/erofs/decompressor.c @@ -4,7 +4,6 @@ * https://www.huawei.com/ */ #include "compress.h" -#include <linux/module.h> #include <linux/lz4.h> #ifndef LZ4_DISTANCE_MAX /* history window size */ @@ -24,11 +23,11 @@ struct z_erofs_lz4_decompress_ctx { unsigned int oend; }; -int z_erofs_load_lz4_config(struct super_block *sb, - struct erofs_super_block *dsb, - struct z_erofs_lz4_cfgs *lz4, int size) +static int z_erofs_load_lz4_config(struct super_block *sb, + struct erofs_super_block *dsb, void *data, int size) { struct erofs_sb_info *sbi = EROFS_SB(sb); + struct z_erofs_lz4_cfgs *lz4 = data; u16 distance; if (lz4) { @@ -370,13 +369,75 @@ const struct z_erofs_decompressor erofs_decompressors[] = { .name = "interlaced" }, [Z_EROFS_COMPRESSION_LZ4] = { + .config = z_erofs_load_lz4_config, .decompress = z_erofs_lz4_decompress, .name = "lz4" }, #ifdef CONFIG_EROFS_FS_ZIP_LZMA [Z_EROFS_COMPRESSION_LZMA] = { + .config = z_erofs_load_lzma_config, .decompress = z_erofs_lzma_decompress, .name = "lzma" }, #endif +#ifdef CONFIG_EROFS_FS_ZIP_DEFLATE + [Z_EROFS_COMPRESSION_DEFLATE] = { + .config = z_erofs_load_deflate_config, + .decompress = z_erofs_deflate_decompress, + .name = "deflate" + }, +#endif }; + +int z_erofs_parse_cfgs(struct super_block *sb, struct erofs_super_block *dsb) +{ + struct erofs_sb_info *sbi = EROFS_SB(sb); + struct erofs_buf buf = __EROFS_BUF_INITIALIZER; + unsigned int algs, alg; + erofs_off_t offset; + int size, ret = 0; + + if (!erofs_sb_has_compr_cfgs(sbi)) { + sbi->available_compr_algs = Z_EROFS_COMPRESSION_LZ4; + return z_erofs_load_lz4_config(sb, dsb, NULL, 0); + } + + sbi->available_compr_algs = le16_to_cpu(dsb->u1.available_compr_algs); + if (sbi->available_compr_algs & ~Z_EROFS_ALL_COMPR_ALGS) { + erofs_err(sb, "unidentified algorithms %x, please upgrade kernel", + sbi->available_compr_algs & ~Z_EROFS_ALL_COMPR_ALGS); + return -EOPNOTSUPP; + } + + erofs_init_metabuf(&buf, sb); + offset = EROFS_SUPER_OFFSET + sbi->sb_size; + alg = 0; + for (algs = sbi->available_compr_algs; algs; algs >>= 1, ++alg) { + void *data; + + if (!(algs & 1)) + continue; + + data = erofs_read_metadata(sb, &buf, &offset, &size); + if (IS_ERR(data)) { + ret = PTR_ERR(data); + break; + } + + if (alg >= ARRAY_SIZE(erofs_decompressors) || + !erofs_decompressors[alg].config) { + erofs_err(sb, "algorithm %d isn't enabled on this kernel", + alg); + ret = -EOPNOTSUPP; + } else { + ret = erofs_decompressors[alg].config(sb, + dsb, data, size); + } + + kfree(data); + if (ret) + break; + } + erofs_put_metabuf(&buf); + return ret; +} diff --git a/fs/erofs/decompressor_deflate.c b/fs/erofs/decompressor_deflate.c new file mode 100644 index 000000000000..daf3c1bdeab8 --- /dev/null +++ b/fs/erofs/decompressor_deflate.c @@ -0,0 +1,247 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +#include <linux/zlib.h> +#include "compress.h" + +struct z_erofs_deflate { + struct z_erofs_deflate *next; + struct z_stream_s z; + u8 bounce[PAGE_SIZE]; +}; + +static DEFINE_SPINLOCK(z_erofs_deflate_lock); +static unsigned int z_erofs_deflate_nstrms, z_erofs_deflate_avail_strms; +static struct z_erofs_deflate *z_erofs_deflate_head; +static DECLARE_WAIT_QUEUE_HEAD(z_erofs_deflate_wq); + +module_param_named(deflate_streams, z_erofs_deflate_nstrms, uint, 0444); + +void z_erofs_deflate_exit(void) +{ + /* there should be no running fs instance */ + while (z_erofs_deflate_avail_strms) { + struct z_erofs_deflate *strm; + + spin_lock(&z_erofs_deflate_lock); + strm = z_erofs_deflate_head; + if (!strm) { + spin_unlock(&z_erofs_deflate_lock); + continue; + } + z_erofs_deflate_head = NULL; + spin_unlock(&z_erofs_deflate_lock); + + while (strm) { + struct z_erofs_deflate *n = strm->next; + + vfree(strm->z.workspace); + kfree(strm); + --z_erofs_deflate_avail_strms; + strm = n; + } + } +} + +int __init z_erofs_deflate_init(void) +{ + /* by default, use # of possible CPUs instead */ + if (!z_erofs_deflate_nstrms) + z_erofs_deflate_nstrms = num_possible_cpus(); + + for (; z_erofs_deflate_avail_strms < z_erofs_deflate_nstrms; + ++z_erofs_deflate_avail_strms) { + struct z_erofs_deflate *strm; + + strm = kzalloc(sizeof(*strm), GFP_KERNEL); + if (!strm) + goto out_failed; + + /* XXX: in-kernel zlib cannot shrink windowbits currently */ + strm->z.workspace = vmalloc(zlib_inflate_workspacesize()); + if (!strm->z.workspace) { + kfree(strm); + goto out_failed; + } + + spin_lock(&z_erofs_deflate_lock); + strm->next = z_erofs_deflate_head; + z_erofs_deflate_head = strm; + spin_unlock(&z_erofs_deflate_lock); + } + return 0; + +out_failed: + pr_err("failed to allocate zlib workspace\n"); + z_erofs_deflate_exit(); + return -ENOMEM; +} + +int z_erofs_load_deflate_config(struct super_block *sb, + struct erofs_super_block *dsb, void *data, int size) +{ + struct z_erofs_deflate_cfgs *dfl = data; + + if (!dfl || size < sizeof(struct z_erofs_deflate_cfgs)) { + erofs_err(sb, "invalid deflate cfgs, size=%u", size); + return -EINVAL; + } + + if (dfl->windowbits > MAX_WBITS) { + erofs_err(sb, "unsupported windowbits %u", dfl->windowbits); + return -EOPNOTSUPP; + } + + erofs_info(sb, "EXPERIMENTAL DEFLATE feature in use. Use at your own risk!"); + return 0; +} + +int z_erofs_deflate_decompress(struct z_erofs_decompress_req *rq, + struct page **pagepool) +{ + const unsigned int nrpages_out = + PAGE_ALIGN(rq->pageofs_out + rq->outputsize) >> PAGE_SHIFT; + const unsigned int nrpages_in = + PAGE_ALIGN(rq->inputsize) >> PAGE_SHIFT; + struct super_block *sb = rq->sb; + unsigned int insz, outsz, pofs; + struct z_erofs_deflate *strm; + u8 *kin, *kout = NULL; + bool bounced = false; + int no = -1, ni = 0, j = 0, zerr, err; + + /* 1. get the exact DEFLATE compressed size */ + kin = kmap_local_page(*rq->in); + err = z_erofs_fixup_insize(rq, kin + rq->pageofs_in, + min_t(unsigned int, rq->inputsize, + sb->s_blocksize - rq->pageofs_in)); + if (err) { + kunmap_local(kin); + return err; + } + + /* 2. get an available DEFLATE context */ +again: + spin_lock(&z_erofs_deflate_lock); + strm = z_erofs_deflate_head; + if (!strm) { + spin_unlock(&z_erofs_deflate_lock); + wait_event(z_erofs_deflate_wq, READ_ONCE(z_erofs_deflate_head)); + goto again; + } + z_erofs_deflate_head = strm->next; + spin_unlock(&z_erofs_deflate_lock); + + /* 3. multi-call decompress */ + insz = rq->inputsize; + outsz = rq->outputsize; + zerr = zlib_inflateInit2(&strm->z, -MAX_WBITS); + if (zerr != Z_OK) { + err = -EIO; + goto failed_zinit; + } + + pofs = rq->pageofs_out; + strm->z.avail_in = min_t(u32, insz, PAGE_SIZE - rq->pageofs_in); + insz -= strm->z.avail_in; + strm->z.next_in = kin + rq->pageofs_in; + strm->z.avail_out = 0; + + while (1) { + if (!strm->z.avail_out) { + if (++no >= nrpages_out || !outsz) { + erofs_err(sb, "insufficient space for decompressed data"); + err = -EFSCORRUPTED; + break; + } + + if (kout) + kunmap_local(kout); + strm->z.avail_out = min_t(u32, outsz, PAGE_SIZE - pofs); + outsz -= strm->z.avail_out; + if (!rq->out[no]) { + rq->out[no] = erofs_allocpage(pagepool, + GFP_KERNEL | __GFP_NOFAIL); + set_page_private(rq->out[no], + Z_EROFS_SHORTLIVED_PAGE); + } + kout = kmap_local_page(rq->out[no]); + strm->z.next_out = kout + pofs; + pofs = 0; + } + + if (!strm->z.avail_in && insz) { + if (++ni >= nrpages_in) { + erofs_err(sb, "invalid compressed data"); + err = -EFSCORRUPTED; + break; + } + + if (kout) { /* unlike kmap(), take care of the orders */ + j = strm->z.next_out - kout; + kunmap_local(kout); + } + kunmap_local(kin); + strm->z.avail_in = min_t(u32, insz, PAGE_SIZE); + insz -= strm->z.avail_in; + kin = kmap_local_page(rq->in[ni]); + strm->z.next_in = kin; + bounced = false; + if (kout) { + kout = kmap_local_page(rq->out[no]); + strm->z.next_out = kout + j; + } + } + + /* + * Handle overlapping: Use bounced buffer if the compressed + * data is under processing; Or use short-lived pages from the + * on-stack pagepool where pages share among the same request + * and not _all_ inplace I/O pages are needed to be doubled. + */ + if (!bounced && rq->out[no] == rq->in[ni]) { + memcpy(strm->bounce, strm->z.next_in, strm->z.avail_in); + strm->z.next_in = strm->bounce; + bounced = true; + } + + for (j = ni + 1; j < nrpages_in; ++j) { + struct page *tmppage; + + if (rq->out[no] != rq->in[j]) + continue; + + DBG_BUGON(erofs_page_is_managed(EROFS_SB(sb), + rq->in[j])); + tmppage = erofs_allocpage(pagepool, + GFP_KERNEL | __GFP_NOFAIL); + set_page_private(tmppage, Z_EROFS_SHORTLIVED_PAGE); + copy_highpage(tmppage, rq->in[j]); + rq->in[j] = tmppage; + } + + zerr = zlib_inflate(&strm->z, Z_SYNC_FLUSH); + if (zerr != Z_OK || !(outsz + strm->z.avail_out)) { + if (zerr == Z_OK && rq->partial_decoding) + break; + if (zerr == Z_STREAM_END && !outsz) + break; + erofs_err(sb, "failed to decompress %d in[%u] out[%u]", + zerr, rq->inputsize, rq->outputsize); + err = -EFSCORRUPTED; + break; + } + } + + if (zlib_inflateEnd(&strm->z) != Z_OK && !err) + err = -EIO; + if (kout) + kunmap_local(kout); +failed_zinit: + kunmap_local(kin); + /* 4. push back DEFLATE stream context to the global list */ + spin_lock(&z_erofs_deflate_lock); + strm->next = z_erofs_deflate_head; + z_erofs_deflate_head = strm; + spin_unlock(&z_erofs_deflate_lock); + wake_up(&z_erofs_deflate_wq); + return err; +} diff --git a/fs/erofs/decompressor_lzma.c b/fs/erofs/decompressor_lzma.c index 73091fbe3ea4..2dd14f99c1dc 100644 --- a/fs/erofs/decompressor_lzma.c +++ b/fs/erofs/decompressor_lzma.c @@ -1,6 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-or-later #include <linux/xz.h> -#include <linux/module.h> #include "compress.h" struct z_erofs_lzma { @@ -72,10 +71,10 @@ int __init z_erofs_lzma_init(void) } int z_erofs_load_lzma_config(struct super_block *sb, - struct erofs_super_block *dsb, - struct z_erofs_lzma_cfgs *lzma, int size) + struct erofs_super_block *dsb, void *data, int size) { static DEFINE_MUTEX(lzma_resize_mutex); + struct z_erofs_lzma_cfgs *lzma = data; unsigned int dict_size, i; struct z_erofs_lzma *strm, *head = NULL; int err; @@ -96,8 +95,6 @@ int z_erofs_load_lzma_config(struct super_block *sb, return -EINVAL; } - erofs_info(sb, "EXPERIMENTAL MicroLZMA in use. Use at your own risk!"); - /* in case 2 z_erofs_load_lzma_config() race to avoid deadlock */ mutex_lock(&lzma_resize_mutex); @@ -217,9 +214,12 @@ again: strm->buf.out_size = min_t(u32, outlen, PAGE_SIZE - pageofs); outlen -= strm->buf.out_size; - if (!rq->out[no] && rq->fillgaps) /* deduped */ + if (!rq->out[no] && rq->fillgaps) { /* deduped */ rq->out[no] = erofs_allocpage(pagepool, GFP_KERNEL | __GFP_NOFAIL); + set_page_private(rq->out[no], + Z_EROFS_SHORTLIVED_PAGE); + } if (rq->out[no]) strm->buf.out = kmap(rq->out[no]) + pageofs; pageofs = 0; diff --git a/fs/erofs/erofs_fs.h b/fs/erofs/erofs_fs.h index 2c7b16e340fe..a03ec70ba6f2 100644 --- a/fs/erofs/erofs_fs.h +++ b/fs/erofs/erofs_fs.h @@ -13,6 +13,7 @@ #define EROFS_FEATURE_COMPAT_SB_CHKSUM 0x00000001 #define EROFS_FEATURE_COMPAT_MTIME 0x00000002 +#define EROFS_FEATURE_COMPAT_XATTR_FILTER 0x00000004 /* * Any bits that aren't in EROFS_ALL_FEATURE_INCOMPAT should @@ -81,7 +82,8 @@ struct erofs_super_block { __u8 xattr_prefix_count; /* # of long xattr name prefixes */ __le32 xattr_prefix_start; /* start of long xattr prefixes */ __le64 packed_nid; /* nid of the special packed inode */ - __u8 reserved2[24]; + __u8 xattr_filter_reserved; /* reserved for xattr name filter */ + __u8 reserved2[23]; }; /* @@ -200,7 +202,7 @@ struct erofs_inode_extended { * for read-only fs, no need to introduce h_refcount */ struct erofs_xattr_ibody_header { - __le32 h_reserved; + __le32 h_name_filter; /* bit value 1 indicates not-present */ __u8 h_shared_count; __u8 h_reserved2[7]; __le32 h_shared_xattrs[]; /* shared xattr id array */ @@ -221,6 +223,10 @@ struct erofs_xattr_ibody_header { #define EROFS_XATTR_LONG_PREFIX 0x80 #define EROFS_XATTR_LONG_PREFIX_MASK 0x7f +#define EROFS_XATTR_FILTER_BITS 32 +#define EROFS_XATTR_FILTER_DEFAULT UINT32_MAX +#define EROFS_XATTR_FILTER_SEED 0x25BBE08F + /* xattr entry (for both inline & shared xattrs) */ struct erofs_xattr_entry { __u8 e_name_len; /* length of name */ @@ -289,6 +295,7 @@ struct erofs_dirent { enum { Z_EROFS_COMPRESSION_LZ4 = 0, Z_EROFS_COMPRESSION_LZMA = 1, + Z_EROFS_COMPRESSION_DEFLATE = 2, Z_EROFS_COMPRESSION_MAX }; #define Z_EROFS_ALL_COMPR_ALGS ((1 << Z_EROFS_COMPRESSION_MAX) - 1) @@ -309,6 +316,12 @@ struct z_erofs_lzma_cfgs { #define Z_EROFS_LZMA_MAX_DICT_SIZE (8 * Z_EROFS_PCLUSTER_MAX_SIZE) +/* 6 bytes (+ length field = 8 bytes) */ +struct z_erofs_deflate_cfgs { + u8 windowbits; /* 8..15 for DEFLATE */ + u8 reserved[5]; +} __packed; + /* * bit 0 : COMPACTED_2B indexes (0 - off; 1 - on) * e.g. for 4k logical cluster size, 4B if compacted 2B is off; diff --git a/fs/erofs/inode.c b/fs/erofs/inode.c index e12592727a54..14a79d3226ab 100644 --- a/fs/erofs/inode.c +++ b/fs/erofs/inode.c @@ -15,11 +15,11 @@ static void *erofs_read_inode(struct erofs_buf *buf, struct erofs_sb_info *sbi = EROFS_SB(sb); struct erofs_inode *vi = EROFS_I(inode); const erofs_off_t inode_loc = erofs_iloc(inode); - erofs_blk_t blkaddr, nblks = 0; void *kaddr; struct erofs_inode_compact *dic; struct erofs_inode_extended *die, *copied = NULL; + union erofs_inode_i_u iu; unsigned int ifmt; int err; @@ -35,9 +35,8 @@ static void *erofs_read_inode(struct erofs_buf *buf, dic = kaddr + *ofs; ifmt = le16_to_cpu(dic->i_format); - if (ifmt & ~EROFS_I_ALL) { - erofs_err(inode->i_sb, "unsupported i_format %u of nid %llu", + erofs_err(sb, "unsupported i_format %u of nid %llu", ifmt, vi->nid); err = -EOPNOTSUPP; goto err_out; @@ -45,7 +44,7 @@ static void *erofs_read_inode(struct erofs_buf *buf, vi->datalayout = erofs_inode_datalayout(ifmt); if (vi->datalayout >= EROFS_INODE_DATALAYOUT_MAX) { - erofs_err(inode->i_sb, "unsupported datalayout %u of nid %llu", + erofs_err(sb, "unsupported datalayout %u of nid %llu", vi->datalayout, vi->nid); err = -EOPNOTSUPP; goto err_out; @@ -82,40 +81,15 @@ static void *erofs_read_inode(struct erofs_buf *buf, vi->xattr_isize = erofs_xattr_ibody_size(die->i_xattr_icount); inode->i_mode = le16_to_cpu(die->i_mode); - switch (inode->i_mode & S_IFMT) { - case S_IFREG: - case S_IFDIR: - case S_IFLNK: - vi->raw_blkaddr = le32_to_cpu(die->i_u.raw_blkaddr); - break; - case S_IFCHR: - case S_IFBLK: - inode->i_rdev = - new_decode_dev(le32_to_cpu(die->i_u.rdev)); - break; - case S_IFIFO: - case S_IFSOCK: - inode->i_rdev = 0; - break; - default: - goto bogusimode; - } + iu = die->i_u; i_uid_write(inode, le32_to_cpu(die->i_uid)); i_gid_write(inode, le32_to_cpu(die->i_gid)); set_nlink(inode, le32_to_cpu(die->i_nlink)); - - /* extended inode has its own timestamp */ - inode->i_ctime.tv_sec = le64_to_cpu(die->i_mtime); - inode->i_ctime.tv_nsec = le32_to_cpu(die->i_mtime_nsec); + /* each extended inode has its own timestamp */ + inode_set_ctime(inode, le64_to_cpu(die->i_mtime), + le32_to_cpu(die->i_mtime_nsec)); inode->i_size = le64_to_cpu(die->i_size); - - /* total blocks for compressed files */ - if (erofs_inode_is_data_compressed(vi->datalayout)) - nblks = le32_to_cpu(die->i_u.compressed_blocks); - else if (vi->datalayout == EROFS_INODE_CHUNK_BASED) - /* fill chunked inode summary info */ - vi->chunkformat = le16_to_cpu(die->i_u.c.format); kfree(copied); copied = NULL; break; @@ -125,50 +99,51 @@ static void *erofs_read_inode(struct erofs_buf *buf, vi->xattr_isize = erofs_xattr_ibody_size(dic->i_xattr_icount); inode->i_mode = le16_to_cpu(dic->i_mode); - switch (inode->i_mode & S_IFMT) { - case S_IFREG: - case S_IFDIR: - case S_IFLNK: - vi->raw_blkaddr = le32_to_cpu(dic->i_u.raw_blkaddr); - break; - case S_IFCHR: - case S_IFBLK: - inode->i_rdev = - new_decode_dev(le32_to_cpu(dic->i_u.rdev)); - break; - case S_IFIFO: - case S_IFSOCK: - inode->i_rdev = 0; - break; - default: - goto bogusimode; - } + iu = dic->i_u; i_uid_write(inode, le16_to_cpu(dic->i_uid)); i_gid_write(inode, le16_to_cpu(dic->i_gid)); set_nlink(inode, le16_to_cpu(dic->i_nlink)); - /* use build time for compact inodes */ - inode->i_ctime.tv_sec = sbi->build_time; - inode->i_ctime.tv_nsec = sbi->build_time_nsec; + inode_set_ctime(inode, sbi->build_time, sbi->build_time_nsec); inode->i_size = le32_to_cpu(dic->i_size); - if (erofs_inode_is_data_compressed(vi->datalayout)) - nblks = le32_to_cpu(dic->i_u.compressed_blocks); - else if (vi->datalayout == EROFS_INODE_CHUNK_BASED) - vi->chunkformat = le16_to_cpu(dic->i_u.c.format); break; default: - erofs_err(inode->i_sb, - "unsupported on-disk inode version %u of nid %llu", + erofs_err(sb, "unsupported on-disk inode version %u of nid %llu", erofs_inode_version(ifmt), vi->nid); err = -EOPNOTSUPP; goto err_out; } - if (vi->datalayout == EROFS_INODE_CHUNK_BASED) { + switch (inode->i_mode & S_IFMT) { + case S_IFREG: + case S_IFDIR: + case S_IFLNK: + vi->raw_blkaddr = le32_to_cpu(iu.raw_blkaddr); + break; + case S_IFCHR: + case S_IFBLK: + inode->i_rdev = new_decode_dev(le32_to_cpu(iu.rdev)); + break; + case S_IFIFO: + case S_IFSOCK: + inode->i_rdev = 0; + break; + default: + erofs_err(sb, "bogus i_mode (%o) @ nid %llu", inode->i_mode, + vi->nid); + err = -EFSCORRUPTED; + goto err_out; + } + + /* total blocks for compressed files */ + if (erofs_inode_is_data_compressed(vi->datalayout)) { + nblks = le32_to_cpu(iu.compressed_blocks); + } else if (vi->datalayout == EROFS_INODE_CHUNK_BASED) { + /* fill chunked inode summary info */ + vi->chunkformat = le16_to_cpu(iu.c.format); if (vi->chunkformat & ~EROFS_CHUNK_FORMAT_ALL) { - erofs_err(inode->i_sb, - "unsupported chunk format %x of nid %llu", + erofs_err(sb, "unsupported chunk format %x of nid %llu", vi->chunkformat, vi->nid); err = -EOPNOTSUPP; goto err_out; @@ -176,10 +151,8 @@ static void *erofs_read_inode(struct erofs_buf *buf, vi->chunkbits = sb->s_blocksize_bits + (vi->chunkformat & EROFS_CHUNK_FORMAT_BLKBITS_MASK); } - inode->i_mtime.tv_sec = inode->i_ctime.tv_sec; - inode->i_atime.tv_sec = inode->i_ctime.tv_sec; - inode->i_mtime.tv_nsec = inode->i_ctime.tv_nsec; - inode->i_atime.tv_nsec = inode->i_ctime.tv_nsec; + inode_set_mtime_to_ts(inode, + inode_set_atime_to_ts(inode, inode_get_ctime(inode))); inode->i_flags &= ~S_DAX; if (test_opt(&sbi->opt, DAX_ALWAYS) && S_ISREG(inode->i_mode) && @@ -194,10 +167,6 @@ static void *erofs_read_inode(struct erofs_buf *buf, inode->i_blocks = nblks << (sb->s_blocksize_bits - 9); return kaddr; -bogusimode: - erofs_err(inode->i_sb, "bogus i_mode (%o) @ nid %llu", - inode->i_mode, vi->nid); - err = -EFSCORRUPTED; err_out: DBG_BUGON(1); kfree(copied); @@ -373,7 +342,7 @@ int erofs_getattr(struct mnt_idmap *idmap, const struct path *path, stat->attributes_mask |= (STATX_ATTR_COMPRESSED | STATX_ATTR_IMMUTABLE); - generic_fillattr(idmap, inode, stat); + generic_fillattr(idmap, request_mask, inode, stat); return 0; } diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h index 36e32fa542f0..b0409badb017 100644 --- a/fs/erofs/internal.h +++ b/fs/erofs/internal.h @@ -8,8 +8,10 @@ #define __EROFS_INTERNAL_H #include <linux/fs.h> +#include <linux/dax.h> #include <linux/dcache.h> #include <linux/mm.h> +#include <linux/module.h> #include <linux/pagemap.h> #include <linux/bio.h> #include <linux/magic.h> @@ -47,7 +49,7 @@ typedef u32 erofs_blk_t; struct erofs_device_info { char *path; struct erofs_fscache *fscache; - struct block_device *bdev; + struct bdev_handle *bdev_handle; struct dax_device *dax_dev; u64 dax_part_off; @@ -151,6 +153,7 @@ struct erofs_sb_info { u32 xattr_prefix_start; u8 xattr_prefix_count; struct erofs_xattr_prefix_item *xattr_prefixes; + unsigned int xattr_filter_reserved; #endif u16 device_id_mask; /* valid bits of device id to be used */ @@ -227,8 +230,6 @@ struct erofs_buf { }; #define __EROFS_BUF_INITIALIZER ((struct erofs_buf){ .page = NULL }) -#define ROOT_NID(sb) ((sb)->root_nid) - #define erofs_blknr(sb, addr) ((addr) >> (sb)->s_blocksize_bits) #define erofs_blkoff(sb, addr) ((addr) & ((sb)->s_blocksize - 1)) #define erofs_pos(sb, blk) ((erofs_off_t)(blk) << (sb)->s_blocksize_bits) @@ -251,6 +252,7 @@ EROFS_FEATURE_FUNCS(fragments, incompat, INCOMPAT_FRAGMENTS) EROFS_FEATURE_FUNCS(dedupe, incompat, INCOMPAT_DEDUPE) EROFS_FEATURE_FUNCS(xattr_prefixes, incompat, INCOMPAT_XATTR_PREFIXES) EROFS_FEATURE_FUNCS(sb_chksum, compat, COMPAT_SB_CHKSUM) +EROFS_FEATURE_FUNCS(xattr_filter, compat, COMPAT_XATTR_FILTER) /* atomic flag definitions */ #define EROFS_I_EA_INITED_BIT 0 @@ -270,6 +272,7 @@ struct erofs_inode { unsigned char inode_isize; unsigned int xattr_isize; + unsigned int xattr_name_filter; unsigned int xattr_shared_count; unsigned int *xattr_shared_xattrs; @@ -466,9 +469,6 @@ int __init z_erofs_init_zip_subsystem(void); void z_erofs_exit_zip_subsystem(void); int erofs_try_to_free_all_cached_pages(struct erofs_sb_info *sbi, struct erofs_workgroup *egrp); -int z_erofs_load_lz4_config(struct super_block *sb, - struct erofs_super_block *dsb, - struct z_erofs_lz4_cfgs *lz4, int len); int z_erofs_map_blocks_iter(struct inode *inode, struct erofs_map_blocks *map, int flags); void *erofs_get_pcpubuf(unsigned int requiredpages); @@ -477,6 +477,7 @@ int erofs_pcpubuf_growsize(unsigned int nrpages); void __init erofs_pcpubuf_init(void); void erofs_pcpubuf_exit(void); int erofs_init_managed_cache(struct super_block *sb); +int z_erofs_parse_cfgs(struct super_block *sb, struct erofs_super_block *dsb); #else static inline void erofs_shrinker_register(struct super_block *sb) {} static inline void erofs_shrinker_unregister(struct super_block *sb) {} @@ -484,16 +485,6 @@ static inline int erofs_init_shrinker(void) { return 0; } static inline void erofs_exit_shrinker(void) {} static inline int z_erofs_init_zip_subsystem(void) { return 0; } static inline void z_erofs_exit_zip_subsystem(void) {} -static inline int z_erofs_load_lz4_config(struct super_block *sb, - struct erofs_super_block *dsb, - struct z_erofs_lz4_cfgs *lz4, int len) -{ - if (lz4 || dsb->u1.lz4_max_distance) { - erofs_err(sb, "lz4 algorithm isn't enabled"); - return -EINVAL; - } - return 0; -} static inline void erofs_pcpubuf_init(void) {} static inline void erofs_pcpubuf_exit(void) {} static inline int erofs_init_managed_cache(struct super_block *sb) { return 0; } @@ -502,23 +493,19 @@ static inline int erofs_init_managed_cache(struct super_block *sb) { return 0; } #ifdef CONFIG_EROFS_FS_ZIP_LZMA int __init z_erofs_lzma_init(void); void z_erofs_lzma_exit(void); -int z_erofs_load_lzma_config(struct super_block *sb, - struct erofs_super_block *dsb, - struct z_erofs_lzma_cfgs *lzma, int size); #else static inline int z_erofs_lzma_init(void) { return 0; } static inline int z_erofs_lzma_exit(void) { return 0; } -static inline int z_erofs_load_lzma_config(struct super_block *sb, - struct erofs_super_block *dsb, - struct z_erofs_lzma_cfgs *lzma, int size) { - if (lzma) { - erofs_err(sb, "lzma algorithm isn't enabled"); - return -EINVAL; - } - return 0; -} #endif /* !CONFIG_EROFS_FS_ZIP_LZMA */ +#ifdef CONFIG_EROFS_FS_ZIP_DEFLATE +int __init z_erofs_deflate_init(void); +void z_erofs_deflate_exit(void); +#else +static inline int z_erofs_deflate_init(void) { return 0; } +static inline int z_erofs_deflate_exit(void) { return 0; } +#endif /* !CONFIG_EROFS_FS_ZIP_DEFLATE */ + #ifdef CONFIG_EROFS_FS_ONDEMAND int erofs_fscache_register_fs(struct super_block *sb); void erofs_fscache_unregister_fs(struct super_block *sb); diff --git a/fs/erofs/super.c b/fs/erofs/super.c index 566f68ddfa36..3789d6224513 100644 --- a/fs/erofs/super.c +++ b/fs/erofs/super.c @@ -4,14 +4,11 @@ * https://www.huawei.com/ * Copyright (C) 2021, Alibaba Cloud */ -#include <linux/module.h> #include <linux/statfs.h> -#include <linux/parser.h> #include <linux/seq_file.h> #include <linux/crc32c.h> #include <linux/fs_context.h> #include <linux/fs_parser.h> -#include <linux/dax.h> #include <linux/exportfs.h> #include "xattr.h" @@ -19,10 +16,8 @@ #include <trace/events/erofs.h> static struct kmem_cache *erofs_inode_cachep __read_mostly; -struct file_system_type erofs_fs_type; -void _erofs_err(struct super_block *sb, const char *function, - const char *fmt, ...) +void _erofs_err(struct super_block *sb, const char *func, const char *fmt, ...) { struct va_format vaf; va_list args; @@ -32,12 +27,11 @@ void _erofs_err(struct super_block *sb, const char *function, vaf.fmt = fmt; vaf.va = &args; - pr_err("(device %s): %s: %pV", sb->s_id, function, &vaf); + pr_err("(device %s): %s: %pV", sb->s_id, func, &vaf); va_end(args); } -void _erofs_info(struct super_block *sb, const char *function, - const char *fmt, ...) +void _erofs_info(struct super_block *sb, const char *func, const char *fmt, ...) { struct va_format vaf; va_list args; @@ -102,11 +96,9 @@ static void erofs_free_inode(struct inode *inode) { struct erofs_inode *vi = EROFS_I(inode); - /* be careful of RCU symlink path */ if (inode->i_op == &erofs_fast_symlink_iops) kfree(inode->i_link); kfree(vi->xattr_shared_xattrs); - kmem_cache_free(erofs_inode_cachep, vi); } @@ -119,8 +111,7 @@ static bool check_layout_compatibility(struct super_block *sb, /* check if current kernel meets all mandatory requirements */ if (feature & (~EROFS_ALL_FEATURE_INCOMPAT)) { - erofs_err(sb, - "unidentified incompatible feature %x, please upgrade kernel version", + erofs_err(sb, "unidentified incompatible feature %x, please upgrade kernel", feature & ~EROFS_ALL_FEATURE_INCOMPAT); return false; } @@ -162,65 +153,15 @@ void *erofs_read_metadata(struct super_block *sb, struct erofs_buf *buf, return buffer; } -#ifdef CONFIG_EROFS_FS_ZIP -static int erofs_load_compr_cfgs(struct super_block *sb, - struct erofs_super_block *dsb) +#ifndef CONFIG_EROFS_FS_ZIP +static int z_erofs_parse_cfgs(struct super_block *sb, + struct erofs_super_block *dsb) { - struct erofs_sb_info *sbi = EROFS_SB(sb); - struct erofs_buf buf = __EROFS_BUF_INITIALIZER; - unsigned int algs, alg; - erofs_off_t offset; - int size, ret = 0; - - sbi->available_compr_algs = le16_to_cpu(dsb->u1.available_compr_algs); - if (sbi->available_compr_algs & ~Z_EROFS_ALL_COMPR_ALGS) { - erofs_err(sb, "try to load compressed fs with unsupported algorithms %x", - sbi->available_compr_algs & ~Z_EROFS_ALL_COMPR_ALGS); - return -EINVAL; - } - - erofs_init_metabuf(&buf, sb); - offset = EROFS_SUPER_OFFSET + sbi->sb_size; - alg = 0; - for (algs = sbi->available_compr_algs; algs; algs >>= 1, ++alg) { - void *data; - - if (!(algs & 1)) - continue; - - data = erofs_read_metadata(sb, &buf, &offset, &size); - if (IS_ERR(data)) { - ret = PTR_ERR(data); - break; - } + if (!dsb->u1.available_compr_algs) + return 0; - switch (alg) { - case Z_EROFS_COMPRESSION_LZ4: - ret = z_erofs_load_lz4_config(sb, dsb, data, size); - break; - case Z_EROFS_COMPRESSION_LZMA: - ret = z_erofs_load_lzma_config(sb, dsb, data, size); - break; - default: - DBG_BUGON(1); - ret = -EFAULT; - } - kfree(data); - if (ret) - break; - } - erofs_put_metabuf(&buf); - return ret; -} -#else -static int erofs_load_compr_cfgs(struct super_block *sb, - struct erofs_super_block *dsb) -{ - if (dsb->u1.available_compr_algs) { - erofs_err(sb, "try to load compressed fs when compression is disabled"); - return -EINVAL; - } - return 0; + erofs_err(sb, "compression disabled, unable to mount compressed EROFS"); + return -EOPNOTSUPP; } #endif @@ -230,7 +171,7 @@ static int erofs_init_device(struct erofs_buf *buf, struct super_block *sb, struct erofs_sb_info *sbi = EROFS_SB(sb); struct erofs_fscache *fscache; struct erofs_deviceslot *dis; - struct block_device *bdev; + struct bdev_handle *bdev_handle; void *ptr; ptr = erofs_read_metabuf(buf, sb, erofs_blknr(sb, *pos), EROFS_KMAP); @@ -238,7 +179,7 @@ static int erofs_init_device(struct erofs_buf *buf, struct super_block *sb, return PTR_ERR(ptr); dis = ptr + erofs_blkoff(sb, *pos); - if (!dif->path) { + if (!sbi->devs->flatdev && !dif->path) { if (!dis->tag[0]) { erofs_err(sb, "empty device tag @ pos %llu", *pos); return -EINVAL; @@ -254,13 +195,13 @@ static int erofs_init_device(struct erofs_buf *buf, struct super_block *sb, return PTR_ERR(fscache); dif->fscache = fscache; } else if (!sbi->devs->flatdev) { - bdev = blkdev_get_by_path(dif->path, BLK_OPEN_READ, sb->s_type, - NULL); - if (IS_ERR(bdev)) - return PTR_ERR(bdev); - dif->bdev = bdev; - dif->dax_dev = fs_dax_get_by_bdev(bdev, &dif->dax_part_off, - NULL, NULL); + bdev_handle = bdev_open_by_path(dif->path, BLK_OPEN_READ, + sb->s_type, NULL); + if (IS_ERR(bdev_handle)) + return PTR_ERR(bdev_handle); + dif->bdev_handle = bdev_handle; + dif->dax_dev = fs_dax_get_by_bdev(bdev_handle->bdev, + &dif->dax_part_off, NULL, NULL); } dif->blocks = le32_to_cpu(dis->blocks); @@ -388,6 +329,7 @@ static int erofs_read_superblock(struct super_block *sb) sbi->xattr_blkaddr = le32_to_cpu(dsb->xattr_blkaddr); sbi->xattr_prefix_start = le32_to_cpu(dsb->xattr_prefix_start); sbi->xattr_prefix_count = dsb->xattr_prefix_count; + sbi->xattr_filter_reserved = dsb->xattr_filter_reserved; #endif sbi->islotbits = ilog2(sizeof(struct erofs_inode_compact)); sbi->root_nid = le16_to_cpu(dsb->root_nid); @@ -408,10 +350,7 @@ static int erofs_read_superblock(struct super_block *sb) } /* parse on-disk compression configurations */ - if (erofs_sb_has_compr_cfgs(sbi)) - ret = erofs_load_compr_cfgs(sb, dsb); - else - ret = z_erofs_load_lz4_config(sb, dsb, NULL, 0); + ret = z_erofs_parse_cfgs(sb, dsb); if (ret < 0) goto out; @@ -420,16 +359,11 @@ static int erofs_read_superblock(struct super_block *sb) if (erofs_is_fscache_mode(sb)) erofs_info(sb, "EXPERIMENTAL fscache-based on-demand read feature in use. Use at your own risk!"); - if (erofs_sb_has_fragments(sbi)) - erofs_info(sb, "EXPERIMENTAL compressed fragments feature in use. Use at your own risk!"); - if (erofs_sb_has_dedupe(sbi)) - erofs_info(sb, "EXPERIMENTAL global deduplication feature in use. Use at your own risk!"); out: erofs_put_metabuf(&buf); return ret; } -/* set up default EROFS parameters */ static void erofs_default_options(struct erofs_fs_context *ctx) { #ifdef CONFIG_EROFS_FS_ZIP @@ -633,6 +567,7 @@ static struct dentry *erofs_get_parent(struct dentry *child) } static const struct export_operations erofs_export_ops = { + .encode_fh = generic_encode_ino32_fh, .fh_to_dentry = erofs_fh_to_dentry, .fh_to_parent = erofs_fh_to_parent, .get_parent = erofs_get_parent, @@ -731,14 +666,13 @@ static int erofs_fc_fill_super(struct super_block *sb, struct fs_context *fc) xa_init(&sbi->managed_pslots); #endif - /* get the root inode */ - inode = erofs_iget(sb, ROOT_NID(sbi)); + inode = erofs_iget(sb, sbi->root_nid); if (IS_ERR(inode)) return PTR_ERR(inode); if (!S_ISDIR(inode->i_mode)) { erofs_err(sb, "rootino(nid %llu) is not a directory(i_mode %o)", - ROOT_NID(sbi), inode->i_mode); + sbi->root_nid, inode->i_mode); iput(inode); return -EINVAL; } @@ -748,7 +682,6 @@ static int erofs_fc_fill_super(struct super_block *sb, struct fs_context *fc) return -ENOMEM; erofs_shrinker_register(sb); - /* sb->s_umount is already locked, SB_ACTIVE and SB_BORN are not set */ if (erofs_sb_has_fragments(sbi) && sbi->packed_nid) { sbi->packed_inode = erofs_iget(sb, sbi->packed_nid); if (IS_ERR(sbi->packed_inode)) { @@ -769,7 +702,7 @@ static int erofs_fc_fill_super(struct super_block *sb, struct fs_context *fc) if (err) return err; - erofs_info(sb, "mounted with root inode @ nid %llu.", ROOT_NID(sbi)); + erofs_info(sb, "mounted with root inode @ nid %llu.", sbi->root_nid); return 0; } @@ -815,8 +748,8 @@ static int erofs_release_device_info(int id, void *ptr, void *data) struct erofs_device_info *dif = ptr; fs_put_dax(dif->dax_dev, NULL); - if (dif->bdev) - blkdev_put(dif->bdev, &erofs_fs_type); + if (dif->bdev_handle) + bdev_release(dif->bdev_handle); erofs_fscache_unregister_cookie(dif->fscache); dif->fscache = NULL; kfree(dif->path); @@ -881,10 +814,6 @@ static int erofs_init_fs_context(struct fs_context *fc) return 0; } -/* - * could be triggered after deactivate_locked_super() - * is called, thus including umount and failed to initialize. - */ static void erofs_kill_sb(struct super_block *sb) { struct erofs_sb_info *sbi; @@ -913,7 +842,6 @@ static void erofs_kill_sb(struct super_block *sb) sb->s_fs_info = NULL; } -/* called when ->s_root is non-NULL */ static void erofs_put_super(struct super_block *sb) { struct erofs_sb_info *const sbi = EROFS_SB(sb); @@ -950,9 +878,9 @@ static int __init erofs_module_init(void) erofs_check_ondisk_layout_definitions(); erofs_inode_cachep = kmem_cache_create("erofs_inode", - sizeof(struct erofs_inode), 0, - SLAB_RECLAIM_ACCOUNT, - erofs_inode_init_once); + sizeof(struct erofs_inode), 0, + SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD | SLAB_ACCOUNT, + erofs_inode_init_once); if (!erofs_inode_cachep) return -ENOMEM; @@ -964,6 +892,10 @@ static int __init erofs_module_init(void) if (err) goto lzma_err; + err = z_erofs_deflate_init(); + if (err) + goto deflate_err; + erofs_pcpubuf_init(); err = z_erofs_init_zip_subsystem(); if (err) @@ -984,6 +916,8 @@ fs_err: sysfs_err: z_erofs_exit_zip_subsystem(); zip_err: + z_erofs_deflate_exit(); +deflate_err: z_erofs_lzma_exit(); lzma_err: erofs_exit_shrinker(); @@ -1001,13 +935,13 @@ static void __exit erofs_module_exit(void) erofs_exit_sysfs(); z_erofs_exit_zip_subsystem(); + z_erofs_deflate_exit(); z_erofs_lzma_exit(); erofs_exit_shrinker(); kmem_cache_destroy(erofs_inode_cachep); erofs_pcpubuf_exit(); } -/* get filesystem statistics */ static int erofs_statfs(struct dentry *dentry, struct kstatfs *buf) { struct super_block *sb = dentry->d_sb; diff --git a/fs/erofs/utils.c b/fs/erofs/utils.c index cc6fb9e98899..5dea308764b4 100644 --- a/fs/erofs/utils.c +++ b/fs/erofs/utils.c @@ -77,12 +77,7 @@ struct erofs_workgroup *erofs_insert_workgroup(struct super_block *sb, struct erofs_sb_info *const sbi = EROFS_SB(sb); struct erofs_workgroup *pre; - /* - * Bump up before making this visible to others for the XArray in order - * to avoid potential UAF without serialized by xa_lock. - */ - lockref_get(&grp->lockref); - + DBG_BUGON(grp->lockref.count < 1); repeat: xa_lock(&sbi->managed_pslots); pre = __xa_cmpxchg(&sbi->managed_pslots, grp->index, @@ -96,7 +91,6 @@ repeat: cond_resched(); goto repeat; } - lockref_put_return(&grp->lockref); grp = pre; } xa_unlock(&sbi->managed_pslots); @@ -270,19 +264,24 @@ static unsigned long erofs_shrink_scan(struct shrinker *shrink, return freed; } -static struct shrinker erofs_shrinker_info = { - .scan_objects = erofs_shrink_scan, - .count_objects = erofs_shrink_count, - .seeks = DEFAULT_SEEKS, -}; +static struct shrinker *erofs_shrinker_info; int __init erofs_init_shrinker(void) { - return register_shrinker(&erofs_shrinker_info, "erofs-shrinker"); + erofs_shrinker_info = shrinker_alloc(0, "erofs-shrinker"); + if (!erofs_shrinker_info) + return -ENOMEM; + + erofs_shrinker_info->count_objects = erofs_shrink_count; + erofs_shrinker_info->scan_objects = erofs_shrink_scan; + + shrinker_register(erofs_shrinker_info); + + return 0; } void erofs_exit_shrinker(void) { - unregister_shrinker(&erofs_shrinker_info); + shrinker_free(erofs_shrinker_info); } #endif /* !CONFIG_EROFS_FS_ZIP */ diff --git a/fs/erofs/xattr.c b/fs/erofs/xattr.c index 40178b6e0688..b58316b49a43 100644 --- a/fs/erofs/xattr.c +++ b/fs/erofs/xattr.c @@ -5,6 +5,7 @@ * Copyright (C) 2021-2022, Alibaba Cloud */ #include <linux/security.h> +#include <linux/xxhash.h> #include "xattr.h" struct erofs_xattr_iter { @@ -87,6 +88,7 @@ static int erofs_init_inode_xattrs(struct inode *inode) } ih = it.kaddr + erofs_blkoff(sb, it.pos); + vi->xattr_name_filter = le32_to_cpu(ih->h_name_filter); vi->xattr_shared_count = ih->h_shared_count; vi->xattr_shared_xattrs = kmalloc_array(vi->xattr_shared_count, sizeof(uint), GFP_KERNEL); @@ -166,7 +168,7 @@ const struct xattr_handler __maybe_unused erofs_xattr_security_handler = { }; #endif -const struct xattr_handler *erofs_xattr_handlers[] = { +const struct xattr_handler * const erofs_xattr_handlers[] = { &erofs_xattr_user_handler, &erofs_xattr_trusted_handler, #ifdef CONFIG_EROFS_FS_SECURITY @@ -392,7 +394,10 @@ int erofs_getxattr(struct inode *inode, int index, const char *name, void *buffer, size_t buffer_size) { int ret; + unsigned int hashbit; struct erofs_xattr_iter it; + struct erofs_inode *vi = EROFS_I(inode); + struct erofs_sb_info *sbi = EROFS_SB(inode->i_sb); if (!name) return -EINVAL; @@ -401,6 +406,15 @@ int erofs_getxattr(struct inode *inode, int index, const char *name, if (ret) return ret; + /* reserved flag is non-zero if there's any change of on-disk format */ + if (erofs_sb_has_xattr_filter(sbi) && !sbi->xattr_filter_reserved) { + hashbit = xxh32(name, strlen(name), + EROFS_XATTR_FILTER_SEED + index); + hashbit &= EROFS_XATTR_FILTER_BITS - 1; + if (vi->xattr_name_filter & (1U << hashbit)) + return -ENOATTR; + } + it.index = index; it.name = (struct qstr)QSTR_INIT(name, strlen(name)); if (it.name.len > EROFS_NAME_LEN) diff --git a/fs/erofs/xattr.h b/fs/erofs/xattr.h index f16283cb8c93..b246cd0e135e 100644 --- a/fs/erofs/xattr.h +++ b/fs/erofs/xattr.h @@ -23,7 +23,7 @@ static inline const char *erofs_xattr_prefix(unsigned int idx, { const struct xattr_handler *handler = NULL; - static const struct xattr_handler *xattr_handler_map[] = { + static const struct xattr_handler * const xattr_handler_map[] = { [EROFS_XATTR_INDEX_USER] = &erofs_xattr_user_handler, #ifdef CONFIG_EROFS_FS_POSIX_ACL [EROFS_XATTR_INDEX_POSIX_ACL_ACCESS] = &nop_posix_acl_access, @@ -44,7 +44,7 @@ static inline const char *erofs_xattr_prefix(unsigned int idx, return xattr_prefix(handler); } -extern const struct xattr_handler *erofs_xattr_handlers[]; +extern const struct xattr_handler * const erofs_xattr_handlers[]; int erofs_xattr_prefixes_init(struct super_block *sb); void erofs_xattr_prefixes_cleanup(struct super_block *sb); diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index de4f12152b62..a7e6847f6f8f 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -143,22 +143,17 @@ static inline void z_erofs_onlinepage_split(struct page *page) atomic_inc((atomic_t *)&page->private); } -static inline void z_erofs_page_mark_eio(struct page *page) +static void z_erofs_onlinepage_endio(struct page *page, int err) { - int orig; + int orig, v; + + DBG_BUGON(!PagePrivate(page)); do { orig = atomic_read((atomic_t *)&page->private); - } while (atomic_cmpxchg((atomic_t *)&page->private, orig, - orig | Z_EROFS_PAGE_EIO) != orig); -} - -static inline void z_erofs_onlinepage_endio(struct page *page) -{ - unsigned int v; + v = (orig - 1) | (err ? Z_EROFS_PAGE_EIO : 0); + } while (atomic_cmpxchg((atomic_t *)&page->private, orig, v) != orig); - DBG_BUGON(!PagePrivate(page)); - v = atomic_dec_return((atomic_t *)&page->private); if (!(v & ~Z_EROFS_PAGE_EIO)) { set_page_private(page, 0); ClearPagePrivate(page); @@ -507,19 +502,17 @@ enum z_erofs_pclustermode { */ Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE, /* - * The current collection has been linked with the owned chain, and - * could also be linked with the remaining collections, which means - * if the processing page is the tail page of the collection, thus - * the current collection can safely use the whole page (since - * the previous collection is under control) for in-place I/O, as - * illustrated below: - * ________________________________________________________________ - * | tail (partial) page | head (partial) page | - * | (of the current cl) | (of the previous collection) | - * | | | - * |__PCLUSTER_FOLLOWED___|___________PCLUSTER_FOLLOWED____________| + * The pcluster was just linked to a decompression chain by us. It can + * also be linked with the remaining pclusters, which means if the + * processing page is the tail page of a pcluster, this pcluster can + * safely use the whole page (since the previous pcluster is within the + * same chain) for in-place I/O, as illustrated below: + * ___________________________________________________ + * | tail (partial) page | head (partial) page | + * | (of the current pcl) | (of the previous pcl) | + * |___PCLUSTER_FOLLOWED___|_____PCLUSTER_FOLLOWED_____| * - * [ (*) the above page can be used as inplace I/O. ] + * [ (*) the page above can be used as inplace I/O. ] */ Z_EROFS_PCLUSTER_FOLLOWED, }; @@ -535,8 +528,6 @@ struct z_erofs_decompress_frontend { z_erofs_next_pcluster_t owned_head; enum z_erofs_pclustermode mode; - /* used for applying cache strategy on the fly */ - bool backmost; erofs_off_t headoffset; /* a pointer used to pick up inplace I/O pages */ @@ -545,7 +536,7 @@ struct z_erofs_decompress_frontend { #define DECOMPRESS_FRONTEND_INIT(__i) { \ .inode = __i, .owned_head = Z_EROFS_PCLUSTER_TAIL, \ - .mode = Z_EROFS_PCLUSTER_FOLLOWED, .backmost = true } + .mode = Z_EROFS_PCLUSTER_FOLLOWED } static bool z_erofs_should_alloc_cache(struct z_erofs_decompress_frontend *fe) { @@ -554,7 +545,7 @@ static bool z_erofs_should_alloc_cache(struct z_erofs_decompress_frontend *fe) if (cachestrategy <= EROFS_ZIP_CACHE_DISABLED) return false; - if (fe->backmost) + if (!(fe->map.m_flags & EROFS_MAP_FULL_MAPPED)) return true; if (cachestrategy >= EROFS_ZIP_CACHE_READAROUND && @@ -805,6 +796,7 @@ static int z_erofs_register_pcluster(struct z_erofs_decompress_frontend *fe) return PTR_ERR(pcl); spin_lock_init(&pcl->obj.lockref.lock); + pcl->obj.lockref.count = 1; /* one ref for this request */ pcl->algorithmformat = map->m_algorithmformat; pcl->length = 0; pcl->partial = true; @@ -851,9 +843,11 @@ err_out: return err; } -static int z_erofs_collector_begin(struct z_erofs_decompress_frontend *fe) +static int z_erofs_pcluster_begin(struct z_erofs_decompress_frontend *fe) { struct erofs_map_blocks *map = &fe->map; + struct super_block *sb = fe->inode->i_sb; + erofs_blk_t blknr = erofs_blknr(sb, map->m_pa); struct erofs_workgroup *grp = NULL; int ret; @@ -863,8 +857,7 @@ static int z_erofs_collector_begin(struct z_erofs_decompress_frontend *fe) DBG_BUGON(fe->owned_head == Z_EROFS_PCLUSTER_NIL); if (!(map->m_flags & EROFS_MAP_META)) { - grp = erofs_find_workgroup(fe->inode->i_sb, - map->m_pa >> PAGE_SHIFT); + grp = erofs_find_workgroup(sb, blknr); } else if ((map->m_pa & ~PAGE_MASK) + map->m_plen > PAGE_SIZE) { DBG_BUGON(1); return -EFSCORRUPTED; @@ -883,9 +876,26 @@ static int z_erofs_collector_begin(struct z_erofs_decompress_frontend *fe) } else if (ret) { return ret; } + z_erofs_bvec_iter_begin(&fe->biter, &fe->pcl->bvset, Z_EROFS_INLINE_BVECS, fe->pcl->vcnt); - /* since file-backed online pages are traversed in reverse order */ + if (!z_erofs_is_inline_pcluster(fe->pcl)) { + /* bind cache first when cached decompression is preferred */ + z_erofs_bind_cache(fe); + } else { + void *mptr; + + mptr = erofs_read_metabuf(&map->buf, sb, blknr, EROFS_NO_KMAP); + if (IS_ERR(mptr)) { + ret = PTR_ERR(mptr); + erofs_err(sb, "failed to get inline data %d", ret); + return ret; + } + get_page(map->buf.page); + WRITE_ONCE(fe->pcl->compressed_bvecs[0].page, map->buf.page); + fe->mode = Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE; + } + /* file-backed inplace I/O pages are traversed in reverse order */ fe->icur = z_erofs_pclusterpages(fe->pcl); return 0; } @@ -908,12 +918,12 @@ void erofs_workgroup_free_rcu(struct erofs_workgroup *grp) call_rcu(&pcl->rcu, z_erofs_rcu_callback); } -static bool z_erofs_collector_end(struct z_erofs_decompress_frontend *fe) +static void z_erofs_pcluster_end(struct z_erofs_decompress_frontend *fe) { struct z_erofs_pcluster *pcl = fe->pcl; if (!pcl) - return false; + return; z_erofs_bvec_iter_end(&fe->biter); mutex_unlock(&pcl->lock); @@ -929,37 +939,29 @@ static bool z_erofs_collector_end(struct z_erofs_decompress_frontend *fe) erofs_workgroup_put(&pcl->obj); fe->pcl = NULL; - return true; } -static int z_erofs_read_fragment(struct inode *inode, erofs_off_t pos, - struct page *page, unsigned int pageofs, - unsigned int len) +static int z_erofs_read_fragment(struct super_block *sb, struct page *page, + unsigned int cur, unsigned int end, erofs_off_t pos) { - struct super_block *sb = inode->i_sb; - struct inode *packed_inode = EROFS_I_SB(inode)->packed_inode; + struct inode *packed_inode = EROFS_SB(sb)->packed_inode; struct erofs_buf buf = __EROFS_BUF_INITIALIZER; - u8 *src, *dst; - unsigned int i, cnt; + unsigned int cnt; + u8 *src; if (!packed_inode) return -EFSCORRUPTED; buf.inode = packed_inode; - pos += EROFS_I(inode)->z_fragmentoff; - for (i = 0; i < len; i += cnt) { - cnt = min_t(unsigned int, len - i, + for (; cur < end; cur += cnt, pos += cnt) { + cnt = min_t(unsigned int, end - cur, sb->s_blocksize - erofs_blkoff(sb, pos)); src = erofs_bread(&buf, erofs_blknr(sb, pos), EROFS_KMAP); if (IS_ERR(src)) { erofs_put_metabuf(&buf); return PTR_ERR(src); } - - dst = kmap_local_page(page); - memcpy(dst + pageofs + i, src + erofs_blkoff(sb, pos), cnt); - kunmap_local(dst); - pos += cnt; + memcpy_to_page(page, cur, src + erofs_blkoff(sb, pos), cnt); } erofs_put_metabuf(&buf); return 0; @@ -972,94 +974,60 @@ static int z_erofs_do_read_page(struct z_erofs_decompress_frontend *fe, struct erofs_map_blocks *const map = &fe->map; const loff_t offset = page_offset(page); bool tight = true, exclusive; - unsigned int cur, end, spiltted; + unsigned int cur, end, len, split; int err = 0; - /* register locked file pages as online pages in pack */ z_erofs_onlinepage_init(page); - spiltted = 0; + split = 0; end = PAGE_SIZE; repeat: - cur = end - 1; - - if (offset + cur < map->m_la || - offset + cur >= map->m_la + map->m_llen) { - if (z_erofs_collector_end(fe)) - fe->backmost = false; - map->m_la = offset + cur; + if (offset + end - 1 < map->m_la || + offset + end - 1 >= map->m_la + map->m_llen) { + z_erofs_pcluster_end(fe); + map->m_la = offset + end - 1; map->m_llen = 0; err = z_erofs_map_blocks_iter(inode, map, 0); if (err) goto out; - } else { - if (fe->pcl) - goto hitted; - /* didn't get a valid pcluster previously (very rare) */ } - if (!(map->m_flags & EROFS_MAP_MAPPED) || - map->m_flags & EROFS_MAP_FRAGMENT) - goto hitted; - - err = z_erofs_collector_begin(fe); - if (err) - goto out; - - if (z_erofs_is_inline_pcluster(fe->pcl)) { - void *mp; - - mp = erofs_read_metabuf(&fe->map.buf, inode->i_sb, - erofs_blknr(inode->i_sb, map->m_pa), - EROFS_NO_KMAP); - if (IS_ERR(mp)) { - err = PTR_ERR(mp); - erofs_err(inode->i_sb, - "failed to get inline page, err %d", err); - goto out; - } - get_page(fe->map.buf.page); - WRITE_ONCE(fe->pcl->compressed_bvecs[0].page, - fe->map.buf.page); - fe->mode = Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE; - } else { - /* bind cache first when cached decompression is preferred */ - z_erofs_bind_cache(fe); - } -hitted: - /* - * Ensure the current partial page belongs to this submit chain rather - * than other concurrent submit chains or the noio(bypass) chain since - * those chains are handled asynchronously thus the page cannot be used - * for inplace I/O or bvpage (should be processed in a strict order.) - */ - tight &= (fe->mode > Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE); + cur = offset > map->m_la ? 0 : map->m_la - offset; + /* bump split parts first to avoid several separate cases */ + ++split; - cur = end - min_t(erofs_off_t, offset + end - map->m_la, end); if (!(map->m_flags & EROFS_MAP_MAPPED)) { zero_user_segment(page, cur, end); + tight = false; goto next_part; } + if (map->m_flags & EROFS_MAP_FRAGMENT) { - unsigned int pageofs, skip, len; + erofs_off_t fpos = offset + cur - map->m_la; - if (offset > map->m_la) { - pageofs = 0; - skip = offset - map->m_la; - } else { - pageofs = map->m_la & ~PAGE_MASK; - skip = 0; - } - len = min_t(unsigned int, map->m_llen - skip, end - cur); - err = z_erofs_read_fragment(inode, skip, page, pageofs, len); + len = min_t(unsigned int, map->m_llen - fpos, end - cur); + err = z_erofs_read_fragment(inode->i_sb, page, cur, cur + len, + EROFS_I(inode)->z_fragmentoff + fpos); if (err) goto out; - ++spiltted; tight = false; goto next_part; } - exclusive = (!cur && (!spiltted || tight)); + if (!fe->pcl) { + err = z_erofs_pcluster_begin(fe); + if (err) + goto out; + } + + /* + * Ensure the current partial page belongs to this submit chain rather + * than other concurrent submit chains or the noio(bypass) chain since + * those chains are handled asynchronously thus the page cannot be used + * for inplace I/O or bvpage (should be processed in a strict order.) + */ + tight &= (fe->mode > Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE); + exclusive = (!cur && ((split <= 1) || tight)); if (cur) tight &= (fe->mode >= Z_EROFS_PCLUSTER_FOLLOWED); @@ -1072,8 +1040,6 @@ hitted: goto out; z_erofs_onlinepage_split(page); - /* bump up the number of spiltted parts of a page */ - ++spiltted; if (fe->pcl->pageofs_out != (map->m_la & ~PAGE_MASK)) fe->pcl->multibases = true; if (fe->pcl->length < offset + end - map->m_la) { @@ -1094,9 +1060,7 @@ next_part: goto repeat; out: - if (err) - z_erofs_page_mark_eio(page); - z_erofs_onlinepage_endio(page); + z_erofs_onlinepage_endio(page, err); return err; } @@ -1199,9 +1163,7 @@ static void z_erofs_fill_other_copies(struct z_erofs_decompress_backend *be, cur += len; } kunmap_local(dst); - if (err) - z_erofs_page_mark_eio(bvi->bvec.page); - z_erofs_onlinepage_endio(bvi->bvec.page); + z_erofs_onlinepage_endio(bvi->bvec.page, err); list_del(p); kfree(bvi); } @@ -1372,9 +1334,7 @@ out: /* recycle all individual short-lived pages */ if (z_erofs_put_shortlivedpage(be->pagepool, page)) continue; - if (err) - z_erofs_page_mark_eio(page); - z_erofs_onlinepage_endio(page); + z_erofs_onlinepage_endio(page, err); } if (be->decompressed_pages != be->onstack_pages) @@ -1410,7 +1370,10 @@ static void z_erofs_decompress_queue(const struct z_erofs_decompressqueue *io, owned = READ_ONCE(be.pcl->next); z_erofs_decompress_pcluster(&be, io->eio ? -EIO : 0); - erofs_workgroup_put(&be.pcl->obj); + if (z_erofs_is_inline_pcluster(be.pcl)) + z_erofs_free_pcluster(be.pcl); + else + erofs_workgroup_put(&be.pcl->obj); } } @@ -1848,15 +1811,10 @@ static void z_erofs_pcluster_readmore(struct z_erofs_decompress_frontend *f, page = erofs_grab_cache_page_nowait(inode->i_mapping, index); if (page) { - if (PageUptodate(page)) { + if (PageUptodate(page)) unlock_page(page); - } else { - err = z_erofs_do_read_page(f, page); - if (err) - erofs_err(inode->i_sb, - "readmore error at page %lu @ nid %llu", - index, EROFS_I(inode)->nid); - } + else + (void)z_erofs_do_read_page(f, page); put_page(page); } @@ -1868,25 +1826,25 @@ static void z_erofs_pcluster_readmore(struct z_erofs_decompress_frontend *f, static int z_erofs_read_folio(struct file *file, struct folio *folio) { - struct page *page = &folio->page; - struct inode *const inode = page->mapping->host; + struct inode *const inode = folio->mapping->host; struct erofs_sb_info *const sbi = EROFS_I_SB(inode); struct z_erofs_decompress_frontend f = DECOMPRESS_FRONTEND_INIT(inode); int err; - trace_erofs_readpage(page, false); - f.headoffset = (erofs_off_t)page->index << PAGE_SHIFT; + trace_erofs_read_folio(folio, false); + f.headoffset = (erofs_off_t)folio->index << PAGE_SHIFT; z_erofs_pcluster_readmore(&f, NULL, true); - err = z_erofs_do_read_page(&f, page); + err = z_erofs_do_read_page(&f, &folio->page); z_erofs_pcluster_readmore(&f, NULL, false); - (void)z_erofs_collector_end(&f); + z_erofs_pcluster_end(&f); /* if some compressed cluster ready, need submit them anyway */ z_erofs_runqueue(&f, z_erofs_is_sync_decompress(sbi, 0), false); - if (err) - erofs_err(inode->i_sb, "failed to read, err [%d]", err); + if (err && err != -EINTR) + erofs_err(inode->i_sb, "read error %d @ %lu of nid %llu", + err, folio->index, EROFS_I(inode)->nid); erofs_put_metabuf(&f.map.buf); erofs_release_pages(&f.pagepool); @@ -1898,38 +1856,35 @@ static void z_erofs_readahead(struct readahead_control *rac) struct inode *const inode = rac->mapping->host; struct erofs_sb_info *const sbi = EROFS_I_SB(inode); struct z_erofs_decompress_frontend f = DECOMPRESS_FRONTEND_INIT(inode); - struct page *head = NULL, *page; - unsigned int nr_pages; + struct folio *head = NULL, *folio; + unsigned int nr_folios; + int err; f.headoffset = readahead_pos(rac); z_erofs_pcluster_readmore(&f, rac, true); - nr_pages = readahead_count(rac); - trace_erofs_readpages(inode, readahead_index(rac), nr_pages, false); + nr_folios = readahead_count(rac); + trace_erofs_readpages(inode, readahead_index(rac), nr_folios, false); - while ((page = readahead_page(rac))) { - set_page_private(page, (unsigned long)head); - head = page; + while ((folio = readahead_folio(rac))) { + folio->private = head; + head = folio; } + /* traverse in reverse order for best metadata I/O performance */ while (head) { - struct page *page = head; - int err; - - /* traversal in reverse order */ - head = (void *)page_private(page); + folio = head; + head = folio_get_private(folio); - err = z_erofs_do_read_page(&f, page); - if (err) - erofs_err(inode->i_sb, - "readahead error at page %lu @ nid %llu", - page->index, EROFS_I(inode)->nid); - put_page(page); + err = z_erofs_do_read_page(&f, &folio->page); + if (err && err != -EINTR) + erofs_err(inode->i_sb, "readahead error at folio %lu @ nid %llu", + folio->index, EROFS_I(inode)->nid); } z_erofs_pcluster_readmore(&f, rac, false); - (void)z_erofs_collector_end(&f); + z_erofs_pcluster_end(&f); - z_erofs_runqueue(&f, z_erofs_is_sync_decompress(sbi, nr_pages), true); + z_erofs_runqueue(&f, z_erofs_is_sync_decompress(sbi, nr_folios), true); erofs_put_metabuf(&f.map.buf); erofs_release_pages(&f.pagepool); } diff --git a/fs/erofs/zmap.c b/fs/erofs/zmap.c index 1909ddafd9c7..7b55111fd533 100644 --- a/fs/erofs/zmap.c +++ b/fs/erofs/zmap.c @@ -561,8 +561,9 @@ static int z_erofs_do_map_blocks(struct inode *inode, if ((flags & EROFS_GET_BLOCKS_FIEMAP) || ((flags & EROFS_GET_BLOCKS_READMORE) && - map->m_algorithmformat == Z_EROFS_COMPRESSION_LZMA && - map->m_llen >= i_blocksize(inode))) { + (map->m_algorithmformat == Z_EROFS_COMPRESSION_LZMA || + map->m_algorithmformat == Z_EROFS_COMPRESSION_DEFLATE) && + map->m_llen >= i_blocksize(inode))) { err = z_erofs_get_extent_decompressedlen(&m); if (!err) map->m_flags |= EROFS_MAP_FULL_MAPPED; diff --git a/fs/eventfd.c b/fs/eventfd.c index 8aa36cd37351..33a918f9566c 100644 --- a/fs/eventfd.c +++ b/fs/eventfd.c @@ -189,7 +189,7 @@ void eventfd_ctx_do_read(struct eventfd_ctx *ctx, __u64 *cnt) { lockdep_assert_held(&ctx->wqh.lock); - *cnt = (ctx->flags & EFD_SEMAPHORE) ? 1 : ctx->count; + *cnt = ((ctx->flags & EFD_SEMAPHORE) && ctx->count) ? 1 : ctx->count; ctx->count -= *cnt; } EXPORT_SYMBOL_GPL(eventfd_ctx_do_read); diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 4b1b3362f697..2877cc01cff1 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -256,10 +256,10 @@ static u64 loop_check_gen = 0; static struct eventpoll *inserting_into; /* Slab cache used to allocate "struct epitem" */ -static struct kmem_cache *epi_cache __read_mostly; +static struct kmem_cache *epi_cache __ro_after_init; /* Slab cache used to allocate "struct eppoll_entry" */ -static struct kmem_cache *pwq_cache __read_mostly; +static struct kmem_cache *pwq_cache __ro_after_init; /* * List of files with newly added links, where we may need to limit the number @@ -271,7 +271,7 @@ struct epitems_head { }; static struct epitems_head *tfile_check_list = EP_UNACTIVE_PTR; -static struct kmem_cache *ephead_cache __read_mostly; +static struct kmem_cache *ephead_cache __ro_after_init; static inline void free_ephead(struct epitems_head *head) { @@ -975,15 +975,11 @@ again: static int ep_alloc(struct eventpoll **pep) { - int error; - struct user_struct *user; struct eventpoll *ep; - user = get_current_user(); - error = -ENOMEM; ep = kzalloc(sizeof(*ep), GFP_KERNEL); if (unlikely(!ep)) - goto free_uid; + return -ENOMEM; mutex_init(&ep->mtx); rwlock_init(&ep->lock); @@ -992,16 +988,12 @@ static int ep_alloc(struct eventpoll **pep) INIT_LIST_HEAD(&ep->rdllist); ep->rbr = RB_ROOT_CACHED; ep->ovflist = EP_UNACTIVE_PTR; - ep->user = user; + ep->user = get_current_user(); refcount_set(&ep->refcount, 1); *pep = ep; return 0; - -free_uid: - free_uid(user); - return error; } /* diff --git a/fs/exec.c b/fs/exec.c index 1a827d55ba94..4aa19b24f281 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -701,6 +701,7 @@ static int shift_arg_pages(struct vm_area_struct *vma, unsigned long shift) if (vma != vma_next(&vmi)) return -EFAULT; + vma_iter_prev_range(&vmi); /* * cover the whole range: [new_start, old_end) */ @@ -712,7 +713,7 @@ static int shift_arg_pages(struct vm_area_struct *vma, unsigned long shift) * process cleanup to remove whatever mess we made. */ if (length != move_page_tables(vma, old_start, - vma, new_start, length, false)) + vma, new_start, length, false, true)) return -ENOMEM; lru_add_drain(); @@ -985,8 +986,6 @@ static int exec_mmap(struct mm_struct *mm) tsk = current; old_mm = current->mm; exec_mm_release(tsk, old_mm); - if (old_mm) - sync_mm_rss(old_mm); ret = down_write_killable(&tsk->signal->exec_update_lock); if (ret) @@ -1276,8 +1275,8 @@ int begin_new_exec(struct linux_binprm * bprm) /* * Must be called _before_ exec_mmap() as bprm->mm is - * not visible until then. This also enables the update - * to be lockless. + * not visible until then. Doing it here also ensures + * we don't race against replace_mm_exe_file(). */ retval = set_mm_exe_file(bprm->mm, bprm->file); if (retval) diff --git a/fs/exfat/Kconfig b/fs/exfat/Kconfig index 147edeb04469..cbeca8e44d9b 100644 --- a/fs/exfat/Kconfig +++ b/fs/exfat/Kconfig @@ -2,6 +2,7 @@ config EXFAT_FS tristate "exFAT filesystem support" + select BUFFER_HEAD select NLS select LEGACY_DIRECT_IO help diff --git a/fs/exfat/dir.c b/fs/exfat/dir.c index e1586bba6d86..9f9295847a4e 100644 --- a/fs/exfat/dir.c +++ b/fs/exfat/dir.c @@ -287,7 +287,7 @@ get_new: mutex_unlock(&EXFAT_SB(sb)->s_lock); if (!dir_emit(ctx, nb->lfn, strlen(nb->lfn), inum, - (de.attr & ATTR_SUBDIR) ? DT_DIR : DT_REG)) + (de.attr & EXFAT_ATTR_SUBDIR) ? DT_DIR : DT_REG)) goto out; ctx->pos = cpos; goto get_new; @@ -359,7 +359,7 @@ unsigned int exfat_get_entry_type(struct exfat_dentry *ep) if (ep->type == EXFAT_VOLUME) return TYPE_VOLUME; if (ep->type == EXFAT_FILE) { - if (le16_to_cpu(ep->dentry.file.attr) & ATTR_SUBDIR) + if (le16_to_cpu(ep->dentry.file.attr) & EXFAT_ATTR_SUBDIR) return TYPE_DIR; return TYPE_FILE; } @@ -410,19 +410,21 @@ static void exfat_set_entry_type(struct exfat_dentry *ep, unsigned int type) ep->type = EXFAT_VOLUME; } else if (type == TYPE_DIR) { ep->type = EXFAT_FILE; - ep->dentry.file.attr = cpu_to_le16(ATTR_SUBDIR); + ep->dentry.file.attr = cpu_to_le16(EXFAT_ATTR_SUBDIR); } else if (type == TYPE_FILE) { ep->type = EXFAT_FILE; - ep->dentry.file.attr = cpu_to_le16(ATTR_ARCHIVE); + ep->dentry.file.attr = cpu_to_le16(EXFAT_ATTR_ARCHIVE); } } static void exfat_init_stream_entry(struct exfat_dentry *ep, - unsigned char flags, unsigned int start_clu, - unsigned long long size) + unsigned int start_clu, unsigned long long size) { exfat_set_entry_type(ep, TYPE_STREAM); - ep->dentry.stream.flags = flags; + if (size == 0) + ep->dentry.stream.flags = ALLOC_FAT_CHAIN; + else + ep->dentry.stream.flags = ALLOC_NO_FAT_CHAIN; ep->dentry.stream.start_clu = cpu_to_le32(start_clu); ep->dentry.stream.valid_size = cpu_to_le64(size); ep->dentry.stream.size = cpu_to_le64(size); @@ -488,9 +490,7 @@ int exfat_init_dir_entry(struct inode *inode, struct exfat_chain *p_dir, if (!ep) return -EIO; - exfat_init_stream_entry(ep, - (type == TYPE_FILE) ? ALLOC_FAT_CHAIN : ALLOC_NO_FAT_CHAIN, - start_clu, size); + exfat_init_stream_entry(ep, start_clu, size); exfat_update_bh(bh, IS_DIRSYNC(inode)); brelse(bh); diff --git a/fs/exfat/exfat_fs.h b/fs/exfat/exfat_fs.h index 729ada9e26e8..a7a2c35d74fb 100644 --- a/fs/exfat/exfat_fs.h +++ b/fs/exfat/exfat_fs.h @@ -234,6 +234,8 @@ struct exfat_mount_options { discard:1, /* Issue discard requests on deletions */ keep_last_dots:1; /* Keep trailing periods in paths */ int time_offset; /* Offset of timestamps from UTC (in minutes) */ + /* Support creating zero-size directory, default: false */ + bool zero_size_dir; }; /* @@ -273,8 +275,6 @@ struct exfat_sb_info { spinlock_t inode_hash_lock; struct hlist_head inode_hashtable[EXFAT_HASH_SIZE]; - - struct rcu_head rcu; }; #define EXFAT_CACHE_VALID 0 @@ -359,10 +359,10 @@ static inline int exfat_mode_can_hold_ro(struct inode *inode) static inline mode_t exfat_make_mode(struct exfat_sb_info *sbi, unsigned short attr, mode_t mode) { - if ((attr & ATTR_READONLY) && !(attr & ATTR_SUBDIR)) + if ((attr & EXFAT_ATTR_READONLY) && !(attr & EXFAT_ATTR_SUBDIR)) mode &= ~0222; - if (attr & ATTR_SUBDIR) + if (attr & EXFAT_ATTR_SUBDIR) return (mode & ~sbi->options.fs_dmask) | S_IFDIR; return (mode & ~sbi->options.fs_fmask) | S_IFREG; @@ -374,18 +374,18 @@ static inline unsigned short exfat_make_attr(struct inode *inode) unsigned short attr = EXFAT_I(inode)->attr; if (S_ISDIR(inode->i_mode)) - attr |= ATTR_SUBDIR; + attr |= EXFAT_ATTR_SUBDIR; if (exfat_mode_can_hold_ro(inode) && !(inode->i_mode & 0222)) - attr |= ATTR_READONLY; + attr |= EXFAT_ATTR_READONLY; return attr; } static inline void exfat_save_attr(struct inode *inode, unsigned short attr) { if (exfat_mode_can_hold_ro(inode)) - EXFAT_I(inode)->attr = attr & (ATTR_RWMASK | ATTR_READONLY); + EXFAT_I(inode)->attr = attr & (EXFAT_ATTR_RWMASK | EXFAT_ATTR_READONLY); else - EXFAT_I(inode)->attr = attr & ATTR_RWMASK; + EXFAT_I(inode)->attr = attr & EXFAT_ATTR_RWMASK; } static inline bool exfat_is_last_sector_in_cluster(struct exfat_sb_info *sbi, @@ -551,6 +551,7 @@ void __exfat_fs_error(struct super_block *sb, int report, const char *fmt, ...) void exfat_get_entry_time(struct exfat_sb_info *sbi, struct timespec64 *ts, u8 tz, __le16 time, __le16 date, u8 time_cs); void exfat_truncate_atime(struct timespec64 *ts); +void exfat_truncate_inode_atime(struct inode *inode); void exfat_set_entry_time(struct exfat_sb_info *sbi, struct timespec64 *ts, u8 *tz, __le16 *time, __le16 *date, u8 *time_cs); u16 exfat_calc_chksum16(void *data, int len, u16 chksum, int type); diff --git a/fs/exfat/exfat_raw.h b/fs/exfat/exfat_raw.h index 0ece2e43cf49..971a1ccd0e89 100644 --- a/fs/exfat/exfat_raw.h +++ b/fs/exfat/exfat_raw.h @@ -64,15 +64,16 @@ #define CS_DEFAULT 2 /* file attributes */ -#define ATTR_READONLY 0x0001 -#define ATTR_HIDDEN 0x0002 -#define ATTR_SYSTEM 0x0004 -#define ATTR_VOLUME 0x0008 -#define ATTR_SUBDIR 0x0010 -#define ATTR_ARCHIVE 0x0020 - -#define ATTR_RWMASK (ATTR_HIDDEN | ATTR_SYSTEM | ATTR_VOLUME | \ - ATTR_SUBDIR | ATTR_ARCHIVE) +#define EXFAT_ATTR_READONLY 0x0001 +#define EXFAT_ATTR_HIDDEN 0x0002 +#define EXFAT_ATTR_SYSTEM 0x0004 +#define EXFAT_ATTR_VOLUME 0x0008 +#define EXFAT_ATTR_SUBDIR 0x0010 +#define EXFAT_ATTR_ARCHIVE 0x0020 + +#define EXFAT_ATTR_RWMASK (EXFAT_ATTR_HIDDEN | EXFAT_ATTR_SYSTEM | \ + EXFAT_ATTR_VOLUME | EXFAT_ATTR_SUBDIR | \ + EXFAT_ATTR_ARCHIVE) #define BOOTSEC_JUMP_BOOT_LEN 3 #define BOOTSEC_FS_NAME_LEN 8 diff --git a/fs/exfat/file.c b/fs/exfat/file.c index 3cbd270e0cba..bfdfafe00993 100644 --- a/fs/exfat/file.c +++ b/fs/exfat/file.c @@ -8,6 +8,9 @@ #include <linux/cred.h> #include <linux/buffer_head.h> #include <linux/blkdev.h> +#include <linux/fsnotify.h> +#include <linux/security.h> +#include <linux/msdos_fs.h> #include "exfat_raw.h" #include "exfat_fs.h" @@ -22,7 +25,7 @@ static int exfat_cont_expand(struct inode *inode, loff_t size) if (err) return err; - inode->i_ctime = inode->i_mtime = current_time(inode); + inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); mark_inode_dirty(inode); if (!IS_SYNC(inode)) @@ -144,7 +147,7 @@ int __exfat_truncate(struct inode *inode) } if (ei->type == TYPE_FILE) - ei->attr |= ATTR_ARCHIVE; + ei->attr |= EXFAT_ATTR_ARCHIVE; /* * update the directory entry @@ -232,7 +235,7 @@ int exfat_getattr(struct mnt_idmap *idmap, const struct path *path, struct inode *inode = d_backing_inode(path->dentry); struct exfat_inode_info *ei = EXFAT_I(inode); - generic_fillattr(&nop_mnt_idmap, inode, stat); + generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat); exfat_truncate_atime(&stat->atime); stat->result_mask |= STATX_BTIME; stat->btime.tv_sec = ei->i_crtime.tv_sec; @@ -290,10 +293,10 @@ int exfat_setattr(struct mnt_idmap *idmap, struct dentry *dentry, } if (attr->ia_valid & ATTR_SIZE) - inode->i_mtime = inode->i_ctime = current_time(inode); + inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); setattr_copy(&nop_mnt_idmap, inode, attr); - exfat_truncate_atime(&inode->i_atime); + exfat_truncate_inode_atime(inode); if (attr->ia_valid & ATTR_SIZE) { error = exfat_block_truncate_page(inode, attr->ia_size); @@ -316,6 +319,93 @@ out: return error; } +/* + * modified ioctls from fat/file.c by Welmer Almesberger + */ +static int exfat_ioctl_get_attributes(struct inode *inode, u32 __user *user_attr) +{ + u32 attr; + + inode_lock_shared(inode); + attr = exfat_make_attr(inode); + inode_unlock_shared(inode); + + return put_user(attr, user_attr); +} + +static int exfat_ioctl_set_attributes(struct file *file, u32 __user *user_attr) +{ + struct inode *inode = file_inode(file); + struct exfat_sb_info *sbi = EXFAT_SB(inode->i_sb); + int is_dir = S_ISDIR(inode->i_mode); + u32 attr, oldattr; + struct iattr ia; + int err; + + err = get_user(attr, user_attr); + if (err) + goto out; + + err = mnt_want_write_file(file); + if (err) + goto out; + inode_lock(inode); + + oldattr = exfat_make_attr(inode); + + /* + * Mask attributes so we don't set reserved fields. + */ + attr &= (EXFAT_ATTR_READONLY | EXFAT_ATTR_HIDDEN | EXFAT_ATTR_SYSTEM | + EXFAT_ATTR_ARCHIVE); + attr |= (is_dir ? EXFAT_ATTR_SUBDIR : 0); + + /* Equivalent to a chmod() */ + ia.ia_valid = ATTR_MODE | ATTR_CTIME; + ia.ia_ctime = current_time(inode); + if (is_dir) + ia.ia_mode = exfat_make_mode(sbi, attr, 0777); + else + ia.ia_mode = exfat_make_mode(sbi, attr, 0666 | (inode->i_mode & 0111)); + + /* The root directory has no attributes */ + if (inode->i_ino == EXFAT_ROOT_INO && attr != EXFAT_ATTR_SUBDIR) { + err = -EINVAL; + goto out_unlock_inode; + } + + if (((attr | oldattr) & EXFAT_ATTR_SYSTEM) && + !capable(CAP_LINUX_IMMUTABLE)) { + err = -EPERM; + goto out_unlock_inode; + } + + /* + * The security check is questionable... We single + * out the RO attribute for checking by the security + * module, just because it maps to a file mode. + */ + err = security_inode_setattr(file_mnt_idmap(file), + file->f_path.dentry, &ia); + if (err) + goto out_unlock_inode; + + /* This MUST be done before doing anything irreversible... */ + err = exfat_setattr(file_mnt_idmap(file), file->f_path.dentry, &ia); + if (err) + goto out_unlock_inode; + + fsnotify_change(file->f_path.dentry, ia.ia_valid); + + exfat_save_attr(inode, attr); + mark_inode_dirty(inode); +out_unlock_inode: + inode_unlock(inode); + mnt_drop_write_file(file); +out: + return err; +} + static int exfat_ioctl_fitrim(struct inode *inode, unsigned long arg) { struct fstrim_range range; @@ -346,8 +436,13 @@ static int exfat_ioctl_fitrim(struct inode *inode, unsigned long arg) long exfat_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { struct inode *inode = file_inode(filp); + u32 __user *user_attr = (u32 __user *)arg; switch (cmd) { + case FAT_IOCTL_GET_ATTRIBUTES: + return exfat_ioctl_get_attributes(inode, user_attr); + case FAT_IOCTL_SET_ATTRIBUTES: + return exfat_ioctl_set_attributes(filp, user_attr); case FITRIM: return exfat_ioctl_fitrim(inode, arg); default: diff --git a/fs/exfat/inode.c b/fs/exfat/inode.c index 481dd338f2b8..e7ff58b8e68c 100644 --- a/fs/exfat/inode.c +++ b/fs/exfat/inode.c @@ -26,6 +26,7 @@ int __exfat_write_inode(struct inode *inode, int sync) struct exfat_sb_info *sbi = EXFAT_SB(sb); struct exfat_inode_info *ei = EXFAT_I(inode); bool is_dir = (ei->type == TYPE_DIR) ? true : false; + struct timespec64 ts; if (inode->i_ino == EXFAT_ROOT_INO) return 0; @@ -55,16 +56,18 @@ int __exfat_write_inode(struct inode *inode, int sync) &ep->dentry.file.create_time, &ep->dentry.file.create_date, &ep->dentry.file.create_time_cs); - exfat_set_entry_time(sbi, &inode->i_mtime, - &ep->dentry.file.modify_tz, - &ep->dentry.file.modify_time, - &ep->dentry.file.modify_date, - &ep->dentry.file.modify_time_cs); - exfat_set_entry_time(sbi, &inode->i_atime, - &ep->dentry.file.access_tz, - &ep->dentry.file.access_time, - &ep->dentry.file.access_date, - NULL); + ts = inode_get_mtime(inode); + exfat_set_entry_time(sbi, &ts, + &ep->dentry.file.modify_tz, + &ep->dentry.file.modify_time, + &ep->dentry.file.modify_date, + &ep->dentry.file.modify_time_cs); + ts = inode_get_atime(inode); + exfat_set_entry_time(sbi, &ts, + &ep->dentry.file.access_tz, + &ep->dentry.file.access_time, + &ep->dentry.file.access_date, + NULL); /* File size should be zero if there is no cluster allocated */ on_disk_size = i_size_read(inode); @@ -355,7 +358,7 @@ static void exfat_write_failed(struct address_space *mapping, loff_t to) if (to > i_size_read(inode)) { truncate_pagecache(inode, i_size_read(inode)); - inode->i_mtime = inode->i_ctime = current_time(inode); + inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); exfat_truncate(inode); } } @@ -397,9 +400,9 @@ static int exfat_write_end(struct file *file, struct address_space *mapping, if (err < len) exfat_write_failed(mapping, pos+len); - if (!(err < 0) && !(ei->attr & ATTR_ARCHIVE)) { - inode->i_mtime = inode->i_ctime = current_time(inode); - ei->attr |= ATTR_ARCHIVE; + if (!(err < 0) && !(ei->attr & EXFAT_ATTR_ARCHIVE)) { + inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); + ei->attr |= EXFAT_ATTR_ARCHIVE; mark_inode_dirty(inode); } @@ -547,7 +550,7 @@ static int exfat_fill_inode(struct inode *inode, struct exfat_dir_entry *info) inode_inc_iversion(inode); inode->i_generation = get_random_u32(); - if (info->attr & ATTR_SUBDIR) { /* directory */ + if (info->attr & EXFAT_ATTR_SUBDIR) { /* directory */ inode->i_generation &= ~1; inode->i_mode = exfat_make_mode(sbi, info->attr, 0777); inode->i_op = &exfat_dir_inode_operations; @@ -576,10 +579,10 @@ static int exfat_fill_inode(struct inode *inode, struct exfat_dir_entry *info) exfat_save_attr(inode, info->attr); inode->i_blocks = round_up(i_size_read(inode), sbi->cluster_size) >> 9; - inode->i_mtime = info->mtime; - inode->i_ctime = info->mtime; + inode_set_mtime_to_ts(inode, info->mtime); + inode_set_ctime_to_ts(inode, info->mtime); ei->i_crtime = info->crtime; - inode->i_atime = info->atime; + inode_set_atime_to_ts(inode, info->atime); return 0; } diff --git a/fs/exfat/misc.c b/fs/exfat/misc.c index 2e1a1a6b1021..fa8459828046 100644 --- a/fs/exfat/misc.c +++ b/fs/exfat/misc.c @@ -126,6 +126,14 @@ void exfat_truncate_atime(struct timespec64 *ts) ts->tv_nsec = 0; } +void exfat_truncate_inode_atime(struct inode *inode) +{ + struct timespec64 atime = inode_get_atime(inode); + + exfat_truncate_atime(&atime); + inode_set_atime_to_ts(inode, atime); +} + u16 exfat_calc_chksum16(void *data, int len, u16 chksum, int type) { int i; diff --git a/fs/exfat/namei.c b/fs/exfat/namei.c index e0ff9d156f6f..5d737e0b639a 100644 --- a/fs/exfat/namei.c +++ b/fs/exfat/namei.c @@ -351,14 +351,20 @@ static int exfat_find_empty_entry(struct inode *inode, if (exfat_check_max_dentries(inode)) return -ENOSPC; - /* we trust p_dir->size regardless of FAT type */ - if (exfat_find_last_cluster(sb, p_dir, &last_clu)) - return -EIO; - /* * Allocate new cluster to this directory */ - exfat_chain_set(&clu, last_clu + 1, 0, p_dir->flags); + if (ei->start_clu != EXFAT_EOF_CLUSTER) { + /* we trust p_dir->size regardless of FAT type */ + if (exfat_find_last_cluster(sb, p_dir, &last_clu)) + return -EIO; + + exfat_chain_set(&clu, last_clu + 1, 0, p_dir->flags); + } else { + /* This directory is empty */ + exfat_chain_set(&clu, EXFAT_EOF_CLUSTER, 0, + ALLOC_NO_FAT_CHAIN); + } /* allocate a cluster */ ret = exfat_alloc_cluster(inode, 1, &clu, IS_DIRSYNC(inode)); @@ -368,6 +374,11 @@ static int exfat_find_empty_entry(struct inode *inode, if (exfat_zeroed_cluster(inode, clu.dir)) return -EIO; + if (ei->start_clu == EXFAT_EOF_CLUSTER) { + ei->start_clu = clu.dir; + p_dir->dir = clu.dir; + } + /* append to the FAT chain */ if (clu.flags != p_dir->flags) { /* no-fat-chain bit is disabled, @@ -507,7 +518,7 @@ static int exfat_add_entry(struct inode *inode, const char *path, goto out; } - if (type == TYPE_DIR) { + if (type == TYPE_DIR && !sbi->options.zero_size_dir) { ret = exfat_alloc_new_dir(inode, &clu); if (ret) goto out; @@ -534,13 +545,16 @@ static int exfat_add_entry(struct inode *inode, const char *path, info->type = type; if (type == TYPE_FILE) { - info->attr = ATTR_ARCHIVE; + info->attr = EXFAT_ATTR_ARCHIVE; info->start_clu = EXFAT_EOF_CLUSTER; info->size = 0; info->num_subdirs = 0; } else { - info->attr = ATTR_SUBDIR; - info->start_clu = start_clu; + info->attr = EXFAT_ATTR_SUBDIR; + if (sbi->options.zero_size_dir) + info->start_clu = EXFAT_EOF_CLUSTER; + else + info->start_clu = start_clu; info->size = clu_size; info->num_subdirs = EXFAT_MIN_SUBDIR; } @@ -569,7 +583,7 @@ static int exfat_create(struct mnt_idmap *idmap, struct inode *dir, goto unlock; inode_inc_iversion(dir); - dir->i_ctime = dir->i_mtime = current_time(dir); + inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir)); if (IS_DIRSYNC(dir)) exfat_sync_inode(dir); else @@ -582,9 +596,9 @@ static int exfat_create(struct mnt_idmap *idmap, struct inode *dir, goto unlock; inode_inc_iversion(inode); - inode->i_mtime = inode->i_atime = inode->i_ctime = - EXFAT_I(inode)->i_crtime = current_time(inode); - exfat_truncate_atime(&inode->i_atime); + EXFAT_I(inode)->i_crtime = simple_inode_init_ts(inode); + exfat_truncate_inode_atime(inode); + /* timestamp is already written, so mark_inode_dirty() is unneeded. */ d_instantiate(dentry, inode); @@ -646,7 +660,7 @@ static int exfat_find(struct inode *dir, struct qstr *qname, info->type = exfat_get_entry_type(ep); info->attr = le16_to_cpu(ep->dentry.file.attr); info->size = le64_to_cpu(ep2->dentry.stream.valid_size); - if ((info->type == TYPE_FILE) && (info->size == 0)) { + if (info->size == 0) { info->flags = ALLOC_NO_FAT_CHAIN; info->start_clu = EXFAT_EOF_CLUSTER; } else { @@ -817,16 +831,16 @@ static int exfat_unlink(struct inode *dir, struct dentry *dentry) ei->dir.dir = DIR_DELETED; inode_inc_iversion(dir); - dir->i_mtime = dir->i_atime = current_time(dir); - exfat_truncate_atime(&dir->i_atime); + simple_inode_init_ts(dir); + exfat_truncate_inode_atime(dir); if (IS_DIRSYNC(dir)) exfat_sync_inode(dir); else mark_inode_dirty(dir); clear_nlink(inode); - inode->i_mtime = inode->i_atime = current_time(inode); - exfat_truncate_atime(&inode->i_atime); + simple_inode_init_ts(inode); + exfat_truncate_inode_atime(inode); exfat_unhash_inode(inode); exfat_d_version_set(dentry, inode_query_iversion(dir)); unlock: @@ -852,7 +866,7 @@ static int exfat_mkdir(struct mnt_idmap *idmap, struct inode *dir, goto unlock; inode_inc_iversion(dir); - dir->i_ctime = dir->i_mtime = current_time(dir); + inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir)); if (IS_DIRSYNC(dir)) exfat_sync_inode(dir); else @@ -866,9 +880,8 @@ static int exfat_mkdir(struct mnt_idmap *idmap, struct inode *dir, goto unlock; inode_inc_iversion(inode); - inode->i_mtime = inode->i_atime = inode->i_ctime = - EXFAT_I(inode)->i_crtime = current_time(inode); - exfat_truncate_atime(&inode->i_atime); + EXFAT_I(inode)->i_crtime = simple_inode_init_ts(inode); + exfat_truncate_inode_atime(inode); /* timestamp is already written, so mark_inode_dirty() is unneeded. */ d_instantiate(dentry, inode); @@ -890,6 +903,9 @@ static int exfat_check_dir_empty(struct super_block *sb, dentries_per_clu = sbi->dentries_per_clu; + if (p_dir->dir == EXFAT_EOF_CLUSTER) + return 0; + exfat_chain_dup(&clu, p_dir); while (clu.dir != EXFAT_EOF_CLUSTER) { @@ -979,8 +995,8 @@ static int exfat_rmdir(struct inode *dir, struct dentry *dentry) ei->dir.dir = DIR_DELETED; inode_inc_iversion(dir); - dir->i_mtime = dir->i_atime = current_time(dir); - exfat_truncate_atime(&dir->i_atime); + simple_inode_init_ts(dir); + exfat_truncate_inode_atime(dir); if (IS_DIRSYNC(dir)) exfat_sync_inode(dir); else @@ -988,8 +1004,8 @@ static int exfat_rmdir(struct inode *dir, struct dentry *dentry) drop_nlink(dir); clear_nlink(inode); - inode->i_mtime = inode->i_atime = current_time(inode); - exfat_truncate_atime(&inode->i_atime); + simple_inode_init_ts(inode); + exfat_truncate_inode_atime(inode); exfat_unhash_inode(inode); exfat_d_version_set(dentry, inode_query_iversion(dir)); unlock: @@ -1034,8 +1050,8 @@ static int exfat_rename_file(struct inode *inode, struct exfat_chain *p_dir, *epnew = *epold; if (exfat_get_entry_type(epnew) == TYPE_FILE) { - epnew->dentry.file.attr |= cpu_to_le16(ATTR_ARCHIVE); - ei->attr |= ATTR_ARCHIVE; + epnew->dentry.file.attr |= cpu_to_le16(EXFAT_ATTR_ARCHIVE); + ei->attr |= EXFAT_ATTR_ARCHIVE; } exfat_update_bh(new_bh, sync); brelse(old_bh); @@ -1066,8 +1082,8 @@ static int exfat_rename_file(struct inode *inode, struct exfat_chain *p_dir, ei->entry = newentry; } else { if (exfat_get_entry_type(epold) == TYPE_FILE) { - epold->dentry.file.attr |= cpu_to_le16(ATTR_ARCHIVE); - ei->attr |= ATTR_ARCHIVE; + epold->dentry.file.attr |= cpu_to_le16(EXFAT_ATTR_ARCHIVE); + ei->attr |= EXFAT_ATTR_ARCHIVE; } exfat_update_bh(old_bh, sync); brelse(old_bh); @@ -1115,8 +1131,8 @@ static int exfat_move_file(struct inode *inode, struct exfat_chain *p_olddir, *epnew = *epmov; if (exfat_get_entry_type(epnew) == TYPE_FILE) { - epnew->dentry.file.attr |= cpu_to_le16(ATTR_ARCHIVE); - ei->attr |= ATTR_ARCHIVE; + epnew->dentry.file.attr |= cpu_to_le16(EXFAT_ATTR_ARCHIVE); + ei->attr |= EXFAT_ATTR_ARCHIVE; } exfat_update_bh(new_bh, IS_DIRSYNC(inode)); brelse(mov_bh); @@ -1257,7 +1273,8 @@ static int __exfat_rename(struct inode *old_parent_inode, } /* Free the clusters if new_inode is a dir(as if exfat_rmdir) */ - if (new_entry_type == TYPE_DIR) { + if (new_entry_type == TYPE_DIR && + new_ei->start_clu != EXFAT_EOF_CLUSTER) { /* new_ei, new_clu_to_free */ struct exfat_chain new_clu_to_free; @@ -1312,9 +1329,9 @@ static int exfat_rename(struct mnt_idmap *idmap, goto unlock; inode_inc_iversion(new_dir); - new_dir->i_ctime = new_dir->i_mtime = new_dir->i_atime = - EXFAT_I(new_dir)->i_crtime = current_time(new_dir); - exfat_truncate_atime(&new_dir->i_atime); + simple_rename_timestamp(old_dir, old_dentry, new_dir, new_dentry); + EXFAT_I(new_dir)->i_crtime = current_time(new_dir); + exfat_truncate_inode_atime(new_dir); if (IS_DIRSYNC(new_dir)) exfat_sync_inode(new_dir); else @@ -1336,7 +1353,6 @@ static int exfat_rename(struct mnt_idmap *idmap, } inode_inc_iversion(old_dir); - old_dir->i_ctime = old_dir->i_mtime = current_time(old_dir); if (IS_DIRSYNC(old_dir)) exfat_sync_inode(old_dir); else @@ -1354,8 +1370,7 @@ static int exfat_rename(struct mnt_idmap *idmap, exfat_warn(sb, "abnormal access to an inode dropped"); WARN_ON(new_inode->i_nlink == 0); } - new_inode->i_ctime = EXFAT_I(new_inode)->i_crtime = - current_time(new_inode); + EXFAT_I(new_inode)->i_crtime = current_time(new_inode); } unlock: diff --git a/fs/exfat/super.c b/fs/exfat/super.c index 8c32460e031e..d9d4fa91010b 100644 --- a/fs/exfat/super.c +++ b/fs/exfat/super.c @@ -31,16 +31,6 @@ static void exfat_free_iocharset(struct exfat_sb_info *sbi) kfree(sbi->options.iocharset); } -static void exfat_delayed_free(struct rcu_head *p) -{ - struct exfat_sb_info *sbi = container_of(p, struct exfat_sb_info, rcu); - - unload_nls(sbi->nls_io); - exfat_free_iocharset(sbi); - exfat_free_upcase_table(sbi); - kfree(sbi); -} - static void exfat_put_super(struct super_block *sb) { struct exfat_sb_info *sbi = EXFAT_SB(sb); @@ -50,7 +40,8 @@ static void exfat_put_super(struct super_block *sb) brelse(sbi->boot_bh); mutex_unlock(&sbi->s_lock); - call_rcu(&sbi->rcu, exfat_delayed_free); + unload_nls(sbi->nls_io); + exfat_free_upcase_table(sbi); } static int exfat_sync_fs(struct super_block *sb, int wait) @@ -174,6 +165,8 @@ static int exfat_show_options(struct seq_file *m, struct dentry *root) seq_puts(m, ",sys_tz"); else if (opts->time_offset) seq_printf(m, ",time_offset=%d", opts->time_offset); + if (opts->zero_size_dir) + seq_puts(m, ",zero_size_dir"); return 0; } @@ -218,6 +211,7 @@ enum { Opt_keep_last_dots, Opt_sys_tz, Opt_time_offset, + Opt_zero_size_dir, /* Deprecated options */ Opt_utf8, @@ -246,6 +240,7 @@ static const struct fs_parameter_spec exfat_parameters[] = { fsparam_flag("keep_last_dots", Opt_keep_last_dots), fsparam_flag("sys_tz", Opt_sys_tz), fsparam_s32("time_offset", Opt_time_offset), + fsparam_flag("zero_size_dir", Opt_zero_size_dir), __fsparam(NULL, "utf8", Opt_utf8, fs_param_deprecated, NULL), __fsparam(NULL, "debug", Opt_debug, fs_param_deprecated, @@ -314,6 +309,9 @@ static int exfat_parse_param(struct fs_context *fc, struct fs_parameter *param) return -EINVAL; opts->time_offset = result.int_32; break; + case Opt_zero_size_dir: + opts->zero_size_dir = true; + break; case Opt_utf8: case Opt_debug: case Opt_namecase: @@ -369,7 +367,7 @@ static int exfat_read_root(struct inode *inode) inode->i_gid = sbi->options.fs_gid; inode_inc_iversion(inode); inode->i_generation = 0; - inode->i_mode = exfat_make_mode(sbi, ATTR_SUBDIR, 0777); + inode->i_mode = exfat_make_mode(sbi, EXFAT_ATTR_SUBDIR, 0777); inode->i_op = &exfat_dir_inode_operations; inode->i_fop = &exfat_dir_operations; @@ -378,10 +376,9 @@ static int exfat_read_root(struct inode *inode) ei->i_size_aligned = i_size_read(inode); ei->i_size_ondisk = i_size_read(inode); - exfat_save_attr(inode, ATTR_SUBDIR); - inode->i_mtime = inode->i_atime = inode->i_ctime = ei->i_crtime = - current_time(inode); - exfat_truncate_atime(&inode->i_atime); + exfat_save_attr(inode, EXFAT_ATTR_SUBDIR); + ei->i_crtime = simple_inode_init_ts(inode); + exfat_truncate_inode_atime(inode); return 0; } @@ -710,9 +707,6 @@ free_table: check_nls_io: unload_nls(sbi->nls_io); - exfat_free_iocharset(sbi); - sb->s_fs_info = NULL; - kfree(sbi); return err; } @@ -721,14 +715,18 @@ static int exfat_get_tree(struct fs_context *fc) return get_tree_bdev(fc, exfat_fill_super); } +static void exfat_free_sbi(struct exfat_sb_info *sbi) +{ + exfat_free_iocharset(sbi); + kfree(sbi); +} + static void exfat_free(struct fs_context *fc) { struct exfat_sb_info *sbi = fc->s_fs_info; - if (sbi) { - exfat_free_iocharset(sbi); - kfree(sbi); - } + if (sbi) + exfat_free_sbi(sbi); } static int exfat_reconfigure(struct fs_context *fc) @@ -773,12 +771,21 @@ static int exfat_init_fs_context(struct fs_context *fc) return 0; } +static void exfat_kill_sb(struct super_block *sb) +{ + struct exfat_sb_info *sbi = sb->s_fs_info; + + kill_block_super(sb); + if (sbi) + exfat_free_sbi(sbi); +} + static struct file_system_type exfat_fs_type = { .owner = THIS_MODULE, .name = "exfat", .init_fs_context = exfat_init_fs_context, .parameters = exfat_parameters, - .kill_sb = kill_block_super, + .kill_sb = exfat_kill_sb, .fs_flags = FS_REQUIRES_DEV, }; diff --git a/fs/exportfs/expfs.c b/fs/exportfs/expfs.c index d1dbe47c7975..3ae0154c5680 100644 --- a/fs/exportfs/expfs.c +++ b/fs/exportfs/expfs.c @@ -342,43 +342,30 @@ out: return error; } +#define FILEID_INO64_GEN_LEN 3 + /** - * export_encode_fh - default export_operations->encode_fh function + * exportfs_encode_ino64_fid - encode non-decodeable 64bit ino file id * @inode: the object to encode * @fid: where to store the file handle fragment - * @max_len: maximum length to store there - * @parent: parent directory inode, if wanted + * @max_len: maximum length to store there (in 4 byte units) * - * This default encode_fh function assumes that the 32 inode number - * is suitable for locating an inode, and that the generation number - * can be used to check that it is still valid. It places them in the - * filehandle fragment where export_decode_fh expects to find them. + * This generic function is used to encode a non-decodeable file id for + * fanotify for filesystems that do not support NFS export. */ -static int export_encode_fh(struct inode *inode, struct fid *fid, - int *max_len, struct inode *parent) +static int exportfs_encode_ino64_fid(struct inode *inode, struct fid *fid, + int *max_len) { - int len = *max_len; - int type = FILEID_INO32_GEN; - - if (parent && (len < 4)) { - *max_len = 4; - return FILEID_INVALID; - } else if (len < 2) { - *max_len = 2; + if (*max_len < FILEID_INO64_GEN_LEN) { + *max_len = FILEID_INO64_GEN_LEN; return FILEID_INVALID; } - len = 2; - fid->i32.ino = inode->i_ino; - fid->i32.gen = inode->i_generation; - if (parent) { - fid->i32.parent_ino = parent->i_ino; - fid->i32.parent_gen = parent->i_generation; - len = 4; - type = FILEID_INO32_GEN_PARENT; - } - *max_len = len; - return type; + fid->i64.ino = inode->i_ino; + fid->i64.gen = inode->i_generation; + *max_len = FILEID_INO64_GEN_LEN; + + return FILEID_INO64_GEN; } /** @@ -386,6 +373,7 @@ static int export_encode_fh(struct inode *inode, struct fid *fid, * @inode: the object to encode * @fid: where to store the file handle fragment * @max_len: maximum length to store there + * @parent: parent directory inode, if wanted * @flags: properties of the requested file handle * * Returns an enum fid_type or a negative errno. @@ -395,17 +383,13 @@ int exportfs_encode_inode_fh(struct inode *inode, struct fid *fid, { const struct export_operations *nop = inode->i_sb->s_export_op; - /* - * If a decodeable file handle was requested, we need to make sure that - * filesystem can decode file handles. - */ - if (nop && !(flags & EXPORT_FH_FID) && !nop->fh_to_dentry) + if (!exportfs_can_encode_fh(nop, flags)) return -EOPNOTSUPP; - if (nop && nop->encode_fh) - return nop->encode_fh(inode, fid->raw, max_len, parent); + if (!nop && (flags & EXPORT_FH_FID)) + return exportfs_encode_ino64_fid(inode, fid, max_len); - return export_encode_fh(inode, fid, max_len, parent); + return nop->encode_fh(inode, fid->raw, max_len, parent); } EXPORT_SYMBOL_GPL(exportfs_encode_inode_fh); @@ -455,7 +439,7 @@ exportfs_decode_fh_raw(struct vfsmount *mnt, struct fid *fid, int fh_len, /* * Try to get any dentry for the given file handle from the filesystem. */ - if (!nop || !nop->fh_to_dentry) + if (!exportfs_can_decode_fh(nop)) return ERR_PTR(-ESTALE); result = nop->fh_to_dentry(mnt->mnt_sb, fid, fh_len, fileid_type); if (IS_ERR_OR_NULL(result)) diff --git a/fs/ext2/Kconfig b/fs/ext2/Kconfig index 77393fda99af..74d98965902e 100644 --- a/fs/ext2/Kconfig +++ b/fs/ext2/Kconfig @@ -1,6 +1,7 @@ # SPDX-License-Identifier: GPL-2.0-only config EXT2_FS tristate "Second extended fs support" + select BUFFER_HEAD select FS_IOMAP select LEGACY_DIRECT_IO help diff --git a/fs/ext2/acl.c b/fs/ext2/acl.c index 82b17d7fc93f..7e54c31589c7 100644 --- a/fs/ext2/acl.c +++ b/fs/ext2/acl.c @@ -237,7 +237,7 @@ ext2_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, error = __ext2_set_acl(inode, acl, type); if (!error && update_mode) { inode->i_mode = mode; - inode->i_ctime = current_time(inode); + inode_set_ctime_current(inode); mark_inode_dirty(inode); } return error; diff --git a/fs/ext2/balloc.c b/fs/ext2/balloc.c index eca60b747c6b..e124f3d709b2 100644 --- a/fs/ext2/balloc.c +++ b/fs/ext2/balloc.c @@ -36,8 +36,6 @@ */ -#define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1) - struct ext2_group_desc * ext2_get_group_desc(struct super_block * sb, unsigned int block_group, struct buffer_head ** bh) @@ -474,8 +472,8 @@ void ext2_discard_reservation(struct inode *inode) * @block: start physical block to free * @count: number of blocks to free */ -void ext2_free_blocks (struct inode * inode, unsigned long block, - unsigned long count) +void ext2_free_blocks(struct inode * inode, ext2_fsblk_t block, + unsigned long count) { struct buffer_head *bitmap_bh = NULL; struct buffer_head * bh2; @@ -718,36 +716,34 @@ fail_access: } /** - * find_next_reservable_window(): - * find a reservable space within the given range. - * It does not allocate the reservation window for now: - * alloc_new_reservation() will do the work later. - * - * @search_head: the head of the searching list; - * This is not necessarily the list head of the whole filesystem + * find_next_reservable_window - Find a reservable space within the given range. + * @search_head: The list to search. + * @my_rsv: The reservation we're currently using. + * @sb: The super block. + * @start_block: The first block we consider to start the real search from + * @last_block: The maximum block number that our goal reservable space + * could start from. * - * We have both head and start_block to assist the search - * for the reservable space. The list starts from head, - * but we will shift to the place where start_block is, - * then start from there, when looking for a reservable space. + * It does not allocate the reservation window: alloc_new_reservation() + * will do the work later. * - * @sb: the super block. + * We search the given range, rather than the whole reservation double + * linked list, (start_block, last_block) to find a free region that is + * of my size and has not been reserved. * - * @start_block: the first block we consider to start the real search from + * @search_head is not necessarily the list head of the whole filesystem. + * We have both head and @start_block to assist the search for the + * reservable space. The list starts from head, but we will shift to + * the place where start_block is, then start from there, when looking + * for a reservable space. * - * @last_block: - * the maximum block number that our goal reservable space - * could start from. This is normally the last block in this - * group. The search will end when we found the start of next - * possible reservable space is out of this boundary. - * This could handle the cross boundary reservation window - * request. - * - * basically we search from the given range, rather than the whole - * reservation double linked list, (start_block, last_block) - * to find a free region that is of my size and has not - * been reserved. + * @last_block is normally the last block in this group. The search will end + * when we found the start of next possible reservable space is out + * of this boundary. This could handle the cross boundary reservation + * window request. * + * Return: -1 if we could not find a range of sufficient size. If we could, + * return 0 and fill in @my_rsv with the range information. */ static int find_next_reservable_window( struct ext2_reserve_window_node *search_head, @@ -835,41 +831,34 @@ static int find_next_reservable_window( } /** - * alloc_new_reservation()--allocate a new reservation window - * - * To make a new reservation, we search part of the filesystem - * reservation list (the list that inside the group). We try to - * allocate a new reservation window near the allocation goal, - * or the beginning of the group, if there is no goal. + * alloc_new_reservation - Allocate a new reservation window. + * @my_rsv: The reservation we're currently using. + * @grp_goal: The goal block relative to the start of the group. + * @sb: The super block. + * @group: The group we are trying to allocate in. + * @bitmap_bh: The block group block bitmap. * - * We first find a reservable space after the goal, then from - * there, we check the bitmap for the first free block after - * it. If there is no free block until the end of group, then the - * whole group is full, we failed. Otherwise, check if the free - * block is inside the expected reservable space, if so, we - * succeed. - * If the first free block is outside the reservable space, then - * start from the first free block, we search for next available - * space, and go on. + * To make a new reservation, we search part of the filesystem reservation + * list (the list inside the group). We try to allocate a new + * reservation window near @grp_goal, or the beginning of the + * group, if @grp_goal is negative. * - * on succeed, a new reservation will be found and inserted into the list - * It contains at least one free block, and it does not overlap with other - * reservation windows. + * We first find a reservable space after the goal, then from there, + * we check the bitmap for the first free block after it. If there is + * no free block until the end of group, then the whole group is full, + * we failed. Otherwise, check if the free block is inside the expected + * reservable space, if so, we succeed. * - * failed: we failed to find a reservation window in this group + * If the first free block is outside the reservable space, then start + * from the first free block, we search for next available space, and + * go on. * - * @my_rsv: the reservation - * - * @grp_goal: The goal (group-relative). It is where the search for a - * free reservable space should start from. - * if we have a goal(goal >0 ), then start from there, - * no goal(goal = -1), we start from the first block - * of the group. - * - * @sb: the super block - * @group: the group we are trying to allocate in - * @bitmap_bh: the block group block bitmap + * on succeed, a new reservation will be found and inserted into the + * list. It contains at least one free block, and it does not overlap + * with other reservation windows. * + * Return: 0 on success, -1 if we failed to find a reservation window + * in this group */ static int alloc_new_reservation(struct ext2_reserve_window_node *my_rsv, ext2_grpblk_t grp_goal, struct super_block *sb, @@ -1133,8 +1122,13 @@ ext2_try_to_allocate_with_rsv(struct super_block *sb, unsigned int group, if ((my_rsv->rsv_start > group_last_block) || (my_rsv->rsv_end < group_first_block)) { + ext2_error(sb, __func__, + "Reservation out of group %u range goal %d fsb[%lu,%lu] rsv[%lu, %lu]", + group, grp_goal, group_first_block, + group_last_block, my_rsv->rsv_start, + my_rsv->rsv_end); rsv_window_dump(&EXT2_SB(sb)->s_rsv_window_root, 1); - BUG(); + return -1; } ret = ext2_try_to_allocate(sb, group, bitmap_bh, grp_goal, &num, &my_rsv->rsv_window); @@ -1195,6 +1189,7 @@ int ext2_data_block_valid(struct ext2_sb_info *sbi, ext2_fsblk_t start_blk, * @goal: given target block(filesystem wide) * @count: target number of blocks to allocate * @errp: error code + * @flags: allocate flags * * ext2_new_blocks uses a goal block to assist allocation. If the goal is * free, or there is a free block within 32 blocks of the goal, that block @@ -1204,7 +1199,7 @@ int ext2_data_block_valid(struct ext2_sb_info *sbi, ext2_fsblk_t start_blk, * This function also updates quota and i_blocks field. */ ext2_fsblk_t ext2_new_blocks(struct inode *inode, ext2_fsblk_t goal, - unsigned long *count, int *errp) + unsigned long *count, int *errp, unsigned int flags) { struct buffer_head *bitmap_bh = NULL; struct buffer_head *gdp_bh; @@ -1243,15 +1238,15 @@ ext2_fsblk_t ext2_new_blocks(struct inode *inode, ext2_fsblk_t goal, es = EXT2_SB(sb)->s_es; ext2_debug("goal=%lu.\n", goal); /* - * Allocate a block from reservation only when - * filesystem is mounted with reservation(default,-o reservation), and - * it's a regular file, and - * the desired window size is greater than 0 (One could use ioctl - * command EXT2_IOC_SETRSVSZ to set the window size to 0 to turn off - * reservation on that particular file) + * Allocate a block from reservation only when the filesystem is + * mounted with reservation(default,-o reservation), and it's a regular + * file, and the desired window size is greater than 0 (One could use + * ioctl command EXT2_IOC_SETRSVSZ to set the window size to 0 to turn + * off reservation on that particular file). Also do not use the + * reservation window if the caller asked us not to do it. */ block_i = EXT2_I(inode)->i_block_alloc_info; - if (block_i) { + if (!(flags & EXT2_ALLOC_NORESERVE) && block_i) { windowsz = block_i->rsv_window_node.rsv_goal_size; if (windowsz > 0) my_rsv = &block_i->rsv_window_node; @@ -1431,13 +1426,6 @@ out: return 0; } -ext2_fsblk_t ext2_new_block(struct inode *inode, unsigned long goal, int *errp) -{ - unsigned long count = 1; - - return ext2_new_blocks(inode, goal, &count, errp); -} - #ifdef EXT2FS_DEBUG unsigned long ext2_count_free(struct buffer_head *map, unsigned int numchars) diff --git a/fs/ext2/dir.c b/fs/ext2/dir.c index 42db804794bd..4fb155b5a958 100644 --- a/fs/ext2/dir.c +++ b/fs/ext2/dir.c @@ -81,34 +81,34 @@ ext2_last_byte(struct inode *inode, unsigned long page_nr) return last_byte; } -static void ext2_commit_chunk(struct page *page, loff_t pos, unsigned len) +static void ext2_commit_chunk(struct folio *folio, loff_t pos, unsigned len) { - struct address_space *mapping = page->mapping; + struct address_space *mapping = folio->mapping; struct inode *dir = mapping->host; inode_inc_iversion(dir); - block_write_end(NULL, mapping, pos, len, len, page, NULL); + block_write_end(NULL, mapping, pos, len, len, &folio->page, NULL); if (pos+len > dir->i_size) { i_size_write(dir, pos+len); mark_inode_dirty(dir); } - unlock_page(page); + folio_unlock(folio); } -static bool ext2_check_page(struct page *page, int quiet, char *kaddr) +static bool ext2_check_folio(struct folio *folio, int quiet, char *kaddr) { - struct inode *dir = page->mapping->host; + struct inode *dir = folio->mapping->host; struct super_block *sb = dir->i_sb; unsigned chunk_size = ext2_chunk_size(dir); u32 max_inumber = le32_to_cpu(EXT2_SB(sb)->s_es->s_inodes_count); unsigned offs, rec_len; - unsigned limit = PAGE_SIZE; + unsigned limit = folio_size(folio); ext2_dirent *p; char *error; - if ((dir->i_size >> PAGE_SHIFT) == page->index) { - limit = dir->i_size & ~PAGE_MASK; + if (dir->i_size < folio_pos(folio) + limit) { + limit = offset_in_folio(folio, dir->i_size); if (limit & (chunk_size - 1)) goto Ebadsize; if (!limit) @@ -132,7 +132,7 @@ static bool ext2_check_page(struct page *page, int quiet, char *kaddr) if (offs != limit) goto Eend; out: - SetPageChecked(page); + folio_set_checked(folio); return true; /* Too bad, we had an error */ @@ -160,51 +160,52 @@ Einumber: bad_entry: if (!quiet) ext2_error(sb, __func__, "bad entry in directory #%lu: : %s - " - "offset=%lu, inode=%lu, rec_len=%d, name_len=%d", - dir->i_ino, error, (page->index<<PAGE_SHIFT)+offs, + "offset=%llu, inode=%lu, rec_len=%d, name_len=%d", + dir->i_ino, error, folio_pos(folio) + offs, (unsigned long) le32_to_cpu(p->inode), rec_len, p->name_len); goto fail; Eend: if (!quiet) { p = (ext2_dirent *)(kaddr + offs); - ext2_error(sb, "ext2_check_page", + ext2_error(sb, "ext2_check_folio", "entry in directory #%lu spans the page boundary" - "offset=%lu, inode=%lu", - dir->i_ino, (page->index<<PAGE_SHIFT)+offs, + "offset=%llu, inode=%lu", + dir->i_ino, folio_pos(folio) + offs, (unsigned long) le32_to_cpu(p->inode)); } fail: - SetPageError(page); + folio_set_error(folio); return false; } /* - * Calls to ext2_get_page()/ext2_put_page() must be nested according to the - * rules documented in kmap_local_page()/kunmap_local(). + * Calls to ext2_get_folio()/folio_release_kmap() must be nested according + * to the rules documented in kmap_local_folio()/kunmap_local(). * - * NOTE: ext2_find_entry() and ext2_dotdot() act as a call to ext2_get_page() - * and should be treated as a call to ext2_get_page() for nesting purposes. + * NOTE: ext2_find_entry() and ext2_dotdot() act as a call + * to folio_release_kmap() and should be treated as a call to + * folio_release_kmap() for nesting purposes. */ -static void *ext2_get_page(struct inode *dir, unsigned long n, - int quiet, struct page **page) +static void *ext2_get_folio(struct inode *dir, unsigned long n, + int quiet, struct folio **foliop) { struct address_space *mapping = dir->i_mapping; struct folio *folio = read_mapping_folio(mapping, n, NULL); - void *page_addr; + void *kaddr; if (IS_ERR(folio)) return ERR_CAST(folio); - page_addr = kmap_local_folio(folio, n & (folio_nr_pages(folio) - 1)); + kaddr = kmap_local_folio(folio, 0); if (unlikely(!folio_test_checked(folio))) { - if (!ext2_check_page(&folio->page, quiet, page_addr)) + if (!ext2_check_folio(folio, quiet, kaddr)) goto fail; } - *page = &folio->page; - return page_addr; + *foliop = folio; + return kaddr; fail: - ext2_put_page(&folio->page, page_addr); + folio_release_kmap(folio, kaddr); return ERR_PTR(-EIO); } @@ -274,8 +275,8 @@ ext2_readdir(struct file *file, struct dir_context *ctx) for ( ; n < npages; n++, offset = 0) { ext2_dirent *de; - struct page *page; - char *kaddr = ext2_get_page(inode, n, 0, &page); + struct folio *folio; + char *kaddr = ext2_get_folio(inode, n, 0, &folio); char *limit; if (IS_ERR(kaddr)) { @@ -299,7 +300,7 @@ ext2_readdir(struct file *file, struct dir_context *ctx) if (de->rec_len == 0) { ext2_error(sb, __func__, "zero-length directory entry"); - ext2_put_page(page, de); + folio_release_kmap(folio, de); return -EIO; } if (de->inode) { @@ -311,13 +312,13 @@ ext2_readdir(struct file *file, struct dir_context *ctx) if (!dir_emit(ctx, de->name, de->name_len, le32_to_cpu(de->inode), d_type)) { - ext2_put_page(page, de); + folio_release_kmap(folio, de); return 0; } } ctx->pos += ext2_rec_len_from_disk(de->rec_len); } - ext2_put_page(page, kaddr); + folio_release_kmap(folio, kaddr); } return 0; } @@ -330,38 +331,35 @@ ext2_readdir(struct file *file, struct dir_context *ctx) * and the entry itself. Page is returned mapped and unlocked. * Entry is guaranteed to be valid. * - * On Success ext2_put_page() should be called on *res_page. + * On Success folio_release_kmap() should be called on *foliop. * - * NOTE: Calls to ext2_get_page()/ext2_put_page() must be nested according to - * the rules documented in kmap_local_page()/kunmap_local(). + * NOTE: Calls to ext2_get_folio()/folio_release_kmap() must be nested + * according to the rules documented in kmap_local_folio()/kunmap_local(). * - * ext2_find_entry() and ext2_dotdot() act as a call to ext2_get_page() and - * should be treated as a call to ext2_get_page() for nesting purposes. + * ext2_find_entry() and ext2_dotdot() act as a call to ext2_get_folio() + * and should be treated as a call to ext2_get_folio() for nesting + * purposes. */ struct ext2_dir_entry_2 *ext2_find_entry (struct inode *dir, - const struct qstr *child, struct page **res_page) + const struct qstr *child, struct folio **foliop) { const char *name = child->name; int namelen = child->len; unsigned reclen = EXT2_DIR_REC_LEN(namelen); unsigned long start, n; unsigned long npages = dir_pages(dir); - struct page *page = NULL; struct ext2_inode_info *ei = EXT2_I(dir); ext2_dirent * de; if (npages == 0) goto out; - /* OFFSET_CACHE */ - *res_page = NULL; - start = ei->i_dir_start_lookup; if (start >= npages) start = 0; n = start; do { - char *kaddr = ext2_get_page(dir, n, 0, &page); + char *kaddr = ext2_get_folio(dir, n, 0, foliop); if (IS_ERR(kaddr)) return ERR_CAST(kaddr); @@ -371,18 +369,18 @@ struct ext2_dir_entry_2 *ext2_find_entry (struct inode *dir, if (de->rec_len == 0) { ext2_error(dir->i_sb, __func__, "zero-length directory entry"); - ext2_put_page(page, de); + folio_release_kmap(*foliop, de); goto out; } if (ext2_match(namelen, name, de)) goto found; de = ext2_next_entry(de); } - ext2_put_page(page, kaddr); + folio_release_kmap(*foliop, kaddr); if (++n >= npages) n = 0; - /* next page is past the blocks we've got */ + /* next folio is past the blocks we've got */ if (unlikely(n > (dir->i_blocks >> (PAGE_SHIFT - 9)))) { ext2_error(dir->i_sb, __func__, "dir %lu size %lld exceeds block count %llu", @@ -395,7 +393,6 @@ out: return ERR_PTR(-ENOENT); found: - *res_page = page; ei->i_dir_start_lookup = n; return de; } @@ -404,17 +401,18 @@ found: * Return the '..' directory entry and the page in which the entry was found * (as a parameter - p). * - * On Success ext2_put_page() should be called on *p. + * On Success folio_release_kmap() should be called on *foliop. * - * NOTE: Calls to ext2_get_page()/ext2_put_page() must be nested according to - * the rules documented in kmap_local_page()/kunmap_local(). + * NOTE: Calls to ext2_get_folio()/folio_release_kmap() must be nested + * according to the rules documented in kmap_local_folio()/kunmap_local(). * - * ext2_find_entry() and ext2_dotdot() act as a call to ext2_get_page() and - * should be treated as a call to ext2_get_page() for nesting purposes. + * ext2_find_entry() and ext2_dotdot() act as a call to ext2_get_folio() + * and should be treated as a call to ext2_get_folio() for nesting + * purposes. */ -struct ext2_dir_entry_2 *ext2_dotdot(struct inode *dir, struct page **p) +struct ext2_dir_entry_2 *ext2_dotdot(struct inode *dir, struct folio **foliop) { - ext2_dirent *de = ext2_get_page(dir, 0, 0, p); + ext2_dirent *de = ext2_get_folio(dir, 0, 0, foliop); if (!IS_ERR(de)) return ext2_next_entry(de); @@ -424,23 +422,22 @@ struct ext2_dir_entry_2 *ext2_dotdot(struct inode *dir, struct page **p) int ext2_inode_by_name(struct inode *dir, const struct qstr *child, ino_t *ino) { struct ext2_dir_entry_2 *de; - struct page *page; - - de = ext2_find_entry(dir, child, &page); + struct folio *folio; + + de = ext2_find_entry(dir, child, &folio); if (IS_ERR(de)) return PTR_ERR(de); *ino = le32_to_cpu(de->inode); - ext2_put_page(page, de); + folio_release_kmap(folio, de); return 0; } -static int ext2_prepare_chunk(struct page *page, loff_t pos, unsigned len) +static int ext2_prepare_chunk(struct folio *folio, loff_t pos, unsigned len) { - return __block_write_begin(page, pos, len, ext2_get_block); + return __block_write_begin(&folio->page, pos, len, ext2_get_block); } - static int ext2_handle_dirsync(struct inode *dir) { int err; @@ -452,23 +449,23 @@ static int ext2_handle_dirsync(struct inode *dir) } int ext2_set_link(struct inode *dir, struct ext2_dir_entry_2 *de, - struct page *page, struct inode *inode, bool update_times) + struct folio *folio, struct inode *inode, bool update_times) { - loff_t pos = page_offset(page) + offset_in_page(de); + loff_t pos = folio_pos(folio) + offset_in_folio(folio, de); unsigned len = ext2_rec_len_from_disk(de->rec_len); int err; - lock_page(page); - err = ext2_prepare_chunk(page, pos, len); + folio_lock(folio); + err = ext2_prepare_chunk(folio, pos, len); if (err) { - unlock_page(page); + folio_unlock(folio); return err; } de->inode = cpu_to_le32(inode->i_ino); ext2_set_de_type(de, inode); - ext2_commit_chunk(page, pos, len); + ext2_commit_chunk(folio, pos, len); if (update_times) - dir->i_mtime = dir->i_ctime = current_time(dir); + inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir)); EXT2_I(dir)->i_flags &= ~EXT2_BTREE_FL; mark_inode_dirty(dir); return ext2_handle_dirsync(dir); @@ -485,7 +482,7 @@ int ext2_add_link (struct dentry *dentry, struct inode *inode) unsigned chunk_size = ext2_chunk_size(dir); unsigned reclen = EXT2_DIR_REC_LEN(namelen); unsigned short rec_len, name_len; - struct page *page = NULL; + struct folio *folio = NULL; ext2_dirent * de; unsigned long npages = dir_pages(dir); unsigned long n; @@ -494,19 +491,19 @@ int ext2_add_link (struct dentry *dentry, struct inode *inode) /* * We take care of directory expansion in the same loop. - * This code plays outside i_size, so it locks the page + * This code plays outside i_size, so it locks the folio * to protect that region. */ for (n = 0; n <= npages; n++) { - char *kaddr = ext2_get_page(dir, n, 0, &page); + char *kaddr = ext2_get_folio(dir, n, 0, &folio); char *dir_end; if (IS_ERR(kaddr)) return PTR_ERR(kaddr); - lock_page(page); + folio_lock(folio); dir_end = kaddr + ext2_last_byte(dir, n); de = (ext2_dirent *)kaddr; - kaddr += PAGE_SIZE - reclen; + kaddr += folio_size(folio) - reclen; while ((char *)de <= kaddr) { if ((char *)de == dir_end) { /* We hit i_size */ @@ -533,15 +530,15 @@ int ext2_add_link (struct dentry *dentry, struct inode *inode) goto got_it; de = (ext2_dirent *) ((char *) de + rec_len); } - unlock_page(page); - ext2_put_page(page, kaddr); + folio_unlock(folio); + folio_release_kmap(folio, kaddr); } BUG(); return -EINVAL; got_it: - pos = page_offset(page) + offset_in_page(de); - err = ext2_prepare_chunk(page, pos, rec_len); + pos = folio_pos(folio) + offset_in_folio(folio, de); + err = ext2_prepare_chunk(folio, pos, rec_len); if (err) goto out_unlock; if (de->inode) { @@ -554,17 +551,17 @@ got_it: memcpy(de->name, name, namelen); de->inode = cpu_to_le32(inode->i_ino); ext2_set_de_type (de, inode); - ext2_commit_chunk(page, pos, rec_len); - dir->i_mtime = dir->i_ctime = current_time(dir); + ext2_commit_chunk(folio, pos, rec_len); + inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir)); EXT2_I(dir)->i_flags &= ~EXT2_BTREE_FL; mark_inode_dirty(dir); err = ext2_handle_dirsync(dir); /* OFFSET_CACHE */ out_put: - ext2_put_page(page, de); + folio_release_kmap(folio, de); return err; out_unlock: - unlock_page(page); + folio_unlock(folio); goto out_put; } @@ -572,18 +569,21 @@ out_unlock: * ext2_delete_entry deletes a directory entry by merging it with the * previous entry. Page is up-to-date. */ -int ext2_delete_entry(struct ext2_dir_entry_2 *dir, struct page *page) +int ext2_delete_entry(struct ext2_dir_entry_2 *dir, struct folio *folio) { - struct inode *inode = page->mapping->host; - char *kaddr = (char *)((unsigned long)dir & PAGE_MASK); - unsigned from = offset_in_page(dir) & ~(ext2_chunk_size(inode)-1); - unsigned to = offset_in_page(dir) + - ext2_rec_len_from_disk(dir->rec_len); + struct inode *inode = folio->mapping->host; + size_t from, to; + char *kaddr; loff_t pos; - ext2_dirent *pde = NULL; - ext2_dirent *de = (ext2_dirent *)(kaddr + from); + ext2_dirent *de, *pde = NULL; int err; + from = offset_in_folio(folio, dir); + to = from + ext2_rec_len_from_disk(dir->rec_len); + kaddr = (char *)dir - from; + from &= ~(ext2_chunk_size(inode)-1); + de = (ext2_dirent *)(kaddr + from); + while ((char*)de < (char*)dir) { if (de->rec_len == 0) { ext2_error(inode->i_sb, __func__, @@ -594,19 +594,19 @@ int ext2_delete_entry(struct ext2_dir_entry_2 *dir, struct page *page) de = ext2_next_entry(de); } if (pde) - from = offset_in_page(pde); - pos = page_offset(page) + from; - lock_page(page); - err = ext2_prepare_chunk(page, pos, to - from); + from = offset_in_folio(folio, pde); + pos = folio_pos(folio) + from; + folio_lock(folio); + err = ext2_prepare_chunk(folio, pos, to - from); if (err) { - unlock_page(page); + folio_unlock(folio); return err; } if (pde) pde->rec_len = ext2_rec_len_to_disk(to - from); dir->inode = 0; - ext2_commit_chunk(page, pos, to - from); - inode->i_ctime = inode->i_mtime = current_time(inode); + ext2_commit_chunk(folio, pos, to - from); + inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); EXT2_I(inode)->i_flags &= ~EXT2_BTREE_FL; mark_inode_dirty(inode); return ext2_handle_dirsync(inode); @@ -617,21 +617,21 @@ int ext2_delete_entry(struct ext2_dir_entry_2 *dir, struct page *page) */ int ext2_make_empty(struct inode *inode, struct inode *parent) { - struct page *page = grab_cache_page(inode->i_mapping, 0); + struct folio *folio = filemap_grab_folio(inode->i_mapping, 0); unsigned chunk_size = ext2_chunk_size(inode); struct ext2_dir_entry_2 * de; int err; void *kaddr; - if (!page) - return -ENOMEM; + if (IS_ERR(folio)) + return PTR_ERR(folio); - err = ext2_prepare_chunk(page, 0, chunk_size); + err = ext2_prepare_chunk(folio, 0, chunk_size); if (err) { - unlock_page(page); + folio_unlock(folio); goto fail; } - kaddr = kmap_local_page(page); + kaddr = kmap_local_folio(folio, 0); memset(kaddr, 0, chunk_size); de = (struct ext2_dir_entry_2 *)kaddr; de->name_len = 1; @@ -647,26 +647,26 @@ int ext2_make_empty(struct inode *inode, struct inode *parent) memcpy (de->name, "..\0", 4); ext2_set_de_type (de, inode); kunmap_local(kaddr); - ext2_commit_chunk(page, 0, chunk_size); + ext2_commit_chunk(folio, 0, chunk_size); err = ext2_handle_dirsync(inode); fail: - put_page(page); + folio_put(folio); return err; } /* * routine to check that the specified directory is empty (for rmdir) */ -int ext2_empty_dir (struct inode * inode) +int ext2_empty_dir(struct inode *inode) { - struct page *page; + struct folio *folio; char *kaddr; unsigned long i, npages = dir_pages(inode); for (i = 0; i < npages; i++) { ext2_dirent *de; - kaddr = ext2_get_page(inode, i, 0, &page); + kaddr = ext2_get_folio(inode, i, 0, &folio); if (IS_ERR(kaddr)) return 0; @@ -695,12 +695,12 @@ int ext2_empty_dir (struct inode * inode) } de = ext2_next_entry(de); } - ext2_put_page(page, kaddr); + folio_release_kmap(folio, kaddr); } return 1; not_empty: - ext2_put_page(page, kaddr); + folio_release_kmap(folio, kaddr); return 0; } diff --git a/fs/ext2/ext2.h b/fs/ext2/ext2.h index 35a041c47c38..677a9ad45dcb 100644 --- a/fs/ext2/ext2.h +++ b/fs/ext2/ext2.h @@ -399,6 +399,12 @@ struct ext2_inode { #define EXT2_ERRORS_DEFAULT EXT2_ERRORS_CONTINUE /* + * Allocation flags + */ +#define EXT2_ALLOC_NORESERVE 0x1 /* Do not use reservation + * window for allocation */ + +/* * Structure of the super block */ struct ext2_super_block { @@ -695,13 +701,11 @@ static inline struct ext2_inode_info *EXT2_I(struct inode *inode) /* balloc.c */ extern int ext2_bg_has_super(struct super_block *sb, int group); extern unsigned long ext2_bg_num_gdb(struct super_block *sb, int group); -extern ext2_fsblk_t ext2_new_block(struct inode *, unsigned long, int *); -extern ext2_fsblk_t ext2_new_blocks(struct inode *, unsigned long, - unsigned long *, int *); +extern ext2_fsblk_t ext2_new_blocks(struct inode *, ext2_fsblk_t, + unsigned long *, int *, unsigned int); extern int ext2_data_block_valid(struct ext2_sb_info *sbi, ext2_fsblk_t start_blk, unsigned int count); -extern void ext2_free_blocks (struct inode *, unsigned long, - unsigned long); +extern void ext2_free_blocks(struct inode *, ext2_fsblk_t, unsigned long); extern unsigned long ext2_count_free_blocks (struct super_block *); extern unsigned long ext2_count_dirs (struct super_block *); extern struct ext2_group_desc * ext2_get_group_desc(struct super_block * sb, @@ -713,22 +717,17 @@ extern void ext2_init_block_alloc_info(struct inode *); extern void ext2_rsv_window_add(struct super_block *sb, struct ext2_reserve_window_node *rsv); /* dir.c */ -extern int ext2_add_link (struct dentry *, struct inode *); -extern int ext2_inode_by_name(struct inode *dir, +int ext2_add_link(struct dentry *, struct inode *); +int ext2_inode_by_name(struct inode *dir, const struct qstr *child, ino_t *ino); -extern int ext2_make_empty(struct inode *, struct inode *); -extern struct ext2_dir_entry_2 *ext2_find_entry(struct inode *, const struct qstr *, - struct page **); -extern int ext2_delete_entry(struct ext2_dir_entry_2 *dir, struct page *page); -extern int ext2_empty_dir (struct inode *); -extern struct ext2_dir_entry_2 *ext2_dotdot(struct inode *dir, struct page **p); +int ext2_make_empty(struct inode *, struct inode *); +struct ext2_dir_entry_2 *ext2_find_entry(struct inode *, const struct qstr *, + struct folio **foliop); +int ext2_delete_entry(struct ext2_dir_entry_2 *dir, struct folio *folio); +int ext2_empty_dir(struct inode *); +struct ext2_dir_entry_2 *ext2_dotdot(struct inode *dir, struct folio **foliop); int ext2_set_link(struct inode *dir, struct ext2_dir_entry_2 *de, - struct page *page, struct inode *inode, bool update_times); -static inline void ext2_put_page(struct page *page, void *page_addr) -{ - kunmap_local(page_addr); - put_page(page); -} + struct folio *folio, struct inode *inode, bool update_times); /* ialloc.c */ extern struct inode * ext2_new_inode (struct inode *, umode_t, const struct qstr *); diff --git a/fs/ext2/file.c b/fs/ext2/file.c index 0b4c91c62e1f..4ddc36f4dbd4 100644 --- a/fs/ext2/file.c +++ b/fs/ext2/file.c @@ -103,7 +103,7 @@ static vm_fault_t ext2_dax_fault(struct vm_fault *vmf) } filemap_invalidate_lock_shared(inode->i_mapping); - ret = dax_iomap_fault(vmf, PE_SIZE_PTE, NULL, NULL, &ext2_iomap_ops); + ret = dax_iomap_fault(vmf, 0, NULL, NULL, &ext2_iomap_ops); filemap_invalidate_unlock_shared(inode->i_mapping); if (write) @@ -258,7 +258,6 @@ static ssize_t ext2_dio_write_iter(struct kiocb *iocb, struct iov_iter *from) goto out_unlock; } - iocb->ki_pos += status; ret += status; endbyte = pos + status - 1; ret2 = filemap_write_and_wait_range(inode->i_mapping, pos, diff --git a/fs/ext2/ialloc.c b/fs/ext2/ialloc.c index a4e1d7a9c544..fdf63e9c6e7c 100644 --- a/fs/ext2/ialloc.c +++ b/fs/ext2/ialloc.c @@ -273,7 +273,6 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent) if ((parent == d_inode(sb->s_root)) || (EXT2_I(parent)->i_flags & EXT2_TOPDIR_FL)) { - struct ext2_group_desc *best_desc = NULL; int best_ndir = inodes_per_group; int best_group = -1; @@ -291,10 +290,8 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent) continue; best_group = group; best_ndir = le16_to_cpu(desc->bg_used_dirs_count); - best_desc = desc; } if (best_group >= 0) { - desc = best_desc; group = best_group; goto found; } @@ -549,7 +546,7 @@ got: inode->i_ino = ino; inode->i_blocks = 0; - inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode); + simple_inode_init_ts(inode); memset(ei->i_data, 0, sizeof(ei->i_data)); ei->i_flags = ext2_mask_flags(mode, EXT2_I(dir)->i_flags & EXT2_FL_INHERITED); diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c index 75983215c7a1..464faf6c217e 100644 --- a/fs/ext2/inode.c +++ b/fs/ext2/inode.c @@ -385,12 +385,16 @@ ext2_blks_to_allocate(Indirect * branch, int k, unsigned long blks, } /** - * ext2_alloc_blocks: multiple allocate blocks needed for a branch - * @indirect_blks: the number of blocks need to allocate for indirect - * blocks - * @blks: the number of blocks need to allocate for direct blocks - * @new_blocks: on return it will store the new block numbers for - * the indirect blocks(if needed) and the first direct block, + * ext2_alloc_blocks: Allocate multiple blocks needed for a branch. + * @inode: Owner. + * @goal: Preferred place for allocation. + * @indirect_blks: The number of blocks needed to allocate for indirect blocks. + * @blks: The number of blocks need to allocate for direct blocks. + * @new_blocks: On return it will store the new block numbers for + * the indirect blocks(if needed) and the first direct block. + * @err: Error pointer. + * + * Return: Number of blocks allocated. */ static int ext2_alloc_blocks(struct inode *inode, ext2_fsblk_t goal, int indirect_blks, int blks, @@ -415,7 +419,7 @@ static int ext2_alloc_blocks(struct inode *inode, while (1) { count = target; /* allocating blocks for indirect blocks and direct blocks */ - current_block = ext2_new_blocks(inode,goal,&count,err); + current_block = ext2_new_blocks(inode, goal, &count, err, 0); if (*err) goto failed_out; @@ -595,7 +599,7 @@ static void ext2_splice_branch(struct inode *inode, if (where->bh) mark_buffer_dirty_inode(where->bh, inode); - inode->i_ctime = current_time(inode); + inode_set_ctime_current(inode); mark_inode_dirty(inode); } @@ -1082,8 +1086,8 @@ no_top: */ static inline void ext2_free_data(struct inode *inode, __le32 *p, __le32 *q) { - unsigned long block_to_free = 0, count = 0; - unsigned long nr; + ext2_fsblk_t block_to_free = 0, count = 0; + ext2_fsblk_t nr; for ( ; p < q ; p++) { nr = le32_to_cpu(*p); @@ -1123,7 +1127,7 @@ static inline void ext2_free_data(struct inode *inode, __le32 *p, __le32 *q) static void ext2_free_branches(struct inode *inode, __le32 *p, __le32 *q, int depth) { struct buffer_head * bh; - unsigned long nr; + ext2_fsblk_t nr; if (depth--) { int addr_per_block = EXT2_ADDR_PER_BLOCK(inode->i_sb); @@ -1287,7 +1291,7 @@ static int ext2_setsize(struct inode *inode, loff_t newsize) __ext2_truncate_blocks(inode, newsize); filemap_invalidate_unlock(inode->i_mapping); - inode->i_mtime = inode->i_ctime = current_time(inode); + inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); if (inode_needs_sync(inode)) { sync_mapping_buffers(inode->i_mapping); sync_inode_metadata(inode, 1); @@ -1408,10 +1412,9 @@ struct inode *ext2_iget (struct super_block *sb, unsigned long ino) i_gid_write(inode, i_gid); set_nlink(inode, le16_to_cpu(raw_inode->i_links_count)); inode->i_size = le32_to_cpu(raw_inode->i_size); - inode->i_atime.tv_sec = (signed)le32_to_cpu(raw_inode->i_atime); - inode->i_ctime.tv_sec = (signed)le32_to_cpu(raw_inode->i_ctime); - inode->i_mtime.tv_sec = (signed)le32_to_cpu(raw_inode->i_mtime); - inode->i_atime.tv_nsec = inode->i_mtime.tv_nsec = inode->i_ctime.tv_nsec = 0; + inode_set_atime(inode, (signed)le32_to_cpu(raw_inode->i_atime), 0); + inode_set_ctime(inode, (signed)le32_to_cpu(raw_inode->i_ctime), 0); + inode_set_mtime(inode, (signed)le32_to_cpu(raw_inode->i_mtime), 0); ei->i_dtime = le32_to_cpu(raw_inode->i_dtime); /* We now have enough fields to check if the inode was active or not. * This is needed because nfsd might try to access dead inodes @@ -1540,9 +1543,9 @@ static int __ext2_write_inode(struct inode *inode, int do_sync) } raw_inode->i_links_count = cpu_to_le16(inode->i_nlink); raw_inode->i_size = cpu_to_le32(inode->i_size); - raw_inode->i_atime = cpu_to_le32(inode->i_atime.tv_sec); - raw_inode->i_ctime = cpu_to_le32(inode->i_ctime.tv_sec); - raw_inode->i_mtime = cpu_to_le32(inode->i_mtime.tv_sec); + raw_inode->i_atime = cpu_to_le32(inode_get_atime_sec(inode)); + raw_inode->i_ctime = cpu_to_le32(inode_get_ctime_sec(inode)); + raw_inode->i_mtime = cpu_to_le32(inode_get_mtime_sec(inode)); raw_inode->i_blocks = cpu_to_le32(inode->i_blocks); raw_inode->i_dtime = cpu_to_le32(ei->i_dtime); @@ -1628,7 +1631,7 @@ int ext2_getattr(struct mnt_idmap *idmap, const struct path *path, STATX_ATTR_IMMUTABLE | STATX_ATTR_NODUMP); - generic_fillattr(&nop_mnt_idmap, inode, stat); + generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat); return 0; } diff --git a/fs/ext2/ioctl.c b/fs/ext2/ioctl.c index cc87d413eb43..44e04484e570 100644 --- a/fs/ext2/ioctl.c +++ b/fs/ext2/ioctl.c @@ -44,7 +44,7 @@ int ext2_fileattr_set(struct mnt_idmap *idmap, (fa->flags & EXT2_FL_USER_MODIFIABLE); ext2_set_inode_flags(inode); - inode->i_ctime = current_time(inode); + inode_set_ctime_current(inode); mark_inode_dirty(inode); return 0; @@ -77,7 +77,7 @@ long ext2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) } inode_lock(inode); - inode->i_ctime = current_time(inode); + inode_set_ctime_current(inode); inode->i_generation = generation; inode_unlock(inode); diff --git a/fs/ext2/namei.c b/fs/ext2/namei.c index 937dd8f60f96..65f702b1da5b 100644 --- a/fs/ext2/namei.c +++ b/fs/ext2/namei.c @@ -211,7 +211,7 @@ static int ext2_link (struct dentry * old_dentry, struct inode * dir, if (err) return err; - inode->i_ctime = current_time(inode); + inode_set_ctime_current(inode); inode_inc_link_count(inode); ihold(inode); @@ -273,25 +273,25 @@ static int ext2_unlink(struct inode *dir, struct dentry *dentry) { struct inode *inode = d_inode(dentry); struct ext2_dir_entry_2 *de; - struct page *page; + struct folio *folio; int err; err = dquot_initialize(dir); if (err) goto out; - de = ext2_find_entry(dir, &dentry->d_name, &page); + de = ext2_find_entry(dir, &dentry->d_name, &folio); if (IS_ERR(de)) { err = PTR_ERR(de); goto out; } - err = ext2_delete_entry(de, page); - ext2_put_page(page, de); + err = ext2_delete_entry(de, folio); + folio_release_kmap(folio, de); if (err) goto out; - inode->i_ctime = dir->i_ctime; + inode_set_ctime_to_ts(inode, inode_get_ctime(dir)); inode_dec_link_count(inode); err = 0; out: @@ -321,9 +321,9 @@ static int ext2_rename (struct mnt_idmap * idmap, { struct inode * old_inode = d_inode(old_dentry); struct inode * new_inode = d_inode(new_dentry); - struct page * dir_page = NULL; + struct folio *dir_folio = NULL; struct ext2_dir_entry_2 * dir_de = NULL; - struct page * old_page; + struct folio * old_folio; struct ext2_dir_entry_2 * old_de; int err; @@ -338,19 +338,19 @@ static int ext2_rename (struct mnt_idmap * idmap, if (err) return err; - old_de = ext2_find_entry(old_dir, &old_dentry->d_name, &old_page); + old_de = ext2_find_entry(old_dir, &old_dentry->d_name, &old_folio); if (IS_ERR(old_de)) return PTR_ERR(old_de); if (S_ISDIR(old_inode->i_mode)) { err = -EIO; - dir_de = ext2_dotdot(old_inode, &dir_page); + dir_de = ext2_dotdot(old_inode, &dir_folio); if (!dir_de) goto out_old; } if (new_inode) { - struct page *new_page; + struct folio *new_folio; struct ext2_dir_entry_2 *new_de; err = -ENOTEMPTY; @@ -358,16 +358,16 @@ static int ext2_rename (struct mnt_idmap * idmap, goto out_dir; new_de = ext2_find_entry(new_dir, &new_dentry->d_name, - &new_page); + &new_folio); if (IS_ERR(new_de)) { err = PTR_ERR(new_de); goto out_dir; } - err = ext2_set_link(new_dir, new_de, new_page, old_inode, true); - ext2_put_page(new_page, new_de); + err = ext2_set_link(new_dir, new_de, new_folio, old_inode, true); + folio_release_kmap(new_folio, new_de); if (err) goto out_dir; - new_inode->i_ctime = current_time(new_inode); + inode_set_ctime_current(new_inode); if (dir_de) drop_nlink(new_inode); inode_dec_link_count(new_inode); @@ -383,22 +383,22 @@ static int ext2_rename (struct mnt_idmap * idmap, * Like most other Unix systems, set the ctime for inodes on a * rename. */ - old_inode->i_ctime = current_time(old_inode); + inode_set_ctime_current(old_inode); mark_inode_dirty(old_inode); - err = ext2_delete_entry(old_de, old_page); + err = ext2_delete_entry(old_de, old_folio); if (!err && dir_de) { if (old_dir != new_dir) - err = ext2_set_link(old_inode, dir_de, dir_page, + err = ext2_set_link(old_inode, dir_de, dir_folio, new_dir, false); inode_dec_link_count(old_dir); } out_dir: if (dir_de) - ext2_put_page(dir_page, dir_de); + folio_release_kmap(dir_folio, dir_de); out_old: - ext2_put_page(old_page, old_de); + folio_release_kmap(old_folio, old_de); return err; } diff --git a/fs/ext2/super.c b/fs/ext2/super.c index 2959afc7541c..01f9addc8b1f 100644 --- a/fs/ext2/super.c +++ b/fs/ext2/super.c @@ -397,6 +397,7 @@ static struct dentry *ext2_fh_to_parent(struct super_block *sb, struct fid *fid, } static const struct export_operations ext2_export_ops = { + .encode_fh = generic_encode_ino32_fh, .fh_to_dentry = ext2_fh_to_dentry, .fh_to_parent = ext2_fh_to_parent, .get_parent = ext2_get_parent, @@ -1572,7 +1573,7 @@ out: if (inode->i_size < off+len-towrite) i_size_write(inode, off+len-towrite); inode_inc_iversion(inode); - inode->i_mtime = inode->i_ctime = current_time(inode); + inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); mark_inode_dirty(inode); return len - towrite; } diff --git a/fs/ext2/xattr.c b/fs/ext2/xattr.c index 8906ba479aaf..e849241ebb8f 100644 --- a/fs/ext2/xattr.c +++ b/fs/ext2/xattr.c @@ -98,7 +98,7 @@ static struct buffer_head *ext2_xattr_cache_find(struct inode *, static void ext2_xattr_rehash(struct ext2_xattr_header *, struct ext2_xattr_entry *); -static const struct xattr_handler *ext2_xattr_handler_map[] = { +static const struct xattr_handler * const ext2_xattr_handler_map[] = { [EXT2_XATTR_INDEX_USER] = &ext2_xattr_user_handler, #ifdef CONFIG_EXT2_FS_POSIX_ACL [EXT2_XATTR_INDEX_POSIX_ACL_ACCESS] = &nop_posix_acl_access, @@ -110,7 +110,7 @@ static const struct xattr_handler *ext2_xattr_handler_map[] = { #endif }; -const struct xattr_handler *ext2_xattr_handlers[] = { +const struct xattr_handler * const ext2_xattr_handlers[] = { &ext2_xattr_user_handler, &ext2_xattr_trusted_handler, #ifdef CONFIG_EXT2_FS_SECURITY @@ -742,10 +742,13 @@ ext2_xattr_set2(struct inode *inode, struct buffer_head *old_bh, /* We need to allocate a new block */ ext2_fsblk_t goal = ext2_group_first_block_no(sb, EXT2_I(inode)->i_block_group); - int block = ext2_new_block(inode, goal, &error); + unsigned long count = 1; + ext2_fsblk_t block = ext2_new_blocks(inode, goal, + &count, &error, + EXT2_ALLOC_NORESERVE); if (error) goto cleanup; - ea_idebug(inode, "creating block %d", block); + ea_idebug(inode, "creating block %lu", block); new_bh = sb_getblk(sb, block); if (unlikely(!new_bh)) { @@ -773,7 +776,7 @@ ext2_xattr_set2(struct inode *inode, struct buffer_head *old_bh, /* Update the inode. */ EXT2_I(inode)->i_file_acl = new_bh ? new_bh->b_blocknr : 0; - inode->i_ctime = current_time(inode); + inode_set_ctime_current(inode); if (IS_SYNC(inode)) { error = sync_inode_metadata(inode, 1); /* In case sync failed due to ENOSPC the inode was actually diff --git a/fs/ext2/xattr.h b/fs/ext2/xattr.h index 7925f596e8e2..6a4966949047 100644 --- a/fs/ext2/xattr.h +++ b/fs/ext2/xattr.h @@ -72,7 +72,7 @@ extern void ext2_xattr_delete_inode(struct inode *); extern struct mb_cache *ext2_xattr_create_cache(void); extern void ext2_xattr_destroy_cache(struct mb_cache *cache); -extern const struct xattr_handler *ext2_xattr_handlers[]; +extern const struct xattr_handler * const ext2_xattr_handlers[]; # else /* CONFIG_EXT2_FS_XATTR */ diff --git a/fs/ext4/Kconfig b/fs/ext4/Kconfig index 86699c8cab28..e20d59221fc0 100644 --- a/fs/ext4/Kconfig +++ b/fs/ext4/Kconfig @@ -28,6 +28,7 @@ config EXT3_FS_SECURITY config EXT4_FS tristate "The Extended 4 (ext4) filesystem" + select BUFFER_HEAD select JBD2 select CRC16 select CRYPTO diff --git a/fs/ext4/acl.c b/fs/ext4/acl.c index 27fcbddfb148..3bffe862f954 100644 --- a/fs/ext4/acl.c +++ b/fs/ext4/acl.c @@ -259,7 +259,7 @@ retry: error = __ext4_set_acl(handle, inode, type, acl, 0 /* xattr_flags */); if (!error && update_mode) { inode->i_mode = mode; - inode->i_ctime = current_time(inode); + inode_set_ctime_current(inode); error = ext4_mark_inode_dirty(handle, inode); } out_stop: diff --git a/fs/ext4/acl.h b/fs/ext4/acl.h index 0c5a79c3b5d4..ef4c19e5f570 100644 --- a/fs/ext4/acl.h +++ b/fs/ext4/acl.h @@ -68,6 +68,11 @@ extern int ext4_init_acl(handle_t *, struct inode *, struct inode *); static inline int ext4_init_acl(handle_t *handle, struct inode *inode, struct inode *dir) { + /* usually, the umask is applied by posix_acl_create(), but if + ext4 ACL support is disabled at compile time, we need to do + it here, because posix_acl_create() will never be called */ + inode->i_mode &= ~current_umask(); + return 0; } #endif /* CONFIG_EXT4_FS_POSIX_ACL */ diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index 1f72f977c6db..591fb3f710be 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c @@ -22,6 +22,7 @@ #include "mballoc.h" #include <trace/events/ext4.h> +#include <kunit/static_stub.h> static unsigned ext4_num_base_meta_clusters(struct super_block *sb, ext4_group_t block_group); @@ -111,10 +112,8 @@ static unsigned ext4_num_overhead_clusters(struct super_block *sb, itbl_blk_start = ext4_inode_table(sb, gdp); itbl_blk_end = itbl_blk_start + sbi->s_itb_per_group - 1; if (itbl_blk_start <= end && itbl_blk_end >= start) { - itbl_blk_start = itbl_blk_start >= start ? - itbl_blk_start : start; - itbl_blk_end = itbl_blk_end <= end ? - itbl_blk_end : end; + itbl_blk_start = max(itbl_blk_start, start); + itbl_blk_end = min(itbl_blk_end, end); itbl_cluster_start = EXT4_B2C(sbi, itbl_blk_start - start); itbl_cluster_end = EXT4_B2C(sbi, itbl_blk_end - start); @@ -274,6 +273,9 @@ struct ext4_group_desc * ext4_get_group_desc(struct super_block *sb, struct ext4_sb_info *sbi = EXT4_SB(sb); struct buffer_head *bh_p; + KUNIT_STATIC_STUB_REDIRECT(ext4_get_group_desc, + sb, block_group, bh); + if (block_group >= ngroups) { ext4_error(sb, "block_group >= groups_count - block_group = %u," " groups_count = %u", block_group, ngroups); @@ -468,6 +470,9 @@ ext4_read_block_bitmap_nowait(struct super_block *sb, ext4_group_t block_group, ext4_fsblk_t bitmap_blk; int err; + KUNIT_STATIC_STUB_REDIRECT(ext4_read_block_bitmap_nowait, + sb, block_group, ignore_locked); + desc = ext4_get_group_desc(sb, block_group, NULL); if (!desc) return ERR_PTR(-EFSCORRUPTED); @@ -563,6 +568,9 @@ int ext4_wait_block_bitmap(struct super_block *sb, ext4_group_t block_group, { struct ext4_group_desc *desc; + KUNIT_STATIC_STUB_REDIRECT(ext4_wait_block_bitmap, + sb, block_group, bh); + if (!buffer_new(bh)) return 0; desc = ext4_get_group_desc(sb, block_group, NULL); @@ -913,11 +921,11 @@ unsigned long ext4_bg_num_gdb(struct super_block *sb, ext4_group_t group) } /* - * This function returns the number of file system metadata clusters at + * This function returns the number of file system metadata blocks at * the beginning of a block group, including the reserved gdt blocks. */ -static unsigned ext4_num_base_meta_clusters(struct super_block *sb, - ext4_group_t block_group) +unsigned int ext4_num_base_meta_blocks(struct super_block *sb, + ext4_group_t block_group) { struct ext4_sb_info *sbi = EXT4_SB(sb); unsigned num; @@ -935,8 +943,15 @@ static unsigned ext4_num_base_meta_clusters(struct super_block *sb, } else { /* For META_BG_BLOCK_GROUPS */ num += ext4_bg_num_gdb_meta(sb, block_group); } - return EXT4_NUM_B2C(sbi, num); + return num; } + +static unsigned int ext4_num_base_meta_clusters(struct super_block *sb, + ext4_group_t block_group) +{ + return EXT4_NUM_B2C(EXT4_SB(sb), ext4_num_base_meta_blocks(sb, block_group)); +} + /** * ext4_inode_to_goal_block - return a hint for block allocation * @inode: inode for block allocation diff --git a/fs/ext4/block_validity.c b/fs/ext4/block_validity.c index 5504f72bbbbe..6fe3c941b565 100644 --- a/fs/ext4/block_validity.c +++ b/fs/ext4/block_validity.c @@ -215,7 +215,6 @@ int ext4_setup_system_zone(struct super_block *sb) struct ext4_system_blocks *system_blks; struct ext4_group_desc *gdp; ext4_group_t i; - int flex_size = ext4_flex_bg_size(sbi); int ret; system_blks = kzalloc(sizeof(*system_blks), GFP_KERNEL); @@ -223,12 +222,13 @@ int ext4_setup_system_zone(struct super_block *sb) return -ENOMEM; for (i=0; i < ngroups; i++) { + unsigned int meta_blks = ext4_num_base_meta_blocks(sb, i); + cond_resched(); - if (ext4_bg_has_super(sb, i) && - ((i < 5) || ((i % flex_size) == 0))) { + if (meta_blks != 0) { ret = add_system_zone(system_blks, ext4_group_first_block_no(sb, i), - ext4_bg_num_gdb(sb, i) + 1, 0); + meta_blks, 0); if (ret) goto err; } diff --git a/fs/ext4/crypto.c b/fs/ext4/crypto.c index e20ac0654b3f..7ae0b61258a7 100644 --- a/fs/ext4/crypto.c +++ b/fs/ext4/crypto.c @@ -33,6 +33,8 @@ int ext4_fname_setup_filename(struct inode *dir, const struct qstr *iname, #if IS_ENABLED(CONFIG_UNICODE) err = ext4_fname_setup_ci_filename(dir, iname, fname); + if (err) + ext4_fname_free_filename(fname); #endif return err; } @@ -51,6 +53,8 @@ int ext4_fname_prepare_lookup(struct inode *dir, struct dentry *dentry, #if IS_ENABLED(CONFIG_UNICODE) err = ext4_fname_setup_ci_filename(dir, &dentry->d_name, fname); + if (err) + ext4_fname_free_filename(fname); #endif return err; } @@ -228,19 +232,14 @@ static bool ext4_has_stable_inodes(struct super_block *sb) return ext4_has_feature_stable_inodes(sb); } -static void ext4_get_ino_and_lblk_bits(struct super_block *sb, - int *ino_bits_ret, int *lblk_bits_ret) -{ - *ino_bits_ret = 8 * sizeof(EXT4_SB(sb)->s_es->s_inodes_count); - *lblk_bits_ret = 8 * sizeof(ext4_lblk_t); -} - const struct fscrypt_operations ext4_cryptops = { - .key_prefix = "ext4:", + .needs_bounce_pages = 1, + .has_32bit_inodes = 1, + .supports_subblock_data_units = 1, + .legacy_key_prefix = "ext4:", .get_context = ext4_get_context, .set_context = ext4_set_context, .get_dummy_policy = ext4_get_dummy_policy, .empty_dir = ext4_empty_dir, .has_stable_inodes = ext4_has_stable_inodes, - .get_ino_and_lblk_bits = ext4_get_ino_and_lblk_bits, }; diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 0a2d55faa095..a5d784872303 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -176,9 +176,6 @@ enum criteria { EXT4_MB_NUM_CRS }; -/* criteria below which we use fast block scanning and avoid unnecessary IO */ -#define CR_FAST CR_GOAL_LEN_SLOW - /* * Flags used in mballoc's allocation_context flags field. * @@ -868,64 +865,80 @@ struct ext4_inode { * affected filesystem before 2242. */ -static inline __le32 ext4_encode_extra_time(struct timespec64 *time) +static inline __le32 ext4_encode_extra_time(struct timespec64 ts) { - u32 extra =((time->tv_sec - (s32)time->tv_sec) >> 32) & EXT4_EPOCH_MASK; - return cpu_to_le32(extra | (time->tv_nsec << EXT4_EPOCH_BITS)); + u32 extra = ((ts.tv_sec - (s32)ts.tv_sec) >> 32) & EXT4_EPOCH_MASK; + return cpu_to_le32(extra | (ts.tv_nsec << EXT4_EPOCH_BITS)); } -static inline void ext4_decode_extra_time(struct timespec64 *time, - __le32 extra) +static inline struct timespec64 ext4_decode_extra_time(__le32 base, + __le32 extra) { + struct timespec64 ts = { .tv_sec = (signed)le32_to_cpu(base) }; + if (unlikely(extra & cpu_to_le32(EXT4_EPOCH_MASK))) - time->tv_sec += (u64)(le32_to_cpu(extra) & EXT4_EPOCH_MASK) << 32; - time->tv_nsec = (le32_to_cpu(extra) & EXT4_NSEC_MASK) >> EXT4_EPOCH_BITS; + ts.tv_sec += (u64)(le32_to_cpu(extra) & EXT4_EPOCH_MASK) << 32; + ts.tv_nsec = (le32_to_cpu(extra) & EXT4_NSEC_MASK) >> EXT4_EPOCH_BITS; + return ts; } -#define EXT4_INODE_SET_XTIME(xtime, inode, raw_inode) \ +#define EXT4_INODE_SET_XTIME_VAL(xtime, inode, raw_inode, ts) \ do { \ - if (EXT4_FITS_IN_INODE(raw_inode, EXT4_I(inode), xtime ## _extra)) {\ - (raw_inode)->xtime = cpu_to_le32((inode)->xtime.tv_sec); \ - (raw_inode)->xtime ## _extra = \ - ext4_encode_extra_time(&(inode)->xtime); \ - } \ - else \ - (raw_inode)->xtime = cpu_to_le32(clamp_t(int32_t, (inode)->xtime.tv_sec, S32_MIN, S32_MAX)); \ + if (EXT4_FITS_IN_INODE(raw_inode, EXT4_I(inode), xtime ## _extra)) { \ + (raw_inode)->xtime = cpu_to_le32((ts).tv_sec); \ + (raw_inode)->xtime ## _extra = ext4_encode_extra_time(ts); \ + } else \ + (raw_inode)->xtime = cpu_to_le32(clamp_t(int32_t, (ts).tv_sec, S32_MIN, S32_MAX)); \ } while (0) -#define EXT4_EINODE_SET_XTIME(xtime, einode, raw_inode) \ -do { \ - if (EXT4_FITS_IN_INODE(raw_inode, einode, xtime)) \ - (raw_inode)->xtime = cpu_to_le32((einode)->xtime.tv_sec); \ - if (EXT4_FITS_IN_INODE(raw_inode, einode, xtime ## _extra)) \ - (raw_inode)->xtime ## _extra = \ - ext4_encode_extra_time(&(einode)->xtime); \ +#define EXT4_INODE_SET_ATIME(inode, raw_inode) \ + EXT4_INODE_SET_XTIME_VAL(i_atime, inode, raw_inode, inode_get_atime(inode)) + +#define EXT4_INODE_SET_MTIME(inode, raw_inode) \ + EXT4_INODE_SET_XTIME_VAL(i_mtime, inode, raw_inode, inode_get_mtime(inode)) + +#define EXT4_INODE_SET_CTIME(inode, raw_inode) \ + EXT4_INODE_SET_XTIME_VAL(i_ctime, inode, raw_inode, inode_get_ctime(inode)) + +#define EXT4_EINODE_SET_XTIME(xtime, einode, raw_inode) \ + if (EXT4_FITS_IN_INODE(raw_inode, einode, xtime)) \ + EXT4_INODE_SET_XTIME_VAL(xtime, &((einode)->vfs_inode), \ + raw_inode, (einode)->xtime) + +#define EXT4_INODE_GET_XTIME_VAL(xtime, inode, raw_inode) \ + (EXT4_FITS_IN_INODE(raw_inode, EXT4_I(inode), xtime ## _extra) ? \ + ext4_decode_extra_time((raw_inode)->xtime, \ + (raw_inode)->xtime ## _extra) : \ + (struct timespec64) { \ + .tv_sec = (signed)le32_to_cpu((raw_inode)->xtime) \ + }) + +#define EXT4_INODE_GET_ATIME(inode, raw_inode) \ +do { \ + inode_set_atime_to_ts(inode, \ + EXT4_INODE_GET_XTIME_VAL(i_atime, inode, raw_inode)); \ } while (0) -#define EXT4_INODE_GET_XTIME(xtime, inode, raw_inode) \ +#define EXT4_INODE_GET_MTIME(inode, raw_inode) \ do { \ - (inode)->xtime.tv_sec = (signed)le32_to_cpu((raw_inode)->xtime); \ - if (EXT4_FITS_IN_INODE(raw_inode, EXT4_I(inode), xtime ## _extra)) { \ - ext4_decode_extra_time(&(inode)->xtime, \ - raw_inode->xtime ## _extra); \ - } \ - else \ - (inode)->xtime.tv_nsec = 0; \ + inode_set_mtime_to_ts(inode, \ + EXT4_INODE_GET_XTIME_VAL(i_mtime, inode, raw_inode)); \ } while (0) +#define EXT4_INODE_GET_CTIME(inode, raw_inode) \ +do { \ + inode_set_ctime_to_ts(inode, \ + EXT4_INODE_GET_XTIME_VAL(i_ctime, inode, raw_inode)); \ +} while (0) -#define EXT4_EINODE_GET_XTIME(xtime, einode, raw_inode) \ -do { \ - if (EXT4_FITS_IN_INODE(raw_inode, einode, xtime)) \ - (einode)->xtime.tv_sec = \ - (signed)le32_to_cpu((raw_inode)->xtime); \ - else \ - (einode)->xtime.tv_sec = 0; \ - if (EXT4_FITS_IN_INODE(raw_inode, einode, xtime ## _extra)) \ - ext4_decode_extra_time(&(einode)->xtime, \ - raw_inode->xtime ## _extra); \ - else \ - (einode)->xtime.tv_nsec = 0; \ +#define EXT4_EINODE_GET_XTIME(xtime, einode, raw_inode) \ +do { \ + if (EXT4_FITS_IN_INODE(raw_inode, einode, xtime)) \ + (einode)->xtime = \ + EXT4_INODE_GET_XTIME_VAL(xtime, &(einode->vfs_inode), \ + raw_inode); \ + else \ + (einode)->xtime = (struct timespec64){0, 0}; \ } while (0) #define i_disk_version osd1.linux1.l_i_version @@ -1235,6 +1248,7 @@ struct ext4_inode_info { #define EXT4_MOUNT2_MB_OPTIMIZE_SCAN 0x00000080 /* Optimize group * scanning in mballoc */ +#define EXT4_MOUNT2_ABORT 0x00000100 /* Abort filesystem */ #define clear_opt(sb, opt) EXT4_SB(sb)->s_mount_opt &= \ ~EXT4_MOUNT_##opt @@ -1252,10 +1266,8 @@ struct ext4_inode_info { #define ext4_test_and_set_bit __test_and_set_bit_le #define ext4_set_bit __set_bit_le -#define ext4_set_bit_atomic ext2_set_bit_atomic #define ext4_test_and_clear_bit __test_and_clear_bit_le #define ext4_clear_bit __clear_bit_le -#define ext4_clear_bit_atomic ext2_clear_bit_atomic #define ext4_test_bit test_bit_le #define ext4_find_next_zero_bit find_next_zero_bit_le #define ext4_find_next_bit find_next_bit_le @@ -1492,6 +1504,7 @@ struct ext4_sb_info { loff_t s_bitmap_maxbytes; /* max bytes for bitmap files */ struct buffer_head * s_sbh; /* Buffer containing the super block */ struct ext4_super_block *s_es; /* Pointer to the super block in the buffer */ + /* Array of bh's for the block group descriptors */ struct buffer_head * __rcu *s_group_desc; unsigned int s_mount_opt; unsigned int s_mount_opt2; @@ -1535,7 +1548,7 @@ struct ext4_sb_info { unsigned long s_commit_interval; u32 s_max_batch_time; u32 s_min_batch_time; - struct block_device *s_journal_bdev; + struct bdev_handle *s_journal_bdev_handle; #ifdef CONFIG_QUOTA /* Names of quota files with journalled quota */ char __rcu *s_qf_names[EXT4_MAXQUOTAS]; @@ -1562,7 +1575,7 @@ struct ext4_sb_info { unsigned int *s_mb_maxs; unsigned int s_group_info_size; unsigned int s_mb_free_pending; - struct list_head s_freed_data_list; /* List of blocks to be freed + struct list_head s_freed_data_list[2]; /* List of blocks to be freed after commit completed */ struct list_head s_discard_list; struct work_struct s_discard_work; @@ -1651,7 +1664,7 @@ struct ext4_sb_info { __u32 s_csum_seed; /* Reclaim extents from extent status tree */ - struct shrinker s_es_shrinker; + struct shrinker *s_es_shrinker; struct list_head s_es_list; /* List of inodes with reclaimable extents */ long s_es_nr_inode; struct ext4_es_stats s_es_stats; @@ -1674,7 +1687,8 @@ struct ext4_sb_info { /* * Barrier between writepages ops and changing any inode's JOURNAL_DATA - * or EXTENTS flag. + * or EXTENTS flag or between writepages ops and changing DELALLOC or + * DIOREAD_NOLOCK mount options on remount. */ struct percpu_rw_semaphore s_writepages_rwsem; struct dax_device *s_daxdev; @@ -1702,10 +1716,13 @@ struct ext4_sb_info { const char *s_last_error_func; time64_t s_last_error_time; /* - * If we are in a context where we cannot update error information in - * the on-disk superblock, we queue this work to do it. + * If we are in a context where we cannot update the on-disk + * superblock, we queue the work here. This is used to update + * the error information in the superblock, and for periodic + * updates of the superblock called from the commit callback + * function. */ - struct work_struct s_error_work; + struct work_struct s_sb_upd_work; /* Ext4 fast commit sub transaction ID */ atomic_t s_fc_subtid; @@ -1798,7 +1815,6 @@ static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino) */ enum { EXT4_MF_MNTDIR_SAMPLED, - EXT4_MF_FS_ABORTED, /* Fatal error detected */ EXT4_MF_FC_INELIGIBLE /* Fast commit ineligible */ }; @@ -2222,9 +2238,9 @@ extern int ext4_feature_set_ok(struct super_block *sb, int readonly); #define EXT4_FLAGS_SHUTDOWN 1 #define EXT4_FLAGS_BDEV_IS_DAX 2 -static inline int ext4_forced_shutdown(struct ext4_sb_info *sbi) +static inline int ext4_forced_shutdown(struct super_block *sb) { - return test_bit(EXT4_FLAGS_SHUTDOWN, &sbi->s_ext4_flags); + return test_bit(EXT4_FLAGS_SHUTDOWN, &EXT4_SB(sb)->s_ext4_flags); } /* @@ -2702,7 +2718,6 @@ extern ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode, extern int ext4_claim_free_clusters(struct ext4_sb_info *sbi, s64 nclusters, unsigned int flags); extern ext4_fsblk_t ext4_count_free_clusters(struct super_block *); -extern void ext4_check_blocks_bitmap(struct super_block *); extern struct ext4_group_desc * ext4_get_group_desc(struct super_block * sb, ext4_group_t block_group, struct buffer_head ** bh); @@ -2858,7 +2873,6 @@ extern void ext4_free_inode(handle_t *, struct inode *); extern struct inode * ext4_orphan_get(struct super_block *, unsigned long); extern unsigned long ext4_count_free_inodes(struct super_block *); extern unsigned long ext4_count_dirs(struct super_block *); -extern void ext4_check_inodes_bitmap(struct super_block *); extern void ext4_mark_bitmap_end(int start_bit, int end_bit, char *bitmap); extern int ext4_init_inode_table(struct super_block *sb, ext4_group_t group, int barrier); @@ -2901,7 +2915,6 @@ extern int ext4_mb_init(struct super_block *); extern int ext4_mb_release(struct super_block *); extern ext4_fsblk_t ext4_mb_new_blocks(handle_t *, struct ext4_allocation_request *, int *); -extern int ext4_mb_reserve_blocks(struct super_block *, int); extern void ext4_discard_preallocations(struct inode *, unsigned int); extern int __init ext4_init_mballoc(void); extern void ext4_exit_mballoc(void); @@ -2923,7 +2936,11 @@ extern int ext4_group_add_blocks(handle_t *handle, struct super_block *sb, extern int ext4_trim_fs(struct super_block *, struct fstrim_range *); extern void ext4_process_freed_data(struct super_block *sb, tid_t commit_tid); extern void ext4_mb_mark_bb(struct super_block *sb, ext4_fsblk_t block, - int len, int state); + int len, bool state); +static inline bool ext4_mb_cr_expensive(enum criteria cr) +{ + return cr >= CR_GOAL_LEN_SLOW; +} /* inode.c */ void ext4_inode_csum_set(struct inode *inode, struct ext4_inode *raw, @@ -2977,7 +2994,6 @@ extern void ext4_evict_inode(struct inode *); extern void ext4_clear_inode(struct inode *); extern int ext4_file_getattr(struct mnt_idmap *, const struct path *, struct kstat *, u32, unsigned int); -extern int ext4_sync_inode(handle_t *, struct inode *); extern void ext4_dirty_inode(struct inode *, int); extern int ext4_change_inode_journal_flag(struct inode *, int); extern int ext4_get_inode_loc(struct inode *, struct ext4_iloc *); @@ -3084,6 +3100,8 @@ extern const char *ext4_decode_error(struct super_block *sb, int errno, extern void ext4_mark_group_bitmap_corrupted(struct super_block *sb, ext4_group_t block_group, unsigned int flags); +extern unsigned int ext4_num_base_meta_blocks(struct super_block *sb, + ext4_group_t block_group); extern __printf(7, 8) void __ext4_error(struct super_block *, const char *, unsigned int, bool, @@ -3525,8 +3543,6 @@ extern loff_t ext4_llseek(struct file *file, loff_t offset, int origin); /* inline.c */ extern int ext4_get_max_inline_size(struct inode *inode); extern int ext4_find_inline_data_nolock(struct inode *inode); -extern int ext4_init_inline_data(handle_t *handle, struct inode *inode, - unsigned int len); extern int ext4_destroy_inline_data(handle_t *handle, struct inode *inode); int ext4_readpage_inline(struct inode *inode, struct folio *folio); @@ -3774,8 +3790,6 @@ static inline void set_bitmap_uptodate(struct buffer_head *bh) set_bit(BH_BITMAP_UPTODATE, &(bh)->b_state); } -#define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1) - /* For ioend & aio unwritten conversion wait queues */ #define EXT4_WQ_HASH_SZ 37 #define ext4_ioend_wq(v) (&ext4__ioend_wq[((unsigned long)(v)) %\ diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c index 77f318ec8abb..d1a2e6624401 100644 --- a/fs/ext4/ext4_jbd2.c +++ b/fs/ext4/ext4_jbd2.c @@ -67,11 +67,12 @@ static int ext4_journal_check_start(struct super_block *sb) might_sleep(); - if (unlikely(ext4_forced_shutdown(EXT4_SB(sb)))) + if (unlikely(ext4_forced_shutdown(sb))) return -EIO; - if (sb_rdonly(sb)) + if (WARN_ON_ONCE(sb_rdonly(sb))) return -EROFS; + WARN_ON(sb->s_writers.frozen == SB_FREEZE_COMPLETE); journal = EXT4_SB(sb)->s_journal; /* @@ -234,8 +235,7 @@ int __ext4_journal_get_write_access(const char *where, unsigned int line, might_sleep(); - if (bh->b_bdev->bd_super) - ext4_check_bdev_write_error(bh->b_bdev->bd_super); + ext4_check_bdev_write_error(sb); if (ext4_handle_valid(handle)) { err = jbd2_journal_get_write_access(handle, bh); diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index e4115d338f10..d5efe076d3d3 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -1010,6 +1010,11 @@ static int ext4_ext_insert_index(handle_t *handle, struct inode *inode, ix = curp->p_idx; } + if (unlikely(ix > EXT_MAX_INDEX(curp->p_hdr))) { + EXT4_ERROR_INODE(inode, "ix > EXT_MAX_INDEX!"); + return -EFSCORRUPTED; + } + len = EXT_LAST_INDEX(curp->p_hdr) - ix + 1; BUG_ON(len < 0); if (len > 0) { @@ -1019,11 +1024,6 @@ static int ext4_ext_insert_index(handle_t *handle, struct inode *inode, memmove(ix + 1, ix, len * sizeof(struct ext4_extent_idx)); } - if (unlikely(ix > EXT_MAX_INDEX(curp->p_hdr))) { - EXT4_ERROR_INODE(inode, "ix > EXT_MAX_INDEX!"); - return -EFSCORRUPTED; - } - ix->ei_block = cpu_to_le32(logical); ext4_idx_store_pblock(ix, ptr); le16_add_cpu(&curp->p_hdr->eh_entries, 1); @@ -4476,12 +4476,13 @@ retry: map.m_lblk += ret; map.m_len = len = len - ret; epos = (loff_t)map.m_lblk << inode->i_blkbits; - inode->i_ctime = current_time(inode); + inode_set_ctime_current(inode); if (new_size) { if (epos > new_size) epos = new_size; if (ext4_update_inode_size(inode, epos) & 0x1) - inode->i_mtime = inode->i_ctime; + inode_set_mtime_to_ts(inode, + inode_get_ctime(inode)); } ret2 = ext4_mark_inode_dirty(handle, inode); ext4_update_inode_fsync_trans(handle, inode, 1); @@ -4617,7 +4618,7 @@ static long ext4_zero_range(struct file *file, loff_t offset, /* Now release the pages and zero block aligned part of pages */ truncate_pagecache_range(inode, start, end - 1); - inode->i_mtime = inode->i_ctime = current_time(inode); + inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); ret = ext4_alloc_file_blocks(file, lblk, max_blocks, new_size, flags); @@ -4642,7 +4643,7 @@ static long ext4_zero_range(struct file *file, loff_t offset, goto out_mutex; } - inode->i_mtime = inode->i_ctime = current_time(inode); + inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); if (new_size) ext4_update_inode_size(inode, new_size); ret = ext4_mark_inode_dirty(handle, inode); @@ -5378,7 +5379,7 @@ static int ext4_collapse_range(struct file *file, loff_t offset, loff_t len) up_write(&EXT4_I(inode)->i_data_sem); if (IS_SYNC(inode)) ext4_handle_sync(handle); - inode->i_mtime = inode->i_ctime = current_time(inode); + inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); ret = ext4_mark_inode_dirty(handle, inode); ext4_update_inode_fsync_trans(handle, inode, 1); @@ -5488,7 +5489,7 @@ static int ext4_insert_range(struct file *file, loff_t offset, loff_t len) /* Expand file to avoid data loss if there is error while shifting */ inode->i_size += len; EXT4_I(inode)->i_disksize += len; - inode->i_mtime = inode->i_ctime = current_time(inode); + inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); ret = ext4_mark_inode_dirty(handle, inode); if (ret) goto out_stop; @@ -6080,13 +6081,13 @@ int ext4_ext_clear_bb(struct inode *inode) for (j = 0; j < path->p_depth; j++) { ext4_mb_mark_bb(inode->i_sb, - path[j].p_block, 1, 0); + path[j].p_block, 1, false); ext4_fc_record_regions(inode->i_sb, inode->i_ino, 0, path[j].p_block, 1, 1); } ext4_free_ext_path(path); } - ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, 0); + ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, false); ext4_fc_record_regions(inode->i_sb, inode->i_ino, map.m_lblk, map.m_pblk, map.m_len, 1); } diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c index 9b5b8951afb4..4a00e2f019d9 100644 --- a/fs/ext4/extents_status.c +++ b/fs/ext4/extents_status.c @@ -152,8 +152,9 @@ static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk, static int es_reclaim_extents(struct ext4_inode_info *ei, int *nr_to_scan); static int __es_shrink(struct ext4_sb_info *sbi, int nr_to_scan, struct ext4_inode_info *locked_ei); -static void __revise_pending(struct inode *inode, ext4_lblk_t lblk, - ext4_lblk_t len); +static int __revise_pending(struct inode *inode, ext4_lblk_t lblk, + ext4_lblk_t len, + struct pending_reservation **prealloc); int __init ext4_init_es(void) { @@ -448,6 +449,19 @@ static void ext4_es_list_del(struct inode *inode) spin_unlock(&sbi->s_es_lock); } +static inline struct pending_reservation *__alloc_pending(bool nofail) +{ + if (!nofail) + return kmem_cache_alloc(ext4_pending_cachep, GFP_ATOMIC); + + return kmem_cache_zalloc(ext4_pending_cachep, GFP_KERNEL | __GFP_NOFAIL); +} + +static inline void __free_pending(struct pending_reservation *pr) +{ + kmem_cache_free(ext4_pending_cachep, pr); +} + /* * Returns true if we cannot fail to allocate memory for this extent_status * entry and cannot reclaim it until its status changes. @@ -836,11 +850,12 @@ void ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk, { struct extent_status newes; ext4_lblk_t end = lblk + len - 1; - int err1 = 0; - int err2 = 0; + int err1 = 0, err2 = 0, err3 = 0; struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); struct extent_status *es1 = NULL; struct extent_status *es2 = NULL; + struct pending_reservation *pr = NULL; + bool revise_pending = false; if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY) return; @@ -868,36 +883,53 @@ void ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk, ext4_es_insert_extent_check(inode, &newes); + revise_pending = sbi->s_cluster_ratio > 1 && + test_opt(inode->i_sb, DELALLOC) && + (status & (EXTENT_STATUS_WRITTEN | + EXTENT_STATUS_UNWRITTEN)); retry: if (err1 && !es1) es1 = __es_alloc_extent(true); if ((err1 || err2) && !es2) es2 = __es_alloc_extent(true); + if ((err1 || err2 || err3) && revise_pending && !pr) + pr = __alloc_pending(true); write_lock(&EXT4_I(inode)->i_es_lock); err1 = __es_remove_extent(inode, lblk, end, NULL, es1); if (err1 != 0) goto error; + /* Free preallocated extent if it didn't get used. */ + if (es1) { + if (!es1->es_len) + __es_free_extent(es1); + es1 = NULL; + } err2 = __es_insert_extent(inode, &newes, es2); if (err2 == -ENOMEM && !ext4_es_must_keep(&newes)) err2 = 0; if (err2 != 0) goto error; + /* Free preallocated extent if it didn't get used. */ + if (es2) { + if (!es2->es_len) + __es_free_extent(es2); + es2 = NULL; + } - if (sbi->s_cluster_ratio > 1 && test_opt(inode->i_sb, DELALLOC) && - (status & EXTENT_STATUS_WRITTEN || - status & EXTENT_STATUS_UNWRITTEN)) - __revise_pending(inode, lblk, len); - - /* es is pre-allocated but not used, free it. */ - if (es1 && !es1->es_len) - __es_free_extent(es1); - if (es2 && !es2->es_len) - __es_free_extent(es2); + if (revise_pending) { + err3 = __revise_pending(inode, lblk, len, &pr); + if (err3 != 0) + goto error; + if (pr) { + __free_pending(pr); + pr = NULL; + } + } error: write_unlock(&EXT4_I(inode)->i_es_lock); - if (err1 || err2) + if (err1 || err2 || err3) goto retry; ext4_es_print_tree(inode); @@ -1305,7 +1337,7 @@ static unsigned int get_rsvd(struct inode *inode, ext4_lblk_t end, rc->ndelonly--; node = rb_next(&pr->rb_node); rb_erase(&pr->rb_node, &tree->root); - kmem_cache_free(ext4_pending_cachep, pr); + __free_pending(pr); if (!node) break; pr = rb_entry(node, struct pending_reservation, @@ -1399,8 +1431,8 @@ static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk, } } if (count_reserved) - count_rsvd(inode, lblk, orig_es.es_len - len1 - len2, - &orig_es, &rc); + count_rsvd(inode, orig_es.es_lblk + len1, + orig_es.es_len - len1 - len2, &orig_es, &rc); goto out_get_reserved; } @@ -1491,8 +1523,12 @@ retry: */ write_lock(&EXT4_I(inode)->i_es_lock); err = __es_remove_extent(inode, lblk, end, &reserved, es); - if (es && !es->es_len) - __es_free_extent(es); + /* Free preallocated extent if it didn't get used. */ + if (es) { + if (!es->es_len) + __es_free_extent(es); + es = NULL; + } write_unlock(&EXT4_I(inode)->i_es_lock); if (err) goto retry; @@ -1596,7 +1632,7 @@ static unsigned long ext4_es_count(struct shrinker *shrink, unsigned long nr; struct ext4_sb_info *sbi; - sbi = container_of(shrink, struct ext4_sb_info, s_es_shrinker); + sbi = shrink->private_data; nr = percpu_counter_read_positive(&sbi->s_es_stats.es_stats_shk_cnt); trace_ext4_es_shrink_count(sbi->s_sb, sc->nr_to_scan, nr); return nr; @@ -1605,8 +1641,7 @@ static unsigned long ext4_es_count(struct shrinker *shrink, static unsigned long ext4_es_scan(struct shrinker *shrink, struct shrink_control *sc) { - struct ext4_sb_info *sbi = container_of(shrink, - struct ext4_sb_info, s_es_shrinker); + struct ext4_sb_info *sbi = shrink->private_data; int nr_to_scan = sc->nr_to_scan; int ret, nr_shrunk; @@ -1690,13 +1725,17 @@ int ext4_es_register_shrinker(struct ext4_sb_info *sbi) if (err) goto err3; - sbi->s_es_shrinker.scan_objects = ext4_es_scan; - sbi->s_es_shrinker.count_objects = ext4_es_count; - sbi->s_es_shrinker.seeks = DEFAULT_SEEKS; - err = register_shrinker(&sbi->s_es_shrinker, "ext4-es:%s", - sbi->s_sb->s_id); - if (err) + sbi->s_es_shrinker = shrinker_alloc(0, "ext4-es:%s", sbi->s_sb->s_id); + if (!sbi->s_es_shrinker) { + err = -ENOMEM; goto err4; + } + + sbi->s_es_shrinker->scan_objects = ext4_es_scan; + sbi->s_es_shrinker->count_objects = ext4_es_count; + sbi->s_es_shrinker->private_data = sbi; + + shrinker_register(sbi->s_es_shrinker); return 0; err4: @@ -1716,7 +1755,7 @@ void ext4_es_unregister_shrinker(struct ext4_sb_info *sbi) percpu_counter_destroy(&sbi->s_es_stats.es_stats_cache_misses); percpu_counter_destroy(&sbi->s_es_stats.es_stats_all_cnt); percpu_counter_destroy(&sbi->s_es_stats.es_stats_shk_cnt); - unregister_shrinker(&sbi->s_es_shrinker); + shrinker_free(sbi->s_es_shrinker); } /* @@ -1897,11 +1936,13 @@ static struct pending_reservation *__get_pending(struct inode *inode, * * @inode - file containing the cluster * @lblk - logical block in the cluster to be added + * @prealloc - preallocated pending entry * * Returns 0 on successful insertion and -ENOMEM on failure. If the * pending reservation is already in the set, returns successfully. */ -static int __insert_pending(struct inode *inode, ext4_lblk_t lblk) +static int __insert_pending(struct inode *inode, ext4_lblk_t lblk, + struct pending_reservation **prealloc) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); struct ext4_pending_tree *tree = &EXT4_I(inode)->i_pending_tree; @@ -1927,10 +1968,15 @@ static int __insert_pending(struct inode *inode, ext4_lblk_t lblk) } } - pr = kmem_cache_alloc(ext4_pending_cachep, GFP_ATOMIC); - if (pr == NULL) { - ret = -ENOMEM; - goto out; + if (likely(*prealloc == NULL)) { + pr = __alloc_pending(false); + if (!pr) { + ret = -ENOMEM; + goto out; + } + } else { + pr = *prealloc; + *prealloc = NULL; } pr->lclu = lclu; @@ -1960,7 +2006,7 @@ static void __remove_pending(struct inode *inode, ext4_lblk_t lblk) if (pr != NULL) { tree = &EXT4_I(inode)->i_pending_tree; rb_erase(&pr->rb_node, &tree->root); - kmem_cache_free(ext4_pending_cachep, pr); + __free_pending(pr); } } @@ -2019,10 +2065,10 @@ void ext4_es_insert_delayed_block(struct inode *inode, ext4_lblk_t lblk, bool allocated) { struct extent_status newes; - int err1 = 0; - int err2 = 0; + int err1 = 0, err2 = 0, err3 = 0; struct extent_status *es1 = NULL; struct extent_status *es2 = NULL; + struct pending_reservation *pr = NULL; if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY) return; @@ -2042,27 +2088,42 @@ retry: es1 = __es_alloc_extent(true); if ((err1 || err2) && !es2) es2 = __es_alloc_extent(true); + if ((err1 || err2 || err3) && allocated && !pr) + pr = __alloc_pending(true); write_lock(&EXT4_I(inode)->i_es_lock); err1 = __es_remove_extent(inode, lblk, lblk, NULL, es1); if (err1 != 0) goto error; + /* Free preallocated extent if it didn't get used. */ + if (es1) { + if (!es1->es_len) + __es_free_extent(es1); + es1 = NULL; + } err2 = __es_insert_extent(inode, &newes, es2); if (err2 != 0) goto error; + /* Free preallocated extent if it didn't get used. */ + if (es2) { + if (!es2->es_len) + __es_free_extent(es2); + es2 = NULL; + } - if (allocated) - __insert_pending(inode, lblk); - - /* es is pre-allocated but not used, free it. */ - if (es1 && !es1->es_len) - __es_free_extent(es1); - if (es2 && !es2->es_len) - __es_free_extent(es2); + if (allocated) { + err3 = __insert_pending(inode, lblk, &pr); + if (err3 != 0) + goto error; + if (pr) { + __free_pending(pr); + pr = NULL; + } + } error: write_unlock(&EXT4_I(inode)->i_es_lock); - if (err1 || err2) + if (err1 || err2 || err3) goto retry; ext4_es_print_tree(inode); @@ -2168,21 +2229,24 @@ unsigned int ext4_es_delayed_clu(struct inode *inode, ext4_lblk_t lblk, * @inode - file containing the range * @lblk - logical block defining the start of range * @len - length of range in blocks + * @prealloc - preallocated pending entry * * Used after a newly allocated extent is added to the extents status tree. * Requires that the extents in the range have either written or unwritten * status. Must be called while holding i_es_lock. */ -static void __revise_pending(struct inode *inode, ext4_lblk_t lblk, - ext4_lblk_t len) +static int __revise_pending(struct inode *inode, ext4_lblk_t lblk, + ext4_lblk_t len, + struct pending_reservation **prealloc) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); ext4_lblk_t end = lblk + len - 1; ext4_lblk_t first, last; bool f_del = false, l_del = false; + int ret = 0; if (len == 0) - return; + return 0; /* * Two cases - block range within single cluster and block range @@ -2203,7 +2267,9 @@ static void __revise_pending(struct inode *inode, ext4_lblk_t lblk, f_del = __es_scan_range(inode, &ext4_es_is_delonly, first, lblk - 1); if (f_del) { - __insert_pending(inode, first); + ret = __insert_pending(inode, first, prealloc); + if (ret < 0) + goto out; } else { last = EXT4_LBLK_CMASK(sbi, end) + sbi->s_cluster_ratio - 1; @@ -2211,9 +2277,11 @@ static void __revise_pending(struct inode *inode, ext4_lblk_t lblk, l_del = __es_scan_range(inode, &ext4_es_is_delonly, end + 1, last); - if (l_del) - __insert_pending(inode, last); - else + if (l_del) { + ret = __insert_pending(inode, last, prealloc); + if (ret < 0) + goto out; + } else __remove_pending(inode, last); } } else { @@ -2221,18 +2289,24 @@ static void __revise_pending(struct inode *inode, ext4_lblk_t lblk, if (first != lblk) f_del = __es_scan_range(inode, &ext4_es_is_delonly, first, lblk - 1); - if (f_del) - __insert_pending(inode, first); - else + if (f_del) { + ret = __insert_pending(inode, first, prealloc); + if (ret < 0) + goto out; + } else __remove_pending(inode, first); last = EXT4_LBLK_CMASK(sbi, end) + sbi->s_cluster_ratio - 1; if (last != end) l_del = __es_scan_range(inode, &ext4_es_is_delonly, end + 1, last); - if (l_del) - __insert_pending(inode, last); - else + if (l_del) { + ret = __insert_pending(inode, last, prealloc); + if (ret < 0) + goto out; + } else __remove_pending(inode, last); } +out: + return ret; } diff --git a/fs/ext4/fast_commit.c b/fs/ext4/fast_commit.c index b06de728b3b6..87c009e0c59a 100644 --- a/fs/ext4/fast_commit.c +++ b/fs/ext4/fast_commit.c @@ -1806,7 +1806,7 @@ static int ext4_fc_replay_add_range(struct super_block *sb, * at the end of the FC replay using our array of * modified inodes. */ - ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, 0); + ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, false); goto next; } @@ -1875,7 +1875,7 @@ ext4_fc_replay_del_range(struct super_block *sb, if (ret > 0) { remaining -= ret; cur += ret; - ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, 0); + ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, false); } else { remaining -= map.m_len; cur += map.m_len; @@ -1934,12 +1934,12 @@ static void ext4_fc_set_bitmaps_and_counters(struct super_block *sb) if (!IS_ERR(path)) { for (j = 0; j < path->p_depth; j++) ext4_mb_mark_bb(inode->i_sb, - path[j].p_block, 1, 1); + path[j].p_block, 1, true); ext4_free_ext_path(path); } cur += ret; ext4_mb_mark_bb(inode->i_sb, map.m_pblk, - map.m_len, 1); + map.m_len, true); } else { cur = cur + (map.m_len ? map.m_len : 1); } diff --git a/fs/ext4/file.c b/fs/ext4/file.c index c457c8517f0f..6aa15dafc677 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -131,7 +131,7 @@ static ssize_t ext4_file_read_iter(struct kiocb *iocb, struct iov_iter *to) { struct inode *inode = file_inode(iocb->ki_filp); - if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb)))) + if (unlikely(ext4_forced_shutdown(inode->i_sb))) return -EIO; if (!iov_iter_count(to)) @@ -153,7 +153,7 @@ static ssize_t ext4_file_splice_read(struct file *in, loff_t *ppos, { struct inode *inode = file_inode(in); - if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb)))) + if (unlikely(ext4_forced_shutdown(inode->i_sb))) return -EIO; return filemap_splice_read(in, ppos, pipe, len, flags); } @@ -306,80 +306,38 @@ out: } static ssize_t ext4_handle_inode_extension(struct inode *inode, loff_t offset, - ssize_t written, size_t count) + ssize_t count) { handle_t *handle; - bool truncate = false; - u8 blkbits = inode->i_blkbits; - ext4_lblk_t written_blk, end_blk; - int ret; - - /* - * Note that EXT4_I(inode)->i_disksize can get extended up to - * inode->i_size while the I/O was running due to writeback of delalloc - * blocks. But, the code in ext4_iomap_alloc() is careful to use - * zeroed/unwritten extents if this is possible; thus we won't leave - * uninitialized blocks in a file even if we didn't succeed in writing - * as much as we intended. - */ - WARN_ON_ONCE(i_size_read(inode) < EXT4_I(inode)->i_disksize); - if (offset + count <= EXT4_I(inode)->i_disksize) { - /* - * We need to ensure that the inode is removed from the orphan - * list if it has been added prematurely, due to writeback of - * delalloc blocks. - */ - if (!list_empty(&EXT4_I(inode)->i_orphan) && inode->i_nlink) { - handle = ext4_journal_start(inode, EXT4_HT_INODE, 2); - - if (IS_ERR(handle)) { - ext4_orphan_del(NULL, inode); - return PTR_ERR(handle); - } - - ext4_orphan_del(handle, inode); - ext4_journal_stop(handle); - } - - return written; - } - - if (written < 0) - goto truncate; + lockdep_assert_held_write(&inode->i_rwsem); handle = ext4_journal_start(inode, EXT4_HT_INODE, 2); - if (IS_ERR(handle)) { - written = PTR_ERR(handle); - goto truncate; - } + if (IS_ERR(handle)) + return PTR_ERR(handle); - if (ext4_update_inode_size(inode, offset + written)) { - ret = ext4_mark_inode_dirty(handle, inode); + if (ext4_update_inode_size(inode, offset + count)) { + int ret = ext4_mark_inode_dirty(handle, inode); if (unlikely(ret)) { - written = ret; ext4_journal_stop(handle); - goto truncate; + return ret; } } - /* - * We may need to truncate allocated but not written blocks beyond EOF. - */ - written_blk = ALIGN(offset + written, 1 << blkbits); - end_blk = ALIGN(offset + count, 1 << blkbits); - if (written_blk < end_blk && ext4_can_truncate(inode)) - truncate = true; - - /* - * Remove the inode from the orphan list if it has been extended and - * everything went OK. - */ - if (!truncate && inode->i_nlink) + if (inode->i_nlink) ext4_orphan_del(handle, inode); ext4_journal_stop(handle); - if (truncate) { -truncate: + return count; +} + +/* + * Clean up the inode after DIO or DAX extending write has completed and the + * inode size has been updated using ext4_handle_inode_extension(). + */ +static void ext4_inode_extension_cleanup(struct inode *inode, ssize_t count) +{ + lockdep_assert_held_write(&inode->i_rwsem); + if (count < 0) { ext4_truncate_failed_write(inode); /* * If the truncate operation failed early, then the inode may @@ -388,9 +346,29 @@ truncate: */ if (inode->i_nlink) ext4_orphan_del(NULL, inode); + return; } + /* + * If i_disksize got extended either due to writeback of delalloc + * blocks or extending truncate while the DIO was running we could fail + * to cleanup the orphan list in ext4_handle_inode_extension(). Do it + * now. + */ + if (!list_empty(&EXT4_I(inode)->i_orphan) && inode->i_nlink) { + handle_t *handle = ext4_journal_start(inode, EXT4_HT_INODE, 2); - return written; + if (IS_ERR(handle)) { + /* + * The write has successfully completed. Not much to + * do with the error here so just cleanup the orphan + * list and hope for the best. + */ + ext4_orphan_del(NULL, inode); + return; + } + ext4_orphan_del(handle, inode); + ext4_journal_stop(handle); + } } static int ext4_dio_write_end_io(struct kiocb *iocb, ssize_t size, @@ -399,31 +377,23 @@ static int ext4_dio_write_end_io(struct kiocb *iocb, ssize_t size, loff_t pos = iocb->ki_pos; struct inode *inode = file_inode(iocb->ki_filp); + if (!error && size && flags & IOMAP_DIO_UNWRITTEN) + error = ext4_convert_unwritten_extents(NULL, inode, pos, size); if (error) return error; - - if (size && flags & IOMAP_DIO_UNWRITTEN) { - error = ext4_convert_unwritten_extents(NULL, inode, pos, size); - if (error < 0) - return error; - } /* - * If we are extending the file, we have to update i_size here before - * page cache gets invalidated in iomap_dio_rw(). Otherwise racing - * buffered reads could zero out too much from page cache pages. Update - * of on-disk size will happen later in ext4_dio_write_iter() where - * we have enough information to also perform orphan list handling etc. - * Note that we perform all extending writes synchronously under - * i_rwsem held exclusively so i_size update is safe here in that case. - * If the write was not extending, we cannot see pos > i_size here - * because operations reducing i_size like truncate wait for all - * outstanding DIO before updating i_size. + * Note that EXT4_I(inode)->i_disksize can get extended up to + * inode->i_size while the I/O was running due to writeback of delalloc + * blocks. But the code in ext4_iomap_alloc() is careful to use + * zeroed/unwritten extents if this is possible; thus we won't leave + * uninitialized blocks in a file even if we didn't succeed in writing + * as much as we intended. Also we can race with truncate or write + * expanding the file so we have to be a bit careful here. */ - pos += size; - if (pos > i_size_read(inode)) - i_size_write(inode, pos); - - return 0; + if (pos + size <= READ_ONCE(EXT4_I(inode)->i_disksize) && + pos + size <= i_size_read(inode)) + return size; + return ext4_handle_inode_extension(inode, pos, size); } static const struct iomap_dio_ops ext4_dio_write_ops = { @@ -476,6 +446,11 @@ restart: * required to change security info in file_modified(), for extending * I/O, any form of non-overwrite I/O, and unaligned I/O to unwritten * extents (as partial block zeroing may be required). + * + * Note that unaligned writes are allowed under shared lock so long as + * they are pure overwrites. Otherwise, concurrent unaligned writes risk + * data corruption due to partial block zeroing in the dio layer, and so + * the I/O must occur exclusively. */ if (*ilock_shared && ((!IS_NOSEC(inode) || *extend || !overwrite || @@ -492,21 +467,12 @@ restart: /* * Now that locking is settled, determine dio flags and exclusivity - * requirements. Unaligned writes are allowed under shared lock so long - * as they are pure overwrites. Set the iomap overwrite only flag as an - * added precaution in this case. Even though this is unnecessary, we - * can detect and warn on unexpected -EAGAIN if an unsafe unaligned - * write is ever submitted. - * - * Otherwise, concurrent unaligned writes risk data corruption due to - * partial block zeroing in the dio layer, and so the I/O must occur - * exclusively. The inode lock is already held exclusive if the write is - * non-overwrite or extending, so drain all outstanding dio and set the - * force wait dio flag. + * requirements. We don't use DIO_OVERWRITE_ONLY because we enforce + * behavior already. The inode lock is already held exclusive if the + * write is non-overwrite or extending, so drain all outstanding dio and + * set the force wait dio flag. */ - if (*ilock_shared && unaligned_io) { - *dio_flags = IOMAP_DIO_OVERWRITE_ONLY; - } else if (!*ilock_shared && (unaligned_io || *extend)) { + if (!*ilock_shared && (unaligned_io || *extend)) { if (iocb->ki_flags & IOCB_NOWAIT) { ret = -EAGAIN; goto out; @@ -573,18 +539,20 @@ static ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct iov_iter *from) return ext4_buffered_write_iter(iocb, from); } + /* + * Prevent inline data from being created since we are going to allocate + * blocks for DIO. We know the inode does not currently have inline data + * because ext4_should_use_dio() checked for it, but we have to clear + * the state flag before the write checks because a lock cycle could + * introduce races with other writers. + */ + ext4_clear_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA); + ret = ext4_dio_write_checks(iocb, from, &ilock_shared, &extend, &unwritten, &dio_flags); if (ret <= 0) return ret; - /* - * Make sure inline data cannot be created anymore since we are going - * to allocate blocks for DIO. We know the inode does not have any - * inline data now because ext4_dio_supported() checked for that. - */ - ext4_clear_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA); - offset = iocb->ki_pos; count = ret; @@ -608,12 +576,18 @@ static ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct iov_iter *from) iomap_ops = &ext4_iomap_overwrite_ops; ret = iomap_dio_rw(iocb, from, iomap_ops, &ext4_dio_write_ops, dio_flags, NULL, 0); - WARN_ON_ONCE(ret == -EAGAIN && !(iocb->ki_flags & IOCB_NOWAIT)); if (ret == -ENOTBLK) ret = 0; - - if (extend) - ret = ext4_handle_inode_extension(inode, offset, ret, count); + if (extend) { + /* + * We always perform extending DIO write synchronously so by + * now the IO is completed and ext4_handle_inode_extension() + * was called. Cleanup the inode in case of error or race with + * writeback of delalloc blocks. + */ + WARN_ON_ONCE(ret == -EIOCBQUEUED); + ext4_inode_extension_cleanup(inode, ret); + } out: if (ilock_shared) @@ -694,8 +668,10 @@ ext4_dax_write_iter(struct kiocb *iocb, struct iov_iter *from) ret = dax_iomap_rw(iocb, from, &ext4_iomap_ops); - if (extend) - ret = ext4_handle_inode_extension(inode, offset, ret, count); + if (extend) { + ret = ext4_handle_inode_extension(inode, offset, ret); + ext4_inode_extension_cleanup(inode, ret); + } out: inode_unlock(inode); if (ret > 0) @@ -709,7 +685,7 @@ ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from) { struct inode *inode = file_inode(iocb->ki_filp); - if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb)))) + if (unlikely(ext4_forced_shutdown(inode->i_sb))) return -EIO; #ifdef CONFIG_FS_DAX @@ -723,8 +699,7 @@ ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from) } #ifdef CONFIG_FS_DAX -static vm_fault_t ext4_dax_huge_fault(struct vm_fault *vmf, - enum page_entry_size pe_size) +static vm_fault_t ext4_dax_huge_fault(struct vm_fault *vmf, unsigned int order) { int error = 0; vm_fault_t result; @@ -740,7 +715,7 @@ static vm_fault_t ext4_dax_huge_fault(struct vm_fault *vmf, * read-only. * * We check for VM_SHARED rather than vmf->cow_page since the latter is - * unset for pe_size != PE_SIZE_PTE (i.e. only in do_cow_fault); for + * unset for order != 0 (i.e. only in do_cow_fault); for * other sizes, dax_iomap_fault will handle splitting / fallback so that * we eventually come back with a COW page. */ @@ -764,7 +739,7 @@ retry: } else { filemap_invalidate_lock_shared(mapping); } - result = dax_iomap_fault(vmf, pe_size, &pfn, &error, &ext4_iomap_ops); + result = dax_iomap_fault(vmf, order, &pfn, &error, &ext4_iomap_ops); if (write) { ext4_journal_stop(handle); @@ -773,7 +748,7 @@ retry: goto retry; /* Handling synchronous page fault? */ if (result & VM_FAULT_NEEDDSYNC) - result = dax_finish_sync_fault(vmf, pe_size, pfn); + result = dax_finish_sync_fault(vmf, order, pfn); filemap_invalidate_unlock_shared(mapping); sb_end_pagefault(sb); } else { @@ -785,7 +760,7 @@ retry: static vm_fault_t ext4_dax_fault(struct vm_fault *vmf) { - return ext4_dax_huge_fault(vmf, PE_SIZE_PTE); + return ext4_dax_huge_fault(vmf, 0); } static const struct vm_operations_struct ext4_dax_vm_ops = { @@ -807,10 +782,9 @@ static const struct vm_operations_struct ext4_file_vm_ops = { static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma) { struct inode *inode = file->f_mapping->host; - struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); - struct dax_device *dax_dev = sbi->s_daxdev; + struct dax_device *dax_dev = EXT4_SB(inode->i_sb)->s_daxdev; - if (unlikely(ext4_forced_shutdown(sbi))) + if (unlikely(ext4_forced_shutdown(inode->i_sb))) return -EIO; /* @@ -886,7 +860,7 @@ static int ext4_file_open(struct inode *inode, struct file *filp) { int ret; - if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb)))) + if (unlikely(ext4_forced_shutdown(inode->i_sb))) return -EIO; ret = ext4_sample_last_mounted(inode->i_sb, filp->f_path.mnt); diff --git a/fs/ext4/fsmap.c b/fs/ext4/fsmap.c index cdf9bfe10137..11e6f33677a2 100644 --- a/fs/ext4/fsmap.c +++ b/fs/ext4/fsmap.c @@ -576,8 +576,9 @@ static bool ext4_getfsmap_is_valid_device(struct super_block *sb, if (fm->fmr_device == 0 || fm->fmr_device == UINT_MAX || fm->fmr_device == new_encode_dev(sb->s_bdev->bd_dev)) return true; - if (EXT4_SB(sb)->s_journal_bdev && - fm->fmr_device == new_encode_dev(EXT4_SB(sb)->s_journal_bdev->bd_dev)) + if (EXT4_SB(sb)->s_journal_bdev_handle && + fm->fmr_device == + new_encode_dev(EXT4_SB(sb)->s_journal_bdev_handle->bdev->bd_dev)) return true; return false; } @@ -647,9 +648,9 @@ int ext4_getfsmap(struct super_block *sb, struct ext4_fsmap_head *head, memset(handlers, 0, sizeof(handlers)); handlers[0].gfd_dev = new_encode_dev(sb->s_bdev->bd_dev); handlers[0].gfd_fn = ext4_getfsmap_datadev; - if (EXT4_SB(sb)->s_journal_bdev) { + if (EXT4_SB(sb)->s_journal_bdev_handle) { handlers[1].gfd_dev = new_encode_dev( - EXT4_SB(sb)->s_journal_bdev->bd_dev); + EXT4_SB(sb)->s_journal_bdev_handle->bdev->bd_dev); handlers[1].gfd_fn = ext4_getfsmap_logdev; } diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c index 0c56f3a011a1..b40d3b29f7e5 100644 --- a/fs/ext4/fsync.c +++ b/fs/ext4/fsync.c @@ -131,9 +131,8 @@ int ext4_sync_file(struct file *file, loff_t start, loff_t end, int datasync) int ret = 0, err; bool needs_barrier = false; struct inode *inode = file->f_mapping->host; - struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); - if (unlikely(ext4_forced_shutdown(sbi))) + if (unlikely(ext4_forced_shutdown(inode->i_sb))) return -EIO; ASSERT(ext4_journal_current_handle() == NULL); @@ -141,14 +140,14 @@ int ext4_sync_file(struct file *file, loff_t start, loff_t end, int datasync) trace_ext4_sync_file_enter(file, datasync); if (sb_rdonly(inode->i_sb)) { - /* Make sure that we read updated s_mount_flags value */ + /* Make sure that we read updated s_ext4_flags value */ smp_rmb(); - if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FS_ABORTED)) + if (ext4_forced_shutdown(inode->i_sb)) ret = -EROFS; goto out; } - if (!sbi->s_journal) { + if (!EXT4_SB(inode->i_sb)->s_journal) { ret = ext4_fsync_nojournal(file, start, end, datasync, &needs_barrier); if (needs_barrier) diff --git a/fs/ext4/hash.c b/fs/ext4/hash.c index 46c3423ddfa1..deabe29da7fb 100644 --- a/fs/ext4/hash.c +++ b/fs/ext4/hash.c @@ -300,7 +300,7 @@ int ext4fs_dirhash(const struct inode *dir, const char *name, int len, unsigned char *buff; struct qstr qstr = {.name = name, .len = len }; - if (len && IS_CASEFOLDED(dir) && um && + if (len && IS_CASEFOLDED(dir) && (!IS_ENCRYPTED(dir) || fscrypt_has_encryption_key(dir))) { buff = kzalloc(sizeof(char) * PATH_MAX, GFP_KERNEL); if (!buff) diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 754f961cd9fd..e9bbb1da2d0a 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -950,7 +950,7 @@ struct inode *__ext4_new_inode(struct mnt_idmap *idmap, sb = dir->i_sb; sbi = EXT4_SB(sb); - if (unlikely(ext4_forced_shutdown(sbi))) + if (unlikely(ext4_forced_shutdown(sb))) return ERR_PTR(-EIO); ngroups = ext4_get_groups_count(sb); @@ -1250,8 +1250,8 @@ got: inode->i_ino = ino + group * EXT4_INODES_PER_GROUP(sb); /* This is the optimal IO size (for stat), not the fs block size */ inode->i_blocks = 0; - inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode); - ei->i_crtime = inode->i_mtime; + simple_inode_init_ts(inode); + ei->i_crtime = inode_get_mtime(inode); memset(ei->i_data, 0, sizeof(ei->i_data)); ei->i_dir_start_lookup = 0; @@ -1523,12 +1523,6 @@ int ext4_init_inode_table(struct super_block *sb, ext4_group_t group, int num, ret = 0, used_blks = 0; unsigned long used_inos = 0; - /* This should not happen, but just to be sure check this */ - if (sb_rdonly(sb)) { - ret = 1; - goto out; - } - gdp = ext4_get_group_desc(sb, group, &group_desc_bh); if (!gdp || !grp) goto out; diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c index a4b7e4bc32d4..9a84a5f9fef4 100644 --- a/fs/ext4/inline.c +++ b/fs/ext4/inline.c @@ -228,7 +228,7 @@ static void ext4_write_inline_data(struct inode *inode, struct ext4_iloc *iloc, struct ext4_inode *raw_inode; int cp_len = 0; - if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb)))) + if (unlikely(ext4_forced_shutdown(inode->i_sb))) return; BUG_ON(!EXT4_I(inode)->i_inline_off); @@ -1037,7 +1037,7 @@ static int ext4_add_dirent_to_inline(handle_t *handle, * happen is that the times are slightly out of date * and/or different from the directory change time. */ - dir->i_mtime = dir->i_ctime = current_time(dir); + inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir)); ext4_update_dx_flag(dir); inode_inc_iversion(dir); return 1; @@ -1991,7 +1991,7 @@ out: ext4_orphan_del(handle, inode); if (err == 0) { - inode->i_mtime = inode->i_ctime = current_time(inode); + inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); err = ext4_mark_inode_dirty(handle, inode); if (IS_SYNC(inode)) ext4_handle_sync(handle); diff --git a/fs/ext4/inode-test.c b/fs/ext4/inode-test.c index 7935ea6cf92c..f0c0fd507fbc 100644 --- a/fs/ext4/inode-test.c +++ b/fs/ext4/inode-test.c @@ -245,9 +245,9 @@ static void inode_test_xtimestamp_decoding(struct kunit *test) struct timestamp_expectation *test_param = (struct timestamp_expectation *)(test->param_value); - timestamp.tv_sec = get_32bit_time(test_param); - ext4_decode_extra_time(×tamp, - cpu_to_le32(test_param->extra_bits)); + timestamp = ext4_decode_extra_time( + cpu_to_le32(get_32bit_time(test_param)), + cpu_to_le32(test_param->extra_bits)); KUNIT_EXPECT_EQ_MSG(test, test_param->expected.tv_sec, diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 43775a6ca505..61277f7f8722 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -789,10 +789,22 @@ int ext4_get_block(struct inode *inode, sector_t iblock, int ext4_get_block_unwritten(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create) { + int ret = 0; + ext4_debug("ext4_get_block_unwritten: inode %lu, create flag %d\n", inode->i_ino, create); - return _ext4_get_block(inode, iblock, bh_result, + ret = _ext4_get_block(inode, iblock, bh_result, EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT); + + /* + * If the buffer is marked unwritten, mark it as new to make sure it is + * zeroed out correctly in case of partial writes. Otherwise, there is + * a chance of stale data getting exposed. + */ + if (ret == 0 && buffer_unwritten(bh_result)) + set_buffer_new(bh_result); + + return ret; } /* Maximum number of blocks we map for direct IO at once. */ @@ -1020,10 +1032,8 @@ static int ext4_block_write_begin(struct folio *folio, loff_t pos, unsigned len, BUG_ON(from > to); head = folio_buffers(folio); - if (!head) { - create_empty_buffers(&folio->page, blocksize, 0); - head = folio_buffers(folio); - } + if (!head) + head = create_empty_buffers(folio, blocksize, 0); bbits = ilog2(blocksize); block = (sector_t)folio->index << (PAGE_SHIFT - bbits); @@ -1114,7 +1124,7 @@ static int ext4_write_begin(struct file *file, struct address_space *mapping, pgoff_t index; unsigned from, to; - if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb)))) + if (unlikely(ext4_forced_shutdown(inode->i_sb))) return -EIO; trace_ext4_write_begin(inode, pos, len); @@ -1153,7 +1163,7 @@ retry_grab: * starting the handle. */ if (!folio_buffers(folio)) - create_empty_buffers(&folio->page, inode->i_sb->s_blocksize, 0); + create_empty_buffers(folio, inode->i_sb->s_blocksize, 0); folio_unlock(folio); @@ -1569,7 +1579,7 @@ static void mpage_release_unused_pages(struct mpage_da_data *mpd, if (folio->index < mpd->first_page) continue; - if (folio->index + folio_nr_pages(folio) - 1 > end) + if (folio_next_index(folio) - 1 > end) continue; BUG_ON(!folio_test_locked(folio)); BUG_ON(folio_test_writeback(folio)); @@ -2213,8 +2223,7 @@ static int mpage_map_and_submit_extent(handle_t *handle, if (err < 0) { struct super_block *sb = inode->i_sb; - if (ext4_forced_shutdown(EXT4_SB(sb)) || - ext4_test_mount_flag(sb, EXT4_MF_FS_ABORTED)) + if (ext4_forced_shutdown(sb)) goto invalidate_dirty_pages; /* * Let the uper layers retry transient errors. @@ -2455,7 +2464,7 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd) if (mpd->map.m_len == 0) mpd->first_page = folio->index; - mpd->next_page = folio->index + folio_nr_pages(folio); + mpd->next_page = folio_next_index(folio); /* * Writeout when we cannot modify metadata is simple. * Just submit the page. For data=journal mode we @@ -2534,14 +2543,13 @@ static int ext4_do_writepages(struct mpage_da_data *mpd) * If the filesystem has aborted, it is read-only, so return * right away instead of dumping stack traces later on that * will obscure the real source of the problem. We test - * EXT4_MF_FS_ABORTED instead of sb->s_flag's SB_RDONLY because + * fs shutdown state instead of sb->s_flag's SB_RDONLY because * the latter could be true if the filesystem is mounted * read-only, and in that case, ext4_writepages should * *never* be called, so if that ever happens, we would want * the stack trace. */ - if (unlikely(ext4_forced_shutdown(EXT4_SB(mapping->host->i_sb)) || - ext4_test_mount_flag(inode->i_sb, EXT4_MF_FS_ABORTED))) { + if (unlikely(ext4_forced_shutdown(mapping->host->i_sb))) { ret = -EROFS; goto out_writepages; } @@ -2759,7 +2767,7 @@ static int ext4_writepages(struct address_space *mapping, int ret; int alloc_ctx; - if (unlikely(ext4_forced_shutdown(EXT4_SB(sb)))) + if (unlikely(ext4_forced_shutdown(sb))) return -EIO; alloc_ctx = ext4_writepages_down_read(sb); @@ -2798,16 +2806,16 @@ static int ext4_dax_writepages(struct address_space *mapping, int ret; long nr_to_write = wbc->nr_to_write; struct inode *inode = mapping->host; - struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb); int alloc_ctx; - if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb)))) + if (unlikely(ext4_forced_shutdown(inode->i_sb))) return -EIO; alloc_ctx = ext4_writepages_down_read(inode->i_sb); trace_ext4_writepages(inode, wbc); - ret = dax_writeback_mapping_range(mapping, sbi->s_daxdev, wbc); + ret = dax_writeback_mapping_range(mapping, + EXT4_SB(inode->i_sb)->s_daxdev, wbc); trace_ext4_writepages_result(inode, wbc, ret, nr_to_write - wbc->nr_to_write); ext4_writepages_up_read(inode->i_sb, alloc_ctx); @@ -2857,7 +2865,7 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping, pgoff_t index; struct inode *inode = mapping->host; - if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb)))) + if (unlikely(ext4_forced_shutdown(inode->i_sb))) return -EIO; index = pos >> PAGE_SHIFT; @@ -2937,14 +2945,73 @@ static int ext4_da_should_update_i_disksize(struct folio *folio, return 1; } +static int ext4_da_do_write_end(struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *page) +{ + struct inode *inode = mapping->host; + loff_t old_size = inode->i_size; + bool disksize_changed = false; + loff_t new_i_size; + + /* + * block_write_end() will mark the inode as dirty with I_DIRTY_PAGES + * flag, which all that's needed to trigger page writeback. + */ + copied = block_write_end(NULL, mapping, pos, len, copied, page, NULL); + new_i_size = pos + copied; + + /* + * It's important to update i_size while still holding page lock, + * because page writeout could otherwise come in and zero beyond + * i_size. + * + * Since we are holding inode lock, we are sure i_disksize <= + * i_size. We also know that if i_disksize < i_size, there are + * delalloc writes pending in the range up to i_size. If the end of + * the current write is <= i_size, there's no need to touch + * i_disksize since writeback will push i_disksize up to i_size + * eventually. If the end of the current write is > i_size and + * inside an allocated block which ext4_da_should_update_i_disksize() + * checked, we need to update i_disksize here as certain + * ext4_writepages() paths not allocating blocks and update i_disksize. + */ + if (new_i_size > inode->i_size) { + unsigned long end; + + i_size_write(inode, new_i_size); + end = (new_i_size - 1) & (PAGE_SIZE - 1); + if (copied && ext4_da_should_update_i_disksize(page_folio(page), end)) { + ext4_update_i_disksize(inode, new_i_size); + disksize_changed = true; + } + } + + unlock_page(page); + put_page(page); + + if (old_size < pos) + pagecache_isize_extended(inode, old_size, pos); + + if (disksize_changed) { + handle_t *handle; + + handle = ext4_journal_start(inode, EXT4_HT_INODE, 2); + if (IS_ERR(handle)) + return PTR_ERR(handle); + ext4_mark_inode_dirty(handle, inode); + ext4_journal_stop(handle); + } + + return copied; +} + static int ext4_da_write_end(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, struct page *page, void *fsdata) { struct inode *inode = mapping->host; - loff_t new_i_size; - unsigned long start, end; int write_mode = (int)(unsigned long)fsdata; struct folio *folio = page_folio(page); @@ -2963,30 +3030,7 @@ static int ext4_da_write_end(struct file *file, if (unlikely(copied < len) && !PageUptodate(page)) copied = 0; - start = pos & (PAGE_SIZE - 1); - end = start + copied - 1; - - /* - * Since we are holding inode lock, we are sure i_disksize <= - * i_size. We also know that if i_disksize < i_size, there are - * delalloc writes pending in the range upto i_size. If the end of - * the current write is <= i_size, there's no need to touch - * i_disksize since writeback will push i_disksize upto i_size - * eventually. If the end of the current write is > i_size and - * inside an allocated block (ext4_da_should_update_i_disksize() - * check), we need to update i_disksize here as certain - * ext4_writepages() paths not allocating blocks update i_disksize. - * - * Note that we defer inode dirtying to generic_write_end() / - * ext4_da_write_inline_data_end(). - */ - new_i_size = pos + copied; - if (copied && new_i_size > inode->i_size && - ext4_da_should_update_i_disksize(folio, end)) - ext4_update_i_disksize(inode, new_i_size); - - return generic_write_end(file, mapping, pos, len, copied, &folio->page, - fsdata); + return ext4_da_do_write_end(mapping, pos, len, copied, &folio->page); } /* @@ -3609,10 +3653,8 @@ static int __ext4_block_zero_page_range(handle_t *handle, iblock = index << (PAGE_SHIFT - inode->i_sb->s_blocksize_bits); bh = folio_buffers(folio); - if (!bh) { - create_empty_buffers(&folio->page, blocksize, 0); - bh = folio_buffers(folio); - } + if (!bh) + bh = create_empty_buffers(folio, blocksize, 0); /* Find the buffer that contains "offset" */ pos = blocksize; @@ -3986,7 +4028,7 @@ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length) if (IS_SYNC(inode)) ext4_handle_sync(handle); - inode->i_mtime = inode->i_ctime = current_time(inode); + inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); ret2 = ext4_mark_inode_dirty(handle, inode); if (unlikely(ret2)) ret = ret2; @@ -4146,7 +4188,7 @@ out_stop: if (inode->i_nlink) ext4_orphan_del(handle, inode); - inode->i_mtime = inode->i_ctime = current_time(inode); + inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); err2 = ext4_mark_inode_dirty(handle, inode); if (unlikely(err2 && !err)) err = err2; @@ -4249,9 +4291,9 @@ static int ext4_fill_raw_inode(struct inode *inode, struct ext4_inode *raw_inode } raw_inode->i_links_count = cpu_to_le16(inode->i_nlink); - EXT4_INODE_SET_XTIME(i_ctime, inode, raw_inode); - EXT4_INODE_SET_XTIME(i_mtime, inode, raw_inode); - EXT4_INODE_SET_XTIME(i_atime, inode, raw_inode); + EXT4_INODE_SET_CTIME(inode, raw_inode); + EXT4_INODE_SET_MTIME(inode, raw_inode); + EXT4_INODE_SET_ATIME(inode, raw_inode); EXT4_EINODE_SET_XTIME(i_crtime, ei, raw_inode); raw_inode->i_dtime = cpu_to_le32(ei->i_dtime); @@ -4858,9 +4900,9 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino, } } - EXT4_INODE_GET_XTIME(i_ctime, inode, raw_inode); - EXT4_INODE_GET_XTIME(i_mtime, inode, raw_inode); - EXT4_INODE_GET_XTIME(i_atime, inode, raw_inode); + EXT4_INODE_GET_CTIME(inode, raw_inode); + EXT4_INODE_GET_ATIME(inode, raw_inode); + EXT4_INODE_GET_MTIME(inode, raw_inode); EXT4_EINODE_GET_XTIME(i_crtime, ei, raw_inode); if (likely(!test_opt2(inode->i_sb, HURD_COMPAT))) { @@ -4940,9 +4982,12 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino, "iget: bogus i_mode (%o)", inode->i_mode); goto bad_inode; } - if (IS_CASEFOLDED(inode) && !ext4_has_feature_casefold(inode->i_sb)) + if (IS_CASEFOLDED(inode) && !ext4_has_feature_casefold(inode->i_sb)) { ext4_error_inode(inode, function, line, 0, "casefold flag without casefold feature"); + ret = -EFSCORRUPTED; + goto bad_inode; + } if ((err_str = check_igot_inode(inode, flags)) != NULL) { ext4_error_inode(inode, function, line, 0, err_str); ret = -EFSCORRUPTED; @@ -4981,9 +5026,9 @@ static void __ext4_update_other_inode_time(struct super_block *sb, spin_unlock(&inode->i_lock); spin_lock(&ei->i_raw_lock); - EXT4_INODE_SET_XTIME(i_ctime, inode, raw_inode); - EXT4_INODE_SET_XTIME(i_mtime, inode, raw_inode); - EXT4_INODE_SET_XTIME(i_atime, inode, raw_inode); + EXT4_INODE_SET_CTIME(inode, raw_inode); + EXT4_INODE_SET_MTIME(inode, raw_inode); + EXT4_INODE_SET_ATIME(inode, raw_inode); ext4_inode_csum_set(inode, raw_inode, ei); spin_unlock(&ei->i_raw_lock); trace_ext4_other_inode_update_time(inode, orig_ino); @@ -5131,11 +5176,10 @@ int ext4_write_inode(struct inode *inode, struct writeback_control *wbc) { int err; - if (WARN_ON_ONCE(current->flags & PF_MEMALLOC) || - sb_rdonly(inode->i_sb)) + if (WARN_ON_ONCE(current->flags & PF_MEMALLOC)) return 0; - if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb)))) + if (unlikely(ext4_forced_shutdown(inode->i_sb))) return -EIO; if (EXT4_SB(inode->i_sb)->s_journal) { @@ -5255,7 +5299,7 @@ int ext4_setattr(struct mnt_idmap *idmap, struct dentry *dentry, const unsigned int ia_valid = attr->ia_valid; bool inc_ivers = true; - if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb)))) + if (unlikely(ext4_forced_shutdown(inode->i_sb))) return -EIO; if (unlikely(IS_IMMUTABLE(inode))) @@ -5376,10 +5420,9 @@ int ext4_setattr(struct mnt_idmap *idmap, struct dentry *dentry, * Update c/mtime on truncate up, ext4_truncate() will * update c/mtime in shrink case below */ - if (!shrink) { - inode->i_mtime = current_time(inode); - inode->i_ctime = inode->i_mtime; - } + if (!shrink) + inode_set_mtime_to_ts(inode, + inode_set_ctime_current(inode)); if (shrink) ext4_fc_track_range(handle, inode, @@ -5537,7 +5580,7 @@ int ext4_getattr(struct mnt_idmap *idmap, const struct path *path, STATX_ATTR_NODUMP | STATX_ATTR_VERITY); - generic_fillattr(idmap, inode, stat); + generic_fillattr(idmap, request_mask, inode, stat); return 0; } @@ -5676,7 +5719,7 @@ int ext4_mark_iloc_dirty(handle_t *handle, { int err = 0; - if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb)))) { + if (unlikely(ext4_forced_shutdown(inode->i_sb))) { put_bh(iloc->bh); return -EIO; } @@ -5702,7 +5745,7 @@ ext4_reserve_inode_write(handle_t *handle, struct inode *inode, { int err; - if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb)))) + if (unlikely(ext4_forced_shutdown(inode->i_sb))) return -EIO; err = ext4_get_inode_loc(inode, iloc); @@ -6140,7 +6183,7 @@ retry_alloc: if (err == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) goto retry_alloc; out_ret: - ret = block_page_mkwrite_return(err); + ret = vmf_fs_error(err); out: filemap_invalidate_unlock_shared(mapping); sb_end_pagefault(inode->i_sb); diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index 331859511f80..4f931f80cb34 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -312,13 +312,22 @@ static void swap_inode_data(struct inode *inode1, struct inode *inode2) struct ext4_inode_info *ei1; struct ext4_inode_info *ei2; unsigned long tmp; + struct timespec64 ts1, ts2; ei1 = EXT4_I(inode1); ei2 = EXT4_I(inode2); swap(inode1->i_version, inode2->i_version); - swap(inode1->i_atime, inode2->i_atime); - swap(inode1->i_mtime, inode2->i_mtime); + + ts1 = inode_get_atime(inode1); + ts2 = inode_get_atime(inode2); + inode_set_atime_to_ts(inode1, ts2); + inode_set_atime_to_ts(inode2, ts1); + + ts1 = inode_get_mtime(inode1); + ts2 = inode_get_mtime(inode2); + inode_set_mtime_to_ts(inode1, ts2); + inode_set_mtime_to_ts(inode2, ts1); memswap(ei1->i_data, ei2->i_data, sizeof(ei1->i_data)); tmp = ei1->i_flags & EXT4_FL_SHOULD_SWAP; @@ -449,7 +458,8 @@ static long swap_inode_boot_loader(struct super_block *sb, diff = size - size_bl; swap_inode_data(inode, inode_bl); - inode->i_ctime = inode_bl->i_ctime = current_time(inode); + inode_set_ctime_current(inode); + inode_set_ctime_current(inode_bl); inode_inc_iversion(inode); inode->i_generation = get_random_u32(); @@ -663,7 +673,7 @@ static int ext4_ioctl_setflags(struct inode *inode, ext4_set_inode_flags(inode, false); - inode->i_ctime = current_time(inode); + inode_set_ctime_current(inode); inode_inc_iversion(inode); err = ext4_mark_iloc_dirty(handle, inode, &iloc); @@ -774,7 +784,7 @@ static int ext4_ioctl_setproject(struct inode *inode, __u32 projid) } EXT4_I(inode)->i_projid = kprojid; - inode->i_ctime = current_time(inode); + inode_set_ctime_current(inode); inode_inc_iversion(inode); out_dirty: rc = ext4_mark_iloc_dirty(handle, inode, &iloc); @@ -801,7 +811,7 @@ int ext4_force_shutdown(struct super_block *sb, u32 flags) if (flags > EXT4_GOING_FLAGS_NOLOGFLUSH) return -EINVAL; - if (ext4_forced_shutdown(sbi)) + if (ext4_forced_shutdown(sb)) return 0; ext4_msg(sb, KERN_ALERT, "shut down requested (%d)", flags); @@ -1266,7 +1276,7 @@ static long __ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) } err = ext4_reserve_inode_write(handle, inode, &iloc); if (err == 0) { - inode->i_ctime = current_time(inode); + inode_set_ctime_current(inode); inode_inc_iversion(inode); inode->i_generation = generation; err = ext4_mark_iloc_dirty(handle, inode, &iloc); diff --git a/fs/ext4/mballoc-test.c b/fs/ext4/mballoc-test.c new file mode 100644 index 000000000000..f94901fd3835 --- /dev/null +++ b/fs/ext4/mballoc-test.c @@ -0,0 +1,349 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * KUnit test of ext4 multiblocks allocation. + */ + +#include <kunit/test.h> +#include <kunit/static_stub.h> + +#include "ext4.h" + +struct mbt_grp_ctx { + struct buffer_head bitmap_bh; + /* desc and gd_bh are just the place holders for now */ + struct ext4_group_desc desc; + struct buffer_head gd_bh; +}; + +struct mbt_ctx { + struct mbt_grp_ctx *grp_ctx; +}; + +struct mbt_ext4_super_block { + struct super_block sb; + struct mbt_ctx mbt_ctx; +}; + +#define MBT_CTX(_sb) (&(container_of((_sb), struct mbt_ext4_super_block, sb)->mbt_ctx)) +#define MBT_GRP_CTX(_sb, _group) (&MBT_CTX(_sb)->grp_ctx[_group]) + +static struct super_block *mbt_ext4_alloc_super_block(void) +{ + struct ext4_super_block *es = kzalloc(sizeof(*es), GFP_KERNEL); + struct ext4_sb_info *sbi = kzalloc(sizeof(*sbi), GFP_KERNEL); + struct mbt_ext4_super_block *fsb = kzalloc(sizeof(*fsb), GFP_KERNEL); + + if (fsb == NULL || sbi == NULL || es == NULL) + goto out; + + sbi->s_es = es; + fsb->sb.s_fs_info = sbi; + return &fsb->sb; + +out: + kfree(fsb); + kfree(sbi); + kfree(es); + return NULL; +} + +static void mbt_ext4_free_super_block(struct super_block *sb) +{ + struct mbt_ext4_super_block *fsb = + container_of(sb, struct mbt_ext4_super_block, sb); + struct ext4_sb_info *sbi = EXT4_SB(sb); + + kfree(sbi->s_es); + kfree(sbi); + kfree(fsb); +} + +struct mbt_ext4_block_layout { + unsigned char blocksize_bits; + unsigned int cluster_bits; + uint32_t blocks_per_group; + ext4_group_t group_count; + uint16_t desc_size; +}; + +static void mbt_init_sb_layout(struct super_block *sb, + struct mbt_ext4_block_layout *layout) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_super_block *es = sbi->s_es; + + sb->s_blocksize = 1UL << layout->blocksize_bits; + sb->s_blocksize_bits = layout->blocksize_bits; + + sbi->s_groups_count = layout->group_count; + sbi->s_blocks_per_group = layout->blocks_per_group; + sbi->s_cluster_bits = layout->cluster_bits; + sbi->s_cluster_ratio = 1U << layout->cluster_bits; + sbi->s_clusters_per_group = layout->blocks_per_group >> + layout->cluster_bits; + sbi->s_desc_size = layout->desc_size; + + es->s_first_data_block = cpu_to_le32(0); + es->s_blocks_count_lo = cpu_to_le32(layout->blocks_per_group * + layout->group_count); +} + +static int mbt_grp_ctx_init(struct super_block *sb, + struct mbt_grp_ctx *grp_ctx) +{ + grp_ctx->bitmap_bh.b_data = kzalloc(EXT4_BLOCK_SIZE(sb), GFP_KERNEL); + if (grp_ctx->bitmap_bh.b_data == NULL) + return -ENOMEM; + + return 0; +} + +static void mbt_grp_ctx_release(struct mbt_grp_ctx *grp_ctx) +{ + kfree(grp_ctx->bitmap_bh.b_data); + grp_ctx->bitmap_bh.b_data = NULL; +} + +static void mbt_ctx_mark_used(struct super_block *sb, ext4_group_t group, + unsigned int start, unsigned int len) +{ + struct mbt_grp_ctx *grp_ctx = MBT_GRP_CTX(sb, group); + + mb_set_bits(grp_ctx->bitmap_bh.b_data, start, len); +} + +/* called after mbt_init_sb_layout */ +static int mbt_ctx_init(struct super_block *sb) +{ + struct mbt_ctx *ctx = MBT_CTX(sb); + ext4_group_t i, ngroups = ext4_get_groups_count(sb); + + ctx->grp_ctx = kcalloc(ngroups, sizeof(struct mbt_grp_ctx), + GFP_KERNEL); + if (ctx->grp_ctx == NULL) + return -ENOMEM; + + for (i = 0; i < ngroups; i++) + if (mbt_grp_ctx_init(sb, &ctx->grp_ctx[i])) + goto out; + + /* + * first data block(first cluster in first group) is used by + * metadata, mark it used to avoid to alloc data block at first + * block which will fail ext4_sb_block_valid check. + */ + mb_set_bits(ctx->grp_ctx[0].bitmap_bh.b_data, 0, 1); + + return 0; +out: + while (i-- > 0) + mbt_grp_ctx_release(&ctx->grp_ctx[i]); + kfree(ctx->grp_ctx); + return -ENOMEM; +} + +static void mbt_ctx_release(struct super_block *sb) +{ + struct mbt_ctx *ctx = MBT_CTX(sb); + ext4_group_t i, ngroups = ext4_get_groups_count(sb); + + for (i = 0; i < ngroups; i++) + mbt_grp_ctx_release(&ctx->grp_ctx[i]); + kfree(ctx->grp_ctx); +} + +static struct buffer_head * +ext4_read_block_bitmap_nowait_stub(struct super_block *sb, ext4_group_t block_group, + bool ignore_locked) +{ + struct mbt_grp_ctx *grp_ctx = MBT_GRP_CTX(sb, block_group); + + /* paired with brelse from caller of ext4_read_block_bitmap_nowait */ + get_bh(&grp_ctx->bitmap_bh); + return &grp_ctx->bitmap_bh; +} + +static int ext4_wait_block_bitmap_stub(struct super_block *sb, + ext4_group_t block_group, + struct buffer_head *bh) +{ + return 0; +} + +static struct ext4_group_desc * +ext4_get_group_desc_stub(struct super_block *sb, ext4_group_t block_group, + struct buffer_head **bh) +{ + struct mbt_grp_ctx *grp_ctx = MBT_GRP_CTX(sb, block_group); + + if (bh != NULL) + *bh = &grp_ctx->gd_bh; + + return &grp_ctx->desc; +} + +static int +ext4_mb_mark_context_stub(handle_t *handle, struct super_block *sb, bool state, + ext4_group_t group, ext4_grpblk_t blkoff, + ext4_grpblk_t len, int flags, + ext4_grpblk_t *ret_changed) +{ + struct mbt_grp_ctx *grp_ctx = MBT_GRP_CTX(sb, group); + struct buffer_head *bitmap_bh = &grp_ctx->bitmap_bh; + + if (state) + mb_set_bits(bitmap_bh->b_data, blkoff, len); + else + mb_clear_bits(bitmap_bh->b_data, blkoff, len); + + return 0; +} + +#define TEST_GOAL_GROUP 1 +static int mbt_kunit_init(struct kunit *test) +{ + struct mbt_ext4_block_layout *layout = + (struct mbt_ext4_block_layout *)(test->param_value); + struct super_block *sb; + int ret; + + sb = mbt_ext4_alloc_super_block(); + if (sb == NULL) + return -ENOMEM; + + mbt_init_sb_layout(sb, layout); + + ret = mbt_ctx_init(sb); + if (ret != 0) { + mbt_ext4_free_super_block(sb); + return ret; + } + + test->priv = sb; + kunit_activate_static_stub(test, + ext4_read_block_bitmap_nowait, + ext4_read_block_bitmap_nowait_stub); + kunit_activate_static_stub(test, + ext4_wait_block_bitmap, + ext4_wait_block_bitmap_stub); + kunit_activate_static_stub(test, + ext4_get_group_desc, + ext4_get_group_desc_stub); + kunit_activate_static_stub(test, + ext4_mb_mark_context, + ext4_mb_mark_context_stub); + return 0; +} + +static void mbt_kunit_exit(struct kunit *test) +{ + struct super_block *sb = (struct super_block *)test->priv; + + mbt_ctx_release(sb); + mbt_ext4_free_super_block(sb); +} + +static void test_new_blocks_simple(struct kunit *test) +{ + struct super_block *sb = (struct super_block *)test->priv; + struct inode inode = { .i_sb = sb, }; + struct ext4_allocation_request ar; + ext4_group_t i, goal_group = TEST_GOAL_GROUP; + int err = 0; + ext4_fsblk_t found; + struct ext4_sb_info *sbi = EXT4_SB(sb); + + ar.inode = &inode; + + /* get block at goal */ + ar.goal = ext4_group_first_block_no(sb, goal_group); + found = ext4_mb_new_blocks_simple(&ar, &err); + KUNIT_ASSERT_EQ_MSG(test, ar.goal, found, + "failed to alloc block at goal, expected %llu found %llu", + ar.goal, found); + + /* get block after goal in goal group */ + ar.goal = ext4_group_first_block_no(sb, goal_group); + found = ext4_mb_new_blocks_simple(&ar, &err); + KUNIT_ASSERT_EQ_MSG(test, ar.goal + EXT4_C2B(sbi, 1), found, + "failed to alloc block after goal in goal group, expected %llu found %llu", + ar.goal + 1, found); + + /* get block after goal group */ + mbt_ctx_mark_used(sb, goal_group, 0, EXT4_CLUSTERS_PER_GROUP(sb)); + ar.goal = ext4_group_first_block_no(sb, goal_group); + found = ext4_mb_new_blocks_simple(&ar, &err); + KUNIT_ASSERT_EQ_MSG(test, + ext4_group_first_block_no(sb, goal_group + 1), found, + "failed to alloc block after goal group, expected %llu found %llu", + ext4_group_first_block_no(sb, goal_group + 1), found); + + /* get block before goal group */ + for (i = goal_group; i < ext4_get_groups_count(sb); i++) + mbt_ctx_mark_used(sb, i, 0, EXT4_CLUSTERS_PER_GROUP(sb)); + ar.goal = ext4_group_first_block_no(sb, goal_group); + found = ext4_mb_new_blocks_simple(&ar, &err); + KUNIT_ASSERT_EQ_MSG(test, + ext4_group_first_block_no(sb, 0) + EXT4_C2B(sbi, 1), found, + "failed to alloc block before goal group, expected %llu found %llu", + ext4_group_first_block_no(sb, 0 + EXT4_C2B(sbi, 1)), found); + + /* no block available, fail to allocate block */ + for (i = 0; i < ext4_get_groups_count(sb); i++) + mbt_ctx_mark_used(sb, i, 0, EXT4_CLUSTERS_PER_GROUP(sb)); + ar.goal = ext4_group_first_block_no(sb, goal_group); + found = ext4_mb_new_blocks_simple(&ar, &err); + KUNIT_ASSERT_NE_MSG(test, err, 0, + "unexpectedly get block when no block is available"); +} + +static const struct mbt_ext4_block_layout mbt_test_layouts[] = { + { + .blocksize_bits = 10, + .cluster_bits = 3, + .blocks_per_group = 8192, + .group_count = 4, + .desc_size = 64, + }, + { + .blocksize_bits = 12, + .cluster_bits = 3, + .blocks_per_group = 8192, + .group_count = 4, + .desc_size = 64, + }, + { + .blocksize_bits = 16, + .cluster_bits = 3, + .blocks_per_group = 8192, + .group_count = 4, + .desc_size = 64, + }, +}; + +static void mbt_show_layout(const struct mbt_ext4_block_layout *layout, + char *desc) +{ + snprintf(desc, KUNIT_PARAM_DESC_SIZE, "block_bits=%d cluster_bits=%d " + "blocks_per_group=%d group_count=%d desc_size=%d\n", + layout->blocksize_bits, layout->cluster_bits, + layout->blocks_per_group, layout->group_count, + layout->desc_size); +} +KUNIT_ARRAY_PARAM(mbt_layouts, mbt_test_layouts, mbt_show_layout); + +static struct kunit_case mbt_test_cases[] = { + KUNIT_CASE_PARAM(test_new_blocks_simple, mbt_layouts_gen_params), + {} +}; + +static struct kunit_suite mbt_test_suite = { + .name = "ext4_mballoc_test", + .init = mbt_kunit_init, + .exit = mbt_kunit_exit, + .test_cases = mbt_test_cases, +}; + +kunit_test_suites(&mbt_test_suite); + +MODULE_LICENSE("GPL"); diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 21b903fe546e..d72b5e3c92ec 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -16,7 +16,9 @@ #include <linux/slab.h> #include <linux/nospec.h> #include <linux/backing-dev.h> +#include <linux/freezer.h> #include <trace/events/ext4.h> +#include <kunit/static_stub.h> /* * MUSTDO: @@ -416,8 +418,6 @@ static const char * const ext4_groupinfo_slab_names[NR_GRPINFO_CACHES] = { static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap, ext4_group_t group); -static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap, - ext4_group_t group); static void ext4_mb_new_preallocation(struct ext4_allocation_context *ac); static bool ext4_mb_good_group(struct ext4_allocation_context *ac, @@ -874,7 +874,7 @@ static void ext4_mb_choose_next_group_p2_aligned(struct ext4_allocation_context enum criteria *new_cr, ext4_group_t *group, ext4_group_t ngroups) { struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); - struct ext4_group_info *iter, *grp; + struct ext4_group_info *iter; int i; if (ac->ac_status == AC_STATUS_FOUND) @@ -883,7 +883,6 @@ static void ext4_mb_choose_next_group_p2_aligned(struct ext4_allocation_context if (unlikely(sbi->s_mb_stats && ac->ac_flags & EXT4_MB_CR_POWER2_ALIGNED_OPTIMIZED)) atomic_inc(&sbi->s_bal_p2_aligned_bad_suggestions); - grp = NULL; for (i = ac->ac_2order; i < MB_NUM_ORDERS(ac->ac_sb); i++) { if (list_empty(&sbi->s_mb_largest_free_orders[i])) continue; @@ -892,28 +891,22 @@ static void ext4_mb_choose_next_group_p2_aligned(struct ext4_allocation_context read_unlock(&sbi->s_mb_largest_free_orders_locks[i]); continue; } - grp = NULL; list_for_each_entry(iter, &sbi->s_mb_largest_free_orders[i], bb_largest_free_order_node) { if (sbi->s_mb_stats) atomic64_inc(&sbi->s_bal_cX_groups_considered[CR_POWER2_ALIGNED]); if (likely(ext4_mb_good_group(ac, iter->bb_group, CR_POWER2_ALIGNED))) { - grp = iter; - break; + *group = iter->bb_group; + ac->ac_flags |= EXT4_MB_CR_POWER2_ALIGNED_OPTIMIZED; + read_unlock(&sbi->s_mb_largest_free_orders_locks[i]); + return; } } read_unlock(&sbi->s_mb_largest_free_orders_locks[i]); - if (grp) - break; } - if (!grp) { - /* Increment cr and search again */ - *new_cr = CR_GOAL_LEN_FAST; - } else { - *group = grp->bb_group; - ac->ac_flags |= EXT4_MB_CR_POWER2_ALIGNED_OPTIMIZED; - } + /* Increment cr and search again if no group is found */ + *new_cr = CR_GOAL_LEN_FAST; } /* @@ -966,16 +959,25 @@ static void ext4_mb_choose_next_group_goal_fast(struct ext4_allocation_context * for (i = mb_avg_fragment_size_order(ac->ac_sb, ac->ac_g_ex.fe_len); i < MB_NUM_ORDERS(ac->ac_sb); i++) { grp = ext4_mb_find_good_group_avg_frag_lists(ac, i); - if (grp) - break; + if (grp) { + *group = grp->bb_group; + ac->ac_flags |= EXT4_MB_CR_GOAL_LEN_FAST_OPTIMIZED; + return; + } } - if (grp) { - *group = grp->bb_group; - ac->ac_flags |= EXT4_MB_CR_GOAL_LEN_FAST_OPTIMIZED; - } else { + /* + * CR_BEST_AVAIL_LEN works based on the concept that we have + * a larger normalized goal len request which can be trimmed to + * a smaller goal len such that it can still satisfy original + * request len. However, allocation request for non-regular + * files never gets normalized. + * See function ext4_mb_normalize_request() (EXT4_MB_HINT_DATA). + */ + if (ac->ac_flags & EXT4_MB_HINT_DATA) *new_cr = CR_BEST_AVAIL_LEN; - } + else + *new_cr = CR_GOAL_LEN_SLOW; } /* @@ -1051,18 +1053,16 @@ static void ext4_mb_choose_next_group_best_avail(struct ext4_allocation_context ac->ac_g_ex.fe_len); grp = ext4_mb_find_good_group_avg_frag_lists(ac, frag_order); - if (grp) - break; + if (grp) { + *group = grp->bb_group; + ac->ac_flags |= EXT4_MB_CR_BEST_AVAIL_LEN_OPTIMIZED; + return; + } } - if (grp) { - *group = grp->bb_group; - ac->ac_flags |= EXT4_MB_CR_BEST_AVAIL_LEN_OPTIMIZED; - } else { - /* Reset goal length to original goal length before falling into CR_GOAL_LEN_SLOW */ - ac->ac_g_ex.fe_len = ac->ac_orig_goal_len; - *new_cr = CR_GOAL_LEN_SLOW; - } + /* Reset goal length to original goal length before falling into CR_GOAL_LEN_SLOW */ + ac->ac_g_ex.fe_len = ac->ac_orig_goal_len; + *new_cr = CR_GOAL_LEN_SLOW; } static inline int should_optimize_scan(struct ext4_allocation_context *ac) @@ -1080,8 +1080,9 @@ static inline int should_optimize_scan(struct ext4_allocation_context *ac) * Return next linear group for allocation. If linear traversal should not be * performed, this function just returns the same group */ -static int -next_linear_group(struct ext4_allocation_context *ac, int group, int ngroups) +static ext4_group_t +next_linear_group(struct ext4_allocation_context *ac, ext4_group_t group, + ext4_group_t ngroups) { if (!should_optimize_scan(ac)) goto inc_and_return; @@ -1255,7 +1256,7 @@ void ext4_mb_generate_buddy(struct super_block *sb, static int ext4_mb_init_cache(struct page *page, char *incore, gfp_t gfp) { ext4_group_t ngroups; - int blocksize; + unsigned int blocksize; int blocks_per_page; int groups_per_page; int err = 0; @@ -1359,17 +1360,17 @@ static int ext4_mb_init_cache(struct page *page, char *incore, gfp_t gfp) * We place the buddy block and bitmap block * close together */ + grinfo = ext4_get_group_info(sb, group); + if (!grinfo) { + err = -EFSCORRUPTED; + goto out; + } if ((first_block + i) & 1) { /* this is block of buddy */ BUG_ON(incore == NULL); mb_debug(sb, "put buddy for group %u in page %lu/%x\n", group, page->index, i * blocksize); trace_ext4_mb_buddy_bitmap_load(sb, group); - grinfo = ext4_get_group_info(sb, group); - if (!grinfo) { - err = -EFSCORRUPTED; - goto out; - } grinfo->bb_fragments = 0; memset(grinfo->bb_counters, 0, sizeof(*grinfo->bb_counters) * @@ -1396,7 +1397,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore, gfp_t gfp) /* mark all preallocated blks used in in-core bitmap */ ext4_mb_generate_from_pa(sb, data, group); - ext4_mb_generate_from_freelist(sb, data, group); + WARN_ON_ONCE(!RB_EMPTY_ROOT(&grinfo->bb_free_root)); ext4_unlock_group(sb, group); /* set incore so that the buddy information can be @@ -2450,7 +2451,7 @@ void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac, break; } - if (ac->ac_criteria < CR_FAST) { + if (!ext4_mb_cr_expensive(ac->ac_criteria)) { /* * In CR_GOAL_LEN_FAST and CR_BEST_AVAIL_LEN, we are * sure that this group will have a large enough @@ -2553,7 +2554,7 @@ static bool ext4_mb_good_group(struct ext4_allocation_context *ac, BUG_ON(cr < CR_POWER2_ALIGNED || cr >= EXT4_MB_NUM_CRS); - if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(grp) || !grp)) + if (unlikely(!grp || EXT4_MB_GRP_BBITMAP_CORRUPT(grp))) return false; free = grp->bb_free; @@ -2634,7 +2635,12 @@ static int ext4_mb_good_group_nolock(struct ext4_allocation_context *ac, free = grp->bb_free; if (free == 0) goto out; - if (cr <= CR_FAST && free < ac->ac_g_ex.fe_len) + /* + * In all criterias except CR_ANY_FREE we try to avoid groups that + * can't possibly satisfy the full goal request due to insufficient + * free blocks. + */ + if (cr < CR_ANY_FREE && free < ac->ac_g_ex.fe_len) goto out; if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(grp))) goto out; @@ -2658,7 +2664,7 @@ static int ext4_mb_good_group_nolock(struct ext4_allocation_context *ac, * sure we locate metadata blocks in the first block group in * the flex_bg if possible. */ - if (cr < CR_FAST && + if (!ext4_mb_cr_expensive(cr) && (!sbi->s_log_groups_per_flex || ((group & ((1 << sbi->s_log_groups_per_flex) - 1)) != 0)) && !(ext4_has_group_desc_csum(sb) && @@ -2787,8 +2793,8 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac) /* * ac->ac_2order is set only if the fe_len is a power of 2 - * if ac->ac_2order is set we also set criteria to 0 so that we - * try exact allocation using buddy. + * if ac->ac_2order is set we also set criteria to CR_POWER2_ALIGNED + * so that we try exact allocation using buddy. */ i = fls(ac->ac_g_ex.fe_len); ac->ac_2order = 0; @@ -2800,10 +2806,7 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac) * requests upto maximum buddy size we have constructed. */ if (i >= sbi->s_mb_order2_reqs && i <= MB_NUM_ORDERS(sb)) { - /* - * This should tell if fe_len is exactly power of 2 - */ - if ((ac->ac_g_ex.fe_len & (~(1 << (i - 1)))) == 0) + if (is_power_of_2(ac->ac_g_ex.fe_len)) ac->ac_2order = array_index_nospec(i - 1, MB_NUM_ORDERS(sb)); } @@ -2848,11 +2851,11 @@ repeat: /* * Batch reads of the block allocation bitmaps * to get multiple READs in flight; limit - * prefetching at cr=0/1, otherwise mballoc can - * spend a lot of time loading imperfect groups + * prefetching at inexpensive CR, otherwise mballoc + * can spend a lot of time loading imperfect groups */ if ((prefetch_grp == group) && - (cr >= CR_FAST || + (ext4_mb_cr_expensive(cr) || prefetch_ios < sbi->s_mb_prefetch_limit)) { nr = sbi->s_mb_prefetch; if (ext4_has_feature_flex_bg(sb)) { @@ -3501,11 +3504,10 @@ static void ext4_discard_work(struct work_struct *work) struct super_block *sb = sbi->s_sb; struct ext4_free_data *fd, *nfd; struct ext4_buddy e4b; - struct list_head discard_list; + LIST_HEAD(discard_list); ext4_group_t grp, load_grp; int err = 0; - INIT_LIST_HEAD(&discard_list); spin_lock(&sbi->s_md_lock); list_splice_init(&sbi->s_discard_list, &discard_list); spin_unlock(&sbi->s_md_lock); @@ -3628,7 +3630,8 @@ int ext4_mb_init(struct super_block *sb) spin_lock_init(&sbi->s_md_lock); sbi->s_mb_free_pending = 0; - INIT_LIST_HEAD(&sbi->s_freed_data_list); + INIT_LIST_HEAD(&sbi->s_freed_data_list[0]); + INIT_LIST_HEAD(&sbi->s_freed_data_list[1]); INIT_LIST_HEAD(&sbi->s_discard_list); INIT_WORK(&sbi->s_discard_work, ext4_discard_work); atomic_set(&sbi->s_retry_alloc_pending, 0); @@ -3879,22 +3882,11 @@ void ext4_process_freed_data(struct super_block *sb, tid_t commit_tid) { struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_free_data *entry, *tmp; - struct list_head freed_data_list; - struct list_head *cut_pos = NULL; + LIST_HEAD(freed_data_list); + struct list_head *s_freed_head = &sbi->s_freed_data_list[commit_tid & 1]; bool wake; - INIT_LIST_HEAD(&freed_data_list); - - spin_lock(&sbi->s_md_lock); - list_for_each_entry(entry, &sbi->s_freed_data_list, efd_list) { - if (entry->efd_tid != commit_tid) - break; - cut_pos = &entry->efd_list; - } - if (cut_pos) - list_cut_position(&freed_data_list, &sbi->s_freed_data_list, - cut_pos); - spin_unlock(&sbi->s_md_lock); + list_replace_init(s_freed_head, &freed_data_list); list_for_each_entry(entry, &freed_data_list, efd_list) ext4_free_data_in_buddy(sb, entry); @@ -3952,6 +3944,111 @@ void ext4_exit_mballoc(void) ext4_groupinfo_destroy_slabs(); } +#define EXT4_MB_BITMAP_MARKED_CHECK 0x0001 +#define EXT4_MB_SYNC_UPDATE 0x0002 +static int +ext4_mb_mark_context(handle_t *handle, struct super_block *sb, bool state, + ext4_group_t group, ext4_grpblk_t blkoff, + ext4_grpblk_t len, int flags, ext4_grpblk_t *ret_changed) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct buffer_head *bitmap_bh = NULL; + struct ext4_group_desc *gdp; + struct buffer_head *gdp_bh; + int err; + unsigned int i, already, changed = len; + + KUNIT_STATIC_STUB_REDIRECT(ext4_mb_mark_context, + handle, sb, state, group, blkoff, len, + flags, ret_changed); + + if (ret_changed) + *ret_changed = 0; + bitmap_bh = ext4_read_block_bitmap(sb, group); + if (IS_ERR(bitmap_bh)) + return PTR_ERR(bitmap_bh); + + if (handle) { + BUFFER_TRACE(bitmap_bh, "getting write access"); + err = ext4_journal_get_write_access(handle, sb, bitmap_bh, + EXT4_JTR_NONE); + if (err) + goto out_err; + } + + err = -EIO; + gdp = ext4_get_group_desc(sb, group, &gdp_bh); + if (!gdp) + goto out_err; + + if (handle) { + BUFFER_TRACE(gdp_bh, "get_write_access"); + err = ext4_journal_get_write_access(handle, sb, gdp_bh, + EXT4_JTR_NONE); + if (err) + goto out_err; + } + + ext4_lock_group(sb, group); + if (ext4_has_group_desc_csum(sb) && + (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT))) { + gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT); + ext4_free_group_clusters_set(sb, gdp, + ext4_free_clusters_after_init(sb, group, gdp)); + } + + if (flags & EXT4_MB_BITMAP_MARKED_CHECK) { + already = 0; + for (i = 0; i < len; i++) + if (mb_test_bit(blkoff + i, bitmap_bh->b_data) == + state) + already++; + changed = len - already; + } + + if (state) { + mb_set_bits(bitmap_bh->b_data, blkoff, len); + ext4_free_group_clusters_set(sb, gdp, + ext4_free_group_clusters(sb, gdp) - changed); + } else { + mb_clear_bits(bitmap_bh->b_data, blkoff, len); + ext4_free_group_clusters_set(sb, gdp, + ext4_free_group_clusters(sb, gdp) + changed); + } + + ext4_block_bitmap_csum_set(sb, gdp, bitmap_bh); + ext4_group_desc_csum_set(sb, group, gdp); + ext4_unlock_group(sb, group); + if (ret_changed) + *ret_changed = changed; + + if (sbi->s_log_groups_per_flex) { + ext4_group_t flex_group = ext4_flex_group(sbi, group); + struct flex_groups *fg = sbi_array_rcu_deref(sbi, + s_flex_groups, flex_group); + + if (state) + atomic64_sub(changed, &fg->free_clusters); + else + atomic64_add(changed, &fg->free_clusters); + } + + err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh); + if (err) + goto out_err; + err = ext4_handle_dirty_metadata(handle, NULL, gdp_bh); + if (err) + goto out_err; + + if (flags & EXT4_MB_SYNC_UPDATE) { + sync_dirty_buffer(bitmap_bh); + sync_dirty_buffer(gdp_bh); + } + +out_err: + brelse(bitmap_bh); + return err; +} /* * Check quota and mark chosen space (ac->ac_b_ex) non-free in bitmaps @@ -3961,13 +4058,13 @@ static noinline_for_stack int ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac, handle_t *handle, unsigned int reserv_clstrs) { - struct buffer_head *bitmap_bh = NULL; struct ext4_group_desc *gdp; - struct buffer_head *gdp_bh; struct ext4_sb_info *sbi; struct super_block *sb; ext4_fsblk_t block; int err, len; + int flags = 0; + ext4_grpblk_t changed; BUG_ON(ac->ac_status != AC_STATUS_FOUND); BUG_ON(ac->ac_b_ex.fe_len <= 0); @@ -3975,32 +4072,13 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac, sb = ac->ac_sb; sbi = EXT4_SB(sb); - bitmap_bh = ext4_read_block_bitmap(sb, ac->ac_b_ex.fe_group); - if (IS_ERR(bitmap_bh)) { - return PTR_ERR(bitmap_bh); - } - - BUFFER_TRACE(bitmap_bh, "getting write access"); - err = ext4_journal_get_write_access(handle, sb, bitmap_bh, - EXT4_JTR_NONE); - if (err) - goto out_err; - - err = -EIO; - gdp = ext4_get_group_desc(sb, ac->ac_b_ex.fe_group, &gdp_bh); + gdp = ext4_get_group_desc(sb, ac->ac_b_ex.fe_group, NULL); if (!gdp) - goto out_err; - + return -EIO; ext4_debug("using block group %u(%d)\n", ac->ac_b_ex.fe_group, ext4_free_group_clusters(sb, gdp)); - BUFFER_TRACE(gdp_bh, "get_write_access"); - err = ext4_journal_get_write_access(handle, sb, gdp_bh, EXT4_JTR_NONE); - if (err) - goto out_err; - block = ext4_grp_offs_to_block(sb, &ac->ac_b_ex); - len = EXT4_C2B(sbi, ac->ac_b_ex.fe_len); if (!ext4_inode_block_valid(ac->ac_inode, block, len)) { ext4_error(sb, "Allocating blocks %llu-%llu which overlap " @@ -4009,41 +4087,29 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac, * Fix the bitmap and return EFSCORRUPTED * We leak some of the blocks here. */ - ext4_lock_group(sb, ac->ac_b_ex.fe_group); - mb_set_bits(bitmap_bh->b_data, ac->ac_b_ex.fe_start, - ac->ac_b_ex.fe_len); - ext4_unlock_group(sb, ac->ac_b_ex.fe_group); - err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh); + err = ext4_mb_mark_context(handle, sb, true, + ac->ac_b_ex.fe_group, + ac->ac_b_ex.fe_start, + ac->ac_b_ex.fe_len, + 0, NULL); if (!err) err = -EFSCORRUPTED; - goto out_err; + return err; } - ext4_lock_group(sb, ac->ac_b_ex.fe_group); #ifdef AGGRESSIVE_CHECK - { - int i; - for (i = 0; i < ac->ac_b_ex.fe_len; i++) { - BUG_ON(mb_test_bit(ac->ac_b_ex.fe_start + i, - bitmap_bh->b_data)); - } - } + flags |= EXT4_MB_BITMAP_MARKED_CHECK; #endif - mb_set_bits(bitmap_bh->b_data, ac->ac_b_ex.fe_start, - ac->ac_b_ex.fe_len); - if (ext4_has_group_desc_csum(sb) && - (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT))) { - gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT); - ext4_free_group_clusters_set(sb, gdp, - ext4_free_clusters_after_init(sb, - ac->ac_b_ex.fe_group, gdp)); - } - len = ext4_free_group_clusters(sb, gdp) - ac->ac_b_ex.fe_len; - ext4_free_group_clusters_set(sb, gdp, len); - ext4_block_bitmap_csum_set(sb, gdp, bitmap_bh); - ext4_group_desc_csum_set(sb, ac->ac_b_ex.fe_group, gdp); + err = ext4_mb_mark_context(handle, sb, true, ac->ac_b_ex.fe_group, + ac->ac_b_ex.fe_start, ac->ac_b_ex.fe_len, + flags, &changed); - ext4_unlock_group(sb, ac->ac_b_ex.fe_group); + if (err && changed == 0) + return err; + +#ifdef AGGRESSIVE_CHECK + BUG_ON(changed != ac->ac_b_ex.fe_len); +#endif percpu_counter_sub(&sbi->s_freeclusters_counter, ac->ac_b_ex.fe_len); /* * Now reduce the dirty block count also. Should not go negative @@ -4053,21 +4119,6 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac, percpu_counter_sub(&sbi->s_dirtyclusters_counter, reserv_clstrs); - if (sbi->s_log_groups_per_flex) { - ext4_group_t flex_group = ext4_flex_group(sbi, - ac->ac_b_ex.fe_group); - atomic64_sub(ac->ac_b_ex.fe_len, - &sbi_array_rcu_deref(sbi, s_flex_groups, - flex_group)->free_clusters); - } - - err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh); - if (err) - goto out_err; - err = ext4_handle_dirty_metadata(handle, NULL, gdp_bh); - -out_err: - brelse(bitmap_bh); return err; } @@ -4076,17 +4127,13 @@ out_err: * blocks in bitmaps and update counters. */ void ext4_mb_mark_bb(struct super_block *sb, ext4_fsblk_t block, - int len, int state) + int len, bool state) { - struct buffer_head *bitmap_bh = NULL; - struct ext4_group_desc *gdp; - struct buffer_head *gdp_bh; struct ext4_sb_info *sbi = EXT4_SB(sb); ext4_group_t group; ext4_grpblk_t blkoff; - int i, err; - int already; - unsigned int clen, clen_changed, thisgrp_len; + int err = 0; + unsigned int clen, thisgrp_len; while (len > 0) { ext4_get_group_no_and_offset(sb, block, &group, &blkoff); @@ -4107,80 +4154,21 @@ void ext4_mb_mark_bb(struct super_block *sb, ext4_fsblk_t block, ext4_error(sb, "Marking blocks in system zone - " "Block = %llu, len = %u", block, thisgrp_len); - bitmap_bh = NULL; - break; - } - - bitmap_bh = ext4_read_block_bitmap(sb, group); - if (IS_ERR(bitmap_bh)) { - err = PTR_ERR(bitmap_bh); - bitmap_bh = NULL; - break; - } - - err = -EIO; - gdp = ext4_get_group_desc(sb, group, &gdp_bh); - if (!gdp) break; - - ext4_lock_group(sb, group); - already = 0; - for (i = 0; i < clen; i++) - if (!mb_test_bit(blkoff + i, bitmap_bh->b_data) == - !state) - already++; - - clen_changed = clen - already; - if (state) - mb_set_bits(bitmap_bh->b_data, blkoff, clen); - else - mb_clear_bits(bitmap_bh->b_data, blkoff, clen); - if (ext4_has_group_desc_csum(sb) && - (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT))) { - gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT); - ext4_free_group_clusters_set(sb, gdp, - ext4_free_clusters_after_init(sb, group, gdp)); - } - if (state) - clen = ext4_free_group_clusters(sb, gdp) - clen_changed; - else - clen = ext4_free_group_clusters(sb, gdp) + clen_changed; - - ext4_free_group_clusters_set(sb, gdp, clen); - ext4_block_bitmap_csum_set(sb, gdp, bitmap_bh); - ext4_group_desc_csum_set(sb, group, gdp); - - ext4_unlock_group(sb, group); - - if (sbi->s_log_groups_per_flex) { - ext4_group_t flex_group = ext4_flex_group(sbi, group); - struct flex_groups *fg = sbi_array_rcu_deref(sbi, - s_flex_groups, flex_group); - - if (state) - atomic64_sub(clen_changed, &fg->free_clusters); - else - atomic64_add(clen_changed, &fg->free_clusters); - } - err = ext4_handle_dirty_metadata(NULL, NULL, bitmap_bh); - if (err) - break; - sync_dirty_buffer(bitmap_bh); - err = ext4_handle_dirty_metadata(NULL, NULL, gdp_bh); - sync_dirty_buffer(gdp_bh); + err = ext4_mb_mark_context(NULL, sb, state, + group, blkoff, clen, + EXT4_MB_BITMAP_MARKED_CHECK | + EXT4_MB_SYNC_UPDATE, + NULL); if (err) break; block += thisgrp_len; len -= thisgrp_len; - brelse(bitmap_bh); BUG_ON(len < 0); } - - if (err) - brelse(bitmap_bh); } /* @@ -4222,12 +4210,13 @@ ext4_mb_pa_rb_next_iter(ext4_lblk_t new_start, ext4_lblk_t cur_start, struct rb_ static inline void ext4_mb_pa_assert_overlap(struct ext4_allocation_context *ac, - ext4_lblk_t start, ext4_lblk_t end) + ext4_lblk_t start, loff_t end) { struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); struct ext4_inode_info *ei = EXT4_I(ac->ac_inode); struct ext4_prealloc_space *tmp_pa; - ext4_lblk_t tmp_pa_start, tmp_pa_end; + ext4_lblk_t tmp_pa_start; + loff_t tmp_pa_end; struct rb_node *iter; read_lock(&ei->i_prealloc_lock); @@ -4236,7 +4225,7 @@ ext4_mb_pa_assert_overlap(struct ext4_allocation_context *ac, tmp_pa = rb_entry(iter, struct ext4_prealloc_space, pa_node.inode_node); tmp_pa_start = tmp_pa->pa_lstart; - tmp_pa_end = tmp_pa->pa_lstart + EXT4_C2B(sbi, tmp_pa->pa_len); + tmp_pa_end = pa_logical_end(sbi, tmp_pa); spin_lock(&tmp_pa->pa_lock); if (tmp_pa->pa_deleted == 0) @@ -4258,14 +4247,14 @@ ext4_mb_pa_assert_overlap(struct ext4_allocation_context *ac, */ static inline void ext4_mb_pa_adjust_overlap(struct ext4_allocation_context *ac, - ext4_lblk_t *start, ext4_lblk_t *end) + ext4_lblk_t *start, loff_t *end) { struct ext4_inode_info *ei = EXT4_I(ac->ac_inode); struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); struct ext4_prealloc_space *tmp_pa = NULL, *left_pa = NULL, *right_pa = NULL; struct rb_node *iter; - ext4_lblk_t new_start, new_end; - ext4_lblk_t tmp_pa_start, tmp_pa_end, left_pa_end = -1, right_pa_start = -1; + ext4_lblk_t new_start, tmp_pa_start, right_pa_start = -1; + loff_t new_end, tmp_pa_end, left_pa_end = -1; new_start = *start; new_end = *end; @@ -4284,7 +4273,7 @@ ext4_mb_pa_adjust_overlap(struct ext4_allocation_context *ac, tmp_pa = rb_entry(iter, struct ext4_prealloc_space, pa_node.inode_node); tmp_pa_start = tmp_pa->pa_lstart; - tmp_pa_end = tmp_pa->pa_lstart + EXT4_C2B(sbi, tmp_pa->pa_len); + tmp_pa_end = pa_logical_end(sbi, tmp_pa); /* PA must not overlap original request */ spin_lock(&tmp_pa->pa_lock); @@ -4364,8 +4353,7 @@ ext4_mb_pa_adjust_overlap(struct ext4_allocation_context *ac, } if (left_pa) { - left_pa_end = - left_pa->pa_lstart + EXT4_C2B(sbi, left_pa->pa_len); + left_pa_end = pa_logical_end(sbi, left_pa); BUG_ON(left_pa_end > ac->ac_o_ex.fe_logical); } @@ -4404,8 +4392,7 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac, struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); struct ext4_super_block *es = sbi->s_es; int bsbits, max; - ext4_lblk_t end; - loff_t size, start_off; + loff_t size, start_off, end; loff_t orig_size __maybe_unused; ext4_lblk_t start; @@ -4432,7 +4419,7 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac, /* first, let's learn actual file size * given current request is allocated */ - size = ac->ac_o_ex.fe_logical + EXT4_C2B(sbi, ac->ac_o_ex.fe_len); + size = extent_logical_end(sbi, &ac->ac_o_ex); size = size << bsbits; if (size < i_size_read(ac->ac_inode)) size = i_size_read(ac->ac_inode); @@ -4491,6 +4478,10 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac, start = max(start, rounddown(ac->ac_o_ex.fe_logical, (ext4_lblk_t)EXT4_BLOCKS_PER_GROUP(ac->ac_sb))); + /* avoid unnecessary preallocation that may trigger assertions */ + if (start + size > EXT_MAX_BLOCKS) + size = EXT_MAX_BLOCKS - start; + /* don't cover already allocated blocks in selected range */ if (ar->pleft && start <= ar->lleft) { size -= ar->lleft + 1 - start; @@ -4766,7 +4757,6 @@ ext4_mb_use_preallocated(struct ext4_allocation_context *ac) struct ext4_inode_info *ei = EXT4_I(ac->ac_inode); struct ext4_locality_group *lg; struct ext4_prealloc_space *tmp_pa = NULL, *cpa = NULL; - loff_t tmp_pa_end; struct rb_node *iter; ext4_fsblk_t goal_block; @@ -4862,9 +4852,7 @@ ext4_mb_use_preallocated(struct ext4_allocation_context *ac) * pa can possibly satisfy the request hence check if it overlaps * original logical start and stop searching if it doesn't. */ - tmp_pa_end = (loff_t)tmp_pa->pa_lstart + EXT4_C2B(sbi, tmp_pa->pa_len); - - if (ac->ac_o_ex.fe_logical >= tmp_pa_end) { + if (ac->ac_o_ex.fe_logical >= pa_logical_end(sbi, tmp_pa)) { spin_unlock(&tmp_pa->pa_lock); goto try_group_pa; } @@ -4962,32 +4950,6 @@ try_group_pa: } /* - * the function goes through all block freed in the group - * but not yet committed and marks them used in in-core bitmap. - * buddy must be generated from this bitmap - * Need to be called with the ext4 group lock held - */ -static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap, - ext4_group_t group) -{ - struct rb_node *n; - struct ext4_group_info *grp; - struct ext4_free_data *entry; - - grp = ext4_get_group_info(sb, group); - if (!grp) - return; - n = rb_first(&(grp->bb_free_root)); - - while (n) { - entry = rb_entry(n, struct ext4_free_data, efd_node); - mb_set_bits(bitmap, entry->efd_start_cluster, entry->efd_count); - n = rb_next(n); - } - return; -} - -/* * the function goes through all preallocation in this group and marks them * used in in-core bitmap. buddy must be generated from this bitmap * Need to be called with ext4 group lock held @@ -5180,8 +5142,11 @@ ext4_mb_new_inode_pa(struct ext4_allocation_context *ac) pa = ac->ac_pa; if (ac->ac_b_ex.fe_len < ac->ac_orig_goal_len) { - int new_bex_start; - int new_bex_end; + struct ext4_free_extent ex = { + .fe_logical = ac->ac_g_ex.fe_logical, + .fe_len = ac->ac_orig_goal_len, + }; + loff_t orig_goal_end = extent_logical_end(sbi, &ex); /* we can't allocate as much as normalizer wants. * so, found space must get proper lstart @@ -5200,29 +5165,23 @@ ext4_mb_new_inode_pa(struct ext4_allocation_context *ac) * still cover original start * 3. Else, keep the best ex at start of original request. */ - new_bex_end = ac->ac_g_ex.fe_logical + - EXT4_C2B(sbi, ac->ac_orig_goal_len); - new_bex_start = new_bex_end - EXT4_C2B(sbi, ac->ac_b_ex.fe_len); - if (ac->ac_o_ex.fe_logical >= new_bex_start) - goto adjust_bex; + ex.fe_len = ac->ac_b_ex.fe_len; - new_bex_start = ac->ac_g_ex.fe_logical; - new_bex_end = - new_bex_start + EXT4_C2B(sbi, ac->ac_b_ex.fe_len); - if (ac->ac_o_ex.fe_logical < new_bex_end) + ex.fe_logical = orig_goal_end - EXT4_C2B(sbi, ex.fe_len); + if (ac->ac_o_ex.fe_logical >= ex.fe_logical) goto adjust_bex; - new_bex_start = ac->ac_o_ex.fe_logical; - new_bex_end = - new_bex_start + EXT4_C2B(sbi, ac->ac_b_ex.fe_len); + ex.fe_logical = ac->ac_g_ex.fe_logical; + if (ac->ac_o_ex.fe_logical < extent_logical_end(sbi, &ex)) + goto adjust_bex; + ex.fe_logical = ac->ac_o_ex.fe_logical; adjust_bex: - ac->ac_b_ex.fe_logical = new_bex_start; + ac->ac_b_ex.fe_logical = ex.fe_logical; BUG_ON(ac->ac_o_ex.fe_logical < ac->ac_b_ex.fe_logical); BUG_ON(ac->ac_o_ex.fe_len > ac->ac_b_ex.fe_len); - BUG_ON(new_bex_end > (ac->ac_g_ex.fe_logical + - EXT4_C2B(sbi, ac->ac_orig_goal_len))); + BUG_ON(extent_logical_end(sbi, &ex) > orig_goal_end); } pa->pa_lstart = ac->ac_b_ex.fe_logical; @@ -5419,7 +5378,7 @@ ext4_mb_discard_group_preallocations(struct super_block *sb, struct ext4_group_info *grp = ext4_get_group_info(sb, group); struct buffer_head *bitmap_bh = NULL; struct ext4_prealloc_space *pa, *tmp; - struct list_head list; + LIST_HEAD(list); struct ext4_buddy e4b; struct ext4_inode_info *ei; int err; @@ -5448,7 +5407,6 @@ ext4_mb_discard_group_preallocations(struct super_block *sb, goto out_dbg; } - INIT_LIST_HEAD(&list); ext4_lock_group(sb, group); list_for_each_entry_safe(pa, tmp, &grp->bb_prealloc_list, pa_group_list) { @@ -5529,7 +5487,7 @@ void ext4_discard_preallocations(struct inode *inode, unsigned int needed) struct buffer_head *bitmap_bh = NULL; struct ext4_prealloc_space *pa, *tmp; ext4_group_t group = 0; - struct list_head list; + LIST_HEAD(list); struct ext4_buddy e4b; struct rb_node *iter; int err; @@ -5546,8 +5504,6 @@ void ext4_discard_preallocations(struct inode *inode, unsigned int needed) trace_ext4_discard_preallocations(inode, atomic_read(&ei->i_prealloc_active), needed); - INIT_LIST_HEAD(&list); - if (needed == 0) needed = UINT_MAX; @@ -5671,7 +5627,7 @@ static inline void ext4_mb_show_pa(struct super_block *sb) { ext4_group_t i, ngroups; - if (ext4_test_mount_flag(sb, EXT4_MF_FS_ABORTED)) + if (ext4_forced_shutdown(sb)) return; ngroups = ext4_get_groups_count(sb); @@ -5705,7 +5661,7 @@ static void ext4_mb_show_ac(struct ext4_allocation_context *ac) { struct super_block *sb = ac->ac_sb; - if (ext4_test_mount_flag(sb, EXT4_MF_FS_ABORTED)) + if (ext4_forced_shutdown(sb)) return; mb_debug(sb, "Can't allocate:" @@ -5738,12 +5694,10 @@ static void ext4_mb_show_ac(struct ext4_allocation_context *ac) #else static inline void ext4_mb_show_pa(struct super_block *sb) { - return; } static inline void ext4_mb_show_ac(struct ext4_allocation_context *ac) { ext4_mb_show_pa(ac->ac_sb); - return; } #endif @@ -5769,7 +5723,7 @@ static void ext4_mb_group_or_file(struct ext4_allocation_context *ac) group_pa_eligible = sbi->s_mb_group_prealloc > 0; inode_pa_eligible = true; - size = ac->ac_o_ex.fe_logical + EXT4_C2B(sbi, ac->ac_o_ex.fe_len); + size = extent_logical_end(sbi, &ac->ac_o_ex); isize = (i_size_read(ac->ac_inode) + ac->ac_sb->s_blocksize - 1) >> bsbits; @@ -5865,13 +5819,11 @@ ext4_mb_discard_lg_preallocations(struct super_block *sb, { ext4_group_t group = 0; struct ext4_buddy e4b; - struct list_head discard_list; + LIST_HEAD(discard_list); struct ext4_prealloc_space *pa, *tmp; mb_debug(sb, "discard locality group preallocation\n"); - INIT_LIST_HEAD(&discard_list); - spin_lock(&lg->lg_prealloc_lock); list_for_each_entry_rcu(pa, &lg->lg_prealloc_list[order], pa_node.lg_list, @@ -5984,12 +5936,9 @@ static void ext4_mb_add_n_trim(struct ext4_allocation_context *ac) spin_unlock(&lg->lg_prealloc_lock); /* Now trim the list to be not more than 8 elements */ - if (lg_prealloc_count > 8) { + if (lg_prealloc_count > 8) ext4_mb_discard_lg_preallocations(sb, lg, order, lg_prealloc_count); - return; - } - return ; } /* @@ -6102,7 +6051,7 @@ ext4_mb_new_blocks_simple(struct ext4_allocation_request *ar, int *errp) ext4_grpblk_t max = EXT4_CLUSTERS_PER_GROUP(sb); ext4_grpblk_t i = 0; ext4_fsblk_t goal, block; - struct ext4_super_block *es = EXT4_SB(sb)->s_es; + struct ext4_super_block *es = sbi->s_es; goal = ar->goal; if (goal < le32_to_cpu(es->s_first_data_block) || @@ -6147,7 +6096,7 @@ ext4_mb_new_blocks_simple(struct ext4_allocation_request *ar, int *errp) } block = ext4_group_first_block_no(sb, group) + EXT4_C2B(sbi, i); - ext4_mb_mark_bb(sb, block, 1, 1); + ext4_mb_mark_bb(sb, block, 1, true); ar->len = 1; return block; @@ -6395,7 +6344,7 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b, } spin_lock(&sbi->s_md_lock); - list_add_tail(&new_entry->efd_list, &sbi->s_freed_data_list); + list_add_tail(&new_entry->efd_list, &sbi->s_freed_data_list[new_entry->efd_tid & 1]); sbi->s_mb_free_pending += clusters; spin_unlock(&sbi->s_md_lock); } @@ -6403,43 +6352,15 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b, static void ext4_free_blocks_simple(struct inode *inode, ext4_fsblk_t block, unsigned long count) { - struct buffer_head *bitmap_bh; struct super_block *sb = inode->i_sb; - struct ext4_group_desc *gdp; - struct buffer_head *gdp_bh; ext4_group_t group; ext4_grpblk_t blkoff; - int already_freed = 0, err, i; ext4_get_group_no_and_offset(sb, block, &group, &blkoff); - bitmap_bh = ext4_read_block_bitmap(sb, group); - if (IS_ERR(bitmap_bh)) { - pr_warn("Failed to read block bitmap\n"); - return; - } - gdp = ext4_get_group_desc(sb, group, &gdp_bh); - if (!gdp) - goto err_out; - - for (i = 0; i < count; i++) { - if (!mb_test_bit(blkoff + i, bitmap_bh->b_data)) - already_freed++; - } - mb_clear_bits(bitmap_bh->b_data, blkoff, count); - err = ext4_handle_dirty_metadata(NULL, NULL, bitmap_bh); - if (err) - goto err_out; - ext4_free_group_clusters_set( - sb, gdp, ext4_free_group_clusters(sb, gdp) + - count - already_freed); - ext4_block_bitmap_csum_set(sb, gdp, bitmap_bh); - ext4_group_desc_csum_set(sb, group, gdp); - ext4_handle_dirty_metadata(NULL, NULL, gdp_bh); - sync_dirty_buffer(bitmap_bh); - sync_dirty_buffer(gdp_bh); - -err_out: - brelse(bitmap_bh); + ext4_mb_mark_context(NULL, sb, false, group, blkoff, count, + EXT4_MB_BITMAP_MARKED_CHECK | + EXT4_MB_SYNC_UPDATE, + NULL); } /** @@ -6455,19 +6376,17 @@ static void ext4_mb_clear_bb(handle_t *handle, struct inode *inode, ext4_fsblk_t block, unsigned long count, int flags) { - struct buffer_head *bitmap_bh = NULL; struct super_block *sb = inode->i_sb; - struct ext4_group_desc *gdp; struct ext4_group_info *grp; unsigned int overflow; ext4_grpblk_t bit; - struct buffer_head *gd_bh; ext4_group_t block_group; struct ext4_sb_info *sbi; struct ext4_buddy e4b; unsigned int count_clusters; int err = 0; - int ret; + int mark_flags = 0; + ext4_grpblk_t changed; sbi = EXT4_SB(sb); @@ -6476,7 +6395,7 @@ static void ext4_mb_clear_bb(handle_t *handle, struct inode *inode, ext4_error(sb, "Freeing blocks in system zone - " "Block = %llu, count = %lu", block, count); /* err = 0. ext4_std_error should be a no op */ - goto error_return; + goto error_out; } flags |= EXT4_FREE_BLOCKS_VALIDATED; @@ -6500,55 +6419,35 @@ do_more: flags &= ~EXT4_FREE_BLOCKS_VALIDATED; } count_clusters = EXT4_NUM_B2C(sbi, count); - bitmap_bh = ext4_read_block_bitmap(sb, block_group); - if (IS_ERR(bitmap_bh)) { - err = PTR_ERR(bitmap_bh); - bitmap_bh = NULL; - goto error_return; - } - gdp = ext4_get_group_desc(sb, block_group, &gd_bh); - if (!gdp) { - err = -EIO; - goto error_return; - } + trace_ext4_mballoc_free(sb, inode, block_group, bit, count_clusters); + + /* __GFP_NOFAIL: retry infinitely, ignore TIF_MEMDIE and memcg limit. */ + err = ext4_mb_load_buddy_gfp(sb, block_group, &e4b, + GFP_NOFS|__GFP_NOFAIL); + if (err) + goto error_out; if (!(flags & EXT4_FREE_BLOCKS_VALIDATED) && !ext4_inode_block_valid(inode, block, count)) { ext4_error(sb, "Freeing blocks in system zone - " "Block = %llu, count = %lu", block, count); /* err = 0. ext4_std_error should be a no op */ - goto error_return; + goto error_clean; } - BUFFER_TRACE(bitmap_bh, "getting write access"); - err = ext4_journal_get_write_access(handle, sb, bitmap_bh, - EXT4_JTR_NONE); - if (err) - goto error_return; - - /* - * We are about to modify some metadata. Call the journal APIs - * to unshare ->b_data if a currently-committing transaction is - * using it - */ - BUFFER_TRACE(gd_bh, "get_write_access"); - err = ext4_journal_get_write_access(handle, sb, gd_bh, EXT4_JTR_NONE); - if (err) - goto error_return; #ifdef AGGRESSIVE_CHECK - { - int i; - for (i = 0; i < count_clusters; i++) - BUG_ON(!mb_test_bit(bit + i, bitmap_bh->b_data)); - } + mark_flags |= EXT4_MB_BITMAP_MARKED_CHECK; #endif - trace_ext4_mballoc_free(sb, inode, block_group, bit, count_clusters); + err = ext4_mb_mark_context(handle, sb, false, block_group, bit, + count_clusters, mark_flags, &changed); - /* __GFP_NOFAIL: retry infinitely, ignore TIF_MEMDIE and memcg limit. */ - err = ext4_mb_load_buddy_gfp(sb, block_group, &e4b, - GFP_NOFS|__GFP_NOFAIL); - if (err) - goto error_return; + + if (err && changed == 0) + goto error_clean; + +#ifdef AGGRESSIVE_CHECK + BUG_ON(changed != count_clusters); +#endif /* * We need to make sure we don't reuse the freed block until after the @@ -6572,13 +6471,8 @@ do_more: new_entry->efd_tid = handle->h_transaction->t_tid; ext4_lock_group(sb, block_group); - mb_clear_bits(bitmap_bh->b_data, bit, count_clusters); ext4_mb_free_metadata(handle, &e4b, new_entry); } else { - /* need to update group_info->bb_free and bitmap - * with group lock held. generate_buddy look at - * them with group lock_held - */ if (test_opt(sb, DISCARD)) { err = ext4_issue_discard(sb, block_group, bit, count_clusters, NULL); @@ -6591,23 +6485,11 @@ do_more: EXT4_MB_GRP_CLEAR_TRIMMED(e4b.bd_info); ext4_lock_group(sb, block_group); - mb_clear_bits(bitmap_bh->b_data, bit, count_clusters); mb_free_blocks(inode, &e4b, bit, count_clusters); } - ret = ext4_free_group_clusters(sb, gdp) + count_clusters; - ext4_free_group_clusters_set(sb, gdp, ret); - ext4_block_bitmap_csum_set(sb, gdp, bitmap_bh); - ext4_group_desc_csum_set(sb, block_group, gdp); ext4_unlock_group(sb, block_group); - if (sbi->s_log_groups_per_flex) { - ext4_group_t flex_group = ext4_flex_group(sbi, block_group); - atomic64_add(count_clusters, - &sbi_array_rcu_deref(sbi, s_flex_groups, - flex_group)->free_clusters); - } - /* * on a bigalloc file system, defer the s_freeclusters_counter * update to the caller (ext4_remove_space and friends) so they @@ -6620,30 +6502,19 @@ do_more: count_clusters); } - ext4_mb_unload_buddy(&e4b); - - /* We dirtied the bitmap block */ - BUFFER_TRACE(bitmap_bh, "dirtied bitmap block"); - err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh); - - /* And the group descriptor block */ - BUFFER_TRACE(gd_bh, "dirtied group descriptor block"); - ret = ext4_handle_dirty_metadata(handle, NULL, gd_bh); - if (!err) - err = ret; - if (overflow && !err) { block += count; count = overflow; - put_bh(bitmap_bh); + ext4_mb_unload_buddy(&e4b); /* The range changed so it's no longer validated */ flags &= ~EXT4_FREE_BLOCKS_VALIDATED; goto do_more; } -error_return: - brelse(bitmap_bh); + +error_clean: + ext4_mb_unload_buddy(&e4b); +error_out: ext4_std_error(sb, err); - return; } /** @@ -6746,7 +6617,6 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode, } ext4_mb_clear_bb(handle, inode, block, count, flags); - return; } /** @@ -6761,23 +6631,19 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode, int ext4_group_add_blocks(handle_t *handle, struct super_block *sb, ext4_fsblk_t block, unsigned long count) { - struct buffer_head *bitmap_bh = NULL; - struct buffer_head *gd_bh; ext4_group_t block_group; ext4_grpblk_t bit; - unsigned int i; - struct ext4_group_desc *desc; struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_buddy e4b; - int err = 0, ret, free_clusters_count; - ext4_grpblk_t clusters_freed; + int err = 0; ext4_fsblk_t first_cluster = EXT4_B2C(sbi, block); ext4_fsblk_t last_cluster = EXT4_B2C(sbi, block + count - 1); unsigned long cluster_count = last_cluster - first_cluster + 1; + ext4_grpblk_t changed; ext4_debug("Adding block(s) %llu-%llu\n", block, block + count - 1); - if (count == 0) + if (cluster_count == 0) return 0; ext4_get_group_no_and_offset(sb, block, &block_group, &bit); @@ -6789,99 +6655,39 @@ int ext4_group_add_blocks(handle_t *handle, struct super_block *sb, ext4_warning(sb, "too many blocks added to group %u", block_group); err = -EINVAL; - goto error_return; + goto error_out; } - bitmap_bh = ext4_read_block_bitmap(sb, block_group); - if (IS_ERR(bitmap_bh)) { - err = PTR_ERR(bitmap_bh); - bitmap_bh = NULL; - goto error_return; - } - - desc = ext4_get_group_desc(sb, block_group, &gd_bh); - if (!desc) { - err = -EIO; - goto error_return; - } + err = ext4_mb_load_buddy(sb, block_group, &e4b); + if (err) + goto error_out; if (!ext4_sb_block_valid(sb, NULL, block, count)) { ext4_error(sb, "Adding blocks in system zones - " "Block = %llu, count = %lu", block, count); err = -EINVAL; - goto error_return; + goto error_clean; } - BUFFER_TRACE(bitmap_bh, "getting write access"); - err = ext4_journal_get_write_access(handle, sb, bitmap_bh, - EXT4_JTR_NONE); - if (err) - goto error_return; - - /* - * We are about to modify some metadata. Call the journal APIs - * to unshare ->b_data if a currently-committing transaction is - * using it - */ - BUFFER_TRACE(gd_bh, "get_write_access"); - err = ext4_journal_get_write_access(handle, sb, gd_bh, EXT4_JTR_NONE); - if (err) - goto error_return; - - for (i = 0, clusters_freed = 0; i < cluster_count; i++) { - BUFFER_TRACE(bitmap_bh, "clear bit"); - if (!mb_test_bit(bit + i, bitmap_bh->b_data)) { - ext4_error(sb, "bit already cleared for block %llu", - (ext4_fsblk_t)(block + i)); - BUFFER_TRACE(bitmap_bh, "bit already cleared"); - } else { - clusters_freed++; - } - } + err = ext4_mb_mark_context(handle, sb, false, block_group, bit, + cluster_count, EXT4_MB_BITMAP_MARKED_CHECK, + &changed); + if (err && changed == 0) + goto error_clean; - err = ext4_mb_load_buddy(sb, block_group, &e4b); - if (err) - goto error_return; + if (changed != cluster_count) + ext4_error(sb, "bit already cleared in group %u", block_group); - /* - * need to update group_info->bb_free and bitmap - * with group lock held. generate_buddy look at - * them with group lock_held - */ ext4_lock_group(sb, block_group); - mb_clear_bits(bitmap_bh->b_data, bit, cluster_count); mb_free_blocks(NULL, &e4b, bit, cluster_count); - free_clusters_count = clusters_freed + - ext4_free_group_clusters(sb, desc); - ext4_free_group_clusters_set(sb, desc, free_clusters_count); - ext4_block_bitmap_csum_set(sb, desc, bitmap_bh); - ext4_group_desc_csum_set(sb, block_group, desc); ext4_unlock_group(sb, block_group); percpu_counter_add(&sbi->s_freeclusters_counter, - clusters_freed); - - if (sbi->s_log_groups_per_flex) { - ext4_group_t flex_group = ext4_flex_group(sbi, block_group); - atomic64_add(clusters_freed, - &sbi_array_rcu_deref(sbi, s_flex_groups, - flex_group)->free_clusters); - } + changed); +error_clean: ext4_mb_unload_buddy(&e4b); - - /* We dirtied the bitmap block */ - BUFFER_TRACE(bitmap_bh, "dirtied bitmap block"); - err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh); - - /* And the group descriptor block */ - BUFFER_TRACE(gd_bh, "dirtied group descriptor block"); - ret = ext4_handle_dirty_metadata(handle, NULL, gd_bh); - if (!err) - err = ret; - -error_return: - brelse(bitmap_bh); +error_out: ext4_std_error(sb, err); return err; } @@ -6926,6 +6732,21 @@ __acquires(bitlock) return ret; } +static ext4_grpblk_t ext4_last_grp_cluster(struct super_block *sb, + ext4_group_t grp) +{ + if (grp < ext4_get_groups_count(sb)) + return EXT4_CLUSTERS_PER_GROUP(sb) - 1; + return (ext4_blocks_count(EXT4_SB(sb)->s_es) - + ext4_group_first_block_no(sb, grp) - 1) >> + EXT4_CLUSTER_BITS(sb); +} + +static bool ext4_trim_interrupted(void) +{ + return fatal_signal_pending(current) || freezing(current); +} + static int ext4_try_to_trim_range(struct super_block *sb, struct ext4_buddy *e4b, ext4_grpblk_t start, ext4_grpblk_t max, ext4_grpblk_t minblocks) @@ -6933,11 +6754,13 @@ __acquires(ext4_group_lock_ptr(sb, e4b->bd_group)) __releases(ext4_group_lock_ptr(sb, e4b->bd_group)) { ext4_grpblk_t next, count, free_count; + bool set_trimmed = false; void *bitmap; bitmap = e4b->bd_bitmap; - start = (e4b->bd_info->bb_first_free > start) ? - e4b->bd_info->bb_first_free : start; + if (start == 0 && max >= ext4_last_grp_cluster(sb, e4b->bd_group)) + set_trimmed = true; + start = max(e4b->bd_info->bb_first_free, start); count = 0; free_count = 0; @@ -6951,16 +6774,14 @@ __releases(ext4_group_lock_ptr(sb, e4b->bd_group)) int ret = ext4_trim_extent(sb, start, next - start, e4b); if (ret && ret != -EOPNOTSUPP) - break; + return count; count += next - start; } free_count += next - start; start = next + 1; - if (fatal_signal_pending(current)) { - count = -ERESTARTSYS; - break; - } + if (ext4_trim_interrupted()) + return count; if (need_resched()) { ext4_unlock_group(sb, e4b->bd_group); @@ -6972,6 +6793,9 @@ __releases(ext4_group_lock_ptr(sb, e4b->bd_group)) break; } + if (set_trimmed) + EXT4_MB_GRP_SET_TRIMMED(e4b->bd_info); + return count; } @@ -6982,7 +6806,6 @@ __releases(ext4_group_lock_ptr(sb, e4b->bd_group)) * @start: first group block to examine * @max: last group block to examine * @minblocks: minimum extent block count - * @set_trimmed: set the trimmed flag if at least one block is trimmed * * ext4_trim_all_free walks through group's block bitmap searching for free * extents. When the free extent is found, mark it as used in group buddy @@ -6992,7 +6815,7 @@ __releases(ext4_group_lock_ptr(sb, e4b->bd_group)) static ext4_grpblk_t ext4_trim_all_free(struct super_block *sb, ext4_group_t group, ext4_grpblk_t start, ext4_grpblk_t max, - ext4_grpblk_t minblocks, bool set_trimmed) + ext4_grpblk_t minblocks) { struct ext4_buddy e4b; int ret; @@ -7009,13 +6832,10 @@ ext4_trim_all_free(struct super_block *sb, ext4_group_t group, ext4_lock_group(sb, group); if (!EXT4_MB_GRP_WAS_TRIMMED(e4b.bd_info) || - minblocks < EXT4_SB(sb)->s_last_trim_minblks) { + minblocks < EXT4_SB(sb)->s_last_trim_minblks) ret = ext4_try_to_trim_range(sb, &e4b, start, max, minblocks); - if (ret >= 0 && set_trimmed) - EXT4_MB_GRP_SET_TRIMMED(e4b.bd_info); - } else { + else ret = 0; - } ext4_unlock_group(sb, group); ext4_mb_unload_buddy(&e4b); @@ -7048,7 +6868,6 @@ int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range) ext4_fsblk_t first_data_blk = le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block); ext4_fsblk_t max_blks = ext4_blocks_count(EXT4_SB(sb)->s_es); - bool whole_group, eof = false; int ret = 0; start = range->start >> sb->s_blocksize_bits; @@ -7067,10 +6886,8 @@ int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range) if (minlen > EXT4_CLUSTERS_PER_GROUP(sb)) goto out; } - if (end >= max_blks - 1) { + if (end >= max_blks - 1) end = max_blks - 1; - eof = true; - } if (end <= first_data_blk) goto out; if (start < first_data_blk) @@ -7084,9 +6901,10 @@ int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range) /* end now represents the last cluster to discard in this group */ end = EXT4_CLUSTERS_PER_GROUP(sb) - 1; - whole_group = true; for (group = first_group; group <= last_group; group++) { + if (ext4_trim_interrupted()) + break; grp = ext4_get_group_info(sb, group); if (!grp) continue; @@ -7103,13 +6921,11 @@ int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range) * change it for the last group, note that last_cluster is * already computed earlier by ext4_get_group_no_and_offset() */ - if (group == last_group) { + if (group == last_group) end = last_cluster; - whole_group = eof ? true : end == EXT4_CLUSTERS_PER_GROUP(sb) - 1; - } if (grp->bb_free >= minlen) { cnt = ext4_trim_all_free(sb, group, first_cluster, - end, minlen, whole_group); + end, minlen); if (cnt < 0) { ret = cnt; break; @@ -7154,8 +6970,7 @@ ext4_mballoc_query_range( ext4_lock_group(sb, group); - start = (e4b.bd_info->bb_first_free > start) ? - e4b.bd_info->bb_first_free : start; + start = max(e4b.bd_info->bb_first_free, start); if (end >= EXT4_CLUSTERS_PER_GROUP(sb)) end = EXT4_CLUSTERS_PER_GROUP(sb) - 1; @@ -7180,3 +6995,7 @@ out_unload: return error; } + +#ifdef CONFIG_EXT4_KUNIT_TESTS +#include "mballoc-test.c" +#endif diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h index df6b5e7c2274..d7aeb5da7d86 100644 --- a/fs/ext4/mballoc.h +++ b/fs/ext4/mballoc.h @@ -233,6 +233,20 @@ static inline ext4_fsblk_t ext4_grp_offs_to_block(struct super_block *sb, (fex->fe_start << EXT4_SB(sb)->s_cluster_bits); } +static inline loff_t extent_logical_end(struct ext4_sb_info *sbi, + struct ext4_free_extent *fex) +{ + /* Use loff_t to avoid end exceeding ext4_lblk_t max. */ + return (loff_t)fex->fe_logical + EXT4_C2B(sbi, fex->fe_len); +} + +static inline loff_t pa_logical_end(struct ext4_sb_info *sbi, + struct ext4_prealloc_space *pa) +{ + /* Use loff_t to avoid end exceeding ext4_lblk_t max. */ + return (loff_t)pa->pa_lstart + EXT4_C2B(sbi, pa->pa_len); +} + typedef int (*ext4_mballoc_query_range_fn)( struct super_block *sb, ext4_group_t agno, diff --git a/fs/ext4/mmp.c b/fs/ext4/mmp.c index 0aaf38ffcb6e..bd946d0c71b7 100644 --- a/fs/ext4/mmp.c +++ b/fs/ext4/mmp.c @@ -162,7 +162,7 @@ static int kmmpd(void *data) memcpy(mmp->mmp_nodename, init_utsname()->nodename, sizeof(mmp->mmp_nodename)); - while (!kthread_should_stop() && !sb_rdonly(sb)) { + while (!kthread_should_stop() && !ext4_forced_shutdown(sb)) { if (!ext4_has_feature_mmp(sb)) { ext4_warning(sb, "kmmpd being stopped since MMP feature" " has been disabled."); diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c index b5af2fc03b2f..3aa57376d9c2 100644 --- a/fs/ext4/move_extent.c +++ b/fs/ext4/move_extent.c @@ -183,10 +183,8 @@ mext_page_mkuptodate(struct folio *folio, unsigned from, unsigned to) blocksize = i_blocksize(inode); head = folio_buffers(folio); - if (!head) { - create_empty_buffers(&folio->page, blocksize, 0); - head = folio_buffers(folio); - } + if (!head) + head = create_empty_buffers(folio, blocksize, 0); block = (sector_t)folio->index << (PAGE_SHIFT - inode->i_blkbits); for (bh = head, block_start = 0; bh != head || !block_start; @@ -340,10 +338,8 @@ again: ext4_double_up_write_data_sem(orig_inode, donor_inode); goto data_copy; } - if ((folio_has_private(folio[0]) && - !filemap_release_folio(folio[0], 0)) || - (folio_has_private(folio[1]) && - !filemap_release_folio(folio[1], 0))) { + if (!filemap_release_folio(folio[0], 0) || + !filemap_release_folio(folio[1], 0)) { *err = -EBUSY; goto drop_data_sem; } @@ -362,10 +358,8 @@ data_copy: /* At this point all buffers in range are uptodate, old mapping layout * is no longer required, try to drop it now. */ - if ((folio_has_private(folio[0]) && - !filemap_release_folio(folio[0], 0)) || - (folio_has_private(folio[1]) && - !filemap_release_folio(folio[1], 0))) { + if (!filemap_release_folio(folio[0], 0) || + !filemap_release_folio(folio[1], 0)) { *err = -EBUSY; goto unlock_folios; } @@ -384,22 +378,20 @@ data_copy: } /* Perform all necessary steps similar write_begin()/write_end() * but keeping in mind that i_size will not change */ - if (!folio_buffers(folio[0])) - create_empty_buffers(&folio[0]->page, 1 << orig_inode->i_blkbits, 0); bh = folio_buffers(folio[0]); + if (!bh) + bh = create_empty_buffers(folio[0], + 1 << orig_inode->i_blkbits, 0); for (i = 0; i < data_offset_in_page; i++) bh = bh->b_this_page; for (i = 0; i < block_len_in_page; i++) { *err = ext4_get_block(orig_inode, orig_blk_offset + i, bh, 0); if (*err < 0) - break; + goto repair_branches; bh = bh->b_this_page; } - if (!*err) - *err = block_commit_write(&folio[0]->page, from, from + replaced_size); - if (unlikely(*err < 0)) - goto repair_branches; + block_commit_write(&folio[0]->page, from, from + replaced_size); /* Even in case of data=writeback it is reasonable to pin * inode to transaction, to prevent unexpected data loss */ diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 0caf6c730ce3..d252935f9c8a 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -343,17 +343,17 @@ static struct ext4_dir_entry_tail *get_dirent_tail(struct inode *inode, struct buffer_head *bh) { struct ext4_dir_entry_tail *t; + int blocksize = EXT4_BLOCK_SIZE(inode->i_sb); #ifdef PARANOID struct ext4_dir_entry *d, *top; d = (struct ext4_dir_entry *)bh->b_data; top = (struct ext4_dir_entry *)(bh->b_data + - (EXT4_BLOCK_SIZE(inode->i_sb) - - sizeof(struct ext4_dir_entry_tail))); - while (d < top && d->rec_len) + (blocksize - sizeof(struct ext4_dir_entry_tail))); + while (d < top && ext4_rec_len_from_disk(d->rec_len, blocksize)) d = (struct ext4_dir_entry *)(((void *)d) + - le16_to_cpu(d->rec_len)); + ext4_rec_len_from_disk(d->rec_len, blocksize)); if (d != top) return NULL; @@ -364,7 +364,8 @@ static struct ext4_dir_entry_tail *get_dirent_tail(struct inode *inode, #endif if (t->det_reserved_zero1 || - le16_to_cpu(t->det_rec_len) != sizeof(struct ext4_dir_entry_tail) || + (ext4_rec_len_from_disk(t->det_rec_len, blocksize) != + sizeof(struct ext4_dir_entry_tail)) || t->det_reserved_zero2 || t->det_reserved_ft != EXT4_FT_DIR_CSUM) return NULL; @@ -445,13 +446,14 @@ static struct dx_countlimit *get_dx_countlimit(struct inode *inode, struct ext4_dir_entry *dp; struct dx_root_info *root; int count_offset; + int blocksize = EXT4_BLOCK_SIZE(inode->i_sb); + unsigned int rlen = ext4_rec_len_from_disk(dirent->rec_len, blocksize); - if (le16_to_cpu(dirent->rec_len) == EXT4_BLOCK_SIZE(inode->i_sb)) + if (rlen == blocksize) count_offset = 8; - else if (le16_to_cpu(dirent->rec_len) == 12) { + else if (rlen == 12) { dp = (struct ext4_dir_entry *)(((void *)dirent) + 12); - if (le16_to_cpu(dp->rec_len) != - EXT4_BLOCK_SIZE(inode->i_sb) - 12) + if (ext4_rec_len_from_disk(dp->rec_len, blocksize) != blocksize - 12) return NULL; root = (struct dx_root_info *)(((void *)dp + 12)); if (root->reserved_zero || @@ -1315,6 +1317,7 @@ static int dx_make_map(struct inode *dir, struct buffer_head *bh, unsigned int buflen = bh->b_size; char *base = bh->b_data; struct dx_hash_info h = *hinfo; + int blocksize = EXT4_BLOCK_SIZE(dir->i_sb); if (ext4_has_metadata_csum(dir->i_sb)) buflen -= sizeof(struct ext4_dir_entry_tail); @@ -1335,11 +1338,12 @@ static int dx_make_map(struct inode *dir, struct buffer_head *bh, map_tail--; map_tail->hash = h.hash; map_tail->offs = ((char *) de - base)>>2; - map_tail->size = le16_to_cpu(de->rec_len); + map_tail->size = ext4_rec_len_from_disk(de->rec_len, + blocksize); count++; cond_resched(); } - de = ext4_next_entry(de, dir->i_sb->s_blocksize); + de = ext4_next_entry(de, blocksize); } return count; } @@ -1445,7 +1449,7 @@ int ext4_fname_setup_ci_filename(struct inode *dir, const struct qstr *iname, struct dx_hash_info *hinfo = &name->hinfo; int len; - if (!IS_CASEFOLDED(dir) || !dir->i_sb->s_encoding || + if (!IS_CASEFOLDED(dir) || (IS_ENCRYPTED(dir) && !fscrypt_has_encryption_key(dir))) { cf_name->name = NULL; return 0; @@ -1496,7 +1500,7 @@ static bool ext4_match(struct inode *parent, #endif #if IS_ENABLED(CONFIG_UNICODE) - if (parent->i_sb->s_encoding && IS_CASEFOLDED(parent) && + if (IS_CASEFOLDED(parent) && (!IS_ENCRYPTED(parent) || fscrypt_has_encryption_key(parent))) { if (fname->cf_name.name) { struct qstr cf = {.name = fname->cf_name.name, @@ -2203,7 +2207,7 @@ static int add_dirent_to_buf(handle_t *handle, struct ext4_filename *fname, * happen is that the times are slightly out of date * and/or different from the directory change time. */ - dir->i_mtime = dir->i_ctime = current_time(dir); + inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir)); ext4_update_dx_flag(dir); inode_inc_iversion(dir); err2 = ext4_mark_inode_dirty(handle, dir); @@ -2276,8 +2280,7 @@ static int make_indexed_dir(handle_t *handle, struct ext4_filename *fname, top = data2 + len; while ((char *)(de2 = ext4_next_entry(de, blocksize)) < top) { if (ext4_check_dir_entry(dir, NULL, de, bh2, data2, len, - (data2 + (blocksize - csum_size) - - (char *) de))) { + (char *)de - data2)) { brelse(bh2); brelse(bh); return -EFSCORRUPTED; @@ -2393,7 +2396,7 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry, #if IS_ENABLED(CONFIG_UNICODE) if (sb_has_strict_encoding(sb) && IS_CASEFOLDED(dir) && - sb->s_encoding && utf8_validate(sb->s_encoding, &dentry->d_name)) + utf8_validate(sb->s_encoding, &dentry->d_name)) return -EINVAL; #endif @@ -2799,6 +2802,7 @@ static int ext4_add_nondir(handle_t *handle, return err; } drop_nlink(inode); + ext4_mark_inode_dirty(handle, inode); ext4_orphan_add(handle, inode); unlock_new_inode(inode); return err; @@ -3142,7 +3146,7 @@ static int ext4_rmdir(struct inode *dir, struct dentry *dentry) struct ext4_dir_entry_2 *de; handle_t *handle = NULL; - if (unlikely(ext4_forced_shutdown(EXT4_SB(dir->i_sb)))) + if (unlikely(ext4_forced_shutdown(dir->i_sb))) return -EIO; /* Initialize quotas before so that eventual writes go in @@ -3197,7 +3201,8 @@ static int ext4_rmdir(struct inode *dir, struct dentry *dentry) * recovery. */ inode->i_size = 0; ext4_orphan_add(handle, inode); - inode->i_ctime = dir->i_ctime = dir->i_mtime = current_time(inode); + inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir)); + inode_set_ctime_current(inode); retval = ext4_mark_inode_dirty(handle, inode); if (retval) goto end_rmdir; @@ -3271,7 +3276,7 @@ int __ext4_unlink(struct inode *dir, const struct qstr *d_name, retval = ext4_delete_entry(handle, dir, de, bh); if (retval) goto out_handle; - dir->i_ctime = dir->i_mtime = current_time(dir); + inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir)); ext4_update_dx_flag(dir); retval = ext4_mark_inode_dirty(handle, dir); if (retval) @@ -3286,7 +3291,7 @@ int __ext4_unlink(struct inode *dir, const struct qstr *d_name, drop_nlink(inode); if (!inode->i_nlink) ext4_orphan_add(handle, inode); - inode->i_ctime = current_time(inode); + inode_set_ctime_current(inode); retval = ext4_mark_inode_dirty(handle, inode); if (dentry && !retval) ext4_fc_track_unlink(handle, dentry); @@ -3301,7 +3306,7 @@ static int ext4_unlink(struct inode *dir, struct dentry *dentry) { int retval; - if (unlikely(ext4_forced_shutdown(EXT4_SB(dir->i_sb)))) + if (unlikely(ext4_forced_shutdown(dir->i_sb))) return -EIO; trace_ext4_unlink_enter(dir, dentry); @@ -3369,7 +3374,7 @@ static int ext4_symlink(struct mnt_idmap *idmap, struct inode *dir, struct fscrypt_str disk_link; int retries = 0; - if (unlikely(ext4_forced_shutdown(EXT4_SB(dir->i_sb)))) + if (unlikely(ext4_forced_shutdown(dir->i_sb))) return -EIO; err = fscrypt_prepare_symlink(dir, symname, len, dir->i_sb->s_blocksize, @@ -3436,6 +3441,7 @@ retry: err_drop_inode: clear_nlink(inode); + ext4_mark_inode_dirty(handle, inode); ext4_orphan_add(handle, inode); unlock_new_inode(inode); if (handle) @@ -3463,7 +3469,7 @@ retry: if (IS_DIRSYNC(dir)) ext4_handle_sync(handle); - inode->i_ctime = current_time(inode); + inode_set_ctime_current(inode); ext4_inc_count(inode); ihold(inode); @@ -3641,8 +3647,7 @@ static int ext4_setent(handle_t *handle, struct ext4_renament *ent, if (ext4_has_feature_filetype(ent->dir->i_sb)) ent->de->file_type = file_type; inode_inc_iversion(ent->dir); - ent->dir->i_ctime = ent->dir->i_mtime = - current_time(ent->dir); + inode_set_mtime_to_ts(ent->dir, inode_set_ctime_current(ent->dir)); retval = ext4_mark_inode_dirty(handle, ent->dir); BUFFER_TRACE(ent->bh, "call ext4_handle_dirty_metadata"); if (!ent->inlined) { @@ -3941,7 +3946,7 @@ static int ext4_rename(struct mnt_idmap *idmap, struct inode *old_dir, * Like most other Unix systems, set the ctime for inodes on a * rename. */ - old.inode->i_ctime = current_time(old.inode); + inode_set_ctime_current(old.inode); retval = ext4_mark_inode_dirty(handle, old.inode); if (unlikely(retval)) goto end_rename; @@ -3955,9 +3960,9 @@ static int ext4_rename(struct mnt_idmap *idmap, struct inode *old_dir, if (new.inode) { ext4_dec_count(new.inode); - new.inode->i_ctime = current_time(new.inode); + inode_set_ctime_current(new.inode); } - old.dir->i_ctime = old.dir->i_mtime = current_time(old.dir); + inode_set_mtime_to_ts(old.dir, inode_set_ctime_current(old.dir)); ext4_update_dx_flag(old.dir); if (old.dir_bh) { retval = ext4_rename_dir_finish(handle, &old, new.dir->i_ino); @@ -4021,6 +4026,7 @@ end_rename: ext4_resetent(handle, &old, old.inode->i_ino, old_file_type); drop_nlink(whiteout); + ext4_mark_inode_dirty(handle, whiteout); ext4_orphan_add(handle, whiteout); } unlock_new_inode(whiteout); @@ -4053,7 +4059,6 @@ static int ext4_cross_rename(struct inode *old_dir, struct dentry *old_dentry, }; u8 new_file_type; int retval; - struct timespec64 ctime; if ((ext4_test_inode_flag(new_dir, EXT4_INODE_PROJINHERIT) && !projid_eq(EXT4_I(new_dir)->i_projid, @@ -4147,9 +4152,8 @@ static int ext4_cross_rename(struct inode *old_dir, struct dentry *old_dentry, * Like most other Unix systems, set the ctime for inodes on a * rename. */ - ctime = current_time(old.inode); - old.inode->i_ctime = ctime; - new.inode->i_ctime = ctime; + inode_set_ctime_current(old.inode); + inode_set_ctime_current(new.inode); retval = ext4_mark_inode_dirty(handle, old.inode); if (unlikely(retval)) goto end_rename; @@ -4189,7 +4193,7 @@ static int ext4_rename2(struct mnt_idmap *idmap, { int err; - if (unlikely(ext4_forced_shutdown(EXT4_SB(old_dir->i_sb)))) + if (unlikely(ext4_forced_shutdown(old_dir->i_sb))) return -EIO; if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT)) diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index 3621f29ec671..dfdd7e5cf038 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -184,7 +184,7 @@ static int ext4_end_io_end(ext4_io_end_t *io_end) io_end->handle = NULL; /* Following call will use up the handle */ ret = ext4_convert_unwritten_io_end_vec(handle, io_end); - if (ret < 0 && !ext4_forced_shutdown(EXT4_SB(inode->i_sb))) { + if (ret < 0 && !ext4_forced_shutdown(inode->i_sb)) { ext4_msg(inode->i_sb, KERN_EMERG, "failed to convert unwritten extents to written " "extents -- potential data loss! " diff --git a/fs/ext4/readpage.c b/fs/ext4/readpage.c index 3e7d160f543f..21e8f0aebb3c 100644 --- a/fs/ext4/readpage.c +++ b/fs/ext4/readpage.c @@ -70,15 +70,8 @@ static void __read_end_io(struct bio *bio) { struct folio_iter fi; - bio_for_each_folio_all(fi, bio) { - struct folio *folio = fi.folio; - - if (bio->bi_status) - folio_clear_uptodate(folio); - else - folio_mark_uptodate(folio); - folio_unlock(folio); - } + bio_for_each_folio_all(fi, bio) + folio_end_read(fi.folio, bio->bi_status == 0); if (bio->bi_private) mempool_free(bio->bi_private, bio_post_read_ctx_pool); bio_put(bio); @@ -336,8 +329,7 @@ int ext4_mpage_readpages(struct inode *inode, if (ext4_need_verity(inode, folio->index) && !fsverity_verify_folio(folio)) goto set_error_page; - folio_mark_uptodate(folio); - folio_unlock(folio); + folio_end_read(folio, true); continue; } } else if (fully_mapped) { diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index 0361c20910de..4fe061edefdd 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -10,8 +10,6 @@ */ -#define EXT4FS_DEBUG - #include <linux/errno.h> #include <linux/slab.h> #include <linux/jiffies.h> @@ -57,7 +55,7 @@ int ext4_resize_begin(struct super_block *sb) * If the reserved GDT blocks is non-zero, the resize_inode feature * should always be set. */ - if (EXT4_SB(sb)->s_es->s_reserved_gdt_blocks && + if (sbi->s_es->s_reserved_gdt_blocks && !ext4_has_feature_resize_inode(sb)) { ext4_error(sb, "resize_inode disabled but reserved GDT blocks non-zero"); return -EFSCORRUPTED; @@ -69,9 +67,9 @@ int ext4_resize_begin(struct super_block *sb) * bad time to do it anyways. */ if (EXT4_B2C(sbi, sbi->s_sbh->b_blocknr) != - le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block)) { + le32_to_cpu(sbi->s_es->s_first_data_block)) { ext4_warning(sb, "won't resize using backup superblock at %llu", - (unsigned long long)EXT4_SB(sb)->s_sbh->b_blocknr); + (unsigned long long)sbi->s_sbh->b_blocknr); return -EPERM; } @@ -79,7 +77,7 @@ int ext4_resize_begin(struct super_block *sb) * We are not allowed to do online-resizing on a filesystem mounted * with error, because it can destroy the filesystem easily. */ - if (EXT4_SB(sb)->s_mount_state & EXT4_ERROR_FS) { + if (sbi->s_mount_state & EXT4_ERROR_FS) { ext4_warning(sb, "There are errors in the filesystem, " "so online resizing is not allowed"); return -EPERM; @@ -91,7 +89,7 @@ int ext4_resize_begin(struct super_block *sb) } if (test_and_set_bit_lock(EXT4_FLAGS_RESIZING, - &EXT4_SB(sb)->s_ext4_flags)) + &sbi->s_ext4_flags)) ret = -EBUSY; return ret; @@ -106,18 +104,6 @@ int ext4_resize_end(struct super_block *sb, bool update_backups) return 0; } -static ext4_group_t ext4_meta_bg_first_group(struct super_block *sb, - ext4_group_t group) { - return (group >> EXT4_DESC_PER_BLOCK_BITS(sb)) << - EXT4_DESC_PER_BLOCK_BITS(sb); -} - -static ext4_fsblk_t ext4_meta_bg_first_block_no(struct super_block *sb, - ext4_group_t group) { - group = ext4_meta_bg_first_group(sb, group); - return ext4_group_first_block_no(sb, group); -} - static ext4_grpblk_t ext4_group_overhead_blocks(struct super_block *sb, ext4_group_t group) { ext4_grpblk_t overhead; @@ -154,8 +140,9 @@ static int verify_group_input(struct super_block *sb, overhead = ext4_group_overhead_blocks(sb, group); metaend = start + overhead; - input->free_clusters_count = free_blocks_count = - input->blocks_count - 2 - overhead - sbi->s_itb_per_group; + free_blocks_count = input->blocks_count - 2 - overhead - + sbi->s_itb_per_group; + input->free_clusters_count = EXT4_B2C(sbi, free_blocks_count); if (test_opt(sb, DEBUG)) printk(KERN_DEBUG "EXT4-fs: adding %s group %u: %u blocks " @@ -460,8 +447,7 @@ static int set_flexbg_block_bitmap(struct super_block *sb, handle_t *handle, ext4_debug("mark clusters [%llu-%llu] used\n", first_cluster, last_cluster); - for (count2 = count; count > 0; - count -= count2, first_cluster += count2) { + for (; count > 0; count -= count2, first_cluster += count2) { ext4_fsblk_t start; struct buffer_head *bh; ext4_group_t group; @@ -560,13 +546,8 @@ static int setup_new_flex_group_blocks(struct super_block *sb, if (meta_bg == 0 && !ext4_bg_has_super(sb, group)) goto handle_itb; - if (meta_bg == 1) { - ext4_group_t first_group; - first_group = ext4_meta_bg_first_group(sb, group); - if (first_group != group + 1 && - first_group != group + EXT4_DESC_PER_BLOCK(sb) - 1) - goto handle_itb; - } + if (meta_bg == 1) + goto handle_itb; block = start + ext4_bg_has_super(sb, group); /* Copy all of the GDT blocks into the backup in this group */ @@ -614,7 +595,7 @@ static int setup_new_flex_group_blocks(struct super_block *sb, } handle_itb: - /* Initialize group tables of the grop @group */ + /* Initialize group tables of the group @group */ if (!(bg_flags[i] & EXT4_BG_INODE_ZEROED)) goto handle_bb; @@ -704,16 +685,14 @@ handle_ib: block = start; } - if (count) { - err = set_flexbg_block_bitmap(sb, handle, - flex_gd, - EXT4_B2C(sbi, start), - EXT4_B2C(sbi, - start + count - - 1)); - if (err) - goto out; - } + err = set_flexbg_block_bitmap(sb, handle, + flex_gd, + EXT4_B2C(sbi, start), + EXT4_B2C(sbi, + start + count + - 1)); + if (err) + goto out; } out: @@ -952,7 +931,13 @@ errout: } /* - * add_new_gdb_meta_bg is the sister of add_new_gdb. + * If there is no available space in the existing block group descriptors for + * the new block group and there are no reserved block group descriptors, then + * the meta_bg feature will get enabled, and es->s_first_meta_bg will get set + * to the first block group that is managed using meta_bg and s_first_meta_bg + * must be a multiple of EXT4_DESC_PER_BLOCK(sb). + * This function will be called when first group of meta_bg is added to bring + * new group descriptors block of new added meta_bg. */ static int add_new_gdb_meta_bg(struct super_block *sb, handle_t *handle, ext4_group_t group) { @@ -962,8 +947,8 @@ static int add_new_gdb_meta_bg(struct super_block *sb, unsigned long gdb_num = group / EXT4_DESC_PER_BLOCK(sb); int err; - gdblock = ext4_meta_bg_first_block_no(sb, group) + - ext4_bg_has_super(sb, group); + gdblock = ext4_group_first_block_no(sb, group) + + ext4_bg_has_super(sb, group); gdb_bh = ext4_sb_bread(sb, gdblock, 0); if (IS_ERR(gdb_bh)) return PTR_ERR(gdb_bh); @@ -1087,9 +1072,6 @@ static int reserve_backup_gdb(handle_t *handle, struct inode *inode, for (i = 0; i < reserved_gdb; i++) { int err2; data = (__le32 *)primary[i]->b_data; - /* printk("reserving backup %lu[%u] = %lu\n", - primary[i]->b_blocknr, gdbackups, - blk + primary[i]->b_blocknr); */ data[gdbackups] = cpu_to_le32(blk + primary[i]->b_blocknr); err2 = ext4_handle_dirty_metadata(handle, NULL, primary[i]); if (!err) @@ -1191,8 +1173,10 @@ static void update_backups(struct super_block *sb, sector_t blk_off, char *data, ext4_group_first_block_no(sb, group)); BUFFER_TRACE(bh, "get_write_access"); if ((err = ext4_journal_get_write_access(handle, sb, bh, - EXT4_JTR_NONE))) + EXT4_JTR_NONE))) { + brelse(bh); break; + } lock_buffer(bh); memcpy(bh->b_data, data, size); if (rest) @@ -1601,7 +1585,8 @@ exit_journal: int gdb_num_end = ((group + flex_gd->count - 1) / EXT4_DESC_PER_BLOCK(sb)); int meta_bg = ext4_has_feature_meta_bg(sb); - sector_t old_gdb = 0; + sector_t padding_blocks = meta_bg ? 0 : sbi->s_sbh->b_blocknr - + ext4_group_first_block_no(sb, 0); update_backups(sb, ext4_group_first_block_no(sb, 0), (char *)es, sizeof(struct ext4_super_block), 0); @@ -1610,11 +1595,8 @@ exit_journal: gdb_bh = sbi_array_rcu_deref(sbi, s_group_desc, gdb_num); - if (old_gdb == gdb_bh->b_blocknr) - continue; - update_backups(sb, gdb_bh->b_blocknr, gdb_bh->b_data, - gdb_bh->b_size, meta_bg); - old_gdb = gdb_bh->b_blocknr; + update_backups(sb, gdb_bh->b_blocknr - padding_blocks, + gdb_bh->b_data, gdb_bh->b_size, meta_bg); } } exit: @@ -1980,9 +1962,7 @@ static int ext4_convert_meta_bg(struct super_block *sb, struct inode *inode) errout: ret = ext4_journal_stop(handle); - if (!err) - err = ret; - return ret; + return err ? err : ret; invalid_resize_inode: ext4_error(sb, "corrupted/inconsistent resize inode"); diff --git a/fs/ext4/super.c b/fs/ext4/super.c index c94ebf704616..c5fcf377ab1f 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -93,6 +93,7 @@ static int ext4_get_tree(struct fs_context *fc); static int ext4_reconfigure(struct fs_context *fc); static void ext4_fc_free(struct fs_context *fc); static int ext4_init_fs_context(struct fs_context *fc); +static void ext4_kill_sb(struct super_block *sb); static const struct fs_parameter_spec ext4_param_specs[]; /* @@ -135,12 +136,12 @@ static struct file_system_type ext2_fs_type = { .name = "ext2", .init_fs_context = ext4_init_fs_context, .parameters = ext4_param_specs, - .kill_sb = kill_block_super, + .kill_sb = ext4_kill_sb, .fs_flags = FS_REQUIRES_DEV, }; MODULE_ALIAS_FS("ext2"); MODULE_ALIAS("ext2"); -#define IS_EXT2_SB(sb) ((sb)->s_bdev->bd_holder == &ext2_fs_type) +#define IS_EXT2_SB(sb) ((sb)->s_type == &ext2_fs_type) #else #define IS_EXT2_SB(sb) (0) #endif @@ -151,12 +152,12 @@ static struct file_system_type ext3_fs_type = { .name = "ext3", .init_fs_context = ext4_init_fs_context, .parameters = ext4_param_specs, - .kill_sb = kill_block_super, + .kill_sb = ext4_kill_sb, .fs_flags = FS_REQUIRES_DEV, }; MODULE_ALIAS_FS("ext3"); MODULE_ALIAS("ext3"); -#define IS_EXT3_SB(sb) ((sb)->s_bdev->bd_holder == &ext3_fs_type) +#define IS_EXT3_SB(sb) ((sb)->s_type == &ext3_fs_type) static inline void __ext4_read_bh(struct buffer_head *bh, blk_opf_t op_flags, @@ -243,18 +244,25 @@ static struct buffer_head *__ext4_sb_bread_gfp(struct super_block *sb, struct buffer_head *ext4_sb_bread(struct super_block *sb, sector_t block, blk_opf_t op_flags) { - return __ext4_sb_bread_gfp(sb, block, op_flags, __GFP_MOVABLE); + gfp_t gfp = mapping_gfp_constraint(sb->s_bdev->bd_inode->i_mapping, + ~__GFP_FS) | __GFP_MOVABLE; + + return __ext4_sb_bread_gfp(sb, block, op_flags, gfp); } struct buffer_head *ext4_sb_bread_unmovable(struct super_block *sb, sector_t block) { - return __ext4_sb_bread_gfp(sb, block, 0, 0); + gfp_t gfp = mapping_gfp_constraint(sb->s_bdev->bd_inode->i_mapping, + ~__GFP_FS); + + return __ext4_sb_bread_gfp(sb, block, 0, gfp); } void ext4_sb_breadahead_unmovable(struct super_block *sb, sector_t block) { - struct buffer_head *bh = sb_getblk_gfp(sb, block, 0); + struct buffer_head *bh = bdev_getblk(sb->s_bdev, block, + sb->s_blocksize, GFP_NOWAIT | __GFP_NOWARN); if (likely(bh)) { if (trylock_buffer(bh)) @@ -433,6 +441,57 @@ static time64_t __ext4_get_tstamp(__le32 *lo, __u8 *hi) #define ext4_get_tstamp(es, tstamp) \ __ext4_get_tstamp(&(es)->tstamp, &(es)->tstamp ## _hi) +#define EXT4_SB_REFRESH_INTERVAL_SEC (3600) /* seconds (1 hour) */ +#define EXT4_SB_REFRESH_INTERVAL_KB (16384) /* kilobytes (16MB) */ + +/* + * The ext4_maybe_update_superblock() function checks and updates the + * superblock if needed. + * + * This function is designed to update the on-disk superblock only under + * certain conditions to prevent excessive disk writes and unnecessary + * waking of the disk from sleep. The superblock will be updated if: + * 1. More than an hour has passed since the last superblock update, and + * 2. More than 16MB have been written since the last superblock update. + * + * @sb: The superblock + */ +static void ext4_maybe_update_superblock(struct super_block *sb) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_super_block *es = sbi->s_es; + journal_t *journal = sbi->s_journal; + time64_t now; + __u64 last_update; + __u64 lifetime_write_kbytes; + __u64 diff_size; + + if (sb_rdonly(sb) || !(sb->s_flags & SB_ACTIVE) || + !journal || (journal->j_flags & JBD2_UNMOUNT)) + return; + + now = ktime_get_real_seconds(); + last_update = ext4_get_tstamp(es, s_wtime); + + if (likely(now - last_update < EXT4_SB_REFRESH_INTERVAL_SEC)) + return; + + lifetime_write_kbytes = sbi->s_kbytes_written + + ((part_stat_read(sb->s_bdev, sectors[STAT_WRITE]) - + sbi->s_sectors_written_start) >> 1); + + /* Get the number of kilobytes not written to disk to account + * for statistics and compare with a multiple of 16 MB. This + * is used to determine when the next superblock commit should + * occur (i.e. not more often than once per 16MB if there was + * less written in an hour). + */ + diff_size = lifetime_write_kbytes - le64_to_cpu(es->s_kbytes_written); + + if (diff_size > EXT4_SB_REFRESH_INTERVAL_KB) + schedule_work(&EXT4_SB(sb)->s_sb_upd_work); +} + /* * The del_gendisk() function uninitializes the disk-specific data * structures, including the bdi structure, without telling anyone @@ -459,6 +518,7 @@ static void ext4_journal_commit_callback(journal_t *journal, transaction_t *txn) BUG_ON(txn->t_state == T_FINISHED); ext4_process_freed_data(sb, txn->t_tid); + ext4_maybe_update_superblock(sb); spin_lock(&sbi->s_md_lock); while (!list_empty(&txn->t_private_list)) { @@ -657,7 +717,7 @@ static void ext4_handle_error(struct super_block *sb, bool force_ro, int error, WARN_ON_ONCE(1); if (!continue_fs && !sb_rdonly(sb)) { - ext4_set_mount_flag(sb, EXT4_MF_FS_ABORTED); + set_bit(EXT4_FLAGS_SHUTDOWN, &EXT4_SB(sb)->s_ext4_flags); if (journal) jbd2_journal_abort(journal, -EIO); } @@ -671,7 +731,7 @@ static void ext4_handle_error(struct super_block *sb, bool force_ro, int error, * defer superblock flushing to a workqueue. */ if (continue_fs && journal) - schedule_work(&EXT4_SB(sb)->s_error_work); + schedule_work(&EXT4_SB(sb)->s_sb_upd_work); else ext4_commit_super(sb); } @@ -698,10 +758,10 @@ static void ext4_handle_error(struct super_block *sb, bool force_ro, int error, sb->s_flags |= SB_RDONLY; } -static void flush_stashed_error_work(struct work_struct *work) +static void update_super_work(struct work_struct *work) { struct ext4_sb_info *sbi = container_of(work, struct ext4_sb_info, - s_error_work); + s_sb_upd_work); journal_t *journal = sbi->s_journal; handle_t *handle; @@ -715,6 +775,8 @@ static void flush_stashed_error_work(struct work_struct *work) */ if (!sb_rdonly(sbi->s_sb) && journal) { struct buffer_head *sbh = sbi->s_sbh; + bool call_notify_err = false; + handle = jbd2_journal_start(journal, 1); if (IS_ERR(handle)) goto write_directly; @@ -722,6 +784,10 @@ static void flush_stashed_error_work(struct work_struct *work) jbd2_journal_stop(handle); goto write_directly; } + + if (sbi->s_add_error_count > 0) + call_notify_err = true; + ext4_update_super(sbi->s_sb); if (buffer_write_io_error(sbh) || !buffer_uptodate(sbh)) { ext4_msg(sbi->s_sb, KERN_ERR, "previous I/O error to " @@ -735,7 +801,10 @@ static void flush_stashed_error_work(struct work_struct *work) goto write_directly; } jbd2_journal_stop(handle); - ext4_notify_error_sysfs(sbi); + + if (call_notify_err) + ext4_notify_error_sysfs(sbi); + return; } write_directly: @@ -758,7 +827,7 @@ void __ext4_error(struct super_block *sb, const char *function, struct va_format vaf; va_list args; - if (unlikely(ext4_forced_shutdown(EXT4_SB(sb)))) + if (unlikely(ext4_forced_shutdown(sb))) return; trace_ext4_error(sb, function, line); @@ -783,7 +852,7 @@ void __ext4_error_inode(struct inode *inode, const char *function, va_list args; struct va_format vaf; - if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb)))) + if (unlikely(ext4_forced_shutdown(inode->i_sb))) return; trace_ext4_error(inode->i_sb, function, line); @@ -818,7 +887,7 @@ void __ext4_error_file(struct file *file, const char *function, struct inode *inode = file_inode(file); char pathname[80], *path; - if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb)))) + if (unlikely(ext4_forced_shutdown(inode->i_sb))) return; trace_ext4_error(inode->i_sb, function, line); @@ -898,7 +967,7 @@ void __ext4_std_error(struct super_block *sb, const char *function, char nbuf[16]; const char *errstr; - if (unlikely(ext4_forced_shutdown(EXT4_SB(sb)))) + if (unlikely(ext4_forced_shutdown(sb))) return; /* Special case: if the error is EROFS, and we're not already @@ -992,7 +1061,7 @@ __acquires(bitlock) struct va_format vaf; va_list args; - if (unlikely(ext4_forced_shutdown(EXT4_SB(sb)))) + if (unlikely(ext4_forced_shutdown(sb))) return; trace_ext4_error(sb, function, line); @@ -1018,7 +1087,7 @@ __acquires(bitlock) if (!bdev_read_only(sb->s_bdev)) { save_error_info(sb, EFSCORRUPTED, ino, block, function, line); - schedule_work(&EXT4_SB(sb)->s_error_work); + schedule_work(&EXT4_SB(sb)->s_sb_upd_work); } return; } @@ -1096,54 +1165,6 @@ void ext4_update_dynamic_rev(struct super_block *sb) */ } -static void ext4_bdev_mark_dead(struct block_device *bdev) -{ - ext4_force_shutdown(bdev->bd_holder, EXT4_GOING_FLAGS_NOLOGFLUSH); -} - -static const struct blk_holder_ops ext4_holder_ops = { - .mark_dead = ext4_bdev_mark_dead, -}; - -/* - * Open the external journal device - */ -static struct block_device *ext4_blkdev_get(dev_t dev, struct super_block *sb) -{ - struct block_device *bdev; - - bdev = blkdev_get_by_dev(dev, BLK_OPEN_READ | BLK_OPEN_WRITE, sb, - &ext4_holder_ops); - if (IS_ERR(bdev)) - goto fail; - return bdev; - -fail: - ext4_msg(sb, KERN_ERR, - "failed to open journal device unknown-block(%u,%u) %ld", - MAJOR(dev), MINOR(dev), PTR_ERR(bdev)); - return NULL; -} - -/* - * Release the journal device - */ -static void ext4_blkdev_remove(struct ext4_sb_info *sbi) -{ - struct block_device *bdev; - bdev = sbi->s_journal_bdev; - if (bdev) { - /* - * Invalidate the journal device's buffers. We don't want them - * floating about in memory - the physical journal device may - * hotswapped, and it breaks the `ro-after' testing code. - */ - invalidate_bdev(bdev); - blkdev_put(bdev, sbi->s_sb); - sbi->s_journal_bdev = NULL; - } -} - static inline struct inode *orphan_list_entry(struct list_head *l) { return &list_entry(l, struct ext4_inode_info, i_orphan)->vfs_inode; @@ -1278,10 +1299,10 @@ static void ext4_put_super(struct super_block *sb) * Unregister sysfs before destroying jbd2 journal. * Since we could still access attr_journal_task attribute via sysfs * path which could have sbi->s_journal->j_task as NULL - * Unregister sysfs before flush sbi->s_error_work. + * Unregister sysfs before flush sbi->s_sb_upd_work. * Since user may read /proc/fs/ext4/xx/mb_groups during umount, If * read metadata verify failed then will queue error work. - * flush_stashed_error_work will call start_this_handle may trigger + * update_super_work will call start_this_handle may trigger * BUG_ON. */ ext4_unregister_sysfs(sb); @@ -1293,7 +1314,7 @@ static void ext4_put_super(struct super_block *sb) ext4_unregister_li_request(sb); ext4_quotas_off(sb, EXT4_MAXQUOTAS); - flush_work(&sbi->s_error_work); + flush_work(&sbi->s_sb_upd_work); destroy_workqueue(sbi->rsv_conversion_wq); ext4_release_orphan_info(sb); @@ -1338,9 +1359,14 @@ static void ext4_put_super(struct super_block *sb) sync_blockdev(sb->s_bdev); invalidate_bdev(sb->s_bdev); - if (sbi->s_journal_bdev) { - sync_blockdev(sbi->s_journal_bdev); - ext4_blkdev_remove(sbi); + if (sbi->s_journal_bdev_handle) { + /* + * Invalidate the journal device's buffers. We don't want them + * floating about in memory - the physical journal device may + * hotswapped, and it breaks the `ro-after' testing code. + */ + sync_blockdev(sbi->s_journal_bdev_handle->bdev); + invalidate_bdev(sbi->s_journal_bdev_handle->bdev); } ext4_xattr_destroy_cache(sbi->s_ea_inode_cache); @@ -1628,6 +1654,7 @@ static const struct super_operations ext4_sops = { }; static const struct export_operations ext4_export_ops = { + .encode_fh = generic_encode_ino32_fh, .fh_to_dentry = ext4_fh_to_dentry, .fh_to_parent = ext4_fh_to_parent, .get_parent = ext4_get_parent, @@ -1897,6 +1924,7 @@ static const struct mount_opts { {Opt_fc_debug_force, EXT4_MOUNT2_JOURNAL_FAST_COMMIT, MOPT_SET | MOPT_2 | MOPT_EXT4_ONLY}, #endif + {Opt_abort, EXT4_MOUNT2_ABORT, MOPT_SET | MOPT_2}, {Opt_err, 0, 0} }; @@ -1965,8 +1993,6 @@ struct ext4_fs_context { unsigned int mask_s_mount_opt; unsigned int vals_s_mount_opt2; unsigned int mask_s_mount_opt2; - unsigned long vals_s_mount_flags; - unsigned long mask_s_mount_flags; unsigned int opt_flags; /* MOPT flags */ unsigned int spec; u32 s_max_batch_time; @@ -2117,12 +2143,6 @@ EXT4_SET_CTX(mount_opt2); EXT4_CLEAR_CTX(mount_opt2); EXT4_TEST_CTX(mount_opt2); -static inline void ctx_set_mount_flag(struct ext4_fs_context *ctx, int bit) -{ - set_bit(bit, &ctx->mask_s_mount_flags); - set_bit(bit, &ctx->vals_s_mount_flags); -} - static int ext4_parse_param(struct fs_context *fc, struct fs_parameter *param) { struct ext4_fs_context *ctx = fc->fs_private; @@ -2186,9 +2206,6 @@ static int ext4_parse_param(struct fs_context *fc, struct fs_parameter *param) ext4_msg(NULL, KERN_WARNING, "Ignoring removed %s option", param->key); return 0; - case Opt_abort: - ctx_set_mount_flag(ctx, EXT4_MF_FS_ABORTED); - return 0; case Opt_inlinecrypt: #ifdef CONFIG_FS_ENCRYPTION_INLINE_CRYPT ctx_set_flags(ctx, SB_INLINECRYPT); @@ -2842,8 +2859,6 @@ static void ext4_apply_options(struct fs_context *fc, struct super_block *sb) sbi->s_mount_opt |= ctx->vals_s_mount_opt; sbi->s_mount_opt2 &= ~ctx->mask_s_mount_opt2; sbi->s_mount_opt2 |= ctx->vals_s_mount_opt2; - sbi->s_mount_flags &= ~ctx->mask_s_mount_flags; - sbi->s_mount_flags |= ctx->vals_s_mount_flags; sb->s_flags &= ~ctx->mask_s_flags; sb->s_flags |= ctx->vals_s_flags; @@ -4227,12 +4242,12 @@ int ext4_calculate_overhead(struct super_block *sb) * Add the internal journal blocks whether the journal has been * loaded or not */ - if (sbi->s_journal && !sbi->s_journal_bdev) + if (sbi->s_journal && !sbi->s_journal_bdev_handle) overhead += EXT4_NUM_B2C(sbi, sbi->s_journal->j_total_len); else if (ext4_has_feature_journal(sb) && !sbi->s_journal && j_inum) { /* j_inum for internal journal is non-zero */ j_inode = ext4_get_journal_inode(sb, j_inum); - if (j_inode) { + if (!IS_ERR(j_inode)) { j_blocks = j_inode->i_size >> sb->s_blocksize_bits; overhead += EXT4_NUM_B2C(sbi, j_blocks); iput(j_inode); @@ -4970,8 +4985,8 @@ static int ext4_load_and_init_journal(struct super_block *sb, return 0; out: - /* flush s_error_work before journal destroy. */ - flush_work(&sbi->s_error_work); + /* flush s_sb_upd_work before destroying the journal. */ + flush_work(&sbi->s_sb_upd_work); jbd2_journal_destroy(sbi->s_journal); sbi->s_journal = NULL; return -EINVAL; @@ -5294,7 +5309,7 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb) timer_setup(&sbi->s_err_report, print_daily_error_info, 0); spin_lock_init(&sbi->s_error_lock); - INIT_WORK(&sbi->s_error_work, flush_stashed_error_work); + INIT_WORK(&sbi->s_sb_upd_work, update_super_work); err = ext4_group_desc_init(sb, es, logical_sb_block, &first_not_zeroed); if (err) @@ -5572,7 +5587,6 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb) spin_lock_init(&sbi->s_bdev_wb_lock); errseq_check_and_advance(&sb->s_bdev->bd_inode->i_mapping->wb_err, &sbi->s_bdev_wb_err); - sb->s_bdev->bd_super = sb; EXT4_SB(sb)->s_mount_state |= EXT4_ORPHAN_FS; ext4_orphan_cleanup(sb, es); EXT4_SB(sb)->s_mount_state &= ~EXT4_ORPHAN_FS; @@ -5638,16 +5652,16 @@ failed_mount_wq: sbi->s_ea_block_cache = NULL; if (sbi->s_journal) { - /* flush s_error_work before journal destroy. */ - flush_work(&sbi->s_error_work); + /* flush s_sb_upd_work before journal destroy. */ + flush_work(&sbi->s_sb_upd_work); jbd2_journal_destroy(sbi->s_journal); sbi->s_journal = NULL; } failed_mount3a: ext4_es_unregister_shrinker(sbi); failed_mount3: - /* flush s_error_work before sbi destroy */ - flush_work(&sbi->s_error_work); + /* flush s_sb_upd_work before sbi destroy */ + flush_work(&sbi->s_sb_upd_work); del_timer_sync(&sbi->s_err_report); ext4_stop_mmpd(sbi); ext4_group_desc_free(sbi); @@ -5664,9 +5678,11 @@ failed_mount: kfree(get_qf_name(sb, sbi, i)); #endif fscrypt_free_dummy_policy(&sbi->s_dummy_enc_policy); - /* ext4_blkdev_remove() calls kill_bdev(), release bh before it. */ brelse(sbi->s_sbh); - ext4_blkdev_remove(sbi); + if (sbi->s_journal_bdev_handle) { + invalidate_bdev(sbi->s_journal_bdev_handle->bdev); + bdev_release(sbi->s_journal_bdev_handle); + } out_fail: invalidate_bdev(sb->s_bdev); sb->s_fs_info = NULL; @@ -5772,22 +5788,22 @@ static struct inode *ext4_get_journal_inode(struct super_block *sb, journal_inode = ext4_iget(sb, journal_inum, EXT4_IGET_SPECIAL); if (IS_ERR(journal_inode)) { ext4_msg(sb, KERN_ERR, "no journal found"); - return NULL; + return ERR_CAST(journal_inode); } if (!journal_inode->i_nlink) { make_bad_inode(journal_inode); iput(journal_inode); ext4_msg(sb, KERN_ERR, "journal inode is deleted"); - return NULL; + return ERR_PTR(-EFSCORRUPTED); } - - ext4_debug("Journal inode found at %p: %lld bytes\n", - journal_inode, journal_inode->i_size); if (!S_ISREG(journal_inode->i_mode) || IS_ENCRYPTED(journal_inode)) { ext4_msg(sb, KERN_ERR, "invalid journal inode"); iput(journal_inode); - return NULL; + return ERR_PTR(-EFSCORRUPTED); } + + ext4_debug("Journal inode found at %p: %lld bytes\n", + journal_inode, journal_inode->i_size); return journal_inode; } @@ -5813,24 +5829,21 @@ static int ext4_journal_bmap(journal_t *journal, sector_t *block) return 0; } -static journal_t *ext4_get_journal(struct super_block *sb, - unsigned int journal_inum) +static journal_t *ext4_open_inode_journal(struct super_block *sb, + unsigned int journal_inum) { struct inode *journal_inode; journal_t *journal; - if (WARN_ON_ONCE(!ext4_has_feature_journal(sb))) - return NULL; - journal_inode = ext4_get_journal_inode(sb, journal_inum); - if (!journal_inode) - return NULL; + if (IS_ERR(journal_inode)) + return ERR_CAST(journal_inode); journal = jbd2_journal_init_inode(journal_inode); - if (!journal) { + if (IS_ERR(journal)) { ext4_msg(sb, KERN_ERR, "Could not load journal inode"); iput(journal_inode); - return NULL; + return ERR_CAST(journal); } journal->j_private = sb; journal->j_bmap = ext4_journal_bmap; @@ -5838,40 +5851,49 @@ static journal_t *ext4_get_journal(struct super_block *sb, return journal; } -static journal_t *ext4_get_dev_journal(struct super_block *sb, - dev_t j_dev) +static struct bdev_handle *ext4_get_journal_blkdev(struct super_block *sb, + dev_t j_dev, ext4_fsblk_t *j_start, + ext4_fsblk_t *j_len) { struct buffer_head *bh; - journal_t *journal; - ext4_fsblk_t start; - ext4_fsblk_t len; + struct block_device *bdev; + struct bdev_handle *bdev_handle; int hblock, blocksize; ext4_fsblk_t sb_block; unsigned long offset; struct ext4_super_block *es; - struct block_device *bdev; - - if (WARN_ON_ONCE(!ext4_has_feature_journal(sb))) - return NULL; + int errno; - bdev = ext4_blkdev_get(j_dev, sb); - if (bdev == NULL) - return NULL; + /* see get_tree_bdev why this is needed and safe */ + up_write(&sb->s_umount); + bdev_handle = bdev_open_by_dev(j_dev, BLK_OPEN_READ | BLK_OPEN_WRITE, + sb, &fs_holder_ops); + down_write(&sb->s_umount); + if (IS_ERR(bdev_handle)) { + ext4_msg(sb, KERN_ERR, + "failed to open journal device unknown-block(%u,%u) %ld", + MAJOR(j_dev), MINOR(j_dev), PTR_ERR(bdev_handle)); + return bdev_handle; + } + bdev = bdev_handle->bdev; blocksize = sb->s_blocksize; hblock = bdev_logical_block_size(bdev); if (blocksize < hblock) { ext4_msg(sb, KERN_ERR, "blocksize too small for journal device"); + errno = -EINVAL; goto out_bdev; } sb_block = EXT4_MIN_BLOCK_SIZE / blocksize; offset = EXT4_MIN_BLOCK_SIZE % blocksize; set_blocksize(bdev, blocksize); - if (!(bh = __bread(bdev, sb_block, blocksize))) { + bh = __bread(bdev, sb_block, blocksize); + if (!bh) { ext4_msg(sb, KERN_ERR, "couldn't read superblock of " "external journal"); + errno = -EINVAL; goto out_bdev; } @@ -5879,57 +5901,74 @@ static journal_t *ext4_get_dev_journal(struct super_block *sb, if ((le16_to_cpu(es->s_magic) != EXT4_SUPER_MAGIC) || !(le32_to_cpu(es->s_feature_incompat) & EXT4_FEATURE_INCOMPAT_JOURNAL_DEV)) { - ext4_msg(sb, KERN_ERR, "external journal has " - "bad superblock"); - brelse(bh); - goto out_bdev; + ext4_msg(sb, KERN_ERR, "external journal has bad superblock"); + errno = -EFSCORRUPTED; + goto out_bh; } if ((le32_to_cpu(es->s_feature_ro_compat) & EXT4_FEATURE_RO_COMPAT_METADATA_CSUM) && es->s_checksum != ext4_superblock_csum(sb, es)) { - ext4_msg(sb, KERN_ERR, "external journal has " - "corrupt superblock"); - brelse(bh); - goto out_bdev; + ext4_msg(sb, KERN_ERR, "external journal has corrupt superblock"); + errno = -EFSCORRUPTED; + goto out_bh; } if (memcmp(EXT4_SB(sb)->s_es->s_journal_uuid, es->s_uuid, 16)) { ext4_msg(sb, KERN_ERR, "journal UUID does not match"); - brelse(bh); - goto out_bdev; + errno = -EFSCORRUPTED; + goto out_bh; } - len = ext4_blocks_count(es); - start = sb_block + 1; - brelse(bh); /* we're done with the superblock */ + *j_start = sb_block + 1; + *j_len = ext4_blocks_count(es); + brelse(bh); + return bdev_handle; - journal = jbd2_journal_init_dev(bdev, sb->s_bdev, - start, len, blocksize); - if (!journal) { +out_bh: + brelse(bh); +out_bdev: + bdev_release(bdev_handle); + return ERR_PTR(errno); +} + +static journal_t *ext4_open_dev_journal(struct super_block *sb, + dev_t j_dev) +{ + journal_t *journal; + ext4_fsblk_t j_start; + ext4_fsblk_t j_len; + struct bdev_handle *bdev_handle; + int errno = 0; + + bdev_handle = ext4_get_journal_blkdev(sb, j_dev, &j_start, &j_len); + if (IS_ERR(bdev_handle)) + return ERR_CAST(bdev_handle); + + journal = jbd2_journal_init_dev(bdev_handle->bdev, sb->s_bdev, j_start, + j_len, sb->s_blocksize); + if (IS_ERR(journal)) { ext4_msg(sb, KERN_ERR, "failed to create device journal"); + errno = PTR_ERR(journal); goto out_bdev; } - journal->j_private = sb; - if (ext4_read_bh_lock(journal->j_sb_buffer, REQ_META | REQ_PRIO, true)) { - ext4_msg(sb, KERN_ERR, "I/O error on journal device"); - goto out_journal; - } if (be32_to_cpu(journal->j_superblock->s_nr_users) != 1) { ext4_msg(sb, KERN_ERR, "External journal has more than one " "user (unsupported) - %d", be32_to_cpu(journal->j_superblock->s_nr_users)); + errno = -EINVAL; goto out_journal; } - EXT4_SB(sb)->s_journal_bdev = bdev; + journal->j_private = sb; + EXT4_SB(sb)->s_journal_bdev_handle = bdev_handle; ext4_init_journal_params(sb, journal); return journal; out_journal: jbd2_journal_destroy(journal); out_bdev: - blkdev_put(bdev, sb); - return NULL; + bdev_release(bdev_handle); + return ERR_PTR(errno); } static int ext4_load_journal(struct super_block *sb, @@ -5961,13 +6000,13 @@ static int ext4_load_journal(struct super_block *sb, } if (journal_inum) { - journal = ext4_get_journal(sb, journal_inum); - if (!journal) - return -EINVAL; + journal = ext4_open_inode_journal(sb, journal_inum); + if (IS_ERR(journal)) + return PTR_ERR(journal); } else { - journal = ext4_get_dev_journal(sb, journal_dev); - if (!journal) - return -EINVAL; + journal = ext4_open_dev_journal(sb, journal_dev); + if (IS_ERR(journal)) + return PTR_ERR(journal); } journal_dev_ro = bdev_read_only(journal->j_dev); @@ -6084,7 +6123,7 @@ static void ext4_update_super(struct super_block *sb) * the clock is set in the future, and this will cause e2fsck * to complain and force a full file system check. */ - if (!(sb->s_flags & SB_RDONLY)) + if (!sb_rdonly(sb)) ext4_update_tstamp(es, s_wtime); es->s_kbytes_written = cpu_to_le64(sbi->s_kbytes_written + @@ -6282,13 +6321,7 @@ static int ext4_clear_journal_err(struct super_block *sb, */ int ext4_force_commit(struct super_block *sb) { - journal_t *journal; - - if (sb_rdonly(sb)) - return 0; - - journal = EXT4_SB(sb)->s_journal; - return ext4_journal_force_commit(journal); + return ext4_journal_force_commit(EXT4_SB(sb)->s_journal); } static int ext4_sync_fs(struct super_block *sb, int wait) @@ -6298,7 +6331,7 @@ static int ext4_sync_fs(struct super_block *sb, int wait) bool needs_barrier = false; struct ext4_sb_info *sbi = EXT4_SB(sb); - if (unlikely(ext4_forced_shutdown(sbi))) + if (unlikely(ext4_forced_shutdown(sb))) return 0; trace_ext4_sync_fs(sb, wait); @@ -6347,12 +6380,7 @@ static int ext4_sync_fs(struct super_block *sb, int wait) static int ext4_freeze(struct super_block *sb) { int error = 0; - journal_t *journal; - - if (sb_rdonly(sb)) - return 0; - - journal = EXT4_SB(sb)->s_journal; + journal_t *journal = EXT4_SB(sb)->s_journal; if (journal) { /* Now we set up the journal barrier. */ @@ -6386,7 +6414,7 @@ out: */ static int ext4_unfreeze(struct super_block *sb) { - if (sb_rdonly(sb) || ext4_forced_shutdown(EXT4_SB(sb))) + if (ext4_forced_shutdown(sb)) return 0; if (EXT4_SB(sb)->s_journal) { @@ -6425,6 +6453,7 @@ static int __ext4_remount(struct fs_context *fc, struct super_block *sb) struct ext4_mount_options old_opts; ext4_group_t g; int err = 0; + int alloc_ctx; #ifdef CONFIG_QUOTA int enable_quota = 0; int i, j; @@ -6465,7 +6494,16 @@ static int __ext4_remount(struct fs_context *fc, struct super_block *sb) } + /* + * Changing the DIOREAD_NOLOCK or DELALLOC mount options may cause + * two calls to ext4_should_dioread_nolock() to return inconsistent + * values, triggering WARN_ON in ext4_add_complete_io(). we grab + * here s_writepages_rwsem to avoid race between writepages ops and + * remount. + */ + alloc_ctx = ext4_writepages_down_write(sb); ext4_apply_options(fc, sb); + ext4_writepages_up_write(sb, alloc_ctx); if ((old_opts.s_mount_opt & EXT4_MOUNT_JOURNAL_CHECKSUM) ^ test_opt(sb, JOURNAL_CHECKSUM)) { @@ -6502,7 +6540,7 @@ static int __ext4_remount(struct fs_context *fc, struct super_block *sb) goto restore_opts; } - if (ext4_test_mount_flag(sb, EXT4_MF_FS_ABORTED)) + if (test_opt2(sb, ABORT)) ext4_abort(sb, ESHUTDOWN, "Abort forced by user"); sb->s_flags = (sb->s_flags & ~SB_POSIXACL) | @@ -6516,10 +6554,10 @@ static int __ext4_remount(struct fs_context *fc, struct super_block *sb) } /* Flush outstanding errors before changing fs state */ - flush_work(&sbi->s_error_work); + flush_work(&sbi->s_sb_upd_work); if ((bool)(fc->sb_flags & SB_RDONLY) != sb_rdonly(sb)) { - if (ext4_test_mount_flag(sb, EXT4_MF_FS_ABORTED)) { + if (ext4_forced_shutdown(sb)) { err = -EROFS; goto restore_opts; } @@ -6680,9 +6718,11 @@ restore_opts: * If there was a failing r/w to ro transition, we may need to * re-enable quota */ - if ((sb->s_flags & SB_RDONLY) && !(old_sb_flags & SB_RDONLY) && + if (sb_rdonly(sb) && !(old_sb_flags & SB_RDONLY) && sb_any_quota_suspended(sb)) dquot_resume(sb, -1); + + alloc_ctx = ext4_writepages_down_write(sb); sb->s_flags = old_sb_flags; sbi->s_mount_opt = old_opts.s_mount_opt; sbi->s_mount_opt2 = old_opts.s_mount_opt2; @@ -6691,6 +6731,8 @@ restore_opts: sbi->s_commit_interval = old_opts.s_commit_interval; sbi->s_min_batch_time = old_opts.s_min_batch_time; sbi->s_max_batch_time = old_opts.s_max_batch_time; + ext4_writepages_up_write(sb, alloc_ctx); + if (!test_opt(sb, BLOCK_VALIDITY) && sbi->s_system_blks) ext4_release_system_zone(sb); #ifdef CONFIG_QUOTA @@ -7089,6 +7131,13 @@ static int ext4_quota_off(struct super_block *sb, int type) err = dquot_quota_off(sb, type); if (err || ext4_has_feature_quota(sb)) goto out_put; + /* + * When the filesystem was remounted read-only first, we cannot cleanup + * inode flags here. Bad luck but people should be using QUOTA feature + * these days anyway. + */ + if (sb_rdonly(sb)) + goto out_put; inode_lock(inode); /* @@ -7103,7 +7152,7 @@ static int ext4_quota_off(struct super_block *sb, int type) } EXT4_I(inode)->i_flags &= ~(EXT4_NOATIME_FL | EXT4_IMMUTABLE_FL); inode_set_flags(inode, 0, S_NOATIME | S_IMMUTABLE); - inode->i_mtime = inode->i_ctime = current_time(inode); + inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); err = ext4_mark_inode_dirty(handle, inode); ext4_journal_stop(handle); out_unlock: @@ -7273,12 +7322,23 @@ static inline int ext3_feature_set_ok(struct super_block *sb) return 1; } +static void ext4_kill_sb(struct super_block *sb) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct bdev_handle *handle = sbi ? sbi->s_journal_bdev_handle : NULL; + + kill_block_super(sb); + + if (handle) + bdev_release(handle); +} + static struct file_system_type ext4_fs_type = { .owner = THIS_MODULE, .name = "ext4", .init_fs_context = ext4_init_fs_context, .parameters = ext4_param_specs, - .kill_sb = kill_block_super, + .kill_sb = ext4_kill_sb, .fs_flags = FS_REQUIRES_DEV | FS_ALLOW_IDMAP, }; MODULE_ALIAS_FS("ext4"); diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index 05151d61b00b..82dc5e673d5c 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -98,7 +98,7 @@ static const struct xattr_handler * const ext4_xattr_handler_map[] = { [EXT4_XATTR_INDEX_HURD] = &ext4_xattr_hurd_handler, }; -const struct xattr_handler *ext4_xattr_handlers[] = { +const struct xattr_handler * const ext4_xattr_handlers[] = { &ext4_xattr_user_handler, &ext4_xattr_trusted_handler, #ifdef CONFIG_EXT4_FS_SECURITY @@ -356,24 +356,24 @@ ext4_xattr_inode_hash(struct ext4_sb_info *sbi, const void *buffer, size_t size) static u64 ext4_xattr_inode_get_ref(struct inode *ea_inode) { - return ((u64)ea_inode->i_ctime.tv_sec << 32) | + return ((u64) inode_get_ctime_sec(ea_inode) << 32) | (u32) inode_peek_iversion_raw(ea_inode); } static void ext4_xattr_inode_set_ref(struct inode *ea_inode, u64 ref_count) { - ea_inode->i_ctime.tv_sec = (u32)(ref_count >> 32); + inode_set_ctime(ea_inode, (u32)(ref_count >> 32), 0); inode_set_iversion_raw(ea_inode, ref_count & 0xffffffff); } static u32 ext4_xattr_inode_get_hash(struct inode *ea_inode) { - return (u32)ea_inode->i_atime.tv_sec; + return (u32) inode_get_atime_sec(ea_inode); } static void ext4_xattr_inode_set_hash(struct inode *ea_inode, u32 hash) { - ea_inode->i_atime.tv_sec = hash; + inode_set_atime(ea_inode, hash, 0); } /* @@ -418,7 +418,7 @@ free_bhs: return ret; } -#define EXT4_XATTR_INODE_GET_PARENT(inode) ((__u32)(inode)->i_mtime.tv_sec) +#define EXT4_XATTR_INODE_GET_PARENT(inode) ((__u32)(inode_get_mtime_sec(inode))) static int ext4_xattr_inode_iget(struct inode *parent, unsigned long ea_ino, u32 ea_inode_hash, struct inode **ea_inode) @@ -701,7 +701,7 @@ ext4_xattr_get(struct inode *inode, int name_index, const char *name, { int error; - if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb)))) + if (unlikely(ext4_forced_shutdown(inode->i_sb))) return -EIO; if (strlen(name) > 255) @@ -2473,7 +2473,7 @@ retry_inode: } if (!error) { ext4_xattr_update_super_block(handle, inode->i_sb); - inode->i_ctime = current_time(inode); + inode_set_ctime_current(inode); inode_inc_iversion(inode); if (!value) no_expand = 0; diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h index 824faf0b15a8..bd97c4aa8177 100644 --- a/fs/ext4/xattr.h +++ b/fs/ext4/xattr.h @@ -193,7 +193,7 @@ extern int ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize, struct ext4_inode *raw_inode, handle_t *handle); extern void ext4_evict_ea_inode(struct inode *inode); -extern const struct xattr_handler *ext4_xattr_handlers[]; +extern const struct xattr_handler * const ext4_xattr_handlers[]; extern int ext4_xattr_ibody_find(struct inode *inode, struct ext4_xattr_info *i, struct ext4_xattr_ibody_find *is); diff --git a/fs/f2fs/Kconfig b/fs/f2fs/Kconfig index 03ef087537c7..68a1e23e1557 100644 --- a/fs/f2fs/Kconfig +++ b/fs/f2fs/Kconfig @@ -2,6 +2,7 @@ config F2FS_FS tristate "F2FS filesystem support" depends on BLOCK + select BUFFER_HEAD select NLS select CRYPTO select CRYPTO_CRC32 diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 8fd3b7f9fb88..b0597a539fc5 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -1701,9 +1701,9 @@ int f2fs_write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) } f2fs_restore_inmem_curseg(sbi); + stat_inc_cp_count(sbi); stop: unblock_operations(sbi); - stat_inc_cp_count(sbi->stat_info); if (cpc->reason & CP_RECOVERY) f2fs_notice(sbi, "checkpoint: version = %llx", ckpt_ver); diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c index 236d890f560b..36e5dab6baae 100644 --- a/fs/f2fs/compress.c +++ b/fs/f2fs/compress.c @@ -649,13 +649,8 @@ static int f2fs_compress_pages(struct compress_ctx *cc) goto destroy_compress_ctx; } - for (i = 0; i < cc->nr_cpages; i++) { + for (i = 0; i < cc->nr_cpages; i++) cc->cpages[i] = f2fs_compress_alloc_page(); - if (!cc->cpages[i]) { - ret = -ENOMEM; - goto out_free_cpages; - } - } cc->rbuf = f2fs_vmap(cc->rpages, cc->cluster_size); if (!cc->rbuf) { @@ -898,14 +893,15 @@ static bool cluster_has_invalid_data(struct compress_ctx *cc) bool f2fs_sanity_check_cluster(struct dnode_of_data *dn) { +#ifdef CONFIG_F2FS_CHECK_FS struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode); unsigned int cluster_size = F2FS_I(dn->inode)->i_cluster_size; - bool compressed = dn->data_blkaddr == COMPRESS_ADDR; int cluster_end = 0; + unsigned int count; int i; char *reason = ""; - if (!compressed) + if (dn->data_blkaddr != COMPRESS_ADDR) return false; /* [..., COMPR_ADDR, ...] */ @@ -914,7 +910,7 @@ bool f2fs_sanity_check_cluster(struct dnode_of_data *dn) goto out; } - for (i = 1; i < cluster_size; i++) { + for (i = 1, count = 1; i < cluster_size; i++, count++) { block_t blkaddr = data_blkaddr(dn->inode, dn->node_page, dn->ofs_in_node + i); @@ -934,19 +930,42 @@ bool f2fs_sanity_check_cluster(struct dnode_of_data *dn) goto out; } } + + f2fs_bug_on(F2FS_I_SB(dn->inode), count != cluster_size && + !is_inode_flag_set(dn->inode, FI_COMPRESS_RELEASED)); + return false; out: f2fs_warn(sbi, "access invalid cluster, ino:%lu, nid:%u, ofs_in_node:%u, reason:%s", dn->inode->i_ino, dn->nid, dn->ofs_in_node, reason); set_sbi_flag(sbi, SBI_NEED_FSCK); return true; +#else + return false; +#endif +} + +static int __f2fs_get_cluster_blocks(struct inode *inode, + struct dnode_of_data *dn) +{ + unsigned int cluster_size = F2FS_I(inode)->i_cluster_size; + int count, i; + + for (i = 1, count = 1; i < cluster_size; i++) { + block_t blkaddr = data_blkaddr(dn->inode, dn->node_page, + dn->ofs_in_node + i); + + if (__is_valid_data_blkaddr(blkaddr)) + count++; + } + + return count; } static int __f2fs_cluster_blocks(struct inode *inode, - unsigned int cluster_idx, bool compr) + unsigned int cluster_idx, bool compr_blks) { struct dnode_of_data dn; - unsigned int cluster_size = F2FS_I(inode)->i_cluster_size; unsigned int start_idx = cluster_idx << F2FS_I(inode)->i_log_cluster_size; int ret; @@ -961,31 +980,14 @@ static int __f2fs_cluster_blocks(struct inode *inode, if (f2fs_sanity_check_cluster(&dn)) { ret = -EFSCORRUPTED; - f2fs_handle_error(F2FS_I_SB(inode), ERROR_CORRUPTED_CLUSTER); goto fail; } if (dn.data_blkaddr == COMPRESS_ADDR) { - int i; - - ret = 1; - for (i = 1; i < cluster_size; i++) { - block_t blkaddr; - - blkaddr = data_blkaddr(dn.inode, - dn.node_page, dn.ofs_in_node + i); - if (compr) { - if (__is_valid_data_blkaddr(blkaddr)) - ret++; - } else { - if (blkaddr != NULL_ADDR) - ret++; - } - } - - f2fs_bug_on(F2FS_I_SB(inode), - !compr && ret != cluster_size && - !is_inode_flag_set(inode, FI_COMPRESS_RELEASED)); + if (compr_blks) + ret = __f2fs_get_cluster_blocks(inode, &dn); + else + ret = 1; } fail: f2fs_put_dnode(&dn); @@ -998,7 +1000,7 @@ static int f2fs_compressed_blocks(struct compress_ctx *cc) return __f2fs_cluster_blocks(cc->inode, cc->cluster_idx, true); } -/* return # of valid blocks in compressed cluster */ +/* return whether cluster is compressed one or not */ int f2fs_is_compressed_cluster(struct inode *inode, pgoff_t index) { return __f2fs_cluster_blocks(inode, @@ -1045,7 +1047,7 @@ static int prepare_compress_overwrite(struct compress_ctx *cc, struct address_space *mapping = cc->inode->i_mapping; struct page *page; sector_t last_block_in_bio; - unsigned fgp_flag = FGP_LOCK | FGP_WRITE | FGP_CREAT; + fgf_t fgp_flag = FGP_LOCK | FGP_WRITE | FGP_CREAT; pgoff_t start_idx = start_idx_of_cluster(cc); int i, ret; @@ -1574,8 +1576,6 @@ static int f2fs_prepare_decomp_mem(struct decompress_io_ctx *dic, } dic->tpages[i] = f2fs_compress_alloc_page(); - if (!dic->tpages[i]) - return -ENOMEM; } dic->rbuf = f2fs_vmap(dic->tpages, dic->cluster_size); @@ -1656,11 +1656,6 @@ struct decompress_io_ctx *f2fs_alloc_dic(struct compress_ctx *cc) struct page *page; page = f2fs_compress_alloc_page(); - if (!page) { - ret = -ENOMEM; - goto out_free; - } - f2fs_set_compressed_page(page, cc->inode, start_idx + i + 1, dic); dic->cpages[i] = page; @@ -1988,7 +1983,7 @@ void f2fs_destroy_compress_inode(struct f2fs_sb_info *sbi) int f2fs_init_page_array_cache(struct f2fs_sb_info *sbi) { dev_t dev = sbi->sb->s_bdev->bd_dev; - char slab_name[32]; + char slab_name[35]; if (!f2fs_sb_has_compression(sbi)) return 0; diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 5882afe71d82..4e42b5f24deb 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -1167,6 +1167,9 @@ static int f2fs_submit_page_read(struct inode *inode, struct page *page, f2fs_wait_on_block_writeback(inode, blkaddr); if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) { + iostat_update_and_unbind_ctx(bio); + if (bio->bi_private) + mempool_free(bio->bi_private, bio_post_read_ctx_pool); bio_put(bio); return -EFAULT; } @@ -1389,18 +1392,14 @@ struct page *f2fs_get_lock_data_page(struct inode *inode, pgoff_t index, { struct address_space *mapping = inode->i_mapping; struct page *page; -repeat: + page = f2fs_get_read_data_page(inode, index, 0, for_write, NULL); if (IS_ERR(page)) return page; /* wait for read completion */ lock_page(page); - if (unlikely(page->mapping != mapping)) { - f2fs_put_page(page, 1); - goto repeat; - } - if (unlikely(!PageUptodate(page))) { + if (unlikely(page->mapping != mapping || !PageUptodate(page))) { f2fs_put_page(page, 1); return ERR_PTR(-EIO); } @@ -1691,9 +1690,7 @@ next_block: map->m_flags |= F2FS_MAP_NEW; } else if (is_hole) { if (f2fs_compressed_file(inode) && - f2fs_sanity_check_cluster(&dn) && - (flag != F2FS_GET_BLOCK_FIEMAP || - IS_ENABLED(CONFIG_F2FS_CHECK_FS))) { + f2fs_sanity_check_cluster(&dn)) { err = -EFSCORRUPTED; f2fs_handle_error(sbi, ERROR_CORRUPTED_CLUSTER); @@ -2345,8 +2342,10 @@ skip_reading_dnode: f2fs_wait_on_block_writeback(inode, blkaddr); if (f2fs_load_compressed_page(sbi, page, blkaddr)) { - if (atomic_dec_and_test(&dic->remaining_pages)) + if (atomic_dec_and_test(&dic->remaining_pages)) { f2fs_decompress_cluster(dic, true); + break; + } continue; } @@ -2666,6 +2665,11 @@ bool f2fs_should_update_outplace(struct inode *inode, struct f2fs_io_info *fio) return true; if (f2fs_is_atomic_file(inode)) return true; + /* rewrite low ratio compress data w/ OPU mode to avoid fragmentation */ + if (f2fs_compressed_file(inode) && + F2FS_OPTION(sbi).compress_mode == COMPR_MODE_USER && + is_inode_flag_set(inode, FI_ENABLE_COMPRESS)) + return true; /* swap file is migrating in aligned write mode */ if (is_inode_flag_set(inode, FI_ALIGNED_WRITE)) @@ -3024,7 +3028,8 @@ static int f2fs_write_cache_pages(struct address_space *mapping, { int ret = 0; int done = 0, retry = 0; - struct page *pages[F2FS_ONSTACK_PAGES]; + struct page *pages_local[F2FS_ONSTACK_PAGES]; + struct page **pages = pages_local; struct folio_batch fbatch; struct f2fs_sb_info *sbi = F2FS_M_SB(mapping); struct bio *bio = NULL; @@ -3048,6 +3053,7 @@ static int f2fs_write_cache_pages(struct address_space *mapping, #endif int nr_folios, p, idx; int nr_pages; + unsigned int max_pages = F2FS_ONSTACK_PAGES; pgoff_t index; pgoff_t end; /* Inclusive */ pgoff_t done_index; @@ -3057,6 +3063,15 @@ static int f2fs_write_cache_pages(struct address_space *mapping, int submitted = 0; int i; +#ifdef CONFIG_F2FS_FS_COMPRESSION + if (f2fs_compressed_file(inode) && + 1 << cc.log_cluster_size > F2FS_ONSTACK_PAGES) { + pages = f2fs_kzalloc(sbi, sizeof(struct page *) << + cc.log_cluster_size, GFP_NOFS | __GFP_NOFAIL); + max_pages = 1 << cc.log_cluster_size; + } +#endif + folio_batch_init(&fbatch); if (get_dirty_pages(mapping->host) <= @@ -3102,7 +3117,7 @@ again: add_more: pages[nr_pages] = folio_page(folio, idx); folio_get(folio); - if (++nr_pages == F2FS_ONSTACK_PAGES) { + if (++nr_pages == max_pages) { index = folio->index + idx + 1; folio_batch_release(&fbatch); goto write; @@ -3236,8 +3251,7 @@ result: } goto next; } - done_index = folio->index + - folio_nr_pages(folio); + done_index = folio_next_index(folio); done = 1; break; } @@ -3285,6 +3299,11 @@ next: if (bio) f2fs_submit_merged_ipu_write(sbi, &bio, NULL); +#ifdef CONFIG_F2FS_FS_COMPRESSION + if (pages != pages_local) + kfree(pages); +#endif + return ret; } @@ -4057,7 +4076,7 @@ next: sis->highest_bit = cur_lblock - 1; out: if (not_aligned) - f2fs_warn(sbi, "Swapfile (%u) is not align to section: 1) creat(), 2) ioctl(F2FS_IOC_SET_PIN_FILE), 3) fallocate(%u * N)", + f2fs_warn(sbi, "Swapfile (%u) is not align to section: 1) creat(), 2) ioctl(F2FS_IOC_SET_PIN_FILE), 3) fallocate(%lu * N)", not_aligned, blks_per_sec * F2FS_BLKSIZE); return ret; } diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index 61c35b59126e..fdbf994f1271 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -215,6 +215,9 @@ static void update_general_status(struct f2fs_sb_info *sbi) si->valid_blks[type] += blks; } + for (i = 0; i < MAX_CALL_TYPE; i++) + si->cp_call_count[i] = atomic_read(&sbi->cp_call_count[i]); + for (i = 0; i < 2; i++) { si->segment_count[i] = sbi->segment_count[i]; si->block_count[i] = sbi->block_count[i]; @@ -497,7 +500,9 @@ static int stat_show(struct seq_file *s, void *v) seq_printf(s, " - Prefree: %d\n - Free: %d (%d)\n\n", si->prefree_count, si->free_segs, si->free_secs); seq_printf(s, "CP calls: %d (BG: %d)\n", - si->cp_count, si->bg_cp_count); + si->cp_call_count[TOTAL_CALL], + si->cp_call_count[BACKGROUND]); + seq_printf(s, "CP count: %d\n", si->cp_count); seq_printf(s, " - cp blocks : %u\n", si->meta_count[META_CP]); seq_printf(s, " - sit blocks : %u\n", si->meta_count[META_SIT]); @@ -511,12 +516,24 @@ static int stat_show(struct seq_file *s, void *v) seq_printf(s, " - Total : %4d\n", si->nr_total_ckpt); seq_printf(s, " - Cur time : %4d(ms)\n", si->cur_ckpt_time); seq_printf(s, " - Peak time : %4d(ms)\n", si->peak_ckpt_time); - seq_printf(s, "GC calls: %d (BG: %d)\n", - si->call_count, si->bg_gc); - seq_printf(s, " - data segments : %d (%d)\n", - si->data_segs, si->bg_data_segs); - seq_printf(s, " - node segments : %d (%d)\n", - si->node_segs, si->bg_node_segs); + seq_printf(s, "GC calls: %d (gc_thread: %d)\n", + si->gc_call_count[BACKGROUND] + + si->gc_call_count[FOREGROUND], + si->gc_call_count[BACKGROUND]); + if (__is_large_section(sbi)) { + seq_printf(s, " - data sections : %d (BG: %d)\n", + si->gc_secs[DATA][BG_GC] + si->gc_secs[DATA][FG_GC], + si->gc_secs[DATA][BG_GC]); + seq_printf(s, " - node sections : %d (BG: %d)\n", + si->gc_secs[NODE][BG_GC] + si->gc_secs[NODE][FG_GC], + si->gc_secs[NODE][BG_GC]); + } + seq_printf(s, " - data segments : %d (BG: %d)\n", + si->gc_segs[DATA][BG_GC] + si->gc_segs[DATA][FG_GC], + si->gc_segs[DATA][BG_GC]); + seq_printf(s, " - node segments : %d (BG: %d)\n", + si->gc_segs[NODE][BG_GC] + si->gc_segs[NODE][FG_GC], + si->gc_segs[NODE][BG_GC]); seq_puts(s, " - Reclaimed segs :\n"); seq_printf(s, " - Normal : %d\n", sbi->gc_reclaimed_segs[GC_NORMAL]); seq_printf(s, " - Idle CB : %d\n", sbi->gc_reclaimed_segs[GC_IDLE_CB]); @@ -687,6 +704,8 @@ int f2fs_build_stats(struct f2fs_sb_info *sbi) atomic_set(&sbi->inplace_count, 0); for (i = META_CP; i < META_MAX; i++) atomic_set(&sbi->meta_count[i], 0); + for (i = 0; i < MAX_CALL_TYPE; i++) + atomic_set(&sbi->cp_call_count[i], 0); atomic_set(&sbi->max_aw_cnt, 0); diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index d635c58cf5a3..042593aed1ec 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -455,7 +455,7 @@ void f2fs_set_link(struct inode *dir, struct f2fs_dir_entry *de, de->file_type = fs_umode_to_ftype(inode->i_mode); set_page_dirty(page); - dir->i_mtime = dir->i_ctime = current_time(dir); + inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir)); f2fs_mark_inode_dirty_sync(dir, false); f2fs_put_page(page, 1); } @@ -609,7 +609,7 @@ void f2fs_update_parent_metadata(struct inode *dir, struct inode *inode, f2fs_i_links_write(dir, true); clear_inode_flag(inode, FI_NEW_INODE); } - dir->i_mtime = dir->i_ctime = current_time(dir); + inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir)); f2fs_mark_inode_dirty_sync(dir, false); if (F2FS_I(dir)->i_current_depth != current_depth) @@ -858,7 +858,7 @@ void f2fs_drop_nlink(struct inode *dir, struct inode *inode) if (S_ISDIR(inode->i_mode)) f2fs_i_links_write(dir, false); - inode->i_ctime = current_time(inode); + inode_set_ctime_current(inode); f2fs_i_links_write(inode, false); if (S_ISDIR(inode->i_mode)) { @@ -919,7 +919,7 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page, } f2fs_put_page(page, 1); - dir->i_ctime = dir->i_mtime = current_time(dir); + inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir)); f2fs_mark_inode_dirty_sync(dir, false); if (inode) diff --git a/fs/f2fs/extent_cache.c b/fs/f2fs/extent_cache.c index 0e2d49140c07..ad8dfac73bd4 100644 --- a/fs/f2fs/extent_cache.c +++ b/fs/f2fs/extent_cache.c @@ -74,40 +74,14 @@ static void __set_extent_info(struct extent_info *ei, } } -static bool __may_read_extent_tree(struct inode *inode) -{ - struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - - if (!test_opt(sbi, READ_EXTENT_CACHE)) - return false; - if (is_inode_flag_set(inode, FI_NO_EXTENT)) - return false; - if (is_inode_flag_set(inode, FI_COMPRESSED_FILE) && - !f2fs_sb_has_readonly(sbi)) - return false; - return S_ISREG(inode->i_mode); -} - -static bool __may_age_extent_tree(struct inode *inode) -{ - struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - - if (!test_opt(sbi, AGE_EXTENT_CACHE)) - return false; - if (is_inode_flag_set(inode, FI_COMPRESSED_FILE)) - return false; - if (file_is_cold(inode)) - return false; - - return S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode); -} - static bool __init_may_extent_tree(struct inode *inode, enum extent_type type) { if (type == EX_READ) - return __may_read_extent_tree(inode); - else if (type == EX_BLOCK_AGE) - return __may_age_extent_tree(inode); + return test_opt(F2FS_I_SB(inode), READ_EXTENT_CACHE) && + S_ISREG(inode->i_mode); + if (type == EX_BLOCK_AGE) + return test_opt(F2FS_I_SB(inode), AGE_EXTENT_CACHE) && + (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode)); return false; } @@ -120,7 +94,22 @@ static bool __may_extent_tree(struct inode *inode, enum extent_type type) if (list_empty(&F2FS_I_SB(inode)->s_list)) return false; - return __init_may_extent_tree(inode, type); + if (!__init_may_extent_tree(inode, type)) + return false; + + if (type == EX_READ) { + if (is_inode_flag_set(inode, FI_NO_EXTENT)) + return false; + if (is_inode_flag_set(inode, FI_COMPRESSED_FILE) && + !f2fs_sb_has_readonly(F2FS_I_SB(inode))) + return false; + } else if (type == EX_BLOCK_AGE) { + if (is_inode_flag_set(inode, FI_COMPRESSED_FILE)) + return false; + if (file_is_cold(inode)) + return false; + } + return true; } static void __try_update_largest_extent(struct extent_tree *et, diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index c7cb2177b252..9043cedfa12b 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1234,6 +1234,7 @@ struct f2fs_bio_info { #define FDEV(i) (sbi->devs[i]) #define RDEV(i) (raw_super->devs[i]) struct f2fs_dev_info { + struct bdev_handle *bdev_handle; struct block_device *bdev; char path[MAX_PATH_LEN]; unsigned int total_segments; @@ -1383,6 +1384,13 @@ enum errors_option { MOUNT_ERRORS_PANIC, /* panic on errors */ }; +enum { + BACKGROUND, + FOREGROUND, + MAX_CALL_TYPE, + TOTAL_CALL = FOREGROUND, +}; + static inline int f2fs_test_bit(unsigned int nr, char *addr); static inline void f2fs_set_bit(unsigned int nr, char *addr); static inline void f2fs_clear_bit(unsigned int nr, char *addr); @@ -1695,6 +1703,7 @@ struct f2fs_sb_info { unsigned int io_skip_bggc; /* skip background gc for in-flight IO */ unsigned int other_skip_bggc; /* skip background gc for other reasons */ unsigned int ndirty_inode[NR_INODE_TYPE]; /* # of dirty inodes */ + atomic_t cp_call_count[MAX_CALL_TYPE]; /* # of cp call */ #endif spinlock_t stat_lock; /* lock for stat operations */ @@ -2114,15 +2123,6 @@ static inline int f2fs_down_read_trylock(struct f2fs_rwsem *sem) return down_read_trylock(&sem->internal_rwsem); } -#ifdef CONFIG_DEBUG_LOCK_ALLOC -static inline void f2fs_down_read_nested(struct f2fs_rwsem *sem, int subclass) -{ - down_read_nested(&sem->internal_rwsem, subclass); -} -#else -#define f2fs_down_read_nested(sem, subclass) f2fs_down_read(sem) -#endif - static inline void f2fs_up_read(struct f2fs_rwsem *sem) { up_read(&sem->internal_rwsem); @@ -2133,6 +2133,21 @@ static inline void f2fs_down_write(struct f2fs_rwsem *sem) down_write(&sem->internal_rwsem); } +#ifdef CONFIG_DEBUG_LOCK_ALLOC +static inline void f2fs_down_read_nested(struct f2fs_rwsem *sem, int subclass) +{ + down_read_nested(&sem->internal_rwsem, subclass); +} + +static inline void f2fs_down_write_nested(struct f2fs_rwsem *sem, int subclass) +{ + down_write_nested(&sem->internal_rwsem, subclass); +} +#else +#define f2fs_down_read_nested(sem, subclass) f2fs_down_read(sem) +#define f2fs_down_write_nested(sem, subclass) f2fs_down_write(sem) +#endif + static inline int f2fs_down_write_trylock(struct f2fs_rwsem *sem) { return down_write_trylock(&sem->internal_rwsem); @@ -2736,7 +2751,7 @@ static inline struct page *f2fs_grab_cache_page(struct address_space *mapping, static inline struct page *f2fs_pagecache_get_page( struct address_space *mapping, pgoff_t index, - int fgp_flags, gfp_t gfp_mask) + fgf_t fgp_flags, gfp_t gfp_mask) { if (time_to_inject(F2FS_M_SB(mapping), FAULT_PAGE_GET)) return NULL; @@ -3303,11 +3318,15 @@ static inline void clear_file(struct inode *inode, int type) static inline bool f2fs_is_time_consistent(struct inode *inode) { - if (!timespec64_equal(F2FS_I(inode)->i_disk_time, &inode->i_atime)) + struct timespec64 ts = inode_get_atime(inode); + + if (!timespec64_equal(F2FS_I(inode)->i_disk_time, &ts)) return false; - if (!timespec64_equal(F2FS_I(inode)->i_disk_time + 1, &inode->i_ctime)) + ts = inode_get_ctime(inode); + if (!timespec64_equal(F2FS_I(inode)->i_disk_time + 1, &ts)) return false; - if (!timespec64_equal(F2FS_I(inode)->i_disk_time + 2, &inode->i_mtime)) + ts = inode_get_mtime(inode); + if (!timespec64_equal(F2FS_I(inode)->i_disk_time + 2, &ts)) return false; return true; } @@ -3885,7 +3904,7 @@ struct f2fs_stat_info { int nats, dirty_nats, sits, dirty_sits; int free_nids, avail_nids, alloc_nids; int total_count, utilization; - int bg_gc, nr_wb_cp_data, nr_wb_data; + int nr_wb_cp_data, nr_wb_data; int nr_rd_data, nr_rd_node, nr_rd_meta; int nr_dio_read, nr_dio_write; unsigned int io_skip_bggc, other_skip_bggc; @@ -3905,9 +3924,11 @@ struct f2fs_stat_info { int rsvd_segs, overp_segs; int dirty_count, node_pages, meta_pages, compress_pages; int compress_page_hit; - int prefree_count, call_count, cp_count, bg_cp_count; - int tot_segs, node_segs, data_segs, free_segs, free_secs; - int bg_node_segs, bg_data_segs; + int prefree_count, free_segs, free_secs; + int cp_call_count[MAX_CALL_TYPE], cp_count; + int gc_call_count[MAX_CALL_TYPE]; + int gc_segs[2][2]; + int gc_secs[2][2]; int tot_blks, data_blks, node_blks; int bg_data_blks, bg_node_blks; int curseg[NR_CURSEG_TYPE]; @@ -3929,10 +3950,9 @@ static inline struct f2fs_stat_info *F2FS_STAT(struct f2fs_sb_info *sbi) return (struct f2fs_stat_info *)sbi->stat_info; } -#define stat_inc_cp_count(si) ((si)->cp_count++) -#define stat_inc_bg_cp_count(si) ((si)->bg_cp_count++) -#define stat_inc_call_count(si) ((si)->call_count++) -#define stat_inc_bggc_count(si) ((si)->bg_gc++) +#define stat_inc_cp_call_count(sbi, foreground) \ + atomic_inc(&sbi->cp_call_count[(foreground)]) +#define stat_inc_cp_count(si) (F2FS_STAT(sbi)->cp_count++) #define stat_io_skip_bggc_count(sbi) ((sbi)->io_skip_bggc++) #define stat_other_skip_bggc_count(sbi) ((sbi)->other_skip_bggc++) #define stat_inc_dirty_inode(sbi, type) ((sbi)->ndirty_inode[type]++) @@ -4017,18 +4037,12 @@ static inline struct f2fs_stat_info *F2FS_STAT(struct f2fs_sb_info *sbi) if (cur > max) \ atomic_set(&F2FS_I_SB(inode)->max_aw_cnt, cur); \ } while (0) -#define stat_inc_seg_count(sbi, type, gc_type) \ - do { \ - struct f2fs_stat_info *si = F2FS_STAT(sbi); \ - si->tot_segs++; \ - if ((type) == SUM_TYPE_DATA) { \ - si->data_segs++; \ - si->bg_data_segs += (gc_type == BG_GC) ? 1 : 0; \ - } else { \ - si->node_segs++; \ - si->bg_node_segs += (gc_type == BG_GC) ? 1 : 0; \ - } \ - } while (0) +#define stat_inc_gc_call_count(sbi, foreground) \ + (F2FS_STAT(sbi)->gc_call_count[(foreground)]++) +#define stat_inc_gc_sec_count(sbi, type, gc_type) \ + (F2FS_STAT(sbi)->gc_secs[(type)][(gc_type)]++) +#define stat_inc_gc_seg_count(sbi, type, gc_type) \ + (F2FS_STAT(sbi)->gc_segs[(type)][(gc_type)]++) #define stat_inc_tot_blk_count(si, blks) \ ((si)->tot_blks += (blks)) @@ -4055,10 +4069,8 @@ void __init f2fs_create_root_stats(void); void f2fs_destroy_root_stats(void); void f2fs_update_sit_info(struct f2fs_sb_info *sbi); #else -#define stat_inc_cp_count(si) do { } while (0) -#define stat_inc_bg_cp_count(si) do { } while (0) -#define stat_inc_call_count(si) do { } while (0) -#define stat_inc_bggc_count(si) do { } while (0) +#define stat_inc_cp_call_count(sbi, foreground) do { } while (0) +#define stat_inc_cp_count(sbi) do { } while (0) #define stat_io_skip_bggc_count(sbi) do { } while (0) #define stat_other_skip_bggc_count(sbi) do { } while (0) #define stat_inc_dirty_inode(sbi, type) do { } while (0) @@ -4086,7 +4098,9 @@ void f2fs_update_sit_info(struct f2fs_sb_info *sbi); #define stat_inc_seg_type(sbi, curseg) do { } while (0) #define stat_inc_block_count(sbi, curseg) do { } while (0) #define stat_inc_inplace_blocks(sbi) do { } while (0) -#define stat_inc_seg_count(sbi, type, gc_type) do { } while (0) +#define stat_inc_gc_call_count(sbi, foreground) do { } while (0) +#define stat_inc_gc_sec_count(sbi, type, gc_type) do { } while (0) +#define stat_inc_gc_seg_count(sbi, type, gc_type) do { } while (0) #define stat_inc_tot_blk_count(si, blks) do { } while (0) #define stat_inc_data_blk_count(sbi, blks, gc_type) do { } while (0) #define stat_inc_node_blk_count(sbi, blks, gc_type) do { } while (0) @@ -4423,6 +4437,22 @@ static inline bool f2fs_blkz_is_seq(struct f2fs_sb_info *sbi, int devi, } #endif +static inline int f2fs_bdev_index(struct f2fs_sb_info *sbi, + struct block_device *bdev) +{ + int i; + + if (!f2fs_is_multi_device(sbi)) + return 0; + + for (i = 0; i < sbi->s_ndevs; i++) + if (FDEV(i).bdev == bdev) + return i; + + WARN_ON(1); + return -1; +} + static inline bool f2fs_hw_should_discard(struct f2fs_sb_info *sbi) { return f2fs_sb_has_blkzoned(sbi); @@ -4483,7 +4513,8 @@ static inline bool f2fs_low_mem_mode(struct f2fs_sb_info *sbi) static inline bool f2fs_may_compress(struct inode *inode) { if (IS_SWAPFILE(inode) || f2fs_is_pinned_file(inode) || - f2fs_is_atomic_file(inode) || f2fs_has_inline_data(inode)) + f2fs_is_atomic_file(inode) || f2fs_has_inline_data(inode) || + f2fs_is_mmap_file(inode)) return false; return S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode); } diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 093039dee992..e50363583f01 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -159,7 +159,7 @@ out_sem: sb_end_pagefault(inode->i_sb); err: - return block_page_mkwrite_return(err); + return vmf_fs_error(err); } static const struct vm_operations_struct f2fs_file_vm_ops = { @@ -526,7 +526,11 @@ static int f2fs_file_mmap(struct file *file, struct vm_area_struct *vma) file_accessed(file); vma->vm_ops = &f2fs_file_vm_ops; + + f2fs_down_read(&F2FS_I(inode)->i_sem); set_inode_flag(inode, FI_MMAP_FILE); + f2fs_up_read(&F2FS_I(inode)->i_sem); + return 0; } @@ -794,7 +798,7 @@ int f2fs_truncate(struct inode *inode) if (err) return err; - inode->i_mtime = inode->i_ctime = current_time(inode); + inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); f2fs_mark_inode_dirty_sync(inode, false); return 0; } @@ -882,7 +886,7 @@ int f2fs_getattr(struct mnt_idmap *idmap, const struct path *path, STATX_ATTR_NODUMP | STATX_ATTR_VERITY); - generic_fillattr(idmap, inode, stat); + generic_fillattr(idmap, request_mask, inode, stat); /* we need to show initial sectors used for inline_data/dentries */ if ((S_ISREG(inode->i_mode) && f2fs_has_inline_data(inode)) || @@ -901,11 +905,11 @@ static void __setattr_copy(struct mnt_idmap *idmap, i_uid_update(idmap, attr, inode); i_gid_update(idmap, attr, inode); if (ia_valid & ATTR_ATIME) - inode->i_atime = attr->ia_atime; + inode_set_atime_to_ts(inode, attr->ia_atime); if (ia_valid & ATTR_MTIME) - inode->i_mtime = attr->ia_mtime; + inode_set_mtime_to_ts(inode, attr->ia_mtime); if (ia_valid & ATTR_CTIME) - inode->i_ctime = attr->ia_ctime; + inode_set_ctime_to_ts(inode, attr->ia_ctime); if (ia_valid & ATTR_MODE) { umode_t mode = attr->ia_mode; vfsgid_t vfsgid = i_gid_into_vfsgid(idmap, inode); @@ -1008,7 +1012,7 @@ int f2fs_setattr(struct mnt_idmap *idmap, struct dentry *dentry, return err; spin_lock(&F2FS_I(inode)->i_size_lock); - inode->i_mtime = inode->i_ctime = current_time(inode); + inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); F2FS_I(inode)->last_disk_size = i_size_read(inode); spin_unlock(&F2FS_I(inode)->i_size_lock); } @@ -1724,6 +1728,7 @@ next_alloc: if (has_not_enough_free_secs(sbi, 0, GET_SEC_FROM_SEG(sbi, overprovision_segments(sbi)))) { f2fs_down_write(&sbi->gc_lock); + stat_inc_gc_call_count(sbi, FOREGROUND); err = f2fs_gc(sbi, &gc_control); if (err && err != -ENODATA) goto out_err; @@ -1835,7 +1840,7 @@ static long f2fs_fallocate(struct file *file, int mode, } if (!ret) { - inode->i_mtime = inode->i_ctime = current_time(inode); + inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); f2fs_mark_inode_dirty_sync(inode, false); f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); } @@ -1919,12 +1924,19 @@ static int f2fs_setflags_common(struct inode *inode, u32 iflags, u32 mask) int err = f2fs_convert_inline_inode(inode); if (err) return err; - if (!f2fs_may_compress(inode)) - return -EINVAL; - if (S_ISREG(inode->i_mode) && F2FS_HAS_BLOCKS(inode)) + + f2fs_down_write(&F2FS_I(inode)->i_sem); + if (!f2fs_may_compress(inode) || + (S_ISREG(inode->i_mode) && + F2FS_HAS_BLOCKS(inode))) { + f2fs_up_write(&F2FS_I(inode)->i_sem); return -EINVAL; - if (set_compress_context(inode)) - return -EOPNOTSUPP; + } + err = set_compress_context(inode); + f2fs_up_write(&F2FS_I(inode)->i_sem); + + if (err) + return err; } } @@ -1937,7 +1949,7 @@ static int f2fs_setflags_common(struct inode *inode, u32 iflags, u32 mask) else clear_inode_flag(inode, FI_PROJ_INHERIT); - inode->i_ctime = current_time(inode); + inode_set_ctime_current(inode); f2fs_set_inode_flags(inode); f2fs_mark_inode_dirty_sync(inode, true); return 0; @@ -2465,6 +2477,7 @@ static int f2fs_ioc_gc(struct file *filp, unsigned long arg) gc_control.init_gc_type = sync ? FG_GC : BG_GC; gc_control.err_gc_skipped = sync; + stat_inc_gc_call_count(sbi, FOREGROUND); ret = f2fs_gc(sbi, &gc_control); out: mnt_drop_write_file(filp); @@ -2508,6 +2521,7 @@ do_more: } gc_control.victim_segno = GET_SEGNO(sbi, range->start); + stat_inc_gc_call_count(sbi, FOREGROUND); ret = f2fs_gc(sbi, &gc_control); if (ret) { if (ret == -EBUSY) @@ -2874,10 +2888,10 @@ out_src: if (ret) goto out_unlock; - src->i_mtime = src->i_ctime = current_time(src); + inode_set_mtime_to_ts(src, inode_set_ctime_current(src)); f2fs_mark_inode_dirty_sync(src, false); if (src != dst) { - dst->i_mtime = dst->i_ctime = current_time(dst); + inode_set_mtime_to_ts(dst, inode_set_ctime_current(dst)); f2fs_mark_inode_dirty_sync(dst, false); } f2fs_update_time(sbi, REQ_TIME); @@ -2990,6 +3004,7 @@ static int f2fs_ioc_flush_device(struct file *filp, unsigned long arg) sm->last_victim[ALLOC_NEXT] = end_segno + 1; gc_control.victim_segno = start_segno; + stat_inc_gc_call_count(sbi, FOREGROUND); ret = f2fs_gc(sbi, &gc_control); if (ret == -EAGAIN) ret = 0; @@ -3073,7 +3088,7 @@ static int f2fs_ioc_setproject(struct inode *inode, __u32 projid) goto out_unlock; fi->i_projid = kprojid; - inode->i_ctime = current_time(inode); + inode_set_ctime_current(inode); f2fs_mark_inode_dirty_sync(inode, true); out_unlock: f2fs_unlock_op(sbi); @@ -3243,11 +3258,12 @@ int f2fs_precache_extents(struct inode *inode) return -EOPNOTSUPP; map.m_lblk = 0; + map.m_pblk = 0; map.m_next_pgofs = NULL; map.m_next_extent = &m_next_extent; map.m_seg_type = NO_CHECK_TYPE; map.m_may_create = false; - end = max_file_blocks(inode); + end = F2FS_BLK_ALIGN(i_size_read(inode)); while (map.m_lblk < end) { map.m_len = end - map.m_lblk; @@ -3255,7 +3271,7 @@ int f2fs_precache_extents(struct inode *inode) f2fs_down_write(&fi->i_gc_rwsem[WRITE]); err = f2fs_map_blocks(inode, &map, F2FS_GET_BLOCK_PRECACHE); f2fs_up_write(&fi->i_gc_rwsem[WRITE]); - if (err) + if (err || !map.m_len) return err; map.m_lblk = m_next_extent; @@ -3511,7 +3527,7 @@ static int f2fs_release_compress_blocks(struct file *filp, unsigned long arg) } set_inode_flag(inode, FI_COMPRESS_RELEASED); - inode->i_ctime = current_time(inode); + inode_set_ctime_current(inode); f2fs_mark_inode_dirty_sync(inode, true); f2fs_down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); @@ -3710,7 +3726,7 @@ static int f2fs_reserve_compress_blocks(struct file *filp, unsigned long arg) if (ret >= 0) { clear_inode_flag(inode, FI_COMPRESS_RELEASED); - inode->i_ctime = current_time(inode); + inode_set_ctime_current(inode); f2fs_mark_inode_dirty_sync(inode, true); } unlock_inode: @@ -3976,6 +3992,7 @@ static int f2fs_ioc_set_compress_option(struct file *filp, unsigned long arg) file_start_write(filp); inode_lock(inode); + f2fs_down_write(&F2FS_I(inode)->i_sem); if (f2fs_is_mmap_file(inode) || get_dirty_pages(inode)) { ret = -EBUSY; goto out; @@ -3989,12 +4006,22 @@ static int f2fs_ioc_set_compress_option(struct file *filp, unsigned long arg) F2FS_I(inode)->i_compress_algorithm = option.algorithm; F2FS_I(inode)->i_log_cluster_size = option.log_cluster_size; F2FS_I(inode)->i_cluster_size = BIT(option.log_cluster_size); + /* Set default level */ + if (F2FS_I(inode)->i_compress_algorithm == COMPRESS_ZSTD) + F2FS_I(inode)->i_compress_level = F2FS_ZSTD_DEFAULT_CLEVEL; + else + F2FS_I(inode)->i_compress_level = 0; + /* Adjust mount option level */ + if (option.algorithm == F2FS_OPTION(sbi).compress_algorithm && + F2FS_OPTION(sbi).compress_level) + F2FS_I(inode)->i_compress_level = F2FS_OPTION(sbi).compress_level; f2fs_mark_inode_dirty_sync(inode, true); if (!f2fs_is_compress_backend_ready(inode)) f2fs_warn(sbi, "compression algorithm is successfully set, " "but current kernel doesn't support this algorithm."); out: + f2fs_up_write(&F2FS_I(inode)->i_sem); inode_unlock(inode); file_end_write(filp); @@ -4079,10 +4106,8 @@ static int f2fs_ioc_decompress_file(struct file *filp) last_idx = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE); count = last_idx - page_idx; - while (count) { - int len = min(cluster_size, count); - - ret = redirty_blocks(inode, page_idx, len); + while (count && count >= cluster_size) { + ret = redirty_blocks(inode, page_idx, cluster_size); if (ret < 0) break; @@ -4092,8 +4117,14 @@ static int f2fs_ioc_decompress_file(struct file *filp) break; } - count -= len; - page_idx += len; + count -= cluster_size; + page_idx += cluster_size; + + cond_resched(); + if (fatal_signal_pending(current)) { + ret = -EINTR; + break; + } } if (!ret) @@ -4153,10 +4184,8 @@ static int f2fs_ioc_compress_file(struct file *filp) last_idx = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE); count = last_idx - page_idx; - while (count) { - int len = min(cluster_size, count); - - ret = redirty_blocks(inode, page_idx, len); + while (count && count >= cluster_size) { + ret = redirty_blocks(inode, page_idx, cluster_size); if (ret < 0) break; @@ -4166,8 +4195,14 @@ static int f2fs_ioc_compress_file(struct file *filp) break; } - count -= len; - page_idx += len; + count -= cluster_size; + page_idx += cluster_size; + + cond_resched(); + if (fatal_signal_pending(current)) { + ret = -EINTR; + break; + } } if (!ret) @@ -4579,6 +4614,7 @@ static int f2fs_dio_write_end_io(struct kiocb *iocb, ssize_t size, int error, dec_page_count(sbi, F2FS_DIO_WRITE); if (error) return error; + f2fs_update_time(sbi, REQ_TIME); f2fs_update_iostat(sbi, NULL, APP_DIRECT_IO, size); return 0; } @@ -4823,6 +4859,9 @@ static int f2fs_file_fadvise(struct file *filp, loff_t offset, loff_t len, filp->f_mode &= ~FMODE_RANDOM; spin_unlock(&filp->f_lock); return 0; + } else if (advice == POSIX_FADV_WILLNEED && offset == 0) { + /* Load extent cache at the first readahead. */ + f2fs_precache_extents(inode); } err = generic_fadvise(filp, offset, len, advice); diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 01effd3fcb6c..f550cdeaa663 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -121,8 +121,8 @@ static int gc_thread_func(void *data) else increase_sleep_time(gc_th, &wait_ms); do_gc: - if (!foreground) - stat_inc_bggc_count(sbi->stat_info); + stat_inc_gc_call_count(sbi, foreground ? + FOREGROUND : BACKGROUND); sync_mode = F2FS_OPTION(sbi).bggc_mode == BGGC_MODE_SYNC; @@ -1685,6 +1685,7 @@ static int do_garbage_collect(struct f2fs_sb_info *sbi, int seg_freed = 0, migrated = 0; unsigned char type = IS_DATASEG(get_seg_entry(sbi, segno)->type) ? SUM_TYPE_DATA : SUM_TYPE_NODE; + unsigned char data_type = (type == SUM_TYPE_DATA) ? DATA : NODE; int submitted = 0; if (__is_large_section(sbi)) @@ -1766,7 +1767,7 @@ static int do_garbage_collect(struct f2fs_sb_info *sbi, segno, gc_type, force_migrate); - stat_inc_seg_count(sbi, type, gc_type); + stat_inc_gc_seg_count(sbi, data_type, gc_type); sbi->gc_reclaimed_segs[sbi->gc_mode]++; migrated++; @@ -1783,12 +1784,12 @@ skip: } if (submitted) - f2fs_submit_merged_write(sbi, - (type == SUM_TYPE_NODE) ? NODE : DATA); + f2fs_submit_merged_write(sbi, data_type); blk_finish_plug(&plug); - stat_inc_call_count(sbi->stat_info); + if (migrated) + stat_inc_gc_sec_count(sbi, data_type, gc_type); return seg_freed; } @@ -1839,6 +1840,7 @@ gc_more: * secure free segments which doesn't need fggc any more. */ if (prefree_segments(sbi)) { + stat_inc_cp_call_count(sbi, TOTAL_CALL); ret = f2fs_write_checkpoint(sbi, &cpc); if (ret) goto stop; @@ -1887,6 +1889,7 @@ retry: round++; if (skipped_round > MAX_SKIP_GC_COUNT && skipped_round * 2 >= round) { + stat_inc_cp_call_count(sbi, TOTAL_CALL); ret = f2fs_write_checkpoint(sbi, &cpc); goto stop; } @@ -1902,6 +1905,7 @@ retry: */ if (free_sections(sbi) <= upper_secs + NR_GC_CHECKPOINT_SECS && prefree_segments(sbi)) { + stat_inc_cp_call_count(sbi, TOTAL_CALL); ret = f2fs_write_checkpoint(sbi, &cpc); if (ret) goto stop; @@ -2029,6 +2033,7 @@ static int free_segment_range(struct f2fs_sb_info *sbi, if (gc_only) goto out; + stat_inc_cp_call_count(sbi, TOTAL_CALL); err = f2fs_write_checkpoint(sbi, &cpc); if (err) goto out; @@ -2181,12 +2186,14 @@ out_drop_write: if (err) return err; - err = freeze_super(sbi->sb); + err = freeze_super(sbi->sb, FREEZE_HOLDER_USERSPACE); if (err) return err; if (f2fs_readonly(sbi->sb)) { - thaw_super(sbi->sb); + err = thaw_super(sbi->sb, FREEZE_HOLDER_USERSPACE); + if (err) + return err; return -EROFS; } @@ -2221,6 +2228,7 @@ out_drop_write: clear_sbi_flag(sbi, SBI_IS_RESIZEFS); set_sbi_flag(sbi, SBI_IS_DIRTY); + stat_inc_cp_call_count(sbi, TOTAL_CALL); err = f2fs_write_checkpoint(sbi, &cpc); if (err) { update_fs_metadata(sbi, secs); @@ -2240,6 +2248,6 @@ recover_out: out_err: f2fs_up_write(&sbi->cp_global_sem); f2fs_up_write(&sbi->gc_lock); - thaw_super(sbi->sb); + thaw_super(sbi->sb, FREEZE_HOLDER_USERSPACE); return err; } diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c index 4638fee16a91..ac00423f117b 100644 --- a/fs/f2fs/inline.c +++ b/fs/f2fs/inline.c @@ -641,7 +641,8 @@ int f2fs_add_inline_entry(struct inode *dir, const struct f2fs_filename *fname, } if (inode) { - f2fs_down_write(&F2FS_I(inode)->i_sem); + f2fs_down_write_nested(&F2FS_I(inode)->i_sem, + SINGLE_DEPTH_NESTING); page = f2fs_init_inode_metadata(inode, dir, fname, ipage); if (IS_ERR(page)) { err = PTR_ERR(page); @@ -698,7 +699,7 @@ void f2fs_delete_inline_entry(struct f2fs_dir_entry *dentry, struct page *page, set_page_dirty(page); f2fs_put_page(page, 1); - dir->i_ctime = dir->i_mtime = current_time(dir); + inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir)); f2fs_mark_inode_dirty_sync(dir, false); if (inode) diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index 09e986b050c6..560bfcad1af2 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -214,7 +214,7 @@ static bool sanity_check_compress_inode(struct inode *inode, f2fs_warn(sbi, "%s: inode (ino=%lx) has unsupported compress algorithm: %u, run fsck to fix", __func__, inode->i_ino, ri->i_compress_algorithm); - goto err; + return false; } if (le64_to_cpu(ri->i_compr_blocks) > SECTOR_TO_BLOCK(inode->i_blocks)) { @@ -222,14 +222,14 @@ static bool sanity_check_compress_inode(struct inode *inode, "%s: inode (ino=%lx) has inconsistent i_compr_blocks:%llu, i_blocks:%llu, run fsck to fix", __func__, inode->i_ino, le64_to_cpu(ri->i_compr_blocks), SECTOR_TO_BLOCK(inode->i_blocks)); - goto err; + return false; } if (ri->i_log_cluster_size < MIN_COMPRESS_LOG_SIZE || ri->i_log_cluster_size > MAX_COMPRESS_LOG_SIZE) { f2fs_warn(sbi, "%s: inode (ino=%lx) has unsupported log cluster size: %u, run fsck to fix", __func__, inode->i_ino, ri->i_log_cluster_size); - goto err; + return false; } clevel = le16_to_cpu(ri->i_compress_flag) >> @@ -273,8 +273,6 @@ static bool sanity_check_compress_inode(struct inode *inode, err_level: f2fs_warn(sbi, "%s: inode (ino=%lx) has unsupported compress level: %u, run fsck to fix", __func__, inode->i_ino, clevel); -err: - set_sbi_flag(sbi, SBI_NEED_FSCK); return false; } @@ -287,14 +285,12 @@ static bool sanity_check_inode(struct inode *inode, struct page *node_page) iblocks = le64_to_cpu(F2FS_INODE(node_page)->i_blocks); if (!iblocks) { - set_sbi_flag(sbi, SBI_NEED_FSCK); f2fs_warn(sbi, "%s: corrupted inode i_blocks i_ino=%lx iblocks=%llu, run fsck to fix.", __func__, inode->i_ino, iblocks); return false; } if (ino_of_node(node_page) != nid_of_node(node_page)) { - set_sbi_flag(sbi, SBI_NEED_FSCK); f2fs_warn(sbi, "%s: corrupted inode footer i_ino=%lx, ino,nid: [%u, %u] run fsck to fix.", __func__, inode->i_ino, ino_of_node(node_page), nid_of_node(node_page)); @@ -303,7 +299,6 @@ static bool sanity_check_inode(struct inode *inode, struct page *node_page) if (f2fs_has_extra_attr(inode)) { if (!f2fs_sb_has_extra_attr(sbi)) { - set_sbi_flag(sbi, SBI_NEED_FSCK); f2fs_warn(sbi, "%s: inode (ino=%lx) is with extra_attr, but extra_attr feature is off", __func__, inode->i_ino); return false; @@ -311,7 +306,6 @@ static bool sanity_check_inode(struct inode *inode, struct page *node_page) if (fi->i_extra_isize > F2FS_TOTAL_EXTRA_ATTR_SIZE || fi->i_extra_isize < F2FS_MIN_EXTRA_ATTR_SIZE || fi->i_extra_isize % sizeof(__le32)) { - set_sbi_flag(sbi, SBI_NEED_FSCK); f2fs_warn(sbi, "%s: inode (ino=%lx) has corrupted i_extra_isize: %d, max: %zu", __func__, inode->i_ino, fi->i_extra_isize, F2FS_TOTAL_EXTRA_ATTR_SIZE); @@ -321,8 +315,7 @@ static bool sanity_check_inode(struct inode *inode, struct page *node_page) f2fs_has_inline_xattr(inode) && (!fi->i_inline_xattr_size || fi->i_inline_xattr_size > MAX_INLINE_XATTR_SIZE)) { - set_sbi_flag(sbi, SBI_NEED_FSCK); - f2fs_warn(sbi, "%s: inode (ino=%lx) has corrupted i_inline_xattr_size: %d, max: %zu", + f2fs_warn(sbi, "%s: inode (ino=%lx) has corrupted i_inline_xattr_size: %d, max: %lu", __func__, inode->i_ino, fi->i_inline_xattr_size, MAX_INLINE_XATTR_SIZE); return false; @@ -335,7 +328,6 @@ static bool sanity_check_inode(struct inode *inode, struct page *node_page) return false; } } else if (f2fs_sb_has_flexible_inline_xattr(sbi)) { - set_sbi_flag(sbi, SBI_NEED_FSCK); f2fs_warn(sbi, "%s: corrupted inode ino=%lx, run fsck to fix.", __func__, inode->i_ino); return false; @@ -343,31 +335,26 @@ static bool sanity_check_inode(struct inode *inode, struct page *node_page) if (!f2fs_sb_has_extra_attr(sbi)) { if (f2fs_sb_has_project_quota(sbi)) { - set_sbi_flag(sbi, SBI_NEED_FSCK); f2fs_warn(sbi, "%s: corrupted inode ino=%lx, wrong feature flag: %u, run fsck to fix.", __func__, inode->i_ino, F2FS_FEATURE_PRJQUOTA); return false; } if (f2fs_sb_has_inode_chksum(sbi)) { - set_sbi_flag(sbi, SBI_NEED_FSCK); f2fs_warn(sbi, "%s: corrupted inode ino=%lx, wrong feature flag: %u, run fsck to fix.", __func__, inode->i_ino, F2FS_FEATURE_INODE_CHKSUM); return false; } if (f2fs_sb_has_flexible_inline_xattr(sbi)) { - set_sbi_flag(sbi, SBI_NEED_FSCK); f2fs_warn(sbi, "%s: corrupted inode ino=%lx, wrong feature flag: %u, run fsck to fix.", __func__, inode->i_ino, F2FS_FEATURE_FLEXIBLE_INLINE_XATTR); return false; } if (f2fs_sb_has_inode_crtime(sbi)) { - set_sbi_flag(sbi, SBI_NEED_FSCK); f2fs_warn(sbi, "%s: corrupted inode ino=%lx, wrong feature flag: %u, run fsck to fix.", __func__, inode->i_ino, F2FS_FEATURE_INODE_CRTIME); return false; } if (f2fs_sb_has_compression(sbi)) { - set_sbi_flag(sbi, SBI_NEED_FSCK); f2fs_warn(sbi, "%s: corrupted inode ino=%lx, wrong feature flag: %u, run fsck to fix.", __func__, inode->i_ino, F2FS_FEATURE_COMPRESSION); return false; @@ -375,21 +362,18 @@ static bool sanity_check_inode(struct inode *inode, struct page *node_page) } if (f2fs_sanity_check_inline_data(inode)) { - set_sbi_flag(sbi, SBI_NEED_FSCK); f2fs_warn(sbi, "%s: inode (ino=%lx, mode=%u) should not have inline_data, run fsck to fix", __func__, inode->i_ino, inode->i_mode); return false; } if (f2fs_has_inline_dentry(inode) && !S_ISDIR(inode->i_mode)) { - set_sbi_flag(sbi, SBI_NEED_FSCK); f2fs_warn(sbi, "%s: inode (ino=%lx, mode=%u) should not have inline_dentry, run fsck to fix", __func__, inode->i_ino, inode->i_mode); return false; } if ((fi->i_flags & F2FS_CASEFOLD_FL) && !f2fs_sb_has_casefold(sbi)) { - set_sbi_flag(sbi, SBI_NEED_FSCK); f2fs_warn(sbi, "%s: inode (ino=%lx) has casefold flag, but casefold feature is off", __func__, inode->i_ino); return false; @@ -402,9 +386,9 @@ static void init_idisk_time(struct inode *inode) { struct f2fs_inode_info *fi = F2FS_I(inode); - fi->i_disk_time[0] = inode->i_atime; - fi->i_disk_time[1] = inode->i_ctime; - fi->i_disk_time[2] = inode->i_mtime; + fi->i_disk_time[0] = inode_get_atime(inode); + fi->i_disk_time[1] = inode_get_ctime(inode); + fi->i_disk_time[2] = inode_get_mtime(inode); } static int do_read_inode(struct inode *inode) @@ -433,12 +417,12 @@ static int do_read_inode(struct inode *inode) inode->i_size = le64_to_cpu(ri->i_size); inode->i_blocks = SECTOR_FROM_BLOCK(le64_to_cpu(ri->i_blocks) - 1); - inode->i_atime.tv_sec = le64_to_cpu(ri->i_atime); - inode->i_ctime.tv_sec = le64_to_cpu(ri->i_ctime); - inode->i_mtime.tv_sec = le64_to_cpu(ri->i_mtime); - inode->i_atime.tv_nsec = le32_to_cpu(ri->i_atime_nsec); - inode->i_ctime.tv_nsec = le32_to_cpu(ri->i_ctime_nsec); - inode->i_mtime.tv_nsec = le32_to_cpu(ri->i_mtime_nsec); + inode_set_atime(inode, le64_to_cpu(ri->i_atime), + le32_to_cpu(ri->i_atime_nsec)); + inode_set_ctime(inode, le64_to_cpu(ri->i_ctime), + le32_to_cpu(ri->i_ctime_nsec)); + inode_set_mtime(inode, le64_to_cpu(ri->i_mtime), + le32_to_cpu(ri->i_mtime_nsec)); inode->i_generation = le32_to_cpu(ri->i_generation); if (S_ISDIR(inode->i_mode)) fi->i_current_depth = le32_to_cpu(ri->i_current_depth); @@ -475,6 +459,13 @@ static int do_read_inode(struct inode *inode) fi->i_inline_xattr_size = 0; } + if (!sanity_check_inode(inode, node_page)) { + f2fs_put_page(node_page, 1); + set_sbi_flag(sbi, SBI_NEED_FSCK); + f2fs_handle_error(sbi, ERROR_CORRUPTED_INODE); + return -EFSCORRUPTED; + } + /* check data exist */ if (f2fs_has_inline_data(inode) && !f2fs_exist_data(inode)) __recover_inline_status(inode, node_page); @@ -544,12 +535,6 @@ static int do_read_inode(struct inode *inode) f2fs_init_read_extent_tree(inode, node_page); f2fs_init_age_extent_tree(inode); - if (!sanity_check_inode(inode, node_page)) { - f2fs_put_page(node_page, 1); - f2fs_handle_error(sbi, ERROR_CORRUPTED_INODE); - return -EFSCORRUPTED; - } - if (!sanity_check_extent_cache(inode)) { f2fs_put_page(node_page, 1); f2fs_handle_error(sbi, ERROR_CORRUPTED_INODE); @@ -713,12 +698,12 @@ void f2fs_update_inode(struct inode *inode, struct page *node_page) } set_raw_inline(inode, ri); - ri->i_atime = cpu_to_le64(inode->i_atime.tv_sec); - ri->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec); - ri->i_mtime = cpu_to_le64(inode->i_mtime.tv_sec); - ri->i_atime_nsec = cpu_to_le32(inode->i_atime.tv_nsec); - ri->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec); - ri->i_mtime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec); + ri->i_atime = cpu_to_le64(inode_get_atime_sec(inode)); + ri->i_ctime = cpu_to_le64(inode_get_ctime_sec(inode)); + ri->i_mtime = cpu_to_le64(inode_get_mtime_sec(inode)); + ri->i_atime_nsec = cpu_to_le32(inode_get_atime_nsec(inode)); + ri->i_ctime_nsec = cpu_to_le32(inode_get_ctime_nsec(inode)); + ri->i_mtime_nsec = cpu_to_le32(inode_get_mtime_nsec(inode)); if (S_ISDIR(inode->i_mode)) ri->i_current_depth = cpu_to_le32(F2FS_I(inode)->i_current_depth); diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index bee0568888da..d0053b0284d8 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -243,8 +243,8 @@ static struct inode *f2fs_new_inode(struct mnt_idmap *idmap, inode->i_ino = ino; inode->i_blocks = 0; - inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode); - F2FS_I(inode)->i_crtime = inode->i_mtime; + simple_inode_init_ts(inode); + F2FS_I(inode)->i_crtime = inode_get_mtime(inode); inode->i_generation = get_random_u32(); if (S_ISDIR(inode->i_mode)) @@ -420,7 +420,7 @@ static int f2fs_link(struct dentry *old_dentry, struct inode *dir, f2fs_balance_fs(sbi, true); - inode->i_ctime = current_time(inode); + inode_set_ctime_current(inode); ihold(inode); set_inode_flag(inode, FI_INC_LINK); @@ -1052,7 +1052,7 @@ static int f2fs_rename(struct mnt_idmap *idmap, struct inode *old_dir, f2fs_set_link(new_dir, new_entry, new_page, old_inode); new_page = NULL; - new_inode->i_ctime = current_time(new_inode); + inode_set_ctime_current(new_inode); f2fs_down_write(&F2FS_I(new_inode)->i_sem); if (old_dir_entry) f2fs_i_links_write(new_inode, false); @@ -1086,7 +1086,7 @@ static int f2fs_rename(struct mnt_idmap *idmap, struct inode *old_dir, f2fs_i_pino_write(old_inode, new_dir->i_ino); f2fs_up_write(&F2FS_I(old_inode)->i_sem); - old_inode->i_ctime = current_time(old_inode); + inode_set_ctime_current(old_inode); f2fs_mark_inode_dirty_sync(old_inode, false); f2fs_delete_entry(old_entry, old_page, old_dir, NULL); @@ -1251,7 +1251,7 @@ static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry, f2fs_i_pino_write(old_inode, new_dir->i_ino); f2fs_up_write(&F2FS_I(old_inode)->i_sem); - old_dir->i_ctime = current_time(old_dir); + inode_set_ctime_current(old_dir); if (old_nlink) { f2fs_down_write(&F2FS_I(old_dir)->i_sem); f2fs_i_links_write(old_dir, old_nlink > 0); @@ -1270,7 +1270,7 @@ static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry, f2fs_i_pino_write(new_inode, old_dir->i_ino); f2fs_up_write(&F2FS_I(new_inode)->i_sem); - new_dir->i_ctime = current_time(new_dir); + inode_set_ctime_current(new_dir); if (new_nlink) { f2fs_down_write(&F2FS_I(new_dir)->i_sem); f2fs_i_links_write(new_dir, new_nlink > 0); diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index ee2e1dd64f25..6c7f6a649d27 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -633,7 +633,7 @@ static void f2fs_ra_node_pages(struct page *parent, int start, int n) /* Then, try readahead for siblings of the desired node */ end = start + n; - end = min(end, NIDS_PER_BLOCK); + end = min(end, (int)NIDS_PER_BLOCK); for (i = start; i < end; i++) { nid = get_nid(parent, i, false); f2fs_ra_node_page(sbi, nid); @@ -1467,7 +1467,8 @@ page_hit: ofs_of_node(page), cpver_of_node(page), next_blkaddr_of_node(page)); set_sbi_flag(sbi, SBI_NEED_FSCK); - err = -EINVAL; + f2fs_handle_error(sbi, ERROR_INCONSISTENT_FOOTER); + err = -EFSCORRUPTED; out_err: ClearPageUptodate(page); out_put_err: @@ -2389,7 +2390,7 @@ static int scan_nat_page(struct f2fs_sb_info *sbi, blk_addr = le32_to_cpu(nat_blk->entries[i].block_addr); if (blk_addr == NEW_ADDR) - return -EINVAL; + return -EFSCORRUPTED; if (blk_addr == NULL_ADDR) { add_free_nid(sbi, start_nid, true, true); @@ -2504,7 +2505,14 @@ static int __f2fs_build_free_nids(struct f2fs_sb_info *sbi, if (ret) { f2fs_up_read(&nm_i->nat_tree_lock); - f2fs_err(sbi, "NAT is corrupt, run fsck to fix it"); + + if (ret == -EFSCORRUPTED) { + f2fs_err(sbi, "NAT is corrupt, run fsck to fix it"); + set_sbi_flag(sbi, SBI_NEED_FSCK); + f2fs_handle_error(sbi, + ERROR_INCONSISTENT_NAT); + } + return ret; } } @@ -2743,7 +2751,9 @@ recover_xnid: f2fs_update_inode_page(inode); /* 3: update and set xattr node page dirty */ - memcpy(F2FS_NODE(xpage), F2FS_NODE(page), VALID_XATTR_BLOCK_SIZE); + if (page) + memcpy(F2FS_NODE(xpage), F2FS_NODE(page), + VALID_XATTR_BLOCK_SIZE); set_page_dirty(xpage); f2fs_put_page(xpage, 1); diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index 4e7d4ceeb084..b56d0f1078a7 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -320,12 +320,12 @@ static int recover_inode(struct inode *inode, struct page *page) } f2fs_i_size_write(inode, le64_to_cpu(raw->i_size)); - inode->i_atime.tv_sec = le64_to_cpu(raw->i_atime); - inode->i_ctime.tv_sec = le64_to_cpu(raw->i_ctime); - inode->i_mtime.tv_sec = le64_to_cpu(raw->i_mtime); - inode->i_atime.tv_nsec = le32_to_cpu(raw->i_atime_nsec); - inode->i_ctime.tv_nsec = le32_to_cpu(raw->i_ctime_nsec); - inode->i_mtime.tv_nsec = le32_to_cpu(raw->i_mtime_nsec); + inode_set_atime(inode, le64_to_cpu(raw->i_atime), + le32_to_cpu(raw->i_atime_nsec)); + inode_set_ctime(inode, le64_to_cpu(raw->i_ctime), + le32_to_cpu(raw->i_ctime_nsec)); + inode_set_mtime(inode, le64_to_cpu(raw->i_mtime), + le32_to_cpu(raw->i_mtime_nsec)); F2FS_I(inode)->i_advise = raw->i_advise; F2FS_I(inode)->i_flags = le32_to_cpu(raw->i_flags); @@ -924,6 +924,7 @@ skip: struct cp_control cpc = { .reason = CP_RECOVERY, }; + stat_inc_cp_call_count(sbi, TOTAL_CALL); err = f2fs_write_checkpoint(sbi, &cpc); } } diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 0457d620011f..727d016318f9 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -205,6 +205,8 @@ void f2fs_abort_atomic_write(struct inode *inode, bool clean) f2fs_i_size_write(inode, fi->original_i_size); fi->original_i_size = 0; } + /* avoid stale dirty inode during eviction */ + sync_inode_metadata(inode, 0); } static int __replace_atomic_write_block(struct inode *inode, pgoff_t index, @@ -433,6 +435,7 @@ void f2fs_balance_fs(struct f2fs_sb_info *sbi, bool need) .err_gc_skipped = false, .nr_free_secs = 1 }; f2fs_down_write(&sbi->gc_lock); + stat_inc_gc_call_count(sbi, FOREGROUND); f2fs_gc(sbi, &gc_control); } } @@ -510,8 +513,8 @@ do_sync: mutex_unlock(&sbi->flush_lock); } + stat_inc_cp_call_count(sbi, BACKGROUND); f2fs_sync_fs(sbi->sb, 1); - stat_inc_bg_cp_count(sbi->stat_info); } static int __submit_flush_wait(struct f2fs_sb_info *sbi, @@ -1258,8 +1261,16 @@ static int __submit_discard_cmd(struct f2fs_sb_info *sbi, #ifdef CONFIG_BLK_DEV_ZONED if (f2fs_sb_has_blkzoned(sbi) && bdev_is_zoned(bdev)) { - __submit_zone_reset_cmd(sbi, dc, flag, wait_list, issued); - return 0; + int devi = f2fs_bdev_index(sbi, bdev); + + if (devi < 0) + return -EINVAL; + + if (f2fs_blkz_is_seq(sbi, devi, dc->di.start)) { + __submit_zone_reset_cmd(sbi, dc, flag, + wait_list, issued); + return 0; + } } #endif @@ -1785,15 +1796,24 @@ static void f2fs_wait_discard_bio(struct f2fs_sb_info *sbi, block_t blkaddr) dc = __lookup_discard_cmd(sbi, blkaddr); #ifdef CONFIG_BLK_DEV_ZONED if (dc && f2fs_sb_has_blkzoned(sbi) && bdev_is_zoned(dc->bdev)) { - /* force submit zone reset */ - if (dc->state == D_PREP) - __submit_zone_reset_cmd(sbi, dc, REQ_SYNC, - &dcc->wait_list, NULL); - dc->ref++; - mutex_unlock(&dcc->cmd_lock); - /* wait zone reset */ - __wait_one_discard_bio(sbi, dc); - return; + int devi = f2fs_bdev_index(sbi, dc->bdev); + + if (devi < 0) { + mutex_unlock(&dcc->cmd_lock); + return; + } + + if (f2fs_blkz_is_seq(sbi, devi, dc->di.start)) { + /* force submit zone reset */ + if (dc->state == D_PREP) + __submit_zone_reset_cmd(sbi, dc, REQ_SYNC, + &dcc->wait_list, NULL); + dc->ref++; + mutex_unlock(&dcc->cmd_lock); + /* wait zone reset */ + __wait_one_discard_bio(sbi, dc); + return; + } } #endif if (dc) { @@ -2193,7 +2213,7 @@ find_next: len = next_pos - cur_pos; if (f2fs_sb_has_blkzoned(sbi) || - !force || len < cpc->trim_minlen) + (force && len < cpc->trim_minlen)) goto skip; f2fs_issue_discard(sbi, entry->start_blkaddr + cur_pos, @@ -3228,6 +3248,7 @@ int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range) goto out; f2fs_down_write(&sbi->gc_lock); + stat_inc_cp_call_count(sbi, TOTAL_CALL); err = f2fs_write_checkpoint(sbi, &cpc); f2fs_up_write(&sbi->gc_lock); if (err) @@ -4846,17 +4867,17 @@ static int check_zone_write_pointer(struct f2fs_sb_info *sbi, { unsigned int wp_segno, wp_blkoff, zone_secno, zone_segno, segno; block_t zone_block, wp_block, last_valid_block; + unsigned int log_sectors_per_block = sbi->log_blocksize - SECTOR_SHIFT; int i, s, b, ret; struct seg_entry *se; if (zone->type != BLK_ZONE_TYPE_SEQWRITE_REQ) return 0; - wp_block = fdev->start_blk + (zone->wp >> sbi->log_sectors_per_block); + wp_block = fdev->start_blk + (zone->wp >> log_sectors_per_block); wp_segno = GET_SEGNO(sbi, wp_block); wp_blkoff = wp_block - START_BLOCK(sbi, wp_segno); - zone_block = fdev->start_blk + (zone->start >> - sbi->log_sectors_per_block); + zone_block = fdev->start_blk + (zone->start >> log_sectors_per_block); zone_segno = GET_SEGNO(sbi, zone_block); zone_secno = GET_SEC_FROM_SEG(sbi, zone_segno); @@ -4889,24 +4910,33 @@ static int check_zone_write_pointer(struct f2fs_sb_info *sbi, } /* - * The write pointer matches with the valid blocks or - * already points to the end of the zone. + * When safely unmounted in the previous mount, we can trust write + * pointers. Otherwise, finish zones. */ - if ((last_valid_block + 1 == wp_block) || - (zone->wp == zone->start + zone->len)) - return 0; - - if (last_valid_block + 1 == zone_block) { + if (is_set_ckpt_flags(sbi, CP_UMOUNT_FLAG)) { /* - * If there is no valid block in the zone and if write pointer - * is not at zone start, reset the write pointer. + * The write pointer matches with the valid blocks or + * already points to the end of the zone. */ - f2fs_notice(sbi, - "Zone without valid block has non-zero write " - "pointer. Reset the write pointer: wp[0x%x,0x%x]", - wp_segno, wp_blkoff); + if ((last_valid_block + 1 == wp_block) || + (zone->wp == zone->start + zone->len)) + return 0; + } + + if (last_valid_block + 1 == zone_block) { + if (is_set_ckpt_flags(sbi, CP_UMOUNT_FLAG)) { + /* + * If there is no valid block in the zone and if write + * pointer is not at zone start, reset the write + * pointer. + */ + f2fs_notice(sbi, + "Zone without valid block has non-zero write " + "pointer. Reset the write pointer: wp[0x%x,0x%x]", + wp_segno, wp_blkoff); + } ret = __f2fs_issue_discard_zone(sbi, fdev->bdev, zone_block, - zone->len >> sbi->log_sectors_per_block); + zone->len >> log_sectors_per_block); if (ret) f2fs_err(sbi, "Discard zone failed: %s (errno=%d)", fdev->path, ret); @@ -4914,25 +4944,34 @@ static int check_zone_write_pointer(struct f2fs_sb_info *sbi, return ret; } - /* - * If there are valid blocks and the write pointer doesn't - * match with them, we need to report the inconsistency and - * fill the zone till the end to close the zone. This inconsistency - * does not cause write error because the zone will not be selected - * for write operation until it get discarded. - */ - f2fs_notice(sbi, "Valid blocks are not aligned with write pointer: " - "valid block[0x%x,0x%x] wp[0x%x,0x%x]", - GET_SEGNO(sbi, last_valid_block), - GET_BLKOFF_FROM_SEG0(sbi, last_valid_block), - wp_segno, wp_blkoff); - - ret = blkdev_issue_zeroout(fdev->bdev, zone->wp, - zone->len - (zone->wp - zone->start), - GFP_NOFS, 0); - if (ret) - f2fs_err(sbi, "Fill up zone failed: %s (errno=%d)", - fdev->path, ret); + if (is_set_ckpt_flags(sbi, CP_UMOUNT_FLAG)) { + /* + * If there are valid blocks and the write pointer doesn't match + * with them, we need to report the inconsistency and fill + * the zone till the end to close the zone. This inconsistency + * does not cause write error because the zone will not be + * selected for write operation until it get discarded. + */ + f2fs_notice(sbi, "Valid blocks are not aligned with write " + "pointer: valid block[0x%x,0x%x] wp[0x%x,0x%x]", + GET_SEGNO(sbi, last_valid_block), + GET_BLKOFF_FROM_SEG0(sbi, last_valid_block), + wp_segno, wp_blkoff); + } + + ret = blkdev_zone_mgmt(fdev->bdev, REQ_OP_ZONE_FINISH, + zone->start, zone->len, GFP_NOFS); + if (ret == -EOPNOTSUPP) { + ret = blkdev_issue_zeroout(fdev->bdev, zone->wp, + zone->len - (zone->wp - zone->start), + GFP_NOFS, 0); + if (ret) + f2fs_err(sbi, "Fill up zone failed: %s (errno=%d)", + fdev->path, ret); + } else if (ret) { + f2fs_err(sbi, "Finishing zone failed: %s (errno=%d)", + fdev->path, ret); + } return ret; } @@ -4967,6 +5006,7 @@ static int fix_curseg_write_pointer(struct f2fs_sb_info *sbi, int type) struct blk_zone zone; unsigned int cs_section, wp_segno, wp_blkoff, wp_sector_off; block_t cs_zone_block, wp_block; + unsigned int log_sectors_per_block = sbi->log_blocksize - SECTOR_SHIFT; sector_t zone_sector; int err; @@ -4978,8 +5018,8 @@ static int fix_curseg_write_pointer(struct f2fs_sb_info *sbi, int type) return 0; /* report zone for the sector the curseg points to */ - zone_sector = (sector_t)(cs_zone_block - zbd->start_blk) << - sbi->log_sectors_per_block; + zone_sector = (sector_t)(cs_zone_block - zbd->start_blk) + << log_sectors_per_block; err = blkdev_report_zones(zbd->bdev, zone_sector, 1, report_one_zone_cb, &zone); if (err != 1) { @@ -4991,18 +5031,27 @@ static int fix_curseg_write_pointer(struct f2fs_sb_info *sbi, int type) if (zone.type != BLK_ZONE_TYPE_SEQWRITE_REQ) return 0; - wp_block = zbd->start_blk + (zone.wp >> sbi->log_sectors_per_block); - wp_segno = GET_SEGNO(sbi, wp_block); - wp_blkoff = wp_block - START_BLOCK(sbi, wp_segno); - wp_sector_off = zone.wp & GENMASK(sbi->log_sectors_per_block - 1, 0); - - if (cs->segno == wp_segno && cs->next_blkoff == wp_blkoff && - wp_sector_off == 0) - return 0; + /* + * When safely unmounted in the previous mount, we could use current + * segments. Otherwise, allocate new sections. + */ + if (is_set_ckpt_flags(sbi, CP_UMOUNT_FLAG)) { + wp_block = zbd->start_blk + (zone.wp >> log_sectors_per_block); + wp_segno = GET_SEGNO(sbi, wp_block); + wp_blkoff = wp_block - START_BLOCK(sbi, wp_segno); + wp_sector_off = zone.wp & GENMASK(log_sectors_per_block - 1, 0); + + if (cs->segno == wp_segno && cs->next_blkoff == wp_blkoff && + wp_sector_off == 0) + return 0; - f2fs_notice(sbi, "Unaligned curseg[%d] with write pointer: " - "curseg[0x%x,0x%x] wp[0x%x,0x%x]", - type, cs->segno, cs->next_blkoff, wp_segno, wp_blkoff); + f2fs_notice(sbi, "Unaligned curseg[%d] with write pointer: " + "curseg[0x%x,0x%x] wp[0x%x,0x%x]", type, cs->segno, + cs->next_blkoff, wp_segno, wp_blkoff); + } else { + f2fs_notice(sbi, "Not successfully unmounted in the previous " + "mount"); + } f2fs_notice(sbi, "Assign new section to curseg[%d]: " "curseg[0x%x,0x%x]", type, cs->segno, cs->next_blkoff); @@ -5021,8 +5070,8 @@ static int fix_curseg_write_pointer(struct f2fs_sb_info *sbi, int type) if (!zbd) return 0; - zone_sector = (sector_t)(cs_zone_block - zbd->start_blk) << - sbi->log_sectors_per_block; + zone_sector = (sector_t)(cs_zone_block - zbd->start_blk) + << log_sectors_per_block; err = blkdev_report_zones(zbd->bdev, zone_sector, 1, report_one_zone_cb, &zone); if (err != 1) { @@ -5040,7 +5089,7 @@ static int fix_curseg_write_pointer(struct f2fs_sb_info *sbi, int type) "Reset the zone: curseg[0x%x,0x%x]", type, cs->segno, cs->next_blkoff); err = __f2fs_issue_discard_zone(sbi, zbd->bdev, cs_zone_block, - zone.len >> sbi->log_sectors_per_block); + zone.len >> log_sectors_per_block); if (err) { f2fs_err(sbi, "Discard zone failed: %s (errno=%d)", zbd->path, err); diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index 2ca8fb5d0dc4..8129be788bd5 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -108,11 +108,11 @@ static inline void sanity_check_seg_type(struct f2fs_sb_info *sbi, ((sbi)->segs_per_sec - ((sbi)->unusable_blocks_per_sec >>\ (sbi)->log_blocks_per_seg)) #define GET_SEC_FROM_SEG(sbi, segno) \ - (((segno) == -1) ? -1: (segno) / (sbi)->segs_per_sec) + (((segno) == -1) ? -1 : (segno) / (sbi)->segs_per_sec) #define GET_SEG_FROM_SEC(sbi, secno) \ ((secno) * (sbi)->segs_per_sec) #define GET_ZONE_FROM_SEC(sbi, secno) \ - (((secno) == -1) ? -1: (secno) / (sbi)->secs_per_zone) + (((secno) == -1) ? -1 : (secno) / (sbi)->secs_per_zone) #define GET_ZONE_FROM_SEG(sbi, segno) \ GET_ZONE_FROM_SEC(sbi, GET_SEC_FROM_SEG(sbi, segno)) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index ca31163da00a..033af907c3b1 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -83,11 +83,26 @@ void f2fs_build_fault_attr(struct f2fs_sb_info *sbi, unsigned int rate, #endif /* f2fs-wide shrinker description */ -static struct shrinker f2fs_shrinker_info = { - .scan_objects = f2fs_shrink_scan, - .count_objects = f2fs_shrink_count, - .seeks = DEFAULT_SEEKS, -}; +static struct shrinker *f2fs_shrinker_info; + +static int __init f2fs_init_shrinker(void) +{ + f2fs_shrinker_info = shrinker_alloc(0, "f2fs-shrinker"); + if (!f2fs_shrinker_info) + return -ENOMEM; + + f2fs_shrinker_info->count_objects = f2fs_shrink_count; + f2fs_shrinker_info->scan_objects = f2fs_shrink_scan; + + shrinker_register(f2fs_shrinker_info); + + return 0; +} + +static void f2fs_exit_shrinker(void) +{ + shrinker_free(f2fs_shrinker_info); +} enum { Opt_gc_background, @@ -547,6 +562,29 @@ static int f2fs_set_test_dummy_encryption(struct super_block *sb, } #ifdef CONFIG_F2FS_FS_COMPRESSION +static bool is_compress_extension_exist(struct f2fs_sb_info *sbi, + const char *new_ext, bool is_ext) +{ + unsigned char (*ext)[F2FS_EXTENSION_LEN]; + int ext_cnt; + int i; + + if (is_ext) { + ext = F2FS_OPTION(sbi).extensions; + ext_cnt = F2FS_OPTION(sbi).compress_ext_cnt; + } else { + ext = F2FS_OPTION(sbi).noextensions; + ext_cnt = F2FS_OPTION(sbi).nocompress_ext_cnt; + } + + for (i = 0; i < ext_cnt; i++) { + if (!strcasecmp(new_ext, ext[i])) + return true; + } + + return false; +} + /* * 1. The same extension name cannot not appear in both compress and non-compress extension * at the same time. @@ -591,7 +629,7 @@ static int f2fs_set_lz4hc_level(struct f2fs_sb_info *sbi, const char *str) unsigned int level; if (strlen(str) == 3) { - F2FS_OPTION(sbi).compress_level = LZ4HC_DEFAULT_CLEVEL; + F2FS_OPTION(sbi).compress_level = 0; return 0; } @@ -862,11 +900,6 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount) if (!name) return -ENOMEM; if (!strcmp(name, "adaptive")) { - if (f2fs_sb_has_blkzoned(sbi)) { - f2fs_warn(sbi, "adaptive mode is not allowed with zoned block device feature"); - kfree(name); - return -EINVAL; - } F2FS_OPTION(sbi).fs_mode = FS_MODE_ADAPTIVE; } else if (!strcmp(name, "lfs")) { F2FS_OPTION(sbi).fs_mode = FS_MODE_LFS; @@ -1154,6 +1187,11 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount) return -EINVAL; } + if (is_compress_extension_exist(sbi, name, true)) { + kfree(name); + break; + } + strcpy(ext[ext_cnt], name); F2FS_OPTION(sbi).compress_ext_cnt++; kfree(name); @@ -1178,6 +1216,11 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount) return -EINVAL; } + if (is_compress_extension_exist(sbi, name, false)) { + kfree(name); + break; + } + strcpy(noext[noext_cnt], name); F2FS_OPTION(sbi).nocompress_ext_cnt++; kfree(name); @@ -1331,6 +1374,11 @@ default_check: F2FS_OPTION(sbi).discard_unit = DISCARD_UNIT_SECTION; } + + if (F2FS_OPTION(sbi).fs_mode != FS_MODE_LFS) { + f2fs_info(sbi, "Only lfs mode is allowed with zoned block device feature"); + return -EINVAL; + } #else f2fs_err(sbi, "Zoned block device support is not enabled"); return -EINVAL; @@ -1561,7 +1609,8 @@ static void destroy_device_list(struct f2fs_sb_info *sbi) int i; for (i = 0; i < sbi->s_ndevs; i++) { - blkdev_put(FDEV(i).bdev, sbi->sb->s_type); + if (i > 0) + bdev_release(FDEV(i).bdev_handle); #ifdef CONFIG_BLK_DEV_ZONED kvfree(FDEV(i).blkz_seq); #endif @@ -1600,6 +1649,7 @@ static void f2fs_put_super(struct super_block *sb) struct cp_control cpc = { .reason = CP_UMOUNT, }; + stat_inc_cp_call_count(sbi, TOTAL_CALL); err = f2fs_write_checkpoint(sbi, &cpc); } @@ -1609,6 +1659,7 @@ static void f2fs_put_super(struct super_block *sb) struct cp_control cpc = { .reason = CP_UMOUNT | CP_TRIMMED, }; + stat_inc_cp_call_count(sbi, TOTAL_CALL); err = f2fs_write_checkpoint(sbi, &cpc); } @@ -1626,7 +1677,7 @@ static void f2fs_put_super(struct super_block *sb) f2fs_wait_on_all_pages(sbi, F2FS_WB_CP_DATA); - if (err) { + if (err || f2fs_cp_error(sbi)) { truncate_inode_pages_final(NODE_MAPPING(sbi)); truncate_inode_pages_final(META_MAPPING(sbi)); } @@ -1705,8 +1756,10 @@ int f2fs_sync_fs(struct super_block *sb, int sync) if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING))) return -EAGAIN; - if (sync) + if (sync) { + stat_inc_cp_call_count(sbi, TOTAL_CALL); err = f2fs_issue_checkpoint(sbi); + } return err; } @@ -2205,6 +2258,7 @@ static int f2fs_disable_checkpoint(struct f2fs_sb_info *sbi) .nr_free_secs = 1 }; f2fs_down_write(&sbi->gc_lock); + stat_inc_gc_call_count(sbi, FOREGROUND); err = f2fs_gc(sbi, &gc_control); if (err == -ENODATA) { err = 0; @@ -2230,6 +2284,7 @@ skip_gc: f2fs_down_write(&sbi->gc_lock); cpc.reason = CP_PAUSE; set_sbi_flag(sbi, SBI_CP_DISABLED); + stat_inc_cp_call_count(sbi, TOTAL_CALL); err = f2fs_write_checkpoint(sbi, &cpc); if (err) goto out_unlock; @@ -2279,9 +2334,9 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) unsigned long old_sb_flags; int err; bool need_restart_gc = false, need_stop_gc = false; - bool need_restart_ckpt = false, need_stop_ckpt = false; bool need_restart_flush = false, need_stop_flush = false; bool need_restart_discard = false, need_stop_discard = false; + bool need_enable_checkpoint = false, need_disable_checkpoint = false; bool no_read_extent_cache = !test_opt(sbi, READ_EXTENT_CACHE); bool no_age_extent_cache = !test_opt(sbi, AGE_EXTENT_CACHE); bool enable_checkpoint = !test_opt(sbi, DISABLE_CHECKPOINT); @@ -2445,24 +2500,6 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) clear_sbi_flag(sbi, SBI_IS_CLOSE); } - if ((*flags & SB_RDONLY) || test_opt(sbi, DISABLE_CHECKPOINT) || - !test_opt(sbi, MERGE_CHECKPOINT)) { - f2fs_stop_ckpt_thread(sbi); - need_restart_ckpt = true; - } else { - /* Flush if the prevous checkpoint, if exists. */ - f2fs_flush_ckpt_thread(sbi); - - err = f2fs_start_ckpt_thread(sbi); - if (err) { - f2fs_err(sbi, - "Failed to start F2FS issue_checkpoint_thread (%d)", - err); - goto restore_gc; - } - need_stop_ckpt = true; - } - /* * We stop issue flush thread if FS is mounted as RO * or if flush_merge is not passed in mount option. @@ -2474,7 +2511,7 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) } else { err = f2fs_create_flush_cmd_control(sbi); if (err) - goto restore_ckpt; + goto restore_gc; need_stop_flush = true; } @@ -2496,8 +2533,31 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) err = f2fs_disable_checkpoint(sbi); if (err) goto restore_discard; + need_enable_checkpoint = true; } else { f2fs_enable_checkpoint(sbi); + need_disable_checkpoint = true; + } + } + + /* + * Place this routine at the end, since a new checkpoint would be + * triggered while remount and we need to take care of it before + * returning from remount. + */ + if ((*flags & SB_RDONLY) || test_opt(sbi, DISABLE_CHECKPOINT) || + !test_opt(sbi, MERGE_CHECKPOINT)) { + f2fs_stop_ckpt_thread(sbi); + } else { + /* Flush if the prevous checkpoint, if exists. */ + f2fs_flush_ckpt_thread(sbi); + + err = f2fs_start_ckpt_thread(sbi); + if (err) { + f2fs_err(sbi, + "Failed to start F2FS issue_checkpoint_thread (%d)", + err); + goto restore_checkpoint; } } @@ -2515,6 +2575,13 @@ skip: adjust_unusable_cap_perc(sbi); *flags = (*flags & ~SB_LAZYTIME) | (sb->s_flags & SB_LAZYTIME); return 0; +restore_checkpoint: + if (need_enable_checkpoint) { + f2fs_enable_checkpoint(sbi); + } else if (need_disable_checkpoint) { + if (f2fs_disable_checkpoint(sbi)) + f2fs_warn(sbi, "checkpoint has not been disabled"); + } restore_discard: if (need_restart_discard) { if (f2fs_start_discard_thread(sbi)) @@ -2530,13 +2597,6 @@ restore_flush: clear_opt(sbi, FLUSH_MERGE); f2fs_destroy_flush_cmd_control(sbi, false); } -restore_ckpt: - if (need_restart_ckpt) { - if (f2fs_start_ckpt_thread(sbi)) - f2fs_warn(sbi, "background ckpt thread has stopped"); - } else if (need_stop_ckpt) { - f2fs_stop_ckpt_thread(sbi); - } restore_gc: if (need_restart_gc) { if (f2fs_start_gc_thread(sbi)) @@ -2703,7 +2763,7 @@ retry: if (len == towrite) return err; - inode->i_mtime = inode->i_ctime = current_time(inode); + inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); f2fs_mark_inode_dirty_sync(inode, false); return len - towrite; } @@ -3196,13 +3256,6 @@ static bool f2fs_has_stable_inodes(struct super_block *sb) return true; } -static void f2fs_get_ino_and_lblk_bits(struct super_block *sb, - int *ino_bits_ret, int *lblk_bits_ret) -{ - *ino_bits_ret = 8 * sizeof(nid_t); - *lblk_bits_ret = 8 * sizeof(block_t); -} - static struct block_device **f2fs_get_devices(struct super_block *sb, unsigned int *num_devs) { @@ -3224,13 +3277,15 @@ static struct block_device **f2fs_get_devices(struct super_block *sb, } static const struct fscrypt_operations f2fs_cryptops = { - .key_prefix = "f2fs:", + .needs_bounce_pages = 1, + .has_32bit_inodes = 1, + .supports_subblock_data_units = 1, + .legacy_key_prefix = "f2fs:", .get_context = f2fs_get_context, .set_context = f2fs_set_context, .get_dummy_policy = f2fs_get_dummy_policy, .empty_dir = f2fs_empty_dir, .has_stable_inodes = f2fs_has_stable_inodes, - .get_ino_and_lblk_bits = f2fs_get_ino_and_lblk_bits, .get_devices = f2fs_get_devices, }; #endif @@ -3275,6 +3330,7 @@ static struct dentry *f2fs_fh_to_parent(struct super_block *sb, struct fid *fid, } static const struct export_operations f2fs_export_ops = { + .encode_fh = generic_encode_ino32_fh, .fh_to_dentry = f2fs_fh_to_dentry, .fh_to_parent = f2fs_fh_to_parent, .get_parent = f2fs_get_parent, @@ -3462,7 +3518,7 @@ static int sanity_check_raw_super(struct f2fs_sb_info *sbi, return -EFSCORRUPTED; } - /* Currently, support 512/1024/2048/4096 bytes sector size */ + /* Currently, support 512/1024/2048/4096/16K bytes sector size */ if (le32_to_cpu(raw_super->log_sectorsize) > F2FS_MAX_LOG_SECTOR_SIZE || le32_to_cpu(raw_super->log_sectorsize) < @@ -4190,16 +4246,12 @@ static int f2fs_scan_devices(struct f2fs_sb_info *sbi) sbi->aligned_blksize = true; for (i = 0; i < max_devices; i++) { - - if (i > 0 && !RDEV(i).path[0]) + if (i == 0) + FDEV(0).bdev_handle = sbi->sb->s_bdev_handle; + else if (!RDEV(i).path[0]) break; - if (max_devices == 1) { - /* Single zoned block device mount */ - FDEV(0).bdev = - blkdev_get_by_dev(sbi->sb->s_bdev->bd_dev, mode, - sbi->sb->s_type, NULL); - } else { + if (max_devices > 1) { /* Multi-device mount */ memcpy(FDEV(i).path, RDEV(i).path, MAX_PATH_LEN); FDEV(i).total_segments = @@ -4215,14 +4267,14 @@ static int f2fs_scan_devices(struct f2fs_sb_info *sbi) FDEV(i).end_blk = FDEV(i).start_blk + (FDEV(i).total_segments << sbi->log_blocks_per_seg) - 1; + FDEV(i).bdev_handle = bdev_open_by_path( + FDEV(i).path, mode, sbi->sb, NULL); } - FDEV(i).bdev = blkdev_get_by_path(FDEV(i).path, mode, - sbi->sb->s_type, - NULL); } - if (IS_ERR(FDEV(i).bdev)) - return PTR_ERR(FDEV(i).bdev); + if (IS_ERR(FDEV(i).bdev_handle)) + return PTR_ERR(FDEV(i).bdev_handle); + FDEV(i).bdev = FDEV(i).bdev_handle->bdev; /* to release errored devices */ sbi->s_ndevs = i + 1; @@ -4871,6 +4923,7 @@ static void kill_f2fs_super(struct super_block *sb) struct cp_control cpc = { .reason = CP_UMOUNT, }; + stat_inc_cp_call_count(sbi, TOTAL_CALL); f2fs_write_checkpoint(sbi, &cpc); } @@ -4912,7 +4965,7 @@ static int __init init_f2fs_fs(void) int err; if (PAGE_SIZE != F2FS_BLKSIZE) { - printk("F2FS not supported on PAGE_SIZE(%lu) != %d\n", + printk("F2FS not supported on PAGE_SIZE(%lu) != BLOCK_SIZE(%lu)\n", PAGE_SIZE, F2FS_BLKSIZE); return -EINVAL; } @@ -4941,7 +4994,7 @@ static int __init init_f2fs_fs(void) err = f2fs_init_sysfs(); if (err) goto free_garbage_collection_cache; - err = register_shrinker(&f2fs_shrinker_info, "f2fs-shrinker"); + err = f2fs_init_shrinker(); if (err) goto free_sysfs; err = register_filesystem(&f2fs_fs_type); @@ -4986,7 +5039,7 @@ free_root_stats: f2fs_destroy_root_stats(); unregister_filesystem(&f2fs_fs_type); free_shrinker: - unregister_shrinker(&f2fs_shrinker_info); + f2fs_exit_shrinker(); free_sysfs: f2fs_exit_sysfs(); free_garbage_collection_cache: @@ -5018,7 +5071,7 @@ static void __exit exit_f2fs_fs(void) f2fs_destroy_post_read_processing(); f2fs_destroy_root_stats(); unregister_filesystem(&f2fs_fs_type); - unregister_shrinker(&f2fs_shrinker_info); + f2fs_exit_shrinker(); f2fs_exit_sysfs(); f2fs_destroy_garbage_collection_cache(); f2fs_destroy_extent_cache(); diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index 48b7e0073884..417fae96890f 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -356,6 +356,16 @@ static ssize_t f2fs_sbi_show(struct f2fs_attr *a, if (!strcmp(a->attr.name, "revoked_atomic_block")) return sysfs_emit(buf, "%llu\n", sbi->revoked_atomic_block); +#ifdef CONFIG_F2FS_STAT_FS + if (!strcmp(a->attr.name, "cp_foreground_calls")) + return sysfs_emit(buf, "%d\n", + atomic_read(&sbi->cp_call_count[TOTAL_CALL]) - + atomic_read(&sbi->cp_call_count[BACKGROUND])); + if (!strcmp(a->attr.name, "cp_background_calls")) + return sysfs_emit(buf, "%d\n", + atomic_read(&sbi->cp_call_count[BACKGROUND])); +#endif + ui = (unsigned int *)(ptr + a->offset); return sysfs_emit(buf, "%u\n", *ui); @@ -972,10 +982,10 @@ F2FS_SBI_GENERAL_RO_ATTR(unusable_blocks_per_sec); /* STAT_INFO ATTR */ #ifdef CONFIG_F2FS_STAT_FS -STAT_INFO_RO_ATTR(cp_foreground_calls, cp_count); -STAT_INFO_RO_ATTR(cp_background_calls, bg_cp_count); -STAT_INFO_RO_ATTR(gc_foreground_calls, call_count); -STAT_INFO_RO_ATTR(gc_background_calls, bg_gc); +STAT_INFO_RO_ATTR(cp_foreground_calls, cp_call_count[FOREGROUND]); +STAT_INFO_RO_ATTR(cp_background_calls, cp_call_count[BACKGROUND]); +STAT_INFO_RO_ATTR(gc_foreground_calls, gc_call_count[FOREGROUND]); +STAT_INFO_RO_ATTR(gc_background_calls, gc_call_count[BACKGROUND]); #endif /* FAULT_INFO ATTR */ diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c index 476b186b90a6..47e88b4d4e7d 100644 --- a/fs/f2fs/xattr.c +++ b/fs/f2fs/xattr.c @@ -189,7 +189,7 @@ const struct xattr_handler f2fs_xattr_security_handler = { .set = f2fs_xattr_generic_set, }; -static const struct xattr_handler *f2fs_xattr_handler_map[] = { +static const struct xattr_handler * const f2fs_xattr_handler_map[] = { [F2FS_XATTR_INDEX_USER] = &f2fs_xattr_user_handler, #ifdef CONFIG_F2FS_FS_POSIX_ACL [F2FS_XATTR_INDEX_POSIX_ACL_ACCESS] = &nop_posix_acl_access, @@ -202,7 +202,7 @@ static const struct xattr_handler *f2fs_xattr_handler_map[] = { [F2FS_XATTR_INDEX_ADVISE] = &f2fs_xattr_advise_handler, }; -const struct xattr_handler *f2fs_xattr_handlers[] = { +const struct xattr_handler * const f2fs_xattr_handlers[] = { &f2fs_xattr_user_handler, &f2fs_xattr_trusted_handler, #ifdef CONFIG_F2FS_FS_SECURITY @@ -364,10 +364,10 @@ static int lookup_all_xattrs(struct inode *inode, struct page *ipage, *xe = __find_xattr(cur_addr, last_txattr_addr, NULL, index, len, name); if (!*xe) { - f2fs_err(F2FS_I_SB(inode), "inode (%lu) has corrupted xattr", + f2fs_err(F2FS_I_SB(inode), "lookup inode (%lu) has corrupted xattr", inode->i_ino); set_sbi_flag(F2FS_I_SB(inode), SBI_NEED_FSCK); - err = -EFSCORRUPTED; + err = -ENODATA; f2fs_handle_error(F2FS_I_SB(inode), ERROR_CORRUPTED_XATTR); goto out; @@ -584,13 +584,12 @@ ssize_t f2fs_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size) if ((void *)(entry) + sizeof(__u32) > last_base_addr || (void *)XATTR_NEXT_ENTRY(entry) > last_base_addr) { - f2fs_err(F2FS_I_SB(inode), "inode (%lu) has corrupted xattr", + f2fs_err(F2FS_I_SB(inode), "list inode (%lu) has corrupted xattr", inode->i_ino); set_sbi_flag(F2FS_I_SB(inode), SBI_NEED_FSCK); - error = -EFSCORRUPTED; f2fs_handle_error(F2FS_I_SB(inode), ERROR_CORRUPTED_XATTR); - goto cleanup; + break; } if (!prefix) @@ -650,7 +649,7 @@ static int __f2fs_setxattr(struct inode *inode, int index, if (size > MAX_VALUE_LEN(inode)) return -E2BIG; - +retry: error = read_all_xattrs(inode, ipage, &base_addr); if (error) return error; @@ -660,7 +659,14 @@ static int __f2fs_setxattr(struct inode *inode, int index, /* find entry with wanted name. */ here = __find_xattr(base_addr, last_base_addr, NULL, index, len, name); if (!here) { - f2fs_err(F2FS_I_SB(inode), "inode (%lu) has corrupted xattr", + if (!F2FS_I(inode)->i_xattr_nid) { + f2fs_notice(F2FS_I_SB(inode), + "recover xattr in inode (%lu)", inode->i_ino); + f2fs_recover_xattr_data(inode, NULL); + kfree(base_addr); + goto retry; + } + f2fs_err(F2FS_I_SB(inode), "set inode (%lu) has corrupted xattr", inode->i_ino); set_sbi_flag(F2FS_I_SB(inode), SBI_NEED_FSCK); error = -EFSCORRUPTED; @@ -757,17 +763,17 @@ static int __f2fs_setxattr(struct inode *inode, int index, if (index == F2FS_XATTR_INDEX_ENCRYPTION && !strcmp(name, F2FS_XATTR_NAME_ENCRYPTION_CONTEXT)) f2fs_set_encrypted_inode(inode); - f2fs_mark_inode_dirty_sync(inode, true); - if (!error && S_ISDIR(inode->i_mode)) + if (S_ISDIR(inode->i_mode)) set_sbi_flag(F2FS_I_SB(inode), SBI_NEED_CP); same: if (is_inode_flag_set(inode, FI_ACL_MODE)) { inode->i_mode = F2FS_I(inode)->i_acl_mode; - inode->i_ctime = current_time(inode); clear_inode_flag(inode, FI_ACL_MODE); } + inode_set_ctime_current(inode); + f2fs_mark_inode_dirty_sync(inode, true); exit: kfree(base_addr); return error; diff --git a/fs/f2fs/xattr.h b/fs/f2fs/xattr.h index b1811c392e6f..a005ffdcf717 100644 --- a/fs/f2fs/xattr.h +++ b/fs/f2fs/xattr.h @@ -125,7 +125,7 @@ extern const struct xattr_handler f2fs_xattr_trusted_handler; extern const struct xattr_handler f2fs_xattr_advise_handler; extern const struct xattr_handler f2fs_xattr_security_handler; -extern const struct xattr_handler *f2fs_xattr_handlers[]; +extern const struct xattr_handler * const f2fs_xattr_handlers[]; extern int f2fs_setxattr(struct inode *, int, const char *, const void *, size_t, struct page *, int); diff --git a/fs/fat/Kconfig b/fs/fat/Kconfig index afe83b4e7172..25fae1c83725 100644 --- a/fs/fat/Kconfig +++ b/fs/fat/Kconfig @@ -1,6 +1,7 @@ # SPDX-License-Identifier: GPL-2.0-only config FAT_FS tristate + select BUFFER_HEAD select NLS select LEGACY_DIRECT_IO help diff --git a/fs/fat/fat.h b/fs/fat/fat.h index e3b690b48e3e..66cf4778cf3b 100644 --- a/fs/fat/fat.h +++ b/fs/fat/fat.h @@ -460,8 +460,7 @@ extern struct timespec64 fat_truncate_mtime(const struct msdos_sb_info *sbi, const struct timespec64 *ts); extern int fat_truncate_time(struct inode *inode, struct timespec64 *now, int flags); -extern int fat_update_time(struct inode *inode, struct timespec64 *now, - int flags); +extern int fat_update_time(struct inode *inode, int flags); extern int fat_sync_bhs(struct buffer_head **bhs, int nr_bhs); int fat_cache_init(void); diff --git a/fs/fat/file.c b/fs/fat/file.c index 456477946dd9..e887e9ab7472 100644 --- a/fs/fat/file.c +++ b/fs/fat/file.c @@ -401,7 +401,7 @@ int fat_getattr(struct mnt_idmap *idmap, const struct path *path, struct inode *inode = d_inode(path->dentry); struct msdos_sb_info *sbi = MSDOS_SB(inode->i_sb); - generic_fillattr(idmap, inode, stat); + generic_fillattr(idmap, request_mask, inode, stat); stat->blksize = sbi->cluster_size; if (sbi->options.nfs == FAT_NFS_NOSTALE_RO) { diff --git a/fs/fat/inode.c b/fs/fat/inode.c index d99b8549ec8f..1fac3dabf130 100644 --- a/fs/fat/inode.c +++ b/fs/fat/inode.c @@ -512,6 +512,7 @@ static int fat_validate_dir(struct inode *dir) int fat_fill_inode(struct inode *inode, struct msdos_dir_entry *de) { struct msdos_sb_info *sbi = MSDOS_SB(inode->i_sb); + struct timespec64 mtime; int error; MSDOS_I(inode)->i_pos = 0; @@ -561,14 +562,18 @@ int fat_fill_inode(struct inode *inode, struct msdos_dir_entry *de) inode->i_blocks = ((inode->i_size + (sbi->cluster_size - 1)) & ~((loff_t)sbi->cluster_size - 1)) >> 9; - fat_time_fat2unix(sbi, &inode->i_mtime, de->time, de->date, 0); - inode->i_ctime = inode->i_mtime; + fat_time_fat2unix(sbi, &mtime, de->time, de->date, 0); + inode_set_mtime_to_ts(inode, mtime); + inode_set_ctime_to_ts(inode, mtime); if (sbi->options.isvfat) { - fat_time_fat2unix(sbi, &inode->i_atime, 0, de->adate, 0); + struct timespec64 atime; + + fat_time_fat2unix(sbi, &atime, 0, de->adate, 0); + inode_set_atime_to_ts(inode, atime); fat_time_fat2unix(sbi, &MSDOS_I(inode)->i_crtime, de->ctime, de->cdate, de->ctime_cs); } else - inode->i_atime = fat_truncate_atime(sbi, &inode->i_mtime); + inode_set_atime_to_ts(inode, fat_truncate_atime(sbi, &mtime)); return 0; } @@ -849,6 +854,7 @@ static int __fat_write_inode(struct inode *inode, int wait) struct msdos_sb_info *sbi = MSDOS_SB(sb); struct buffer_head *bh; struct msdos_dir_entry *raw_entry; + struct timespec64 mtime; loff_t i_pos; sector_t blocknr; int err, offset; @@ -882,12 +888,14 @@ retry: raw_entry->size = cpu_to_le32(inode->i_size); raw_entry->attr = fat_make_attrs(inode); fat_set_start(raw_entry, MSDOS_I(inode)->i_logstart); - fat_time_unix2fat(sbi, &inode->i_mtime, &raw_entry->time, + mtime = inode_get_mtime(inode); + fat_time_unix2fat(sbi, &mtime, &raw_entry->time, &raw_entry->date, NULL); if (sbi->options.isvfat) { + struct timespec64 ts = inode_get_atime(inode); __le16 atime; - fat_time_unix2fat(sbi, &inode->i_atime, &atime, - &raw_entry->adate, NULL); + + fat_time_unix2fat(sbi, &ts, &atime, &raw_entry->adate, NULL); fat_time_unix2fat(sbi, &MSDOS_I(inode)->i_crtime, &raw_entry->ctime, &raw_entry->cdate, &raw_entry->ctime_cs); } @@ -1407,8 +1415,8 @@ static int fat_read_root(struct inode *inode) MSDOS_I(inode)->mmu_private = inode->i_size; fat_save_attrs(inode, ATTR_DIR); - inode->i_mtime.tv_sec = inode->i_atime.tv_sec = inode->i_ctime.tv_sec = 0; - inode->i_mtime.tv_nsec = inode->i_atime.tv_nsec = inode->i_ctime.tv_nsec = 0; + inode_set_mtime_to_ts(inode, + inode_set_atime_to_ts(inode, inode_set_ctime(inode, 0, 0))); set_nlink(inode, fat_subdirs(inode)+2); return 0; diff --git a/fs/fat/misc.c b/fs/fat/misc.c index 7e5d6ae305f2..c7a2d27120ba 100644 --- a/fs/fat/misc.c +++ b/fs/fat/misc.c @@ -325,20 +325,21 @@ int fat_truncate_time(struct inode *inode, struct timespec64 *now, int flags) } if (flags & S_ATIME) - inode->i_atime = fat_truncate_atime(sbi, now); + inode_set_atime_to_ts(inode, fat_truncate_atime(sbi, now)); /* * ctime and mtime share the same on-disk field, and should be * identical in memory. all mtime updates will be applied to ctime, * but ctime updates are ignored. */ if (flags & S_MTIME) - inode->i_mtime = inode->i_ctime = fat_truncate_mtime(sbi, now); + inode_set_mtime_to_ts(inode, + inode_set_ctime_to_ts(inode, fat_truncate_mtime(sbi, now))); return 0; } EXPORT_SYMBOL_GPL(fat_truncate_time); -int fat_update_time(struct inode *inode, struct timespec64 *now, int flags) +int fat_update_time(struct inode *inode, int flags) { int dirty_flags = 0; @@ -346,16 +347,13 @@ int fat_update_time(struct inode *inode, struct timespec64 *now, int flags) return 0; if (flags & (S_ATIME | S_CTIME | S_MTIME)) { - fat_truncate_time(inode, now, flags); + fat_truncate_time(inode, NULL, flags); if (inode->i_sb->s_flags & SB_LAZYTIME) dirty_flags |= I_DIRTY_TIME; else dirty_flags |= I_DIRTY_SYNC; } - if ((flags & S_VERSION) && inode_maybe_inc_iversion(inode, false)) - dirty_flags |= I_DIRTY_SYNC; - __mark_inode_dirty(inode, dirty_flags); return 0; } diff --git a/fs/fat/nfs.c b/fs/fat/nfs.c index 3626eb585a98..c52e63e10d35 100644 --- a/fs/fat/nfs.c +++ b/fs/fat/nfs.c @@ -279,6 +279,7 @@ static struct dentry *fat_get_parent(struct dentry *child_dir) } const struct export_operations fat_export_ops = { + .encode_fh = generic_encode_ino32_fh, .fh_to_dentry = fat_fh_to_dentry, .fh_to_parent = fat_fh_to_parent, .get_parent = fat_get_parent, diff --git a/fs/fcntl.c b/fs/fcntl.c index b622be119706..c80a6acad742 100644 --- a/fs/fcntl.c +++ b/fs/fcntl.c @@ -34,7 +34,7 @@ #define SETFL_MASK (O_APPEND | O_NONBLOCK | O_NDELAY | O_DIRECT | O_NOATIME) -static int setfl(int fd, struct file * filp, unsigned long arg) +static int setfl(int fd, struct file * filp, unsigned int arg) { struct inode * inode = file_inode(filp); int error = 0; @@ -112,11 +112,11 @@ void __f_setown(struct file *filp, struct pid *pid, enum pid_type type, } EXPORT_SYMBOL(__f_setown); -int f_setown(struct file *filp, unsigned long arg, int force) +int f_setown(struct file *filp, int who, int force) { enum pid_type type; struct pid *pid = NULL; - int who = arg, ret = 0; + int ret = 0; type = PIDTYPE_TGID; if (who < 0) { @@ -317,28 +317,29 @@ static long do_fcntl(int fd, unsigned int cmd, unsigned long arg, struct file *filp) { void __user *argp = (void __user *)arg; + int argi = (int)arg; struct flock flock; long err = -EINVAL; switch (cmd) { case F_DUPFD: - err = f_dupfd(arg, filp, 0); + err = f_dupfd(argi, filp, 0); break; case F_DUPFD_CLOEXEC: - err = f_dupfd(arg, filp, O_CLOEXEC); + err = f_dupfd(argi, filp, O_CLOEXEC); break; case F_GETFD: err = get_close_on_exec(fd) ? FD_CLOEXEC : 0; break; case F_SETFD: err = 0; - set_close_on_exec(fd, arg & FD_CLOEXEC); + set_close_on_exec(fd, argi & FD_CLOEXEC); break; case F_GETFL: err = filp->f_flags; break; case F_SETFL: - err = setfl(fd, filp, arg); + err = setfl(fd, filp, argi); break; #if BITS_PER_LONG != 32 /* 32-bit arches must use fcntl64() */ @@ -375,7 +376,7 @@ static long do_fcntl(int fd, unsigned int cmd, unsigned long arg, force_successful_syscall_return(); break; case F_SETOWN: - err = f_setown(filp, arg, 1); + err = f_setown(filp, argi, 1); break; case F_GETOWN_EX: err = f_getown_ex(filp, arg); @@ -391,28 +392,28 @@ static long do_fcntl(int fd, unsigned int cmd, unsigned long arg, break; case F_SETSIG: /* arg == 0 restores default behaviour. */ - if (!valid_signal(arg)) { + if (!valid_signal(argi)) { break; } err = 0; - filp->f_owner.signum = arg; + filp->f_owner.signum = argi; break; case F_GETLEASE: err = fcntl_getlease(filp); break; case F_SETLEASE: - err = fcntl_setlease(fd, filp, arg); + err = fcntl_setlease(fd, filp, argi); break; case F_NOTIFY: - err = fcntl_dirnotify(fd, filp, arg); + err = fcntl_dirnotify(fd, filp, argi); break; case F_SETPIPE_SZ: case F_GETPIPE_SZ: - err = pipe_fcntl(filp, cmd, arg); + err = pipe_fcntl(filp, cmd, argi); break; case F_ADD_SEALS: case F_GET_SEALS: - err = memfd_fcntl(filp, cmd, arg); + err = memfd_fcntl(filp, cmd, argi); break; case F_GET_RW_HINT: case F_SET_RW_HINT: @@ -843,7 +844,7 @@ int send_sigurg(struct fown_struct *fown) } static DEFINE_SPINLOCK(fasync_lock); -static struct kmem_cache *fasync_cache __read_mostly; +static struct kmem_cache *fasync_cache __ro_after_init; static void fasync_free_rcu(struct rcu_head *head) { diff --git a/fs/fhandle.c b/fs/fhandle.c index 6ea8d35a9382..18b3ba8dc8ea 100644 --- a/fs/fhandle.c +++ b/fs/fhandle.c @@ -26,12 +26,8 @@ static long do_sys_name_to_handle(const struct path *path, /* * We need to make sure whether the file system support decoding of * the file handle if decodeable file handle was requested. - * Otherwise, even empty export_operations are sufficient to opt-in - * to encoding FIDs. */ - if (!path->dentry->d_sb->s_export_op || - (!(fh_flags & EXPORT_FH_FID) && - !path->dentry->d_sb->s_export_op->fh_to_dentry)) + if (!exportfs_can_encode_fh(path->dentry->d_sb->s_export_op, fh_flags)) return -EOPNOTSUPP; if (copy_from_user(&f_handle, ufh, sizeof(struct file_handle))) diff --git a/fs/file.c b/fs/file.c index 3fd003a8604f..5fb0b146e79e 100644 --- a/fs/file.c +++ b/fs/file.c @@ -604,6 +604,9 @@ void fd_install(unsigned int fd, struct file *file) struct files_struct *files = current->files; struct fdtable *fdt; + if (WARN_ON_ONCE(unlikely(file->f_mode & FMODE_BACKING))) + return; + rcu_read_lock_sched(); if (unlikely(files->resize_in_progress)) { @@ -668,7 +671,7 @@ EXPORT_SYMBOL(close_fd); /* for ksys_close() */ /** * last_fd - return last valid index into fd table - * @cur_fds: files struct + * @fdt: File descriptor table. * * Context: Either rcu read lock or files_lock must be held. * @@ -693,29 +696,30 @@ static inline void __range_cloexec(struct files_struct *cur_fds, spin_unlock(&cur_fds->file_lock); } -static inline void __range_close(struct files_struct *cur_fds, unsigned int fd, +static inline void __range_close(struct files_struct *files, unsigned int fd, unsigned int max_fd) { + struct file *file; unsigned n; - rcu_read_lock(); - n = last_fd(files_fdtable(cur_fds)); - rcu_read_unlock(); + spin_lock(&files->file_lock); + n = last_fd(files_fdtable(files)); max_fd = min(max_fd, n); - while (fd <= max_fd) { - struct file *file; - - spin_lock(&cur_fds->file_lock); - file = pick_file(cur_fds, fd++); - spin_unlock(&cur_fds->file_lock); - + for (; fd <= max_fd; fd++) { + file = pick_file(files, fd); if (file) { - /* found a valid file to close */ - filp_close(file, cur_fds); + spin_unlock(&files->file_lock); + filp_close(file, files); cond_resched(); + spin_lock(&files->file_lock); + } else if (need_resched()) { + spin_unlock(&files->file_lock); + cond_resched(); + spin_lock(&files->file_lock); } } + spin_unlock(&files->file_lock); } /** @@ -723,6 +727,7 @@ static inline void __range_close(struct files_struct *cur_fds, unsigned int fd, * * @fd: starting file descriptor to close * @max_fd: last file descriptor to close + * @flags: CLOSE_RANGE flags. * * This closes a range of file descriptors. All file descriptors * from @fd up to and including @max_fd are closed. @@ -851,8 +856,104 @@ void do_close_on_exec(struct files_struct *files) spin_unlock(&files->file_lock); } +static struct file *__get_file_rcu(struct file __rcu **f) +{ + struct file __rcu *file; + struct file __rcu *file_reloaded; + struct file __rcu *file_reloaded_cmp; + + file = rcu_dereference_raw(*f); + if (!file) + return NULL; + + if (unlikely(!atomic_long_inc_not_zero(&file->f_count))) + return ERR_PTR(-EAGAIN); + + file_reloaded = rcu_dereference_raw(*f); + + /* + * Ensure that all accesses have a dependency on the load from + * rcu_dereference_raw() above so we get correct ordering + * between reuse/allocation and the pointer check below. + */ + file_reloaded_cmp = file_reloaded; + OPTIMIZER_HIDE_VAR(file_reloaded_cmp); + + /* + * atomic_long_inc_not_zero() above provided a full memory + * barrier when we acquired a reference. + * + * This is paired with the write barrier from assigning to the + * __rcu protected file pointer so that if that pointer still + * matches the current file, we know we have successfully + * acquired a reference to the right file. + * + * If the pointers don't match the file has been reallocated by + * SLAB_TYPESAFE_BY_RCU. + */ + if (file == file_reloaded_cmp) + return file_reloaded; + + fput(file); + return ERR_PTR(-EAGAIN); +} + +/** + * get_file_rcu - try go get a reference to a file under rcu + * @f: the file to get a reference on + * + * This function tries to get a reference on @f carefully verifying that + * @f hasn't been reused. + * + * This function should rarely have to be used and only by users who + * understand the implications of SLAB_TYPESAFE_BY_RCU. Try to avoid it. + * + * Return: Returns @f with the reference count increased or NULL. + */ +struct file *get_file_rcu(struct file __rcu **f) +{ + for (;;) { + struct file __rcu *file; + + file = __get_file_rcu(f); + if (unlikely(!file)) + return NULL; + + if (unlikely(IS_ERR(file))) + continue; + + return file; + } +} +EXPORT_SYMBOL_GPL(get_file_rcu); + +/** + * get_file_active - try go get a reference to a file + * @f: the file to get a reference on + * + * In contast to get_file_rcu() the pointer itself isn't part of the + * reference counting. + * + * This function should rarely have to be used and only by users who + * understand the implications of SLAB_TYPESAFE_BY_RCU. Try to avoid it. + * + * Return: Returns @f with the reference count increased or NULL. + */ +struct file *get_file_active(struct file **f) +{ + struct file __rcu *file; + + rcu_read_lock(); + file = __get_file_rcu(f); + rcu_read_unlock(); + if (IS_ERR(file)) + file = NULL; + return file; +} +EXPORT_SYMBOL_GPL(get_file_active); + static inline struct file *__fget_files_rcu(struct files_struct *files, - unsigned int fd, fmode_t mask) + unsigned int fd, fmode_t mask) { for (;;) { struct file *file; @@ -863,12 +964,6 @@ static inline struct file *__fget_files_rcu(struct files_struct *files, return NULL; fdentry = fdt->fd + array_index_nospec(fd, fdt->max_fds); - file = rcu_dereference_raw(*fdentry); - if (unlikely(!file)) - return NULL; - - if (unlikely(file->f_mode & mask)) - return NULL; /* * Ok, we have a file pointer. However, because we do @@ -877,10 +972,15 @@ static inline struct file *__fget_files_rcu(struct files_struct *files, * * Such a race can take two forms: * - * (a) the file ref already went down to zero, - * and get_file_rcu() fails. Just try again: + * (a) the file ref already went down to zero and the + * file hasn't been reused yet or the file count + * isn't zero but the file has already been reused. */ - if (unlikely(!get_file_rcu(file))) + file = __get_file_rcu(fdentry); + if (unlikely(!file)) + return NULL; + + if (unlikely(IS_ERR(file))) continue; /* @@ -891,13 +991,21 @@ static inline struct file *__fget_files_rcu(struct files_struct *files, * * If so, we need to put our ref and try again. */ - if (unlikely(rcu_dereference_raw(files->fdt) != fdt) || - unlikely(rcu_dereference_raw(*fdentry) != file)) { + if (unlikely(rcu_dereference_raw(files->fdt) != fdt)) { fput(file); continue; } /* + * This isn't the file we're looking for or we're not + * allowed to get a reference to it. + */ + if (unlikely(file->f_mode & mask)) { + fput(file); + return NULL; + } + + /* * Ok, we have a ref to the file, and checked that it * still exists. */ @@ -946,7 +1054,14 @@ struct file *fget_task(struct task_struct *task, unsigned int fd) return file; } -struct file *task_lookup_fd_rcu(struct task_struct *task, unsigned int fd) +struct file *lookup_fdget_rcu(unsigned int fd) +{ + return __fget_files_rcu(current->files, fd, 0); + +} +EXPORT_SYMBOL_GPL(lookup_fdget_rcu); + +struct file *task_lookup_fdget_rcu(struct task_struct *task, unsigned int fd) { /* Must be called with rcu_read_lock held */ struct files_struct *files; @@ -955,13 +1070,13 @@ struct file *task_lookup_fd_rcu(struct task_struct *task, unsigned int fd) task_lock(task); files = task->files; if (files) - file = files_lookup_fd_rcu(files, fd); + file = __fget_files_rcu(files, fd, 0); task_unlock(task); return file; } -struct file *task_lookup_next_fd_rcu(struct task_struct *task, unsigned int *ret_fd) +struct file *task_lookup_next_fdget_rcu(struct task_struct *task, unsigned int *ret_fd) { /* Must be called with rcu_read_lock held */ struct files_struct *files; @@ -972,7 +1087,7 @@ struct file *task_lookup_next_fd_rcu(struct task_struct *task, unsigned int *ret files = task->files; if (files) { for (; fd < files_fdtable(files)->max_fds; fd++) { - file = files_lookup_fd_rcu(files, fd); + file = __fget_files_rcu(files, fd, 0); if (file) break; } @@ -981,7 +1096,7 @@ struct file *task_lookup_next_fd_rcu(struct task_struct *task, unsigned int *ret *ret_fd = fd; return file; } -EXPORT_SYMBOL(task_lookup_next_fd_rcu); +EXPORT_SYMBOL(task_lookup_next_fdget_rcu); /* * Lightweight file lookup - no refcnt increment if fd table isn't shared. @@ -1270,12 +1385,16 @@ SYSCALL_DEFINE2(dup2, unsigned int, oldfd, unsigned int, newfd) { if (unlikely(newfd == oldfd)) { /* corner case */ struct files_struct *files = current->files; + struct file *f; int retval = oldfd; rcu_read_lock(); - if (!files_lookup_fd_rcu(files, oldfd)) + f = __fget_files_rcu(files, oldfd, 0); + if (!f) retval = -EBADF; rcu_read_unlock(); + if (f) + fput(f); return retval; } return ksys_dup3(oldfd, newfd, 0); diff --git a/fs/file_table.c b/fs/file_table.c index fc7d677ff5ad..de4a2915bfd4 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -40,14 +40,14 @@ static struct files_stat_struct files_stat = { }; /* SLAB cache for file structures */ -static struct kmem_cache *filp_cachep __read_mostly; +static struct kmem_cache *filp_cachep __ro_after_init; static struct percpu_counter nr_files __cacheline_aligned_in_smp; -/* Container for backing file with optional real path */ +/* Container for backing file with optional user path */ struct backing_file { struct file file; - struct path real_path; + struct path user_path; }; static inline struct backing_file *backing_file(struct file *f) @@ -55,31 +55,36 @@ static inline struct backing_file *backing_file(struct file *f) return container_of(f, struct backing_file, file); } -struct path *backing_file_real_path(struct file *f) +struct path *backing_file_user_path(struct file *f) { - return &backing_file(f)->real_path; + return &backing_file(f)->user_path; } -EXPORT_SYMBOL_GPL(backing_file_real_path); +EXPORT_SYMBOL_GPL(backing_file_user_path); -static void file_free_rcu(struct rcu_head *head) +static inline void file_free(struct file *f) { - struct file *f = container_of(head, struct file, f_rcuhead); - + security_file_free(f); + if (likely(!(f->f_mode & FMODE_NOACCOUNT))) + percpu_counter_dec(&nr_files); put_cred(f->f_cred); - if (unlikely(f->f_mode & FMODE_BACKING)) + if (unlikely(f->f_mode & FMODE_BACKING)) { + path_put(backing_file_user_path(f)); kfree(backing_file(f)); - else + } else { kmem_cache_free(filp_cachep, f); + } } -static inline void file_free(struct file *f) +void release_empty_file(struct file *f) { - security_file_free(f); - if (unlikely(f->f_mode & FMODE_BACKING)) - path_put(backing_file_real_path(f)); - if (likely(!(f->f_mode & FMODE_NOACCOUNT))) - percpu_counter_dec(&nr_files); - call_rcu(&f->f_rcuhead, file_free_rcu); + WARN_ON_ONCE(f->f_mode & (FMODE_BACKING | FMODE_OPENED)); + if (atomic_long_dec_and_test(&f->f_count)) { + security_file_free(f); + put_cred(f->f_cred); + if (likely(!(f->f_mode & FMODE_NOACCOUNT))) + percpu_counter_dec(&nr_files); + kmem_cache_free(filp_cachep, f); + } } /* @@ -164,7 +169,6 @@ static int init_file(struct file *f, int flags, const struct cred *cred) return error; } - atomic_long_set(&f->f_count, 1); rwlock_init(&f->f_owner.lock); spin_lock_init(&f->f_lock); mutex_init(&f->f_pos_lock); @@ -172,6 +176,12 @@ static int init_file(struct file *f, int flags, const struct cred *cred) f->f_mode = OPEN_FMODE(flags); /* f->f_version: 0 */ + /* + * We're SLAB_TYPESAFE_BY_RCU so initialize f_count last. While + * fget-rcu pattern users need to be able to handle spurious + * refcount bumps we should reinitialize the reused file first. + */ + atomic_long_set(&f->f_count, 1); return 0; } @@ -461,11 +471,8 @@ void fput(struct file *file) */ void __fput_sync(struct file *file) { - if (atomic_long_dec_and_test(&file->f_count)) { - struct task_struct *task = current; - BUG_ON(!(task->flags & PF_KTHREAD)); + if (atomic_long_dec_and_test(&file->f_count)) __fput(file); - } } EXPORT_SYMBOL(fput); @@ -474,7 +481,8 @@ EXPORT_SYMBOL(__fput_sync); void __init files_init(void) { filp_cachep = kmem_cache_create("filp", sizeof(struct file), 0, - SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_ACCOUNT, NULL); + SLAB_TYPESAFE_BY_RCU | SLAB_HWCACHE_ALIGN | + SLAB_PANIC | SLAB_ACCOUNT, NULL); percpu_counter_init(&nr_files, 0, GFP_KERNEL); } diff --git a/fs/freevxfs/Kconfig b/fs/freevxfs/Kconfig index 0e2fc08f7de4..912107ebea6f 100644 --- a/fs/freevxfs/Kconfig +++ b/fs/freevxfs/Kconfig @@ -2,6 +2,7 @@ config VXFS_FS tristate "FreeVxFS file system support (VERITAS VxFS(TM) compatible)" depends on BLOCK + select BUFFER_HEAD help FreeVxFS is a file system driver that support the VERITAS VxFS(TM) file system format. VERITAS VxFS(TM) is the standard file system diff --git a/fs/freevxfs/vxfs_inode.c b/fs/freevxfs/vxfs_inode.c index ceb6a12649ba..20600e9ea202 100644 --- a/fs/freevxfs/vxfs_inode.c +++ b/fs/freevxfs/vxfs_inode.c @@ -109,12 +109,9 @@ static inline void dip2vip_cpy(struct vxfs_sb_info *sbi, set_nlink(inode, vip->vii_nlink); inode->i_size = vip->vii_size; - inode->i_atime.tv_sec = vip->vii_atime; - inode->i_ctime.tv_sec = vip->vii_ctime; - inode->i_mtime.tv_sec = vip->vii_mtime; - inode->i_atime.tv_nsec = 0; - inode->i_ctime.tv_nsec = 0; - inode->i_mtime.tv_nsec = 0; + inode_set_atime(inode, vip->vii_atime, 0); + inode_set_ctime(inode, vip->vii_ctime, 0); + inode_set_mtime(inode, vip->vii_mtime, 0); inode->i_blocks = vip->vii_blocks; inode->i_generation = vip->vii_gen; diff --git a/fs/freevxfs/vxfs_super.c b/fs/freevxfs/vxfs_super.c index 310d73e254df..e6e2a2185e7c 100644 --- a/fs/freevxfs/vxfs_super.c +++ b/fs/freevxfs/vxfs_super.c @@ -76,6 +76,7 @@ vxfs_statfs(struct dentry *dentry, struct kstatfs *bufp) { struct vxfs_sb_info *infp = VXFS_SBI(dentry->d_sb); struct vxfs_sb *raw_sb = infp->vsi_raw; + u64 id = huge_encode_dev(dentry->d_sb->s_bdev->bd_dev); bufp->f_type = VXFS_SUPER_MAGIC; bufp->f_bsize = dentry->d_sb->s_blocksize; @@ -84,6 +85,7 @@ vxfs_statfs(struct dentry *dentry, struct kstatfs *bufp) bufp->f_bavail = 0; bufp->f_files = 0; bufp->f_ffree = fs32_to_cpu(infp, raw_sb->vs_ifree); + bufp->f_fsid = u64_to_fsid(id); bufp->f_namelen = VXFS_NAMELEN; return 0; diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index aca4b4811394..1767493dffda 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -613,6 +613,24 @@ out_free: kfree(isw); } +static bool isw_prepare_wbs_switch(struct inode_switch_wbs_context *isw, + struct list_head *list, int *nr) +{ + struct inode *inode; + + list_for_each_entry(inode, list, i_io_list) { + if (!inode_prepare_wbs_switch(inode, isw->new_wb)) + continue; + + isw->inodes[*nr] = inode; + (*nr)++; + + if (*nr >= WB_MAX_INODES_PER_ISW - 1) + return true; + } + return false; +} + /** * cleanup_offline_cgwb - detach associated inodes * @wb: target wb @@ -625,7 +643,6 @@ bool cleanup_offline_cgwb(struct bdi_writeback *wb) { struct cgroup_subsys_state *memcg_css; struct inode_switch_wbs_context *isw; - struct inode *inode; int nr; bool restart = false; @@ -647,17 +664,17 @@ bool cleanup_offline_cgwb(struct bdi_writeback *wb) nr = 0; spin_lock(&wb->list_lock); - list_for_each_entry(inode, &wb->b_attached, i_io_list) { - if (!inode_prepare_wbs_switch(inode, isw->new_wb)) - continue; - - isw->inodes[nr++] = inode; - - if (nr >= WB_MAX_INODES_PER_ISW - 1) { - restart = true; - break; - } - } + /* + * In addition to the inodes that have completed writeback, also switch + * cgwbs for those inodes only with dirty timestamps. Otherwise, those + * inodes won't be written back for a long time when lazytime is + * enabled, and thus pinning the dying cgwbs. It won't break the + * bandwidth restrictions, as writeback of inode metadata is not + * accounted for. + */ + restart = isw_prepare_wbs_switch(isw, &wb->b_attached, &nr); + if (!restart) + restart = isw_prepare_wbs_switch(isw, &wb->b_dirty_time, &nr); spin_unlock(&wb->list_lock); /* no attached inodes? bail out */ @@ -1535,10 +1552,15 @@ static void requeue_inode(struct inode *inode, struct bdi_writeback *wb, if (wbc->pages_skipped) { /* - * writeback is not making progress due to locked - * buffers. Skip this inode for now. + * Writeback is not making progress due to locked buffers. + * Skip this inode for now. Although having skipped pages + * is odd for clean inodes, it can happen for some + * filesystems so handle that gracefully. */ - redirty_tail_locked(inode, wb); + if (inode->i_state & I_DIRTY_ALL) + redirty_tail_locked(inode, wb); + else + inode_cgwb_move_to_attached(inode, wb); return; } @@ -1953,9 +1975,9 @@ static long __writeback_inodes_wb(struct bdi_writeback *wb, struct inode *inode = wb_inode(wb->b_io.prev); struct super_block *sb = inode->i_sb; - if (!trylock_super(sb)) { + if (!super_trylock_shared(sb)) { /* - * trylock_super() may fail consistently due to + * super_trylock_shared() may fail consistently due to * s_umount being grabbed by someone else. Don't use * requeue_io() to avoid busy retrying the inode/sb. */ diff --git a/fs/fs_context.c b/fs/fs_context.c index 851214d1d013..98589aae5208 100644 --- a/fs/fs_context.c +++ b/fs/fs_context.c @@ -162,6 +162,10 @@ EXPORT_SYMBOL(vfs_parse_fs_param); /** * vfs_parse_fs_string - Convenience function to just parse a string. + * @fc: Filesystem context. + * @key: Parameter name. + * @value: Default value. + * @v_size: Maximum number of bytes in the value. */ int vfs_parse_fs_string(struct fs_context *fc, const char *key, const char *value, size_t v_size) @@ -188,17 +192,19 @@ int vfs_parse_fs_string(struct fs_context *fc, const char *key, EXPORT_SYMBOL(vfs_parse_fs_string); /** - * generic_parse_monolithic - Parse key[=val][,key[=val]]* mount data - * @ctx: The superblock configuration to fill in. + * vfs_parse_monolithic_sep - Parse key[=val][,key[=val]]* mount data + * @fc: The superblock configuration to fill in. * @data: The data to parse + * @sep: callback for separating next option * - * Parse a blob of data that's in key[=val][,key[=val]]* form. This can be - * called from the ->monolithic_mount_data() fs_context operation. + * Parse a blob of data that's in key[=val][,key[=val]]* form with a custom + * option separator callback. * * Returns 0 on success or the error returned by the ->parse_option() fs_context * operation on failure. */ -int generic_parse_monolithic(struct fs_context *fc, void *data) +int vfs_parse_monolithic_sep(struct fs_context *fc, void *data, + char *(*sep)(char **)) { char *options = data, *key; int ret = 0; @@ -210,7 +216,7 @@ int generic_parse_monolithic(struct fs_context *fc, void *data) if (ret) return ret; - while ((key = strsep(&options, ",")) != NULL) { + while ((key = sep(&options)) != NULL) { if (*key) { size_t v_len = 0; char *value = strchr(key, '='); @@ -229,6 +235,28 @@ int generic_parse_monolithic(struct fs_context *fc, void *data) return ret; } +EXPORT_SYMBOL(vfs_parse_monolithic_sep); + +static char *vfs_parse_comma_sep(char **s) +{ + return strsep(s, ","); +} + +/** + * generic_parse_monolithic - Parse key[=val][,key[=val]]* mount data + * @fc: The superblock configuration to fill in. + * @data: The data to parse + * + * Parse a blob of data that's in key[=val][,key[=val]]* form. This can be + * called from the ->monolithic_mount_data() fs_context operation. + * + * Returns 0 on success or the error returned by the ->parse_option() fs_context + * operation on failure. + */ +int generic_parse_monolithic(struct fs_context *fc, void *data) +{ + return vfs_parse_monolithic_sep(fc, data, vfs_parse_comma_sep); +} EXPORT_SYMBOL(generic_parse_monolithic); /** @@ -315,10 +343,31 @@ struct fs_context *fs_context_for_reconfigure(struct dentry *dentry, } EXPORT_SYMBOL(fs_context_for_reconfigure); +/** + * fs_context_for_submount: allocate a new fs_context for a submount + * @type: file_system_type of the new context + * @reference: reference dentry from which to copy relevant info + * + * Allocate a new fs_context suitable for a submount. This also ensures that + * the fc->security object is inherited from @reference (if needed). + */ struct fs_context *fs_context_for_submount(struct file_system_type *type, struct dentry *reference) { - return alloc_fs_context(type, reference, 0, 0, FS_CONTEXT_FOR_SUBMOUNT); + struct fs_context *fc; + int ret; + + fc = alloc_fs_context(type, reference, 0, 0, FS_CONTEXT_FOR_SUBMOUNT); + if (IS_ERR(fc)) + return fc; + + ret = security_fs_context_submount(fc, reference->d_sb); + if (ret) { + put_fs_context(fc); + return ERR_PTR(ret); + } + + return fc; } EXPORT_SYMBOL(fs_context_for_submount); @@ -333,7 +382,7 @@ void fc_drop_locked(struct fs_context *fc) static void legacy_fs_context_free(struct fs_context *fc); /** - * vfs_dup_fc_config: Duplicate a filesystem context. + * vfs_dup_fs_context - Duplicate a filesystem context. * @src_fc: The context to copy. */ struct fs_context *vfs_dup_fs_context(struct fs_context *src_fc) @@ -379,7 +428,9 @@ EXPORT_SYMBOL(vfs_dup_fs_context); /** * logfc - Log a message to a filesystem context - * @fc: The filesystem context to log to. + * @log: The filesystem context to log to, or NULL to use printk. + * @prefix: A string to prefix the output with, or NULL. + * @level: 'w' for a warning, 'e' for an error. Anything else is a notice. * @fmt: The format of the buffer. */ void logfc(struct fc_log *log, const char *prefix, char level, const char *fmt, ...) @@ -692,6 +743,7 @@ void vfs_clean_context(struct fs_context *fc) security_free_mnt_opts(&fc->security); kfree(fc->source); fc->source = NULL; + fc->exclusive = false; fc->purpose = FS_CONTEXT_FOR_RECONFIGURE; fc->phase = FS_CONTEXT_AWAITING_RECONF; diff --git a/fs/fs_struct.c b/fs/fs_struct.c index 04b3f5b9c629..64c2d0814ed6 100644 --- a/fs/fs_struct.c +++ b/fs/fs_struct.c @@ -62,7 +62,7 @@ void chroot_fs_refs(const struct path *old_root, const struct path *new_root) int count = 0; read_lock(&tasklist_lock); - do_each_thread(g, p) { + for_each_process_thread(g, p) { task_lock(p); fs = p->fs; if (fs) { @@ -79,7 +79,7 @@ void chroot_fs_refs(const struct path *old_root, const struct path *new_root) spin_unlock(&fs->lock); } task_unlock(p); - } while_each_thread(g, p); + } read_unlock(&tasklist_lock); while (count--) path_put(old_root); diff --git a/fs/fsopen.c b/fs/fsopen.c index fc9d2d9fd234..6593ae518115 100644 --- a/fs/fsopen.c +++ b/fs/fsopen.c @@ -209,6 +209,72 @@ err: return ret; } +static int vfs_cmd_create(struct fs_context *fc, bool exclusive) +{ + struct super_block *sb; + int ret; + + if (fc->phase != FS_CONTEXT_CREATE_PARAMS) + return -EBUSY; + + if (!mount_capable(fc)) + return -EPERM; + + /* require the new mount api */ + if (exclusive && fc->ops == &legacy_fs_context_ops) + return -EOPNOTSUPP; + + fc->phase = FS_CONTEXT_CREATING; + fc->exclusive = exclusive; + + ret = vfs_get_tree(fc); + if (ret) { + fc->phase = FS_CONTEXT_FAILED; + return ret; + } + + sb = fc->root->d_sb; + ret = security_sb_kern_mount(sb); + if (unlikely(ret)) { + fc_drop_locked(fc); + fc->phase = FS_CONTEXT_FAILED; + return ret; + } + + /* vfs_get_tree() callchains will have grabbed @s_umount */ + up_write(&sb->s_umount); + fc->phase = FS_CONTEXT_AWAITING_MOUNT; + return 0; +} + +static int vfs_cmd_reconfigure(struct fs_context *fc) +{ + struct super_block *sb; + int ret; + + if (fc->phase != FS_CONTEXT_RECONF_PARAMS) + return -EBUSY; + + fc->phase = FS_CONTEXT_RECONFIGURING; + + sb = fc->root->d_sb; + if (!ns_capable(sb->s_user_ns, CAP_SYS_ADMIN)) { + fc->phase = FS_CONTEXT_FAILED; + return -EPERM; + } + + down_write(&sb->s_umount); + ret = reconfigure_super(fc); + up_write(&sb->s_umount); + if (ret) { + fc->phase = FS_CONTEXT_FAILED; + return ret; + } + + vfs_clean_context(fc); + return 0; +} + /* * Check the state and apply the configuration. Note that this function is * allowed to 'steal' the value by setting param->xxx to NULL before returning. @@ -216,7 +282,6 @@ err: static int vfs_fsconfig_locked(struct fs_context *fc, int cmd, struct fs_parameter *param) { - struct super_block *sb; int ret; ret = finish_clean_context(fc); @@ -224,39 +289,11 @@ static int vfs_fsconfig_locked(struct fs_context *fc, int cmd, return ret; switch (cmd) { case FSCONFIG_CMD_CREATE: - if (fc->phase != FS_CONTEXT_CREATE_PARAMS) - return -EBUSY; - if (!mount_capable(fc)) - return -EPERM; - fc->phase = FS_CONTEXT_CREATING; - ret = vfs_get_tree(fc); - if (ret) - break; - sb = fc->root->d_sb; - ret = security_sb_kern_mount(sb); - if (unlikely(ret)) { - fc_drop_locked(fc); - break; - } - up_write(&sb->s_umount); - fc->phase = FS_CONTEXT_AWAITING_MOUNT; - return 0; + return vfs_cmd_create(fc, false); + case FSCONFIG_CMD_CREATE_EXCL: + return vfs_cmd_create(fc, true); case FSCONFIG_CMD_RECONFIGURE: - if (fc->phase != FS_CONTEXT_RECONF_PARAMS) - return -EBUSY; - fc->phase = FS_CONTEXT_RECONFIGURING; - sb = fc->root->d_sb; - if (!ns_capable(sb->s_user_ns, CAP_SYS_ADMIN)) { - ret = -EPERM; - break; - } - down_write(&sb->s_umount); - ret = reconfigure_super(fc); - up_write(&sb->s_umount); - if (ret) - break; - vfs_clean_context(fc); - return 0; + return vfs_cmd_reconfigure(fc); default: if (fc->phase != FS_CONTEXT_CREATE_PARAMS && fc->phase != FS_CONTEXT_RECONF_PARAMS) @@ -264,8 +301,6 @@ static int vfs_fsconfig_locked(struct fs_context *fc, int cmd, return vfs_parse_fs_param(fc, param); } - fc->phase = FS_CONTEXT_FAILED; - return ret; } /** @@ -353,6 +388,7 @@ SYSCALL_DEFINE5(fsconfig, return -EINVAL; break; case FSCONFIG_CMD_CREATE: + case FSCONFIG_CMD_CREATE_EXCL: case FSCONFIG_CMD_RECONFIGURE: if (_key || _value || aux) return -EINVAL; @@ -429,6 +465,7 @@ SYSCALL_DEFINE5(fsconfig, param.file = fget(aux); if (!param.file) goto out_key; + param.dirfd = aux; break; default: break; diff --git a/fs/fuse/control.c b/fs/fuse/control.c index 247ef4f76761..284a35006462 100644 --- a/fs/fuse/control.c +++ b/fs/fuse/control.c @@ -235,7 +235,7 @@ static struct dentry *fuse_ctl_add_dentry(struct dentry *parent, inode->i_mode = mode; inode->i_uid = fc->user_id; inode->i_gid = fc->group_id; - inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode); + simple_inode_init_ts(inode); /* setting ->i_op to NULL is not allowed */ if (iop) inode->i_op = iop; diff --git a/fs/fuse/dax.c b/fs/fuse/dax.c index 8e74f278a3f6..12ef91d170bb 100644 --- a/fs/fuse/dax.c +++ b/fs/fuse/dax.c @@ -784,8 +784,8 @@ static int fuse_dax_writepages(struct address_space *mapping, return dax_writeback_mapping_range(mapping, fc->dax->dev, wbc); } -static vm_fault_t __fuse_dax_fault(struct vm_fault *vmf, - enum page_entry_size pe_size, bool write) +static vm_fault_t __fuse_dax_fault(struct vm_fault *vmf, unsigned int order, + bool write) { vm_fault_t ret; struct inode *inode = file_inode(vmf->vma->vm_file); @@ -809,7 +809,7 @@ retry: * to populate page cache or access memory we are trying to free. */ filemap_invalidate_lock_shared(inode->i_mapping); - ret = dax_iomap_fault(vmf, pe_size, &pfn, &error, &fuse_iomap_ops); + ret = dax_iomap_fault(vmf, order, &pfn, &error, &fuse_iomap_ops); if ((ret & VM_FAULT_ERROR) && error == -EAGAIN) { error = 0; retry = true; @@ -818,7 +818,7 @@ retry: } if (ret & VM_FAULT_NEEDDSYNC) - ret = dax_finish_sync_fault(vmf, pe_size, pfn); + ret = dax_finish_sync_fault(vmf, order, pfn); filemap_invalidate_unlock_shared(inode->i_mapping); if (write) @@ -829,24 +829,22 @@ retry: static vm_fault_t fuse_dax_fault(struct vm_fault *vmf) { - return __fuse_dax_fault(vmf, PE_SIZE_PTE, - vmf->flags & FAULT_FLAG_WRITE); + return __fuse_dax_fault(vmf, 0, vmf->flags & FAULT_FLAG_WRITE); } -static vm_fault_t fuse_dax_huge_fault(struct vm_fault *vmf, - enum page_entry_size pe_size) +static vm_fault_t fuse_dax_huge_fault(struct vm_fault *vmf, unsigned int order) { - return __fuse_dax_fault(vmf, pe_size, vmf->flags & FAULT_FLAG_WRITE); + return __fuse_dax_fault(vmf, order, vmf->flags & FAULT_FLAG_WRITE); } static vm_fault_t fuse_dax_page_mkwrite(struct vm_fault *vmf) { - return __fuse_dax_fault(vmf, PE_SIZE_PTE, true); + return __fuse_dax_fault(vmf, 0, true); } static vm_fault_t fuse_dax_pfn_mkwrite(struct vm_fault *vmf) { - return __fuse_dax_fault(vmf, PE_SIZE_PTE, true); + return __fuse_dax_fault(vmf, 0, true); } static const struct vm_operations_struct fuse_dax_vm_ops = { @@ -1224,6 +1222,7 @@ void fuse_dax_conn_free(struct fuse_conn *fc) if (fc->dax) { fuse_free_dax_mem_ranges(&fc->dax->free_ranges); kfree(fc->dax); + fc->dax = NULL; } } diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index f67bef9d83c4..d19cbf34c634 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -92,7 +92,7 @@ static void fuse_dentry_settime(struct dentry *dentry, u64 time) /* * Calculate the time in jiffies until a dentry/attributes are valid */ -static u64 time_to_jiffies(u64 sec, u32 nsec) +u64 fuse_time_to_jiffies(u64 sec, u32 nsec) { if (sec || nsec) { struct timespec64 ts = { @@ -112,17 +112,7 @@ static u64 time_to_jiffies(u64 sec, u32 nsec) void fuse_change_entry_timeout(struct dentry *entry, struct fuse_entry_out *o) { fuse_dentry_settime(entry, - time_to_jiffies(o->entry_valid, o->entry_valid_nsec)); -} - -static u64 attr_timeout(struct fuse_attr_out *o) -{ - return time_to_jiffies(o->attr_valid, o->attr_valid_nsec); -} - -u64 entry_attr_timeout(struct fuse_entry_out *o) -{ - return time_to_jiffies(o->attr_valid, o->attr_valid_nsec); + fuse_time_to_jiffies(o->entry_valid, o->entry_valid_nsec)); } void fuse_invalidate_attr_mask(struct inode *inode, u32 mask) @@ -265,8 +255,8 @@ static int fuse_dentry_revalidate(struct dentry *entry, unsigned int flags) goto invalid; forget_all_cached_acls(inode); - fuse_change_attributes(inode, &outarg.attr, - entry_attr_timeout(&outarg), + fuse_change_attributes(inode, &outarg.attr, NULL, + ATTR_TIMEOUT(&outarg), attr_version); fuse_change_entry_timeout(entry, &outarg); } else if (inode) { @@ -360,10 +350,14 @@ int fuse_valid_type(int m) S_ISBLK(m) || S_ISFIFO(m) || S_ISSOCK(m); } +static bool fuse_valid_size(u64 size) +{ + return size <= LLONG_MAX; +} + bool fuse_invalid_attr(struct fuse_attr *attr) { - return !fuse_valid_type(attr->mode) || - attr->size > LLONG_MAX; + return !fuse_valid_type(attr->mode) || !fuse_valid_size(attr->size); } int fuse_lookup_name(struct super_block *sb, u64 nodeid, const struct qstr *name, @@ -399,7 +393,7 @@ int fuse_lookup_name(struct super_block *sb, u64 nodeid, const struct qstr *name goto out_put_forget; *inode = fuse_iget(sb, outarg->nodeid, outarg->generation, - &outarg->attr, entry_attr_timeout(outarg), + &outarg->attr, ATTR_TIMEOUT(outarg), attr_version); err = -ENOMEM; if (!*inode) { @@ -686,7 +680,7 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, ff->nodeid = outentry.nodeid; ff->open_flags = outopen.open_flags; inode = fuse_iget(dir->i_sb, outentry.nodeid, outentry.generation, - &outentry.attr, entry_attr_timeout(&outentry), 0); + &outentry.attr, ATTR_TIMEOUT(&outentry), 0); if (!inode) { flags &= ~(O_CREAT | O_EXCL | O_TRUNC); fuse_sync_release(NULL, ff, flags); @@ -755,7 +749,8 @@ static int fuse_atomic_open(struct inode *dir, struct dentry *entry, if (err == -ENOSYS) { fc->no_create = 1; goto mknod; - } + } else if (err == -EEXIST) + fuse_invalidate_entry(entry); out_dput: dput(res); return err; @@ -813,7 +808,7 @@ static int create_new_entry(struct fuse_mount *fm, struct fuse_args *args, goto out_put_forget_req; inode = fuse_iget(dir->i_sb, outarg.nodeid, outarg.generation, - &outarg.attr, entry_attr_timeout(&outarg), 0); + &outarg.attr, ATTR_TIMEOUT(&outarg), 0); if (!inode) { fuse_queue_forget(fm->fc, forget, outarg.nodeid, 1); return -ENOMEM; @@ -835,6 +830,8 @@ static int create_new_entry(struct fuse_mount *fm, struct fuse_args *args, return 0; out_put_forget_req: + if (err == -EEXIST) + fuse_invalidate_entry(entry); kfree(forget); return err; } @@ -933,7 +930,7 @@ void fuse_flush_time_update(struct inode *inode) static void fuse_update_ctime_in_cache(struct inode *inode) { if (!IS_NOCMTIME(inode)) { - inode->i_ctime = current_time(inode); + inode_set_ctime_current(inode); mark_inode_dirty_sync(inode); fuse_flush_time_update(inode); } @@ -986,7 +983,7 @@ static int fuse_unlink(struct inode *dir, struct dentry *entry) if (!err) { fuse_dir_changed(dir); fuse_entry_unlinked(entry); - } else if (err == -EINTR) + } else if (err == -EINTR || err == -ENOENT) fuse_invalidate_entry(entry); return err; } @@ -1009,7 +1006,7 @@ static int fuse_rmdir(struct inode *dir, struct dentry *entry) if (!err) { fuse_dir_changed(dir); fuse_entry_unlinked(entry); - } else if (err == -EINTR) + } else if (err == -EINTR || err == -ENOENT) fuse_invalidate_entry(entry); return err; } @@ -1050,7 +1047,7 @@ static int fuse_rename_common(struct inode *olddir, struct dentry *oldent, /* newent will end up negative */ if (!(flags & RENAME_EXCHANGE) && d_really_is_positive(newent)) fuse_entry_unlinked(newent); - } else if (err == -EINTR) { + } else if (err == -EINTR || err == -ENOENT) { /* If request was interrupted, DEITY only knows if the rename actually took place. If the invalidation fails (e.g. some process has CWD under the renamed @@ -1153,6 +1150,87 @@ static void fuse_fillattr(struct inode *inode, struct fuse_attr *attr, stat->blksize = 1 << blkbits; } +static void fuse_statx_to_attr(struct fuse_statx *sx, struct fuse_attr *attr) +{ + memset(attr, 0, sizeof(*attr)); + attr->ino = sx->ino; + attr->size = sx->size; + attr->blocks = sx->blocks; + attr->atime = sx->atime.tv_sec; + attr->mtime = sx->mtime.tv_sec; + attr->ctime = sx->ctime.tv_sec; + attr->atimensec = sx->atime.tv_nsec; + attr->mtimensec = sx->mtime.tv_nsec; + attr->ctimensec = sx->ctime.tv_nsec; + attr->mode = sx->mode; + attr->nlink = sx->nlink; + attr->uid = sx->uid; + attr->gid = sx->gid; + attr->rdev = new_encode_dev(MKDEV(sx->rdev_major, sx->rdev_minor)); + attr->blksize = sx->blksize; +} + +static int fuse_do_statx(struct inode *inode, struct file *file, + struct kstat *stat) +{ + int err; + struct fuse_attr attr; + struct fuse_statx *sx; + struct fuse_statx_in inarg; + struct fuse_statx_out outarg; + struct fuse_mount *fm = get_fuse_mount(inode); + u64 attr_version = fuse_get_attr_version(fm->fc); + FUSE_ARGS(args); + + memset(&inarg, 0, sizeof(inarg)); + memset(&outarg, 0, sizeof(outarg)); + /* Directories have separate file-handle space */ + if (file && S_ISREG(inode->i_mode)) { + struct fuse_file *ff = file->private_data; + + inarg.getattr_flags |= FUSE_GETATTR_FH; + inarg.fh = ff->fh; + } + /* For now leave sync hints as the default, request all stats. */ + inarg.sx_flags = 0; + inarg.sx_mask = STATX_BASIC_STATS | STATX_BTIME; + args.opcode = FUSE_STATX; + args.nodeid = get_node_id(inode); + args.in_numargs = 1; + args.in_args[0].size = sizeof(inarg); + args.in_args[0].value = &inarg; + args.out_numargs = 1; + args.out_args[0].size = sizeof(outarg); + args.out_args[0].value = &outarg; + err = fuse_simple_request(fm, &args); + if (err) + return err; + + sx = &outarg.stat; + if (((sx->mask & STATX_SIZE) && !fuse_valid_size(sx->size)) || + ((sx->mask & STATX_TYPE) && (!fuse_valid_type(sx->mode) || + inode_wrong_type(inode, sx->mode)))) { + make_bad_inode(inode); + return -EIO; + } + + fuse_statx_to_attr(&outarg.stat, &attr); + if ((sx->mask & STATX_BASIC_STATS) == STATX_BASIC_STATS) { + fuse_change_attributes(inode, &attr, &outarg.stat, + ATTR_TIMEOUT(&outarg), attr_version); + } + + if (stat) { + stat->result_mask = sx->mask & (STATX_BASIC_STATS | STATX_BTIME); + stat->btime.tv_sec = sx->btime.tv_sec; + stat->btime.tv_nsec = min_t(u32, sx->btime.tv_nsec, NSEC_PER_SEC - 1); + fuse_fillattr(inode, &attr, stat); + stat->result_mask |= STATX_TYPE; + } + + return 0; +} + static int fuse_do_getattr(struct inode *inode, struct kstat *stat, struct file *file) { @@ -1189,8 +1267,8 @@ static int fuse_do_getattr(struct inode *inode, struct kstat *stat, fuse_make_bad(inode); err = -EIO; } else { - fuse_change_attributes(inode, &outarg.attr, - attr_timeout(&outarg), + fuse_change_attributes(inode, &outarg.attr, NULL, + ATTR_TIMEOUT(&outarg), attr_version); if (stat) fuse_fillattr(inode, &outarg.attr, stat); @@ -1204,12 +1282,22 @@ static int fuse_update_get_attr(struct inode *inode, struct file *file, unsigned int flags) { struct fuse_inode *fi = get_fuse_inode(inode); + struct fuse_conn *fc = get_fuse_conn(inode); int err = 0; bool sync; u32 inval_mask = READ_ONCE(fi->inval_mask); u32 cache_mask = fuse_get_cache_mask(inode); - if (flags & AT_STATX_FORCE_SYNC) + + /* FUSE only supports basic stats and possibly btime */ + request_mask &= STATX_BASIC_STATS | STATX_BTIME; +retry: + if (fc->no_statx) + request_mask &= STATX_BASIC_STATS; + + if (!request_mask) + sync = false; + else if (flags & AT_STATX_FORCE_SYNC) sync = true; else if (flags & AT_STATX_DONT_SYNC) sync = false; @@ -1220,11 +1308,24 @@ static int fuse_update_get_attr(struct inode *inode, struct file *file, if (sync) { forget_all_cached_acls(inode); - err = fuse_do_getattr(inode, stat, file); + /* Try statx if BTIME is requested */ + if (!fc->no_statx && (request_mask & ~STATX_BASIC_STATS)) { + err = fuse_do_statx(inode, file, stat); + if (err == -ENOSYS) { + fc->no_statx = 1; + goto retry; + } + } else { + err = fuse_do_getattr(inode, stat, file); + } } else if (stat) { - generic_fillattr(&nop_mnt_idmap, inode, stat); + generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat); stat->mode = fi->orig_i_mode; stat->ino = fi->orig_ino; + if (test_bit(FUSE_I_BTIME, &fi->state)) { + stat->btime = fi->i_btime; + stat->result_mask |= STATX_BTIME; + } } return err; @@ -1711,12 +1812,12 @@ int fuse_flush_times(struct inode *inode, struct fuse_file *ff) memset(&outarg, 0, sizeof(outarg)); inarg.valid = FATTR_MTIME; - inarg.mtime = inode->i_mtime.tv_sec; - inarg.mtimensec = inode->i_mtime.tv_nsec; + inarg.mtime = inode_get_mtime_sec(inode); + inarg.mtimensec = inode_get_mtime_nsec(inode); if (fm->fc->minor >= 23) { inarg.valid |= FATTR_CTIME; - inarg.ctime = inode->i_ctime.tv_sec; - inarg.ctimensec = inode->i_ctime.tv_nsec; + inarg.ctime = inode_get_ctime_sec(inode); + inarg.ctimensec = inode_get_ctime_nsec(inode); } if (ff) { inarg.valid |= FATTR_FH; @@ -1855,14 +1956,14 @@ int fuse_do_setattr(struct dentry *dentry, struct iattr *attr, /* the kernel maintains i_mtime locally */ if (trust_local_cmtime) { if (attr->ia_valid & ATTR_MTIME) - inode->i_mtime = attr->ia_mtime; + inode_set_mtime_to_ts(inode, attr->ia_mtime); if (attr->ia_valid & ATTR_CTIME) - inode->i_ctime = attr->ia_ctime; + inode_set_ctime_to_ts(inode, attr->ia_ctime); /* FIXME: clear I_DIRTY_SYNC? */ } - fuse_change_attributes_common(inode, &outarg.attr, - attr_timeout(&outarg), + fuse_change_attributes_common(inode, &outarg.attr, NULL, + ATTR_TIMEOUT(&outarg), fuse_get_cache_mask(inode)); oldsize = inode->i_size; /* see the comment in fuse_change_attributes() */ diff --git a/fs/fuse/file.c b/fs/fuse/file.c index bc4115288eec..a660f1f21540 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -19,7 +19,6 @@ #include <linux/uio.h> #include <linux/fs.h> #include <linux/filelock.h> -#include <linux/file.h> static int fuse_send_open(struct fuse_mount *fm, u64 nodeid, unsigned int open_flags, int opcode, @@ -479,36 +478,48 @@ static void fuse_sync_writes(struct inode *inode) fuse_release_nowrite(inode); } -struct fuse_flush_args { - struct fuse_args args; - struct fuse_flush_in inarg; - struct work_struct work; - struct file *file; -}; - -static int fuse_do_flush(struct fuse_flush_args *fa) +static int fuse_flush(struct file *file, fl_owner_t id) { - int err; - struct inode *inode = file_inode(fa->file); + struct inode *inode = file_inode(file); struct fuse_mount *fm = get_fuse_mount(inode); + struct fuse_file *ff = file->private_data; + struct fuse_flush_in inarg; + FUSE_ARGS(args); + int err; + + if (fuse_is_bad(inode)) + return -EIO; + + if (ff->open_flags & FOPEN_NOFLUSH && !fm->fc->writeback_cache) + return 0; err = write_inode_now(inode, 1); if (err) - goto out; + return err; inode_lock(inode); fuse_sync_writes(inode); inode_unlock(inode); - err = filemap_check_errors(fa->file->f_mapping); + err = filemap_check_errors(file->f_mapping); if (err) - goto out; + return err; err = 0; if (fm->fc->no_flush) goto inval_attr_out; - err = fuse_simple_request(fm, &fa->args); + memset(&inarg, 0, sizeof(inarg)); + inarg.fh = ff->fh; + inarg.lock_owner = fuse_lock_owner_id(fm->fc, id); + args.opcode = FUSE_FLUSH; + args.nodeid = get_node_id(inode); + args.in_numargs = 1; + args.in_args[0].size = sizeof(inarg); + args.in_args[0].value = &inarg; + args.force = true; + + err = fuse_simple_request(fm, &args); if (err == -ENOSYS) { fm->fc->no_flush = 1; err = 0; @@ -521,57 +532,9 @@ inval_attr_out: */ if (!err && fm->fc->writeback_cache) fuse_invalidate_attr_mask(inode, STATX_BLOCKS); - -out: - fput(fa->file); - kfree(fa); return err; } -static void fuse_flush_async(struct work_struct *work) -{ - struct fuse_flush_args *fa = container_of(work, typeof(*fa), work); - - fuse_do_flush(fa); -} - -static int fuse_flush(struct file *file, fl_owner_t id) -{ - struct fuse_flush_args *fa; - struct inode *inode = file_inode(file); - struct fuse_mount *fm = get_fuse_mount(inode); - struct fuse_file *ff = file->private_data; - - if (fuse_is_bad(inode)) - return -EIO; - - if (ff->open_flags & FOPEN_NOFLUSH && !fm->fc->writeback_cache) - return 0; - - fa = kzalloc(sizeof(*fa), GFP_KERNEL); - if (!fa) - return -ENOMEM; - - fa->inarg.fh = ff->fh; - fa->inarg.lock_owner = fuse_lock_owner_id(fm->fc, id); - fa->args.opcode = FUSE_FLUSH; - fa->args.nodeid = get_node_id(inode); - fa->args.in_numargs = 1; - fa->args.in_args[0].size = sizeof(fa->inarg); - fa->args.in_args[0].value = &fa->inarg; - fa->args.force = true; - fa->file = get_file(file); - - /* Don't wait if the task is exiting */ - if (current->flags & PF_EXITING) { - INIT_WORK(&fa->work, fuse_flush_async); - schedule_work(&fa->work); - return 0; - } - - return fuse_do_flush(fa); -} - int fuse_fsync_common(struct file *file, loff_t start, loff_t end, int datasync, int opcode) { @@ -1465,7 +1428,8 @@ ssize_t fuse_direct_io(struct fuse_io_priv *io, struct iov_iter *iter, int write = flags & FUSE_DIO_WRITE; int cuse = flags & FUSE_DIO_CUSE; struct file *file = io->iocb->ki_filp; - struct inode *inode = file->f_mapping->host; + struct address_space *mapping = file->f_mapping; + struct inode *inode = mapping->host; struct fuse_file *ff = file->private_data; struct fuse_conn *fc = ff->fm->fc; size_t nmax = write ? fc->max_write : fc->max_read; @@ -1477,12 +1441,20 @@ ssize_t fuse_direct_io(struct fuse_io_priv *io, struct iov_iter *iter, int err = 0; struct fuse_io_args *ia; unsigned int max_pages; + bool fopen_direct_io = ff->open_flags & FOPEN_DIRECT_IO; max_pages = iov_iter_npages(iter, fc->max_pages); ia = fuse_io_alloc(io, max_pages); if (!ia) return -ENOMEM; + if (fopen_direct_io && fc->direct_io_allow_mmap) { + res = filemap_write_and_wait_range(mapping, pos, pos + count - 1); + if (res) { + fuse_io_free(ia); + return res; + } + } if (!cuse && fuse_range_is_writeback(inode, idx_from, idx_to)) { if (!write) inode_lock(inode); @@ -1491,6 +1463,14 @@ ssize_t fuse_direct_io(struct fuse_io_priv *io, struct iov_iter *iter, inode_unlock(inode); } + if (fopen_direct_io && write) { + res = invalidate_inode_pages2_range(mapping, idx_from, idx_to); + if (res) { + fuse_io_free(ia); + return res; + } + } + io->should_dirty = !write && user_backed_iter(iter); while (count) { ssize_t nres; @@ -1594,6 +1574,7 @@ static ssize_t fuse_direct_write_iter(struct kiocb *iocb, struct iov_iter *from) ssize_t res; bool exclusive_lock = !(ff->open_flags & FOPEN_PARALLEL_DIRECT_WRITES) || + get_fuse_conn(inode)->direct_io_allow_mmap || iocb->ki_flags & IOCB_APPEND || fuse_direct_write_extending_i_size(iocb, from); @@ -1601,6 +1582,7 @@ static ssize_t fuse_direct_write_iter(struct kiocb *iocb, struct iov_iter *from) * Take exclusive lock if * - Parallel direct writes are disabled - a user space decision * - Parallel direct writes are enabled and i_size is being extended. + * - Shared mmap on direct_io file is supported (FUSE_DIRECT_IO_ALLOW_MMAP). * This might not be needed at all, but needs further investigation. */ if (exclusive_lock) @@ -2478,14 +2460,17 @@ static const struct vm_operations_struct fuse_file_vm_ops = { static int fuse_file_mmap(struct file *file, struct vm_area_struct *vma) { struct fuse_file *ff = file->private_data; + struct fuse_conn *fc = ff->fm->fc; /* DAX mmap is superior to direct_io mmap */ if (FUSE_IS_DAX(file_inode(file))) return fuse_dax_mmap(file, vma); if (ff->open_flags & FOPEN_DIRECT_IO) { - /* Can't provide the coherency needed for MAP_SHARED */ - if (vma->vm_flags & VM_MAYSHARE) + /* Can't provide the coherency needed for MAP_SHARED + * if FUSE_DIRECT_IO_ALLOW_MMAP isn't set. + */ + if ((vma->vm_flags & VM_MAYSHARE) && !fc->direct_io_allow_mmap) return -ENODEV; invalidate_inode_pages2(file->f_mapping); diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 9b7fc7d3c7f1..1df83eebda92 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -63,6 +63,19 @@ struct fuse_forget_link { struct fuse_forget_link *next; }; +/* Submount lookup tracking */ +struct fuse_submount_lookup { + /** Refcount */ + refcount_t count; + + /** Unique ID, which identifies the inode between userspace + * and kernel */ + u64 nodeid; + + /** The request used for sending the FORGET message */ + struct fuse_forget_link *forget; +}; + /** FUSE inode */ struct fuse_inode { /** Inode data */ @@ -88,6 +101,9 @@ struct fuse_inode { preserve the original mode */ umode_t orig_i_mode; + /* Cache birthtime */ + struct timespec64 i_btime; + /** 64 bit inode number */ u64 orig_ino; @@ -155,6 +171,8 @@ struct fuse_inode { */ struct fuse_inode_dax *dax; #endif + /** Submount specific lookup tracking */ + struct fuse_submount_lookup *submount_lookup; }; /** FUSE inode state bits */ @@ -167,6 +185,8 @@ enum { FUSE_I_SIZE_UNSTABLE, /* Bad inode */ FUSE_I_BAD, + /* Has btime */ + FUSE_I_BTIME, }; struct fuse_conn; @@ -792,6 +812,12 @@ struct fuse_conn { /* Is tmpfile not implemented by fs? */ unsigned int no_tmpfile:1; + /* Relax restrictions to allow shared mmap in FOPEN_DIRECT_IO mode */ + unsigned int direct_io_allow_mmap:1; + + /* Is statx not implemented by fs? */ + unsigned int no_statx:1; + /** The number of requests waiting for completion */ atomic_t num_waiting; @@ -1058,9 +1084,11 @@ void fuse_init_symlink(struct inode *inode); * Change attributes of an inode */ void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr, + struct fuse_statx *sx, u64 attr_valid, u64 attr_version); void fuse_change_attributes_common(struct inode *inode, struct fuse_attr *attr, + struct fuse_statx *sx, u64 attr_valid, u32 cache_mask); u32 fuse_get_cache_mask(struct inode *inode); @@ -1111,7 +1139,10 @@ void fuse_invalidate_entry_cache(struct dentry *entry); void fuse_invalidate_atime(struct inode *inode); -u64 entry_attr_timeout(struct fuse_entry_out *o); +u64 fuse_time_to_jiffies(u64 sec, u32 nsec); +#define ATTR_TIMEOUT(o) \ + fuse_time_to_jiffies((o)->attr_valid, (o)->attr_valid_nsec) + void fuse_change_entry_timeout(struct dentry *entry, struct fuse_entry_out *o); /** @@ -1268,7 +1299,7 @@ ssize_t fuse_getxattr(struct inode *inode, const char *name, void *value, size_t size); ssize_t fuse_listxattr(struct dentry *entry, char *list, size_t size); int fuse_removexattr(struct inode *inode, const char *name); -extern const struct xattr_handler *fuse_xattr_handlers[]; +extern const struct xattr_handler * const fuse_xattr_handlers[]; struct posix_acl; struct posix_acl *fuse_get_inode_acl(struct inode *inode, int type, bool rcu); diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index f19d748890f0..2a6d44f91729 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -68,6 +68,24 @@ struct fuse_forget_link *fuse_alloc_forget(void) return kzalloc(sizeof(struct fuse_forget_link), GFP_KERNEL_ACCOUNT); } +static struct fuse_submount_lookup *fuse_alloc_submount_lookup(void) +{ + struct fuse_submount_lookup *sl; + + sl = kzalloc(sizeof(struct fuse_submount_lookup), GFP_KERNEL_ACCOUNT); + if (!sl) + return NULL; + sl->forget = fuse_alloc_forget(); + if (!sl->forget) + goto out_free; + + return sl; + +out_free: + kfree(sl); + return NULL; +} + static struct inode *fuse_alloc_inode(struct super_block *sb) { struct fuse_inode *fi; @@ -77,12 +95,13 @@ static struct inode *fuse_alloc_inode(struct super_block *sb) return NULL; fi->i_time = 0; - fi->inval_mask = 0; + fi->inval_mask = ~0; fi->nodeid = 0; fi->nlookup = 0; fi->attr_version = 0; fi->orig_ino = 0; fi->state = 0; + fi->submount_lookup = NULL; mutex_init(&fi->mutex); spin_lock_init(&fi->lock); fi->forget = fuse_alloc_forget(); @@ -113,6 +132,17 @@ static void fuse_free_inode(struct inode *inode) kmem_cache_free(fuse_inode_cachep, fi); } +static void fuse_cleanup_submount_lookup(struct fuse_conn *fc, + struct fuse_submount_lookup *sl) +{ + if (!refcount_dec_and_test(&sl->count)) + return; + + fuse_queue_forget(fc, sl->forget, sl->nodeid, 1); + sl->forget = NULL; + kfree(sl); +} + static void fuse_evict_inode(struct inode *inode) { struct fuse_inode *fi = get_fuse_inode(inode); @@ -132,6 +162,11 @@ static void fuse_evict_inode(struct inode *inode) fi->nlookup); fi->forget = NULL; } + + if (fi->submount_lookup) { + fuse_cleanup_submount_lookup(fc, fi->submount_lookup); + fi->submount_lookup = NULL; + } } if (S_ISREG(inode->i_mode) && !fuse_is_bad(inode)) { WARN_ON(!list_empty(&fi->write_files)); @@ -163,6 +198,7 @@ static ino_t fuse_squash_ino(u64 ino64) } void fuse_change_attributes_common(struct inode *inode, struct fuse_attr *attr, + struct fuse_statx *sx, u64 attr_valid, u32 cache_mask) { struct fuse_conn *fc = get_fuse_conn(inode); @@ -172,7 +208,8 @@ void fuse_change_attributes_common(struct inode *inode, struct fuse_attr *attr, fi->attr_version = atomic64_inc_return(&fc->attr_version); fi->i_time = attr_valid; - WRITE_ONCE(fi->inval_mask, 0); + /* Clear basic stats from invalid mask */ + set_mask_bits(&fi->inval_mask, STATX_BASIC_STATS, 0); inode->i_ino = fuse_squash_ino(attr->ino); inode->i_mode = (inode->i_mode & S_IFMT) | (attr->mode & 07777); @@ -186,16 +223,32 @@ void fuse_change_attributes_common(struct inode *inode, struct fuse_attr *attr, attr->mtimensec = min_t(u32, attr->mtimensec, NSEC_PER_SEC - 1); attr->ctimensec = min_t(u32, attr->ctimensec, NSEC_PER_SEC - 1); - inode->i_atime.tv_sec = attr->atime; - inode->i_atime.tv_nsec = attr->atimensec; + inode_set_atime(inode, attr->atime, attr->atimensec); /* mtime from server may be stale due to local buffered write */ if (!(cache_mask & STATX_MTIME)) { - inode->i_mtime.tv_sec = attr->mtime; - inode->i_mtime.tv_nsec = attr->mtimensec; + inode_set_mtime(inode, attr->mtime, attr->mtimensec); } if (!(cache_mask & STATX_CTIME)) { - inode->i_ctime.tv_sec = attr->ctime; - inode->i_ctime.tv_nsec = attr->ctimensec; + inode_set_ctime(inode, attr->ctime, attr->ctimensec); + } + if (sx) { + /* Sanitize nsecs */ + sx->btime.tv_nsec = + min_t(u32, sx->btime.tv_nsec, NSEC_PER_SEC - 1); + + /* + * Btime has been queried, cache is valid (whether or not btime + * is available or not) so clear STATX_BTIME from inval_mask. + * + * Availability of the btime attribute is indicated in + * FUSE_I_BTIME + */ + set_mask_bits(&fi->inval_mask, STATX_BTIME, 0); + if (sx->mask & STATX_BTIME) { + set_bit(FUSE_I_BTIME, &fi->state); + fi->i_btime.tv_sec = sx->btime.tv_sec; + fi->i_btime.tv_nsec = sx->btime.tv_nsec; + } } if (attr->blksize != 0) @@ -236,6 +289,7 @@ u32 fuse_get_cache_mask(struct inode *inode) } void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr, + struct fuse_statx *sx, u64 attr_valid, u64 attr_version) { struct fuse_conn *fc = get_fuse_conn(inode); @@ -255,12 +309,12 @@ void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr, attr->size = i_size_read(inode); if (cache_mask & STATX_MTIME) { - attr->mtime = inode->i_mtime.tv_sec; - attr->mtimensec = inode->i_mtime.tv_nsec; + attr->mtime = inode_get_mtime_sec(inode); + attr->mtimensec = inode_get_mtime_nsec(inode); } if (cache_mask & STATX_CTIME) { - attr->ctime = inode->i_ctime.tv_sec; - attr->ctimensec = inode->i_ctime.tv_nsec; + attr->ctime = inode_get_ctime_sec(inode); + attr->ctimensec = inode_get_ctime_nsec(inode); } if ((attr_version != 0 && fi->attr_version > attr_version) || @@ -269,8 +323,8 @@ void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr, return; } - old_mtime = inode->i_mtime; - fuse_change_attributes_common(inode, attr, attr_valid, cache_mask); + old_mtime = inode_get_mtime(inode); + fuse_change_attributes_common(inode, attr, sx, attr_valid, cache_mask); oldsize = inode->i_size; /* @@ -311,15 +365,20 @@ void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr, fuse_dax_dontcache(inode, attr->flags); } +static void fuse_init_submount_lookup(struct fuse_submount_lookup *sl, + u64 nodeid) +{ + sl->nodeid = nodeid; + refcount_set(&sl->count, 1); +} + static void fuse_init_inode(struct inode *inode, struct fuse_attr *attr, struct fuse_conn *fc) { inode->i_mode = attr->mode & S_IFMT; inode->i_size = attr->size; - inode->i_mtime.tv_sec = attr->mtime; - inode->i_mtime.tv_nsec = attr->mtimensec; - inode->i_ctime.tv_sec = attr->ctime; - inode->i_ctime.tv_nsec = attr->ctimensec; + inode_set_mtime(inode, attr->mtime, attr->mtimensec); + inode_set_ctime(inode, attr->ctime, attr->ctimensec); if (S_ISREG(inode->i_mode)) { fuse_init_common(inode); fuse_init_file_inode(inode, attr->flags); @@ -375,12 +434,22 @@ struct inode *fuse_iget(struct super_block *sb, u64 nodeid, */ if (fc->auto_submounts && (attr->flags & FUSE_ATTR_SUBMOUNT) && S_ISDIR(attr->mode)) { + struct fuse_inode *fi; + inode = new_inode(sb); if (!inode) return NULL; fuse_init_inode(inode, attr, fc); - get_fuse_inode(inode)->nodeid = nodeid; + fi = get_fuse_inode(inode); + fi->nodeid = nodeid; + fi->submount_lookup = fuse_alloc_submount_lookup(); + if (!fi->submount_lookup) { + iput(inode); + return NULL; + } + /* Sets nlookup = 1 on fi->submount_lookup->nlookup */ + fuse_init_submount_lookup(fi->submount_lookup, nodeid); inode->i_flags |= S_AUTOMOUNT; goto done; } @@ -403,12 +472,12 @@ retry: iput(inode); goto retry; } -done: fi = get_fuse_inode(inode); spin_lock(&fi->lock); fi->nlookup++; spin_unlock(&fi->lock); - fuse_change_attributes(inode, attr, attr_valid, attr_version); +done: + fuse_change_attributes(inode, attr, NULL, attr_valid, attr_version); return inode; } @@ -982,7 +1051,7 @@ static int fuse_encode_fh(struct inode *inode, u32 *fh, int *max_len, } *max_len = len; - return parent ? 0x82 : 0x81; + return parent ? FILEID_INO64_GEN_PARENT : FILEID_INO64_GEN; } static struct dentry *fuse_fh_to_dentry(struct super_block *sb, @@ -990,7 +1059,8 @@ static struct dentry *fuse_fh_to_dentry(struct super_block *sb, { struct fuse_inode_handle handle; - if ((fh_type != 0x81 && fh_type != 0x82) || fh_len < 3) + if ((fh_type != FILEID_INO64_GEN && + fh_type != FILEID_INO64_GEN_PARENT) || fh_len < 3) return NULL; handle.nodeid = (u64) fid->raw[0] << 32; @@ -1004,7 +1074,7 @@ static struct dentry *fuse_fh_to_parent(struct super_block *sb, { struct fuse_inode_handle parent; - if (fh_type != 0x82 || fh_len < 6) + if (fh_type != FILEID_INO64_GEN_PARENT || fh_len < 6) return NULL; parent.nodeid = (u64) fid->raw[3] << 32; @@ -1212,6 +1282,8 @@ static void process_init_reply(struct fuse_mount *fm, struct fuse_args *args, fc->init_security = 1; if (flags & FUSE_CREATE_SUPP_GROUP) fc->create_supp_group = 1; + if (flags & FUSE_DIRECT_IO_ALLOW_MMAP) + fc->direct_io_allow_mmap = 1; } else { ra_pages = fc->max_read / PAGE_SIZE; fc->no_lock = 1; @@ -1258,7 +1330,7 @@ void fuse_send_init(struct fuse_mount *fm) FUSE_NO_OPENDIR_SUPPORT | FUSE_EXPLICIT_INVAL_DATA | FUSE_HANDLE_KILLPRIV_V2 | FUSE_SETXATTR_EXT | FUSE_INIT_EXT | FUSE_SECURITY_CTX | FUSE_CREATE_SUPP_GROUP | - FUSE_HAS_EXPIRE_ONLY; + FUSE_HAS_EXPIRE_ONLY | FUSE_DIRECT_IO_ALLOW_MMAP; #ifdef CONFIG_FUSE_DAX if (fm->fc->dax) flags |= FUSE_MAP_ALIGNMENT; @@ -1401,16 +1473,20 @@ EXPORT_SYMBOL_GPL(fuse_dev_free); static void fuse_fill_attr_from_inode(struct fuse_attr *attr, const struct fuse_inode *fi) { + struct timespec64 atime = inode_get_atime(&fi->inode); + struct timespec64 mtime = inode_get_mtime(&fi->inode); + struct timespec64 ctime = inode_get_ctime(&fi->inode); + *attr = (struct fuse_attr){ .ino = fi->inode.i_ino, .size = fi->inode.i_size, .blocks = fi->inode.i_blocks, - .atime = fi->inode.i_atime.tv_sec, - .mtime = fi->inode.i_mtime.tv_sec, - .ctime = fi->inode.i_ctime.tv_sec, - .atimensec = fi->inode.i_atime.tv_nsec, - .mtimensec = fi->inode.i_mtime.tv_nsec, - .ctimensec = fi->inode.i_ctime.tv_nsec, + .atime = atime.tv_sec, + .mtime = mtime.tv_sec, + .ctime = ctime.tv_sec, + .atimensec = atime.tv_nsec, + .mtimensec = mtime.tv_nsec, + .ctimensec = ctime.tv_nsec, .mode = fi->inode.i_mode, .nlink = fi->inode.i_nlink, .uid = fi->inode.i_uid.val, @@ -1441,6 +1517,8 @@ static int fuse_fill_super_submount(struct super_block *sb, struct super_block *parent_sb = parent_fi->inode.i_sb; struct fuse_attr root_attr; struct inode *root; + struct fuse_submount_lookup *sl; + struct fuse_inode *fi; fuse_sb_defaults(sb); fm->sb = sb; @@ -1463,12 +1541,27 @@ static int fuse_fill_super_submount(struct super_block *sb, * its nlookup should not be incremented. fuse_iget() does * that, though, so undo it here. */ - get_fuse_inode(root)->nlookup--; + fi = get_fuse_inode(root); + fi->nlookup--; + sb->s_d_op = &fuse_dentry_operations; sb->s_root = d_make_root(root); if (!sb->s_root) return -ENOMEM; + /* + * Grab the parent's submount_lookup pointer and take a + * reference on the shared nlookup from the parent. This is to + * prevent the last forget for this nodeid from getting + * triggered until all users have finished with it. + */ + sl = parent_fi->submount_lookup; + WARN_ON(!sl); + if (sl) { + refcount_inc(&sl->count); + fi->submount_lookup = sl; + } + return 0; } diff --git a/fs/fuse/readdir.c b/fs/fuse/readdir.c index dc603479b30e..c66a54d6c7d3 100644 --- a/fs/fuse/readdir.c +++ b/fs/fuse/readdir.c @@ -223,8 +223,8 @@ retry: spin_unlock(&fi->lock); forget_all_cached_acls(inode); - fuse_change_attributes(inode, &o->attr, - entry_attr_timeout(o), + fuse_change_attributes(inode, &o->attr, NULL, + ATTR_TIMEOUT(o), attr_version); /* * The other branch comes via fuse_iget() @@ -232,7 +232,7 @@ retry: */ } else { inode = fuse_iget(dir->i_sb, o->nodeid, o->generation, - &o->attr, entry_attr_timeout(o), + &o->attr, ATTR_TIMEOUT(o), attr_version); if (!inode) inode = ERR_PTR(-ENOMEM); @@ -243,8 +243,16 @@ retry: dput(dentry); dentry = alias; } - if (IS_ERR(dentry)) + if (IS_ERR(dentry)) { + if (!IS_ERR(inode)) { + struct fuse_inode *fi = get_fuse_inode(inode); + + spin_lock(&fi->lock); + fi->nlookup--; + spin_unlock(&fi->lock); + } return PTR_ERR(dentry); + } } if (fc->readdirplus_auto) set_bit(FUSE_I_INIT_RDPLUS, &get_fuse_inode(inode)->state); @@ -468,7 +476,7 @@ retry_locked: if (!fi->rdc.cached) { /* Starting cache? Set cache mtime. */ if (!ctx->pos && !fi->rdc.size) { - fi->rdc.mtime = inode->i_mtime; + fi->rdc.mtime = inode_get_mtime(inode); fi->rdc.iversion = inode_query_iversion(inode); } spin_unlock(&fi->rdc.lock); @@ -480,8 +488,10 @@ retry_locked: * changed, and reset the cache if so. */ if (!ctx->pos) { + struct timespec64 mtime = inode_get_mtime(inode); + if (inode_peek_iversion(inode) != fi->rdc.iversion || - !timespec64_equal(&fi->rdc.mtime, &inode->i_mtime)) { + !timespec64_equal(&fi->rdc.mtime, &mtime)) { fuse_rdc_reset(inode); goto retry_locked; } diff --git a/fs/fuse/xattr.c b/fs/fuse/xattr.c index 49c01559580f..5b423fdbb13f 100644 --- a/fs/fuse/xattr.c +++ b/fs/fuse/xattr.c @@ -209,7 +209,7 @@ static const struct xattr_handler fuse_xattr_handler = { .set = fuse_xattr_set, }; -const struct xattr_handler *fuse_xattr_handlers[] = { +const struct xattr_handler * const fuse_xattr_handlers[] = { &fuse_xattr_handler, NULL }; diff --git a/fs/gfs2/Kconfig b/fs/gfs2/Kconfig index 03c966840422..be7f87a8e11a 100644 --- a/fs/gfs2/Kconfig +++ b/fs/gfs2/Kconfig @@ -1,6 +1,7 @@ # SPDX-License-Identifier: GPL-2.0-only config GFS2_FS tristate "GFS2 file system support" + select BUFFER_HEAD select FS_POSIX_ACL select CRC32 select LIBCRC32C diff --git a/fs/gfs2/acl.c b/fs/gfs2/acl.c index a392aa0f041d..443640e6fb9c 100644 --- a/fs/gfs2/acl.c +++ b/fs/gfs2/acl.c @@ -142,7 +142,7 @@ int gfs2_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, ret = __gfs2_set_acl(inode, acl, type); if (!ret && mode != inode->i_mode) { - inode->i_ctime = current_time(inode); + inode_set_ctime_current(inode); inode->i_mode = mode; mark_inode_dirty(inode); } diff --git a/fs/gfs2/acl.h b/fs/gfs2/acl.h index d4deb2b19959..82f5b09c04e6 100644 --- a/fs/gfs2/acl.h +++ b/fs/gfs2/acl.h @@ -11,9 +11,9 @@ #define GFS2_ACL_MAX_ENTRIES(sdp) ((300 << (sdp)->sd_sb.sb_bsize_shift) >> 12) -extern struct posix_acl *gfs2_get_acl(struct inode *inode, int type, bool rcu); -extern int __gfs2_set_acl(struct inode *inode, struct posix_acl *acl, int type); -extern int gfs2_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, - struct posix_acl *acl, int type); +struct posix_acl *gfs2_get_acl(struct inode *inode, int type, bool rcu); +int __gfs2_set_acl(struct inode *inode, struct posix_acl *acl, int type); +int gfs2_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, + struct posix_acl *acl, int type); #endif /* __ACL_DOT_H__ */ diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c index ae49256b7c8c..9611bfceda4b 100644 --- a/fs/gfs2/aops.c +++ b/fs/gfs2/aops.c @@ -130,7 +130,7 @@ static int __gfs2_jdata_write_folio(struct folio *folio, if (folio_test_checked(folio)) { folio_clear_checked(folio); if (!folio_buffers(folio)) { - folio_create_empty_buffers(folio, + create_empty_buffers(folio, inode->i_sb->s_blocksize, BIT(BH_Dirty)|BIT(BH_Uptodate)); } @@ -155,7 +155,7 @@ static int gfs2_jdata_writepage(struct page *page, struct writeback_control *wbc struct gfs2_inode *ip = GFS2_I(inode); struct gfs2_sbd *sdp = GFS2_SB(inode); - if (gfs2_assert_withdraw(sdp, gfs2_glock_is_held_excl(ip->i_gl))) + if (gfs2_assert_withdraw(sdp, ip->i_gl->gl_state == LM_ST_EXCLUSIVE)) goto out; if (folio_test_checked(folio) || current->journal_info) goto out_ignore; @@ -183,13 +183,13 @@ static int gfs2_writepages(struct address_space *mapping, int ret; /* - * Even if we didn't write any pages here, we might still be holding + * Even if we didn't write enough pages here, we might still be holding * dirty pages in the ail. We forcibly flush the ail because we don't * want balance_dirty_pages() to loop indefinitely trying to write out * pages held in the ail that it can't find. */ ret = iomap_writepages(mapping, wbc, &wpc, &gfs2_writeback_ops); - if (ret == 0) + if (ret == 0 && wbc->nr_to_write > 0) set_bit(SDF_FORCE_AIL_FLUSH, &sdp->sd_flags); return ret; } @@ -214,12 +214,12 @@ static int gfs2_write_jdata_batch(struct address_space *mapping, unsigned nrblocks; int i; int ret; - int nr_pages = 0; + size_t size = 0; int nr_folios = folio_batch_count(fbatch); for (i = 0; i < nr_folios; i++) - nr_pages += folio_nr_pages(fbatch->folios[i]); - nrblocks = nr_pages * (PAGE_SIZE >> inode->i_blkbits); + size += folio_size(fbatch->folios[i]); + nrblocks = size >> inode->i_blkbits; ret = gfs2_trans_begin(sdp, nrblocks, nrblocks); if (ret < 0) @@ -272,8 +272,7 @@ continue_unlock: * not be suitable for data integrity * writeout). */ - *done_index = folio->index + - folio_nr_pages(folio); + *done_index = folio_next_index(folio); ret = 1; break; } @@ -404,27 +403,27 @@ static int gfs2_jdata_writepages(struct address_space *mapping, } /** - * stuffed_readpage - Fill in a Linux page with stuffed file data + * stuffed_readpage - Fill in a Linux folio with stuffed file data * @ip: the inode - * @page: the page + * @folio: the folio * * Returns: errno */ -static int stuffed_readpage(struct gfs2_inode *ip, struct page *page) +static int stuffed_readpage(struct gfs2_inode *ip, struct folio *folio) { struct buffer_head *dibh; - u64 dsize = i_size_read(&ip->i_inode); - void *kaddr; + size_t i_size = i_size_read(&ip->i_inode); + void *data; int error; /* * Due to the order of unstuffing files and ->fault(), we can be - * asked for a zero page in the case of a stuffed file being extended, + * asked for a zero folio in the case of a stuffed file being extended, * so we need to supply one here. It doesn't happen often. */ - if (unlikely(page->index)) { - zero_user(page, 0, PAGE_SIZE); - SetPageUptodate(page); + if (unlikely(folio->index)) { + folio_zero_range(folio, 0, folio_size(folio)); + folio_mark_uptodate(folio); return 0; } @@ -432,13 +431,11 @@ static int stuffed_readpage(struct gfs2_inode *ip, struct page *page) if (error) return error; - kaddr = kmap_local_page(page); - memcpy(kaddr, dibh->b_data + sizeof(struct gfs2_dinode), dsize); - memset(kaddr + dsize, 0, PAGE_SIZE - dsize); - kunmap_local(kaddr); - flush_dcache_page(page); + data = dibh->b_data + sizeof(struct gfs2_dinode); + memcpy_to_folio(folio, 0, data, i_size); + folio_zero_range(folio, i_size, folio_size(folio) - i_size); brelse(dibh); - SetPageUptodate(page); + folio_mark_uptodate(folio); return 0; } @@ -459,7 +456,7 @@ static int gfs2_read_folio(struct file *file, struct folio *folio) (i_blocksize(inode) == PAGE_SIZE && !folio_buffers(folio))) { error = iomap_read_folio(folio, &gfs2_iomap_ops); } else if (gfs2_is_stuffed(ip)) { - error = stuffed_readpage(ip, &folio->page); + error = stuffed_readpage(ip, folio); folio_unlock(folio); } else { error = mpage_read_folio(folio, gfs2_block_map); @@ -480,31 +477,29 @@ static int gfs2_read_folio(struct file *file, struct folio *folio) * */ -int gfs2_internal_read(struct gfs2_inode *ip, char *buf, loff_t *pos, - unsigned size) +ssize_t gfs2_internal_read(struct gfs2_inode *ip, char *buf, loff_t *pos, + size_t size) { struct address_space *mapping = ip->i_inode.i_mapping; unsigned long index = *pos >> PAGE_SHIFT; - unsigned offset = *pos & (PAGE_SIZE - 1); - unsigned copied = 0; - unsigned amt; - struct page *page; + size_t copied = 0; do { - page = read_cache_page(mapping, index, gfs2_read_folio, NULL); - if (IS_ERR(page)) { - if (PTR_ERR(page) == -EINTR) + size_t offset, chunk; + struct folio *folio; + + folio = read_cache_folio(mapping, index, gfs2_read_folio, NULL); + if (IS_ERR(folio)) { + if (PTR_ERR(folio) == -EINTR) continue; - return PTR_ERR(page); + return PTR_ERR(folio); } - amt = size - copied; - if (offset + size > PAGE_SIZE) - amt = PAGE_SIZE - offset; - memcpy_from_page(buf + copied, page, offset, amt); - put_page(page); - copied += amt; - index++; - offset = 0; + offset = *pos + copied - folio_pos(folio); + chunk = min(size - copied, folio_size(folio) - offset); + memcpy_from_folio(buf + copied, folio, offset, chunk); + index = folio_next_index(folio); + folio_put(folio); + copied += chunk; } while(copied < size); (*pos) += size; return size; @@ -747,7 +742,7 @@ static const struct address_space_operations gfs2_aops = { .writepages = gfs2_writepages, .read_folio = gfs2_read_folio, .readahead = gfs2_readahead, - .dirty_folio = filemap_dirty_folio, + .dirty_folio = iomap_dirty_folio, .release_folio = iomap_release_folio, .invalidate_folio = iomap_invalidate_folio, .bmap = gfs2_bmap, diff --git a/fs/gfs2/aops.h b/fs/gfs2/aops.h index f08322ef41cf..a10c4334d248 100644 --- a/fs/gfs2/aops.h +++ b/fs/gfs2/aops.h @@ -8,8 +8,8 @@ #include "incore.h" -extern void adjust_fs_space(struct inode *inode); -extern void gfs2_trans_add_databufs(struct gfs2_inode *ip, struct folio *folio, - size_t from, size_t len); +void adjust_fs_space(struct inode *inode); +void gfs2_trans_add_databufs(struct gfs2_inode *ip, struct folio *folio, + size_t from, size_t len); #endif /* __AOPS_DOT_H__ */ diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c index 8d611fbcf0bd..d9ccfd27e4f1 100644 --- a/fs/gfs2/bmap.c +++ b/fs/gfs2/bmap.c @@ -43,53 +43,51 @@ struct metapath { static int punch_hole(struct gfs2_inode *ip, u64 offset, u64 length); /** - * gfs2_unstuffer_page - unstuff a stuffed inode into a block cached by a page + * gfs2_unstuffer_folio - unstuff a stuffed inode into a block cached by a folio * @ip: the inode * @dibh: the dinode buffer * @block: the block number that was allocated - * @page: The (optional) page. This is looked up if @page is NULL + * @folio: The folio. * * Returns: errno */ - -static int gfs2_unstuffer_page(struct gfs2_inode *ip, struct buffer_head *dibh, - u64 block, struct page *page) +static int gfs2_unstuffer_folio(struct gfs2_inode *ip, struct buffer_head *dibh, + u64 block, struct folio *folio) { struct inode *inode = &ip->i_inode; - if (!PageUptodate(page)) { - void *kaddr = kmap(page); + if (!folio_test_uptodate(folio)) { + void *kaddr = kmap_local_folio(folio, 0); u64 dsize = i_size_read(inode); memcpy(kaddr, dibh->b_data + sizeof(struct gfs2_dinode), dsize); - memset(kaddr + dsize, 0, PAGE_SIZE - dsize); - kunmap(page); + memset(kaddr + dsize, 0, folio_size(folio) - dsize); + kunmap_local(kaddr); - SetPageUptodate(page); + folio_mark_uptodate(folio); } if (gfs2_is_jdata(ip)) { - struct buffer_head *bh; + struct buffer_head *bh = folio_buffers(folio); - if (!page_has_buffers(page)) - create_empty_buffers(page, BIT(inode->i_blkbits), - BIT(BH_Uptodate)); + if (!bh) + bh = create_empty_buffers(folio, + BIT(inode->i_blkbits), BIT(BH_Uptodate)); - bh = page_buffers(page); if (!buffer_mapped(bh)) map_bh(bh, inode->i_sb, block); set_buffer_uptodate(bh); gfs2_trans_add_data(ip->i_gl, bh); } else { - set_page_dirty(page); + folio_mark_dirty(folio); gfs2_ordered_add_inode(ip); } return 0; } -static int __gfs2_unstuff_inode(struct gfs2_inode *ip, struct page *page) +static int __gfs2_unstuff_inode(struct gfs2_inode *ip, struct folio *folio) { struct buffer_head *bh, *dibh; struct gfs2_dinode *di; @@ -106,7 +104,7 @@ static int __gfs2_unstuff_inode(struct gfs2_inode *ip, struct page *page) and write it out to disk */ unsigned int n = 1; - error = gfs2_alloc_blocks(ip, &block, &n, 0, NULL); + error = gfs2_alloc_blocks(ip, &block, &n, 0); if (error) goto out_brelse; if (isdir) { @@ -118,7 +116,7 @@ static int __gfs2_unstuff_inode(struct gfs2_inode *ip, struct page *page) dibh, sizeof(struct gfs2_dinode)); brelse(bh); } else { - error = gfs2_unstuffer_page(ip, dibh, block, page); + error = gfs2_unstuffer_folio(ip, dibh, block, folio); if (error) goto out_brelse; } @@ -157,17 +155,17 @@ out_brelse: int gfs2_unstuff_dinode(struct gfs2_inode *ip) { struct inode *inode = &ip->i_inode; - struct page *page; + struct folio *folio; int error; down_write(&ip->i_rw_mutex); - page = find_or_create_page(inode->i_mapping, 0, GFP_NOFS); - error = -ENOMEM; - if (!page) + folio = filemap_grab_folio(inode->i_mapping, 0); + error = PTR_ERR(folio); + if (IS_ERR(folio)) goto out; - error = __gfs2_unstuff_inode(ip, page); - unlock_page(page); - put_page(page); + error = __gfs2_unstuff_inode(ip, folio); + folio_unlock(folio); + folio_put(folio); out: up_write(&ip->i_rw_mutex); return error; @@ -317,6 +315,12 @@ static void gfs2_metapath_ra(struct gfs2_glock *gl, __be64 *start, __be64 *end) } } +static inline struct buffer_head * +metapath_dibh(struct metapath *mp) +{ + return mp->mp_bh[0]; +} + static int __fillup_metapath(struct gfs2_inode *ip, struct metapath *mp, unsigned int x, unsigned int h) { @@ -415,13 +419,12 @@ static void release_metapath(struct metapath *mp) * gfs2_extent_length - Returns length of an extent of blocks * @bh: The metadata block * @ptr: Current position in @bh - * @limit: Max extent length to return * @eob: Set to 1 if we hit "end of block" * * Returns: The length of the extent (minimum of one block) */ -static inline unsigned int gfs2_extent_length(struct buffer_head *bh, __be64 *ptr, size_t limit, int *eob) +static inline unsigned int gfs2_extent_length(struct buffer_head *bh, __be64 *ptr, int *eob) { const __be64 *end = (__be64 *)(bh->b_data + bh->b_size); const __be64 *first = ptr; @@ -660,7 +663,7 @@ static int __gfs2_iomap_alloc(struct inode *inode, struct iomap *iomap, { struct gfs2_inode *ip = GFS2_I(inode); struct gfs2_sbd *sdp = GFS2_SB(inode); - struct buffer_head *dibh = mp->mp_bh[0]; + struct buffer_head *dibh = metapath_dibh(mp); u64 bn; unsigned n, i, blks, alloced = 0, iblks = 0, branch_start = 0; size_t dblks = iomap->length >> inode->i_blkbits; @@ -702,7 +705,7 @@ static int __gfs2_iomap_alloc(struct inode *inode, struct iomap *iomap, i = mp->mp_aheight; do { n = blks - alloced; - ret = gfs2_alloc_blocks(ip, &bn, &n, 0, NULL); + ret = gfs2_alloc_blocks(ip, &bn, &n, 0); if (ret) goto out; alloced += n; @@ -913,7 +916,7 @@ unstuff: goto do_alloc; bh = mp->mp_bh[ip->i_height - 1]; - len = gfs2_extent_length(bh, ptr, len, &eob); + len = gfs2_extent_length(bh, ptr, &eob); iomap->addr = be64_to_cpu(*ptr) << inode->i_blkbits; iomap->length = len << inode->i_blkbits; @@ -971,7 +974,7 @@ gfs2_iomap_get_folio(struct iomap_iter *iter, loff_t pos, unsigned len) if (status) return ERR_PTR(status); - folio = iomap_get_folio(iter, pos); + folio = iomap_get_folio(iter, pos, len); if (IS_ERR(folio)) gfs2_trans_end(sdp); return folio; @@ -1386,7 +1389,7 @@ static int trunc_start(struct inode *inode, u64 newsize) ip->i_diskflags |= GFS2_DIF_TRUNC_IN_PROG; i_size_write(inode, newsize); - ip->i_inode.i_mtime = ip->i_inode.i_ctime = current_time(&ip->i_inode); + inode_set_mtime_to_ts(&ip->i_inode, inode_set_ctime_current(&ip->i_inode)); gfs2_dinode_out(ip, dibh->b_data); if (journaled) @@ -1583,8 +1586,7 @@ out_unlock: /* Every transaction boundary, we rewrite the dinode to keep its di_blocks current in case of failure. */ - ip->i_inode.i_mtime = ip->i_inode.i_ctime = - current_time(&ip->i_inode); + inode_set_mtime_to_ts(&ip->i_inode, inode_set_ctime_current(&ip->i_inode)); gfs2_trans_add_meta(ip->i_gl, dibh); gfs2_dinode_out(ip, dibh->b_data); brelse(dibh); @@ -1950,7 +1952,7 @@ static int punch_hole(struct gfs2_inode *ip, u64 offset, u64 length) gfs2_statfs_change(sdp, 0, +btotal, 0); gfs2_quota_change(ip, -(s64)btotal, ip->i_inode.i_uid, ip->i_inode.i_gid); - ip->i_inode.i_mtime = ip->i_inode.i_ctime = current_time(&ip->i_inode); + inode_set_mtime_to_ts(&ip->i_inode, inode_set_ctime_current(&ip->i_inode)); gfs2_trans_add_meta(ip->i_gl, dibh); gfs2_dinode_out(ip, dibh->b_data); up_write(&ip->i_rw_mutex); @@ -1993,7 +1995,7 @@ static int trunc_end(struct gfs2_inode *ip) gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode)); gfs2_ordered_del_inode(ip); } - ip->i_inode.i_mtime = ip->i_inode.i_ctime = current_time(&ip->i_inode); + inode_set_mtime_to_ts(&ip->i_inode, inode_set_ctime_current(&ip->i_inode)); ip->i_diskflags &= ~GFS2_DIF_TRUNC_IN_PROG; gfs2_trans_add_meta(ip->i_gl, dibh); @@ -2094,7 +2096,7 @@ static int do_grow(struct inode *inode, u64 size) goto do_end_trans; truncate_setsize(inode, size); - ip->i_inode.i_mtime = ip->i_inode.i_ctime = current_time(&ip->i_inode); + inode_set_mtime_to_ts(&ip->i_inode, inode_set_ctime_current(&ip->i_inode)); gfs2_trans_add_meta(ip->i_gl, dibh); gfs2_dinode_out(ip, dibh->b_data); brelse(dibh); diff --git a/fs/gfs2/bmap.h b/fs/gfs2/bmap.h index e5b7d17131ed..4e8b1e8ebdf3 100644 --- a/fs/gfs2/bmap.h +++ b/fs/gfs2/bmap.h @@ -46,24 +46,24 @@ static inline void gfs2_write_calc_reserv(const struct gfs2_inode *ip, extern const struct iomap_ops gfs2_iomap_ops; extern const struct iomap_writeback_ops gfs2_writeback_ops; -extern int gfs2_unstuff_dinode(struct gfs2_inode *ip); -extern int gfs2_block_map(struct inode *inode, sector_t lblock, - struct buffer_head *bh, int create); -extern int gfs2_iomap_get(struct inode *inode, loff_t pos, loff_t length, - struct iomap *iomap); -extern int gfs2_iomap_alloc(struct inode *inode, loff_t pos, loff_t length, - struct iomap *iomap); -extern int gfs2_get_extent(struct inode *inode, u64 lblock, u64 *dblock, - unsigned int *extlen); -extern int gfs2_alloc_extent(struct inode *inode, u64 lblock, u64 *dblock, - unsigned *extlen, bool *new); -extern int gfs2_setattr_size(struct inode *inode, u64 size); -extern int gfs2_truncatei_resume(struct gfs2_inode *ip); -extern int gfs2_file_dealloc(struct gfs2_inode *ip); -extern int gfs2_write_alloc_required(struct gfs2_inode *ip, u64 offset, - unsigned int len); -extern int gfs2_map_journal_extents(struct gfs2_sbd *sdp, struct gfs2_jdesc *jd); -extern void gfs2_free_journal_extents(struct gfs2_jdesc *jd); -extern int __gfs2_punch_hole(struct file *file, loff_t offset, loff_t length); +int gfs2_unstuff_dinode(struct gfs2_inode *ip); +int gfs2_block_map(struct inode *inode, sector_t lblock, + struct buffer_head *bh, int create); +int gfs2_iomap_get(struct inode *inode, loff_t pos, loff_t length, + struct iomap *iomap); +int gfs2_iomap_alloc(struct inode *inode, loff_t pos, loff_t length, + struct iomap *iomap); +int gfs2_get_extent(struct inode *inode, u64 lblock, u64 *dblock, + unsigned int *extlen); +int gfs2_alloc_extent(struct inode *inode, u64 lblock, u64 *dblock, + unsigned *extlen, bool *new); +int gfs2_setattr_size(struct inode *inode, u64 size); +int gfs2_truncatei_resume(struct gfs2_inode *ip); +int gfs2_file_dealloc(struct gfs2_inode *ip); +int gfs2_write_alloc_required(struct gfs2_inode *ip, u64 offset, + unsigned int len); +int gfs2_map_journal_extents(struct gfs2_sbd *sdp, struct gfs2_jdesc *jd); +void gfs2_free_journal_extents(struct gfs2_jdesc *jd); +int __gfs2_punch_hole(struct file *file, loff_t offset, loff_t length); #endif /* __BMAP_DOT_H__ */ diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c index 54a6d17b8c25..560e4624c09f 100644 --- a/fs/gfs2/dir.c +++ b/fs/gfs2/dir.c @@ -130,7 +130,7 @@ static int gfs2_dir_write_stuffed(struct gfs2_inode *ip, const char *buf, memcpy(dibh->b_data + offset + sizeof(struct gfs2_dinode), buf, size); if (ip->i_inode.i_size < offset + size) i_size_write(&ip->i_inode, offset + size); - ip->i_inode.i_mtime = ip->i_inode.i_ctime = current_time(&ip->i_inode); + inode_set_mtime_to_ts(&ip->i_inode, inode_set_ctime_current(&ip->i_inode)); gfs2_dinode_out(ip, dibh->b_data); brelse(dibh); @@ -227,7 +227,7 @@ out: if (ip->i_inode.i_size < offset + copied) i_size_write(&ip->i_inode, offset + copied); - ip->i_inode.i_mtime = ip->i_inode.i_ctime = current_time(&ip->i_inode); + inode_set_mtime_to_ts(&ip->i_inode, inode_set_ctime_current(&ip->i_inode)); gfs2_trans_add_meta(ip->i_gl, dibh); gfs2_dinode_out(ip, dibh->b_data); @@ -868,7 +868,7 @@ static struct gfs2_leaf *new_leaf(struct inode *inode, struct buffer_head **pbh, struct gfs2_dirent *dent; struct timespec64 tv = current_time(inode); - error = gfs2_alloc_blocks(ip, &bn, &n, 0, NULL); + error = gfs2_alloc_blocks(ip, &bn, &n, 0); if (error) return NULL; bh = gfs2_meta_new(ip->i_gl, bn); @@ -1814,7 +1814,7 @@ int gfs2_dir_add(struct inode *inode, const struct qstr *name, gfs2_inum_out(nip, dent); dent->de_type = cpu_to_be16(IF2DT(nip->i_inode.i_mode)); dent->de_rahead = cpu_to_be16(gfs2_inode_ra_len(nip)); - tv = current_time(&ip->i_inode); + tv = inode_set_ctime_current(&ip->i_inode); if (ip->i_diskflags & GFS2_DIF_EXHASH) { leaf = (struct gfs2_leaf *)bh->b_data; be16_add_cpu(&leaf->lf_entries, 1); @@ -1825,7 +1825,7 @@ int gfs2_dir_add(struct inode *inode, const struct qstr *name, da->bh = NULL; brelse(bh); ip->i_entries++; - ip->i_inode.i_mtime = ip->i_inode.i_ctime = tv; + inode_set_mtime_to_ts(&ip->i_inode, tv); if (S_ISDIR(nip->i_inode.i_mode)) inc_nlink(&ip->i_inode); mark_inode_dirty(inode); @@ -1876,7 +1876,7 @@ int gfs2_dir_del(struct gfs2_inode *dip, const struct dentry *dentry) const struct qstr *name = &dentry->d_name; struct gfs2_dirent *dent, *prev = NULL; struct buffer_head *bh; - struct timespec64 tv = current_time(&dip->i_inode); + struct timespec64 tv; /* Returns _either_ the entry (if its first in block) or the previous entry otherwise */ @@ -1896,6 +1896,7 @@ int gfs2_dir_del(struct gfs2_inode *dip, const struct dentry *dentry) } dirent_del(dip, bh, prev, dent); + tv = inode_set_ctime_current(&dip->i_inode); if (dip->i_diskflags & GFS2_DIF_EXHASH) { struct gfs2_leaf *leaf = (struct gfs2_leaf *)bh->b_data; u16 entries = be16_to_cpu(leaf->lf_entries); @@ -1910,7 +1911,7 @@ int gfs2_dir_del(struct gfs2_inode *dip, const struct dentry *dentry) if (!dip->i_entries) gfs2_consist_inode(dip); dip->i_entries--; - dip->i_inode.i_mtime = dip->i_inode.i_ctime = tv; + inode_set_mtime_to_ts(&dip->i_inode, tv); if (d_is_dir(dentry)) drop_nlink(&dip->i_inode); mark_inode_dirty(&dip->i_inode); @@ -1951,7 +1952,7 @@ int gfs2_dir_mvino(struct gfs2_inode *dip, const struct qstr *filename, dent->de_type = cpu_to_be16(new_type); brelse(bh); - dip->i_inode.i_mtime = dip->i_inode.i_ctime = current_time(&dip->i_inode); + inode_set_mtime_to_ts(&dip->i_inode, inode_set_ctime_current(&dip->i_inode)); mark_inode_dirty_sync(&dip->i_inode); return 0; } diff --git a/fs/gfs2/dir.h b/fs/gfs2/dir.h index 5b76480c17c9..25a857c78b53 100644 --- a/fs/gfs2/dir.h +++ b/fs/gfs2/dir.h @@ -23,32 +23,32 @@ struct gfs2_diradd { int save_loc; }; -extern struct inode *gfs2_dir_search(struct inode *dir, - const struct qstr *filename, - bool fail_on_exist); -extern int gfs2_dir_check(struct inode *dir, const struct qstr *filename, - const struct gfs2_inode *ip); -extern int gfs2_dir_add(struct inode *inode, const struct qstr *filename, - const struct gfs2_inode *ip, struct gfs2_diradd *da); +struct inode *gfs2_dir_search(struct inode *dir, + const struct qstr *filename, + bool fail_on_exist); +int gfs2_dir_check(struct inode *dir, const struct qstr *filename, + const struct gfs2_inode *ip); +int gfs2_dir_add(struct inode *inode, const struct qstr *filename, + const struct gfs2_inode *ip, struct gfs2_diradd *da); static inline void gfs2_dir_no_add(struct gfs2_diradd *da) { brelse(da->bh); da->bh = NULL; } -extern int gfs2_dir_del(struct gfs2_inode *dip, const struct dentry *dentry); -extern int gfs2_dir_read(struct inode *inode, struct dir_context *ctx, - struct file_ra_state *f_ra); -extern int gfs2_dir_mvino(struct gfs2_inode *dip, const struct qstr *filename, - const struct gfs2_inode *nip, unsigned int new_type); +int gfs2_dir_del(struct gfs2_inode *dip, const struct dentry *dentry); +int gfs2_dir_read(struct inode *inode, struct dir_context *ctx, + struct file_ra_state *f_ra); +int gfs2_dir_mvino(struct gfs2_inode *dip, const struct qstr *filename, + const struct gfs2_inode *nip, unsigned int new_type); -extern int gfs2_dir_exhash_dealloc(struct gfs2_inode *dip); +int gfs2_dir_exhash_dealloc(struct gfs2_inode *dip); -extern int gfs2_diradd_alloc_required(struct inode *dir, - const struct qstr *filename, - struct gfs2_diradd *da); -extern int gfs2_dir_get_new_buffer(struct gfs2_inode *ip, u64 block, - struct buffer_head **bhp); -extern void gfs2_dir_hash_inval(struct gfs2_inode *ip); +int gfs2_diradd_alloc_required(struct inode *dir, + const struct qstr *filename, + struct gfs2_diradd *da); +int gfs2_dir_get_new_buffer(struct gfs2_inode *ip, u64 block, + struct buffer_head **bhp); +void gfs2_dir_hash_inval(struct gfs2_inode *ip); static inline u32 gfs2_disk_hash(const char *data, int len) { diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index b43fa8b8fc05..4b66efc1a82a 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -260,7 +260,7 @@ static int do_gfs2_set_flags(struct inode *inode, u32 reqflags, u32 mask) error = gfs2_meta_inode_buffer(ip, &bh); if (error) goto out_trans_end; - inode->i_ctime = current_time(inode); + inode_set_ctime_current(inode); gfs2_trans_add_meta(ip->i_gl, bh); ip->i_diskflags = new_flags; gfs2_dinode_out(ip, bh->b_data); @@ -418,7 +418,7 @@ static vm_fault_t gfs2_page_mkwrite(struct vm_fault *vmf) struct inode *inode = file_inode(vmf->vma->vm_file); struct gfs2_inode *ip = GFS2_I(inode); struct gfs2_sbd *sdp = GFS2_SB(inode); - struct gfs2_alloc_parms ap = { .aflags = 0, }; + struct gfs2_alloc_parms ap = {}; u64 offset = page_offset(page); unsigned int data_blocks, ind_blocks, rblocks; vm_fault_t ret = VM_FAULT_LOCKED; @@ -432,7 +432,7 @@ static vm_fault_t gfs2_page_mkwrite(struct vm_fault *vmf) gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh); err = gfs2_glock_nq(&gh); if (err) { - ret = block_page_mkwrite_return(err); + ret = vmf_fs_error(err); goto out_uninit; } @@ -474,7 +474,7 @@ static vm_fault_t gfs2_page_mkwrite(struct vm_fault *vmf) err = gfs2_rindex_update(sdp); if (err) { - ret = block_page_mkwrite_return(err); + ret = vmf_fs_error(err); goto out_unlock; } @@ -482,12 +482,12 @@ static vm_fault_t gfs2_page_mkwrite(struct vm_fault *vmf) ap.target = data_blocks + ind_blocks; err = gfs2_quota_lock_check(ip, &ap); if (err) { - ret = block_page_mkwrite_return(err); + ret = vmf_fs_error(err); goto out_unlock; } err = gfs2_inplace_reserve(ip, &ap); if (err) { - ret = block_page_mkwrite_return(err); + ret = vmf_fs_error(err); goto out_quota_unlock; } @@ -500,7 +500,7 @@ static vm_fault_t gfs2_page_mkwrite(struct vm_fault *vmf) } err = gfs2_trans_begin(sdp, rblocks, 0); if (err) { - ret = block_page_mkwrite_return(err); + ret = vmf_fs_error(err); goto out_trans_fail; } @@ -508,7 +508,7 @@ static vm_fault_t gfs2_page_mkwrite(struct vm_fault *vmf) if (gfs2_is_stuffed(ip)) { err = gfs2_unstuff_dinode(ip); if (err) { - ret = block_page_mkwrite_return(err); + ret = vmf_fs_error(err); goto out_trans_end; } } @@ -524,7 +524,7 @@ static vm_fault_t gfs2_page_mkwrite(struct vm_fault *vmf) err = gfs2_allocate_page_backing(page, length); if (err) - ret = block_page_mkwrite_return(err); + ret = vmf_fs_error(err); out_page_locked: if (ret != VM_FAULT_LOCKED) @@ -558,7 +558,7 @@ static vm_fault_t gfs2_fault(struct vm_fault *vmf) gfs2_holder_init(ip->i_gl, LM_ST_SHARED, 0, &gh); err = gfs2_glock_nq(&gh); if (err) { - ret = block_page_mkwrite_return(err); + ret = vmf_fs_error(err); goto out_uninit; } ret = filemap_fault(vmf); @@ -1120,14 +1120,16 @@ static ssize_t gfs2_file_write_iter(struct kiocb *iocb, struct iov_iter *from) if (ret) goto out_unlock; - ret = file_update_time(file); - if (ret) - goto out_unlock; - if (iocb->ki_flags & IOCB_DIRECT) { struct address_space *mapping = file->f_mapping; ssize_t buffered, ret2; + /* + * Note that under direct I/O, we don't allow and inode + * timestamp updates, so we're not calling file_update_time() + * here. + */ + ret = gfs2_file_direct_write(iocb, from, &gh); if (ret < 0 || !iov_iter_count(from)) goto out_unlock; @@ -1154,6 +1156,10 @@ static ssize_t gfs2_file_write_iter(struct kiocb *iocb, struct iov_iter *from) if (!ret || ret2 > 0) ret += ret2; } else { + ret = file_update_time(file); + if (ret) + goto out_unlock; + ret = gfs2_file_buffered_write(iocb, from, &gh); if (likely(ret > 0)) ret = generic_write_sync(iocb, ret); @@ -1245,7 +1251,7 @@ static long __gfs2_fallocate(struct file *file, int mode, loff_t offset, loff_t struct inode *inode = file_inode(file); struct gfs2_sbd *sdp = GFS2_SB(inode); struct gfs2_inode *ip = GFS2_I(inode); - struct gfs2_alloc_parms ap = { .aflags = 0, }; + struct gfs2_alloc_parms ap = {}; unsigned int data_blocks = 0, ind_blocks = 0, rblocks; loff_t bytes, max_bytes, max_blks; int error; @@ -1436,17 +1442,14 @@ static int gfs2_lock(struct file *file, int cmd, struct file_lock *fl) if (!(fl->fl_flags & FL_POSIX)) return -ENOLCK; - if (cmd == F_CANCELLK) { - /* Hack: */ - cmd = F_SETLK; - fl->fl_type = F_UNLCK; - } if (unlikely(gfs2_withdrawn(sdp))) { if (fl->fl_type == F_UNLCK) locks_lock_file_wait(file, fl); return -EIO; } - if (IS_GETLK(cmd)) + if (cmd == F_CANCELLK) + return dlm_posix_cancel(ls->ls_dlm, ip->i_no_addr, file, fl); + else if (IS_GETLK(cmd)) return dlm_posix_get(ls->ls_dlm, ip->i_no_addr, file, fl); else if (fl->fl_type == F_UNLCK) return dlm_posix_unlock(ls->ls_dlm, ip->i_no_addr, file, fl); diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index 1438e7465e30..d6bf1f8c25dc 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -176,7 +176,7 @@ void gfs2_glock_free(struct gfs2_glock *gl) wake_up_glock(gl); call_rcu(&gl->gl_rcu, gfs2_glock_dealloc); if (atomic_dec_and_test(&sdp->sd_glock_disposal)) - wake_up(&sdp->sd_glock_wait); + wake_up(&sdp->sd_kill_wait); } /** @@ -468,10 +468,10 @@ done: * do_promote - promote as many requests as possible on the current queue * @gl: The glock * - * Returns: 1 if there is a blocked holder at the head of the list + * Returns true on success (i.e., progress was made or there are no waiters). */ -static int do_promote(struct gfs2_glock *gl) +static bool do_promote(struct gfs2_glock *gl) { struct gfs2_holder *gh, *current_gh; @@ -484,10 +484,10 @@ static int do_promote(struct gfs2_glock *gl) * If we get here, it means we may not grant this * holder for some reason. If this holder is at the * head of the list, it means we have a blocked holder - * at the head, so return 1. + * at the head, so return false. */ if (list_is_first(&gh->gh_list, &gl->gl_holders)) - return 1; + return false; do_error(gl, 0); break; } @@ -497,7 +497,7 @@ static int do_promote(struct gfs2_glock *gl) if (!current_gh) current_gh = gh; } - return 0; + return true; } /** @@ -591,10 +591,11 @@ static void finish_xmote(struct gfs2_glock *gl, unsigned int ret) if (gh && !test_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags)) { /* move to back of queue and try next entry */ if (ret & LM_OUT_CANCELED) { - if ((gh->gh_flags & LM_FLAG_PRIORITY) == 0) - list_move_tail(&gh->gh_list, &gl->gl_holders); + list_move_tail(&gh->gh_list, &gl->gl_holders); gh = find_first_waiter(gl); gl->gl_target = gh->gh_state; + if (do_promote(gl)) + goto out; goto retry; } /* Some error or failed "try lock" - report it */ @@ -679,8 +680,7 @@ __acquires(&gl->gl_lockref.lock) gh && !(gh->gh_flags & LM_FLAG_NOEXP)) goto skip_inval; - lck_flags &= (LM_FLAG_TRY | LM_FLAG_TRY_1CB | LM_FLAG_NOEXP | - LM_FLAG_PRIORITY); + lck_flags &= (LM_FLAG_TRY | LM_FLAG_TRY_1CB | LM_FLAG_NOEXP); GLOCK_BUG_ON(gl, gl->gl_state == target); GLOCK_BUG_ON(gl, gl->gl_state == gl->gl_target); if ((target == LM_ST_UNLOCKED || target == LM_ST_DEFERRED) && @@ -834,7 +834,7 @@ __acquires(&gl->gl_lockref.lock) } else { if (test_bit(GLF_DEMOTE, &gl->gl_flags)) gfs2_demote_wake(gl); - if (do_promote(gl) == 0) + if (do_promote(gl)) goto out_unlock; gh = find_first_waiter(gl); gl->gl_target = gh->gh_state; @@ -1022,7 +1022,7 @@ static void delete_work_func(struct work_struct *work) * step entirely. */ if (gfs2_try_evict(gl)) { - if (test_bit(SDF_DEACTIVATING, &sdp->sd_flags)) + if (test_bit(SDF_KILL, &sdp->sd_flags)) goto out; if (gfs2_queue_verify_evict(gl)) return; @@ -1035,7 +1035,7 @@ static void delete_work_func(struct work_struct *work) GFS2_BLKST_UNLINKED); if (IS_ERR(inode)) { if (PTR_ERR(inode) == -EAGAIN && - !test_bit(SDF_DEACTIVATING, &sdp->sd_flags) && + !test_bit(SDF_KILL, &sdp->sd_flags) && gfs2_queue_verify_evict(gl)) return; } else { @@ -1231,7 +1231,7 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number, out_free: gfs2_glock_dealloc(&gl->gl_rcu); if (atomic_dec_and_test(&sdp->sd_glock_disposal)) - wake_up(&sdp->sd_glock_wait); + wake_up(&sdp->sd_kill_wait); out: return ret; @@ -1515,27 +1515,19 @@ fail: } if (test_bit(HIF_HOLDER, &gh2->gh_iflags)) continue; - if (unlikely((gh->gh_flags & LM_FLAG_PRIORITY) && !insert_pt)) - insert_pt = &gh2->gh_list; } trace_gfs2_glock_queue(gh, 1); gfs2_glstats_inc(gl, GFS2_LKS_QCOUNT); gfs2_sbstats_inc(gl, GFS2_LKS_QCOUNT); if (likely(insert_pt == NULL)) { list_add_tail(&gh->gh_list, &gl->gl_holders); - if (unlikely(gh->gh_flags & LM_FLAG_PRIORITY)) - goto do_cancel; return; } list_add_tail(&gh->gh_list, insert_pt); -do_cancel: - gh = list_first_entry(&gl->gl_holders, struct gfs2_holder, gh_list); - if (!(gh->gh_flags & LM_FLAG_PRIORITY)) { - spin_unlock(&gl->gl_lockref.lock); - if (sdp->sd_lockstruct.ls_ops->lm_cancel) - sdp->sd_lockstruct.ls_ops->lm_cancel(gl); - spin_lock(&gl->gl_lockref.lock); - } + spin_unlock(&gl->gl_lockref.lock); + if (sdp->sd_lockstruct.ls_ops->lm_cancel) + sdp->sd_lockstruct.ls_ops->lm_cancel(gl); + spin_lock(&gl->gl_lockref.lock); return; trap_recursive: @@ -2017,7 +2009,9 @@ static long gfs2_scan_glock_lru(int nr) if (!test_bit(GLF_LOCK, &gl->gl_flags)) { if (!spin_trylock(&gl->gl_lockref.lock)) continue; - if (!gl->gl_lockref.count) { + if (gl->gl_lockref.count <= 1 && + (gl->gl_state == LM_ST_UNLOCKED || + demote_ok(gl))) { list_move(&gl->gl_lru, &dispose); atomic_dec(&lru_count); freed++; @@ -2046,11 +2040,7 @@ static unsigned long gfs2_glock_shrink_count(struct shrinker *shrink, return vfs_pressure_ratio(atomic_read(&lru_count)); } -static struct shrinker glock_shrinker = { - .seeks = DEFAULT_SEEKS, - .count_objects = gfs2_glock_shrink_count, - .scan_objects = gfs2_glock_shrink_scan, -}; +static struct shrinker *glock_shrinker; /** * glock_hash_walk - Call a function for glock in a hash bucket @@ -2195,7 +2185,7 @@ void gfs2_gl_hash_clear(struct gfs2_sbd *sdp) flush_workqueue(glock_workqueue); glock_hash_walk(clear_glock, sdp); flush_workqueue(glock_workqueue); - wait_event_timeout(sdp->sd_glock_wait, + wait_event_timeout(sdp->sd_kill_wait, atomic_read(&sdp->sd_glock_disposal) == 0, HZ * 600); glock_hash_walk(dump_glock_func, sdp); @@ -2227,8 +2217,6 @@ static const char *hflags2str(char *buf, u16 flags, unsigned long iflags) *p++ = 'e'; if (flags & LM_FLAG_ANY) *p++ = 'A'; - if (flags & LM_FLAG_PRIORITY) - *p++ = 'p'; if (flags & LM_FLAG_NODE_SCOPE) *p++ = 'n'; if (flags & GL_ASYNC) @@ -2472,13 +2460,18 @@ int __init gfs2_glock_init(void) return -ENOMEM; } - ret = register_shrinker(&glock_shrinker, "gfs2-glock"); - if (ret) { + glock_shrinker = shrinker_alloc(0, "gfs2-glock"); + if (!glock_shrinker) { destroy_workqueue(glock_workqueue); rhashtable_destroy(&gl_hash_table); - return ret; + return -ENOMEM; } + glock_shrinker->count_objects = gfs2_glock_shrink_count; + glock_shrinker->scan_objects = gfs2_glock_shrink_scan; + + shrinker_register(glock_shrinker); + for (i = 0; i < GLOCK_WAIT_TABLE_SIZE; i++) init_waitqueue_head(glock_wait_table + i); @@ -2487,7 +2480,7 @@ int __init gfs2_glock_init(void) void gfs2_glock_exit(void) { - unregister_shrinker(&glock_shrinker); + shrinker_free(glock_shrinker); rhashtable_destroy(&gl_hash_table); destroy_workqueue(glock_workqueue); } @@ -2726,16 +2719,19 @@ static struct file *gfs2_glockfd_next_file(struct gfs2_glockfd_iter *i) for(;; i->fd++) { struct inode *inode; - i->file = task_lookup_next_fd_rcu(i->task, &i->fd); + i->file = task_lookup_next_fdget_rcu(i->task, &i->fd); if (!i->file) { i->fd = 0; break; } + inode = file_inode(i->file); - if (inode->i_sb != i->sb) - continue; - if (get_file_rcu(i->file)) + if (inode->i_sb == i->sb) break; + + rcu_read_unlock(); + fput(i->file); + rcu_read_lock(); } rcu_read_unlock(); return i->file; diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h index 1f1ba92c15a8..61197598abfd 100644 --- a/fs/gfs2/glock.h +++ b/fs/gfs2/glock.h @@ -68,14 +68,6 @@ enum { * also be granted in SHARED. The preferred state is whichever is compatible * with other granted locks, or the specified state if no other locks exist. * - * LM_FLAG_PRIORITY - * Override fairness considerations. Suppose a lock is held in a shared state - * and there is a pending request for the deferred state. A shared lock - * request with the priority flag would be allowed to bypass the deferred - * request and directly join the other shared lock. A shared lock request - * without the priority flag might be forced to wait until the deferred - * requested had acquired and released the lock. - * * LM_FLAG_NODE_SCOPE * This holder agrees to share the lock within this node. In other words, * the glock is held in EX mode according to DLM, but local holders on the @@ -86,7 +78,6 @@ enum { #define LM_FLAG_TRY_1CB 0x0002 #define LM_FLAG_NOEXP 0x0004 #define LM_FLAG_ANY 0x0008 -#define LM_FLAG_PRIORITY 0x0010 #define LM_FLAG_NODE_SCOPE 0x0020 #define GL_ASYNC 0x0040 #define GL_EXACT 0x0080 @@ -165,21 +156,6 @@ out: return gh; } -static inline int gfs2_glock_is_held_excl(struct gfs2_glock *gl) -{ - return gl->gl_state == LM_ST_EXCLUSIVE; -} - -static inline int gfs2_glock_is_held_dfrd(struct gfs2_glock *gl) -{ - return gl->gl_state == LM_ST_DEFERRED; -} - -static inline int gfs2_glock_is_held_shrd(struct gfs2_glock *gl) -{ - return gl->gl_state == LM_ST_SHARED; -} - static inline struct address_space *gfs2_glock2aspace(struct gfs2_glock *gl) { if (gl->gl_ops->go_flags & GLOF_ASPACE) { @@ -190,40 +166,40 @@ static inline struct address_space *gfs2_glock2aspace(struct gfs2_glock *gl) return NULL; } -extern int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number, - const struct gfs2_glock_operations *glops, - int create, struct gfs2_glock **glp); -extern struct gfs2_glock *gfs2_glock_hold(struct gfs2_glock *gl); -extern void gfs2_glock_put(struct gfs2_glock *gl); -extern void gfs2_glock_queue_put(struct gfs2_glock *gl); +int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number, + const struct gfs2_glock_operations *glops, + int create, struct gfs2_glock **glp); +struct gfs2_glock *gfs2_glock_hold(struct gfs2_glock *gl); +void gfs2_glock_put(struct gfs2_glock *gl); +void gfs2_glock_queue_put(struct gfs2_glock *gl); -extern void __gfs2_holder_init(struct gfs2_glock *gl, unsigned int state, - u16 flags, struct gfs2_holder *gh, - unsigned long ip); +void __gfs2_holder_init(struct gfs2_glock *gl, unsigned int state, + u16 flags, struct gfs2_holder *gh, + unsigned long ip); static inline void gfs2_holder_init(struct gfs2_glock *gl, unsigned int state, u16 flags, struct gfs2_holder *gh) { __gfs2_holder_init(gl, state, flags, gh, _RET_IP_); } -extern void gfs2_holder_reinit(unsigned int state, u16 flags, - struct gfs2_holder *gh); -extern void gfs2_holder_uninit(struct gfs2_holder *gh); -extern int gfs2_glock_nq(struct gfs2_holder *gh); -extern int gfs2_glock_poll(struct gfs2_holder *gh); -extern int gfs2_instantiate(struct gfs2_holder *gh); -extern int gfs2_glock_holder_ready(struct gfs2_holder *gh); -extern int gfs2_glock_wait(struct gfs2_holder *gh); -extern int gfs2_glock_async_wait(unsigned int num_gh, struct gfs2_holder *ghs); -extern void gfs2_glock_dq(struct gfs2_holder *gh); -extern void gfs2_glock_dq_wait(struct gfs2_holder *gh); -extern void gfs2_glock_dq_uninit(struct gfs2_holder *gh); -extern int gfs2_glock_nq_num(struct gfs2_sbd *sdp, u64 number, - const struct gfs2_glock_operations *glops, - unsigned int state, u16 flags, - struct gfs2_holder *gh); -extern int gfs2_glock_nq_m(unsigned int num_gh, struct gfs2_holder *ghs); -extern void gfs2_glock_dq_m(unsigned int num_gh, struct gfs2_holder *ghs); -extern void gfs2_dump_glock(struct seq_file *seq, struct gfs2_glock *gl, +void gfs2_holder_reinit(unsigned int state, u16 flags, + struct gfs2_holder *gh); +void gfs2_holder_uninit(struct gfs2_holder *gh); +int gfs2_glock_nq(struct gfs2_holder *gh); +int gfs2_glock_poll(struct gfs2_holder *gh); +int gfs2_instantiate(struct gfs2_holder *gh); +int gfs2_glock_holder_ready(struct gfs2_holder *gh); +int gfs2_glock_wait(struct gfs2_holder *gh); +int gfs2_glock_async_wait(unsigned int num_gh, struct gfs2_holder *ghs); +void gfs2_glock_dq(struct gfs2_holder *gh); +void gfs2_glock_dq_wait(struct gfs2_holder *gh); +void gfs2_glock_dq_uninit(struct gfs2_holder *gh); +int gfs2_glock_nq_num(struct gfs2_sbd *sdp, u64 number, + const struct gfs2_glock_operations *glops, + unsigned int state, u16 flags, + struct gfs2_holder *gh); +int gfs2_glock_nq_m(unsigned int num_gh, struct gfs2_holder *ghs); +void gfs2_glock_dq_m(unsigned int num_gh, struct gfs2_holder *ghs); +void gfs2_dump_glock(struct seq_file *seq, struct gfs2_glock *gl, bool fsid); #define GLOCK_BUG_ON(gl,x) do { if (unlikely(x)) { \ gfs2_dump_glock(NULL, gl, true); \ @@ -237,7 +213,7 @@ extern void gfs2_dump_glock(struct seq_file *seq, struct gfs2_glock *gl, gfs2_assert_withdraw((gl)->gl_name.ln_sbd, (x)); } } \ while (0) -extern __printf(2, 3) +__printf(2, 3) void gfs2_print_dbg(struct seq_file *seq, const char *fmt, ...); /** @@ -265,27 +241,27 @@ static inline int gfs2_glock_nq_init(struct gfs2_glock *gl, return error; } -extern void gfs2_glock_cb(struct gfs2_glock *gl, unsigned int state); -extern void gfs2_glock_complete(struct gfs2_glock *gl, int ret); -extern bool gfs2_queue_try_to_evict(struct gfs2_glock *gl); -extern void gfs2_cancel_delete_work(struct gfs2_glock *gl); -extern void gfs2_flush_delete_work(struct gfs2_sbd *sdp); -extern void gfs2_gl_hash_clear(struct gfs2_sbd *sdp); -extern void gfs2_gl_dq_holders(struct gfs2_sbd *sdp); -extern void gfs2_glock_thaw(struct gfs2_sbd *sdp); -extern void gfs2_glock_add_to_lru(struct gfs2_glock *gl); -extern void gfs2_glock_free(struct gfs2_glock *gl); +void gfs2_glock_cb(struct gfs2_glock *gl, unsigned int state); +void gfs2_glock_complete(struct gfs2_glock *gl, int ret); +bool gfs2_queue_try_to_evict(struct gfs2_glock *gl); +void gfs2_cancel_delete_work(struct gfs2_glock *gl); +void gfs2_flush_delete_work(struct gfs2_sbd *sdp); +void gfs2_gl_hash_clear(struct gfs2_sbd *sdp); +void gfs2_gl_dq_holders(struct gfs2_sbd *sdp); +void gfs2_glock_thaw(struct gfs2_sbd *sdp); +void gfs2_glock_add_to_lru(struct gfs2_glock *gl); +void gfs2_glock_free(struct gfs2_glock *gl); -extern int __init gfs2_glock_init(void); -extern void gfs2_glock_exit(void); +int __init gfs2_glock_init(void); +void gfs2_glock_exit(void); -extern void gfs2_create_debugfs_file(struct gfs2_sbd *sdp); -extern void gfs2_delete_debugfs_file(struct gfs2_sbd *sdp); -extern void gfs2_register_debugfs(void); -extern void gfs2_unregister_debugfs(void); +void gfs2_create_debugfs_file(struct gfs2_sbd *sdp); +void gfs2_delete_debugfs_file(struct gfs2_sbd *sdp); +void gfs2_register_debugfs(void); +void gfs2_unregister_debugfs(void); -extern void glock_set_object(struct gfs2_glock *gl, void *object); -extern void glock_clear_object(struct gfs2_glock *gl, void *object); +void glock_set_object(struct gfs2_glock *gl, void *object); +void glock_clear_object(struct gfs2_glock *gl, void *object); extern const struct lm_lockops gfs2_dlm_ops; @@ -304,7 +280,7 @@ static inline bool gfs2_holder_queued(struct gfs2_holder *gh) return !list_empty(&gh->gh_list); } -extern void gfs2_inode_remember_delete(struct gfs2_glock *gl, u64 generation); -extern bool gfs2_inode_already_deleted(struct gfs2_glock *gl, u64 generation); +void gfs2_inode_remember_delete(struct gfs2_glock *gl, u64 generation); +bool gfs2_inode_already_deleted(struct gfs2_glock *gl, u64 generation); #endif /* __GLOCK_DOT_H__ */ diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c index 54319328b16b..b41c78bd2cc0 100644 --- a/fs/gfs2/glops.c +++ b/fs/gfs2/glops.c @@ -403,7 +403,7 @@ static int gfs2_dinode_in(struct gfs2_inode *ip, const void *buf) { struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); const struct gfs2_dinode *str = buf; - struct timespec64 atime; + struct timespec64 atime, iatime; u16 height, depth; umode_t mode = be32_to_cpu(str->di_mode); struct inode *inode = &ip->i_inode; @@ -433,12 +433,13 @@ static int gfs2_dinode_in(struct gfs2_inode *ip, const void *buf) gfs2_set_inode_blocks(inode, be64_to_cpu(str->di_blocks)); atime.tv_sec = be64_to_cpu(str->di_atime); atime.tv_nsec = be32_to_cpu(str->di_atime_nsec); - if (timespec64_compare(&inode->i_atime, &atime) < 0) - inode->i_atime = atime; - inode->i_mtime.tv_sec = be64_to_cpu(str->di_mtime); - inode->i_mtime.tv_nsec = be32_to_cpu(str->di_mtime_nsec); - inode->i_ctime.tv_sec = be64_to_cpu(str->di_ctime); - inode->i_ctime.tv_nsec = be32_to_cpu(str->di_ctime_nsec); + iatime = inode_get_atime(inode); + if (timespec64_compare(&iatime, &atime) < 0) + inode_set_atime_to_ts(inode, atime); + inode_set_mtime(inode, be64_to_cpu(str->di_mtime), + be32_to_cpu(str->di_mtime_nsec)); + inode_set_ctime(inode, be64_to_cpu(str->di_ctime), + be32_to_cpu(str->di_ctime_nsec)); ip->i_goal = be64_to_cpu(str->di_goal_meta); ip->i_generation = be64_to_cpu(str->di_generation); @@ -567,15 +568,16 @@ static void freeze_go_callback(struct gfs2_glock *gl, bool remote) struct super_block *sb = sdp->sd_vfs; if (!remote || - gl->gl_state != LM_ST_SHARED || + (gl->gl_state != LM_ST_SHARED && + gl->gl_state != LM_ST_UNLOCKED) || gl->gl_demote_state != LM_ST_UNLOCKED) return; /* * Try to get an active super block reference to prevent racing with - * unmount (see trylock_super()). But note that unmount isn't the only - * place where a write lock on s_umount is taken, and we can fail here - * because of things like remount as well. + * unmount (see super_trylock_shared()). But note that unmount isn't + * the only place where a write lock on s_umount is taken, and we can + * fail here because of things like remount as well. */ if (down_read_trylock(&sb->s_umount)) { atomic_inc(&sb->s_active); @@ -613,18 +615,6 @@ static int freeze_go_xmote_bh(struct gfs2_glock *gl) } /** - * freeze_go_demote_ok - * @gl: the glock - * - * Always returns 0 - */ - -static int freeze_go_demote_ok(const struct gfs2_glock *gl) -{ - return 0; -} - -/** * iopen_go_callback - schedule the dcache entry for the inode to be deleted * @gl: the glock * @remote: true if this came from a different cluster node @@ -637,7 +627,7 @@ static void iopen_go_callback(struct gfs2_glock *gl, bool remote) struct gfs2_sbd *sdp = gl->gl_name.ln_sbd; if (!remote || sb_rdonly(sdp->sd_vfs) || - test_bit(SDF_DEACTIVATING, &sdp->sd_flags)) + test_bit(SDF_KILL, &sdp->sd_flags)) return; if (gl->gl_demote_state == LM_ST_UNLOCKED && @@ -743,7 +733,6 @@ const struct gfs2_glock_operations gfs2_rgrp_glops = { const struct gfs2_glock_operations gfs2_freeze_glops = { .go_xmote_bh = freeze_go_xmote_bh, - .go_demote_ok = freeze_go_demote_ok, .go_callback = freeze_go_callback, .go_type = LM_TYPE_NONDISK, .go_flags = GLOF_NONDISK, diff --git a/fs/gfs2/glops.h b/fs/gfs2/glops.h index 695898afcaf1..9341423798df 100644 --- a/fs/gfs2/glops.h +++ b/fs/gfs2/glops.h @@ -22,7 +22,7 @@ extern const struct gfs2_glock_operations gfs2_quota_glops; extern const struct gfs2_glock_operations gfs2_journal_glops; extern const struct gfs2_glock_operations *gfs2_glops_list[]; -extern int gfs2_inode_metasync(struct gfs2_glock *gl); -extern void gfs2_ail_flush(struct gfs2_glock *gl, bool fsync); +int gfs2_inode_metasync(struct gfs2_glock *gl); +void gfs2_ail_flush(struct gfs2_glock *gl, bool fsync); #endif /* __GLOPS_DOT_H__ */ diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h index 04f2d78e8658..95a334d64da2 100644 --- a/fs/gfs2/incore.h +++ b/fs/gfs2/incore.h @@ -452,7 +452,7 @@ struct gfs2_quota_data { s64 qd_change_sync; unsigned int qd_slot; - unsigned int qd_slot_count; + unsigned int qd_slot_ref; struct buffer_head *qd_bh; struct gfs2_quota_change *qd_bh_qc; @@ -537,6 +537,7 @@ struct gfs2_statfs_change_host { #define GFS2_QUOTA_OFF 0 #define GFS2_QUOTA_ACCOUNT 1 #define GFS2_QUOTA_ON 2 +#define GFS2_QUOTA_QUIET 3 /* on but not complaining */ #define GFS2_DATA_DEFAULT GFS2_DATA_ORDERED #define GFS2_DATA_WRITEBACK 1 @@ -606,7 +607,7 @@ enum { SDF_REMOTE_WITHDRAW = 13, /* Performing remote recovery */ SDF_WITHDRAW_RECOVERY = 14, /* Wait for journal recovery when we are withdrawing */ - SDF_DEACTIVATING = 15, + SDF_KILL = 15, SDF_EVICTING = 16, SDF_FROZEN = 17, }; @@ -716,7 +717,7 @@ struct gfs2_sbd { struct gfs2_glock *sd_rename_gl; struct gfs2_glock *sd_freeze_gl; struct work_struct sd_freeze_work; - wait_queue_head_t sd_glock_wait; + wait_queue_head_t sd_kill_wait; wait_queue_head_t sd_async_glock_wait; atomic_t sd_glock_disposal; struct completion sd_locking_init; @@ -862,7 +863,7 @@ static inline void gfs2_sbstats_inc(const struct gfs2_glock *gl, int which) preempt_enable(); } -extern struct gfs2_rgrpd *gfs2_glock2rgrp(struct gfs2_glock *gl); +struct gfs2_rgrpd *gfs2_glock2rgrp(struct gfs2_glock *gl); static inline unsigned gfs2_max_stuffed_size(const struct gfs2_inode *ip) { diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index 17c994a0c0d0..1b95db2c3aac 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -185,8 +185,9 @@ struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned int type, set_bit(GLF_INSTANTIATE_NEEDED, &ip->i_gl->gl_flags); /* Lowest possible timestamp; will be overwritten in gfs2_dinode_in. */ - inode->i_atime.tv_sec = 1LL << (8 * sizeof(inode->i_atime.tv_sec) - 1); - inode->i_atime.tv_nsec = 0; + inode_set_atime(inode, + 1LL << (8 * sizeof(inode_get_atime_sec(inode)) - 1), + 0); glock_set_object(ip->i_gl, ip); @@ -265,21 +266,28 @@ fail_iput: } -struct inode *gfs2_lookup_simple(struct inode *dip, const char *name) +/** + * gfs2_lookup_meta - Look up an inode in a metadata directory + * @dip: The directory + * @name: The name of the inode + */ +struct inode *gfs2_lookup_meta(struct inode *dip, const char *name) { struct qstr qstr; struct inode *inode; + gfs2_str2qstr(&qstr, name); inode = gfs2_lookupi(dip, &qstr, 1); - /* gfs2_lookupi has inconsistent callers: vfs - * related routines expect NULL for no entry found, - * gfs2_lookup_simple callers expect ENOENT - * and do not check for NULL. + if (IS_ERR_OR_NULL(inode)) + return inode ? inode : ERR_PTR(-ENOENT); + + /* + * Must not call back into the filesystem when allocating + * pages in the metadata inode's address space. */ - if (inode == NULL) - return ERR_PTR(-ENOENT); - else - return inode; + mapping_set_gfp_mask(inode->i_mapping, GFP_NOFS); + + return inode; } @@ -411,7 +419,7 @@ static int alloc_dinode(struct gfs2_inode *ip, u32 flags, unsigned *dblocks) if (error) goto out_ipreserv; - error = gfs2_alloc_blocks(ip, &ip->i_no_addr, dblocks, 1, &ip->i_generation); + error = gfs2_alloc_blocks(ip, &ip->i_no_addr, dblocks, 1); if (error) goto out_trans_end; @@ -690,7 +698,7 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry, set_nlink(inode, S_ISDIR(mode) ? 2 : 1); inode->i_rdev = dev; inode->i_size = size; - inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode); + simple_inode_init_ts(inode); munge_mode_uid_gid(dip, inode); check_and_update_goal(dip); ip->i_goal = dip->i_goal; @@ -1029,7 +1037,7 @@ static int gfs2_link(struct dentry *old_dentry, struct inode *dir, gfs2_trans_add_meta(ip->i_gl, dibh); inc_nlink(&ip->i_inode); - ip->i_inode.i_ctime = current_time(&ip->i_inode); + inode_set_ctime_current(&ip->i_inode); ihold(inode); d_instantiate(dentry, inode); mark_inode_dirty(inode); @@ -1114,7 +1122,7 @@ static int gfs2_unlink_inode(struct gfs2_inode *dip, return error; ip->i_entries = 0; - inode->i_ctime = current_time(inode); + inode_set_ctime_current(inode); if (S_ISDIR(inode->i_mode)) clear_nlink(inode); else @@ -1371,7 +1379,7 @@ static int update_moved_ino(struct gfs2_inode *ip, struct gfs2_inode *ndip, if (dir_rename) return gfs2_dir_mvino(ip, &gfs2_qdotdot, ndip, DT_DIR); - ip->i_inode.i_ctime = current_time(&ip->i_inode); + inode_set_ctime_current(&ip->i_inode); mark_inode_dirty_sync(&ip->i_inode); return 0; } @@ -1860,16 +1868,24 @@ out: int gfs2_permission(struct mnt_idmap *idmap, struct inode *inode, int mask) { + int may_not_block = mask & MAY_NOT_BLOCK; struct gfs2_inode *ip; struct gfs2_holder i_gh; + struct gfs2_glock *gl; int error; gfs2_holder_mark_uninitialized(&i_gh); ip = GFS2_I(inode); - if (gfs2_glock_is_locked_by_me(ip->i_gl) == NULL) { - if (mask & MAY_NOT_BLOCK) + gl = rcu_dereference_check(ip->i_gl, !may_not_block); + if (unlikely(!gl)) { + /* inode is getting torn down, must be RCU mode */ + WARN_ON_ONCE(!may_not_block); + return -ECHILD; + } + if (gfs2_glock_is_locked_by_me(gl) == NULL) { + if (may_not_block) return -ECHILD; - error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &i_gh); + error = gfs2_glock_nq_init(gl, LM_ST_SHARED, LM_FLAG_ANY, &i_gh); if (error) return error; } @@ -1914,7 +1930,7 @@ static int setattr_chown(struct inode *inode, struct iattr *attr) kuid_t ouid, nuid; kgid_t ogid, ngid; int error; - struct gfs2_alloc_parms ap; + struct gfs2_alloc_parms ap = {}; ouid = inode->i_uid; ogid = inode->i_gid; @@ -2071,7 +2087,7 @@ static int gfs2_getattr(struct mnt_idmap *idmap, STATX_ATTR_IMMUTABLE | STATX_ATTR_NODUMP); - generic_fillattr(&nop_mnt_idmap, inode, stat); + generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat); if (gfs2_holder_initialized(&gh)) gfs2_glock_dq_uninit(&gh); @@ -2139,8 +2155,7 @@ loff_t gfs2_seek_hole(struct file *file, loff_t offset) return vfs_setpos(file, ret, inode->i_sb->s_maxbytes); } -static int gfs2_update_time(struct inode *inode, struct timespec64 *time, - int flags) +static int gfs2_update_time(struct inode *inode, int flags) { struct gfs2_inode *ip = GFS2_I(inode); struct gfs2_glock *gl = ip->i_gl; @@ -2148,14 +2163,15 @@ static int gfs2_update_time(struct inode *inode, struct timespec64 *time, int error; gh = gfs2_glock_is_locked_by_me(gl); - if (gh && !gfs2_glock_is_held_excl(gl)) { + if (gh && gl->gl_state != LM_ST_EXCLUSIVE) { gfs2_glock_dq(gh); gfs2_holder_reinit(LM_ST_EXCLUSIVE, 0, gh); error = gfs2_glock_nq(gh); if (error) return error; } - return generic_update_time(inode, time, flags); + generic_update_time(inode, flags); + return 0; } static const struct inode_operations gfs2_file_iops = { diff --git a/fs/gfs2/inode.h b/fs/gfs2/inode.h index c8c5814e7295..fd15d1c6b6fb 100644 --- a/fs/gfs2/inode.h +++ b/fs/gfs2/inode.h @@ -13,9 +13,9 @@ #include "util.h" bool gfs2_release_folio(struct folio *folio, gfp_t gfp_mask); -extern int gfs2_internal_read(struct gfs2_inode *ip, - char *buf, loff_t *pos, unsigned size); -extern void gfs2_set_aops(struct inode *inode); +ssize_t gfs2_internal_read(struct gfs2_inode *ip, + char *buf, loff_t *pos, size_t size); +void gfs2_set_aops(struct inode *inode); static inline int gfs2_is_stuffed(const struct gfs2_inode *ip) { @@ -44,19 +44,17 @@ static inline int gfs2_is_dir(const struct gfs2_inode *ip) static inline void gfs2_set_inode_blocks(struct inode *inode, u64 blocks) { - inode->i_blocks = blocks << - (GFS2_SB(inode)->sd_sb.sb_bsize_shift - GFS2_BASIC_BLOCK_SHIFT); + inode->i_blocks = blocks << (inode->i_blkbits - 9); } static inline u64 gfs2_get_inode_blocks(const struct inode *inode) { - return inode->i_blocks >> - (GFS2_SB(inode)->sd_sb.sb_bsize_shift - GFS2_BASIC_BLOCK_SHIFT); + return inode->i_blocks >> (inode->i_blkbits - 9); } static inline void gfs2_add_inode_blocks(struct inode *inode, s64 change) { - change <<= inode->i_blkbits - GFS2_BASIC_BLOCK_SHIFT; + change <<= inode->i_blkbits - 9; gfs2_assert(GFS2_SB(inode), (change >= 0 || inode->i_blocks >= -change)); inode->i_blocks += change; } @@ -88,33 +86,33 @@ err: return -EIO; } -extern struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned type, - u64 no_addr, u64 no_formal_ino, - unsigned int blktype); -extern struct inode *gfs2_lookup_by_inum(struct gfs2_sbd *sdp, u64 no_addr, - u64 no_formal_ino, - unsigned int blktype); - -extern int gfs2_inode_refresh(struct gfs2_inode *ip); - -extern struct inode *gfs2_lookupi(struct inode *dir, const struct qstr *name, - int is_root); -extern int gfs2_permission(struct mnt_idmap *idmap, - struct inode *inode, int mask); -extern struct inode *gfs2_lookup_simple(struct inode *dip, const char *name); -extern void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf); -extern int gfs2_open_common(struct inode *inode, struct file *file); -extern loff_t gfs2_seek_data(struct file *file, loff_t offset); -extern loff_t gfs2_seek_hole(struct file *file, loff_t offset); +struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned type, + u64 no_addr, u64 no_formal_ino, + unsigned int blktype); +struct inode *gfs2_lookup_by_inum(struct gfs2_sbd *sdp, u64 no_addr, + u64 no_formal_ino, + unsigned int blktype); + +int gfs2_inode_refresh(struct gfs2_inode *ip); + +struct inode *gfs2_lookupi(struct inode *dir, const struct qstr *name, + int is_root); +int gfs2_permission(struct mnt_idmap *idmap, + struct inode *inode, int mask); +struct inode *gfs2_lookup_meta(struct inode *dip, const char *name); +void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf); +int gfs2_open_common(struct inode *inode, struct file *file); +loff_t gfs2_seek_data(struct file *file, loff_t offset); +loff_t gfs2_seek_hole(struct file *file, loff_t offset); extern const struct file_operations gfs2_file_fops_nolock; extern const struct file_operations gfs2_dir_fops_nolock; -extern int gfs2_fileattr_get(struct dentry *dentry, struct fileattr *fa); -extern int gfs2_fileattr_set(struct mnt_idmap *idmap, - struct dentry *dentry, struct fileattr *fa); -extern void gfs2_set_inode_flags(struct inode *inode); - +int gfs2_fileattr_get(struct dentry *dentry, struct fileattr *fa); +int gfs2_fileattr_set(struct mnt_idmap *idmap, + struct dentry *dentry, struct fileattr *fa); +void gfs2_set_inode_flags(struct inode *inode); + #ifdef CONFIG_GFS2_FS_LOCKING_DLM extern const struct file_operations gfs2_file_fops; extern const struct file_operations gfs2_dir_fops; diff --git a/fs/gfs2/lock_dlm.c b/fs/gfs2/lock_dlm.c index 54911294687c..59ab18c79889 100644 --- a/fs/gfs2/lock_dlm.c +++ b/fs/gfs2/lock_dlm.c @@ -222,11 +222,6 @@ static u32 make_flags(struct gfs2_glock *gl, const unsigned int gfs_flags, lkf |= DLM_LKF_NOQUEUEBAST; } - if (gfs_flags & LM_FLAG_PRIORITY) { - lkf |= DLM_LKF_NOORDER; - lkf |= DLM_LKF_HEADQUE; - } - if (gfs_flags & LM_FLAG_ANY) { if (req == DLM_LOCK_PR) lkf |= DLM_LKF_ALTCW; diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c index aa568796207c..e5271ae87d1c 100644 --- a/fs/gfs2/log.c +++ b/fs/gfs2/log.c @@ -1227,6 +1227,21 @@ static void log_refund(struct gfs2_sbd *sdp, struct gfs2_trans *tr) gfs2_log_unlock(sdp); } +static inline int gfs2_jrnl_flush_reqd(struct gfs2_sbd *sdp) +{ + return atomic_read(&sdp->sd_log_pinned) + + atomic_read(&sdp->sd_log_blks_needed) >= + atomic_read(&sdp->sd_log_thresh1); +} + +static inline int gfs2_ail_flush_reqd(struct gfs2_sbd *sdp) +{ + return sdp->sd_jdesc->jd_blocks - + atomic_read(&sdp->sd_log_blks_free) + + atomic_read(&sdp->sd_log_blks_needed) >= + atomic_read(&sdp->sd_log_thresh2); +} + /** * gfs2_log_commit - Commit a transaction to the log * @sdp: the filesystem @@ -1246,9 +1261,7 @@ void gfs2_log_commit(struct gfs2_sbd *sdp, struct gfs2_trans *tr) { log_refund(sdp, tr); - if (atomic_read(&sdp->sd_log_pinned) > atomic_read(&sdp->sd_log_thresh1) || - ((sdp->sd_jdesc->jd_blocks - atomic_read(&sdp->sd_log_blks_free)) > - atomic_read(&sdp->sd_log_thresh2))) + if (gfs2_ail_flush_reqd(sdp) || gfs2_jrnl_flush_reqd(sdp)) wake_up(&sdp->sd_logd_waitq); } @@ -1271,24 +1284,6 @@ static void gfs2_log_shutdown(struct gfs2_sbd *sdp) gfs2_assert_warn(sdp, list_empty(&sdp->sd_ail2_list)); } -static inline int gfs2_jrnl_flush_reqd(struct gfs2_sbd *sdp) -{ - return (atomic_read(&sdp->sd_log_pinned) + - atomic_read(&sdp->sd_log_blks_needed) >= - atomic_read(&sdp->sd_log_thresh1)); -} - -static inline int gfs2_ail_flush_reqd(struct gfs2_sbd *sdp) -{ - unsigned int used_blocks = sdp->sd_jdesc->jd_blocks - atomic_read(&sdp->sd_log_blks_free); - - if (test_and_clear_bit(SDF_FORCE_AIL_FLUSH, &sdp->sd_flags)) - return 1; - - return used_blocks + atomic_read(&sdp->sd_log_blks_needed) >= - atomic_read(&sdp->sd_log_thresh2); -} - /** * gfs2_logd - Update log tail as Active Items get flushed to in-place blocks * @data: Pointer to GFS2 superblock @@ -1301,14 +1296,11 @@ int gfs2_logd(void *data) { struct gfs2_sbd *sdp = data; unsigned long t = 1; - DEFINE_WAIT(wait); while (!kthread_should_stop()) { + if (gfs2_withdrawn(sdp)) + break; - if (gfs2_withdrawn(sdp)) { - msleep_interruptible(HZ); - continue; - } /* Check for errors writing to the journal */ if (sdp->sd_log_error) { gfs2_lm(sdp, @@ -1317,7 +1309,7 @@ int gfs2_logd(void *data) "prevent further damage.\n", sdp->sd_fsname, sdp->sd_log_error); gfs2_withdraw(sdp); - continue; + break; } if (gfs2_jrnl_flush_reqd(sdp) || t == 0) { @@ -1326,7 +1318,9 @@ int gfs2_logd(void *data) GFS2_LFC_LOGD_JFLUSH_REQD); } - if (gfs2_ail_flush_reqd(sdp)) { + if (test_bit(SDF_FORCE_AIL_FLUSH, &sdp->sd_flags) || + gfs2_ail_flush_reqd(sdp)) { + clear_bit(SDF_FORCE_AIL_FLUSH, &sdp->sd_flags); gfs2_ail1_start(sdp); gfs2_ail1_wait(sdp); gfs2_ail1_empty(sdp, 0); @@ -1338,17 +1332,14 @@ int gfs2_logd(void *data) try_to_freeze(); - do { - prepare_to_wait(&sdp->sd_logd_waitq, &wait, - TASK_INTERRUPTIBLE); - if (!gfs2_ail_flush_reqd(sdp) && - !gfs2_jrnl_flush_reqd(sdp) && - !kthread_should_stop()) - t = schedule_timeout(t); - } while(t && !gfs2_ail_flush_reqd(sdp) && - !gfs2_jrnl_flush_reqd(sdp) && - !kthread_should_stop()); - finish_wait(&sdp->sd_logd_waitq, &wait); + t = wait_event_interruptible_timeout(sdp->sd_logd_waitq, + test_bit(SDF_FORCE_AIL_FLUSH, &sdp->sd_flags) || + gfs2_ail_flush_reqd(sdp) || + gfs2_jrnl_flush_reqd(sdp) || + sdp->sd_log_error || + gfs2_withdrawn(sdp) || + kthread_should_stop(), + t); } return 0; diff --git a/fs/gfs2/log.h b/fs/gfs2/log.h index 653cffcbf869..c27b05099c1e 100644 --- a/fs/gfs2/log.h +++ b/fs/gfs2/log.h @@ -70,29 +70,29 @@ static inline void gfs2_ordered_add_inode(struct gfs2_inode *ip) } } -extern void gfs2_ordered_del_inode(struct gfs2_inode *ip); -extern unsigned int gfs2_struct2blk(struct gfs2_sbd *sdp, unsigned int nstruct); -extern void gfs2_remove_from_ail(struct gfs2_bufdata *bd); -extern bool gfs2_log_is_empty(struct gfs2_sbd *sdp); -extern void gfs2_log_release_revokes(struct gfs2_sbd *sdp, unsigned int revokes); -extern void gfs2_log_release(struct gfs2_sbd *sdp, unsigned int blks); -extern bool gfs2_log_try_reserve(struct gfs2_sbd *sdp, struct gfs2_trans *tr, - unsigned int *extra_revokes); -extern void gfs2_log_reserve(struct gfs2_sbd *sdp, struct gfs2_trans *tr, - unsigned int *extra_revokes); -extern void gfs2_write_log_header(struct gfs2_sbd *sdp, struct gfs2_jdesc *jd, - u64 seq, u32 tail, u32 lblock, u32 flags, - blk_opf_t op_flags); -extern void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl, - u32 type); -extern void gfs2_log_commit(struct gfs2_sbd *sdp, struct gfs2_trans *trans); -extern void gfs2_ail1_flush(struct gfs2_sbd *sdp, struct writeback_control *wbc); -extern void log_flush_wait(struct gfs2_sbd *sdp); +void gfs2_ordered_del_inode(struct gfs2_inode *ip); +unsigned int gfs2_struct2blk(struct gfs2_sbd *sdp, unsigned int nstruct); +void gfs2_remove_from_ail(struct gfs2_bufdata *bd); +bool gfs2_log_is_empty(struct gfs2_sbd *sdp); +void gfs2_log_release_revokes(struct gfs2_sbd *sdp, unsigned int revokes); +void gfs2_log_release(struct gfs2_sbd *sdp, unsigned int blks); +bool gfs2_log_try_reserve(struct gfs2_sbd *sdp, struct gfs2_trans *tr, + unsigned int *extra_revokes); +void gfs2_log_reserve(struct gfs2_sbd *sdp, struct gfs2_trans *tr, + unsigned int *extra_revokes); +void gfs2_write_log_header(struct gfs2_sbd *sdp, struct gfs2_jdesc *jd, + u64 seq, u32 tail, u32 lblock, u32 flags, + blk_opf_t op_flags); +void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl, + u32 type); +void gfs2_log_commit(struct gfs2_sbd *sdp, struct gfs2_trans *trans); +void gfs2_ail1_flush(struct gfs2_sbd *sdp, struct writeback_control *wbc); +void log_flush_wait(struct gfs2_sbd *sdp); -extern int gfs2_logd(void *data); -extern void gfs2_add_revoke(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd); -extern void gfs2_glock_remove_revoke(struct gfs2_glock *gl); -extern void gfs2_flush_revokes(struct gfs2_sbd *sdp); -extern void gfs2_ail_drain(struct gfs2_sbd *sdp); +int gfs2_logd(void *data); +void gfs2_add_revoke(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd); +void gfs2_glock_remove_revoke(struct gfs2_glock *gl); +void gfs2_flush_revokes(struct gfs2_sbd *sdp); +void gfs2_ail_drain(struct gfs2_sbd *sdp); #endif /* __LOG_DOT_H__ */ diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c index 251322b01631..483f69807062 100644 --- a/fs/gfs2/lops.c +++ b/fs/gfs2/lops.c @@ -456,7 +456,7 @@ static bool gfs2_jhead_pg_srch(struct gfs2_jdesc *jd, * Find the folio with 'index' in the journal's mapping. Search the folio for * the journal head if requested (cleanup == false). Release refs on the * folio so the page cache can reclaim it. We grabbed a - * reference on this folio twice, first when we did a find_or_create_page() + * reference on this folio twice, first when we did a grab_cache_page() * to obtain the folio to add it to the bio and second when we do a * filemap_get_folio() here to get the folio to wait on while I/O on it is being * completed. @@ -481,7 +481,7 @@ static void gfs2_jhead_process_page(struct gfs2_jdesc *jd, unsigned long index, if (!*done) *done = gfs2_jhead_pg_srch(jd, head, &folio->page); - /* filemap_get_folio() and the earlier find_or_create_page() */ + /* filemap_get_folio() and the earlier grab_cache_page() */ folio_put_refs(folio, 2); } @@ -535,8 +535,7 @@ int gfs2_find_jhead(struct gfs2_jdesc *jd, struct gfs2_log_header_host *head, for (; block < je->lblock + je->blocks; block++, dblock++) { if (!page) { - page = find_or_create_page(mapping, - block >> shift, GFP_NOFS); + page = grab_cache_page(mapping, block >> shift); if (!page) { ret = -ENOMEM; done = true; diff --git a/fs/gfs2/lops.h b/fs/gfs2/lops.h index 1412ffba1d44..07890c7b145d 100644 --- a/fs/gfs2/lops.h +++ b/fs/gfs2/lops.h @@ -11,16 +11,18 @@ #include "incore.h" extern const struct gfs2_log_operations *gfs2_log_ops[]; -extern void gfs2_log_incr_head(struct gfs2_sbd *sdp); -extern u64 gfs2_log_bmap(struct gfs2_jdesc *jd, unsigned int lbn); -extern void gfs2_log_write(struct gfs2_sbd *sdp, struct gfs2_jdesc *jd, - struct page *page, unsigned size, unsigned offset, - u64 blkno); -extern void gfs2_log_submit_bio(struct bio **biop, blk_opf_t opf); -extern void gfs2_pin(struct gfs2_sbd *sdp, struct buffer_head *bh); -extern int gfs2_find_jhead(struct gfs2_jdesc *jd, - struct gfs2_log_header_host *head, bool keep_cache); -extern void gfs2_drain_revokes(struct gfs2_sbd *sdp); + +void gfs2_log_incr_head(struct gfs2_sbd *sdp); +u64 gfs2_log_bmap(struct gfs2_jdesc *jd, unsigned int lbn); +void gfs2_log_write(struct gfs2_sbd *sdp, struct gfs2_jdesc *jd, + struct page *page, unsigned size, unsigned offset, + u64 blkno); +void gfs2_log_submit_bio(struct bio **biop, blk_opf_t opf); +void gfs2_pin(struct gfs2_sbd *sdp, struct buffer_head *bh); +int gfs2_find_jhead(struct gfs2_jdesc *jd, + struct gfs2_log_header_host *head, bool keep_cache); +void gfs2_drain_revokes(struct gfs2_sbd *sdp); + static inline unsigned int buf_limit(struct gfs2_sbd *sdp) { return sdp->sd_ldptrs; diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c index afcb32854f14..79be0cdc730c 100644 --- a/fs/gfs2/main.c +++ b/fs/gfs2/main.c @@ -147,14 +147,14 @@ static int __init init_gfs2_fs(void) if (!gfs2_trans_cachep) goto fail_cachep8; - error = register_shrinker(&gfs2_qd_shrinker, "gfs2-qd"); + error = gfs2_qd_shrinker_init(); if (error) goto fail_shrinker; error = -ENOMEM; - gfs_recovery_wq = alloc_workqueue("gfs_recovery", + gfs2_recovery_wq = alloc_workqueue("gfs2_recovery", WQ_MEM_RECLAIM | WQ_FREEZABLE, 0); - if (!gfs_recovery_wq) + if (!gfs2_recovery_wq) goto fail_wq1; gfs2_control_wq = alloc_workqueue("gfs2_control", @@ -162,7 +162,7 @@ static int __init init_gfs2_fs(void) if (!gfs2_control_wq) goto fail_wq2; - gfs2_freeze_wq = alloc_workqueue("freeze_workqueue", 0, 0); + gfs2_freeze_wq = alloc_workqueue("gfs2_freeze", 0, 0); if (!gfs2_freeze_wq) goto fail_wq3; @@ -194,9 +194,9 @@ fail_mempool: fail_wq3: destroy_workqueue(gfs2_control_wq); fail_wq2: - destroy_workqueue(gfs_recovery_wq); + destroy_workqueue(gfs2_recovery_wq); fail_wq1: - unregister_shrinker(&gfs2_qd_shrinker); + gfs2_qd_shrinker_exit(); fail_shrinker: kmem_cache_destroy(gfs2_trans_cachep); fail_cachep8: @@ -229,12 +229,12 @@ fail_lru: static void __exit exit_gfs2_fs(void) { - unregister_shrinker(&gfs2_qd_shrinker); + gfs2_qd_shrinker_exit(); gfs2_glock_exit(); gfs2_unregister_debugfs(); unregister_filesystem(&gfs2_fs_type); unregister_filesystem(&gfs2meta_fs_type); - destroy_workqueue(gfs_recovery_wq); + destroy_workqueue(gfs2_recovery_wq); destroy_workqueue(gfs2_control_wq); destroy_workqueue(gfs2_freeze_wq); list_lru_destroy(&gfs2_qd_lru); diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c index 924361fa510b..25ceb0805df2 100644 --- a/fs/gfs2/meta_io.c +++ b/fs/gfs2/meta_io.c @@ -115,7 +115,7 @@ struct buffer_head *gfs2_getbuf(struct gfs2_glock *gl, u64 blkno, int create) { struct address_space *mapping = gfs2_glock2aspace(gl); struct gfs2_sbd *sdp = gl->gl_name.ln_sbd; - struct page *page; + struct folio *folio; struct buffer_head *bh; unsigned int shift; unsigned long index; @@ -129,36 +129,31 @@ struct buffer_head *gfs2_getbuf(struct gfs2_glock *gl, u64 blkno, int create) bufnum = blkno - (index << shift); /* block buf index within page */ if (create) { - for (;;) { - page = grab_cache_page(mapping, index); - if (page) - break; - yield(); - } - if (!page_has_buffers(page)) - create_empty_buffers(page, sdp->sd_sb.sb_bsize, 0); + folio = __filemap_get_folio(mapping, index, + FGP_LOCK | FGP_ACCESSED | FGP_CREAT, + mapping_gfp_mask(mapping) | __GFP_NOFAIL); + bh = folio_buffers(folio); + if (!bh) + bh = create_empty_buffers(folio, + sdp->sd_sb.sb_bsize, 0); } else { - page = find_get_page_flags(mapping, index, - FGP_LOCK|FGP_ACCESSED); - if (!page) + folio = __filemap_get_folio(mapping, index, + FGP_LOCK | FGP_ACCESSED, 0); + if (IS_ERR(folio)) return NULL; - if (!page_has_buffers(page)) { - bh = NULL; - goto out_unlock; - } + bh = folio_buffers(folio); } - /* Locate header for our buffer within our page */ - for (bh = page_buffers(page); bufnum--; bh = bh->b_this_page) - /* Do nothing */; - get_bh(bh); + if (!bh) + goto out_unlock; + bh = get_nth_bh(bh, bufnum); if (!buffer_mapped(bh)) map_bh(bh, sdp->sd_vfs, blkno); out_unlock: - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); return bh; } @@ -405,26 +400,20 @@ static struct buffer_head *gfs2_getjdatabuf(struct gfs2_inode *ip, u64 blkno) { struct address_space *mapping = ip->i_inode.i_mapping; struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); - struct page *page; + struct folio *folio; struct buffer_head *bh; unsigned int shift = PAGE_SHIFT - sdp->sd_sb.sb_bsize_shift; unsigned long index = blkno >> shift; /* convert block to page */ unsigned int bufnum = blkno - (index << shift); - page = find_get_page_flags(mapping, index, FGP_LOCK|FGP_ACCESSED); - if (!page) + folio = __filemap_get_folio(mapping, index, FGP_LOCK | FGP_ACCESSED, 0); + if (IS_ERR(folio)) return NULL; - if (!page_has_buffers(page)) { - unlock_page(page); - put_page(page); - return NULL; - } - /* Locate header for our buffer within our page */ - for (bh = page_buffers(page); bufnum--; bh = bh->b_this_page) - /* Do nothing */; - get_bh(bh); - unlock_page(page); - put_page(page); + bh = folio_buffers(folio); + if (bh) + bh = get_nth_bh(bh, bufnum); + folio_unlock(folio); + folio_put(folio); return bh; } diff --git a/fs/gfs2/meta_io.h b/fs/gfs2/meta_io.h index d0a58cdd433a..831d988c2ceb 100644 --- a/fs/gfs2/meta_io.h +++ b/fs/gfs2/meta_io.h @@ -50,21 +50,21 @@ static inline struct gfs2_sbd *gfs2_mapping2sbd(struct address_space *mapping) return inode->i_sb->s_fs_info; } -extern struct buffer_head *gfs2_meta_new(struct gfs2_glock *gl, u64 blkno); -extern int gfs2_meta_read(struct gfs2_glock *gl, u64 blkno, int flags, - int rahead, struct buffer_head **bhp); -extern int gfs2_meta_wait(struct gfs2_sbd *sdp, struct buffer_head *bh); -extern struct buffer_head *gfs2_getbuf(struct gfs2_glock *gl, u64 blkno, - int create); +struct buffer_head *gfs2_meta_new(struct gfs2_glock *gl, u64 blkno); +int gfs2_meta_read(struct gfs2_glock *gl, u64 blkno, int flags, + int rahead, struct buffer_head **bhp); +int gfs2_meta_wait(struct gfs2_sbd *sdp, struct buffer_head *bh); +struct buffer_head *gfs2_getbuf(struct gfs2_glock *gl, u64 blkno, + int create); enum { REMOVE_JDATA = 0, REMOVE_META = 1, }; -extern void gfs2_remove_from_journal(struct buffer_head *bh, int meta); -extern void gfs2_journal_wipe(struct gfs2_inode *ip, u64 bstart, u32 blen); -extern int gfs2_meta_buffer(struct gfs2_inode *ip, u32 mtype, u64 num, - struct buffer_head **bhp); +void gfs2_remove_from_journal(struct buffer_head *bh, int meta); +void gfs2_journal_wipe(struct gfs2_inode *ip, u64 bstart, u32 blen); +int gfs2_meta_buffer(struct gfs2_inode *ip, u32 mtype, u64 num, + struct buffer_head **bhp); static inline int gfs2_meta_inode_buffer(struct gfs2_inode *ip, struct buffer_head **bhp) diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index 8a27957dbfee..b108c5d26839 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -87,7 +87,7 @@ static struct gfs2_sbd *init_sbd(struct super_block *sb) set_bit(SDF_NOJOURNALID, &sdp->sd_flags); gfs2_tune_init(&sdp->sd_tune); - init_waitqueue_head(&sdp->sd_glock_wait); + init_waitqueue_head(&sdp->sd_kill_wait); init_waitqueue_head(&sdp->sd_async_glock_wait); atomic_set(&sdp->sd_glock_disposal, 0); init_completion(&sdp->sd_locking_init); @@ -292,8 +292,7 @@ static int gfs2_read_sb(struct gfs2_sbd *sdp, int silent) return error; } - sdp->sd_fsb2bb_shift = sdp->sd_sb.sb_bsize_shift - - GFS2_BASIC_BLOCK_SHIFT; + sdp->sd_fsb2bb_shift = sdp->sd_sb.sb_bsize_shift - 9; sdp->sd_fsb2bb = BIT(sdp->sd_fsb2bb_shift); sdp->sd_diptrs = (sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode)) / sizeof(u64); @@ -648,7 +647,7 @@ static int init_statfs(struct gfs2_sbd *sdp) struct gfs2_jdesc *jd; struct gfs2_inode *ip; - sdp->sd_statfs_inode = gfs2_lookup_simple(master, "statfs"); + sdp->sd_statfs_inode = gfs2_lookup_meta(master, "statfs"); if (IS_ERR(sdp->sd_statfs_inode)) { error = PTR_ERR(sdp->sd_statfs_inode); fs_err(sdp, "can't read in statfs inode: %d\n", error); @@ -657,7 +656,7 @@ static int init_statfs(struct gfs2_sbd *sdp) if (sdp->sd_args.ar_spectator) goto out; - pn = gfs2_lookup_simple(master, "per_node"); + pn = gfs2_lookup_meta(master, "per_node"); if (IS_ERR(pn)) { error = PTR_ERR(pn); fs_err(sdp, "can't find per_node directory: %d\n", error); @@ -674,7 +673,7 @@ static int init_statfs(struct gfs2_sbd *sdp) goto free_local; } sprintf(buf, "statfs_change%u", jd->jd_jid); - lsi->si_sc_inode = gfs2_lookup_simple(pn, buf); + lsi->si_sc_inode = gfs2_lookup_meta(pn, buf); if (IS_ERR(lsi->si_sc_inode)) { error = PTR_ERR(lsi->si_sc_inode); fs_err(sdp, "can't find local \"sc\" file#%u: %d\n", @@ -739,7 +738,7 @@ static int init_journal(struct gfs2_sbd *sdp, int undo) if (undo) goto fail_statfs; - sdp->sd_jindex = gfs2_lookup_simple(master, "jindex"); + sdp->sd_jindex = gfs2_lookup_meta(master, "jindex"); if (IS_ERR(sdp->sd_jindex)) { fs_err(sdp, "can't lookup journal index: %d\n", error); return PTR_ERR(sdp->sd_jindex); @@ -888,7 +887,7 @@ static int init_inodes(struct gfs2_sbd *sdp, int undo) goto fail; /* Read in the resource index inode */ - sdp->sd_rindex = gfs2_lookup_simple(master, "rindex"); + sdp->sd_rindex = gfs2_lookup_meta(master, "rindex"); if (IS_ERR(sdp->sd_rindex)) { error = PTR_ERR(sdp->sd_rindex); fs_err(sdp, "can't get resource index inode: %d\n", error); @@ -897,7 +896,7 @@ static int init_inodes(struct gfs2_sbd *sdp, int undo) sdp->sd_rindex_uptodate = 0; /* Read in the quota inode */ - sdp->sd_quota_inode = gfs2_lookup_simple(master, "quota"); + sdp->sd_quota_inode = gfs2_lookup_meta(master, "quota"); if (IS_ERR(sdp->sd_quota_inode)) { error = PTR_ERR(sdp->sd_quota_inode); fs_err(sdp, "can't get quota file inode: %d\n", error); @@ -941,7 +940,7 @@ static int init_per_node(struct gfs2_sbd *sdp, int undo) if (undo) goto fail_qc_gh; - pn = gfs2_lookup_simple(master, "per_node"); + pn = gfs2_lookup_meta(master, "per_node"); if (IS_ERR(pn)) { error = PTR_ERR(pn); fs_err(sdp, "can't find per_node directory: %d\n", error); @@ -949,7 +948,7 @@ static int init_per_node(struct gfs2_sbd *sdp, int undo) } sprintf(buf, "quota_change%u", sdp->sd_jdesc->jd_jid); - sdp->sd_qc_inode = gfs2_lookup_simple(pn, buf); + sdp->sd_qc_inode = gfs2_lookup_meta(pn, buf); if (IS_ERR(sdp->sd_qc_inode)) { error = PTR_ERR(sdp->sd_qc_inode); fs_err(sdp, "can't find local \"qc\" file: %d\n", error); @@ -1103,29 +1102,46 @@ static int init_threads(struct gfs2_sbd *sdp) struct task_struct *p; int error = 0; - p = kthread_run(gfs2_logd, sdp, "gfs2_logd"); + p = kthread_create(gfs2_logd, sdp, "gfs2_logd/%s", sdp->sd_fsname); if (IS_ERR(p)) { error = PTR_ERR(p); - fs_err(sdp, "can't start logd thread: %d\n", error); + fs_err(sdp, "can't create logd thread: %d\n", error); return error; } + get_task_struct(p); sdp->sd_logd_process = p; - p = kthread_run(gfs2_quotad, sdp, "gfs2_quotad"); + p = kthread_create(gfs2_quotad, sdp, "gfs2_quotad/%s", sdp->sd_fsname); if (IS_ERR(p)) { error = PTR_ERR(p); - fs_err(sdp, "can't start quotad thread: %d\n", error); + fs_err(sdp, "can't create quotad thread: %d\n", error); goto fail; } + get_task_struct(p); sdp->sd_quotad_process = p; + + wake_up_process(sdp->sd_logd_process); + wake_up_process(sdp->sd_quotad_process); return 0; fail: - kthread_stop(sdp->sd_logd_process); + kthread_stop_put(sdp->sd_logd_process); sdp->sd_logd_process = NULL; return error; } +void gfs2_destroy_threads(struct gfs2_sbd *sdp) +{ + if (sdp->sd_logd_process) { + kthread_stop_put(sdp->sd_logd_process); + sdp->sd_logd_process = NULL; + } + if (sdp->sd_quotad_process) { + kthread_stop_put(sdp->sd_quotad_process); + sdp->sd_quotad_process = NULL; + } +} + /** * gfs2_fill_super - Read in superblock * @sb: The VFS superblock @@ -1170,10 +1186,9 @@ static int gfs2_fill_super(struct super_block *sb, struct fs_context *fc) /* Set up the buffer cache and fill in some fake block size values to allow us to read-in the on-disk superblock. */ - sdp->sd_sb.sb_bsize = sb_min_blocksize(sb, GFS2_BASIC_BLOCK); + sdp->sd_sb.sb_bsize = sb_min_blocksize(sb, 512); sdp->sd_sb.sb_bsize_shift = sb->s_blocksize_bits; - sdp->sd_fsb2bb_shift = sdp->sd_sb.sb_bsize_shift - - GFS2_BASIC_BLOCK_SHIFT; + sdp->sd_fsb2bb_shift = sdp->sd_sb.sb_bsize_shift - 9; sdp->sd_fsb2bb = BIT(sdp->sd_fsb2bb_shift); sdp->sd_tune.gt_logd_secs = sdp->sd_args.ar_commit; @@ -1261,10 +1276,8 @@ static int gfs2_fill_super(struct super_block *sb, struct fs_context *fc) if (!sb_rdonly(sb)) { error = init_threads(sdp); - if (error) { - gfs2_withdraw_delayed(sdp); + if (error) goto fail_per_node; - } } error = gfs2_freeze_lock_shared(sdp); @@ -1276,12 +1289,7 @@ static int gfs2_fill_super(struct super_block *sb, struct fs_context *fc) if (error) { gfs2_freeze_unlock(&sdp->sd_freeze_gh); - if (sdp->sd_quotad_process) - kthread_stop(sdp->sd_quotad_process); - sdp->sd_quotad_process = NULL; - if (sdp->sd_logd_process) - kthread_stop(sdp->sd_logd_process); - sdp->sd_logd_process = NULL; + gfs2_destroy_threads(sdp); fs_err(sdp, "can't make FS RW: %d\n", error); goto fail_per_node; } @@ -1381,6 +1389,7 @@ static const struct constant_table gfs2_param_quota[] = { {"off", GFS2_QUOTA_OFF}, {"account", GFS2_QUOTA_ACCOUNT}, {"on", GFS2_QUOTA_ON}, + {"quiet", GFS2_QUOTA_QUIET}, {} }; @@ -1786,9 +1795,9 @@ static void gfs2_kill_sb(struct super_block *sb) /* * Flush and then drain the delete workqueue here (via * destroy_workqueue()) to ensure that any delete work that - * may be running will also see the SDF_DEACTIVATING flag. + * may be running will also see the SDF_KILL flag. */ - set_bit(SDF_DEACTIVATING, &sdp->sd_flags); + set_bit(SDF_KILL, &sdp->sd_flags); gfs2_flush_delete_work(sdp); destroy_workqueue(sdp->sd_delete_wq); diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c index 704192b73605..95dae7838b4e 100644 --- a/fs/gfs2/quota.c +++ b/fs/gfs2/quota.c @@ -109,38 +109,44 @@ static inline void spin_unlock_bucket(unsigned int hash) static void gfs2_qd_dealloc(struct rcu_head *rcu) { struct gfs2_quota_data *qd = container_of(rcu, struct gfs2_quota_data, qd_rcu); + struct gfs2_sbd *sdp = qd->qd_sbd; + kmem_cache_free(gfs2_quotad_cachep, qd); + if (atomic_dec_and_test(&sdp->sd_quota_count)) + wake_up(&sdp->sd_kill_wait); } -static void gfs2_qd_dispose(struct list_head *list) +static void gfs2_qd_dispose(struct gfs2_quota_data *qd) { - struct gfs2_quota_data *qd; - struct gfs2_sbd *sdp; - - while (!list_empty(list)) { - qd = list_first_entry(list, struct gfs2_quota_data, qd_lru); - sdp = qd->qd_gl->gl_name.ln_sbd; - - list_del(&qd->qd_lru); + struct gfs2_sbd *sdp = qd->qd_sbd; - /* Free from the filesystem-specific list */ - spin_lock(&qd_lock); - list_del(&qd->qd_list); - spin_unlock(&qd_lock); + spin_lock(&qd_lock); + list_del(&qd->qd_list); + spin_unlock(&qd_lock); - spin_lock_bucket(qd->qd_hash); - hlist_bl_del_rcu(&qd->qd_hlist); - spin_unlock_bucket(qd->qd_hash); + spin_lock_bucket(qd->qd_hash); + hlist_bl_del_rcu(&qd->qd_hlist); + spin_unlock_bucket(qd->qd_hash); + if (!gfs2_withdrawn(sdp)) { gfs2_assert_warn(sdp, !qd->qd_change); - gfs2_assert_warn(sdp, !qd->qd_slot_count); + gfs2_assert_warn(sdp, !qd->qd_slot_ref); gfs2_assert_warn(sdp, !qd->qd_bh_count); + } + + gfs2_glock_put(qd->qd_gl); + call_rcu(&qd->qd_rcu, gfs2_qd_dealloc); +} + +static void gfs2_qd_list_dispose(struct list_head *list) +{ + struct gfs2_quota_data *qd; - gfs2_glock_put(qd->qd_gl); - atomic_dec(&sdp->sd_quota_count); + while (!list_empty(list)) { + qd = list_first_entry(list, struct gfs2_quota_data, qd_lru); + list_del(&qd->qd_lru); - /* Delete it from the common reclaim list */ - call_rcu(&qd->qd_rcu, gfs2_qd_dealloc); + gfs2_qd_dispose(qd); } } @@ -149,18 +155,22 @@ static enum lru_status gfs2_qd_isolate(struct list_head *item, struct list_lru_one *lru, spinlock_t *lru_lock, void *arg) { struct list_head *dispose = arg; - struct gfs2_quota_data *qd = list_entry(item, struct gfs2_quota_data, qd_lru); + struct gfs2_quota_data *qd = + list_entry(item, struct gfs2_quota_data, qd_lru); + enum lru_status status; if (!spin_trylock(&qd->qd_lockref.lock)) return LRU_SKIP; + status = LRU_SKIP; if (qd->qd_lockref.count == 0) { lockref_mark_dead(&qd->qd_lockref); list_lru_isolate_move(lru, &qd->qd_lru, dispose); + status = LRU_REMOVED; } spin_unlock(&qd->qd_lockref.lock); - return LRU_REMOVED; + return status; } static unsigned long gfs2_qd_shrink_scan(struct shrinker *shrink, @@ -175,7 +185,7 @@ static unsigned long gfs2_qd_shrink_scan(struct shrinker *shrink, freed = list_lru_shrink_walk(&gfs2_qd_lru, sc, gfs2_qd_isolate, &dispose); - gfs2_qd_dispose(&dispose); + gfs2_qd_list_dispose(&dispose); return freed; } @@ -186,13 +196,26 @@ static unsigned long gfs2_qd_shrink_count(struct shrinker *shrink, return vfs_pressure_ratio(list_lru_shrink_count(&gfs2_qd_lru, sc)); } -struct shrinker gfs2_qd_shrinker = { - .count_objects = gfs2_qd_shrink_count, - .scan_objects = gfs2_qd_shrink_scan, - .seeks = DEFAULT_SEEKS, - .flags = SHRINKER_NUMA_AWARE, -}; +static struct shrinker *gfs2_qd_shrinker; + +int __init gfs2_qd_shrinker_init(void) +{ + gfs2_qd_shrinker = shrinker_alloc(SHRINKER_NUMA_AWARE, "gfs2-qd"); + if (!gfs2_qd_shrinker) + return -ENOMEM; + + gfs2_qd_shrinker->count_objects = gfs2_qd_shrink_count; + gfs2_qd_shrinker->scan_objects = gfs2_qd_shrink_scan; + + shrinker_register(gfs2_qd_shrinker); + return 0; +} + +void gfs2_qd_shrinker_exit(void) +{ + shrinker_free(gfs2_qd_shrinker); +} static u64 qd2index(struct gfs2_quota_data *qd) { @@ -203,12 +226,7 @@ static u64 qd2index(struct gfs2_quota_data *qd) static u64 qd2offset(struct gfs2_quota_data *qd) { - u64 offset; - - offset = qd2index(qd); - offset *= sizeof(struct gfs2_quota); - - return offset; + return qd2index(qd) * sizeof(struct gfs2_quota); } static struct gfs2_quota_data *qd_alloc(unsigned hash, struct gfs2_sbd *sdp, struct kqid qid) @@ -221,7 +239,7 @@ static struct gfs2_quota_data *qd_alloc(unsigned hash, struct gfs2_sbd *sdp, str return NULL; qd->qd_sbd = sdp; - qd->qd_lockref.count = 1; + qd->qd_lockref.count = 0; spin_lock_init(&qd->qd_lockref.lock); qd->qd_id = qid; qd->qd_slot = -1; @@ -283,6 +301,7 @@ static int qd_get(struct gfs2_sbd *sdp, struct kqid qid, spin_lock_bucket(hash); *qdp = qd = gfs2_qd_search_bucket(hash, sdp, qid); if (qd == NULL) { + new_qd->qd_lockref.count++; *qdp = new_qd; list_add(&new_qd->qd_list, &sdp->sd_quota_list); hlist_bl_add_head_rcu(&new_qd->qd_hlist, &qd_hash_table[hash]); @@ -302,20 +321,31 @@ static int qd_get(struct gfs2_sbd *sdp, struct kqid qid, static void qd_hold(struct gfs2_quota_data *qd) { - struct gfs2_sbd *sdp = qd->qd_gl->gl_name.ln_sbd; + struct gfs2_sbd *sdp = qd->qd_sbd; gfs2_assert(sdp, !__lockref_is_dead(&qd->qd_lockref)); lockref_get(&qd->qd_lockref); } static void qd_put(struct gfs2_quota_data *qd) { + struct gfs2_sbd *sdp; + if (lockref_put_or_lock(&qd->qd_lockref)) return; + BUG_ON(__lockref_is_dead(&qd->qd_lockref)); + sdp = qd->qd_sbd; + if (unlikely(!test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags))) { + lockref_mark_dead(&qd->qd_lockref); + spin_unlock(&qd->qd_lockref.lock); + + gfs2_qd_dispose(qd); + return; + } + qd->qd_lockref.count = 0; list_lru_add(&gfs2_qd_lru, &qd->qd_lru); spin_unlock(&qd->qd_lockref.lock); - } static int slot_get(struct gfs2_quota_data *qd) @@ -325,20 +355,19 @@ static int slot_get(struct gfs2_quota_data *qd) int error = 0; spin_lock(&sdp->sd_bitmap_lock); - if (qd->qd_slot_count != 0) - goto out; - - error = -ENOSPC; - bit = find_first_zero_bit(sdp->sd_quota_bitmap, sdp->sd_quota_slots); - if (bit < sdp->sd_quota_slots) { + if (qd->qd_slot_ref == 0) { + bit = find_first_zero_bit(sdp->sd_quota_bitmap, + sdp->sd_quota_slots); + if (bit >= sdp->sd_quota_slots) { + error = -ENOSPC; + goto out; + } set_bit(bit, sdp->sd_quota_bitmap); qd->qd_slot = bit; - error = 0; -out: - qd->qd_slot_count++; } + qd->qd_slot_ref++; +out: spin_unlock(&sdp->sd_bitmap_lock); - return error; } @@ -347,8 +376,8 @@ static void slot_hold(struct gfs2_quota_data *qd) struct gfs2_sbd *sdp = qd->qd_sbd; spin_lock(&sdp->sd_bitmap_lock); - gfs2_assert(sdp, qd->qd_slot_count); - qd->qd_slot_count++; + gfs2_assert(sdp, qd->qd_slot_ref); + qd->qd_slot_ref++; spin_unlock(&sdp->sd_bitmap_lock); } @@ -357,8 +386,8 @@ static void slot_put(struct gfs2_quota_data *qd) struct gfs2_sbd *sdp = qd->qd_sbd; spin_lock(&sdp->sd_bitmap_lock); - gfs2_assert(sdp, qd->qd_slot_count); - if (!--qd->qd_slot_count) { + gfs2_assert(sdp, qd->qd_slot_ref); + if (!--qd->qd_slot_ref) { BUG_ON(!test_and_clear_bit(qd->qd_slot, sdp->sd_quota_bitmap)); qd->qd_slot = -1; } @@ -367,7 +396,7 @@ static void slot_put(struct gfs2_quota_data *qd) static int bh_get(struct gfs2_quota_data *qd) { - struct gfs2_sbd *sdp = qd->qd_gl->gl_name.ln_sbd; + struct gfs2_sbd *sdp = qd->qd_sbd; struct inode *inode = sdp->sd_qc_inode; struct gfs2_inode *ip = GFS2_I(inode); unsigned int block, offset; @@ -421,7 +450,7 @@ fail: static void bh_put(struct gfs2_quota_data *qd) { - struct gfs2_sbd *sdp = qd->qd_gl->gl_name.ln_sbd; + struct gfs2_sbd *sdp = qd->qd_sbd; mutex_lock(&sdp->sd_quota_mutex); gfs2_assert(sdp, qd->qd_bh_count); @@ -441,6 +470,17 @@ static int qd_check_sync(struct gfs2_sbd *sdp, struct gfs2_quota_data *qd, (sync_gen && (qd->qd_sync_gen >= *sync_gen))) return 0; + /* + * If qd_change is 0 it means a pending quota change was negated. + * We should not sync it, but we still have a qd reference and slot + * reference taken by gfs2_quota_change -> do_qc that need to be put. + */ + if (!qd->qd_change && test_and_clear_bit(QDF_CHANGE, &qd->qd_flags)) { + slot_put(qd); + qd_put(qd); + return 0; + } + if (!lockref_get_not_dead(&qd->qd_lockref)) return 0; @@ -451,6 +491,20 @@ static int qd_check_sync(struct gfs2_sbd *sdp, struct gfs2_quota_data *qd, return 1; } +static int qd_bh_get_or_undo(struct gfs2_sbd *sdp, struct gfs2_quota_data *qd) +{ + int error; + + error = bh_get(qd); + if (!error) + return 0; + + clear_bit(QDF_LOCKED, &qd->qd_flags); + slot_put(qd); + qd_put(qd); + return error; +} + static int qd_fish(struct gfs2_sbd *sdp, struct gfs2_quota_data **qdp) { struct gfs2_quota_data *qd = NULL, *iter; @@ -473,30 +527,29 @@ static int qd_fish(struct gfs2_sbd *sdp, struct gfs2_quota_data **qdp) spin_unlock(&qd_lock); if (qd) { - error = bh_get(qd); - if (error) { - clear_bit(QDF_LOCKED, &qd->qd_flags); - slot_put(qd); - qd_put(qd); + error = qd_bh_get_or_undo(sdp, qd); + if (error) return error; - } + *qdp = qd; } - *qdp = qd; - return 0; } -static void qd_unlock(struct gfs2_quota_data *qd) +static void qdsb_put(struct gfs2_quota_data *qd) { - gfs2_assert_warn(qd->qd_gl->gl_name.ln_sbd, - test_bit(QDF_LOCKED, &qd->qd_flags)); - clear_bit(QDF_LOCKED, &qd->qd_flags); bh_put(qd); slot_put(qd); qd_put(qd); } +static void qd_unlock(struct gfs2_quota_data *qd) +{ + gfs2_assert_warn(qd->qd_sbd, test_bit(QDF_LOCKED, &qd->qd_flags)); + clear_bit(QDF_LOCKED, &qd->qd_flags); + qdsb_put(qd); +} + static int qdsb_get(struct gfs2_sbd *sdp, struct kqid qid, struct gfs2_quota_data **qdp) { @@ -523,13 +576,6 @@ fail: return error; } -static void qdsb_put(struct gfs2_quota_data *qd) -{ - bh_put(qd); - slot_put(qd); - qd_put(qd); -} - /** * gfs2_qa_get - make sure we have a quota allocations data structure, * if necessary @@ -666,7 +712,7 @@ static int sort_qd(const void *a, const void *b) static void do_qc(struct gfs2_quota_data *qd, s64 change, int qc_type) { - struct gfs2_sbd *sdp = qd->qd_gl->gl_name.ln_sbd; + struct gfs2_sbd *sdp = qd->qd_sbd; struct gfs2_inode *ip = GFS2_I(sdp->sd_qc_inode); struct gfs2_quota_change *qc = qd->qd_bh_qc; s64 x; @@ -708,31 +754,30 @@ static void do_qc(struct gfs2_quota_data *qd, s64 change, int qc_type) mutex_unlock(&sdp->sd_quota_mutex); } -static int gfs2_write_buf_to_page(struct gfs2_inode *ip, unsigned long index, +static int gfs2_write_buf_to_page(struct gfs2_sbd *sdp, unsigned long index, unsigned off, void *buf, unsigned bytes) { + struct gfs2_inode *ip = GFS2_I(sdp->sd_quota_inode); struct inode *inode = &ip->i_inode; - struct gfs2_sbd *sdp = GFS2_SB(inode); struct address_space *mapping = inode->i_mapping; - struct page *page; + struct folio *folio; struct buffer_head *bh; u64 blk; unsigned bsize = sdp->sd_sb.sb_bsize, bnum = 0, boff = 0; unsigned to_write = bytes, pg_off = off; - int done = 0; blk = index << (PAGE_SHIFT - sdp->sd_sb.sb_bsize_shift); boff = off % bsize; - page = find_or_create_page(mapping, index, GFP_NOFS); - if (!page) - return -ENOMEM; - if (!page_has_buffers(page)) - create_empty_buffers(page, bsize, 0); + folio = filemap_grab_folio(mapping, index); + if (IS_ERR(folio)) + return PTR_ERR(folio); + bh = folio_buffers(folio); + if (!bh) + bh = create_empty_buffers(folio, bsize, 0); - bh = page_buffers(page); - while (!done) { - /* Find the beginning block within the page */ + for (;;) { + /* Find the beginning block within the folio */ if (pg_off >= ((bnum * bsize) + bsize)) { bh = bh->b_this_page; bnum++; @@ -745,16 +790,14 @@ static int gfs2_write_buf_to_page(struct gfs2_inode *ip, unsigned long index, goto unlock_out; /* If it's a newly allocated disk block, zero it */ if (buffer_new(bh)) - zero_user(page, bnum * bsize, bh->b_size); + folio_zero_range(folio, bnum * bsize, + bh->b_size); } - if (PageUptodate(page)) + if (folio_test_uptodate(folio)) set_buffer_uptodate(bh); if (bh_read(bh, REQ_META | REQ_PRIO) < 0) goto unlock_out; - if (gfs2_is_jdata(ip)) - gfs2_trans_add_data(ip->i_gl, bh); - else - gfs2_ordered_add_inode(ip); + gfs2_trans_add_data(ip->i_gl, bh); /* If we need to write to the next block as well */ if (to_write > (bsize - boff)) { @@ -763,29 +806,29 @@ static int gfs2_write_buf_to_page(struct gfs2_inode *ip, unsigned long index, boff = pg_off % bsize; continue; } - done = 1; + break; } - /* Write to the page, now that we have setup the buffer(s) */ - memcpy_to_page(page, off, buf, bytes); - flush_dcache_page(page); - unlock_page(page); - put_page(page); + /* Write to the folio, now that we have setup the buffer(s) */ + memcpy_to_folio(folio, off, buf, bytes); + flush_dcache_folio(folio); + folio_unlock(folio); + folio_put(folio); return 0; unlock_out: - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); return -EIO; } -static int gfs2_write_disk_quota(struct gfs2_inode *ip, struct gfs2_quota *qp, +static int gfs2_write_disk_quota(struct gfs2_sbd *sdp, struct gfs2_quota *qp, loff_t loc) { unsigned long pg_beg; unsigned pg_off, nbytes, overflow = 0; - int pg_oflow = 0, error; + int error; void *ptr; nbytes = sizeof(struct gfs2_quota); @@ -794,17 +837,15 @@ static int gfs2_write_disk_quota(struct gfs2_inode *ip, struct gfs2_quota *qp, pg_off = offset_in_page(loc); /* If the quota straddles a page boundary, split the write in two */ - if ((pg_off + nbytes) > PAGE_SIZE) { - pg_oflow = 1; + if ((pg_off + nbytes) > PAGE_SIZE) overflow = (pg_off + nbytes) - PAGE_SIZE; - } ptr = qp; - error = gfs2_write_buf_to_page(ip, pg_beg, pg_off, ptr, + error = gfs2_write_buf_to_page(sdp, pg_beg, pg_off, ptr, nbytes - overflow); /* If there's an overflow, write the remaining bytes to the next page */ - if (!error && pg_oflow) - error = gfs2_write_buf_to_page(ip, pg_beg + 1, 0, + if (!error && overflow) + error = gfs2_write_buf_to_page(sdp, pg_beg + 1, 0, ptr + nbytes - overflow, overflow); return error; @@ -812,7 +853,7 @@ static int gfs2_write_disk_quota(struct gfs2_inode *ip, struct gfs2_quota *qp, /** * gfs2_adjust_quota - adjust record of current block usage - * @ip: The quota inode + * @sdp: The superblock * @loc: Offset of the entry in the quota file * @change: The amount of usage change to record * @qd: The quota data @@ -824,12 +865,12 @@ static int gfs2_write_disk_quota(struct gfs2_inode *ip, struct gfs2_quota *qp, * Returns: 0 or -ve on error */ -static int gfs2_adjust_quota(struct gfs2_inode *ip, loff_t loc, +static int gfs2_adjust_quota(struct gfs2_sbd *sdp, loff_t loc, s64 change, struct gfs2_quota_data *qd, struct qc_dqblk *fdq) { + struct gfs2_inode *ip = GFS2_I(sdp->sd_quota_inode); struct inode *inode = &ip->i_inode; - struct gfs2_sbd *sdp = GFS2_SB(inode); struct gfs2_quota q; int err; u64 size; @@ -846,7 +887,6 @@ static int gfs2_adjust_quota(struct gfs2_inode *ip, loff_t loc, return err; loc -= sizeof(q); /* gfs2_internal_read would've advanced the loc ptr */ - err = -EIO; be64_add_cpu(&q.qu_value, change); if (((s64)be64_to_cpu(q.qu_value)) < 0) q.qu_value = 0; /* Never go negative on quota usage */ @@ -866,12 +906,12 @@ static int gfs2_adjust_quota(struct gfs2_inode *ip, loff_t loc, } } - err = gfs2_write_disk_quota(ip, &q, loc); + err = gfs2_write_disk_quota(sdp, &q, loc); if (!err) { size = loc + sizeof(struct gfs2_quota); if (size > inode->i_size) i_size_write(inode, size); - inode->i_mtime = inode->i_atime = current_time(inode); + inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); mark_inode_dirty(inode); set_bit(QDF_REFRESH, &qd->qd_flags); } @@ -881,9 +921,9 @@ static int gfs2_adjust_quota(struct gfs2_inode *ip, loff_t loc, static int do_sync(unsigned int num_qd, struct gfs2_quota_data **qda) { - struct gfs2_sbd *sdp = (*qda)->qd_gl->gl_name.ln_sbd; + struct gfs2_sbd *sdp = (*qda)->qd_sbd; struct gfs2_inode *ip = GFS2_I(sdp->sd_quota_inode); - struct gfs2_alloc_parms ap = { .aflags = 0, }; + struct gfs2_alloc_parms ap = {}; unsigned int data_blocks, ind_blocks; struct gfs2_holder *ghs, i_gh; unsigned int qx, x; @@ -893,18 +933,12 @@ static int do_sync(unsigned int num_qd, struct gfs2_quota_data **qda) unsigned int nalloc = 0, blocks; int error; - error = gfs2_qa_get(ip); - if (error) - return error; - gfs2_write_calc_reserv(ip, sizeof(struct gfs2_quota), &data_blocks, &ind_blocks); ghs = kmalloc_array(num_qd, sizeof(struct gfs2_holder), GFP_NOFS); - if (!ghs) { - error = -ENOMEM; - goto out; - } + if (!ghs) + return -ENOMEM; sort(qda, num_qd, sizeof(struct gfs2_quota_data *), sort_qd, NULL); inode_lock(&ip->i_inode); @@ -953,7 +987,8 @@ static int do_sync(unsigned int num_qd, struct gfs2_quota_data **qda) for (x = 0; x < num_qd; x++) { qd = qda[x]; offset = qd2offset(qd); - error = gfs2_adjust_quota(ip, offset, qd->qd_change_sync, qd, NULL); + error = gfs2_adjust_quota(sdp, offset, qd->qd_change_sync, qd, + NULL); if (error) goto out_end_trans; @@ -961,8 +996,6 @@ static int do_sync(unsigned int num_qd, struct gfs2_quota_data **qda) set_bit(QDF_REFRESH, &qd->qd_flags); } - error = 0; - out_end_trans: gfs2_trans_end(sdp); out_ipres: @@ -976,8 +1009,10 @@ out_dq: kfree(ghs); gfs2_log_flush(ip->i_gl->gl_name.ln_sbd, ip->i_gl, GFS2_LOG_HEAD_FLUSH_NORMAL | GFS2_LFC_DO_SYNC); -out: - gfs2_qa_put(ip); + if (!error) { + for (x = 0; x < num_qd; x++) + qda[x]->qd_sync_gen = sdp->sd_quota_sync_gen; + } return error; } @@ -1009,11 +1044,12 @@ static int update_qd(struct gfs2_sbd *sdp, struct gfs2_quota_data *qd) static int do_glock(struct gfs2_quota_data *qd, int force_refresh, struct gfs2_holder *q_gh) { - struct gfs2_sbd *sdp = qd->qd_gl->gl_name.ln_sbd; + struct gfs2_sbd *sdp = qd->qd_sbd; struct gfs2_inode *ip = GFS2_I(sdp->sd_quota_inode); struct gfs2_holder i_gh; int error; + gfs2_assert_warn(sdp, sdp == qd->qd_gl->gl_name.ln_sbd); restart: error = gfs2_glock_nq_init(qd->qd_gl, LM_ST_SHARED, 0, q_gh); if (error) @@ -1059,9 +1095,9 @@ int gfs2_quota_lock(struct gfs2_inode *ip, kuid_t uid, kgid_t gid) struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); struct gfs2_quota_data *qd; u32 x; - int error = 0; + int error; - if (sdp->sd_args.ar_quota != GFS2_QUOTA_ON) + if (sdp->sd_args.ar_quota == GFS2_QUOTA_OFF) return 0; error = gfs2_quota_hold(ip, uid, gid); @@ -1089,16 +1125,15 @@ int gfs2_quota_lock(struct gfs2_inode *ip, kuid_t uid, kgid_t gid) return error; } -static int need_sync(struct gfs2_quota_data *qd) +static bool need_sync(struct gfs2_quota_data *qd) { - struct gfs2_sbd *sdp = qd->qd_gl->gl_name.ln_sbd; + struct gfs2_sbd *sdp = qd->qd_sbd; struct gfs2_tune *gt = &sdp->sd_tune; s64 value; unsigned int num, den; - int do_sync = 1; if (!qd->qd_qb.qb_limit) - return 0; + return false; spin_lock(&qd_lock); value = qd->qd_change; @@ -1109,26 +1144,26 @@ static int need_sync(struct gfs2_quota_data *qd) den = gt->gt_quota_scale_den; spin_unlock(>->gt_spin); - if (value < 0) - do_sync = 0; + if (value <= 0) + return false; else if ((s64)be64_to_cpu(qd->qd_qb.qb_value) >= (s64)be64_to_cpu(qd->qd_qb.qb_limit)) - do_sync = 0; + return false; else { value *= gfs2_jindex_size(sdp) * num; value = div_s64(value, den); value += (s64)be64_to_cpu(qd->qd_qb.qb_value); if (value < (s64)be64_to_cpu(qd->qd_qb.qb_limit)) - do_sync = 0; + return false; } - return do_sync; + return true; } void gfs2_quota_unlock(struct gfs2_inode *ip) { struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); - struct gfs2_quota_data *qda[4]; + struct gfs2_quota_data *qda[2 * GFS2_MAXQUOTAS]; unsigned int count = 0; u32 x; int found; @@ -1138,7 +1173,7 @@ void gfs2_quota_unlock(struct gfs2_inode *ip) for (x = 0; x < ip->i_qadata->qa_qd_num; x++) { struct gfs2_quota_data *qd; - int sync; + bool sync; qd = ip->i_qadata->qa_qd[x]; sync = need_sync(qd); @@ -1154,15 +1189,8 @@ void gfs2_quota_unlock(struct gfs2_inode *ip) if (!found) continue; - gfs2_assert_warn(sdp, qd->qd_change_sync); - if (bh_get(qd)) { - clear_bit(QDF_LOCKED, &qd->qd_flags); - slot_put(qd); - qd_put(qd); - continue; - } - - qda[count++] = qd; + if (!qd_bh_get_or_undo(sdp, qd)) + qda[count++] = qd; } if (count) { @@ -1176,16 +1204,16 @@ void gfs2_quota_unlock(struct gfs2_inode *ip) #define MAX_LINE 256 -static int print_message(struct gfs2_quota_data *qd, char *type) +static void print_message(struct gfs2_quota_data *qd, char *type) { - struct gfs2_sbd *sdp = qd->qd_gl->gl_name.ln_sbd; - - fs_info(sdp, "quota %s for %s %u\n", - type, - (qd->qd_id.type == USRQUOTA) ? "user" : "group", - from_kqid(&init_user_ns, qd->qd_id)); + struct gfs2_sbd *sdp = qd->qd_sbd; - return 0; + if (sdp->sd_args.ar_quota != GFS2_QUOTA_QUIET) { + fs_info(sdp, "quota %s for %s %u\n", + type, + (qd->qd_id.type == USRQUOTA) ? "user" : "group", + from_kqid(&init_user_ns, qd->qd_id)); + } } /** @@ -1255,7 +1283,8 @@ int gfs2_quota_check(struct gfs2_inode *ip, kuid_t uid, kgid_t gid, * HZ)) { quota_send_warning(qd->qd_id, sdp->sd_vfs->s_dev, QUOTA_NL_BSOFTWARN); - error = print_message(qd, "warning"); + print_message(qd, "warning"); + error = 0; qd->qd_last_warn = jiffies; } } @@ -1269,7 +1298,7 @@ void gfs2_quota_change(struct gfs2_inode *ip, s64 change, u32 x; struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); - if (sdp->sd_args.ar_quota != GFS2_QUOTA_ON || + if (sdp->sd_args.ar_quota == GFS2_QUOTA_OFF || gfs2_assert_warn(sdp, change)) return; if (ip->i_diskflags & GFS2_DIF_SYSTEM) @@ -1288,6 +1317,24 @@ void gfs2_quota_change(struct gfs2_inode *ip, s64 change, } } +static bool qd_changed(struct gfs2_sbd *sdp) +{ + struct gfs2_quota_data *qd; + bool changed = false; + + spin_lock(&qd_lock); + list_for_each_entry(qd, &sdp->sd_quota_list, qd_list) { + if (test_bit(QDF_LOCKED, &qd->qd_flags) || + !test_bit(QDF_CHANGE, &qd->qd_flags)) + continue; + + changed = true; + break; + } + spin_unlock(&qd_lock); + return changed; +} + int gfs2_quota_sync(struct super_block *sb, int type) { struct gfs2_sbd *sdp = sb->s_fs_info; @@ -1297,6 +1344,9 @@ int gfs2_quota_sync(struct super_block *sb, int type) unsigned int x; int error = 0; + if (!qd_changed(sdp)) + return 0; + qda = kcalloc(max_qd, sizeof(struct gfs2_quota_data *), GFP_KERNEL); if (!qda) return -ENOMEM; @@ -1318,10 +1368,6 @@ int gfs2_quota_sync(struct super_block *sb, int type) if (num_qd) { if (!error) error = do_sync(num_qd, qda); - if (!error) - for (x = 0; x < num_qd; x++) - qda[x]->qd_sync_gen = - sdp->sd_quota_sync_gen; for (x = 0; x < num_qd; x++) qd_unlock(qda[x]); @@ -1423,7 +1469,7 @@ int gfs2_quota_init(struct gfs2_sbd *sdp) set_bit(QDF_CHANGE, &qd->qd_flags); qd->qd_change = qc_change; qd->qd_slot = slot; - qd->qd_slot_count = 1; + qd->qd_slot_ref = 1; spin_lock(&qd_lock); BUG_ON(test_and_set_bit(slot, sdp->sd_quota_bitmap)); @@ -1455,36 +1501,35 @@ fail: void gfs2_quota_cleanup(struct gfs2_sbd *sdp) { - struct list_head *head = &sdp->sd_quota_list; struct gfs2_quota_data *qd; + LIST_HEAD(dispose); + int count; - spin_lock(&qd_lock); - while (!list_empty(head)) { - qd = list_last_entry(head, struct gfs2_quota_data, qd_list); + BUG_ON(test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags)); - list_del(&qd->qd_list); + spin_lock(&qd_lock); + list_for_each_entry(qd, &sdp->sd_quota_list, qd_list) { + spin_lock(&qd->qd_lockref.lock); + if (qd->qd_lockref.count != 0) { + spin_unlock(&qd->qd_lockref.lock); + continue; + } + lockref_mark_dead(&qd->qd_lockref); + spin_unlock(&qd->qd_lockref.lock); - /* Also remove if this qd exists in the reclaim list */ list_lru_del(&gfs2_qd_lru, &qd->qd_lru); - atomic_dec(&sdp->sd_quota_count); - spin_unlock(&qd_lock); - - spin_lock_bucket(qd->qd_hash); - hlist_bl_del_rcu(&qd->qd_hlist); - spin_unlock_bucket(qd->qd_hash); - - gfs2_assert_warn(sdp, !qd->qd_change); - gfs2_assert_warn(sdp, !qd->qd_slot_count); - gfs2_assert_warn(sdp, !qd->qd_bh_count); - - gfs2_glock_put(qd->qd_gl); - call_rcu(&qd->qd_rcu, gfs2_qd_dealloc); - - spin_lock(&qd_lock); + list_add(&qd->qd_lru, &dispose); } spin_unlock(&qd_lock); - gfs2_assert_warn(sdp, !atomic_read(&sdp->sd_quota_count)); + gfs2_qd_list_dispose(&dispose); + + wait_event_timeout(sdp->sd_kill_wait, + (count = atomic_read(&sdp->sd_quota_count)) == 0, + HZ * 60); + + if (count != 0) + fs_err(sdp, "%d left-over quota data objects\n", count); kvfree(sdp->sd_quota_bitmap); sdp->sd_quota_bitmap = NULL; @@ -1536,12 +1581,11 @@ int gfs2_quotad(void *data) unsigned long statfs_timeo = 0; unsigned long quotad_timeo = 0; unsigned long t = 0; - DEFINE_WAIT(wait); while (!kthread_should_stop()) { - if (gfs2_withdrawn(sdp)) - goto bypass; + break; + /* Update the master statfs file */ if (sdp->sd_statfs_force_sync) { int error = gfs2_statfs_sync(sdp->sd_vfs, 0); @@ -1559,15 +1603,16 @@ int gfs2_quotad(void *data) try_to_freeze(); -bypass: t = min(quotad_timeo, statfs_timeo); - prepare_to_wait(&sdp->sd_quota_wait, &wait, TASK_INTERRUPTIBLE); - if (!sdp->sd_statfs_force_sync) - t -= schedule_timeout(t); - else + t = wait_event_interruptible_timeout(sdp->sd_quota_wait, + sdp->sd_statfs_force_sync || + gfs2_withdrawn(sdp) || + kthread_should_stop(), + t); + + if (sdp->sd_statfs_force_sync) t = 0; - finish_wait(&sdp->sd_quota_wait, &wait); } return 0; @@ -1580,6 +1625,8 @@ static int gfs2_quota_get_state(struct super_block *sb, struct qc_state *state) memset(state, 0, sizeof(*state)); switch (sdp->sd_args.ar_quota) { + case GFS2_QUOTA_QUIET: + fallthrough; case GFS2_QUOTA_ON: state->s_state[USRQUOTA].flags |= QCI_LIMITS_ENFORCED; state->s_state[GRPQUOTA].flags |= QCI_LIMITS_ENFORCED; @@ -1708,7 +1755,7 @@ static int gfs2_set_dqblk(struct super_block *sb, struct kqid qid, if (gfs2_is_stuffed(ip)) alloc_required = 1; if (alloc_required) { - struct gfs2_alloc_parms ap = { .aflags = 0, }; + struct gfs2_alloc_parms ap = {}; gfs2_write_calc_reserv(ip, sizeof(struct gfs2_quota), &data_blocks, &ind_blocks); blocks = 1 + data_blocks + ind_blocks; @@ -1726,7 +1773,7 @@ static int gfs2_set_dqblk(struct super_block *sb, struct kqid qid, goto out_release; /* Apply changes */ - error = gfs2_adjust_quota(ip, offset, 0, qd, fdq); + error = gfs2_adjust_quota(sdp, offset, 0, qd, fdq); if (!error) clear_bit(QDF_QMSG_QUIET, &qd->qd_flags); diff --git a/fs/gfs2/quota.h b/fs/gfs2/quota.h index 21ada332d555..f462d9cb3087 100644 --- a/fs/gfs2/quota.h +++ b/fs/gfs2/quota.h @@ -15,27 +15,27 @@ struct gfs2_sbd; #define NO_UID_QUOTA_CHANGE INVALID_UID #define NO_GID_QUOTA_CHANGE INVALID_GID -extern int gfs2_qa_get(struct gfs2_inode *ip); -extern void gfs2_qa_put(struct gfs2_inode *ip); -extern int gfs2_quota_hold(struct gfs2_inode *ip, kuid_t uid, kgid_t gid); -extern void gfs2_quota_unhold(struct gfs2_inode *ip); +int gfs2_qa_get(struct gfs2_inode *ip); +void gfs2_qa_put(struct gfs2_inode *ip); +int gfs2_quota_hold(struct gfs2_inode *ip, kuid_t uid, kgid_t gid); +void gfs2_quota_unhold(struct gfs2_inode *ip); -extern int gfs2_quota_lock(struct gfs2_inode *ip, kuid_t uid, kgid_t gid); -extern void gfs2_quota_unlock(struct gfs2_inode *ip); +int gfs2_quota_lock(struct gfs2_inode *ip, kuid_t uid, kgid_t gid); +void gfs2_quota_unlock(struct gfs2_inode *ip); -extern int gfs2_quota_check(struct gfs2_inode *ip, kuid_t uid, kgid_t gid, - struct gfs2_alloc_parms *ap); -extern void gfs2_quota_change(struct gfs2_inode *ip, s64 change, - kuid_t uid, kgid_t gid); +int gfs2_quota_check(struct gfs2_inode *ip, kuid_t uid, kgid_t gid, + struct gfs2_alloc_parms *ap); +void gfs2_quota_change(struct gfs2_inode *ip, s64 change, + kuid_t uid, kgid_t gid); -extern int gfs2_quota_sync(struct super_block *sb, int type); -extern int gfs2_quota_refresh(struct gfs2_sbd *sdp, struct kqid qid); +int gfs2_quota_sync(struct super_block *sb, int type); +int gfs2_quota_refresh(struct gfs2_sbd *sdp, struct kqid qid); -extern int gfs2_quota_init(struct gfs2_sbd *sdp); -extern void gfs2_quota_cleanup(struct gfs2_sbd *sdp); -extern int gfs2_quotad(void *data); +int gfs2_quota_init(struct gfs2_sbd *sdp); +void gfs2_quota_cleanup(struct gfs2_sbd *sdp); +int gfs2_quotad(void *data); -extern void gfs2_wake_up_statfs(struct gfs2_sbd *sdp); +void gfs2_wake_up_statfs(struct gfs2_sbd *sdp); static inline int gfs2_quota_lock_check(struct gfs2_inode *ip, struct gfs2_alloc_parms *ap) @@ -50,7 +50,7 @@ static inline int gfs2_quota_lock_check(struct gfs2_inode *ip, ret = gfs2_quota_lock(ip, NO_UID_QUOTA_CHANGE, NO_GID_QUOTA_CHANGE); if (ret) return ret; - if (sdp->sd_args.ar_quota != GFS2_QUOTA_ON) + if (sdp->sd_args.ar_quota == GFS2_QUOTA_ACCOUNT) return 0; ret = gfs2_quota_check(ip, ip->i_inode.i_uid, ip->i_inode.i_gid, ap); if (ret) @@ -59,8 +59,10 @@ static inline int gfs2_quota_lock_check(struct gfs2_inode *ip, } extern const struct quotactl_ops gfs2_quotactl_ops; -extern struct shrinker gfs2_qd_shrinker; +int __init gfs2_qd_shrinker_init(void); +void gfs2_qd_shrinker_exit(void); extern struct list_lru gfs2_qd_lru; -extern void __init gfs2_quota_hash_init(void); + +void __init gfs2_quota_hash_init(void); #endif /* __QUOTA_DOT_H__ */ diff --git a/fs/gfs2/recovery.c b/fs/gfs2/recovery.c index 9c7a9f640bad..5aae02669a40 100644 --- a/fs/gfs2/recovery.c +++ b/fs/gfs2/recovery.c @@ -27,7 +27,7 @@ #include "util.h" #include "dir.h" -struct workqueue_struct *gfs_recovery_wq; +struct workqueue_struct *gfs2_recovery_wq; int gfs2_replay_read_block(struct gfs2_jdesc *jd, unsigned int blk, struct buffer_head **bh) @@ -570,7 +570,7 @@ int gfs2_recover_journal(struct gfs2_jdesc *jd, bool wait) return -EBUSY; /* we have JDF_RECOVERY, queue should always succeed */ - rv = queue_work(gfs_recovery_wq, &jd->jd_work); + rv = queue_work(gfs2_recovery_wq, &jd->jd_work); BUG_ON(!rv); if (wait) diff --git a/fs/gfs2/recovery.h b/fs/gfs2/recovery.h index 0d30f8e804f4..6a0fd42e1120 100644 --- a/fs/gfs2/recovery.h +++ b/fs/gfs2/recovery.h @@ -9,7 +9,7 @@ #include "incore.h" -extern struct workqueue_struct *gfs_recovery_wq; +extern struct workqueue_struct *gfs2_recovery_wq; static inline void gfs2_replay_incr_blk(struct gfs2_jdesc *jd, u32 *blk) { @@ -17,18 +17,18 @@ static inline void gfs2_replay_incr_blk(struct gfs2_jdesc *jd, u32 *blk) *blk = 0; } -extern int gfs2_replay_read_block(struct gfs2_jdesc *jd, unsigned int blk, +int gfs2_replay_read_block(struct gfs2_jdesc *jd, unsigned int blk, struct buffer_head **bh); -extern int gfs2_revoke_add(struct gfs2_jdesc *jd, u64 blkno, unsigned int where); -extern int gfs2_revoke_check(struct gfs2_jdesc *jd, u64 blkno, unsigned int where); -extern void gfs2_revoke_clean(struct gfs2_jdesc *jd); +int gfs2_revoke_add(struct gfs2_jdesc *jd, u64 blkno, unsigned int where); +int gfs2_revoke_check(struct gfs2_jdesc *jd, u64 blkno, unsigned int where); +void gfs2_revoke_clean(struct gfs2_jdesc *jd); -extern int gfs2_recover_journal(struct gfs2_jdesc *gfs2_jd, bool wait); -extern void gfs2_recover_func(struct work_struct *work); -extern int __get_log_header(struct gfs2_sbd *sdp, - const struct gfs2_log_header *lh, unsigned int blkno, - struct gfs2_log_header_host *head); +int gfs2_recover_journal(struct gfs2_jdesc *gfs2_jd, bool wait); +void gfs2_recover_func(struct work_struct *work); +int __get_log_header(struct gfs2_sbd *sdp, + const struct gfs2_log_header *lh, unsigned int blkno, + struct gfs2_log_header_host *head); #endif /* __RECOVERY_DOT_H__ */ diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c index 9308190895c8..c2060203b98a 100644 --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c @@ -2411,13 +2411,12 @@ static void gfs2_set_alloc_start(struct gfs2_rbm *rbm, * @bn: Used to return the starting block number * @nblocks: requested number of blocks/extent length (value/result) * @dinode: 1 if we're allocating a dinode block, else 0 - * @generation: the generation number of the inode * * Returns: 0 or error */ int gfs2_alloc_blocks(struct gfs2_inode *ip, u64 *bn, unsigned int *nblocks, - bool dinode, u64 *generation) + bool dinode) { struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); struct buffer_head *dibh; @@ -2477,10 +2476,13 @@ int gfs2_alloc_blocks(struct gfs2_inode *ip, u64 *bn, unsigned int *nblocks, rbm.rgd->rd_free -= *nblocks; spin_unlock(&rbm.rgd->rd_rsspin); if (dinode) { + u64 generation; + rbm.rgd->rd_dinodes++; - *generation = rbm.rgd->rd_igeneration++; - if (*generation == 0) - *generation = rbm.rgd->rd_igeneration++; + generation = rbm.rgd->rd_igeneration++; + if (generation == 0) + generation = rbm.rgd->rd_igeneration++; + ip->i_generation = generation; } gfs2_trans_add_meta(rbm.rgd->rd_gl, rbm.rgd->rd_bits[0].bi_bh); diff --git a/fs/gfs2/rgrp.h b/fs/gfs2/rgrp.h index 00b30cf893af..8d20e99385db 100644 --- a/fs/gfs2/rgrp.h +++ b/fs/gfs2/rgrp.h @@ -22,38 +22,38 @@ struct gfs2_rgrpd; struct gfs2_sbd; struct gfs2_holder; -extern void gfs2_rgrp_verify(struct gfs2_rgrpd *rgd); +void gfs2_rgrp_verify(struct gfs2_rgrpd *rgd); -extern struct gfs2_rgrpd *gfs2_blk2rgrpd(struct gfs2_sbd *sdp, u64 blk, bool exact); -extern struct gfs2_rgrpd *gfs2_rgrpd_get_first(struct gfs2_sbd *sdp); -extern struct gfs2_rgrpd *gfs2_rgrpd_get_next(struct gfs2_rgrpd *rgd); +struct gfs2_rgrpd *gfs2_blk2rgrpd(struct gfs2_sbd *sdp, u64 blk, bool exact); +struct gfs2_rgrpd *gfs2_rgrpd_get_first(struct gfs2_sbd *sdp); +struct gfs2_rgrpd *gfs2_rgrpd_get_next(struct gfs2_rgrpd *rgd); -extern void gfs2_clear_rgrpd(struct gfs2_sbd *sdp); -extern int gfs2_rindex_update(struct gfs2_sbd *sdp); -extern void gfs2_free_clones(struct gfs2_rgrpd *rgd); -extern int gfs2_rgrp_go_instantiate(struct gfs2_glock *gl); -extern void gfs2_rgrp_brelse(struct gfs2_rgrpd *rgd); +void gfs2_clear_rgrpd(struct gfs2_sbd *sdp); +int gfs2_rindex_update(struct gfs2_sbd *sdp); +void gfs2_free_clones(struct gfs2_rgrpd *rgd); +int gfs2_rgrp_go_instantiate(struct gfs2_glock *gl); +void gfs2_rgrp_brelse(struct gfs2_rgrpd *rgd); -extern struct gfs2_alloc *gfs2_alloc_get(struct gfs2_inode *ip); +struct gfs2_alloc *gfs2_alloc_get(struct gfs2_inode *ip); #define GFS2_AF_ORLOV 1 -extern int gfs2_inplace_reserve(struct gfs2_inode *ip, - struct gfs2_alloc_parms *ap); -extern void gfs2_inplace_release(struct gfs2_inode *ip); - -extern int gfs2_alloc_blocks(struct gfs2_inode *ip, u64 *bn, unsigned int *n, - bool dinode, u64 *generation); - -extern void gfs2_rs_deltree(struct gfs2_blkreserv *rs); -extern void gfs2_rs_delete(struct gfs2_inode *ip); -extern void __gfs2_free_blocks(struct gfs2_inode *ip, struct gfs2_rgrpd *rgd, - u64 bstart, u32 blen, int meta); -extern void gfs2_free_meta(struct gfs2_inode *ip, struct gfs2_rgrpd *rgd, - u64 bstart, u32 blen); -extern void gfs2_free_di(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip); -extern void gfs2_unlink_di(struct inode *inode); -extern int gfs2_check_blk_type(struct gfs2_sbd *sdp, u64 no_addr, - unsigned int type); +int gfs2_inplace_reserve(struct gfs2_inode *ip, + struct gfs2_alloc_parms *ap); +void gfs2_inplace_release(struct gfs2_inode *ip); + +int gfs2_alloc_blocks(struct gfs2_inode *ip, u64 *bn, unsigned int *n, + bool dinode); + +void gfs2_rs_deltree(struct gfs2_blkreserv *rs); +void gfs2_rs_delete(struct gfs2_inode *ip); +void __gfs2_free_blocks(struct gfs2_inode *ip, struct gfs2_rgrpd *rgd, + u64 bstart, u32 blen, int meta); +void gfs2_free_meta(struct gfs2_inode *ip, struct gfs2_rgrpd *rgd, + u64 bstart, u32 blen); +void gfs2_free_di(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip); +void gfs2_unlink_di(struct inode *inode); +int gfs2_check_blk_type(struct gfs2_sbd *sdp, u64 no_addr, + unsigned int type); struct gfs2_rgrp_list { unsigned int rl_rgrps; @@ -62,18 +62,19 @@ struct gfs2_rgrp_list { struct gfs2_holder *rl_ghs; }; -extern void gfs2_rlist_add(struct gfs2_inode *ip, struct gfs2_rgrp_list *rlist, - u64 block); -extern void gfs2_rlist_alloc(struct gfs2_rgrp_list *rlist, - unsigned int state, u16 flags); -extern void gfs2_rlist_free(struct gfs2_rgrp_list *rlist); -extern u64 gfs2_ri_total(struct gfs2_sbd *sdp); -extern void gfs2_rgrp_dump(struct seq_file *seq, struct gfs2_rgrpd *rgd, - const char *fs_id_buf); -extern int gfs2_rgrp_send_discards(struct gfs2_sbd *sdp, u64 offset, - struct buffer_head *bh, - const struct gfs2_bitmap *bi, unsigned minlen, u64 *ptrimmed); -extern int gfs2_fitrim(struct file *filp, void __user *argp); +void gfs2_rlist_add(struct gfs2_inode *ip, struct gfs2_rgrp_list *rlist, + u64 block); +void gfs2_rlist_alloc(struct gfs2_rgrp_list *rlist, + unsigned int state, u16 flags); +void gfs2_rlist_free(struct gfs2_rgrp_list *rlist); +u64 gfs2_ri_total(struct gfs2_sbd *sdp); +void gfs2_rgrp_dump(struct seq_file *seq, struct gfs2_rgrpd *rgd, + const char *fs_id_buf); +int gfs2_rgrp_send_discards(struct gfs2_sbd *sdp, u64 offset, + struct buffer_head *bh, + const struct gfs2_bitmap *bi, unsigned minlen, + u64 *ptrimmed); +int gfs2_fitrim(struct file *filp, void __user *argp); /* This is how to tell if a reservation is in the rgrp tree: */ static inline bool gfs2_rs_active(const struct gfs2_blkreserv *rs) @@ -88,9 +89,9 @@ static inline int rgrp_contains_block(struct gfs2_rgrpd *rgd, u64 block) return first <= block && block < last; } -extern void check_and_update_goal(struct gfs2_inode *ip); +void check_and_update_goal(struct gfs2_inode *ip); -extern void rgrp_lock_local(struct gfs2_rgrpd *rgd); -extern void rgrp_unlock_local(struct gfs2_rgrpd *rgd); +void rgrp_lock_local(struct gfs2_rgrpd *rgd); +void rgrp_unlock_local(struct gfs2_rgrpd *rgd); #endif /* __RGRP_DOT_H__ */ diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c index 9f4d5d6549ee..d21c04a22d73 100644 --- a/fs/gfs2/super.c +++ b/fs/gfs2/super.c @@ -410,9 +410,9 @@ void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf) str->di_nlink = cpu_to_be32(inode->i_nlink); str->di_size = cpu_to_be64(i_size_read(inode)); str->di_blocks = cpu_to_be64(gfs2_get_inode_blocks(inode)); - str->di_atime = cpu_to_be64(inode->i_atime.tv_sec); - str->di_mtime = cpu_to_be64(inode->i_mtime.tv_sec); - str->di_ctime = cpu_to_be64(inode->i_ctime.tv_sec); + str->di_atime = cpu_to_be64(inode_get_atime_sec(inode)); + str->di_mtime = cpu_to_be64(inode_get_mtime_sec(inode)); + str->di_ctime = cpu_to_be64(inode_get_ctime_sec(inode)); str->di_goal_meta = cpu_to_be64(ip->i_goal); str->di_goal_data = cpu_to_be64(ip->i_goal); @@ -427,9 +427,9 @@ void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf) str->di_entries = cpu_to_be32(ip->i_entries); str->di_eattr = cpu_to_be64(ip->i_eattr); - str->di_atime_nsec = cpu_to_be32(inode->i_atime.tv_nsec); - str->di_mtime_nsec = cpu_to_be32(inode->i_mtime.tv_nsec); - str->di_ctime_nsec = cpu_to_be32(inode->i_ctime.tv_nsec); + str->di_atime_nsec = cpu_to_be32(inode_get_atime_nsec(inode)); + str->di_mtime_nsec = cpu_to_be32(inode_get_mtime_nsec(inode)); + str->di_ctime_nsec = cpu_to_be32(inode_get_ctime_nsec(inode)); } /** @@ -546,20 +546,10 @@ void gfs2_make_fs_ro(struct gfs2_sbd *sdp) { int log_write_allowed = test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags); - if (!test_bit(SDF_DEACTIVATING, &sdp->sd_flags)) + if (!test_bit(SDF_KILL, &sdp->sd_flags)) gfs2_flush_delete_work(sdp); - if (!log_write_allowed && current == sdp->sd_quotad_process) - fs_warn(sdp, "The quotad daemon is withdrawing.\n"); - else if (sdp->sd_quotad_process) - kthread_stop(sdp->sd_quotad_process); - sdp->sd_quotad_process = NULL; - - if (!log_write_allowed && current == sdp->sd_logd_process) - fs_warn(sdp, "The logd daemon is withdrawing.\n"); - else if (sdp->sd_logd_process) - kthread_stop(sdp->sd_logd_process); - sdp->sd_logd_process = NULL; + gfs2_destroy_threads(sdp); if (log_write_allowed) { gfs2_quota_sync(sdp->sd_vfs, 0); @@ -580,15 +570,8 @@ void gfs2_make_fs_ro(struct gfs2_sbd *sdp) gfs2_log_is_empty(sdp), HZ * 5); gfs2_assert_warn(sdp, gfs2_log_is_empty(sdp)); - } else { - wait_event_timeout(sdp->sd_log_waitq, - gfs2_log_is_empty(sdp), - HZ * 5); } gfs2_quota_cleanup(sdp); - - if (!log_write_allowed) - sdp->sd_vfs->s_flags |= SB_RDONLY; } /** @@ -619,9 +602,15 @@ restart: } spin_unlock(&sdp->sd_jindex_spin); - if (!sb_rdonly(sb)) { + if (!sb_rdonly(sb)) gfs2_make_fs_ro(sdp); + else { + if (gfs2_withdrawn(sdp)) + gfs2_destroy_threads(sdp); + + gfs2_quota_cleanup(sdp); } + WARN_ON(gfs2_withdrawing(sdp)); /* At this point, we're through modifying the disk */ @@ -689,7 +678,7 @@ static int gfs2_freeze_locally(struct gfs2_sbd *sdp) struct super_block *sb = sdp->sd_vfs; int error; - error = freeze_super(sb); + error = freeze_super(sb, FREEZE_HOLDER_USERSPACE); if (error) return error; @@ -697,7 +686,9 @@ static int gfs2_freeze_locally(struct gfs2_sbd *sdp) gfs2_log_flush(sdp, NULL, GFS2_LOG_HEAD_FLUSH_FREEZE | GFS2_LFC_FREEZE_GO_SYNC); if (gfs2_withdrawn(sdp)) { - thaw_super(sb); + error = thaw_super(sb, FREEZE_HOLDER_USERSPACE); + if (error) + return error; return -EIO; } } @@ -712,7 +703,7 @@ static int gfs2_do_thaw(struct gfs2_sbd *sdp) error = gfs2_freeze_lock_shared(sdp); if (error) goto fail; - error = thaw_super(sb); + error = thaw_super(sb, FREEZE_HOLDER_USERSPACE); if (!error) return 0; @@ -761,7 +752,7 @@ out: * */ -static int gfs2_freeze_super(struct super_block *sb) +static int gfs2_freeze_super(struct super_block *sb, enum freeze_holder who) { struct gfs2_sbd *sdp = sb->s_fs_info; int error; @@ -816,7 +807,7 @@ out: * */ -static int gfs2_thaw_super(struct super_block *sb) +static int gfs2_thaw_super(struct super_block *sb, enum freeze_holder who) { struct gfs2_sbd *sdp = sb->s_fs_info; int error; @@ -1017,6 +1008,7 @@ static int gfs2_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_files = sc.sc_dinodes + sc.sc_free; buf->f_ffree = sc.sc_free; buf->f_namelen = GFS2_FNAMESIZE; + buf->f_fsid = uuid_to_fsid(sb->s_uuid.b); return 0; } @@ -1132,6 +1124,9 @@ static int gfs2_show_options(struct seq_file *s, struct dentry *root) case GFS2_QUOTA_ON: state = "on"; break; + case GFS2_QUOTA_QUIET: + state = "quiet"; + break; default: state = "unknown"; break; @@ -1307,18 +1302,8 @@ static bool gfs2_upgrade_iopen_glock(struct inode *inode) * As a last resort, if another node keeps holding the iopen glock * without showing any activity on the inode glock, we will eventually * time out and fail the iopen glock upgrade. - * - * Note that we're passing the LM_FLAG_TRY_1CB flag to the first - * locking request as an optimization to notify lock holders as soon as - * possible. Without that flag, they'd be notified implicitly by the - * second locking request. */ - gfs2_holder_reinit(LM_ST_EXCLUSIVE, LM_FLAG_TRY_1CB | GL_NOCACHE, gh); - error = gfs2_glock_nq(gh); - if (error != GLR_TRYFAILED) - return !error; - gfs2_holder_reinit(LM_ST_EXCLUSIVE, GL_ASYNC | GL_NOCACHE, gh); error = gfs2_glock_nq(gh); if (error) @@ -1558,7 +1543,7 @@ out: wait_on_bit_io(&ip->i_flags, GIF_GLOP_PENDING, TASK_UNINTERRUPTIBLE); gfs2_glock_add_to_lru(ip->i_gl); gfs2_glock_put_eventually(ip->i_gl); - ip->i_gl = NULL; + rcu_assign_pointer(ip->i_gl, NULL); } } @@ -1584,7 +1569,7 @@ static void gfs2_free_inode(struct inode *inode) kmem_cache_free(gfs2_inode_cachep, GFS2_I(inode)); } -extern void free_local_statfs_inodes(struct gfs2_sbd *sdp) +void free_local_statfs_inodes(struct gfs2_sbd *sdp) { struct local_statfs_inode *lsi, *safe; @@ -1599,8 +1584,8 @@ extern void free_local_statfs_inodes(struct gfs2_sbd *sdp) } } -extern struct inode *find_local_statfs_inode(struct gfs2_sbd *sdp, - unsigned int index) +struct inode *find_local_statfs_inode(struct gfs2_sbd *sdp, + unsigned int index) { struct local_statfs_inode *lsi; diff --git a/fs/gfs2/super.h b/fs/gfs2/super.h index bba58629bc45..b27a774d9580 100644 --- a/fs/gfs2/super.h +++ b/fs/gfs2/super.h @@ -15,7 +15,7 @@ #define GFS2_FS_FORMAT_MIN (1801) #define GFS2_FS_FORMAT_MAX (1802) -extern void gfs2_lm_unmount(struct gfs2_sbd *sdp); +void gfs2_lm_unmount(struct gfs2_sbd *sdp); static inline unsigned int gfs2_jindex_size(struct gfs2_sbd *sdp) { @@ -26,32 +26,33 @@ static inline unsigned int gfs2_jindex_size(struct gfs2_sbd *sdp) return x; } -extern void gfs2_jindex_free(struct gfs2_sbd *sdp); +void gfs2_jindex_free(struct gfs2_sbd *sdp); -extern struct gfs2_jdesc *gfs2_jdesc_find(struct gfs2_sbd *sdp, unsigned int jid); -extern int gfs2_jdesc_check(struct gfs2_jdesc *jd); -extern int gfs2_lookup_in_master_dir(struct gfs2_sbd *sdp, char *filename, - struct gfs2_inode **ipp); +struct gfs2_jdesc *gfs2_jdesc_find(struct gfs2_sbd *sdp, unsigned int jid); +int gfs2_jdesc_check(struct gfs2_jdesc *jd); +int gfs2_lookup_in_master_dir(struct gfs2_sbd *sdp, char *filename, + struct gfs2_inode **ipp); -extern int gfs2_make_fs_rw(struct gfs2_sbd *sdp); -extern void gfs2_make_fs_ro(struct gfs2_sbd *sdp); -extern void gfs2_online_uevent(struct gfs2_sbd *sdp); -extern int gfs2_statfs_init(struct gfs2_sbd *sdp); -extern void gfs2_statfs_change(struct gfs2_sbd *sdp, s64 total, s64 free, - s64 dinodes); -extern void gfs2_statfs_change_in(struct gfs2_statfs_change_host *sc, - const void *buf); -extern void gfs2_statfs_change_out(const struct gfs2_statfs_change_host *sc, - void *buf); -extern void update_statfs(struct gfs2_sbd *sdp, struct buffer_head *m_bh); -extern int gfs2_statfs_sync(struct super_block *sb, int type); -extern void gfs2_freeze_func(struct work_struct *work); -extern void gfs2_thaw_freeze_initiator(struct super_block *sb); +int gfs2_make_fs_rw(struct gfs2_sbd *sdp); +void gfs2_make_fs_ro(struct gfs2_sbd *sdp); +void gfs2_online_uevent(struct gfs2_sbd *sdp); +void gfs2_destroy_threads(struct gfs2_sbd *sdp); +int gfs2_statfs_init(struct gfs2_sbd *sdp); +void gfs2_statfs_change(struct gfs2_sbd *sdp, s64 total, s64 free, + s64 dinodes); +void gfs2_statfs_change_in(struct gfs2_statfs_change_host *sc, + const void *buf); +void gfs2_statfs_change_out(const struct gfs2_statfs_change_host *sc, + void *buf); +void update_statfs(struct gfs2_sbd *sdp, struct buffer_head *m_bh); +int gfs2_statfs_sync(struct super_block *sb, int type); +void gfs2_freeze_func(struct work_struct *work); +void gfs2_thaw_freeze_initiator(struct super_block *sb); -extern void free_local_statfs_inodes(struct gfs2_sbd *sdp); -extern struct inode *find_local_statfs_inode(struct gfs2_sbd *sdp, - unsigned int index); -extern void free_sbd(struct gfs2_sbd *sdp); +void free_local_statfs_inodes(struct gfs2_sbd *sdp); +struct inode *find_local_statfs_inode(struct gfs2_sbd *sdp, + unsigned int index); +void free_sbd(struct gfs2_sbd *sdp); extern struct file_system_type gfs2_fs_type; extern struct file_system_type gfs2meta_fs_type; @@ -59,8 +60,8 @@ extern const struct export_operations gfs2_export_ops; extern const struct super_operations gfs2_super_ops; extern const struct dentry_operations gfs2_dops; -extern const struct xattr_handler *gfs2_xattr_handlers_max[]; -extern const struct xattr_handler **gfs2_xattr_handlers_min; +extern const struct xattr_handler * const gfs2_xattr_handlers_max[]; +extern const struct xattr_handler * const *gfs2_xattr_handlers_min; #endif /* __SUPER_DOT_H__ */ diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c index 2dfbe2f188dd..60a0206890c5 100644 --- a/fs/gfs2/sys.c +++ b/fs/gfs2/sys.c @@ -98,7 +98,10 @@ static ssize_t status_show(struct gfs2_sbd *sdp, char *buf) "sd_log_flush_head: %d\n" "sd_log_flush_tail: %d\n" "sd_log_blks_reserved: %d\n" - "sd_log_revokes_available: %d\n", + "sd_log_revokes_available: %d\n" + "sd_log_pinned: %d\n" + "sd_log_thresh1: %d\n" + "sd_log_thresh2: %d\n", test_bit(SDF_JOURNAL_CHECKED, &f), test_bit(SDF_JOURNAL_LIVE, &f), (sdp->sd_jdesc ? sdp->sd_jdesc->jd_jid : 0), @@ -118,7 +121,7 @@ static ssize_t status_show(struct gfs2_sbd *sdp, char *buf) test_bit(SDF_WITHDRAW_IN_PROG, &f), test_bit(SDF_REMOTE_WITHDRAW, &f), test_bit(SDF_WITHDRAW_RECOVERY, &f), - test_bit(SDF_DEACTIVATING, &f), + test_bit(SDF_KILL, &f), sdp->sd_log_error, rwsem_is_locked(&sdp->sd_log_flush_lock), sdp->sd_log_num_revoke, @@ -128,7 +131,10 @@ static ssize_t status_show(struct gfs2_sbd *sdp, char *buf) sdp->sd_log_flush_head, sdp->sd_log_flush_tail, sdp->sd_log_blks_reserved, - atomic_read(&sdp->sd_log_revokes_available)); + atomic_read(&sdp->sd_log_revokes_available), + atomic_read(&sdp->sd_log_pinned), + atomic_read(&sdp->sd_log_thresh1), + atomic_read(&sdp->sd_log_thresh2)); return s; } @@ -168,10 +174,10 @@ static ssize_t freeze_store(struct gfs2_sbd *sdp, const char *buf, size_t len) switch (n) { case 0: - error = thaw_super(sdp->sd_vfs); + error = thaw_super(sdp->sd_vfs, FREEZE_HOLDER_USERSPACE); break; case 1: - error = freeze_super(sdp->sd_vfs); + error = freeze_super(sdp->sd_vfs, FREEZE_HOLDER_USERSPACE); break; default: return -EINVAL; diff --git a/fs/gfs2/trans.h b/fs/gfs2/trans.h index c76ad9a4c75a..f8ce5302280d 100644 --- a/fs/gfs2/trans.h +++ b/fs/gfs2/trans.h @@ -34,17 +34,17 @@ static inline unsigned int gfs2_rg_blocks(const struct gfs2_inode *ip, unsigned return rgd->rd_length; } -extern int __gfs2_trans_begin(struct gfs2_trans *tr, struct gfs2_sbd *sdp, - unsigned int blocks, unsigned int revokes, - unsigned long ip); -extern int gfs2_trans_begin(struct gfs2_sbd *sdp, unsigned int blocks, - unsigned int revokes); - -extern void gfs2_trans_end(struct gfs2_sbd *sdp); -extern void gfs2_trans_add_data(struct gfs2_glock *gl, struct buffer_head *bh); -extern void gfs2_trans_add_meta(struct gfs2_glock *gl, struct buffer_head *bh); -extern void gfs2_trans_add_revoke(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd); -extern void gfs2_trans_remove_revoke(struct gfs2_sbd *sdp, u64 blkno, unsigned int len); -extern void gfs2_trans_free(struct gfs2_sbd *sdp, struct gfs2_trans *tr); +int __gfs2_trans_begin(struct gfs2_trans *tr, struct gfs2_sbd *sdp, + unsigned int blocks, unsigned int revokes, + unsigned long ip); +int gfs2_trans_begin(struct gfs2_sbd *sdp, unsigned int blocks, + unsigned int revokes); + +void gfs2_trans_end(struct gfs2_sbd *sdp); +void gfs2_trans_add_data(struct gfs2_glock *gl, struct buffer_head *bh); +void gfs2_trans_add_meta(struct gfs2_glock *gl, struct buffer_head *bh); +void gfs2_trans_add_revoke(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd); +void gfs2_trans_remove_revoke(struct gfs2_sbd *sdp, u64 blkno, unsigned int len); +void gfs2_trans_free(struct gfs2_sbd *sdp, struct gfs2_trans *tr); #endif /* __TRANS_DOT_H__ */ diff --git a/fs/gfs2/util.c b/fs/gfs2/util.c index dac22b1c1a2e..da29fafb6272 100644 --- a/fs/gfs2/util.c +++ b/fs/gfs2/util.c @@ -9,6 +9,7 @@ #include <linux/spinlock.h> #include <linux/completion.h> #include <linux/buffer_head.h> +#include <linux/kthread.h> #include <linux/crc32.h> #include <linux/gfs2_ondisk.h> #include <linux/delay.h> @@ -150,7 +151,14 @@ static void signal_our_withdraw(struct gfs2_sbd *sdp) if (!sb_rdonly(sdp->sd_vfs)) { bool locked = mutex_trylock(&sdp->sd_freeze_mutex); - gfs2_make_fs_ro(sdp); + wake_up(&sdp->sd_logd_waitq); + wake_up(&sdp->sd_quota_wait); + + wait_event_timeout(sdp->sd_log_waitq, + gfs2_log_is_empty(sdp), + HZ * 5); + + sdp->sd_vfs->s_flags |= SB_RDONLY; if (locked) mutex_unlock(&sdp->sd_freeze_mutex); @@ -315,19 +323,19 @@ int gfs2_withdraw(struct gfs2_sbd *sdp) struct lm_lockstruct *ls = &sdp->sd_lockstruct; const struct lm_lockops *lm = ls->ls_ops; - if (sdp->sd_args.ar_errors == GFS2_ERRORS_WITHDRAW && - test_and_set_bit(SDF_WITHDRAWN, &sdp->sd_flags)) { - if (!test_bit(SDF_WITHDRAW_IN_PROG, &sdp->sd_flags)) - return -1; - - wait_on_bit(&sdp->sd_flags, SDF_WITHDRAW_IN_PROG, - TASK_UNINTERRUPTIBLE); - return -1; - } - - set_bit(SDF_WITHDRAW_IN_PROG, &sdp->sd_flags); - if (sdp->sd_args.ar_errors == GFS2_ERRORS_WITHDRAW) { + unsigned long old = READ_ONCE(sdp->sd_flags), new; + + do { + if (old & BIT(SDF_WITHDRAWN)) { + wait_on_bit(&sdp->sd_flags, + SDF_WITHDRAW_IN_PROG, + TASK_UNINTERRUPTIBLE); + return -1; + } + new = old | BIT(SDF_WITHDRAWN) | BIT(SDF_WITHDRAW_IN_PROG); + } while (unlikely(!try_cmpxchg(&sdp->sd_flags, &old, new))); + fs_err(sdp, "about to withdraw this file system\n"); BUG_ON(sdp->sd_args.ar_debug); diff --git a/fs/gfs2/util.h b/fs/gfs2/util.h index cdb839529175..11c9d59b6889 100644 --- a/fs/gfs2/util.h +++ b/fs/gfs2/util.h @@ -147,10 +147,10 @@ static inline void gfs2_metatype_set(struct buffer_head *bh, u16 type, int gfs2_io_error_i(struct gfs2_sbd *sdp, const char *function, char *file, unsigned int line); -extern int check_journal_clean(struct gfs2_sbd *sdp, struct gfs2_jdesc *jd, - bool verbose); -extern int gfs2_freeze_lock_shared(struct gfs2_sbd *sdp); -extern void gfs2_freeze_unlock(struct gfs2_holder *freeze_gh); +int check_journal_clean(struct gfs2_sbd *sdp, struct gfs2_jdesc *jd, + bool verbose); +int gfs2_freeze_lock_shared(struct gfs2_sbd *sdp); +void gfs2_freeze_unlock(struct gfs2_holder *freeze_gh); #define gfs2_io_error(sdp) \ gfs2_io_error_i((sdp), __func__, __FILE__, __LINE__) diff --git a/fs/gfs2/xattr.c b/fs/gfs2/xattr.c index 93b36d026bb4..8c96ba6230d1 100644 --- a/fs/gfs2/xattr.c +++ b/fs/gfs2/xattr.c @@ -311,7 +311,7 @@ static int ea_dealloc_unstuffed(struct gfs2_inode *ip, struct buffer_head *bh, ea->ea_num_ptrs = 0; } - ip->i_inode.i_ctime = current_time(&ip->i_inode); + inode_set_ctime_current(&ip->i_inode); __mark_inode_dirty(&ip->i_inode, I_DIRTY_DATASYNC); gfs2_trans_end(sdp); @@ -639,7 +639,7 @@ static int ea_alloc_blk(struct gfs2_inode *ip, struct buffer_head **bhp) u64 block; int error; - error = gfs2_alloc_blocks(ip, &block, &n, 0, NULL); + error = gfs2_alloc_blocks(ip, &block, &n, 0); if (error) return error; gfs2_trans_remove_revoke(sdp, block, 1); @@ -701,7 +701,7 @@ static int ea_write(struct gfs2_inode *ip, struct gfs2_ea_header *ea, int mh_size = sizeof(struct gfs2_meta_header); unsigned int n = 1; - error = gfs2_alloc_blocks(ip, &block, &n, 0, NULL); + error = gfs2_alloc_blocks(ip, &block, &n, 0); if (error) return error; gfs2_trans_remove_revoke(sdp, block, 1); @@ -763,7 +763,7 @@ static int ea_alloc_skeleton(struct gfs2_inode *ip, struct gfs2_ea_request *er, if (error) goto out_end_trans; - ip->i_inode.i_ctime = current_time(&ip->i_inode); + inode_set_ctime_current(&ip->i_inode); __mark_inode_dirty(&ip->i_inode, I_DIRTY_DATASYNC); out_end_trans: @@ -888,7 +888,7 @@ static int ea_set_simple_noalloc(struct gfs2_inode *ip, struct buffer_head *bh, if (es->es_el) ea_set_remove_stuffed(ip, es->es_el); - ip->i_inode.i_ctime = current_time(&ip->i_inode); + inode_set_ctime_current(&ip->i_inode); __mark_inode_dirty(&ip->i_inode, I_DIRTY_DATASYNC); gfs2_trans_end(GFS2_SB(&ip->i_inode)); @@ -1002,7 +1002,7 @@ static int ea_set_block(struct gfs2_inode *ip, struct gfs2_ea_request *er, } else { u64 blk; unsigned int n = 1; - error = gfs2_alloc_blocks(ip, &blk, &n, 0, NULL); + error = gfs2_alloc_blocks(ip, &blk, &n, 0); if (error) return error; gfs2_trans_remove_revoke(sdp, blk, 1); @@ -1106,7 +1106,7 @@ static int ea_remove_stuffed(struct gfs2_inode *ip, struct gfs2_ea_location *el) ea->ea_type = GFS2_EATYPE_UNUSED; } - ip->i_inode.i_ctime = current_time(&ip->i_inode); + inode_set_ctime_current(&ip->i_inode); __mark_inode_dirty(&ip->i_inode, I_DIRTY_DATASYNC); gfs2_trans_end(GFS2_SB(&ip->i_inode)); @@ -1494,7 +1494,7 @@ static const struct xattr_handler gfs2_xattr_trusted_handler = { .set = gfs2_xattr_set, }; -const struct xattr_handler *gfs2_xattr_handlers_max[] = { +const struct xattr_handler * const gfs2_xattr_handlers_max[] = { /* GFS2_FS_FORMAT_MAX */ &gfs2_xattr_trusted_handler, @@ -1504,4 +1504,4 @@ const struct xattr_handler *gfs2_xattr_handlers_max[] = { NULL, }; -const struct xattr_handler **gfs2_xattr_handlers_min = gfs2_xattr_handlers_max + 1; +const struct xattr_handler * const *gfs2_xattr_handlers_min = gfs2_xattr_handlers_max + 1; diff --git a/fs/gfs2/xattr.h b/fs/gfs2/xattr.h index 2aed9d7d483d..eb12eb7e37c1 100644 --- a/fs/gfs2/xattr.h +++ b/fs/gfs2/xattr.h @@ -50,14 +50,14 @@ struct gfs2_ea_location { struct gfs2_ea_header *el_prev; }; -extern int __gfs2_xattr_set(struct inode *inode, const char *name, - const void *value, size_t size, - int flags, int type); -extern ssize_t gfs2_listxattr(struct dentry *dentry, char *buffer, size_t size); -extern int gfs2_ea_dealloc(struct gfs2_inode *ip); +int __gfs2_xattr_set(struct inode *inode, const char *name, + const void *value, size_t size, + int flags, int type); +ssize_t gfs2_listxattr(struct dentry *dentry, char *buffer, size_t size); +int gfs2_ea_dealloc(struct gfs2_inode *ip); /* Exported to acl.c */ -extern int gfs2_xattr_acl_get(struct gfs2_inode *ip, const char *name, char **data); +int gfs2_xattr_acl_get(struct gfs2_inode *ip, const char *name, char **data); #endif /* __EATTR_DOT_H__ */ diff --git a/fs/hfs/Kconfig b/fs/hfs/Kconfig index d985066006d5..5ea5cd8ecea9 100644 --- a/fs/hfs/Kconfig +++ b/fs/hfs/Kconfig @@ -2,6 +2,7 @@ config HFS_FS tristate "Apple Macintosh file system support" depends on BLOCK + select BUFFER_HEAD select NLS select LEGACY_DIRECT_IO help diff --git a/fs/hfs/attr.c b/fs/hfs/attr.c index 6341bb248247..f8395cdd1adf 100644 --- a/fs/hfs/attr.c +++ b/fs/hfs/attr.c @@ -146,7 +146,7 @@ static const struct xattr_handler hfs_type_handler = { .set = hfs_xattr_set, }; -const struct xattr_handler *hfs_xattr_handlers[] = { +const struct xattr_handler * const hfs_xattr_handlers[] = { &hfs_creator_handler, &hfs_type_handler, NULL diff --git a/fs/hfs/catalog.c b/fs/hfs/catalog.c index d365bf0b8c77..d63880e7d9d6 100644 --- a/fs/hfs/catalog.c +++ b/fs/hfs/catalog.c @@ -133,7 +133,7 @@ int hfs_cat_create(u32 cnid, struct inode *dir, const struct qstr *str, struct i goto err1; dir->i_size++; - dir->i_mtime = dir->i_ctime = current_time(dir); + inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir)); mark_inode_dirty(dir); hfs_find_exit(&fd); return 0; @@ -269,7 +269,7 @@ int hfs_cat_delete(u32 cnid, struct inode *dir, const struct qstr *str) } dir->i_size--; - dir->i_mtime = dir->i_ctime = current_time(dir); + inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir)); mark_inode_dirty(dir); res = 0; out: @@ -337,7 +337,7 @@ int hfs_cat_move(u32 cnid, struct inode *src_dir, const struct qstr *src_name, if (err) goto out; dst_dir->i_size++; - dst_dir->i_mtime = dst_dir->i_ctime = current_time(dst_dir); + inode_set_mtime_to_ts(dst_dir, inode_set_ctime_current(dst_dir)); mark_inode_dirty(dst_dir); /* finally remove the old entry */ @@ -349,7 +349,7 @@ int hfs_cat_move(u32 cnid, struct inode *src_dir, const struct qstr *src_name, if (err) goto out; src_dir->i_size--; - src_dir->i_mtime = src_dir->i_ctime = current_time(src_dir); + inode_set_mtime_to_ts(src_dir, inode_set_ctime_current(src_dir)); mark_inode_dirty(src_dir); type = entry.type; diff --git a/fs/hfs/dir.c b/fs/hfs/dir.c index 3e1e3dcf0b48..b75c26045df4 100644 --- a/fs/hfs/dir.c +++ b/fs/hfs/dir.c @@ -263,7 +263,7 @@ static int hfs_remove(struct inode *dir, struct dentry *dentry) if (res) return res; clear_nlink(inode); - inode->i_ctime = current_time(inode); + inode_set_ctime_current(inode); hfs_delete_inode(inode); mark_inode_dirty(inode); return 0; diff --git a/fs/hfs/hfs_fs.h b/fs/hfs/hfs_fs.h index 49d02524e667..b5a6ad5df357 100644 --- a/fs/hfs/hfs_fs.h +++ b/fs/hfs/hfs_fs.h @@ -215,7 +215,7 @@ extern void hfs_evict_inode(struct inode *); extern void hfs_delete_inode(struct inode *); /* attr.c */ -extern const struct xattr_handler *hfs_xattr_handlers[]; +extern const struct xattr_handler * const hfs_xattr_handlers[]; /* mdb.c */ extern int hfs_mdb_get(struct super_block *); diff --git a/fs/hfs/inode.c b/fs/hfs/inode.c index 441d7fc952e3..a7bc4690a780 100644 --- a/fs/hfs/inode.c +++ b/fs/hfs/inode.c @@ -200,7 +200,7 @@ struct inode *hfs_new_inode(struct inode *dir, const struct qstr *name, umode_t inode->i_uid = current_fsuid(); inode->i_gid = current_fsgid(); set_nlink(inode, 1); - inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode); + simple_inode_init_ts(inode); HFS_I(inode)->flags = 0; HFS_I(inode)->rsrc_inode = NULL; HFS_I(inode)->fs_blocks = 0; @@ -355,8 +355,8 @@ static int hfs_read_inode(struct inode *inode, void *data) inode->i_mode |= S_IWUGO; inode->i_mode &= ~hsb->s_file_umask; inode->i_mode |= S_IFREG; - inode->i_ctime = inode->i_atime = inode->i_mtime = - hfs_m_to_utime(rec->file.MdDat); + inode_set_mtime_to_ts(inode, + inode_set_atime_to_ts(inode, inode_set_ctime_to_ts(inode, hfs_m_to_utime(rec->file.MdDat)))); inode->i_op = &hfs_file_inode_operations; inode->i_fop = &hfs_file_operations; inode->i_mapping->a_ops = &hfs_aops; @@ -366,8 +366,8 @@ static int hfs_read_inode(struct inode *inode, void *data) inode->i_size = be16_to_cpu(rec->dir.Val) + 2; HFS_I(inode)->fs_blocks = 0; inode->i_mode = S_IFDIR | (S_IRWXUGO & ~hsb->s_dir_umask); - inode->i_ctime = inode->i_atime = inode->i_mtime = - hfs_m_to_utime(rec->dir.MdDat); + inode_set_mtime_to_ts(inode, + inode_set_atime_to_ts(inode, inode_set_ctime_to_ts(inode, hfs_m_to_utime(rec->dir.MdDat)))); inode->i_op = &hfs_dir_inode_operations; inode->i_fop = &hfs_dir_operations; break; @@ -474,7 +474,7 @@ int hfs_write_inode(struct inode *inode, struct writeback_control *wbc) be32_to_cpu(rec.dir.DirID) != inode->i_ino) { } - rec.dir.MdDat = hfs_u_to_mtime(inode->i_mtime); + rec.dir.MdDat = hfs_u_to_mtime(inode_get_mtime(inode)); rec.dir.Val = cpu_to_be16(inode->i_size - 2); hfs_bnode_write(fd.bnode, &rec, fd.entryoffset, @@ -502,7 +502,7 @@ int hfs_write_inode(struct inode *inode, struct writeback_control *wbc) else rec.file.Flags |= HFS_FIL_LOCK; hfs_inode_write_fork(inode, rec.file.ExtRec, &rec.file.LgLen, &rec.file.PyLen); - rec.file.MdDat = hfs_u_to_mtime(inode->i_mtime); + rec.file.MdDat = hfs_u_to_mtime(inode_get_mtime(inode)); hfs_bnode_write(fd.bnode, &rec, fd.entryoffset, sizeof(struct hfs_cat_file)); @@ -654,8 +654,7 @@ int hfs_inode_setattr(struct mnt_idmap *idmap, struct dentry *dentry, truncate_setsize(inode, attr->ia_size); hfs_file_truncate(inode); - inode->i_atime = inode->i_mtime = inode->i_ctime = - current_time(inode); + simple_inode_init_ts(inode); } setattr_copy(&nop_mnt_idmap, inode, attr); diff --git a/fs/hfs/sysdep.c b/fs/hfs/sysdep.c index 2875961fdc10..76fa02e3835b 100644 --- a/fs/hfs/sysdep.c +++ b/fs/hfs/sysdep.c @@ -28,9 +28,13 @@ static int hfs_revalidate_dentry(struct dentry *dentry, unsigned int flags) /* fix up inode on a timezone change */ diff = sys_tz.tz_minuteswest * 60 - HFS_I(inode)->tz_secondswest; if (diff) { - inode->i_ctime.tv_sec += diff; - inode->i_atime.tv_sec += diff; - inode->i_mtime.tv_sec += diff; + struct timespec64 ts = inode_get_ctime(inode); + + inode_set_ctime(inode, ts.tv_sec + diff, ts.tv_nsec); + ts = inode_get_atime(inode); + inode_set_atime(inode, ts.tv_sec + diff, ts.tv_nsec); + ts = inode_get_mtime(inode); + inode_set_mtime(inode, ts.tv_sec + diff, ts.tv_nsec); HFS_I(inode)->tz_secondswest += diff; } return 1; diff --git a/fs/hfsplus/Kconfig b/fs/hfsplus/Kconfig index 8034e7827a69..8ce4a33a9ac7 100644 --- a/fs/hfsplus/Kconfig +++ b/fs/hfsplus/Kconfig @@ -2,6 +2,7 @@ config HFSPLUS_FS tristate "Apple Extended HFS file system support" depends on BLOCK + select BUFFER_HEAD select NLS select NLS_UTF8 select LEGACY_DIRECT_IO diff --git a/fs/hfsplus/catalog.c b/fs/hfsplus/catalog.c index 35472cba750e..1995bafee839 100644 --- a/fs/hfsplus/catalog.c +++ b/fs/hfsplus/catalog.c @@ -312,7 +312,7 @@ int hfsplus_create_cat(u32 cnid, struct inode *dir, dir->i_size++; if (S_ISDIR(inode->i_mode)) hfsplus_subfolders_inc(dir); - dir->i_mtime = dir->i_ctime = current_time(dir); + inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir)); hfsplus_mark_inode_dirty(dir, HFSPLUS_I_CAT_DIRTY); hfs_find_exit(&fd); @@ -417,7 +417,7 @@ int hfsplus_delete_cat(u32 cnid, struct inode *dir, const struct qstr *str) dir->i_size--; if (type == HFSPLUS_FOLDER) hfsplus_subfolders_dec(dir); - dir->i_mtime = dir->i_ctime = current_time(dir); + inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir)); hfsplus_mark_inode_dirty(dir, HFSPLUS_I_CAT_DIRTY); if (type == HFSPLUS_FILE || type == HFSPLUS_FOLDER) { @@ -494,7 +494,7 @@ int hfsplus_rename_cat(u32 cnid, dst_dir->i_size++; if (type == HFSPLUS_FOLDER) hfsplus_subfolders_inc(dst_dir); - dst_dir->i_mtime = dst_dir->i_ctime = current_time(dst_dir); + inode_set_mtime_to_ts(dst_dir, inode_set_ctime_current(dst_dir)); /* finally remove the old entry */ err = hfsplus_cat_build_key(sb, src_fd.search_key, @@ -511,7 +511,7 @@ int hfsplus_rename_cat(u32 cnid, src_dir->i_size--; if (type == HFSPLUS_FOLDER) hfsplus_subfolders_dec(src_dir); - src_dir->i_mtime = src_dir->i_ctime = current_time(src_dir); + inode_set_mtime_to_ts(src_dir, inode_set_ctime_current(src_dir)); /* remove old thread entry */ hfsplus_cat_build_key_with_cnid(sb, src_fd.search_key, cnid); diff --git a/fs/hfsplus/dir.c b/fs/hfsplus/dir.c index 56fb5f1312e7..f5c4b3e31a1c 100644 --- a/fs/hfsplus/dir.c +++ b/fs/hfsplus/dir.c @@ -346,7 +346,7 @@ static int hfsplus_link(struct dentry *src_dentry, struct inode *dst_dir, inc_nlink(inode); hfsplus_instantiate(dst_dentry, inode, cnid); ihold(inode); - inode->i_ctime = current_time(inode); + inode_set_ctime_current(inode); mark_inode_dirty(inode); sbi->file_count++; hfsplus_mark_mdb_dirty(dst_dir->i_sb); @@ -405,7 +405,7 @@ static int hfsplus_unlink(struct inode *dir, struct dentry *dentry) hfsplus_delete_inode(inode); } else sbi->file_count--; - inode->i_ctime = current_time(inode); + inode_set_ctime_current(inode); mark_inode_dirty(inode); out: mutex_unlock(&sbi->vh_mutex); @@ -426,7 +426,7 @@ static int hfsplus_rmdir(struct inode *dir, struct dentry *dentry) if (res) goto out; clear_nlink(inode); - inode->i_ctime = current_time(inode); + inode_set_ctime_current(inode); hfsplus_delete_inode(inode); mark_inode_dirty(inode); out: diff --git a/fs/hfsplus/extents.c b/fs/hfsplus/extents.c index 7a542f3dbe50..3c572e44f2ad 100644 --- a/fs/hfsplus/extents.c +++ b/fs/hfsplus/extents.c @@ -448,9 +448,9 @@ int hfsplus_file_extend(struct inode *inode, bool zeroout) if (sbi->alloc_file->i_size * 8 < sbi->total_blocks - sbi->free_blocks + 8) { /* extend alloc file */ - pr_err("extend alloc file! (%llu,%u,%u)\n", - sbi->alloc_file->i_size * 8, - sbi->total_blocks, sbi->free_blocks); + pr_err_ratelimited("extend alloc file! (%llu,%u,%u)\n", + sbi->alloc_file->i_size * 8, + sbi->total_blocks, sbi->free_blocks); return -ENOSPC; } diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c index 7d1a675e037d..702a0663b1d8 100644 --- a/fs/hfsplus/inode.c +++ b/fs/hfsplus/inode.c @@ -267,7 +267,7 @@ static int hfsplus_setattr(struct mnt_idmap *idmap, } truncate_setsize(inode, attr->ia_size); hfsplus_file_truncate(inode); - inode->i_mtime = inode->i_ctime = current_time(inode); + inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); } setattr_copy(&nop_mnt_idmap, inode, attr); @@ -298,7 +298,7 @@ int hfsplus_getattr(struct mnt_idmap *idmap, const struct path *path, stat->attributes_mask |= STATX_ATTR_APPEND | STATX_ATTR_IMMUTABLE | STATX_ATTR_NODUMP; - generic_fillattr(&nop_mnt_idmap, inode, stat); + generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat); return 0; } @@ -392,7 +392,7 @@ struct inode *hfsplus_new_inode(struct super_block *sb, struct inode *dir, inode->i_ino = sbi->next_cnid++; inode_init_owner(&nop_mnt_idmap, inode, dir, mode); set_nlink(inode, 1); - inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode); + simple_inode_init_ts(inode); hip = HFSPLUS_I(inode); INIT_LIST_HEAD(&hip->open_dir_list); @@ -521,9 +521,11 @@ int hfsplus_cat_read_inode(struct inode *inode, struct hfs_find_data *fd) hfsplus_get_perms(inode, &folder->permissions, 1); set_nlink(inode, 1); inode->i_size = 2 + be32_to_cpu(folder->valence); - inode->i_atime = hfsp_mt2ut(folder->access_date); - inode->i_mtime = hfsp_mt2ut(folder->content_mod_date); - inode->i_ctime = hfsp_mt2ut(folder->attribute_mod_date); + inode_set_atime_to_ts(inode, hfsp_mt2ut(folder->access_date)); + inode_set_mtime_to_ts(inode, + hfsp_mt2ut(folder->content_mod_date)); + inode_set_ctime_to_ts(inode, + hfsp_mt2ut(folder->attribute_mod_date)); HFSPLUS_I(inode)->create_date = folder->create_date; HFSPLUS_I(inode)->fs_blocks = 0; if (folder->flags & cpu_to_be16(HFSPLUS_HAS_FOLDER_COUNT)) { @@ -562,9 +564,11 @@ int hfsplus_cat_read_inode(struct inode *inode, struct hfs_find_data *fd) init_special_inode(inode, inode->i_mode, be32_to_cpu(file->permissions.dev)); } - inode->i_atime = hfsp_mt2ut(file->access_date); - inode->i_mtime = hfsp_mt2ut(file->content_mod_date); - inode->i_ctime = hfsp_mt2ut(file->attribute_mod_date); + inode_set_atime_to_ts(inode, hfsp_mt2ut(file->access_date)); + inode_set_mtime_to_ts(inode, + hfsp_mt2ut(file->content_mod_date)); + inode_set_ctime_to_ts(inode, + hfsp_mt2ut(file->attribute_mod_date)); HFSPLUS_I(inode)->create_date = file->create_date; } else { pr_err("bad catalog entry used to create inode\n"); @@ -607,9 +611,9 @@ int hfsplus_cat_write_inode(struct inode *inode) sizeof(struct hfsplus_cat_folder)); /* simple node checks? */ hfsplus_cat_set_perms(inode, &folder->permissions); - folder->access_date = hfsp_ut2mt(inode->i_atime); - folder->content_mod_date = hfsp_ut2mt(inode->i_mtime); - folder->attribute_mod_date = hfsp_ut2mt(inode->i_ctime); + folder->access_date = hfsp_ut2mt(inode_get_atime(inode)); + folder->content_mod_date = hfsp_ut2mt(inode_get_mtime(inode)); + folder->attribute_mod_date = hfsp_ut2mt(inode_get_ctime(inode)); folder->valence = cpu_to_be32(inode->i_size - 2); if (folder->flags & cpu_to_be16(HFSPLUS_HAS_FOLDER_COUNT)) { folder->subfolders = @@ -642,9 +646,9 @@ int hfsplus_cat_write_inode(struct inode *inode) file->flags |= cpu_to_be16(HFSPLUS_FILE_LOCKED); else file->flags &= cpu_to_be16(~HFSPLUS_FILE_LOCKED); - file->access_date = hfsp_ut2mt(inode->i_atime); - file->content_mod_date = hfsp_ut2mt(inode->i_mtime); - file->attribute_mod_date = hfsp_ut2mt(inode->i_ctime); + file->access_date = hfsp_ut2mt(inode_get_atime(inode)); + file->content_mod_date = hfsp_ut2mt(inode_get_mtime(inode)); + file->attribute_mod_date = hfsp_ut2mt(inode_get_ctime(inode)); hfs_bnode_write(fd.bnode, &entry, fd.entryoffset, sizeof(struct hfsplus_cat_file)); } @@ -700,7 +704,7 @@ int hfsplus_fileattr_set(struct mnt_idmap *idmap, else hip->userflags &= ~HFSPLUS_FLG_NODUMP; - inode->i_ctime = current_time(inode); + inode_set_ctime_current(inode); mark_inode_dirty(inode); return 0; diff --git a/fs/hfsplus/xattr.c b/fs/hfsplus/xattr.c index 58021e73c00b..9c9ff6b8c6f7 100644 --- a/fs/hfsplus/xattr.c +++ b/fs/hfsplus/xattr.c @@ -13,7 +13,7 @@ static int hfsplus_removexattr(struct inode *inode, const char *name); -const struct xattr_handler *hfsplus_xattr_handlers[] = { +const struct xattr_handler * const hfsplus_xattr_handlers[] = { &hfsplus_xattr_osx_handler, &hfsplus_xattr_user_handler, &hfsplus_xattr_trusted_handler, diff --git a/fs/hfsplus/xattr.h b/fs/hfsplus/xattr.h index d14e362b3eba..15cc55e41410 100644 --- a/fs/hfsplus/xattr.h +++ b/fs/hfsplus/xattr.h @@ -17,7 +17,7 @@ extern const struct xattr_handler hfsplus_xattr_user_handler; extern const struct xattr_handler hfsplus_xattr_trusted_handler; extern const struct xattr_handler hfsplus_xattr_security_handler; -extern const struct xattr_handler *hfsplus_xattr_handlers[]; +extern const struct xattr_handler * const hfsplus_xattr_handlers[]; int __hfsplus_setxattr(struct inode *inode, const char *name, const void *value, size_t size, int flags); diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c index 46387090eb76..ea87f24c6c3f 100644 --- a/fs/hostfs/hostfs_kern.c +++ b/fs/hostfs/hostfs_kern.c @@ -513,12 +513,15 @@ static int hostfs_inode_update(struct inode *ino, const struct hostfs_stat *st) set_nlink(ino, st->nlink); i_uid_write(ino, st->uid); i_gid_write(ino, st->gid); - ino->i_atime = - (struct timespec64){ st->atime.tv_sec, st->atime.tv_nsec }; - ino->i_mtime = - (struct timespec64){ st->mtime.tv_sec, st->mtime.tv_nsec }; - ino->i_ctime = - (struct timespec64){ st->ctime.tv_sec, st->ctime.tv_nsec }; + inode_set_atime_to_ts(ino, (struct timespec64){ + st->atime.tv_sec, + st->atime.tv_nsec, + }); + inode_set_mtime_to_ts(ino, (struct timespec64){ + st->mtime.tv_sec, + st->mtime.tv_nsec, + }); + inode_set_ctime(ino, st->ctime.tv_sec, st->ctime.tv_nsec); ino->i_size = st->size; ino->i_blocks = st->blocks; return 0; diff --git a/fs/hpfs/Kconfig b/fs/hpfs/Kconfig index ec975f466877..ac1e9318e65a 100644 --- a/fs/hpfs/Kconfig +++ b/fs/hpfs/Kconfig @@ -2,6 +2,7 @@ config HPFS_FS tristate "OS/2 HPFS file system support" depends on BLOCK + select BUFFER_HEAD select FS_IOMAP help OS/2 is IBM's operating system for PC's, the same as Warp, and HPFS diff --git a/fs/hpfs/dir.c b/fs/hpfs/dir.c index f32f15669996..49dd585c2b17 100644 --- a/fs/hpfs/dir.c +++ b/fs/hpfs/dir.c @@ -277,14 +277,16 @@ struct dentry *hpfs_lookup(struct inode *dir, struct dentry *dentry, unsigned in * inode. */ - if (!result->i_ctime.tv_sec) { - if (!(result->i_ctime.tv_sec = local_to_gmt(dir->i_sb, le32_to_cpu(de->creation_date)))) - result->i_ctime.tv_sec = 1; - result->i_ctime.tv_nsec = 0; - result->i_mtime.tv_sec = local_to_gmt(dir->i_sb, le32_to_cpu(de->write_date)); - result->i_mtime.tv_nsec = 0; - result->i_atime.tv_sec = local_to_gmt(dir->i_sb, le32_to_cpu(de->read_date)); - result->i_atime.tv_nsec = 0; + if (!inode_get_ctime_sec(result)) { + time64_t csec = local_to_gmt(dir->i_sb, le32_to_cpu(de->creation_date)); + + inode_set_ctime(result, csec ? csec : 1, 0); + inode_set_mtime(result, + local_to_gmt(dir->i_sb, le32_to_cpu(de->write_date)), + 0); + inode_set_atime(result, + local_to_gmt(dir->i_sb, le32_to_cpu(de->read_date)), + 0); hpfs_result->i_ea_size = le32_to_cpu(de->ea_size); if (!hpfs_result->i_ea_mode && de->read_only) result->i_mode &= ~0222; diff --git a/fs/hpfs/inode.c b/fs/hpfs/inode.c index e50e92a42432..a59e8fa630db 100644 --- a/fs/hpfs/inode.c +++ b/fs/hpfs/inode.c @@ -36,9 +36,9 @@ void hpfs_init_inode(struct inode *i) hpfs_inode->i_rddir_off = NULL; hpfs_inode->i_dirty = 0; - i->i_ctime.tv_sec = i->i_ctime.tv_nsec = 0; - i->i_mtime.tv_sec = i->i_mtime.tv_nsec = 0; - i->i_atime.tv_sec = i->i_atime.tv_nsec = 0; + inode_set_ctime(i, 0, 0); + inode_set_mtime(i, 0, 0); + inode_set_atime(i, 0, 0); } void hpfs_read_inode(struct inode *i) @@ -230,9 +230,9 @@ void hpfs_write_inode_nolock(struct inode *i) } hpfs_write_inode_ea(i, fnode); if (de) { - de->write_date = cpu_to_le32(gmt_to_local(i->i_sb, i->i_mtime.tv_sec)); - de->read_date = cpu_to_le32(gmt_to_local(i->i_sb, i->i_atime.tv_sec)); - de->creation_date = cpu_to_le32(gmt_to_local(i->i_sb, i->i_ctime.tv_sec)); + de->write_date = cpu_to_le32(gmt_to_local(i->i_sb, inode_get_mtime_sec(i))); + de->read_date = cpu_to_le32(gmt_to_local(i->i_sb, inode_get_atime_sec(i))); + de->creation_date = cpu_to_le32(gmt_to_local(i->i_sb, inode_get_ctime_sec(i))); de->read_only = !(i->i_mode & 0222); de->ea_size = cpu_to_le32(hpfs_inode->i_ea_size); hpfs_mark_4buffers_dirty(&qbh); @@ -240,9 +240,9 @@ void hpfs_write_inode_nolock(struct inode *i) } if (S_ISDIR(i->i_mode)) { if ((de = map_dirent(i, hpfs_inode->i_dno, "\001\001", 2, NULL, &qbh))) { - de->write_date = cpu_to_le32(gmt_to_local(i->i_sb, i->i_mtime.tv_sec)); - de->read_date = cpu_to_le32(gmt_to_local(i->i_sb, i->i_atime.tv_sec)); - de->creation_date = cpu_to_le32(gmt_to_local(i->i_sb, i->i_ctime.tv_sec)); + de->write_date = cpu_to_le32(gmt_to_local(i->i_sb, inode_get_mtime_sec(i))); + de->read_date = cpu_to_le32(gmt_to_local(i->i_sb, inode_get_atime_sec(i))); + de->creation_date = cpu_to_le32(gmt_to_local(i->i_sb, inode_get_ctime_sec(i))); de->read_only = !(i->i_mode & 0222); de->ea_size = cpu_to_le32(/*hpfs_inode->i_ea_size*/0); de->file_size = cpu_to_le32(0); diff --git a/fs/hpfs/namei.c b/fs/hpfs/namei.c index 69fb40b2c99a..9184b4584b01 100644 --- a/fs/hpfs/namei.c +++ b/fs/hpfs/namei.c @@ -12,11 +12,10 @@ static void hpfs_update_directory_times(struct inode *dir) { time64_t t = local_to_gmt(dir->i_sb, local_get_seconds(dir->i_sb)); - if (t == dir->i_mtime.tv_sec && - t == dir->i_ctime.tv_sec) + if (t == inode_get_mtime_sec(dir) && + t == inode_get_ctime_sec(dir)) return; - dir->i_mtime.tv_sec = dir->i_ctime.tv_sec = t; - dir->i_mtime.tv_nsec = dir->i_ctime.tv_nsec = 0; + inode_set_mtime_to_ts(dir, inode_set_ctime(dir, t, 0)); hpfs_write_inode_nolock(dir); } @@ -59,10 +58,8 @@ static int hpfs_mkdir(struct mnt_idmap *idmap, struct inode *dir, result->i_ino = fno; hpfs_i(result)->i_parent_dir = dir->i_ino; hpfs_i(result)->i_dno = dno; - result->i_ctime.tv_sec = result->i_mtime.tv_sec = result->i_atime.tv_sec = local_to_gmt(dir->i_sb, le32_to_cpu(dee.creation_date)); - result->i_ctime.tv_nsec = 0; - result->i_mtime.tv_nsec = 0; - result->i_atime.tv_nsec = 0; + inode_set_mtime_to_ts(result, + inode_set_atime_to_ts(result, inode_set_ctime(result, local_to_gmt(dir->i_sb, le32_to_cpu(dee.creation_date)), 0))); hpfs_i(result)->i_ea_size = 0; result->i_mode |= S_IFDIR; result->i_op = &hpfs_dir_iops; @@ -167,10 +164,8 @@ static int hpfs_create(struct mnt_idmap *idmap, struct inode *dir, result->i_fop = &hpfs_file_ops; set_nlink(result, 1); hpfs_i(result)->i_parent_dir = dir->i_ino; - result->i_ctime.tv_sec = result->i_mtime.tv_sec = result->i_atime.tv_sec = local_to_gmt(dir->i_sb, le32_to_cpu(dee.creation_date)); - result->i_ctime.tv_nsec = 0; - result->i_mtime.tv_nsec = 0; - result->i_atime.tv_nsec = 0; + inode_set_mtime_to_ts(result, + inode_set_atime_to_ts(result, inode_set_ctime(result, local_to_gmt(dir->i_sb, le32_to_cpu(dee.creation_date)), 0))); hpfs_i(result)->i_ea_size = 0; if (dee.read_only) result->i_mode &= ~0222; @@ -250,10 +245,8 @@ static int hpfs_mknod(struct mnt_idmap *idmap, struct inode *dir, hpfs_init_inode(result); result->i_ino = fno; hpfs_i(result)->i_parent_dir = dir->i_ino; - result->i_ctime.tv_sec = result->i_mtime.tv_sec = result->i_atime.tv_sec = local_to_gmt(dir->i_sb, le32_to_cpu(dee.creation_date)); - result->i_ctime.tv_nsec = 0; - result->i_mtime.tv_nsec = 0; - result->i_atime.tv_nsec = 0; + inode_set_mtime_to_ts(result, + inode_set_atime_to_ts(result, inode_set_ctime(result, local_to_gmt(dir->i_sb, le32_to_cpu(dee.creation_date)), 0))); hpfs_i(result)->i_ea_size = 0; result->i_uid = current_fsuid(); result->i_gid = current_fsgid(); @@ -326,10 +319,8 @@ static int hpfs_symlink(struct mnt_idmap *idmap, struct inode *dir, result->i_ino = fno; hpfs_init_inode(result); hpfs_i(result)->i_parent_dir = dir->i_ino; - result->i_ctime.tv_sec = result->i_mtime.tv_sec = result->i_atime.tv_sec = local_to_gmt(dir->i_sb, le32_to_cpu(dee.creation_date)); - result->i_ctime.tv_nsec = 0; - result->i_mtime.tv_nsec = 0; - result->i_atime.tv_nsec = 0; + inode_set_mtime_to_ts(result, + inode_set_atime_to_ts(result, inode_set_ctime(result, local_to_gmt(dir->i_sb, le32_to_cpu(dee.creation_date)), 0))); hpfs_i(result)->i_ea_size = 0; result->i_mode = S_IFLNK | 0777; result->i_uid = current_fsuid(); diff --git a/fs/hpfs/super.c b/fs/hpfs/super.c index 1cb89595b875..6b0ba3c1efba 100644 --- a/fs/hpfs/super.c +++ b/fs/hpfs/super.c @@ -725,12 +725,15 @@ static int hpfs_fill_super(struct super_block *s, void *options, int silent) if (!de) hpfs_error(s, "unable to find root dir"); else { - root->i_atime.tv_sec = local_to_gmt(s, le32_to_cpu(de->read_date)); - root->i_atime.tv_nsec = 0; - root->i_mtime.tv_sec = local_to_gmt(s, le32_to_cpu(de->write_date)); - root->i_mtime.tv_nsec = 0; - root->i_ctime.tv_sec = local_to_gmt(s, le32_to_cpu(de->creation_date)); - root->i_ctime.tv_nsec = 0; + inode_set_atime(root, + local_to_gmt(s, le32_to_cpu(de->read_date)), + 0); + inode_set_mtime(root, + local_to_gmt(s, le32_to_cpu(de->write_date)), + 0); + inode_set_ctime(root, + local_to_gmt(s, le32_to_cpu(de->creation_date)), + 0); hpfs_i(root)->i_ea_size = le32_to_cpu(de->ea_size); hpfs_i(root)->i_parent_dir = root->i_ino; if (root->i_size == -1) diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 7b17ccfa039d..f757d4f7ad98 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -83,29 +83,6 @@ static const struct fs_parameter_spec hugetlb_fs_parameters[] = { {} }; -#ifdef CONFIG_NUMA -static inline void hugetlb_set_vma_policy(struct vm_area_struct *vma, - struct inode *inode, pgoff_t index) -{ - vma->vm_policy = mpol_shared_policy_lookup(&HUGETLBFS_I(inode)->policy, - index); -} - -static inline void hugetlb_drop_vma_policy(struct vm_area_struct *vma) -{ - mpol_cond_put(vma->vm_policy); -} -#else -static inline void hugetlb_set_vma_policy(struct vm_area_struct *vma, - struct inode *inode, pgoff_t index) -{ -} - -static inline void hugetlb_drop_vma_policy(struct vm_area_struct *vma) -{ -} -#endif - /* * Mask used when checking the page offset value passed in via system * calls. This value will be converted to a loff_t which is signed. @@ -135,7 +112,7 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma) vm_flags_set(vma, VM_HUGETLB | VM_DONTEXPAND); vma->vm_ops = &hugetlb_vm_ops; - ret = seal_check_future_write(info->seals, vma); + ret = seal_check_write(info->seals, vma); if (ret) return ret; @@ -283,6 +260,41 @@ hugetlb_get_unmapped_area(struct file *file, unsigned long addr, #endif /* + * Someone wants to read @bytes from a HWPOISON hugetlb @page from @offset. + * Returns the maximum number of bytes one can read without touching the 1st raw + * HWPOISON subpage. + * + * The implementation borrows the iteration logic from copy_page_to_iter*. + */ +static size_t adjust_range_hwpoison(struct page *page, size_t offset, size_t bytes) +{ + size_t n = 0; + size_t res = 0; + + /* First subpage to start the loop. */ + page = nth_page(page, offset / PAGE_SIZE); + offset %= PAGE_SIZE; + while (1) { + if (is_raw_hwpoison_page_in_hugepage(page)) + break; + + /* Safe to read n bytes without touching HWPOISON subpage. */ + n = min(bytes, (size_t)PAGE_SIZE - offset); + res += n; + bytes -= n; + if (!bytes || !n) + break; + offset += n; + if (offset == PAGE_SIZE) { + page = nth_page(page, 1); + offset = 0; + } + } + + return res; +} + +/* * Support for read() - Find the page attached to f_mapping and copy out the * data. This provides functionality similar to filemap_read(). */ @@ -299,8 +311,8 @@ static ssize_t hugetlbfs_read_iter(struct kiocb *iocb, struct iov_iter *to) ssize_t retval = 0; while (iov_iter_count(to)) { - struct page *page; - size_t nr, copied; + struct folio *folio; + size_t nr, copied, want; /* nr is the maximum number of bytes to copy from this page */ nr = huge_page_size(h); @@ -317,28 +329,38 @@ static ssize_t hugetlbfs_read_iter(struct kiocb *iocb, struct iov_iter *to) } nr = nr - offset; - /* Find the page */ - page = find_lock_page(mapping, index); - if (unlikely(page == NULL)) { + /* Find the folio */ + folio = filemap_lock_hugetlb_folio(h, mapping, index); + if (IS_ERR(folio)) { /* * We have a HOLE, zero out the user-buffer for the * length of the hole or request. */ copied = iov_iter_zero(nr, to); } else { - unlock_page(page); - - if (PageHWPoison(page)) { - put_page(page); - retval = -EIO; - break; + folio_unlock(folio); + + if (!folio_test_has_hwpoisoned(folio)) + want = nr; + else { + /* + * Adjust how many bytes safe to read without + * touching the 1st raw HWPOISON subpage after + * offset. + */ + want = adjust_range_hwpoison(&folio->page, offset, nr); + if (want == 0) { + folio_put(folio); + retval = -EIO; + break; + } } /* - * We have the page, copy it to user space buffer. + * We have the folio, copy it to user space buffer. */ - copied = copy_page_to_iter(page, offset, nr, to); - put_page(page); + copied = copy_folio_to_iter(folio, offset, want, to); + folio_put(folio); } offset += copied; retval += copied; @@ -616,21 +638,20 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart, { struct hstate *h = hstate_inode(inode); struct address_space *mapping = &inode->i_data; - const pgoff_t start = lstart >> huge_page_shift(h); - const pgoff_t end = lend >> huge_page_shift(h); + const pgoff_t end = lend >> PAGE_SHIFT; struct folio_batch fbatch; pgoff_t next, index; int i, freed = 0; bool truncate_op = (lend == LLONG_MAX); folio_batch_init(&fbatch); - next = start; + next = lstart >> PAGE_SHIFT; while (filemap_get_folios(mapping, &next, end - 1, &fbatch)) { for (i = 0; i < folio_batch_count(&fbatch); ++i) { struct folio *folio = fbatch.folios[i]; u32 hash = 0; - index = folio->index; + index = folio->index >> huge_page_order(h); hash = hugetlb_fault_mutex_hash(mapping, index); mutex_lock(&hugetlb_fault_mutex_table[hash]); @@ -648,7 +669,9 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart, } if (truncate_op) - (void)hugetlb_unreserve_pages(inode, start, LONG_MAX, freed); + (void)hugetlb_unreserve_pages(inode, + lstart >> huge_page_shift(h), + LONG_MAX, freed); } static void hugetlbfs_evict_inode(struct inode *inode) @@ -696,7 +719,7 @@ static void hugetlbfs_zero_partial_page(struct hstate *h, pgoff_t idx = start >> huge_page_shift(h); struct folio *folio; - folio = filemap_lock_folio(mapping, idx); + folio = filemap_lock_hugetlb_folio(h, mapping, idx); if (IS_ERR(folio)) return; @@ -807,8 +830,7 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset, /* * Initialize a pseudo vma as this is required by the huge page - * allocation routines. If NUMA is configured, use page index - * as input to create an allocation policy. + * allocation routines. */ vma_init(&pseudo_vma, mm); vm_flags_init(&pseudo_vma, VM_HUGETLB | VM_MAYSHARE | VM_SHARED); @@ -841,7 +863,7 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset, mutex_lock(&hugetlb_fault_mutex_table[hash]); /* See if already present in mapping to avoid alloc/free */ - folio = filemap_get_folio(mapping, index); + folio = filemap_get_folio(mapping, index << huge_page_order(h)); if (!IS_ERR(folio)) { folio_put(folio); mutex_unlock(&hugetlb_fault_mutex_table[hash]); @@ -856,9 +878,7 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset, * folios in these areas, we need to consume the reserves * to keep reservation accounting consistent. */ - hugetlb_set_vma_policy(&pseudo_vma, inode, index); folio = alloc_hugetlb_folio(&pseudo_vma, addr, 0); - hugetlb_drop_vma_policy(&pseudo_vma); if (IS_ERR(folio)) { mutex_unlock(&hugetlb_fault_mutex_table[hash]); error = PTR_ERR(folio); @@ -887,7 +907,7 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset, if (!(mode & FALLOC_FL_KEEP_SIZE) && offset + len > inode->i_size) i_size_write(inode, offset + len); - inode->i_ctime = current_time(inode); + inode_set_ctime_current(inode); out: inode_unlock(inode); return error; @@ -935,7 +955,7 @@ static struct inode *hugetlbfs_get_root(struct super_block *sb, inode->i_mode = S_IFDIR | ctx->mode; inode->i_uid = ctx->uid; inode->i_gid = ctx->gid; - inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode); + simple_inode_init_ts(inode); inode->i_op = &hugetlbfs_dir_inode_operations; inode->i_fop = &simple_dir_operations; /* directory inodes start off with i_nlink == 2 (for "." entry) */ @@ -979,7 +999,7 @@ static struct inode *hugetlbfs_get_inode(struct super_block *sb, lockdep_set_class(&inode->i_mapping->i_mmap_rwsem, &hugetlbfs_i_mmap_rwsem_key); inode->i_mapping->a_ops = &hugetlbfs_aops; - inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode); + simple_inode_init_ts(inode); inode->i_mapping->private_data = resv_map; info->seals = F_SEAL_SEAL; switch (mode & S_IFMT) { @@ -1022,7 +1042,7 @@ static int hugetlbfs_mknod(struct mnt_idmap *idmap, struct inode *dir, inode = hugetlbfs_get_inode(dir->i_sb, dir, mode, dev); if (!inode) return -ENOSPC; - dir->i_ctime = dir->i_mtime = current_time(dir); + inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir)); d_instantiate(dentry, inode); dget(dentry);/* Extra count - pin the dentry in core */ return 0; @@ -1054,7 +1074,7 @@ static int hugetlbfs_tmpfile(struct mnt_idmap *idmap, inode = hugetlbfs_get_inode(dir->i_sb, dir, mode | S_IFREG, 0); if (!inode) return -ENOSPC; - dir->i_ctime = dir->i_mtime = current_time(dir); + inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir)); d_tmpfile(file, inode); return finish_open_simple(file, 0); } @@ -1076,7 +1096,7 @@ static int hugetlbfs_symlink(struct mnt_idmap *idmap, } else iput(inode); } - dir->i_ctime = dir->i_mtime = current_time(dir); + inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir)); return error; } @@ -1159,7 +1179,9 @@ static int hugetlbfs_statfs(struct dentry *dentry, struct kstatfs *buf) { struct hugetlbfs_sb_info *sbinfo = HUGETLBFS_SB(dentry->d_sb); struct hstate *h = hstate_inode(d_inode(dentry)); + u64 id = huge_encode_dev(dentry->d_sb->s_dev); + buf->f_fsid = u64_to_fsid(id); buf->f_type = HUGETLBFS_MAGIC; buf->f_bsize = huge_page_size(h); if (sbinfo) { @@ -1237,18 +1259,6 @@ static struct inode *hugetlbfs_alloc_inode(struct super_block *sb) hugetlbfs_inc_free_inodes(sbinfo); return NULL; } - - /* - * Any time after allocation, hugetlbfs_destroy_inode can be called - * for the inode. mpol_free_shared_policy is unconditionally called - * as part of hugetlbfs_destroy_inode. So, initialize policy here - * in case of a quick call to destroy. - * - * Note that the policy is initialized even if we are creating a - * private inode. This simplifies hugetlbfs_destroy_inode. - */ - mpol_shared_policy_init(&p->policy, NULL); - return &p->vfs_inode; } @@ -1260,7 +1270,6 @@ static void hugetlbfs_free_inode(struct inode *inode) static void hugetlbfs_destroy_inode(struct inode *inode) { hugetlbfs_inc_free_inodes(HUGETLBFS_SB(inode->i_sb)); - mpol_free_shared_policy(&HUGETLBFS_I(inode)->policy); } static const struct address_space_operations hugetlbfs_aops = { diff --git a/fs/init.c b/fs/init.c index 9684406a8416..e9387b6c4f30 100644 --- a/fs/init.c +++ b/fs/init.c @@ -153,8 +153,7 @@ int __init init_mknod(const char *filename, umode_t mode, unsigned int dev) if (IS_ERR(dentry)) return PTR_ERR(dentry); - if (!IS_POSIXACL(path.dentry->d_inode)) - mode &= ~current_umask(); + mode = mode_strip_umask(d_inode(path.dentry), mode); error = security_path_mknod(&path, dentry, mode, dev); if (!error) error = vfs_mknod(mnt_idmap(path.mnt), path.dentry->d_inode, @@ -229,8 +228,7 @@ int __init init_mkdir(const char *pathname, umode_t mode) dentry = kern_path_create(AT_FDCWD, pathname, &path, LOOKUP_DIRECTORY); if (IS_ERR(dentry)) return PTR_ERR(dentry); - if (!IS_POSIXACL(path.dentry->d_inode)) - mode &= ~current_umask(); + mode = mode_strip_umask(d_inode(path.dentry), mode); error = security_path_mkdir(&path, dentry, mode); if (!error) error = vfs_mkdir(mnt_idmap(path.mnt), path.dentry->d_inode, diff --git a/fs/inode.c b/fs/inode.c index 67611a360031..f238d987dec9 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -54,9 +54,9 @@ * inode_hash_lock */ -static unsigned int i_hash_mask __read_mostly; -static unsigned int i_hash_shift __read_mostly; -static struct hlist_head *inode_hashtable __read_mostly; +static unsigned int i_hash_mask __ro_after_init; +static unsigned int i_hash_shift __ro_after_init; +static struct hlist_head *inode_hashtable __ro_after_init; static __cacheline_aligned_in_smp DEFINE_SPINLOCK(inode_hash_lock); /* @@ -70,7 +70,7 @@ EXPORT_SYMBOL(empty_aops); static DEFINE_PER_CPU(unsigned long, nr_inodes); static DEFINE_PER_CPU(unsigned long, nr_unused); -static struct kmem_cache *inode_cachep __read_mostly; +static struct kmem_cache *inode_cachep __ro_after_init; static long get_nr_inodes(void) { @@ -215,6 +215,8 @@ int inode_init_always(struct super_block *sb, struct inode *inode) lockdep_set_class_and_name(&mapping->invalidate_lock, &sb->s_type->invalidate_lock_key, "mapping.invalidate_lock"); + if (sb->s_iflags & SB_I_STABLE_WRITES) + mapping_set_stable_writes(mapping); inode->i_private = NULL; inode->i_mapping = mapping; INIT_HLIST_HEAD(&inode->i_dentry); /* buggered by rcu freeing */ @@ -751,16 +753,11 @@ EXPORT_SYMBOL_GPL(evict_inodes); /** * invalidate_inodes - attempt to free all inodes on a superblock * @sb: superblock to operate on - * @kill_dirty: flag to guide handling of dirty inodes * - * Attempts to free all inodes for a given superblock. If there were any - * busy inodes return a non-zero value, else zero. - * If @kill_dirty is set, discard dirty inodes too, otherwise treat - * them as busy. + * Attempts to free all inodes (including dirty inodes) for a given superblock. */ -int invalidate_inodes(struct super_block *sb, bool kill_dirty) +void invalidate_inodes(struct super_block *sb) { - int busy = 0; struct inode *inode, *next; LIST_HEAD(dispose); @@ -772,14 +769,8 @@ again: spin_unlock(&inode->i_lock); continue; } - if (inode->i_state & I_DIRTY_ALL && !kill_dirty) { - spin_unlock(&inode->i_lock); - busy = 1; - continue; - } if (atomic_read(&inode->i_count)) { spin_unlock(&inode->i_lock); - busy = 1; continue; } @@ -797,8 +788,6 @@ again: spin_unlock(&sb->s_inode_list_lock); dispose_list(&dispose); - - return busy; } /* @@ -1850,25 +1839,29 @@ EXPORT_SYMBOL(bmap); static int relatime_need_update(struct vfsmount *mnt, struct inode *inode, struct timespec64 now) { + struct timespec64 atime, mtime, ctime; if (!(mnt->mnt_flags & MNT_RELATIME)) return 1; /* * Is mtime younger than or equal to atime? If yes, update atime: */ - if (timespec64_compare(&inode->i_mtime, &inode->i_atime) >= 0) + atime = inode_get_atime(inode); + mtime = inode_get_mtime(inode); + if (timespec64_compare(&mtime, &atime) >= 0) return 1; /* * Is ctime younger than or equal to atime? If yes, update atime: */ - if (timespec64_compare(&inode->i_ctime, &inode->i_atime) >= 0) + ctime = inode_get_ctime(inode); + if (timespec64_compare(&ctime, &atime) >= 0) return 1; /* * Is the previous atime value older than a day? If yes, * update atime: */ - if ((long)(now.tv_sec - inode->i_atime.tv_sec) >= 24*60*60) + if ((long)(now.tv_sec - atime.tv_sec) >= 24*60*60) return 1; /* * Good, we can skip the atime update: @@ -1876,29 +1869,79 @@ static int relatime_need_update(struct vfsmount *mnt, struct inode *inode, return 0; } -int generic_update_time(struct inode *inode, struct timespec64 *time, int flags) +/** + * inode_update_timestamps - update the timestamps on the inode + * @inode: inode to be updated + * @flags: S_* flags that needed to be updated + * + * The update_time function is called when an inode's timestamps need to be + * updated for a read or write operation. This function handles updating the + * actual timestamps. It's up to the caller to ensure that the inode is marked + * dirty appropriately. + * + * In the case where any of S_MTIME, S_CTIME, or S_VERSION need to be updated, + * attempt to update all three of them. S_ATIME updates can be handled + * independently of the rest. + * + * Returns a set of S_* flags indicating which values changed. + */ +int inode_update_timestamps(struct inode *inode, int flags) { - int dirty_flags = 0; + int updated = 0; + struct timespec64 now; + + if (flags & (S_MTIME|S_CTIME|S_VERSION)) { + struct timespec64 ctime = inode_get_ctime(inode); + struct timespec64 mtime = inode_get_mtime(inode); - if (flags & (S_ATIME | S_CTIME | S_MTIME)) { - if (flags & S_ATIME) - inode->i_atime = *time; - if (flags & S_CTIME) - inode->i_ctime = *time; - if (flags & S_MTIME) - inode->i_mtime = *time; - - if (inode->i_sb->s_flags & SB_LAZYTIME) - dirty_flags |= I_DIRTY_TIME; - else - dirty_flags |= I_DIRTY_SYNC; + now = inode_set_ctime_current(inode); + if (!timespec64_equal(&now, &ctime)) + updated |= S_CTIME; + if (!timespec64_equal(&now, &mtime)) { + inode_set_mtime_to_ts(inode, now); + updated |= S_MTIME; + } + if (IS_I_VERSION(inode) && inode_maybe_inc_iversion(inode, updated)) + updated |= S_VERSION; + } else { + now = current_time(inode); } - if ((flags & S_VERSION) && inode_maybe_inc_iversion(inode, false)) - dirty_flags |= I_DIRTY_SYNC; + if (flags & S_ATIME) { + struct timespec64 atime = inode_get_atime(inode); + + if (!timespec64_equal(&now, &atime)) { + inode_set_atime_to_ts(inode, now); + updated |= S_ATIME; + } + } + return updated; +} +EXPORT_SYMBOL(inode_update_timestamps); +/** + * generic_update_time - update the timestamps on the inode + * @inode: inode to be updated + * @flags: S_* flags that needed to be updated + * + * The update_time function is called when an inode's timestamps need to be + * updated for a read or write operation. In the case where any of S_MTIME, S_CTIME, + * or S_VERSION need to be updated we attempt to update all three of them. S_ATIME + * updates can be handled done independently of the rest. + * + * Returns a S_* mask indicating which fields were updated. + */ +int generic_update_time(struct inode *inode, int flags) +{ + int updated = inode_update_timestamps(inode, flags); + int dirty_flags = 0; + + if (updated & (S_ATIME|S_MTIME|S_CTIME)) + dirty_flags = inode->i_sb->s_flags & SB_LAZYTIME ? I_DIRTY_TIME : I_DIRTY_SYNC; + if (updated & S_VERSION) + dirty_flags |= I_DIRTY_SYNC; __mark_inode_dirty(inode, dirty_flags); - return 0; + return updated; } EXPORT_SYMBOL(generic_update_time); @@ -1906,11 +1949,12 @@ EXPORT_SYMBOL(generic_update_time); * This does the actual work of updating an inodes time or version. Must have * had called mnt_want_write() before calling this. */ -int inode_update_time(struct inode *inode, struct timespec64 *time, int flags) +int inode_update_time(struct inode *inode, int flags) { if (inode->i_op->update_time) - return inode->i_op->update_time(inode, time, flags); - return generic_update_time(inode, time, flags); + return inode->i_op->update_time(inode, flags); + generic_update_time(inode, flags); + return 0; } EXPORT_SYMBOL(inode_update_time); @@ -1926,7 +1970,7 @@ EXPORT_SYMBOL(inode_update_time); bool atime_needs_update(const struct path *path, struct inode *inode) { struct vfsmount *mnt = path->mnt; - struct timespec64 now; + struct timespec64 now, atime; if (inode->i_flags & S_NOATIME) return false; @@ -1952,7 +1996,8 @@ bool atime_needs_update(const struct path *path, struct inode *inode) if (!relatime_need_update(mnt, inode, now)) return false; - if (timespec64_equal(&inode->i_atime, &now)) + atime = inode_get_atime(inode); + if (timespec64_equal(&atime, &now)) return false; return true; @@ -1962,7 +2007,6 @@ void touch_atime(const struct path *path) { struct vfsmount *mnt = path->mnt; struct inode *inode = d_inode(path->dentry); - struct timespec64 now; if (!atime_needs_update(path, inode)) return; @@ -1970,7 +2014,7 @@ void touch_atime(const struct path *path) if (!sb_start_write_trylock(inode->i_sb)) return; - if (__mnt_want_write(mnt) != 0) + if (mnt_get_write_access(mnt) != 0) goto skip_update; /* * File systems can error out when updating inodes if they need to @@ -1981,9 +2025,8 @@ void touch_atime(const struct path *path) * We may also fail on filesystems that have the ability to make parts * of the fs read only, e.g. subvolumes in Btrfs. */ - now = current_time(inode); - inode_update_time(inode, &now, S_ATIME); - __mnt_drop_write(mnt); + inode_update_time(inode, S_ATIME); + mnt_put_write_access(mnt); skip_update: sb_end_write(inode->i_sb); } @@ -2067,18 +2110,22 @@ int file_remove_privs(struct file *file) } EXPORT_SYMBOL(file_remove_privs); -static int inode_needs_update_time(struct inode *inode, struct timespec64 *now) +static int inode_needs_update_time(struct inode *inode) { int sync_it = 0; + struct timespec64 now = current_time(inode); + struct timespec64 ts; /* First try to exhaust all avenues to not sync */ if (IS_NOCMTIME(inode)) return 0; - if (!timespec64_equal(&inode->i_mtime, now)) + ts = inode_get_mtime(inode); + if (!timespec64_equal(&ts, &now)) sync_it = S_MTIME; - if (!timespec64_equal(&inode->i_ctime, now)) + ts = inode_get_ctime(inode); + if (!timespec64_equal(&ts, &now)) sync_it |= S_CTIME; if (IS_I_VERSION(inode) && inode_iversion_need_inc(inode)) @@ -2087,16 +2134,15 @@ static int inode_needs_update_time(struct inode *inode, struct timespec64 *now) return sync_it; } -static int __file_update_time(struct file *file, struct timespec64 *now, - int sync_mode) +static int __file_update_time(struct file *file, int sync_mode) { int ret = 0; struct inode *inode = file_inode(file); /* try to update time settings */ - if (!__mnt_want_write_file(file)) { - ret = inode_update_time(inode, now, sync_mode); - __mnt_drop_write_file(file); + if (!mnt_get_write_access_file(file)) { + ret = inode_update_time(inode, sync_mode); + mnt_put_write_access_file(file); } return ret; @@ -2120,13 +2166,12 @@ int file_update_time(struct file *file) { int ret; struct inode *inode = file_inode(file); - struct timespec64 now = current_time(inode); - ret = inode_needs_update_time(inode, &now); + ret = inode_needs_update_time(inode); if (ret <= 0) return ret; - return __file_update_time(file, &now, ret); + return __file_update_time(file, ret); } EXPORT_SYMBOL(file_update_time); @@ -2149,7 +2194,6 @@ static int file_modified_flags(struct file *file, int flags) { int ret; struct inode *inode = file_inode(file); - struct timespec64 now = current_time(inode); /* * Clear the security bits if the process is not being run by root. @@ -2162,13 +2206,13 @@ static int file_modified_flags(struct file *file, int flags) if (unlikely(file->f_mode & FMODE_NOCMTIME)) return 0; - ret = inode_needs_update_time(inode, &now); + ret = inode_needs_update_time(inode); if (ret <= 0) return ret; if (flags & IOCB_NOWAIT) return -EAGAIN; - return __file_update_time(file, &now, ret); + return __file_update_time(file, ret); } /** @@ -2488,17 +2532,27 @@ struct timespec64 current_time(struct inode *inode) struct timespec64 now; ktime_get_coarse_real_ts64(&now); - - if (unlikely(!inode->i_sb)) { - WARN(1, "current_time() called with uninitialized super_block in the inode"); - return now; - } - return timestamp_truncate(now, inode); } EXPORT_SYMBOL(current_time); /** + * inode_set_ctime_current - set the ctime to current_time + * @inode: inode + * + * Set the inode->i_ctime to the current value for the inode. Returns + * the current value that was assigned to i_ctime. + */ +struct timespec64 inode_set_ctime_current(struct inode *inode) +{ + struct timespec64 now = current_time(inode); + + inode_set_ctime(inode, now.tv_sec, now.tv_nsec); + return now; +} +EXPORT_SYMBOL(inode_set_ctime_current); + +/** * in_group_or_capable - check whether caller is CAP_FSETID privileged * @idmap: idmap of the mount @inode was found from * @inode: inode to check diff --git a/fs/internal.h b/fs/internal.h index f7a3dc111026..58e43341aebf 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -23,16 +23,10 @@ struct mnt_idmap; */ #ifdef CONFIG_BLOCK extern void __init bdev_cache_init(void); - -void emergency_thaw_bdev(struct super_block *sb); #else static inline void bdev_cache_init(void) { } -static inline int emergency_thaw_bdev(struct super_block *sb) -{ - return 0; -} #endif /* CONFIG_BLOCK */ /* @@ -79,8 +73,8 @@ extern int sb_prepare_remount_readonly(struct super_block *); extern void __init mnt_init(void); -extern int __mnt_want_write_file(struct file *); -extern void __mnt_drop_write_file(struct file *); +int mnt_get_write_access_file(struct file *file); +void mnt_put_write_access_file(struct file *file); extern void dissolve_on_fput(struct vfsmount *); extern bool may_mount(void); @@ -100,14 +94,22 @@ extern void chroot_fs_refs(const struct path *, const struct path *); struct file *alloc_empty_file(int flags, const struct cred *cred); struct file *alloc_empty_file_noaccount(int flags, const struct cred *cred); struct file *alloc_empty_backing_file(int flags, const struct cred *cred); +void release_empty_file(struct file *f); + +static inline void file_put_write_access(struct file *file) +{ + put_write_access(file->f_inode); + mnt_put_write_access(file->f_path.mnt); + if (unlikely(file->f_mode & FMODE_BACKING)) + mnt_put_write_access(backing_file_user_path(file)->mnt); +} static inline void put_file_access(struct file *file) { if ((file->f_mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ) { i_readcount_dec(file->f_inode); } else if (file->f_mode & FMODE_WRITER) { - put_write_access(file->f_inode); - __mnt_drop_write(file->f_path.mnt); + file_put_write_access(file); } } @@ -115,7 +117,7 @@ static inline void put_file_access(struct file *file) * super.c */ extern int reconfigure_super(struct fs_context *); -extern bool trylock_super(struct super_block *sb); +extern bool super_trylock_shared(struct super_block *sb); struct super_block *user_get_super(dev_t, bool excl); void put_super(struct super_block *sb); extern bool mount_capable(struct fs_context *); @@ -136,9 +138,9 @@ static inline void sb_start_ro_state_change(struct super_block *sb) * mnt_is_readonly() making sure if mnt_is_readonly() sees SB_RDONLY * cleared, it will see s_readonly_remount set. * For RW->RO transition, the barrier pairs with the barrier in - * __mnt_want_write() before the mnt_is_readonly() check. The barrier - * makes sure if __mnt_want_write() sees MNT_WRITE_HOLD already - * cleared, it will see s_readonly_remount set. + * mnt_get_write_access() before the mnt_is_readonly() check. + * The barrier makes sure if mnt_get_write_access() sees MNT_WRITE_HOLD + * already cleared, it will see s_readonly_remount set. */ smp_wmb(); } @@ -201,7 +203,7 @@ void lock_two_inodes(struct inode *inode1, struct inode *inode2, * fs-writeback.c */ extern long get_nr_dirty_inodes(void); -extern int invalidate_inodes(struct super_block *, bool); +void invalidate_inodes(struct super_block *sb); /* * dcache.c diff --git a/fs/ioctl.c b/fs/ioctl.c index 5b2481cd4750..f5fd99d6b0d4 100644 --- a/fs/ioctl.c +++ b/fs/ioctl.c @@ -109,9 +109,6 @@ static int ioctl_fibmap(struct file *filp, int __user *p) * Returns 0 on success, -errno on error, 1 if this was the last * extent that will fit in user array. */ -#define SET_UNKNOWN_FLAGS (FIEMAP_EXTENT_DELALLOC) -#define SET_NO_UNMOUNTED_IO_FLAGS (FIEMAP_EXTENT_DATA_ENCRYPTED) -#define SET_NOT_ALIGNED_FLAGS (FIEMAP_EXTENT_DATA_TAIL|FIEMAP_EXTENT_DATA_INLINE) int fiemap_fill_next_extent(struct fiemap_extent_info *fieinfo, u64 logical, u64 phys, u64 len, u32 flags) { @@ -127,6 +124,10 @@ int fiemap_fill_next_extent(struct fiemap_extent_info *fieinfo, u64 logical, if (fieinfo->fi_extents_mapped >= fieinfo->fi_extents_max) return 1; +#define SET_UNKNOWN_FLAGS (FIEMAP_EXTENT_DELALLOC) +#define SET_NO_UNMOUNTED_IO_FLAGS (FIEMAP_EXTENT_DATA_ENCRYPTED) +#define SET_NOT_ALIGNED_FLAGS (FIEMAP_EXTENT_DATA_TAIL|FIEMAP_EXTENT_DATA_INLINE) + if (flags & SET_UNKNOWN_FLAGS) flags |= FIEMAP_EXTENT_UNKNOWN; if (flags & SET_NO_UNMOUNTED_IO_FLAGS) @@ -396,8 +397,8 @@ static int ioctl_fsfreeze(struct file *filp) /* Freeze */ if (sb->s_op->freeze_super) - return sb->s_op->freeze_super(sb); - return freeze_super(sb); + return sb->s_op->freeze_super(sb, FREEZE_HOLDER_USERSPACE); + return freeze_super(sb, FREEZE_HOLDER_USERSPACE); } static int ioctl_fsthaw(struct file *filp) @@ -409,8 +410,8 @@ static int ioctl_fsthaw(struct file *filp) /* Thaw */ if (sb->s_op->thaw_super) - return sb->s_op->thaw_super(sb); - return thaw_super(sb); + return sb->s_op->thaw_super(sb, FREEZE_HOLDER_USERSPACE); + return thaw_super(sb, FREEZE_HOLDER_USERSPACE); } static int ioctl_file_dedupe_range(struct file *file, @@ -877,6 +878,9 @@ out: #ifdef CONFIG_COMPAT /** * compat_ptr_ioctl - generic implementation of .compat_ioctl file operation + * @file: The file to operate on. + * @cmd: The ioctl command number. + * @arg: The argument to the ioctl. * * This is not normally called as a function, but instead set in struct * file_operations as diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index aa8967cca1a3..f72df2babe56 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -23,65 +23,171 @@ #define IOEND_BATCH_SIZE 4096 +typedef int (*iomap_punch_t)(struct inode *inode, loff_t offset, loff_t length); /* - * Structure allocated for each folio when block size < folio size - * to track sub-folio uptodate status and I/O completions. + * Structure allocated for each folio to track per-block uptodate, dirty state + * and I/O completions. */ -struct iomap_page { - atomic_t read_bytes_pending; +struct iomap_folio_state { + spinlock_t state_lock; + unsigned int read_bytes_pending; atomic_t write_bytes_pending; - spinlock_t uptodate_lock; - unsigned long uptodate[]; + + /* + * Each block has two bits in this bitmap: + * Bits [0..blocks_per_folio) has the uptodate status. + * Bits [b_p_f...(2*b_p_f)) has the dirty status. + */ + unsigned long state[]; }; -static inline struct iomap_page *to_iomap_page(struct folio *folio) +static struct bio_set iomap_ioend_bioset; + +static inline bool ifs_is_fully_uptodate(struct folio *folio, + struct iomap_folio_state *ifs) { - if (folio_test_private(folio)) - return folio_get_private(folio); - return NULL; + struct inode *inode = folio->mapping->host; + + return bitmap_full(ifs->state, i_blocks_per_folio(inode, folio)); } -static struct bio_set iomap_ioend_bioset; +static inline bool ifs_block_is_uptodate(struct iomap_folio_state *ifs, + unsigned int block) +{ + return test_bit(block, ifs->state); +} + +static bool ifs_set_range_uptodate(struct folio *folio, + struct iomap_folio_state *ifs, size_t off, size_t len) +{ + struct inode *inode = folio->mapping->host; + unsigned int first_blk = off >> inode->i_blkbits; + unsigned int last_blk = (off + len - 1) >> inode->i_blkbits; + unsigned int nr_blks = last_blk - first_blk + 1; + + bitmap_set(ifs->state, first_blk, nr_blks); + return ifs_is_fully_uptodate(folio, ifs); +} + +static void iomap_set_range_uptodate(struct folio *folio, size_t off, + size_t len) +{ + struct iomap_folio_state *ifs = folio->private; + unsigned long flags; + bool uptodate = true; + + if (ifs) { + spin_lock_irqsave(&ifs->state_lock, flags); + uptodate = ifs_set_range_uptodate(folio, ifs, off, len); + spin_unlock_irqrestore(&ifs->state_lock, flags); + } + + if (uptodate) + folio_mark_uptodate(folio); +} + +static inline bool ifs_block_is_dirty(struct folio *folio, + struct iomap_folio_state *ifs, int block) +{ + struct inode *inode = folio->mapping->host; + unsigned int blks_per_folio = i_blocks_per_folio(inode, folio); + + return test_bit(block + blks_per_folio, ifs->state); +} + +static void ifs_clear_range_dirty(struct folio *folio, + struct iomap_folio_state *ifs, size_t off, size_t len) +{ + struct inode *inode = folio->mapping->host; + unsigned int blks_per_folio = i_blocks_per_folio(inode, folio); + unsigned int first_blk = (off >> inode->i_blkbits); + unsigned int last_blk = (off + len - 1) >> inode->i_blkbits; + unsigned int nr_blks = last_blk - first_blk + 1; + unsigned long flags; + + spin_lock_irqsave(&ifs->state_lock, flags); + bitmap_clear(ifs->state, first_blk + blks_per_folio, nr_blks); + spin_unlock_irqrestore(&ifs->state_lock, flags); +} + +static void iomap_clear_range_dirty(struct folio *folio, size_t off, size_t len) +{ + struct iomap_folio_state *ifs = folio->private; + + if (ifs) + ifs_clear_range_dirty(folio, ifs, off, len); +} + +static void ifs_set_range_dirty(struct folio *folio, + struct iomap_folio_state *ifs, size_t off, size_t len) +{ + struct inode *inode = folio->mapping->host; + unsigned int blks_per_folio = i_blocks_per_folio(inode, folio); + unsigned int first_blk = (off >> inode->i_blkbits); + unsigned int last_blk = (off + len - 1) >> inode->i_blkbits; + unsigned int nr_blks = last_blk - first_blk + 1; + unsigned long flags; -static struct iomap_page * -iomap_page_create(struct inode *inode, struct folio *folio, unsigned int flags) + spin_lock_irqsave(&ifs->state_lock, flags); + bitmap_set(ifs->state, first_blk + blks_per_folio, nr_blks); + spin_unlock_irqrestore(&ifs->state_lock, flags); +} + +static void iomap_set_range_dirty(struct folio *folio, size_t off, size_t len) { - struct iomap_page *iop = to_iomap_page(folio); + struct iomap_folio_state *ifs = folio->private; + + if (ifs) + ifs_set_range_dirty(folio, ifs, off, len); +} + +static struct iomap_folio_state *ifs_alloc(struct inode *inode, + struct folio *folio, unsigned int flags) +{ + struct iomap_folio_state *ifs = folio->private; unsigned int nr_blocks = i_blocks_per_folio(inode, folio); gfp_t gfp; - if (iop || nr_blocks <= 1) - return iop; + if (ifs || nr_blocks <= 1) + return ifs; if (flags & IOMAP_NOWAIT) gfp = GFP_NOWAIT; else gfp = GFP_NOFS | __GFP_NOFAIL; - iop = kzalloc(struct_size(iop, uptodate, BITS_TO_LONGS(nr_blocks)), - gfp); - if (iop) { - spin_lock_init(&iop->uptodate_lock); - if (folio_test_uptodate(folio)) - bitmap_fill(iop->uptodate, nr_blocks); - folio_attach_private(folio, iop); - } - return iop; + /* + * ifs->state tracks two sets of state flags when the + * filesystem block size is smaller than the folio size. + * The first state tracks per-block uptodate and the + * second tracks per-block dirty state. + */ + ifs = kzalloc(struct_size(ifs, state, + BITS_TO_LONGS(2 * nr_blocks)), gfp); + if (!ifs) + return ifs; + + spin_lock_init(&ifs->state_lock); + if (folio_test_uptodate(folio)) + bitmap_set(ifs->state, 0, nr_blocks); + if (folio_test_dirty(folio)) + bitmap_set(ifs->state, nr_blocks, nr_blocks); + folio_attach_private(folio, ifs); + + return ifs; } -static void iomap_page_release(struct folio *folio) +static void ifs_free(struct folio *folio) { - struct iomap_page *iop = folio_detach_private(folio); - struct inode *inode = folio->mapping->host; - unsigned int nr_blocks = i_blocks_per_folio(inode, folio); + struct iomap_folio_state *ifs = folio_detach_private(folio); - if (!iop) + if (!ifs) return; - WARN_ON_ONCE(atomic_read(&iop->read_bytes_pending)); - WARN_ON_ONCE(atomic_read(&iop->write_bytes_pending)); - WARN_ON_ONCE(bitmap_full(iop->uptodate, nr_blocks) != + WARN_ON_ONCE(ifs->read_bytes_pending != 0); + WARN_ON_ONCE(atomic_read(&ifs->write_bytes_pending)); + WARN_ON_ONCE(ifs_is_fully_uptodate(folio, ifs) != folio_test_uptodate(folio)); - kfree(iop); + kfree(ifs); } /* @@ -90,7 +196,7 @@ static void iomap_page_release(struct folio *folio) static void iomap_adjust_read_range(struct inode *inode, struct folio *folio, loff_t *pos, loff_t length, size_t *offp, size_t *lenp) { - struct iomap_page *iop = to_iomap_page(folio); + struct iomap_folio_state *ifs = folio->private; loff_t orig_pos = *pos; loff_t isize = i_size_read(inode); unsigned block_bits = inode->i_blkbits; @@ -105,12 +211,12 @@ static void iomap_adjust_read_range(struct inode *inode, struct folio *folio, * per-block uptodate status and adjust the offset and length if needed * to avoid reading in already uptodate ranges. */ - if (iop) { + if (ifs) { unsigned int i; /* move forward for each leading block marked uptodate */ for (i = first; i <= last; i++) { - if (!test_bit(i, iop->uptodate)) + if (!ifs_block_is_uptodate(ifs, i)) break; *pos += block_size; poff += block_size; @@ -120,7 +226,7 @@ static void iomap_adjust_read_range(struct inode *inode, struct folio *folio, /* truncate len if we find any trailing uptodate block(s) */ for ( ; i <= last; i++) { - if (test_bit(i, iop->uptodate)) { + if (ifs_block_is_uptodate(ifs, i)) { plen -= (last - i + 1) * block_size; last = i - 1; break; @@ -144,44 +250,28 @@ static void iomap_adjust_read_range(struct inode *inode, struct folio *folio, *lenp = plen; } -static void iomap_iop_set_range_uptodate(struct folio *folio, - struct iomap_page *iop, size_t off, size_t len) -{ - struct inode *inode = folio->mapping->host; - unsigned first = off >> inode->i_blkbits; - unsigned last = (off + len - 1) >> inode->i_blkbits; - unsigned long flags; - - spin_lock_irqsave(&iop->uptodate_lock, flags); - bitmap_set(iop->uptodate, first, last - first + 1); - if (bitmap_full(iop->uptodate, i_blocks_per_folio(inode, folio))) - folio_mark_uptodate(folio); - spin_unlock_irqrestore(&iop->uptodate_lock, flags); -} - -static void iomap_set_range_uptodate(struct folio *folio, - struct iomap_page *iop, size_t off, size_t len) -{ - if (iop) - iomap_iop_set_range_uptodate(folio, iop, off, len); - else - folio_mark_uptodate(folio); -} - -static void iomap_finish_folio_read(struct folio *folio, size_t offset, +static void iomap_finish_folio_read(struct folio *folio, size_t off, size_t len, int error) { - struct iomap_page *iop = to_iomap_page(folio); - - if (unlikely(error)) { - folio_clear_uptodate(folio); - folio_set_error(folio); - } else { - iomap_set_range_uptodate(folio, iop, offset, len); + struct iomap_folio_state *ifs = folio->private; + bool uptodate = !error; + bool finished = true; + + if (ifs) { + unsigned long flags; + + spin_lock_irqsave(&ifs->state_lock, flags); + if (!error) + uptodate = ifs_set_range_uptodate(folio, ifs, off, len); + ifs->read_bytes_pending -= len; + finished = !ifs->read_bytes_pending; + spin_unlock_irqrestore(&ifs->state_lock, flags); } - if (!iop || atomic_sub_and_test(len, &iop->read_bytes_pending)) - folio_unlock(folio); + if (error) + folio_set_error(folio); + if (finished) + folio_end_read(folio, uptodate); } static void iomap_read_end_io(struct bio *bio) @@ -213,7 +303,6 @@ struct iomap_readpage_ctx { static int iomap_read_inline_data(const struct iomap_iter *iter, struct folio *folio) { - struct iomap_page *iop; const struct iomap *iomap = iomap_iter_srcmap(iter); size_t size = i_size_read(iter->inode) - iomap->offset; size_t poff = offset_in_page(iomap->offset); @@ -231,15 +320,13 @@ static int iomap_read_inline_data(const struct iomap_iter *iter, if (WARN_ON_ONCE(size > iomap->length)) return -EIO; if (offset > 0) - iop = iomap_page_create(iter->inode, folio, iter->flags); - else - iop = to_iomap_page(folio); + ifs_alloc(iter->inode, folio, iter->flags); addr = kmap_local_folio(folio, offset); memcpy(addr, iomap->inline_data, size); memset(addr + size, 0, PAGE_SIZE - poff - size); kunmap_local(addr); - iomap_set_range_uptodate(folio, iop, offset, PAGE_SIZE - poff); + iomap_set_range_uptodate(folio, offset, PAGE_SIZE - poff); return 0; } @@ -260,7 +347,7 @@ static loff_t iomap_readpage_iter(const struct iomap_iter *iter, loff_t pos = iter->pos + offset; loff_t length = iomap_length(iter) - offset; struct folio *folio = ctx->cur_folio; - struct iomap_page *iop; + struct iomap_folio_state *ifs; loff_t orig_pos = pos; size_t poff, plen; sector_t sector; @@ -269,20 +356,23 @@ static loff_t iomap_readpage_iter(const struct iomap_iter *iter, return iomap_read_inline_data(iter, folio); /* zero post-eof blocks as the page may be mapped */ - iop = iomap_page_create(iter->inode, folio, iter->flags); + ifs = ifs_alloc(iter->inode, folio, iter->flags); iomap_adjust_read_range(iter->inode, folio, &pos, length, &poff, &plen); if (plen == 0) goto done; if (iomap_block_needs_zeroing(iter, pos)) { folio_zero_range(folio, poff, plen); - iomap_set_range_uptodate(folio, iop, poff, plen); + iomap_set_range_uptodate(folio, poff, plen); goto done; } ctx->cur_folio_in_bio = true; - if (iop) - atomic_add(plen, &iop->read_bytes_pending); + if (ifs) { + spin_lock_irq(&ifs->state_lock); + ifs->read_bytes_pending += plen; + spin_unlock_irq(&ifs->state_lock); + } sector = iomap_sector(iomap, pos); if (!ctx->bio || @@ -436,11 +526,11 @@ EXPORT_SYMBOL_GPL(iomap_readahead); */ bool iomap_is_partially_uptodate(struct folio *folio, size_t from, size_t count) { - struct iomap_page *iop = to_iomap_page(folio); + struct iomap_folio_state *ifs = folio->private; struct inode *inode = folio->mapping->host; unsigned first, last, i; - if (!iop) + if (!ifs) return false; /* Caller's range may extend past the end of this folio */ @@ -451,7 +541,7 @@ bool iomap_is_partially_uptodate(struct folio *folio, size_t from, size_t count) last = (from + count - 1) >> inode->i_blkbits; for (i = first; i <= last; i++) - if (!test_bit(i, iop->uptodate)) + if (!ifs_block_is_uptodate(ifs, i)) return false; return true; } @@ -461,16 +551,18 @@ EXPORT_SYMBOL_GPL(iomap_is_partially_uptodate); * iomap_get_folio - get a folio reference for writing * @iter: iteration structure * @pos: start offset of write + * @len: Suggested size of folio to create. * * Returns a locked reference to the folio at @pos, or an error pointer if the * folio could not be obtained. */ -struct folio *iomap_get_folio(struct iomap_iter *iter, loff_t pos) +struct folio *iomap_get_folio(struct iomap_iter *iter, loff_t pos, size_t len) { - unsigned fgp = FGP_WRITEBEGIN | FGP_NOFS; + fgf_t fgp = FGP_WRITEBEGIN | FGP_NOFS; if (iter->flags & IOMAP_NOWAIT) fgp |= FGP_NOWAIT; + fgp |= fgf_set_order(len); return __filemap_get_folio(iter->inode->i_mapping, pos >> PAGE_SHIFT, fgp, mapping_gfp_mask(iter->inode->i_mapping)); @@ -483,14 +575,13 @@ bool iomap_release_folio(struct folio *folio, gfp_t gfp_flags) folio_size(folio)); /* - * mm accommodates an old ext3 case where clean folios might - * not have had the dirty bit cleared. Thus, it can send actual - * dirty folios to ->release_folio() via shrink_active_list(); - * skip those here. + * If the folio is dirty, we refuse to release our metadata because + * it may be partially dirty. Once we track per-block dirty state, + * we can release the metadata if every block is dirty. */ - if (folio_test_dirty(folio) || folio_test_writeback(folio)) + if (folio_test_dirty(folio)) return false; - iomap_page_release(folio); + ifs_free(folio); return true; } EXPORT_SYMBOL_GPL(iomap_release_folio); @@ -507,16 +598,22 @@ void iomap_invalidate_folio(struct folio *folio, size_t offset, size_t len) if (offset == 0 && len == folio_size(folio)) { WARN_ON_ONCE(folio_test_writeback(folio)); folio_cancel_dirty(folio); - iomap_page_release(folio); - } else if (folio_test_large(folio)) { - /* Must release the iop so the page can be split */ - WARN_ON_ONCE(!folio_test_uptodate(folio) && - folio_test_dirty(folio)); - iomap_page_release(folio); + ifs_free(folio); } } EXPORT_SYMBOL_GPL(iomap_invalidate_folio); +bool iomap_dirty_folio(struct address_space *mapping, struct folio *folio) +{ + struct inode *inode = mapping->host; + size_t len = folio_size(folio); + + ifs_alloc(inode, folio, 0); + iomap_set_range_dirty(folio, 0, len); + return filemap_dirty_folio(mapping, folio); +} +EXPORT_SYMBOL_GPL(iomap_dirty_folio); + static void iomap_write_failed(struct inode *inode, loff_t pos, unsigned len) { @@ -547,7 +644,7 @@ static int __iomap_write_begin(const struct iomap_iter *iter, loff_t pos, size_t len, struct folio *folio) { const struct iomap *srcmap = iomap_iter_srcmap(iter); - struct iomap_page *iop; + struct iomap_folio_state *ifs; loff_t block_size = i_blocksize(iter->inode); loff_t block_start = round_down(pos, block_size); loff_t block_end = round_up(pos + len, block_size); @@ -555,14 +652,25 @@ static int __iomap_write_begin(const struct iomap_iter *iter, loff_t pos, size_t from = offset_in_folio(folio, pos), to = from + len; size_t poff, plen; - if (folio_test_uptodate(folio)) + /* + * If the write or zeroing completely overlaps the current folio, then + * entire folio will be dirtied so there is no need for + * per-block state tracking structures to be attached to this folio. + * For the unshare case, we must read in the ondisk contents because we + * are not changing pagecache contents. + */ + if (!(iter->flags & IOMAP_UNSHARE) && pos <= folio_pos(folio) && + pos + len >= folio_pos(folio) + folio_size(folio)) return 0; - folio_clear_error(folio); - iop = iomap_page_create(iter->inode, folio, iter->flags); - if ((iter->flags & IOMAP_NOWAIT) && !iop && nr_blocks > 1) + ifs = ifs_alloc(iter->inode, folio, iter->flags); + if ((iter->flags & IOMAP_NOWAIT) && !ifs && nr_blocks > 1) return -EAGAIN; + if (folio_test_uptodate(folio)) + return 0; + folio_clear_error(folio); + do { iomap_adjust_read_range(iter->inode, folio, &block_start, block_end - block_start, &poff, &plen); @@ -589,7 +697,7 @@ static int __iomap_write_begin(const struct iomap_iter *iter, loff_t pos, if (status) return status; } - iomap_set_range_uptodate(folio, iop, poff, plen); + iomap_set_range_uptodate(folio, poff, plen); } while ((block_start += plen) < block_end); return 0; @@ -603,7 +711,7 @@ static struct folio *__iomap_get_folio(struct iomap_iter *iter, loff_t pos, if (folio_ops && folio_ops->get_folio) return folio_ops->get_folio(iter, pos, len); else - return iomap_get_folio(iter, pos); + return iomap_get_folio(iter, pos, len); } static void __iomap_put_folio(struct iomap_iter *iter, loff_t pos, size_t ret, @@ -696,7 +804,6 @@ out_unlock: static size_t __iomap_write_end(struct inode *inode, loff_t pos, size_t len, size_t copied, struct folio *folio) { - struct iomap_page *iop = to_iomap_page(folio); flush_dcache_folio(folio); /* @@ -712,7 +819,8 @@ static size_t __iomap_write_end(struct inode *inode, loff_t pos, size_t len, */ if (unlikely(copied < len && !folio_test_uptodate(folio))) return 0; - iomap_set_range_uptodate(folio, iop, offset_in_folio(folio, pos), len); + iomap_set_range_uptodate(folio, offset_in_folio(folio, pos), len); + iomap_set_range_dirty(folio, offset_in_folio(folio, pos), copied); filemap_dirty_folio(inode->i_mapping, folio); return copied; } @@ -773,6 +881,7 @@ static size_t iomap_write_end(struct iomap_iter *iter, loff_t pos, size_t len, static loff_t iomap_write_iter(struct iomap_iter *iter, struct iov_iter *i) { loff_t length = iomap_length(iter); + size_t chunk = PAGE_SIZE << MAX_PAGECACHE_ORDER; loff_t pos = iter->pos; ssize_t written = 0; long status = 0; @@ -781,15 +890,14 @@ static loff_t iomap_write_iter(struct iomap_iter *iter, struct iov_iter *i) do { struct folio *folio; - struct page *page; - unsigned long offset; /* Offset into pagecache page */ - unsigned long bytes; /* Bytes to write to page */ + size_t offset; /* Offset into folio */ + size_t bytes; /* Bytes to write to folio */ size_t copied; /* Bytes copied from user */ - offset = offset_in_page(pos); - bytes = min_t(unsigned long, PAGE_SIZE - offset, - iov_iter_count(i)); -again: + bytes = iov_iter_count(i); +retry: + offset = pos & (chunk - 1); + bytes = min(chunk - offset, bytes); status = balance_dirty_pages_ratelimited_flags(mapping, bdp_flags); if (unlikely(status)) @@ -819,12 +927,14 @@ again: if (iter->iomap.flags & IOMAP_F_STALE) break; - page = folio_file_page(folio, pos >> PAGE_SHIFT); - if (mapping_writably_mapped(mapping)) - flush_dcache_page(page); + offset = offset_in_folio(folio, pos); + if (bytes > folio_size(folio) - offset) + bytes = folio_size(folio) - offset; - copied = copy_page_from_iter_atomic(page, offset, bytes, i); + if (mapping_writably_mapped(mapping)) + flush_dcache_folio(folio); + copied = copy_folio_from_iter_atomic(folio, offset, bytes, i); status = iomap_write_end(iter, pos, bytes, copied, folio); if (unlikely(copied != status)) @@ -838,13 +948,17 @@ again: * halfway through, might be a race with munmap, * might be severe memory pressure. */ - if (copied) + if (chunk > PAGE_SIZE) + chunk /= 2; + if (copied) { bytes = copied; - goto again; + goto retry; + } + } else { + pos += status; + written += status; + length -= status; } - pos += status; - written += status; - length -= status; } while (iov_iter_count(i) && length); if (status == -EAGAIN) { @@ -880,9 +994,79 @@ iomap_file_buffered_write(struct kiocb *iocb, struct iov_iter *i, } EXPORT_SYMBOL_GPL(iomap_file_buffered_write); +static int iomap_write_delalloc_ifs_punch(struct inode *inode, + struct folio *folio, loff_t start_byte, loff_t end_byte, + iomap_punch_t punch) +{ + unsigned int first_blk, last_blk, i; + loff_t last_byte; + u8 blkbits = inode->i_blkbits; + struct iomap_folio_state *ifs; + int ret = 0; + + /* + * When we have per-block dirty tracking, there can be + * blocks within a folio which are marked uptodate + * but not dirty. In that case it is necessary to punch + * out such blocks to avoid leaking any delalloc blocks. + */ + ifs = folio->private; + if (!ifs) + return ret; + + last_byte = min_t(loff_t, end_byte - 1, + folio_pos(folio) + folio_size(folio) - 1); + first_blk = offset_in_folio(folio, start_byte) >> blkbits; + last_blk = offset_in_folio(folio, last_byte) >> blkbits; + for (i = first_blk; i <= last_blk; i++) { + if (!ifs_block_is_dirty(folio, ifs, i)) { + ret = punch(inode, folio_pos(folio) + (i << blkbits), + 1 << blkbits); + if (ret) + return ret; + } + } + + return ret; +} + + +static int iomap_write_delalloc_punch(struct inode *inode, struct folio *folio, + loff_t *punch_start_byte, loff_t start_byte, loff_t end_byte, + iomap_punch_t punch) +{ + int ret = 0; + + if (!folio_test_dirty(folio)) + return ret; + + /* if dirty, punch up to offset */ + if (start_byte > *punch_start_byte) { + ret = punch(inode, *punch_start_byte, + start_byte - *punch_start_byte); + if (ret) + return ret; + } + + /* Punch non-dirty blocks within folio */ + ret = iomap_write_delalloc_ifs_punch(inode, folio, start_byte, + end_byte, punch); + if (ret) + return ret; + + /* + * Make sure the next punch start is correctly bound to + * the end of this data range, not the end of the folio. + */ + *punch_start_byte = min_t(loff_t, end_byte, + folio_pos(folio) + folio_size(folio)); + + return ret; +} + /* * Scan the data range passed to us for dirty page cache folios. If we find a - * dirty folio, punch out the preceeding range and update the offset from which + * dirty folio, punch out the preceding range and update the offset from which * the next punch will start from. * * We can punch out storage reservations under clean pages because they either @@ -899,10 +1083,11 @@ EXPORT_SYMBOL_GPL(iomap_file_buffered_write); */ static int iomap_write_delalloc_scan(struct inode *inode, loff_t *punch_start_byte, loff_t start_byte, loff_t end_byte, - int (*punch)(struct inode *inode, loff_t offset, loff_t length)) + iomap_punch_t punch) { while (start_byte < end_byte) { struct folio *folio; + int ret; /* grab locked page */ folio = filemap_lock_folio(inode->i_mapping, @@ -913,26 +1098,12 @@ static int iomap_write_delalloc_scan(struct inode *inode, continue; } - /* if dirty, punch up to offset */ - if (folio_test_dirty(folio)) { - if (start_byte > *punch_start_byte) { - int error; - - error = punch(inode, *punch_start_byte, - start_byte - *punch_start_byte); - if (error) { - folio_unlock(folio); - folio_put(folio); - return error; - } - } - - /* - * Make sure the next punch start is correctly bound to - * the end of this data range, not the end of the folio. - */ - *punch_start_byte = min_t(loff_t, end_byte, - folio_next_index(folio) << PAGE_SHIFT); + ret = iomap_write_delalloc_punch(inode, folio, punch_start_byte, + start_byte, end_byte, punch); + if (ret) { + folio_unlock(folio); + folio_put(folio); + return ret; } /* move offset to start of next folio in range */ @@ -977,8 +1148,7 @@ static int iomap_write_delalloc_scan(struct inode *inode, * the code to subtle off-by-one bugs.... */ static int iomap_write_delalloc_release(struct inode *inode, - loff_t start_byte, loff_t end_byte, - int (*punch)(struct inode *inode, loff_t pos, loff_t length)) + loff_t start_byte, loff_t end_byte, iomap_punch_t punch) { loff_t punch_start_byte = start_byte; loff_t scan_end_byte = min(i_size_read(inode), end_byte); @@ -1071,8 +1241,7 @@ out_unlock: */ int iomap_file_buffered_write_punch_delalloc(struct inode *inode, struct iomap *iomap, loff_t pos, loff_t length, - ssize_t written, - int (*punch)(struct inode *inode, loff_t pos, loff_t length)) + ssize_t written, iomap_punch_t punch) { loff_t start_byte; loff_t end_byte; @@ -1111,7 +1280,6 @@ static loff_t iomap_unshare_iter(struct iomap_iter *iter) const struct iomap *srcmap = iomap_iter_srcmap(iter); loff_t pos = iter->pos; loff_t length = iomap_length(iter); - long status = 0; loff_t written = 0; /* don't bother with blocks that are not shared to start with */ @@ -1122,28 +1290,33 @@ static loff_t iomap_unshare_iter(struct iomap_iter *iter) return length; do { - unsigned long offset = offset_in_page(pos); - unsigned long bytes = min_t(loff_t, PAGE_SIZE - offset, length); struct folio *folio; + int status; + size_t offset; + size_t bytes = min_t(u64, SIZE_MAX, length); status = iomap_write_begin(iter, pos, bytes, &folio); if (unlikely(status)) return status; - if (iter->iomap.flags & IOMAP_F_STALE) + if (iomap->flags & IOMAP_F_STALE) break; - status = iomap_write_end(iter, pos, bytes, bytes, folio); - if (WARN_ON_ONCE(status == 0)) + offset = offset_in_folio(folio, pos); + if (bytes > folio_size(folio) - offset) + bytes = folio_size(folio) - offset; + + bytes = iomap_write_end(iter, pos, bytes, bytes, folio); + if (WARN_ON_ONCE(bytes == 0)) return -EIO; cond_resched(); - pos += status; - written += status; - length -= status; + pos += bytes; + written += bytes; + length -= bytes; balance_dirty_pages_ratelimited(iter->inode->i_mapping); - } while (length); + } while (length > 0); return written; } @@ -1286,24 +1459,24 @@ vm_fault_t iomap_page_mkwrite(struct vm_fault *vmf, const struct iomap_ops *ops) return VM_FAULT_LOCKED; out_unlock: folio_unlock(folio); - return block_page_mkwrite_return(ret); + return vmf_fs_error(ret); } EXPORT_SYMBOL_GPL(iomap_page_mkwrite); static void iomap_finish_folio_write(struct inode *inode, struct folio *folio, size_t len, int error) { - struct iomap_page *iop = to_iomap_page(folio); + struct iomap_folio_state *ifs = folio->private; if (error) { folio_set_error(folio); mapping_set_error(inode->i_mapping, error); } - WARN_ON_ONCE(i_blocks_per_folio(inode, folio) > 1 && !iop); - WARN_ON_ONCE(iop && atomic_read(&iop->write_bytes_pending) <= 0); + WARN_ON_ONCE(i_blocks_per_folio(inode, folio) > 1 && !ifs); + WARN_ON_ONCE(ifs && atomic_read(&ifs->write_bytes_pending) <= 0); - if (!iop || atomic_sub_and_test(len, &iop->write_bytes_pending)) + if (!ifs || atomic_sub_and_test(len, &ifs->write_bytes_pending)) folio_end_writeback(folio); } @@ -1570,7 +1743,7 @@ iomap_can_add_to_ioend(struct iomap_writepage_ctx *wpc, loff_t offset, */ static void iomap_add_to_ioend(struct inode *inode, loff_t pos, struct folio *folio, - struct iomap_page *iop, struct iomap_writepage_ctx *wpc, + struct iomap_folio_state *ifs, struct iomap_writepage_ctx *wpc, struct writeback_control *wbc, struct list_head *iolist) { sector_t sector = iomap_sector(&wpc->iomap, pos); @@ -1588,8 +1761,8 @@ iomap_add_to_ioend(struct inode *inode, loff_t pos, struct folio *folio, bio_add_folio_nofail(wpc->ioend->io_bio, folio, len, poff); } - if (iop) - atomic_add(len, &iop->write_bytes_pending); + if (ifs) + atomic_add(len, &ifs->write_bytes_pending); wpc->ioend->io_size += len; wbc_account_cgroup_owner(wbc, &folio->page, len); } @@ -1615,7 +1788,7 @@ iomap_writepage_map(struct iomap_writepage_ctx *wpc, struct writeback_control *wbc, struct inode *inode, struct folio *folio, u64 end_pos) { - struct iomap_page *iop = iomap_page_create(inode, folio, 0); + struct iomap_folio_state *ifs = folio->private; struct iomap_ioend *ioend, *next; unsigned len = i_blocksize(inode); unsigned nblocks = i_blocks_per_folio(inode, folio); @@ -1623,7 +1796,14 @@ iomap_writepage_map(struct iomap_writepage_ctx *wpc, int error = 0, count = 0, i; LIST_HEAD(submit_list); - WARN_ON_ONCE(iop && atomic_read(&iop->write_bytes_pending) != 0); + WARN_ON_ONCE(end_pos <= pos); + + if (!ifs && nblocks > 1) { + ifs = ifs_alloc(inode, folio, 0); + iomap_set_range_dirty(folio, 0, end_pos - pos); + } + + WARN_ON_ONCE(ifs && atomic_read(&ifs->write_bytes_pending) != 0); /* * Walk through the folio to find areas to write back. If we @@ -1631,7 +1811,7 @@ iomap_writepage_map(struct iomap_writepage_ctx *wpc, * invalid, grab a new one. */ for (i = 0; i < nblocks && pos < end_pos; i++, pos += len) { - if (iop && !test_bit(i, iop->uptodate)) + if (ifs && !ifs_block_is_dirty(folio, ifs, i)) continue; error = wpc->ops->map_blocks(wpc, inode, pos); @@ -1642,7 +1822,7 @@ iomap_writepage_map(struct iomap_writepage_ctx *wpc, continue; if (wpc->iomap.type == IOMAP_HOLE) continue; - iomap_add_to_ioend(inode, pos, folio, iop, wpc, wbc, + iomap_add_to_ioend(inode, pos, folio, ifs, wpc, wbc, &submit_list); count++; } @@ -1675,6 +1855,12 @@ iomap_writepage_map(struct iomap_writepage_ctx *wpc, } } + /* + * We can have dirty bits set past end of file in page_mkwrite path + * while mapping the last partial folio. Hence it's better to clear + * all the dirty bits in the folio here. + */ + iomap_clear_range_dirty(folio, 0, folio_size(folio)); folio_start_writeback(folio); folio_unlock(folio); diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c index ea3b868c8355..bcd3f8cf5ea4 100644 --- a/fs/iomap/direct-io.c +++ b/fs/iomap/direct-io.c @@ -20,10 +20,12 @@ * Private flags for iomap_dio, must not overlap with the public ones in * iomap.h: */ -#define IOMAP_DIO_WRITE_FUA (1 << 28) -#define IOMAP_DIO_NEED_SYNC (1 << 29) -#define IOMAP_DIO_WRITE (1 << 30) -#define IOMAP_DIO_DIRTY (1 << 31) +#define IOMAP_DIO_CALLER_COMP (1U << 26) +#define IOMAP_DIO_INLINE_COMP (1U << 27) +#define IOMAP_DIO_WRITE_THROUGH (1U << 28) +#define IOMAP_DIO_NEED_SYNC (1U << 29) +#define IOMAP_DIO_WRITE (1U << 30) +#define IOMAP_DIO_DIRTY (1U << 31) struct iomap_dio { struct kiocb *iocb; @@ -41,7 +43,6 @@ struct iomap_dio { struct { struct iov_iter *iter; struct task_struct *waiter; - struct bio *poll_bio; } submit; /* used for aio completion: */ @@ -63,12 +64,14 @@ static struct bio *iomap_dio_alloc_bio(const struct iomap_iter *iter, static void iomap_dio_submit_bio(const struct iomap_iter *iter, struct iomap_dio *dio, struct bio *bio, loff_t pos) { + struct kiocb *iocb = dio->iocb; + atomic_inc(&dio->ref); /* Sync dio can't be polled reliably */ - if ((dio->iocb->ki_flags & IOCB_HIPRI) && !is_sync_kiocb(dio->iocb)) { - bio_set_polled(bio, dio->iocb); - dio->submit.poll_bio = bio; + if ((iocb->ki_flags & IOCB_HIPRI) && !is_sync_kiocb(iocb)) { + bio_set_polled(bio, iocb); + WRITE_ONCE(iocb->private, bio); } if (dio->dops && dio->dops->submit_io) @@ -130,6 +133,11 @@ ssize_t iomap_dio_complete(struct iomap_dio *dio) } EXPORT_SYMBOL_GPL(iomap_dio_complete); +static ssize_t iomap_dio_deferred_complete(void *data) +{ + return iomap_dio_complete(data); +} + static void iomap_dio_complete_work(struct work_struct *work) { struct iomap_dio *dio = container_of(work, struct iomap_dio, aio.work); @@ -152,27 +160,69 @@ void iomap_dio_bio_end_io(struct bio *bio) { struct iomap_dio *dio = bio->bi_private; bool should_dirty = (dio->flags & IOMAP_DIO_DIRTY); + struct kiocb *iocb = dio->iocb; if (bio->bi_status) iomap_dio_set_error(dio, blk_status_to_errno(bio->bi_status)); + if (!atomic_dec_and_test(&dio->ref)) + goto release_bio; - if (atomic_dec_and_test(&dio->ref)) { - if (dio->wait_for_completion) { - struct task_struct *waiter = dio->submit.waiter; - WRITE_ONCE(dio->submit.waiter, NULL); - blk_wake_io_task(waiter); - } else if (dio->flags & IOMAP_DIO_WRITE) { - struct inode *inode = file_inode(dio->iocb->ki_filp); - - WRITE_ONCE(dio->iocb->private, NULL); - INIT_WORK(&dio->aio.work, iomap_dio_complete_work); - queue_work(inode->i_sb->s_dio_done_wq, &dio->aio.work); - } else { - WRITE_ONCE(dio->iocb->private, NULL); - iomap_dio_complete_work(&dio->aio.work); - } + /* + * Synchronous dio, task itself will handle any completion work + * that needs after IO. All we need to do is wake the task. + */ + if (dio->wait_for_completion) { + struct task_struct *waiter = dio->submit.waiter; + + WRITE_ONCE(dio->submit.waiter, NULL); + blk_wake_io_task(waiter); + goto release_bio; + } + + /* + * Flagged with IOMAP_DIO_INLINE_COMP, we can complete it inline + */ + if (dio->flags & IOMAP_DIO_INLINE_COMP) { + WRITE_ONCE(iocb->private, NULL); + iomap_dio_complete_work(&dio->aio.work); + goto release_bio; + } + + /* + * If this dio is flagged with IOMAP_DIO_CALLER_COMP, then schedule + * our completion that way to avoid an async punt to a workqueue. + */ + if (dio->flags & IOMAP_DIO_CALLER_COMP) { + /* only polled IO cares about private cleared */ + iocb->private = dio; + iocb->dio_complete = iomap_dio_deferred_complete; + + /* + * Invoke ->ki_complete() directly. We've assigned our + * dio_complete callback handler, and since the issuer set + * IOCB_DIO_CALLER_COMP, we know their ki_complete handler will + * notice ->dio_complete being set and will defer calling that + * handler until it can be done from a safe task context. + * + * Note that the 'res' being passed in here is not important + * for this case. The actual completion value of the request + * will be gotten from dio_complete when that is run by the + * issuer. + */ + iocb->ki_complete(iocb, 0); + goto release_bio; } + /* + * Async DIO completion that requires filesystem level completion work + * gets punted to a work queue to complete as the operation may require + * more IO to be issued to finalise filesystem metadata changes or + * guarantee data integrity. + */ + INIT_WORK(&dio->aio.work, iomap_dio_complete_work); + queue_work(file_inode(iocb->ki_filp)->i_sb->s_dio_done_wq, + &dio->aio.work); +release_bio: if (should_dirty) { bio_check_pages_dirty(bio); } else { @@ -203,7 +253,7 @@ static void iomap_dio_zero(const struct iomap_iter *iter, struct iomap_dio *dio, /* * Figure out the bio's operation flags from the dio request, the * mapping, and whether or not we want FUA. Note that we can end up - * clearing the WRITE_FUA flag in the dio request. + * clearing the WRITE_THROUGH flag in the dio request. */ static inline blk_opf_t iomap_dio_bio_opflags(struct iomap_dio *dio, const struct iomap *iomap, bool use_fua) @@ -217,7 +267,7 @@ static inline blk_opf_t iomap_dio_bio_opflags(struct iomap_dio *dio, if (use_fua) opflags |= REQ_FUA; else - dio->flags &= ~IOMAP_DIO_WRITE_FUA; + dio->flags &= ~IOMAP_DIO_WRITE_THROUGH; return opflags; } @@ -257,12 +307,19 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter, * Use a FUA write if we need datasync semantics, this is a pure * data IO that doesn't require any metadata updates (including * after IO completion such as unwritten extent conversion) and - * the underlying device supports FUA. This allows us to avoid - * cache flushes on IO completion. + * the underlying device either supports FUA or doesn't have + * a volatile write cache. This allows us to avoid cache flushes + * on IO completion. If we can't use writethrough and need to + * sync, disable in-task completions as dio completion will + * need to call generic_write_sync() which will do a blocking + * fsync / cache flush call. */ if (!(iomap->flags & (IOMAP_F_SHARED|IOMAP_F_DIRTY)) && - (dio->flags & IOMAP_DIO_WRITE_FUA) && bdev_fua(iomap->bdev)) + (dio->flags & IOMAP_DIO_WRITE_THROUGH) && + (bdev_fua(iomap->bdev) || !bdev_write_cache(iomap->bdev))) use_fua = true; + else if (dio->flags & IOMAP_DIO_NEED_SYNC) + dio->flags &= ~IOMAP_DIO_CALLER_COMP; } /* @@ -277,10 +334,23 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter, goto out; /* - * We can only poll for single bio I/Os. + * We can only do deferred completion for pure overwrites that + * don't require additional IO at completion. This rules out + * writes that need zeroing or extent conversion, extend + * the file size, or issue journal IO or cache flushes + * during completion processing. */ if (need_zeroout || + ((dio->flags & IOMAP_DIO_NEED_SYNC) && !use_fua) || ((dio->flags & IOMAP_DIO_WRITE) && pos >= i_size_read(inode))) + dio->flags &= ~IOMAP_DIO_CALLER_COMP; + + /* + * The rules for polled IO completions follow the guidelines as the + * ones we set for inline and deferred completions. If none of those + * are available for this IO, clear the polled flag. + */ + if (!(dio->flags & (IOMAP_DIO_INLINE_COMP|IOMAP_DIO_CALLER_COMP))) dio->iocb->ki_flags &= ~IOCB_HIPRI; if (need_zeroout) { @@ -505,12 +575,14 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, dio->submit.iter = iter; dio->submit.waiter = current; - dio->submit.poll_bio = NULL; if (iocb->ki_flags & IOCB_NOWAIT) iomi.flags |= IOMAP_NOWAIT; if (iov_iter_rw(iter) == READ) { + /* reads can always complete inline */ + dio->flags |= IOMAP_DIO_INLINE_COMP; + if (iomi.pos >= dio->i_size) goto out_free_dio; @@ -524,6 +596,15 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, iomi.flags |= IOMAP_WRITE; dio->flags |= IOMAP_DIO_WRITE; + /* + * Flag as supporting deferred completions, if the issuer + * groks it. This can avoid a workqueue punt for writes. + * We may later clear this flag if we need to do other IO + * as part of this IO completion. + */ + if (iocb->ki_flags & IOCB_DIO_CALLER_COMP) + dio->flags |= IOMAP_DIO_CALLER_COMP; + if (dio_flags & IOMAP_DIO_OVERWRITE_ONLY) { ret = -EAGAIN; if (iomi.pos >= dio->i_size || @@ -537,13 +618,16 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, dio->flags |= IOMAP_DIO_NEED_SYNC; /* - * For datasync only writes, we optimistically try - * using FUA for this IO. Any non-FUA write that - * occurs will clear this flag, hence we know before - * completion whether a cache flush is necessary. + * For datasync only writes, we optimistically try using + * WRITE_THROUGH for this IO. This flag requires either + * FUA writes through the device's write cache, or a + * normal write to a device without a volatile write + * cache. For the former, Any non-FUA write that occurs + * will clear this flag, hence we know before completion + * whether a cache flush is necessary. */ if (!(iocb->ki_flags & IOCB_SYNC)) - dio->flags |= IOMAP_DIO_WRITE_FUA; + dio->flags |= IOMAP_DIO_WRITE_THROUGH; } /* @@ -605,14 +689,13 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, iomap_dio_set_error(dio, ret); /* - * If all the writes we issued were FUA, we don't need to flush the - * cache on IO completion. Clear the sync flag for this case. + * If all the writes we issued were already written through to the + * media, we don't need to flush the cache on IO completion. Clear the + * sync flag for this case. */ - if (dio->flags & IOMAP_DIO_WRITE_FUA) + if (dio->flags & IOMAP_DIO_WRITE_THROUGH) dio->flags &= ~IOMAP_DIO_NEED_SYNC; - WRITE_ONCE(iocb->private, dio->submit.poll_bio); - /* * We are about to drop our additional submission reference, which * might be the last reference to the dio. There are three different diff --git a/fs/isofs/Kconfig b/fs/isofs/Kconfig index 08ffd37b9bb8..51434f2a471b 100644 --- a/fs/isofs/Kconfig +++ b/fs/isofs/Kconfig @@ -1,6 +1,7 @@ # SPDX-License-Identifier: GPL-2.0-only config ISO9660_FS tristate "ISO 9660 CDROM file system support" + select BUFFER_HEAD help This is the standard file system used on CD-ROMs. It was previously known as "High Sierra File System" and is called "hsfs" on other diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c index df9d70588b60..3e4d53e26f94 100644 --- a/fs/isofs/inode.c +++ b/fs/isofs/inode.c @@ -1422,13 +1422,8 @@ static int isofs_read_inode(struct inode *inode, int relocated) inode->i_ino, de->flags[-high_sierra]); } #endif - - inode->i_mtime.tv_sec = - inode->i_atime.tv_sec = - inode->i_ctime.tv_sec = iso_date(de->date, high_sierra); - inode->i_mtime.tv_nsec = - inode->i_atime.tv_nsec = - inode->i_ctime.tv_nsec = 0; + inode_set_mtime_to_ts(inode, + inode_set_atime_to_ts(inode, inode_set_ctime(inode, iso_date(de->date, high_sierra), 0))); ei->i_first_extent = (isonum_733(de->extent) + isonum_711(de->ext_attr_length)); diff --git a/fs/isofs/rock.c b/fs/isofs/rock.c index 48f58c6c9e69..d6c17ad69dee 100644 --- a/fs/isofs/rock.c +++ b/fs/isofs/rock.c @@ -421,28 +421,24 @@ repeat: /* Rock ridge never appears on a High Sierra disk */ cnt = 0; if (rr->u.TF.flags & TF_CREATE) { - inode->i_ctime.tv_sec = - iso_date(rr->u.TF.times[cnt++].time, - 0); - inode->i_ctime.tv_nsec = 0; + inode_set_ctime(inode, + iso_date(rr->u.TF.times[cnt++].time, 0), + 0); } if (rr->u.TF.flags & TF_MODIFY) { - inode->i_mtime.tv_sec = - iso_date(rr->u.TF.times[cnt++].time, - 0); - inode->i_mtime.tv_nsec = 0; + inode_set_mtime(inode, + iso_date(rr->u.TF.times[cnt++].time, 0), + 0); } if (rr->u.TF.flags & TF_ACCESS) { - inode->i_atime.tv_sec = - iso_date(rr->u.TF.times[cnt++].time, - 0); - inode->i_atime.tv_nsec = 0; + inode_set_atime(inode, + iso_date(rr->u.TF.times[cnt++].time, 0), + 0); } if (rr->u.TF.flags & TF_ATTRIBUTES) { - inode->i_ctime.tv_sec = - iso_date(rr->u.TF.times[cnt++].time, - 0); - inode->i_ctime.tv_nsec = 0; + inode_set_ctime(inode, + iso_date(rr->u.TF.times[cnt++].time, 0), + 0); } break; case SIG('S', 'L'): @@ -533,9 +529,9 @@ repeat: inode->i_rdev = reloc->i_rdev; inode->i_size = reloc->i_size; inode->i_blocks = reloc->i_blocks; - inode->i_atime = reloc->i_atime; - inode->i_ctime = reloc->i_ctime; - inode->i_mtime = reloc->i_mtime; + inode_set_atime_to_ts(inode, inode_get_atime(reloc)); + inode_set_ctime_to_ts(inode, inode_get_ctime(reloc)); + inode_set_mtime_to_ts(inode, inode_get_mtime(reloc)); iput(reloc); break; #ifdef CONFIG_ZISOFS diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c index 9ec91017a7f3..118699fff2f9 100644 --- a/fs/jbd2/checkpoint.c +++ b/fs/jbd2/checkpoint.c @@ -41,18 +41,6 @@ static inline void __buffer_unlink(struct journal_head *jh) } /* - * Check a checkpoint buffer could be release or not. - * - * Requires j_list_lock - */ -static inline bool __cp_buffer_busy(struct journal_head *jh) -{ - struct buffer_head *bh = jh2bh(jh); - - return (jh->b_transaction || buffer_locked(bh) || buffer_dirty(bh)); -} - -/* * __jbd2_log_wait_for_space: wait until there is space in the journal. * * Called under j-state_lock *only*. It will be unlocked if we have to wait @@ -349,6 +337,8 @@ int jbd2_cleanup_journal_tail(journal_t *journal) /* Checkpoint list management */ +enum shrink_type {SHRINK_DESTROY, SHRINK_BUSY_STOP, SHRINK_BUSY_SKIP}; + /* * journal_shrink_one_cp_list * @@ -360,7 +350,8 @@ int jbd2_cleanup_journal_tail(journal_t *journal) * Called with j_list_lock held. */ static unsigned long journal_shrink_one_cp_list(struct journal_head *jh, - bool destroy, bool *released) + enum shrink_type type, + bool *released) { struct journal_head *last_jh; struct journal_head *next_jh = jh; @@ -376,12 +367,15 @@ static unsigned long journal_shrink_one_cp_list(struct journal_head *jh, jh = next_jh; next_jh = jh->b_cpnext; - if (destroy) { + if (type == SHRINK_DESTROY) { ret = __jbd2_journal_remove_checkpoint(jh); } else { ret = jbd2_journal_try_remove_checkpoint(jh); - if (ret < 0) - continue; + if (ret < 0) { + if (type == SHRINK_BUSY_SKIP) + continue; + break; + } } nr_freed++; @@ -445,7 +439,7 @@ again: tid = transaction->t_tid; freed = journal_shrink_one_cp_list(transaction->t_checkpoint_list, - false, &released); + SHRINK_BUSY_SKIP, &released); nr_freed += freed; (*nr_to_scan) -= min(*nr_to_scan, freed); if (*nr_to_scan == 0) @@ -485,19 +479,21 @@ out: void __jbd2_journal_clean_checkpoint_list(journal_t *journal, bool destroy) { transaction_t *transaction, *last_transaction, *next_transaction; + enum shrink_type type; bool released; transaction = journal->j_checkpoint_transactions; if (!transaction) return; + type = destroy ? SHRINK_DESTROY : SHRINK_BUSY_STOP; last_transaction = transaction->t_cpprev; next_transaction = transaction; do { transaction = next_transaction; next_transaction = transaction->t_cpnext; journal_shrink_one_cp_list(transaction->t_checkpoint_list, - destroy, &released); + type, &released); /* * This function only frees up some memory if possible so we * dont have an obligation to finish processing. Bail out if @@ -631,6 +627,8 @@ int jbd2_journal_try_remove_checkpoint(struct journal_head *jh) { struct buffer_head *bh = jh2bh(jh); + if (jh->b_transaction) + return -EBUSY; if (!trylock_buffer(bh)) return -EBUSY; if (buffer_dirty(bh)) { diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index 1073259902a6..5e122586e06e 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -119,7 +119,7 @@ static int journal_submit_commit_record(journal_t *journal, struct commit_header *tmp; struct buffer_head *bh; struct timespec64 now; - blk_opf_t write_flags = REQ_OP_WRITE | REQ_SYNC; + blk_opf_t write_flags = REQ_OP_WRITE | JBD2_JOURNAL_REQ_FLAGS; *cbh = NULL; @@ -270,6 +270,7 @@ static int journal_finish_inode_data_buffers(journal_t *journal, if (!ret) ret = err; } + cond_resched(); spin_lock(&journal->j_list_lock); jinode->i_flags &= ~JI_COMMIT_RUNNING; smp_mb(); @@ -298,14 +299,12 @@ static int journal_finish_inode_data_buffers(journal_t *journal, static __u32 jbd2_checksum_data(__u32 crc32_sum, struct buffer_head *bh) { - struct page *page = bh->b_page; char *addr; __u32 checksum; - addr = kmap_atomic(page); - checksum = crc32_be(crc32_sum, - (void *)(addr + offset_in_page(bh->b_data)), bh->b_size); - kunmap_atomic(addr); + addr = kmap_local_folio(bh->b_folio, bh_offset(bh)); + checksum = crc32_be(crc32_sum, addr, bh->b_size); + kunmap_local(addr); return checksum; } @@ -322,7 +321,6 @@ static void jbd2_block_tag_csum_set(journal_t *j, journal_block_tag_t *tag, struct buffer_head *bh, __u32 sequence) { journal_block_tag3_t *tag3 = (journal_block_tag3_t *)tag; - struct page *page = bh->b_page; __u8 *addr; __u32 csum32; __be32 seq; @@ -331,11 +329,10 @@ static void jbd2_block_tag_csum_set(journal_t *j, journal_block_tag_t *tag, return; seq = cpu_to_be32(sequence); - addr = kmap_atomic(page); + addr = kmap_local_folio(bh->b_folio, bh_offset(bh)); csum32 = jbd2_chksum(j, j->j_csum_seed, (__u8 *)&seq, sizeof(seq)); - csum32 = jbd2_chksum(j, csum32, addr + offset_in_page(bh->b_data), - bh->b_size); - kunmap_atomic(addr); + csum32 = jbd2_chksum(j, csum32, addr, bh->b_size); + kunmap_local(addr); if (jbd2_has_feature_csum3(j)) tag3->t_checksum = cpu_to_be32(csum32); @@ -399,8 +396,7 @@ void jbd2_journal_commit_transaction(journal_t *journal) */ jbd2_journal_update_sb_log_tail(journal, journal->j_tail_sequence, - journal->j_tail, - REQ_SYNC); + journal->j_tail, 0); mutex_unlock(&journal->j_checkpoint_mutex); } else { jbd2_debug(3, "superblock not updated\n"); @@ -719,6 +715,7 @@ start_journal_io: for (i = 0; i < bufs; i++) { struct buffer_head *bh = wbuf[i]; + /* * Compute checksum. */ @@ -731,7 +728,8 @@ start_journal_io: clear_buffer_dirty(bh); set_buffer_uptodate(bh); bh->b_end_io = journal_end_buffer_io_sync; - submit_bh(REQ_OP_WRITE | REQ_SYNC, bh); + submit_bh(REQ_OP_WRITE | JBD2_JOURNAL_REQ_FLAGS, + bh); } cond_resched(); diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index fbce16fedaa4..206cb53ef2b0 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -115,14 +115,6 @@ void __jbd2_debug(int level, const char *file, const char *func, #endif /* Checksumming functions */ -static int jbd2_verify_csum_type(journal_t *j, journal_superblock_t *sb) -{ - if (!jbd2_journal_has_csum_v2or3_feature(j)) - return 1; - - return sb->s_checksum_type == JBD2_CRC32C_CHKSUM; -} - static __be32 jbd2_superblock_csum(journal_t *j, journal_superblock_t *sb) { __u32 csum; @@ -341,7 +333,7 @@ int jbd2_journal_write_metadata_buffer(transaction_t *transaction, int do_escape = 0; char *mapped_data; struct buffer_head *new_bh; - struct page *new_page; + struct folio *new_folio; unsigned int new_offset; struct buffer_head *bh_in = jh2bh(jh_in); journal_t *journal = transaction->t_journal; @@ -370,14 +362,14 @@ repeat: */ if (jh_in->b_frozen_data) { done_copy_out = 1; - new_page = virt_to_page(jh_in->b_frozen_data); - new_offset = offset_in_page(jh_in->b_frozen_data); + new_folio = virt_to_folio(jh_in->b_frozen_data); + new_offset = offset_in_folio(new_folio, jh_in->b_frozen_data); } else { - new_page = jh2bh(jh_in)->b_page; - new_offset = offset_in_page(jh2bh(jh_in)->b_data); + new_folio = jh2bh(jh_in)->b_folio; + new_offset = offset_in_folio(new_folio, jh2bh(jh_in)->b_data); } - mapped_data = kmap_atomic(new_page); + mapped_data = kmap_local_folio(new_folio, new_offset); /* * Fire data frozen trigger if data already wasn't frozen. Do this * before checking for escaping, as the trigger may modify the magic @@ -385,18 +377,17 @@ repeat: * data in the buffer. */ if (!done_copy_out) - jbd2_buffer_frozen_trigger(jh_in, mapped_data + new_offset, + jbd2_buffer_frozen_trigger(jh_in, mapped_data, jh_in->b_triggers); /* * Check for escaping */ - if (*((__be32 *)(mapped_data + new_offset)) == - cpu_to_be32(JBD2_MAGIC_NUMBER)) { + if (*((__be32 *)mapped_data) == cpu_to_be32(JBD2_MAGIC_NUMBER)) { need_copy_out = 1; do_escape = 1; } - kunmap_atomic(mapped_data); + kunmap_local(mapped_data); /* * Do we need to do a data copy? @@ -417,12 +408,10 @@ repeat: } jh_in->b_frozen_data = tmp; - mapped_data = kmap_atomic(new_page); - memcpy(tmp, mapped_data + new_offset, bh_in->b_size); - kunmap_atomic(mapped_data); + memcpy_from_folio(tmp, new_folio, new_offset, bh_in->b_size); - new_page = virt_to_page(tmp); - new_offset = offset_in_page(tmp); + new_folio = virt_to_folio(tmp); + new_offset = offset_in_folio(new_folio, tmp); done_copy_out = 1; /* @@ -438,12 +427,12 @@ repeat: * copying, we can finally do so. */ if (do_escape) { - mapped_data = kmap_atomic(new_page); - *((unsigned int *)(mapped_data + new_offset)) = 0; - kunmap_atomic(mapped_data); + mapped_data = kmap_local_folio(new_folio, new_offset); + *((unsigned int *)mapped_data) = 0; + kunmap_local(mapped_data); } - set_bh_page(new_bh, new_page, new_offset); + folio_set_bh(new_bh, new_folio, new_offset); new_bh->b_size = bh_in->b_size; new_bh->b_bdev = journal->j_dev; new_bh->b_blocknr = blocknr; @@ -1111,8 +1100,7 @@ int __jbd2_update_log_tail(journal_t *journal, tid_t tid, unsigned long block) * space and if we lose sb update during power failure we'd replay * old transaction with possibly newly overwritten data. */ - ret = jbd2_journal_update_sb_log_tail(journal, tid, block, - REQ_SYNC | REQ_FUA); + ret = jbd2_journal_update_sb_log_tail(journal, tid, block, REQ_FUA); if (ret) goto out; @@ -1301,7 +1289,7 @@ static int jbd2_min_tag_size(void) static unsigned long jbd2_journal_shrink_scan(struct shrinker *shrink, struct shrink_control *sc) { - journal_t *journal = container_of(shrink, journal_t, j_shrinker); + journal_t *journal = shrink->private_data; unsigned long nr_to_scan = sc->nr_to_scan; unsigned long nr_shrunk; unsigned long count; @@ -1327,7 +1315,7 @@ static unsigned long jbd2_journal_shrink_scan(struct shrinker *shrink, static unsigned long jbd2_journal_shrink_count(struct shrinker *shrink, struct shrink_control *sc) { - journal_t *journal = container_of(shrink, journal_t, j_shrinker); + journal_t *journal = shrink->private_data; unsigned long count; count = percpu_counter_read_positive(&journal->j_checkpoint_jh_count); @@ -1337,6 +1325,189 @@ static unsigned long jbd2_journal_shrink_count(struct shrinker *shrink, } /* + * If the journal init or create aborts, we need to mark the journal + * superblock as being NULL to prevent the journal destroy from writing + * back a bogus superblock. + */ +static void journal_fail_superblock(journal_t *journal) +{ + struct buffer_head *bh = journal->j_sb_buffer; + brelse(bh); + journal->j_sb_buffer = NULL; +} + +/* + * Check the superblock for a given journal, performing initial + * validation of the format. + */ +static int journal_check_superblock(journal_t *journal) +{ + journal_superblock_t *sb = journal->j_superblock; + int num_fc_blks; + int err = -EINVAL; + + if (sb->s_header.h_magic != cpu_to_be32(JBD2_MAGIC_NUMBER) || + sb->s_blocksize != cpu_to_be32(journal->j_blocksize)) { + printk(KERN_WARNING "JBD2: no valid journal superblock found\n"); + return err; + } + + if (be32_to_cpu(sb->s_header.h_blocktype) != JBD2_SUPERBLOCK_V1 && + be32_to_cpu(sb->s_header.h_blocktype) != JBD2_SUPERBLOCK_V2) { + printk(KERN_WARNING "JBD2: unrecognised superblock format ID\n"); + return err; + } + + if (be32_to_cpu(sb->s_maxlen) > journal->j_total_len) { + printk(KERN_WARNING "JBD2: journal file too short\n"); + return err; + } + + if (be32_to_cpu(sb->s_first) == 0 || + be32_to_cpu(sb->s_first) >= journal->j_total_len) { + printk(KERN_WARNING + "JBD2: Invalid start block of journal: %u\n", + be32_to_cpu(sb->s_first)); + return err; + } + + /* + * If this is a V2 superblock, then we have to check the + * features flags on it. + */ + if (!jbd2_format_support_feature(journal)) + return 0; + + if ((sb->s_feature_ro_compat & + ~cpu_to_be32(JBD2_KNOWN_ROCOMPAT_FEATURES)) || + (sb->s_feature_incompat & + ~cpu_to_be32(JBD2_KNOWN_INCOMPAT_FEATURES))) { + printk(KERN_WARNING "JBD2: Unrecognised features on journal\n"); + return err; + } + + num_fc_blks = jbd2_has_feature_fast_commit(journal) ? + jbd2_journal_get_num_fc_blks(sb) : 0; + if (be32_to_cpu(sb->s_maxlen) < JBD2_MIN_JOURNAL_BLOCKS || + be32_to_cpu(sb->s_maxlen) - JBD2_MIN_JOURNAL_BLOCKS < num_fc_blks) { + printk(KERN_ERR "JBD2: journal file too short %u,%d\n", + be32_to_cpu(sb->s_maxlen), num_fc_blks); + return err; + } + + if (jbd2_has_feature_csum2(journal) && + jbd2_has_feature_csum3(journal)) { + /* Can't have checksum v2 and v3 at the same time! */ + printk(KERN_ERR "JBD2: Can't enable checksumming v2 and v3 " + "at the same time!\n"); + return err; + } + + if (jbd2_journal_has_csum_v2or3_feature(journal) && + jbd2_has_feature_checksum(journal)) { + /* Can't have checksum v1 and v2 on at the same time! */ + printk(KERN_ERR "JBD2: Can't enable checksumming v1 and v2/3 " + "at the same time!\n"); + return err; + } + + /* Load the checksum driver */ + if (jbd2_journal_has_csum_v2or3_feature(journal)) { + if (sb->s_checksum_type != JBD2_CRC32C_CHKSUM) { + printk(KERN_ERR "JBD2: Unknown checksum type\n"); + return err; + } + + journal->j_chksum_driver = crypto_alloc_shash("crc32c", 0, 0); + if (IS_ERR(journal->j_chksum_driver)) { + printk(KERN_ERR "JBD2: Cannot load crc32c driver.\n"); + err = PTR_ERR(journal->j_chksum_driver); + journal->j_chksum_driver = NULL; + return err; + } + /* Check superblock checksum */ + if (sb->s_checksum != jbd2_superblock_csum(journal, sb)) { + printk(KERN_ERR "JBD2: journal checksum error\n"); + err = -EFSBADCRC; + return err; + } + } + + return 0; +} + +static int journal_revoke_records_per_block(journal_t *journal) +{ + int record_size; + int space = journal->j_blocksize - sizeof(jbd2_journal_revoke_header_t); + + if (jbd2_has_feature_64bit(journal)) + record_size = 8; + else + record_size = 4; + + if (jbd2_journal_has_csum_v2or3(journal)) + space -= sizeof(struct jbd2_journal_block_tail); + return space / record_size; +} + +/* + * Load the on-disk journal superblock and read the key fields into the + * journal_t. + */ +static int journal_load_superblock(journal_t *journal) +{ + int err; + struct buffer_head *bh; + journal_superblock_t *sb; + + bh = getblk_unmovable(journal->j_dev, journal->j_blk_offset, + journal->j_blocksize); + if (bh) + err = bh_read(bh, 0); + if (!bh || err < 0) { + pr_err("%s: Cannot read journal superblock\n", __func__); + brelse(bh); + return -EIO; + } + + journal->j_sb_buffer = bh; + sb = (journal_superblock_t *)bh->b_data; + journal->j_superblock = sb; + err = journal_check_superblock(journal); + if (err) { + journal_fail_superblock(journal); + return err; + } + + journal->j_tail_sequence = be32_to_cpu(sb->s_sequence); + journal->j_tail = be32_to_cpu(sb->s_start); + journal->j_first = be32_to_cpu(sb->s_first); + journal->j_errno = be32_to_cpu(sb->s_errno); + journal->j_last = be32_to_cpu(sb->s_maxlen); + + if (be32_to_cpu(sb->s_maxlen) < journal->j_total_len) + journal->j_total_len = be32_to_cpu(sb->s_maxlen); + /* Precompute checksum seed for all metadata */ + if (jbd2_journal_has_csum_v2or3(journal)) + journal->j_csum_seed = jbd2_chksum(journal, ~0, sb->s_uuid, + sizeof(sb->s_uuid)); + journal->j_revoke_records_per_block = + journal_revoke_records_per_block(journal); + + if (jbd2_has_feature_fast_commit(journal)) { + journal->j_fc_last = be32_to_cpu(sb->s_maxlen); + journal->j_last = journal->j_fc_last - + jbd2_journal_get_num_fc_blks(sb); + journal->j_fc_first = journal->j_last + 1; + journal->j_fc_off = 0; + } + + return 0; +} + + +/* * Management for journal control blocks: functions to create and * destroy journal_t structures, and to initialise and read existing * journal blocks from disk. */ @@ -1352,12 +1523,21 @@ static journal_t *journal_init_common(struct block_device *bdev, static struct lock_class_key jbd2_trans_commit_key; journal_t *journal; int err; - struct buffer_head *bh; int n; journal = kzalloc(sizeof(*journal), GFP_KERNEL); if (!journal) - return NULL; + return ERR_PTR(-ENOMEM); + + journal->j_blocksize = blocksize; + journal->j_dev = bdev; + journal->j_fs_dev = fs_dev; + journal->j_blk_offset = start; + journal->j_total_len = len; + + err = journal_load_superblock(journal); + if (err) + goto err_cleanup; init_waitqueue_head(&journal->j_wait_transaction_locked); init_waitqueue_head(&journal->j_wait_done_commit); @@ -1370,12 +1550,15 @@ static journal_t *journal_init_common(struct block_device *bdev, mutex_init(&journal->j_checkpoint_mutex); spin_lock_init(&journal->j_revoke_lock); spin_lock_init(&journal->j_list_lock); + spin_lock_init(&journal->j_history_lock); rwlock_init(&journal->j_state_lock); journal->j_commit_interval = (HZ * JBD2_DEFAULT_MAX_COMMIT_AGE); journal->j_min_batch_time = 0; journal->j_max_batch_time = 15000; /* 15ms */ atomic_set(&journal->j_reserved_credits, 0); + lockdep_init_map(&journal->j_trans_commit_map, "jbd2_handle", + &jbd2_trans_commit_key, 0); /* The journal is marked for error until we succeed with recovery! */ journal->j_flags = JBD2_ABORT; @@ -1385,18 +1568,11 @@ static journal_t *journal_init_common(struct block_device *bdev, if (err) goto err_cleanup; - spin_lock_init(&journal->j_history_lock); - - lockdep_init_map(&journal->j_trans_commit_map, "jbd2_handle", - &jbd2_trans_commit_key, 0); - - /* journal descriptor can store up to n blocks -bzzz */ - journal->j_blocksize = blocksize; - journal->j_dev = bdev; - journal->j_fs_dev = fs_dev; - journal->j_blk_offset = start; - journal->j_total_len = len; - /* We need enough buffers to write out full descriptor block. */ + /* + * journal descriptor can store up to n blocks, we need enough + * buffers to write out full descriptor block. + */ + err = -ENOMEM; n = journal->j_blocksize / jbd2_min_tag_size(); journal->j_wbufsize = n; journal->j_fc_wbuf = NULL; @@ -1405,37 +1581,39 @@ static journal_t *journal_init_common(struct block_device *bdev, if (!journal->j_wbuf) goto err_cleanup; - bh = getblk_unmovable(journal->j_dev, start, journal->j_blocksize); - if (!bh) { - pr_err("%s: Cannot get buffer for journal superblock\n", - __func__); + err = percpu_counter_init(&journal->j_checkpoint_jh_count, 0, + GFP_KERNEL); + if (err) goto err_cleanup; - } - journal->j_sb_buffer = bh; - journal->j_superblock = (journal_superblock_t *)bh->b_data; journal->j_shrink_transaction = NULL; - journal->j_shrinker.scan_objects = jbd2_journal_shrink_scan; - journal->j_shrinker.count_objects = jbd2_journal_shrink_count; - journal->j_shrinker.seeks = DEFAULT_SEEKS; - journal->j_shrinker.batch = journal->j_max_transaction_buffers; - - if (percpu_counter_init(&journal->j_checkpoint_jh_count, 0, GFP_KERNEL)) - goto err_cleanup; - if (register_shrinker(&journal->j_shrinker, "jbd2-journal:(%u:%u)", - MAJOR(bdev->bd_dev), MINOR(bdev->bd_dev))) { - percpu_counter_destroy(&journal->j_checkpoint_jh_count); + journal->j_shrinker = shrinker_alloc(0, "jbd2-journal:(%u:%u)", + MAJOR(bdev->bd_dev), + MINOR(bdev->bd_dev)); + if (!journal->j_shrinker) { + err = -ENOMEM; goto err_cleanup; } + + journal->j_shrinker->scan_objects = jbd2_journal_shrink_scan; + journal->j_shrinker->count_objects = jbd2_journal_shrink_count; + journal->j_shrinker->batch = journal->j_max_transaction_buffers; + journal->j_shrinker->private_data = journal; + + shrinker_register(journal->j_shrinker); + return journal; err_cleanup: - brelse(journal->j_sb_buffer); + percpu_counter_destroy(&journal->j_checkpoint_jh_count); + if (journal->j_chksum_driver) + crypto_free_shash(journal->j_chksum_driver); kfree(journal->j_wbuf); jbd2_journal_destroy_revoke(journal); + journal_fail_superblock(journal); kfree(journal); - return NULL; + return ERR_PTR(err); } /* jbd2_journal_init_dev and jbd2_journal_init_inode: @@ -1468,8 +1646,8 @@ journal_t *jbd2_journal_init_dev(struct block_device *bdev, journal_t *journal; journal = journal_init_common(bdev, fs_dev, start, len, blocksize); - if (!journal) - return NULL; + if (IS_ERR(journal)) + return ERR_CAST(journal); snprintf(journal->j_devname, sizeof(journal->j_devname), "%pg", journal->j_dev); @@ -1495,11 +1673,9 @@ journal_t *jbd2_journal_init_inode(struct inode *inode) blocknr = 0; err = bmap(inode, &blocknr); - if (err || !blocknr) { - pr_err("%s: Cannot locate journal superblock\n", - __func__); - return NULL; + pr_err("%s: Cannot locate journal superblock\n", __func__); + return err ? ERR_PTR(err) : ERR_PTR(-EINVAL); } jbd2_debug(1, "JBD2: inode %s/%ld, size %lld, bits %d, blksize %ld\n", @@ -1509,8 +1685,8 @@ journal_t *jbd2_journal_init_inode(struct inode *inode) journal = journal_init_common(inode->i_sb->s_bdev, inode->i_sb->s_bdev, blocknr, inode->i_size >> inode->i_sb->s_blocksize_bits, inode->i_sb->s_blocksize); - if (!journal) - return NULL; + if (IS_ERR(journal)) + return ERR_CAST(journal); journal->j_inode = inode; snprintf(journal->j_devname, sizeof(journal->j_devname), @@ -1522,18 +1698,6 @@ journal_t *jbd2_journal_init_inode(struct inode *inode) } /* - * If the journal init or create aborts, we need to mark the journal - * superblock as being NULL to prevent the journal destroy from writing - * back a bogus superblock. - */ -static void journal_fail_superblock(journal_t *journal) -{ - struct buffer_head *bh = journal->j_sb_buffer; - brelse(bh); - journal->j_sb_buffer = NULL; -} - -/* * Given a journal_t structure, initialise the various fields for * startup of a new journaling session. We use this both when creating * a journal, and after recovering an old journal to reset it for @@ -1610,8 +1774,7 @@ static int journal_reset(journal_t *journal) */ jbd2_journal_update_sb_log_tail(journal, journal->j_tail_sequence, - journal->j_tail, - REQ_SYNC | REQ_FUA); + journal->j_tail, REQ_FUA); mutex_unlock(&journal->j_checkpoint_mutex); } return jbd2_journal_start_thread(journal); @@ -1633,9 +1796,16 @@ static int jbd2_write_superblock(journal_t *journal, blk_opf_t write_flags) return -EIO; } - trace_jbd2_write_superblock(journal, write_flags); + /* + * Always set high priority flags to exempt from block layer's + * QOS policies, e.g. writeback throttle. + */ + write_flags |= JBD2_JOURNAL_REQ_FLAGS; if (!(journal->j_flags & JBD2_BARRIER)) write_flags &= ~(REQ_FUA | REQ_PREFLUSH); + + trace_jbd2_write_superblock(journal, write_flags); + if (buffer_write_io_error(bh)) { /* * Oh, dear. A previous attempt to write the journal @@ -1885,167 +2055,10 @@ void jbd2_journal_update_sb_errno(journal_t *journal) jbd2_debug(1, "JBD2: updating superblock error (errno %d)\n", errcode); sb->s_errno = cpu_to_be32(errcode); - jbd2_write_superblock(journal, REQ_SYNC | REQ_FUA); + jbd2_write_superblock(journal, REQ_FUA); } EXPORT_SYMBOL(jbd2_journal_update_sb_errno); -static int journal_revoke_records_per_block(journal_t *journal) -{ - int record_size; - int space = journal->j_blocksize - sizeof(jbd2_journal_revoke_header_t); - - if (jbd2_has_feature_64bit(journal)) - record_size = 8; - else - record_size = 4; - - if (jbd2_journal_has_csum_v2or3(journal)) - space -= sizeof(struct jbd2_journal_block_tail); - return space / record_size; -} - -/* - * Read the superblock for a given journal, performing initial - * validation of the format. - */ -static int journal_get_superblock(journal_t *journal) -{ - struct buffer_head *bh; - journal_superblock_t *sb; - int err; - - bh = journal->j_sb_buffer; - - J_ASSERT(bh != NULL); - if (buffer_verified(bh)) - return 0; - - err = bh_read(bh, 0); - if (err < 0) { - printk(KERN_ERR - "JBD2: IO error reading journal superblock\n"); - goto out; - } - - sb = journal->j_superblock; - - err = -EINVAL; - - if (sb->s_header.h_magic != cpu_to_be32(JBD2_MAGIC_NUMBER) || - sb->s_blocksize != cpu_to_be32(journal->j_blocksize)) { - printk(KERN_WARNING "JBD2: no valid journal superblock found\n"); - goto out; - } - - if (be32_to_cpu(sb->s_header.h_blocktype) != JBD2_SUPERBLOCK_V1 && - be32_to_cpu(sb->s_header.h_blocktype) != JBD2_SUPERBLOCK_V2) { - printk(KERN_WARNING "JBD2: unrecognised superblock format ID\n"); - goto out; - } - - if (be32_to_cpu(sb->s_maxlen) > journal->j_total_len) { - printk(KERN_WARNING "JBD2: journal file too short\n"); - goto out; - } - - if (be32_to_cpu(sb->s_first) == 0 || - be32_to_cpu(sb->s_first) >= journal->j_total_len) { - printk(KERN_WARNING - "JBD2: Invalid start block of journal: %u\n", - be32_to_cpu(sb->s_first)); - goto out; - } - - if (jbd2_has_feature_csum2(journal) && - jbd2_has_feature_csum3(journal)) { - /* Can't have checksum v2 and v3 at the same time! */ - printk(KERN_ERR "JBD2: Can't enable checksumming v2 and v3 " - "at the same time!\n"); - goto out; - } - - if (jbd2_journal_has_csum_v2or3_feature(journal) && - jbd2_has_feature_checksum(journal)) { - /* Can't have checksum v1 and v2 on at the same time! */ - printk(KERN_ERR "JBD2: Can't enable checksumming v1 and v2/3 " - "at the same time!\n"); - goto out; - } - - if (!jbd2_verify_csum_type(journal, sb)) { - printk(KERN_ERR "JBD2: Unknown checksum type\n"); - goto out; - } - - /* Load the checksum driver */ - if (jbd2_journal_has_csum_v2or3_feature(journal)) { - journal->j_chksum_driver = crypto_alloc_shash("crc32c", 0, 0); - if (IS_ERR(journal->j_chksum_driver)) { - printk(KERN_ERR "JBD2: Cannot load crc32c driver.\n"); - err = PTR_ERR(journal->j_chksum_driver); - journal->j_chksum_driver = NULL; - goto out; - } - /* Check superblock checksum */ - if (sb->s_checksum != jbd2_superblock_csum(journal, sb)) { - printk(KERN_ERR "JBD2: journal checksum error\n"); - err = -EFSBADCRC; - goto out; - } - } - set_buffer_verified(bh); - return 0; - -out: - journal_fail_superblock(journal); - return err; -} - -/* - * Load the on-disk journal superblock and read the key fields into the - * journal_t. - */ - -static int load_superblock(journal_t *journal) -{ - int err; - journal_superblock_t *sb; - int num_fc_blocks; - - err = journal_get_superblock(journal); - if (err) - return err; - - sb = journal->j_superblock; - - journal->j_tail_sequence = be32_to_cpu(sb->s_sequence); - journal->j_tail = be32_to_cpu(sb->s_start); - journal->j_first = be32_to_cpu(sb->s_first); - journal->j_errno = be32_to_cpu(sb->s_errno); - journal->j_last = be32_to_cpu(sb->s_maxlen); - - if (be32_to_cpu(sb->s_maxlen) < journal->j_total_len) - journal->j_total_len = be32_to_cpu(sb->s_maxlen); - /* Precompute checksum seed for all metadata */ - if (jbd2_journal_has_csum_v2or3(journal)) - journal->j_csum_seed = jbd2_chksum(journal, ~0, sb->s_uuid, - sizeof(sb->s_uuid)); - journal->j_revoke_records_per_block = - journal_revoke_records_per_block(journal); - - if (jbd2_has_feature_fast_commit(journal)) { - journal->j_fc_last = be32_to_cpu(sb->s_maxlen); - num_fc_blocks = jbd2_journal_get_num_fc_blks(sb); - if (journal->j_last - num_fc_blocks >= JBD2_MIN_JOURNAL_BLOCKS) - journal->j_last = journal->j_fc_last - num_fc_blocks; - journal->j_fc_first = journal->j_last + 1; - journal->j_fc_off = 0; - } - - return 0; -} - - /** * jbd2_journal_load() - Read journal from disk. * @journal: Journal to act on. @@ -2057,28 +2070,7 @@ static int load_superblock(journal_t *journal) int jbd2_journal_load(journal_t *journal) { int err; - journal_superblock_t *sb; - - err = load_superblock(journal); - if (err) - return err; - - sb = journal->j_superblock; - - /* - * If this is a V2 superblock, then we have to check the - * features flags on it. - */ - if (jbd2_format_support_feature(journal)) { - if ((sb->s_feature_ro_compat & - ~cpu_to_be32(JBD2_KNOWN_ROCOMPAT_FEATURES)) || - (sb->s_feature_incompat & - ~cpu_to_be32(JBD2_KNOWN_INCOMPAT_FEATURES))) { - printk(KERN_WARNING - "JBD2: Unrecognised features on journal\n"); - return -EINVAL; - } - } + journal_superblock_t *sb = journal->j_superblock; /* * Create a slab for this blocksize @@ -2089,8 +2081,11 @@ int jbd2_journal_load(journal_t *journal) /* Let the recovery code check whether it needs to recover any * data from the journal. */ - if (jbd2_journal_recover(journal)) - goto recovery_error; + err = jbd2_journal_recover(journal); + if (err) { + pr_warn("JBD2: journal recovery failed\n"); + return err; + } if (journal->j_failed_commit) { printk(KERN_ERR "JBD2: journal transaction %u on %s " @@ -2107,15 +2102,14 @@ int jbd2_journal_load(journal_t *journal) /* OK, we've finished with the dynamic journal bits: * reinitialise the dynamic contents of the superblock in memory * and reset them on disk. */ - if (journal_reset(journal)) - goto recovery_error; + err = journal_reset(journal); + if (err) { + pr_warn("JBD2: journal reset failed\n"); + return err; + } journal->j_flags |= JBD2_LOADED; return 0; - -recovery_error: - printk(KERN_WARNING "JBD2: recovery failed\n"); - return -EIO; } /** @@ -2182,17 +2176,16 @@ int jbd2_journal_destroy(journal_t *journal) ++journal->j_transaction_sequence; write_unlock(&journal->j_state_lock); - jbd2_mark_journal_empty(journal, - REQ_SYNC | REQ_PREFLUSH | REQ_FUA); + jbd2_mark_journal_empty(journal, REQ_PREFLUSH | REQ_FUA); mutex_unlock(&journal->j_checkpoint_mutex); } else err = -EIO; brelse(journal->j_sb_buffer); } - if (journal->j_shrinker.flags & SHRINKER_REGISTERED) { + if (journal->j_shrinker) { percpu_counter_destroy(&journal->j_checkpoint_jh_count); - unregister_shrinker(&journal->j_shrinker); + shrinker_free(journal->j_shrinker); } if (journal->j_proc_entry) jbd2_stats_proc_exit(journal); @@ -2227,8 +2220,6 @@ int jbd2_journal_check_used_features(journal_t *journal, unsigned long compat, if (!compat && !ro && !incompat) return 1; - if (journal_get_superblock(journal)) - return 0; if (!jbd2_format_support_feature(journal)) return 0; @@ -2486,7 +2477,7 @@ int jbd2_journal_flush(journal_t *journal, unsigned int flags) * the magic code for a fully-recovered superblock. Any future * commits of data to the journal will restore the current * s_start value. */ - jbd2_mark_journal_empty(journal, REQ_SYNC | REQ_FUA); + jbd2_mark_journal_empty(journal, REQ_FUA); if (flags) err = __jbd2_journal_erase(journal, flags); @@ -2518,16 +2509,12 @@ out: int jbd2_journal_wipe(journal_t *journal, int write) { - int err = 0; + int err; J_ASSERT (!(journal->j_flags & JBD2_LOADED)); - err = load_superblock(journal); - if (err) - return err; - if (!journal->j_tail) - goto no_recovery; + return 0; printk(KERN_WARNING "JBD2: %s recovery information on journal\n", write ? "Clearing" : "Ignoring"); @@ -2536,11 +2523,10 @@ int jbd2_journal_wipe(journal_t *journal, int write) if (write) { /* Lock to make assertions happy... */ mutex_lock_io(&journal->j_checkpoint_mutex); - jbd2_mark_journal_empty(journal, REQ_SYNC | REQ_FUA); + jbd2_mark_journal_empty(journal, REQ_FUA); mutex_unlock(&journal->j_checkpoint_mutex); } - no_recovery: return err; } diff --git a/fs/jbd2/recovery.c b/fs/jbd2/recovery.c index 0184931d47f7..01f744cb97a4 100644 --- a/fs/jbd2/recovery.c +++ b/fs/jbd2/recovery.c @@ -230,12 +230,8 @@ static int count_tags(journal_t *journal, struct buffer_head *bh) /* Make sure we wrap around the log correctly! */ #define wrap(journal, var) \ do { \ - unsigned long _wrap_last = \ - jbd2_has_feature_fast_commit(journal) ? \ - (journal)->j_fc_last : (journal)->j_last; \ - \ - if (var >= _wrap_last) \ - var -= (_wrap_last - (journal)->j_first); \ + if (var >= (journal)->j_last) \ + var -= ((journal)->j_last - (journal)->j_first); \ } while (0) static int fc_do_one_pass(journal_t *journal, @@ -293,6 +289,8 @@ int jbd2_journal_recover(journal_t *journal) journal_superblock_t * sb; struct recovery_info info; + errseq_t wb_err; + struct address_space *mapping; memset(&info, 0, sizeof(info)); sb = journal->j_superblock; @@ -310,6 +308,9 @@ int jbd2_journal_recover(journal_t *journal) return 0; } + wb_err = 0; + mapping = journal->j_fs_dev->bd_inode->i_mapping; + errseq_check_and_advance(&mapping->wb_err, &wb_err); err = do_one_pass(journal, &info, PASS_SCAN); if (!err) err = do_one_pass(journal, &info, PASS_REVOKE); @@ -333,6 +334,9 @@ int jbd2_journal_recover(journal_t *journal) err2 = sync_blockdev(journal->j_fs_dev); if (!err) err = err2; + err2 = errseq_check_and_advance(&mapping->wb_err, &wb_err); + if (!err) + err = err2; /* Make sure all replayed data is on permanent storage */ if (journal->j_flags & JBD2_BARRIER) { err2 = blkdev_issue_flush(journal->j_fs_dev); @@ -524,9 +528,7 @@ static int do_one_pass(journal_t *journal, break; jbd2_debug(2, "Scanning for sequence ID %u at %lu/%lu\n", - next_commit_ID, next_log_block, - jbd2_has_feature_fast_commit(journal) ? - journal->j_fc_last : journal->j_last); + next_commit_ID, next_log_block, journal->j_last); /* Skip over each chunk of the transaction looking * either the next descriptor block or the final commit @@ -638,7 +640,7 @@ static int do_one_pass(journal_t *journal, success = err; printk(KERN_ERR "JBD2: IO error %d recovering " - "block %ld in log\n", + "block %lu in log\n", err, io_block); } else { unsigned long long blocknr; @@ -667,7 +669,8 @@ static int do_one_pass(journal_t *journal, printk(KERN_ERR "JBD2: Invalid " "checksum recovering " "data block %llu in " - "log\n", blocknr); + "journal block %lu\n", + blocknr, io_block); block_error = 1; goto skip_write; } diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index 4d1fda1f7143..5f08b5fd105a 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -935,19 +935,15 @@ static void warn_dirty_buffer(struct buffer_head *bh) /* Call t_frozen trigger and copy buffer data into jh->b_frozen_data. */ static void jbd2_freeze_jh_data(struct journal_head *jh) { - struct page *page; - int offset; char *source; struct buffer_head *bh = jh2bh(jh); J_EXPECT_JH(jh, buffer_uptodate(bh), "Possible IO failure.\n"); - page = bh->b_page; - offset = offset_in_page(bh->b_data); - source = kmap_atomic(page); + source = kmap_local_folio(bh->b_folio, bh_offset(bh)); /* Fire data frozen trigger just before we copy the data */ - jbd2_buffer_frozen_trigger(jh, source + offset, jh->b_triggers); - memcpy(jh->b_frozen_data, source + offset, bh->b_size); - kunmap_atomic(source); + jbd2_buffer_frozen_trigger(jh, source, jh->b_triggers); + memcpy(jh->b_frozen_data, source, bh->b_size); + kunmap_local(source); /* * Now that the frozen data is saved off, we need to store any matching diff --git a/fs/jffs2/dir.c b/fs/jffs2/dir.c index 5075a0a6d594..2b2938970da3 100644 --- a/fs/jffs2/dir.c +++ b/fs/jffs2/dir.c @@ -204,7 +204,8 @@ static int jffs2_create(struct mnt_idmap *idmap, struct inode *dir_i, if (ret) goto fail; - dir_i->i_mtime = dir_i->i_ctime = ITIME(je32_to_cpu(ri->ctime)); + inode_set_mtime_to_ts(dir_i, + inode_set_ctime_to_ts(dir_i, ITIME(je32_to_cpu(ri->ctime)))); jffs2_free_raw_inode(ri); @@ -237,7 +238,8 @@ static int jffs2_unlink(struct inode *dir_i, struct dentry *dentry) if (dead_f->inocache) set_nlink(d_inode(dentry), dead_f->inocache->pino_nlink); if (!ret) - dir_i->i_mtime = dir_i->i_ctime = ITIME(now); + inode_set_mtime_to_ts(dir_i, + inode_set_ctime_to_ts(dir_i, ITIME(now))); return ret; } /***********************************************************************/ @@ -271,7 +273,8 @@ static int jffs2_link (struct dentry *old_dentry, struct inode *dir_i, struct de set_nlink(d_inode(old_dentry), ++f->inocache->pino_nlink); mutex_unlock(&f->sem); d_instantiate(dentry, d_inode(old_dentry)); - dir_i->i_mtime = dir_i->i_ctime = ITIME(now); + inode_set_mtime_to_ts(dir_i, + inode_set_ctime_to_ts(dir_i, ITIME(now))); ihold(d_inode(old_dentry)); } return ret; @@ -422,7 +425,8 @@ static int jffs2_symlink (struct mnt_idmap *idmap, struct inode *dir_i, goto fail; } - dir_i->i_mtime = dir_i->i_ctime = ITIME(je32_to_cpu(rd->mctime)); + inode_set_mtime_to_ts(dir_i, + inode_set_ctime_to_ts(dir_i, ITIME(je32_to_cpu(rd->mctime)))); jffs2_free_raw_dirent(rd); @@ -566,7 +570,8 @@ static int jffs2_mkdir (struct mnt_idmap *idmap, struct inode *dir_i, goto fail; } - dir_i->i_mtime = dir_i->i_ctime = ITIME(je32_to_cpu(rd->mctime)); + inode_set_mtime_to_ts(dir_i, + inode_set_ctime_to_ts(dir_i, ITIME(je32_to_cpu(rd->mctime)))); inc_nlink(dir_i); jffs2_free_raw_dirent(rd); @@ -607,7 +612,8 @@ static int jffs2_rmdir (struct inode *dir_i, struct dentry *dentry) ret = jffs2_do_unlink(c, dir_f, dentry->d_name.name, dentry->d_name.len, f, now); if (!ret) { - dir_i->i_mtime = dir_i->i_ctime = ITIME(now); + inode_set_mtime_to_ts(dir_i, + inode_set_ctime_to_ts(dir_i, ITIME(now))); clear_nlink(d_inode(dentry)); drop_nlink(dir_i); } @@ -743,7 +749,8 @@ static int jffs2_mknod (struct mnt_idmap *idmap, struct inode *dir_i, goto fail; } - dir_i->i_mtime = dir_i->i_ctime = ITIME(je32_to_cpu(rd->mctime)); + inode_set_mtime_to_ts(dir_i, + inode_set_ctime_to_ts(dir_i, ITIME(je32_to_cpu(rd->mctime)))); jffs2_free_raw_dirent(rd); @@ -864,14 +871,18 @@ static int jffs2_rename (struct mnt_idmap *idmap, * caller won't do it on its own since we are returning an error. */ d_invalidate(new_dentry); - new_dir_i->i_mtime = new_dir_i->i_ctime = ITIME(now); + inode_set_mtime_to_ts(new_dir_i, + inode_set_ctime_to_ts(new_dir_i, ITIME(now))); return ret; } if (d_is_dir(old_dentry)) drop_nlink(old_dir_i); - new_dir_i->i_mtime = new_dir_i->i_ctime = old_dir_i->i_mtime = old_dir_i->i_ctime = ITIME(now); + inode_set_mtime_to_ts(old_dir_i, + inode_set_ctime_to_ts(old_dir_i, ITIME(now))); + inode_set_mtime_to_ts(new_dir_i, + inode_set_ctime_to_ts(new_dir_i, ITIME(now))); return 0; } diff --git a/fs/jffs2/file.c b/fs/jffs2/file.c index 2345ca3f09ee..62ea76da7fdf 100644 --- a/fs/jffs2/file.c +++ b/fs/jffs2/file.c @@ -317,7 +317,8 @@ static int jffs2_write_end(struct file *filp, struct address_space *mapping, inode->i_size = pos + writtenlen; inode->i_blocks = (inode->i_size + 511) >> 9; - inode->i_ctime = inode->i_mtime = ITIME(je32_to_cpu(ri->ctime)); + inode_set_mtime_to_ts(inode, + inode_set_ctime_to_ts(inode, ITIME(je32_to_cpu(ri->ctime)))); } } diff --git a/fs/jffs2/fs.c b/fs/jffs2/fs.c index 038516bee1ab..d175cccb7c55 100644 --- a/fs/jffs2/fs.c +++ b/fs/jffs2/fs.c @@ -113,9 +113,9 @@ int jffs2_do_setattr (struct inode *inode, struct iattr *iattr) ri->isize = cpu_to_je32((ivalid & ATTR_SIZE)?iattr->ia_size:inode->i_size); - ri->atime = cpu_to_je32(I_SEC((ivalid & ATTR_ATIME)?iattr->ia_atime:inode->i_atime)); - ri->mtime = cpu_to_je32(I_SEC((ivalid & ATTR_MTIME)?iattr->ia_mtime:inode->i_mtime)); - ri->ctime = cpu_to_je32(I_SEC((ivalid & ATTR_CTIME)?iattr->ia_ctime:inode->i_ctime)); + ri->atime = cpu_to_je32(I_SEC((ivalid & ATTR_ATIME)?iattr->ia_atime:inode_get_atime(inode))); + ri->mtime = cpu_to_je32(I_SEC((ivalid & ATTR_MTIME)?iattr->ia_mtime:inode_get_mtime(inode))); + ri->ctime = cpu_to_je32(I_SEC((ivalid & ATTR_CTIME)?iattr->ia_ctime:inode_get_ctime(inode))); ri->offset = cpu_to_je32(0); ri->csize = ri->dsize = cpu_to_je32(mdatalen); @@ -147,9 +147,9 @@ int jffs2_do_setattr (struct inode *inode, struct iattr *iattr) return PTR_ERR(new_metadata); } /* It worked. Update the inode */ - inode->i_atime = ITIME(je32_to_cpu(ri->atime)); - inode->i_ctime = ITIME(je32_to_cpu(ri->ctime)); - inode->i_mtime = ITIME(je32_to_cpu(ri->mtime)); + inode_set_atime_to_ts(inode, ITIME(je32_to_cpu(ri->atime))); + inode_set_ctime_to_ts(inode, ITIME(je32_to_cpu(ri->ctime))); + inode_set_mtime_to_ts(inode, ITIME(je32_to_cpu(ri->mtime))); inode->i_mode = jemode_to_cpu(ri->mode); i_uid_write(inode, je16_to_cpu(ri->uid)); i_gid_write(inode, je16_to_cpu(ri->gid)); @@ -282,9 +282,9 @@ struct inode *jffs2_iget(struct super_block *sb, unsigned long ino) i_uid_write(inode, je16_to_cpu(latest_node.uid)); i_gid_write(inode, je16_to_cpu(latest_node.gid)); inode->i_size = je32_to_cpu(latest_node.isize); - inode->i_atime = ITIME(je32_to_cpu(latest_node.atime)); - inode->i_mtime = ITIME(je32_to_cpu(latest_node.mtime)); - inode->i_ctime = ITIME(je32_to_cpu(latest_node.ctime)); + inode_set_atime_to_ts(inode, ITIME(je32_to_cpu(latest_node.atime))); + inode_set_mtime_to_ts(inode, ITIME(je32_to_cpu(latest_node.mtime))); + inode_set_ctime_to_ts(inode, ITIME(je32_to_cpu(latest_node.ctime))); set_nlink(inode, f->inocache->pino_nlink); @@ -386,9 +386,9 @@ void jffs2_dirty_inode(struct inode *inode, int flags) iattr.ia_mode = inode->i_mode; iattr.ia_uid = inode->i_uid; iattr.ia_gid = inode->i_gid; - iattr.ia_atime = inode->i_atime; - iattr.ia_mtime = inode->i_mtime; - iattr.ia_ctime = inode->i_ctime; + iattr.ia_atime = inode_get_atime(inode); + iattr.ia_mtime = inode_get_mtime(inode); + iattr.ia_ctime = inode_get_ctime(inode); jffs2_do_setattr(inode, &iattr); } @@ -475,8 +475,8 @@ struct inode *jffs2_new_inode (struct inode *dir_i, umode_t mode, struct jffs2_r inode->i_mode = jemode_to_cpu(ri->mode); i_gid_write(inode, je16_to_cpu(ri->gid)); i_uid_write(inode, je16_to_cpu(ri->uid)); - inode->i_atime = inode->i_ctime = inode->i_mtime = current_time(inode); - ri->atime = ri->mtime = ri->ctime = cpu_to_je32(I_SEC(inode->i_mtime)); + simple_inode_init_ts(inode); + ri->atime = ri->mtime = ri->ctime = cpu_to_je32(I_SEC(inode_get_mtime(inode))); inode->i_blocks = 0; inode->i_size = 0; diff --git a/fs/jffs2/os-linux.h b/fs/jffs2/os-linux.h index 8da19766c101..86ab014a349c 100644 --- a/fs/jffs2/os-linux.h +++ b/fs/jffs2/os-linux.h @@ -35,9 +35,9 @@ struct kvec; #define ITIME(sec) ((struct timespec64){sec, 0}) #define JFFS2_NOW() JFFS2_CLAMP_TIME(ktime_get_real_seconds()) #define I_SEC(tv) JFFS2_CLAMP_TIME((tv).tv_sec) -#define JFFS2_F_I_CTIME(f) I_SEC(OFNI_EDONI_2SFFJ(f)->i_ctime) -#define JFFS2_F_I_MTIME(f) I_SEC(OFNI_EDONI_2SFFJ(f)->i_mtime) -#define JFFS2_F_I_ATIME(f) I_SEC(OFNI_EDONI_2SFFJ(f)->i_atime) +#define JFFS2_F_I_CTIME(f) I_SEC(inode_get_ctime(OFNI_EDONI_2SFFJ(f))) +#define JFFS2_F_I_MTIME(f) I_SEC(inode_get_mtime(OFNI_EDONI_2SFFJ(f))) +#define JFFS2_F_I_ATIME(f) I_SEC(inode_get_atime(OFNI_EDONI_2SFFJ(f))) #define sleep_on_spinunlock(wq, s) \ do { \ DECLARE_WAITQUEUE(__wait, current); \ diff --git a/fs/jffs2/super.c b/fs/jffs2/super.c index 7ea37f49f1e1..f99591a634b4 100644 --- a/fs/jffs2/super.c +++ b/fs/jffs2/super.c @@ -150,6 +150,7 @@ static struct dentry *jffs2_get_parent(struct dentry *child) } static const struct export_operations jffs2_export_ops = { + .encode_fh = generic_encode_ino32_fh, .get_parent = jffs2_get_parent, .fh_to_dentry = jffs2_fh_to_dentry, .fh_to_parent = jffs2_fh_to_parent, diff --git a/fs/jffs2/xattr.c b/fs/jffs2/xattr.c index 3b6bdc9a49e1..00224f3a8d6e 100644 --- a/fs/jffs2/xattr.c +++ b/fs/jffs2/xattr.c @@ -920,7 +920,7 @@ struct jffs2_xattr_datum *jffs2_setup_xattr_datum(struct jffs2_sb_info *c, * do_jffs2_setxattr(inode, xprefix, xname, buffer, size, flags) * is an implementation of setxattr handler on jffs2. * -------------------------------------------------- */ -const struct xattr_handler *jffs2_xattr_handlers[] = { +const struct xattr_handler * const jffs2_xattr_handlers[] = { &jffs2_user_xattr_handler, #ifdef CONFIG_JFFS2_FS_SECURITY &jffs2_security_xattr_handler, diff --git a/fs/jffs2/xattr.h b/fs/jffs2/xattr.h index 1b5030a3349d..7e7de093ec0a 100644 --- a/fs/jffs2/xattr.h +++ b/fs/jffs2/xattr.h @@ -94,7 +94,7 @@ extern int do_jffs2_getxattr(struct inode *inode, int xprefix, const char *xname extern int do_jffs2_setxattr(struct inode *inode, int xprefix, const char *xname, const char *buffer, size_t size, int flags); -extern const struct xattr_handler *jffs2_xattr_handlers[]; +extern const struct xattr_handler * const jffs2_xattr_handlers[]; extern const struct xattr_handler jffs2_user_xattr_handler; extern const struct xattr_handler jffs2_trusted_xattr_handler; diff --git a/fs/jfs/Kconfig b/fs/jfs/Kconfig index 51e856f0e4b8..3728cf4d944d 100644 --- a/fs/jfs/Kconfig +++ b/fs/jfs/Kconfig @@ -1,7 +1,9 @@ # SPDX-License-Identifier: GPL-2.0-only config JFS_FS tristate "JFS filesystem support" + select BUFFER_HEAD select NLS + select NLS_UCS2_UTILS select CRC32 select LEGACY_DIRECT_IO help diff --git a/fs/jfs/Makefile b/fs/jfs/Makefile index 7156d2c218c7..b769bbf8bdc2 100644 --- a/fs/jfs/Makefile +++ b/fs/jfs/Makefile @@ -9,7 +9,7 @@ jfs-y := super.o file.o inode.o namei.o jfs_mount.o jfs_umount.o \ jfs_xtree.o jfs_imap.o jfs_debug.o jfs_dmap.o \ jfs_unicode.o jfs_dtree.o jfs_inode.o jfs_discard.o \ jfs_extent.o symlink.o jfs_metapage.o \ - jfs_logmgr.o jfs_txnmgr.o jfs_uniupr.o \ + jfs_logmgr.o jfs_txnmgr.o \ resize.o xattr.o ioctl.o jfs-$(CONFIG_JFS_POSIX_ACL) += acl.o diff --git a/fs/jfs/acl.c b/fs/jfs/acl.c index fb96f872d207..1de3602c98de 100644 --- a/fs/jfs/acl.c +++ b/fs/jfs/acl.c @@ -116,7 +116,7 @@ int jfs_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, if (!rc) { if (update_mode) { inode->i_mode = mode; - inode->i_ctime = current_time(inode); + inode_set_ctime_current(inode); mark_inode_dirty(inode); } rc = txCommit(tid, 1, &inode, 0); diff --git a/fs/jfs/inode.c b/fs/jfs/inode.c index 8ac10e396050..1a6b5921d17a 100644 --- a/fs/jfs/inode.c +++ b/fs/jfs/inode.c @@ -393,7 +393,7 @@ void jfs_truncate_nolock(struct inode *ip, loff_t length) break; } - ip->i_mtime = ip->i_ctime = current_time(ip); + inode_set_mtime_to_ts(ip, inode_set_ctime_current(ip)); mark_inode_dirty(ip); txCommit(tid, 1, &ip, 0); diff --git a/fs/jfs/ioctl.c b/fs/jfs/ioctl.c index ed7989bc2db1..f7bd7e8f5be4 100644 --- a/fs/jfs/ioctl.c +++ b/fs/jfs/ioctl.c @@ -96,7 +96,7 @@ int jfs_fileattr_set(struct mnt_idmap *idmap, jfs_inode->mode2 = flags; jfs_set_inode_flags(inode); - inode->i_ctime = current_time(inode); + inode_set_ctime_current(inode); mark_inode_dirty(inode); return 0; diff --git a/fs/jfs/jfs_dinode.h b/fs/jfs/jfs_dinode.h index 6b231d0d0071..603aae17a693 100644 --- a/fs/jfs/jfs_dinode.h +++ b/fs/jfs/jfs_dinode.h @@ -96,7 +96,7 @@ struct dinode { #define di_gengen u._file._u1._imap._gengen union { - xtpage_t _xtroot; + xtroot_t _xtroot; struct { u8 unused[16]; /* 16: */ dxd_t _dxd; /* 16: */ diff --git a/fs/jfs/jfs_dmap.c b/fs/jfs/jfs_dmap.c index a14a0f18a4c4..11c77757ead9 100644 --- a/fs/jfs/jfs_dmap.c +++ b/fs/jfs/jfs_dmap.c @@ -87,7 +87,7 @@ static int dbAllocCtl(struct bmap * bmp, s64 nblocks, int l2nb, s64 blkno, static int dbExtend(struct inode *ip, s64 blkno, s64 nblocks, s64 addnblocks); static int dbFindBits(u32 word, int l2nb); static int dbFindCtl(struct bmap * bmp, int l2nb, int level, s64 * blkno); -static int dbFindLeaf(dmtree_t * tp, int l2nb, int *leafidx); +static int dbFindLeaf(dmtree_t *tp, int l2nb, int *leafidx, bool is_ctl); static int dbFreeBits(struct bmap * bmp, struct dmap * dp, s64 blkno, int nblocks); static int dbFreeDmap(struct bmap * bmp, struct dmap * dp, s64 blkno, @@ -180,7 +180,8 @@ int dbMount(struct inode *ipbmap) bmp->db_nfree = le64_to_cpu(dbmp_le->dn_nfree); bmp->db_l2nbperpage = le32_to_cpu(dbmp_le->dn_l2nbperpage); - if (bmp->db_l2nbperpage > L2PSIZE - L2MINBLOCKSIZE) { + if (bmp->db_l2nbperpage > L2PSIZE - L2MINBLOCKSIZE || + bmp->db_l2nbperpage < 0) { err = -EINVAL; goto err_release_metapage; } @@ -194,6 +195,12 @@ int dbMount(struct inode *ipbmap) bmp->db_maxlevel = le32_to_cpu(dbmp_le->dn_maxlevel); bmp->db_maxag = le32_to_cpu(dbmp_le->dn_maxag); bmp->db_agpref = le32_to_cpu(dbmp_le->dn_agpref); + if (bmp->db_maxag >= MAXAG || bmp->db_maxag < 0 || + bmp->db_agpref >= MAXAG || bmp->db_agpref < 0) { + err = -EINVAL; + goto err_release_metapage; + } + bmp->db_aglevel = le32_to_cpu(dbmp_le->dn_aglevel); bmp->db_agheight = le32_to_cpu(dbmp_le->dn_agheight); bmp->db_agwidth = le32_to_cpu(dbmp_le->dn_agwidth); @@ -269,6 +276,7 @@ int dbUnmount(struct inode *ipbmap, int mounterror) /* free the memory for the in-memory bmap. */ kfree(bmp); + JFS_SBI(ipbmap->i_sb)->bmap = NULL; return (0); } @@ -1709,7 +1717,7 @@ static int dbFindCtl(struct bmap * bmp, int l2nb, int level, s64 * blkno) * dbFindLeaf() returns the index of the leaf at which * free space was found. */ - rc = dbFindLeaf((dmtree_t *) dcp, l2nb, &leafidx); + rc = dbFindLeaf((dmtree_t *) dcp, l2nb, &leafidx, true); /* release the buffer. */ @@ -1956,7 +1964,7 @@ dbAllocDmapLev(struct bmap * bmp, * free space. if sufficient free space is found, dbFindLeaf() * returns the index of the leaf at which free space was found. */ - if (dbFindLeaf((dmtree_t *) & dp->tree, l2nb, &leafidx)) + if (dbFindLeaf((dmtree_t *) &dp->tree, l2nb, &leafidx, false)) return -ENOSPC; if (leafidx < 0) @@ -2920,14 +2928,18 @@ static void dbAdjTree(dmtree_t * tp, int leafno, int newval) * leafidx - return pointer to be set to the index of the leaf * describing at least l2nb free blocks if sufficient * free blocks are found. + * is_ctl - determines if the tree is of type ctl * * RETURN VALUES: * 0 - success * -ENOSPC - insufficient free blocks. */ -static int dbFindLeaf(dmtree_t * tp, int l2nb, int *leafidx) +static int dbFindLeaf(dmtree_t *tp, int l2nb, int *leafidx, bool is_ctl) { int ti, n = 0, k, x = 0; + int max_size; + + max_size = is_ctl ? CTLTREESIZE : TREESIZE; /* first check the root of the tree to see if there is * sufficient free space. @@ -2948,6 +2960,8 @@ static int dbFindLeaf(dmtree_t * tp, int l2nb, int *leafidx) /* sufficient free space found. move to the next * level (or quit if this is the last level). */ + if (x + n > max_size) + return -ENOSPC; if (l2nb <= tp->dmt_stree[x + n]) break; } diff --git a/fs/jfs/jfs_extent.c b/fs/jfs/jfs_extent.c index ae99a7e232ee..63d21822d309 100644 --- a/fs/jfs/jfs_extent.c +++ b/fs/jfs/jfs_extent.c @@ -166,7 +166,7 @@ extAlloc(struct inode *ip, s64 xlen, s64 pno, xad_t * xp, bool abnr) /* * COMMIT_SyncList flags an anonymous tlock on page that is on * sync list. - * We need to commit the inode to get the page written disk. + * We need to commit the inode to get the page written to the disk. */ if (test_and_clear_cflag(COMMIT_Synclist,ip)) jfs_commit_inode(ip, 0); @@ -311,6 +311,11 @@ extBalloc(struct inode *ip, s64 hint, s64 * nblocks, s64 * blkno) * blocks in the map. in that case, we'll start off with the * maximum free. */ + + /* give up if no space left */ + if (bmp->db_maxfreebud == -1) + return -ENOSPC; + max = (s64) 1 << bmp->db_maxfreebud; if (*nblocks >= max && *nblocks > nbperpage) nb = nblks = (max > nbperpage) ? max : nbperpage; diff --git a/fs/jfs/jfs_imap.c b/fs/jfs/jfs_imap.c index 390cbfce391f..a037ee59e398 100644 --- a/fs/jfs/jfs_imap.c +++ b/fs/jfs/jfs_imap.c @@ -193,6 +193,7 @@ int diUnmount(struct inode *ipimap, int mounterror) * free in-memory control structure */ kfree(imap); + JFS_IP(ipimap)->i_imap = NULL; return (0); } @@ -669,7 +670,7 @@ int diWrite(tid_t tid, struct inode *ip) * This is the special xtree inside the directory for storing * the directory table */ - xtpage_t *p, *xp; + xtroot_t *p, *xp; xad_t *xad; jfs_ip->xtlid = 0; @@ -683,7 +684,7 @@ int diWrite(tid_t tid, struct inode *ip) * copy xtree root from inode to dinode: */ p = &jfs_ip->i_xtroot; - xp = (xtpage_t *) &dp->di_dirtable; + xp = (xtroot_t *) &dp->di_dirtable; lv = ilinelock->lv; for (n = 0; n < ilinelock->index; n++, lv++) { memcpy(&xp->xad[lv->offset], &p->xad[lv->offset], @@ -712,7 +713,7 @@ int diWrite(tid_t tid, struct inode *ip) * regular file: 16 byte (XAD slot) granularity */ if (type & tlckXTREE) { - xtpage_t *p, *xp; + xtroot_t *p, *xp; xad_t *xad; /* @@ -1319,7 +1320,7 @@ diInitInode(struct inode *ip, int iagno, int ino, int extno, struct iag * iagp) int diAlloc(struct inode *pip, bool dir, struct inode *ip) { int rc, ino, iagno, addext, extno, bitno, sword; - int nwords, rem, i, agno; + int nwords, rem, i, agno, dn_numag; u32 mask, inosmap, extsmap; struct inode *ipimap; struct metapage *mp; @@ -1355,6 +1356,9 @@ int diAlloc(struct inode *pip, bool dir, struct inode *ip) /* get the ag number of this iag */ agno = BLKTOAG(JFS_IP(pip)->agstart, JFS_SBI(pip->i_sb)); + dn_numag = JFS_SBI(pip->i_sb)->bmap->db_numag; + if (agno < 0 || agno > dn_numag) + return -EIO; if (atomic_read(&JFS_SBI(pip->i_sb)->bmap->db_active[agno])) { /* @@ -3060,12 +3064,12 @@ static int copy_from_dinode(struct dinode * dip, struct inode *ip) } ip->i_size = le64_to_cpu(dip->di_size); - ip->i_atime.tv_sec = le32_to_cpu(dip->di_atime.tv_sec); - ip->i_atime.tv_nsec = le32_to_cpu(dip->di_atime.tv_nsec); - ip->i_mtime.tv_sec = le32_to_cpu(dip->di_mtime.tv_sec); - ip->i_mtime.tv_nsec = le32_to_cpu(dip->di_mtime.tv_nsec); - ip->i_ctime.tv_sec = le32_to_cpu(dip->di_ctime.tv_sec); - ip->i_ctime.tv_nsec = le32_to_cpu(dip->di_ctime.tv_nsec); + inode_set_atime(ip, le32_to_cpu(dip->di_atime.tv_sec), + le32_to_cpu(dip->di_atime.tv_nsec)); + inode_set_mtime(ip, le32_to_cpu(dip->di_mtime.tv_sec), + le32_to_cpu(dip->di_mtime.tv_nsec)); + inode_set_ctime(ip, le32_to_cpu(dip->di_ctime.tv_sec), + le32_to_cpu(dip->di_ctime.tv_nsec)); ip->i_blocks = LBLK2PBLK(ip->i_sb, le64_to_cpu(dip->di_nblocks)); ip->i_generation = le32_to_cpu(dip->di_gen); @@ -3137,12 +3141,12 @@ static void copy_to_dinode(struct dinode * dip, struct inode *ip) else /* Leave the original permissions alone */ dip->di_mode = cpu_to_le32(jfs_ip->mode2); - dip->di_atime.tv_sec = cpu_to_le32(ip->i_atime.tv_sec); - dip->di_atime.tv_nsec = cpu_to_le32(ip->i_atime.tv_nsec); - dip->di_ctime.tv_sec = cpu_to_le32(ip->i_ctime.tv_sec); - dip->di_ctime.tv_nsec = cpu_to_le32(ip->i_ctime.tv_nsec); - dip->di_mtime.tv_sec = cpu_to_le32(ip->i_mtime.tv_sec); - dip->di_mtime.tv_nsec = cpu_to_le32(ip->i_mtime.tv_nsec); + dip->di_atime.tv_sec = cpu_to_le32(inode_get_atime_sec(ip)); + dip->di_atime.tv_nsec = cpu_to_le32(inode_get_atime_nsec(ip)); + dip->di_ctime.tv_sec = cpu_to_le32(inode_get_ctime_sec(ip)); + dip->di_ctime.tv_nsec = cpu_to_le32(inode_get_ctime_nsec(ip)); + dip->di_mtime.tv_sec = cpu_to_le32(inode_get_mtime_sec(ip)); + dip->di_mtime.tv_nsec = cpu_to_le32(inode_get_mtime_nsec(ip)); dip->di_ixpxd = jfs_ip->ixpxd; /* in-memory pxd's are little-endian */ dip->di_acl = jfs_ip->acl; /* as are dxd's */ dip->di_ea = jfs_ip->ea; diff --git a/fs/jfs/jfs_incore.h b/fs/jfs/jfs_incore.h index 721def69e732..dd4264aa9bed 100644 --- a/fs/jfs/jfs_incore.h +++ b/fs/jfs/jfs_incore.h @@ -66,7 +66,7 @@ struct jfs_inode_info { lid_t xtlid; /* lid of xtree lock on directory */ union { struct { - xtpage_t _xtroot; /* 288: xtree root */ + xtroot_t _xtroot; /* 288: xtree root */ struct inomap *_imap; /* 4: inode map header */ } file; struct { diff --git a/fs/jfs/jfs_inode.c b/fs/jfs/jfs_inode.c index 9e1f02767201..f10f295d1502 100644 --- a/fs/jfs/jfs_inode.c +++ b/fs/jfs/jfs_inode.c @@ -97,8 +97,8 @@ struct inode *ialloc(struct inode *parent, umode_t mode) jfs_inode->mode2 |= inode->i_mode; inode->i_blocks = 0; - inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode); - jfs_inode->otime = inode->i_ctime.tv_sec; + simple_inode_init_ts(inode); + jfs_inode->otime = inode_get_ctime_sec(inode); inode->i_generation = JFS_SBI(sb)->gengen++; jfs_inode->cflag = 0; diff --git a/fs/jfs/jfs_logmgr.c b/fs/jfs/jfs_logmgr.c index e855b8fde76c..cb6d1fda66a7 100644 --- a/fs/jfs/jfs_logmgr.c +++ b/fs/jfs/jfs_logmgr.c @@ -1058,7 +1058,7 @@ void jfs_syncpt(struct jfs_log *log, int hard_sync) int lmLogOpen(struct super_block *sb) { int rc; - struct block_device *bdev; + struct bdev_handle *bdev_handle; struct jfs_log *log; struct jfs_sb_info *sbi = JFS_SBI(sb); @@ -1070,7 +1070,7 @@ int lmLogOpen(struct super_block *sb) mutex_lock(&jfs_log_mutex); list_for_each_entry(log, &jfs_external_logs, journal_list) { - if (log->bdev->bd_dev == sbi->logdev) { + if (log->bdev_handle->bdev->bd_dev == sbi->logdev) { if (!uuid_equal(&log->uuid, &sbi->loguuid)) { jfs_warn("wrong uuid on JFS journal"); mutex_unlock(&jfs_log_mutex); @@ -1100,14 +1100,14 @@ int lmLogOpen(struct super_block *sb) * file systems to log may have n-to-1 relationship; */ - bdev = blkdev_get_by_dev(sbi->logdev, BLK_OPEN_READ | BLK_OPEN_WRITE, - log, NULL); - if (IS_ERR(bdev)) { - rc = PTR_ERR(bdev); + bdev_handle = bdev_open_by_dev(sbi->logdev, + BLK_OPEN_READ | BLK_OPEN_WRITE, log, NULL); + if (IS_ERR(bdev_handle)) { + rc = PTR_ERR(bdev_handle); goto free; } - log->bdev = bdev; + log->bdev_handle = bdev_handle; uuid_copy(&log->uuid, &sbi->loguuid); /* @@ -1141,7 +1141,7 @@ journal_found: lbmLogShutdown(log); close: /* close external log device */ - blkdev_put(bdev, log); + bdev_release(bdev_handle); free: /* free log descriptor */ mutex_unlock(&jfs_log_mutex); @@ -1162,7 +1162,7 @@ static int open_inline_log(struct super_block *sb) init_waitqueue_head(&log->syncwait); set_bit(log_INLINELOG, &log->flag); - log->bdev = sb->s_bdev; + log->bdev_handle = sb->s_bdev_handle; log->base = addressPXD(&JFS_SBI(sb)->logpxd); log->size = lengthPXD(&JFS_SBI(sb)->logpxd) >> (L2LOGPSIZE - sb->s_blocksize_bits); @@ -1436,7 +1436,7 @@ int lmLogClose(struct super_block *sb) { struct jfs_sb_info *sbi = JFS_SBI(sb); struct jfs_log *log = sbi->log; - struct block_device *bdev; + struct bdev_handle *bdev_handle; int rc = 0; jfs_info("lmLogClose: log:0x%p", log); @@ -1482,10 +1482,10 @@ int lmLogClose(struct super_block *sb) * external log as separate logical volume */ list_del(&log->journal_list); - bdev = log->bdev; + bdev_handle = log->bdev_handle; rc = lmLogShutdown(log); - blkdev_put(bdev, log); + bdev_release(bdev_handle); kfree(log); @@ -1972,7 +1972,7 @@ static int lbmRead(struct jfs_log * log, int pn, struct lbuf ** bpp) bp->l_flag |= lbmREAD; - bio = bio_alloc(log->bdev, 1, REQ_OP_READ, GFP_NOFS); + bio = bio_alloc(log->bdev_handle->bdev, 1, REQ_OP_READ, GFP_NOFS); bio->bi_iter.bi_sector = bp->l_blkno << (log->l2bsize - 9); __bio_add_page(bio, bp->l_page, LOGPSIZE, bp->l_offset); BUG_ON(bio->bi_iter.bi_size != LOGPSIZE); @@ -2110,10 +2110,15 @@ static void lbmStartIO(struct lbuf * bp) { struct bio *bio; struct jfs_log *log = bp->l_log; + struct block_device *bdev = NULL; jfs_info("lbmStartIO"); - bio = bio_alloc(log->bdev, 1, REQ_OP_WRITE | REQ_SYNC, GFP_NOFS); + if (!log->no_integrity) + bdev = log->bdev_handle->bdev; + + bio = bio_alloc(bdev, 1, REQ_OP_WRITE | REQ_SYNC, + GFP_NOFS); bio->bi_iter.bi_sector = bp->l_blkno << (log->l2bsize - 9); __bio_add_page(bio, bp->l_page, LOGPSIZE, bp->l_offset); BUG_ON(bio->bi_iter.bi_size != LOGPSIZE); diff --git a/fs/jfs/jfs_logmgr.h b/fs/jfs/jfs_logmgr.h index 805877ce5020..84aa2d253907 100644 --- a/fs/jfs/jfs_logmgr.h +++ b/fs/jfs/jfs_logmgr.h @@ -356,7 +356,7 @@ struct jfs_log { * before writing syncpt. */ struct list_head journal_list; /* Global list */ - struct block_device *bdev; /* 4: log lv pointer */ + struct bdev_handle *bdev_handle; /* 4: log lv pointer */ int serial; /* 4: log mount serial number */ s64 base; /* @8: log extent address (inline log ) */ diff --git a/fs/jfs/jfs_mount.c b/fs/jfs/jfs_mount.c index b83aae56a1f2..415eb65a36ff 100644 --- a/fs/jfs/jfs_mount.c +++ b/fs/jfs/jfs_mount.c @@ -430,7 +430,8 @@ int updateSuper(struct super_block *sb, uint state) if (state == FM_MOUNT) { /* record log's dev_t and mount serial number */ - j_sb->s_logdev = cpu_to_le32(new_encode_dev(sbi->log->bdev->bd_dev)); + j_sb->s_logdev = cpu_to_le32( + new_encode_dev(sbi->log->bdev_handle->bdev->bd_dev)); j_sb->s_logserial = cpu_to_le32(sbi->log->serial); } else if (state == FM_CLEAN) { /* diff --git a/fs/jfs/jfs_txnmgr.c b/fs/jfs/jfs_txnmgr.c index ce4b4760fcb1..dccc8b3f1045 100644 --- a/fs/jfs/jfs_txnmgr.c +++ b/fs/jfs/jfs_txnmgr.c @@ -783,7 +783,7 @@ struct tlock *txLock(tid_t tid, struct inode *ip, struct metapage * mp, if (mp->xflag & COMMIT_PAGE) p = (xtpage_t *) mp->data; else - p = &jfs_ip->i_xtroot; + p = (xtpage_t *) &jfs_ip->i_xtroot; xtlck->lwm.offset = le16_to_cpu(p->header.nextindex); } @@ -1676,7 +1676,7 @@ static void xtLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd, if (tlck->type & tlckBTROOT) { lrd->log.redopage.type |= cpu_to_le16(LOG_BTROOT); - p = &JFS_IP(ip)->i_xtroot; + p = (xtpage_t *) &JFS_IP(ip)->i_xtroot; if (S_ISDIR(ip->i_mode)) lrd->log.redopage.type |= cpu_to_le16(LOG_DIR_XTREE); diff --git a/fs/jfs/jfs_unicode.h b/fs/jfs/jfs_unicode.h index 9db62d047daa..b6a78d4aef1b 100644 --- a/fs/jfs/jfs_unicode.h +++ b/fs/jfs/jfs_unicode.h @@ -8,16 +8,9 @@ #include <linux/slab.h> #include <asm/byteorder.h> +#include "../nls/nls_ucs2_data.h" #include "jfs_types.h" -typedef struct { - wchar_t start; - wchar_t end; - signed char *table; -} UNICASERANGE; - -extern signed char UniUpperTable[512]; -extern UNICASERANGE UniUpperRange[]; extern int get_UCSname(struct component_name *, struct dentry *); extern int jfs_strfromUCS_le(char *, const __le16 *, int, struct nls_table *); @@ -107,12 +100,12 @@ static inline wchar_t *UniStrncpy_from_le(wchar_t * ucs1, const __le16 * ucs2, */ static inline wchar_t UniToupper(wchar_t uc) { - UNICASERANGE *rp; + const struct UniCaseRange *rp; - if (uc < sizeof(UniUpperTable)) { /* Latin characters */ - return uc + UniUpperTable[uc]; /* Use base tables */ + if (uc < sizeof(NlsUniUpperTable)) { /* Latin characters */ + return uc + NlsUniUpperTable[uc]; /* Use base tables */ } else { - rp = UniUpperRange; /* Use range tables */ + rp = NlsUniUpperRange; /* Use range tables */ while (rp->start) { if (uc < rp->start) /* Before start of range */ return uc; /* Uppercase = input */ diff --git a/fs/jfs/jfs_uniupr.c b/fs/jfs/jfs_uniupr.c deleted file mode 100644 index d0b18c7befb8..000000000000 --- a/fs/jfs/jfs_uniupr.c +++ /dev/null @@ -1,121 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * Copyright (C) International Business Machines Corp., 2000-2002 - */ - -#include <linux/fs.h> -#include "jfs_unicode.h" - -/* - * Latin upper case - */ -signed char UniUpperTable[512] = { - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 000-00f */ - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 010-01f */ - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 020-02f */ - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 030-03f */ - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 040-04f */ - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 050-05f */ - 0,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32, /* 060-06f */ - -32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32, 0, 0, 0, 0, 0, /* 070-07f */ - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 080-08f */ - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 090-09f */ - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0a0-0af */ - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0b0-0bf */ - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0c0-0cf */ - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0d0-0df */ - -32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32, /* 0e0-0ef */ - -32,-32,-32,-32,-32,-32,-32, 0,-32,-32,-32,-32,-32,-32,-32,121, /* 0f0-0ff */ - 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 100-10f */ - 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 110-11f */ - 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 120-12f */ - 0, 0, 0, -1, 0, -1, 0, -1, 0, 0, -1, 0, -1, 0, -1, 0, /* 130-13f */ - -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, 0, -1, 0, -1, 0, -1, /* 140-14f */ - 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 150-15f */ - 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 160-16f */ - 0, -1, 0, -1, 0, -1, 0, -1, 0, 0, -1, 0, -1, 0, -1, 0, /* 170-17f */ - 0, 0, 0, -1, 0, -1, 0, 0, -1, 0, 0, 0, -1, 0, 0, 0, /* 180-18f */ - 0, 0, -1, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, /* 190-19f */ - 0, -1, 0, -1, 0, -1, 0, 0, -1, 0, 0, 0, 0, -1, 0, 0, /* 1a0-1af */ - -1, 0, 0, 0, -1, 0, -1, 0, 0, -1, 0, 0, 0, -1, 0, 0, /* 1b0-1bf */ - 0, 0, 0, 0, 0, -1, -2, 0, -1, -2, 0, -1, -2, 0, -1, 0, /* 1c0-1cf */ - -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,-79, 0, -1, /* 1d0-1df */ - 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 1e0-1ef */ - 0, 0, -1, -2, 0, -1, 0, 0, 0, -1, 0, -1, 0, -1, 0, -1, /* 1f0-1ff */ -}; - -/* Upper case range - Greek */ -static signed char UniCaseRangeU03a0[47] = { - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,-38,-37,-37,-37, /* 3a0-3af */ - 0,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32, /* 3b0-3bf */ - -32,-32,-31,-32,-32,-32,-32,-32,-32,-32,-32,-32,-64,-63,-63, -}; - -/* Upper case range - Cyrillic */ -static signed char UniCaseRangeU0430[48] = { - -32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32, /* 430-43f */ - -32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32, /* 440-44f */ - 0,-80,-80,-80,-80,-80,-80,-80,-80,-80,-80,-80,-80, 0,-80,-80, /* 450-45f */ -}; - -/* Upper case range - Extended cyrillic */ -static signed char UniCaseRangeU0490[61] = { - 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 490-49f */ - 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 4a0-4af */ - 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 4b0-4bf */ - 0, 0, -1, 0, -1, 0, 0, 0, -1, 0, 0, 0, -1, -}; - -/* Upper case range - Extended latin and greek */ -static signed char UniCaseRangeU1e00[509] = { - 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 1e00-1e0f */ - 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 1e10-1e1f */ - 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 1e20-1e2f */ - 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 1e30-1e3f */ - 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 1e40-1e4f */ - 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 1e50-1e5f */ - 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 1e60-1e6f */ - 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 1e70-1e7f */ - 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 1e80-1e8f */ - 0, -1, 0, -1, 0, -1, 0, 0, 0, 0, 0,-59, 0, -1, 0, -1, /* 1e90-1e9f */ - 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 1ea0-1eaf */ - 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 1eb0-1ebf */ - 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 1ec0-1ecf */ - 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 1ed0-1edf */ - 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 1ee0-1eef */ - 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, 0, 0, 0, 0, 0, /* 1ef0-1eff */ - 8, 8, 8, 8, 8, 8, 8, 8, 0, 0, 0, 0, 0, 0, 0, 0, /* 1f00-1f0f */ - 8, 8, 8, 8, 8, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 1f10-1f1f */ - 8, 8, 8, 8, 8, 8, 8, 8, 0, 0, 0, 0, 0, 0, 0, 0, /* 1f20-1f2f */ - 8, 8, 8, 8, 8, 8, 8, 8, 0, 0, 0, 0, 0, 0, 0, 0, /* 1f30-1f3f */ - 8, 8, 8, 8, 8, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 1f40-1f4f */ - 0, 8, 0, 8, 0, 8, 0, 8, 0, 0, 0, 0, 0, 0, 0, 0, /* 1f50-1f5f */ - 8, 8, 8, 8, 8, 8, 8, 8, 0, 0, 0, 0, 0, 0, 0, 0, /* 1f60-1f6f */ - 74, 74, 86, 86, 86, 86,100,100, 0, 0,112,112,126,126, 0, 0, /* 1f70-1f7f */ - 8, 8, 8, 8, 8, 8, 8, 8, 0, 0, 0, 0, 0, 0, 0, 0, /* 1f80-1f8f */ - 8, 8, 8, 8, 8, 8, 8, 8, 0, 0, 0, 0, 0, 0, 0, 0, /* 1f90-1f9f */ - 8, 8, 8, 8, 8, 8, 8, 8, 0, 0, 0, 0, 0, 0, 0, 0, /* 1fa0-1faf */ - 8, 8, 0, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 1fb0-1fbf */ - 0, 0, 0, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 1fc0-1fcf */ - 8, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 1fd0-1fdf */ - 8, 8, 0, 0, 0, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 1fe0-1fef */ - 0, 0, 0, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, -}; - -/* Upper case range - Wide latin */ -static signed char UniCaseRangeUff40[27] = { - 0,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32, /* ff40-ff4f */ - -32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32, -}; - -/* - * Upper Case Range - */ -UNICASERANGE UniUpperRange[] = { - { 0x03a0, 0x03ce, UniCaseRangeU03a0 }, - { 0x0430, 0x045f, UniCaseRangeU0430 }, - { 0x0490, 0x04cc, UniCaseRangeU0490 }, - { 0x1e00, 0x1ffc, UniCaseRangeU1e00 }, - { 0xff40, 0xff5a, UniCaseRangeUff40 }, - { 0 } -}; diff --git a/fs/jfs/jfs_xattr.h b/fs/jfs/jfs_xattr.h index 0d33816d251d..ec67d8554d2c 100644 --- a/fs/jfs/jfs_xattr.h +++ b/fs/jfs/jfs_xattr.h @@ -46,7 +46,7 @@ extern int __jfs_setxattr(tid_t, struct inode *, const char *, const void *, extern ssize_t __jfs_getxattr(struct inode *, const char *, void *, size_t); extern ssize_t jfs_listxattr(struct dentry *, char *, size_t); -extern const struct xattr_handler *jfs_xattr_handlers[]; +extern const struct xattr_handler * const jfs_xattr_handlers[]; #ifdef CONFIG_JFS_SECURITY extern int jfs_init_security(tid_t, struct inode *, struct inode *, diff --git a/fs/jfs/jfs_xtree.c b/fs/jfs/jfs_xtree.c index 2d304cee884c..5ee618d17e77 100644 --- a/fs/jfs/jfs_xtree.c +++ b/fs/jfs/jfs_xtree.c @@ -1213,7 +1213,7 @@ xtSplitRoot(tid_t tid, struct xtlock *xtlck; int rc; - sp = &JFS_IP(ip)->i_xtroot; + sp = (xtpage_t *) &JFS_IP(ip)->i_xtroot; INCREMENT(xtStat.split); @@ -2098,7 +2098,7 @@ int xtAppend(tid_t tid, /* transaction id */ */ void xtInitRoot(tid_t tid, struct inode *ip) { - xtpage_t *p; + xtroot_t *p; /* * acquire a transaction lock on the root diff --git a/fs/jfs/jfs_xtree.h b/fs/jfs/jfs_xtree.h index ad7592191d76..0f6cf5a1ce75 100644 --- a/fs/jfs/jfs_xtree.h +++ b/fs/jfs/jfs_xtree.h @@ -65,24 +65,33 @@ struct xadlist { #define XTPAGEMAXSLOT 256 #define XTENTRYSTART 2 -/* - * xtree page: - */ -typedef union { - struct xtheader { - __le64 next; /* 8: */ - __le64 prev; /* 8: */ +struct xtheader { + __le64 next; /* 8: */ + __le64 prev; /* 8: */ - u8 flag; /* 1: */ - u8 rsrvd1; /* 1: */ - __le16 nextindex; /* 2: next index = number of entries */ - __le16 maxentry; /* 2: max number of entries */ - __le16 rsrvd2; /* 2: */ + u8 flag; /* 1: */ + u8 rsrvd1; /* 1: */ + __le16 nextindex; /* 2: next index = number of entries */ + __le16 maxentry; /* 2: max number of entries */ + __le16 rsrvd2; /* 2: */ - pxd_t self; /* 8: self */ - } header; /* (32) */ + pxd_t self; /* 8: self */ +}; +/* + * xtree root (in inode): + */ +typedef union { + struct xtheader header; xad_t xad[XTROOTMAXSLOT]; /* 16 * maxentry: xad array */ +} xtroot_t; + +/* + * xtree page: + */ +typedef union { + struct xtheader header; + xad_t xad[XTPAGEMAXSLOT]; /* 16 * maxentry: xad array */ } xtpage_t; /* diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c index e98ddb2b1cf2..d68a4e6ac345 100644 --- a/fs/jfs/namei.c +++ b/fs/jfs/namei.c @@ -149,7 +149,7 @@ static int jfs_create(struct mnt_idmap *idmap, struct inode *dip, mark_inode_dirty(ip); - dip->i_ctime = dip->i_mtime = current_time(dip); + inode_set_mtime_to_ts(dip, inode_set_ctime_current(dip)); mark_inode_dirty(dip); @@ -284,7 +284,7 @@ static int jfs_mkdir(struct mnt_idmap *idmap, struct inode *dip, /* update parent directory inode */ inc_nlink(dip); /* for '..' from child directory */ - dip->i_ctime = dip->i_mtime = current_time(dip); + inode_set_mtime_to_ts(dip, inode_set_ctime_current(dip)); mark_inode_dirty(dip); rc = txCommit(tid, 2, &iplist[0], 0); @@ -390,7 +390,7 @@ static int jfs_rmdir(struct inode *dip, struct dentry *dentry) /* update parent directory's link count corresponding * to ".." entry of the target directory deleted */ - dip->i_ctime = dip->i_mtime = current_time(dip); + inode_set_mtime_to_ts(dip, inode_set_ctime_current(dip)); inode_dec_link_count(dip); /* @@ -512,7 +512,8 @@ static int jfs_unlink(struct inode *dip, struct dentry *dentry) ASSERT(ip->i_nlink); - ip->i_ctime = dip->i_ctime = dip->i_mtime = current_time(ip); + inode_set_mtime_to_ts(dip, + inode_set_ctime_to_ts(dip, inode_set_ctime_current(ip))); mark_inode_dirty(dip); /* update target's inode */ @@ -827,8 +828,8 @@ static int jfs_link(struct dentry *old_dentry, /* update object inode */ inc_nlink(ip); /* for new link */ - ip->i_ctime = current_time(ip); - dir->i_ctime = dir->i_mtime = current_time(dir); + inode_set_ctime_current(ip); + inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir)); mark_inode_dirty(dir); ihold(ip); @@ -883,7 +884,7 @@ static int jfs_symlink(struct mnt_idmap *idmap, struct inode *dip, struct component_name dname; u32 ssize; /* source pathname size */ struct btstack btstack; - struct inode *ip = d_inode(dentry); + struct inode *ip; s64 xlen = 0; int bmask = 0, xsize; s64 xaddr; @@ -1028,7 +1029,7 @@ static int jfs_symlink(struct mnt_idmap *idmap, struct inode *dip, mark_inode_dirty(ip); - dip->i_ctime = dip->i_mtime = current_time(dip); + inode_set_mtime_to_ts(dip, inode_set_ctime_current(dip)); mark_inode_dirty(dip); /* * commit update of parent directory and link object @@ -1205,7 +1206,7 @@ static int jfs_rename(struct mnt_idmap *idmap, struct inode *old_dir, tblk->xflag |= COMMIT_DELETE; tblk->u.ip = new_ip; } else { - new_ip->i_ctime = current_time(new_ip); + inode_set_ctime_current(new_ip); mark_inode_dirty(new_ip); } } else { @@ -1268,10 +1269,10 @@ static int jfs_rename(struct mnt_idmap *idmap, struct inode *old_dir, /* * Update ctime on changed/moved inodes & mark dirty */ - old_ip->i_ctime = current_time(old_ip); + inode_set_ctime_current(old_ip); mark_inode_dirty(old_ip); - new_dir->i_ctime = new_dir->i_mtime = current_time(new_dir); + inode_set_mtime_to_ts(new_dir, inode_set_ctime_current(new_dir)); mark_inode_dirty(new_dir); /* Build list of inodes modified by this transaction */ @@ -1283,7 +1284,8 @@ static int jfs_rename(struct mnt_idmap *idmap, struct inode *old_dir, if (old_dir != new_dir) { iplist[ipcount++] = new_dir; - old_dir->i_ctime = old_dir->i_mtime = current_time(old_dir); + inode_set_mtime_to_ts(old_dir, + inode_set_ctime_current(old_dir)); mark_inode_dirty(old_dir); } @@ -1416,7 +1418,7 @@ static int jfs_mknod(struct mnt_idmap *idmap, struct inode *dir, mark_inode_dirty(ip); - dir->i_ctime = dir->i_mtime = current_time(dir); + inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir)); mark_inode_dirty(dir); diff --git a/fs/jfs/super.c b/fs/jfs/super.c index d2f82cb7db1b..8d8e556bd610 100644 --- a/fs/jfs/super.c +++ b/fs/jfs/super.c @@ -818,7 +818,7 @@ out: } if (inode->i_size < off+len-towrite) i_size_write(inode, off+len-towrite); - inode->i_mtime = inode->i_ctime = current_time(inode); + inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); mark_inode_dirty(inode); inode_unlock(inode); return len - towrite; @@ -896,6 +896,7 @@ static const struct super_operations jfs_super_operations = { }; static const struct export_operations jfs_export_operations = { + .encode_fh = generic_encode_ino32_fh, .fh_to_dentry = jfs_fh_to_dentry, .fh_to_parent = jfs_fh_to_parent, .get_parent = jfs_get_parent, diff --git a/fs/jfs/xattr.c b/fs/jfs/xattr.c index 931e50018f88..0fb7afac298e 100644 --- a/fs/jfs/xattr.c +++ b/fs/jfs/xattr.c @@ -647,7 +647,7 @@ static int ea_put(tid_t tid, struct inode *inode, struct ea_buffer *ea_buf, if (old_blocks) dquot_free_block(inode, old_blocks); - inode->i_ctime = current_time(inode); + inode_set_ctime_current(inode); return 0; } @@ -985,7 +985,7 @@ static const struct xattr_handler jfs_trusted_xattr_handler = { .set = jfs_xattr_set, }; -const struct xattr_handler *jfs_xattr_handlers[] = { +const struct xattr_handler * const jfs_xattr_handlers[] = { &jfs_os2_xattr_handler, &jfs_user_xattr_handler, &jfs_security_xattr_handler, diff --git a/fs/kernel_read_file.c b/fs/kernel_read_file.c index 5d826274570c..c429c42a6867 100644 --- a/fs/kernel_read_file.c +++ b/fs/kernel_read_file.c @@ -8,16 +8,16 @@ /** * kernel_read_file() - read file contents into a kernel buffer * - * @file file to read from - * @offset where to start reading from (see below). - * @buf pointer to a "void *" buffer for reading into (if + * @file: file to read from + * @offset: where to start reading from (see below). + * @buf: pointer to a "void *" buffer for reading into (if * *@buf is NULL, a buffer will be allocated, and * @buf_size will be ignored) - * @buf_size size of buf, if already allocated. If @buf not + * @buf_size: size of buf, if already allocated. If @buf not * allocated, this is the largest size to allocate. - * @file_size if non-NULL, the full size of @file will be + * @file_size: if non-NULL, the full size of @file will be * written here. - * @id the kernel_read_file_id identifying the type of + * @id: the kernel_read_file_id identifying the type of * file contents being read (for LSMs to examine) * * @offset must be 0 unless both @buf and @file_size are non-NULL diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c index 5a1a4af9d3d2..8b2bd65d70e7 100644 --- a/fs/kernfs/dir.c +++ b/fs/kernfs/dir.c @@ -383,9 +383,11 @@ static int kernfs_link_sibling(struct kernfs_node *kn) rb_insert_color(&kn->rb, &kn->parent->dir.children); /* successfully added, account subdir number */ + down_write(&kernfs_root(kn)->kernfs_iattr_rwsem); if (kernfs_type(kn) == KERNFS_DIR) kn->parent->dir.subdirs++; kernfs_inc_rev(kn->parent); + up_write(&kernfs_root(kn)->kernfs_iattr_rwsem); return 0; } @@ -408,9 +410,11 @@ static bool kernfs_unlink_sibling(struct kernfs_node *kn) if (RB_EMPTY_NODE(&kn->rb)) return false; + down_write(&kernfs_root(kn)->kernfs_iattr_rwsem); if (kernfs_type(kn) == KERNFS_DIR) kn->parent->dir.subdirs--; kernfs_inc_rev(kn->parent); + up_write(&kernfs_root(kn)->kernfs_iattr_rwsem); rb_erase(&kn->rb, &kn->parent->dir.children); RB_CLEAR_NODE(&kn->rb); @@ -556,7 +560,7 @@ void kernfs_put(struct kernfs_node *kn) kfree_const(kn->name); if (kn->iattr) { - simple_xattrs_free(&kn->iattr->xattrs); + simple_xattrs_free(&kn->iattr->xattrs, NULL); kmem_cache_free(kernfs_iattrs_cache, kn->iattr); } spin_lock(&kernfs_idr_lock); diff --git a/fs/kernfs/file.c b/fs/kernfs/file.c index 180906c36f51..f0cb729e9a97 100644 --- a/fs/kernfs/file.c +++ b/fs/kernfs/file.c @@ -429,60 +429,11 @@ static int kernfs_vma_access(struct vm_area_struct *vma, unsigned long addr, return ret; } -#ifdef CONFIG_NUMA -static int kernfs_vma_set_policy(struct vm_area_struct *vma, - struct mempolicy *new) -{ - struct file *file = vma->vm_file; - struct kernfs_open_file *of = kernfs_of(file); - int ret; - - if (!of->vm_ops) - return 0; - - if (!kernfs_get_active(of->kn)) - return -EINVAL; - - ret = 0; - if (of->vm_ops->set_policy) - ret = of->vm_ops->set_policy(vma, new); - - kernfs_put_active(of->kn); - return ret; -} - -static struct mempolicy *kernfs_vma_get_policy(struct vm_area_struct *vma, - unsigned long addr) -{ - struct file *file = vma->vm_file; - struct kernfs_open_file *of = kernfs_of(file); - struct mempolicy *pol; - - if (!of->vm_ops) - return vma->vm_policy; - - if (!kernfs_get_active(of->kn)) - return vma->vm_policy; - - pol = vma->vm_policy; - if (of->vm_ops->get_policy) - pol = of->vm_ops->get_policy(vma, addr); - - kernfs_put_active(of->kn); - return pol; -} - -#endif - static const struct vm_operations_struct kernfs_vm_ops = { .open = kernfs_vma_open, .fault = kernfs_vma_fault, .page_mkwrite = kernfs_vma_page_mkwrite, .access = kernfs_vma_access, -#ifdef CONFIG_NUMA - .set_policy = kernfs_vma_set_policy, - .get_policy = kernfs_vma_get_policy, -#endif }; static int kernfs_fop_mmap(struct file *file, struct vm_area_struct *vma) @@ -903,6 +854,33 @@ static __poll_t kernfs_fop_poll(struct file *filp, poll_table *wait) return ret; } +static loff_t kernfs_fop_llseek(struct file *file, loff_t offset, int whence) +{ + struct kernfs_open_file *of = kernfs_of(file); + const struct kernfs_ops *ops; + loff_t ret; + + /* + * @of->mutex nests outside active ref and is primarily to ensure that + * the ops aren't called concurrently for the same open file. + */ + mutex_lock(&of->mutex); + if (!kernfs_get_active(of->kn)) { + mutex_unlock(&of->mutex); + return -ENODEV; + } + + ops = kernfs_ops(of->kn); + if (ops->llseek) + ret = ops->llseek(of, offset, whence); + else + ret = generic_file_llseek(file, offset, whence); + + kernfs_put_active(of->kn); + mutex_unlock(&of->mutex); + return ret; +} + static void kernfs_notify_workfn(struct work_struct *work) { struct kernfs_node *kn; @@ -1005,7 +983,7 @@ EXPORT_SYMBOL_GPL(kernfs_notify); const struct file_operations kernfs_file_fops = { .read_iter = kernfs_fop_read_iter, .write_iter = kernfs_fop_write_iter, - .llseek = generic_file_llseek, + .llseek = kernfs_fop_llseek, .mmap = kernfs_fop_mmap, .open = kernfs_fop_open, .release = kernfs_fop_release, diff --git a/fs/kernfs/inode.c b/fs/kernfs/inode.c index b22b74d1a115..b83054da68b3 100644 --- a/fs/kernfs/inode.c +++ b/fs/kernfs/inode.c @@ -151,8 +151,7 @@ ssize_t kernfs_iop_listxattr(struct dentry *dentry, char *buf, size_t size) static inline void set_default_inode_attr(struct inode *inode, umode_t mode) { inode->i_mode = mode; - inode->i_atime = inode->i_mtime = - inode->i_ctime = current_time(inode); + simple_inode_init_ts(inode); } static inline void set_inode_attr(struct inode *inode, @@ -160,9 +159,9 @@ static inline void set_inode_attr(struct inode *inode, { inode->i_uid = attrs->ia_uid; inode->i_gid = attrs->ia_gid; - inode->i_atime = attrs->ia_atime; - inode->i_mtime = attrs->ia_mtime; - inode->i_ctime = attrs->ia_ctime; + inode_set_atime_to_ts(inode, attrs->ia_atime); + inode_set_mtime_to_ts(inode, attrs->ia_mtime); + inode_set_ctime_to_ts(inode, attrs->ia_ctime); } static void kernfs_refresh_inode(struct kernfs_node *kn, struct inode *inode) @@ -191,7 +190,7 @@ int kernfs_iop_getattr(struct mnt_idmap *idmap, down_read(&root->kernfs_iattr_rwsem); kernfs_refresh_inode(kn, inode); - generic_fillattr(&nop_mnt_idmap, inode, stat); + generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat); up_read(&root->kernfs_iattr_rwsem); return 0; @@ -306,11 +305,17 @@ int kernfs_xattr_get(struct kernfs_node *kn, const char *name, int kernfs_xattr_set(struct kernfs_node *kn, const char *name, const void *value, size_t size, int flags) { + struct simple_xattr *old_xattr; struct kernfs_iattrs *attrs = kernfs_iattrs(kn); if (!attrs) return -ENOMEM; - return simple_xattr_set(&attrs->xattrs, name, value, size, flags, NULL); + old_xattr = simple_xattr_set(&attrs->xattrs, name, value, size, flags); + if (IS_ERR(old_xattr)) + return PTR_ERR(old_xattr); + + simple_xattr_free(old_xattr); + return 0; } static int kernfs_vfs_xattr_get(const struct xattr_handler *handler, @@ -342,7 +347,7 @@ static int kernfs_vfs_user_xattr_add(struct kernfs_node *kn, { atomic_t *sz = &kn->iattr->user_xattr_size; atomic_t *nr = &kn->iattr->nr_user_xattrs; - ssize_t removed_size; + struct simple_xattr *old_xattr; int ret; if (atomic_inc_return(nr) > KERNFS_MAX_USER_XATTRS) { @@ -355,13 +360,18 @@ static int kernfs_vfs_user_xattr_add(struct kernfs_node *kn, goto dec_size_out; } - ret = simple_xattr_set(xattrs, full_name, value, size, flags, - &removed_size); - - if (!ret && removed_size >= 0) - size = removed_size; - else if (!ret) + old_xattr = simple_xattr_set(xattrs, full_name, value, size, flags); + if (!old_xattr) return 0; + + if (IS_ERR(old_xattr)) { + ret = PTR_ERR(old_xattr); + goto dec_size_out; + } + + ret = 0; + size = old_xattr->size; + simple_xattr_free(old_xattr); dec_size_out: atomic_sub(size, sz); dec_count_out: @@ -376,18 +386,19 @@ static int kernfs_vfs_user_xattr_rm(struct kernfs_node *kn, { atomic_t *sz = &kn->iattr->user_xattr_size; atomic_t *nr = &kn->iattr->nr_user_xattrs; - ssize_t removed_size; - int ret; + struct simple_xattr *old_xattr; - ret = simple_xattr_set(xattrs, full_name, value, size, flags, - &removed_size); + old_xattr = simple_xattr_set(xattrs, full_name, value, size, flags); + if (!old_xattr) + return 0; - if (removed_size >= 0) { - atomic_sub(removed_size, sz); - atomic_dec(nr); - } + if (IS_ERR(old_xattr)) + return PTR_ERR(old_xattr); - return ret; + atomic_sub(old_xattr->size, sz); + atomic_dec(nr); + simple_xattr_free(old_xattr); + return 0; } static int kernfs_vfs_user_xattr_set(const struct xattr_handler *handler, @@ -434,7 +445,7 @@ static const struct xattr_handler kernfs_user_xattr_handler = { .set = kernfs_vfs_user_xattr_set, }; -const struct xattr_handler *kernfs_xattr_handlers[] = { +const struct xattr_handler * const kernfs_xattr_handlers[] = { &kernfs_trusted_xattr_handler, &kernfs_security_xattr_handler, &kernfs_user_xattr_handler, diff --git a/fs/kernfs/kernfs-internal.h b/fs/kernfs/kernfs-internal.h index a9b854cdfdb5..237f2764b941 100644 --- a/fs/kernfs/kernfs-internal.h +++ b/fs/kernfs/kernfs-internal.h @@ -127,7 +127,7 @@ extern struct kmem_cache *kernfs_node_cache, *kernfs_iattrs_cache; /* * inode.c */ -extern const struct xattr_handler *kernfs_xattr_handlers[]; +extern const struct xattr_handler * const kernfs_xattr_handlers[]; void kernfs_evict_inode(struct inode *inode); int kernfs_iop_permission(struct mnt_idmap *idmap, struct inode *inode, int mask); diff --git a/fs/kernfs/mount.c b/fs/kernfs/mount.c index d49606accb07..4628edde2e7e 100644 --- a/fs/kernfs/mount.c +++ b/fs/kernfs/mount.c @@ -16,11 +16,14 @@ #include <linux/namei.h> #include <linux/seq_file.h> #include <linux/exportfs.h> +#include <linux/uuid.h> +#include <linux/statfs.h> #include "kernfs-internal.h" -struct kmem_cache *kernfs_node_cache, *kernfs_iattrs_cache; -struct kernfs_global_locks *kernfs_locks; +struct kmem_cache *kernfs_node_cache __ro_after_init; +struct kmem_cache *kernfs_iattrs_cache __ro_after_init; +struct kernfs_global_locks *kernfs_locks __ro_after_init; static int kernfs_sop_show_options(struct seq_file *sf, struct dentry *dentry) { @@ -45,8 +48,15 @@ static int kernfs_sop_show_path(struct seq_file *sf, struct dentry *dentry) return 0; } +static int kernfs_statfs(struct dentry *dentry, struct kstatfs *buf) +{ + simple_statfs(dentry, buf); + buf->f_fsid = uuid_to_fsid(dentry->d_sb->s_uuid.b); + return 0; +} + const struct super_operations kernfs_sops = { - .statfs = simple_statfs, + .statfs = kernfs_statfs, .drop_inode = generic_delete_inode, .evict_inode = kernfs_evict_inode, @@ -256,7 +266,7 @@ static int kernfs_fill_super(struct super_block *sb, struct kernfs_fs_context *k sb->s_time_gran = 1; /* sysfs dentries and inodes don't require IO to create */ - sb->s_shrink.seeks = 0; + sb->s_shrink->seeks = 0; /* get root inode, initialize and unlock it */ down_read(&kf_root->kernfs_rwsem); @@ -351,6 +361,8 @@ int kernfs_get_tree(struct fs_context *fc) } sb->s_flags |= SB_ACTIVE; + uuid_gen(&sb->s_uuid); + down_write(&root->kernfs_supers_rwsem); list_add(&info->node, &info->root->supers); up_write(&root->kernfs_supers_rwsem); diff --git a/fs/libfs.c b/fs/libfs.c index 5b851315eeed..c2aa6fd4795c 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -33,7 +33,7 @@ int simple_getattr(struct mnt_idmap *idmap, const struct path *path, unsigned int query_flags) { struct inode *inode = d_inode(path->dentry); - generic_fillattr(&nop_mnt_idmap, inode, stat); + generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat); stat->blocks = inode->i_mapping->nrpages << (PAGE_SHIFT - 9); return 0; } @@ -41,6 +41,9 @@ EXPORT_SYMBOL(simple_getattr); int simple_statfs(struct dentry *dentry, struct kstatfs *buf) { + u64 id = huge_encode_dev(dentry->d_sb->s_dev); + + buf->f_fsid = u64_to_fsid(id); buf->f_type = dentry->d_sb->s_magic; buf->f_bsize = PAGE_SIZE; buf->f_namelen = NAME_MAX; @@ -239,6 +242,262 @@ const struct inode_operations simple_dir_inode_operations = { }; EXPORT_SYMBOL(simple_dir_inode_operations); +static void offset_set(struct dentry *dentry, u32 offset) +{ + dentry->d_fsdata = (void *)((uintptr_t)(offset)); +} + +static u32 dentry2offset(struct dentry *dentry) +{ + return (u32)((uintptr_t)(dentry->d_fsdata)); +} + +static struct lock_class_key simple_offset_xa_lock; + +/** + * simple_offset_init - initialize an offset_ctx + * @octx: directory offset map to be initialized + * + */ +void simple_offset_init(struct offset_ctx *octx) +{ + xa_init_flags(&octx->xa, XA_FLAGS_ALLOC1); + lockdep_set_class(&octx->xa.xa_lock, &simple_offset_xa_lock); + + /* 0 is '.', 1 is '..', so always start with offset 2 */ + octx->next_offset = 2; +} + +/** + * simple_offset_add - Add an entry to a directory's offset map + * @octx: directory offset ctx to be updated + * @dentry: new dentry being added + * + * Returns zero on success. @so_ctx and the dentry offset are updated. + * Otherwise, a negative errno value is returned. + */ +int simple_offset_add(struct offset_ctx *octx, struct dentry *dentry) +{ + static const struct xa_limit limit = XA_LIMIT(2, U32_MAX); + u32 offset; + int ret; + + if (dentry2offset(dentry) != 0) + return -EBUSY; + + ret = xa_alloc_cyclic(&octx->xa, &offset, dentry, limit, + &octx->next_offset, GFP_KERNEL); + if (ret < 0) + return ret; + + offset_set(dentry, offset); + return 0; +} + +/** + * simple_offset_remove - Remove an entry to a directory's offset map + * @octx: directory offset ctx to be updated + * @dentry: dentry being removed + * + */ +void simple_offset_remove(struct offset_ctx *octx, struct dentry *dentry) +{ + u32 offset; + + offset = dentry2offset(dentry); + if (offset == 0) + return; + + xa_erase(&octx->xa, offset); + offset_set(dentry, 0); +} + +/** + * simple_offset_rename_exchange - exchange rename with directory offsets + * @old_dir: parent of dentry being moved + * @old_dentry: dentry being moved + * @new_dir: destination parent + * @new_dentry: destination dentry + * + * Returns zero on success. Otherwise a negative errno is returned and the + * rename is rolled back. + */ +int simple_offset_rename_exchange(struct inode *old_dir, + struct dentry *old_dentry, + struct inode *new_dir, + struct dentry *new_dentry) +{ + struct offset_ctx *old_ctx = old_dir->i_op->get_offset_ctx(old_dir); + struct offset_ctx *new_ctx = new_dir->i_op->get_offset_ctx(new_dir); + u32 old_index = dentry2offset(old_dentry); + u32 new_index = dentry2offset(new_dentry); + int ret; + + simple_offset_remove(old_ctx, old_dentry); + simple_offset_remove(new_ctx, new_dentry); + + ret = simple_offset_add(new_ctx, old_dentry); + if (ret) + goto out_restore; + + ret = simple_offset_add(old_ctx, new_dentry); + if (ret) { + simple_offset_remove(new_ctx, old_dentry); + goto out_restore; + } + + ret = simple_rename_exchange(old_dir, old_dentry, new_dir, new_dentry); + if (ret) { + simple_offset_remove(new_ctx, old_dentry); + simple_offset_remove(old_ctx, new_dentry); + goto out_restore; + } + return 0; + +out_restore: + offset_set(old_dentry, old_index); + xa_store(&old_ctx->xa, old_index, old_dentry, GFP_KERNEL); + offset_set(new_dentry, new_index); + xa_store(&new_ctx->xa, new_index, new_dentry, GFP_KERNEL); + return ret; +} + +/** + * simple_offset_destroy - Release offset map + * @octx: directory offset ctx that is about to be destroyed + * + * During fs teardown (eg. umount), a directory's offset map might still + * contain entries. xa_destroy() cleans out anything that remains. + */ +void simple_offset_destroy(struct offset_ctx *octx) +{ + xa_destroy(&octx->xa); +} + +/** + * offset_dir_llseek - Advance the read position of a directory descriptor + * @file: an open directory whose position is to be updated + * @offset: a byte offset + * @whence: enumerator describing the starting position for this update + * + * SEEK_END, SEEK_DATA, and SEEK_HOLE are not supported for directories. + * + * Returns the updated read position if successful; otherwise a + * negative errno is returned and the read position remains unchanged. + */ +static loff_t offset_dir_llseek(struct file *file, loff_t offset, int whence) +{ + switch (whence) { + case SEEK_CUR: + offset += file->f_pos; + fallthrough; + case SEEK_SET: + if (offset >= 0) + break; + fallthrough; + default: + return -EINVAL; + } + + /* In this case, ->private_data is protected by f_pos_lock */ + file->private_data = NULL; + return vfs_setpos(file, offset, U32_MAX); +} + +static struct dentry *offset_find_next(struct xa_state *xas) +{ + struct dentry *child, *found = NULL; + + rcu_read_lock(); + child = xas_next_entry(xas, U32_MAX); + if (!child) + goto out; + spin_lock(&child->d_lock); + if (simple_positive(child)) + found = dget_dlock(child); + spin_unlock(&child->d_lock); +out: + rcu_read_unlock(); + return found; +} + +static bool offset_dir_emit(struct dir_context *ctx, struct dentry *dentry) +{ + u32 offset = dentry2offset(dentry); + struct inode *inode = d_inode(dentry); + + return ctx->actor(ctx, dentry->d_name.name, dentry->d_name.len, offset, + inode->i_ino, fs_umode_to_dtype(inode->i_mode)); +} + +static void *offset_iterate_dir(struct inode *inode, struct dir_context *ctx) +{ + struct offset_ctx *so_ctx = inode->i_op->get_offset_ctx(inode); + XA_STATE(xas, &so_ctx->xa, ctx->pos); + struct dentry *dentry; + + while (true) { + dentry = offset_find_next(&xas); + if (!dentry) + return ERR_PTR(-ENOENT); + + if (!offset_dir_emit(ctx, dentry)) { + dput(dentry); + break; + } + + dput(dentry); + ctx->pos = xas.xa_index + 1; + } + return NULL; +} + +/** + * offset_readdir - Emit entries starting at offset @ctx->pos + * @file: an open directory to iterate over + * @ctx: directory iteration context + * + * Caller must hold @file's i_rwsem to prevent insertion or removal of + * entries during this call. + * + * On entry, @ctx->pos contains an offset that represents the first entry + * to be read from the directory. + * + * The operation continues until there are no more entries to read, or + * until the ctx->actor indicates there is no more space in the caller's + * output buffer. + * + * On return, @ctx->pos contains an offset that will read the next entry + * in this directory when offset_readdir() is called again with @ctx. + * + * Return values: + * %0 - Complete + */ +static int offset_readdir(struct file *file, struct dir_context *ctx) +{ + struct dentry *dir = file->f_path.dentry; + + lockdep_assert_held(&d_inode(dir)->i_rwsem); + + if (!dir_emit_dots(file, ctx)) + return 0; + + /* In this case, ->private_data is protected by f_pos_lock */ + if (ctx->pos == 2) + file->private_data = NULL; + else if (file->private_data == ERR_PTR(-ENOENT)) + return 0; + file->private_data = offset_iterate_dir(d_inode(dir), ctx); + return 0; +} + +const struct file_operations simple_offset_dir_operations = { + .llseek = offset_dir_llseek, + .iterate_shared = offset_readdir, + .read = generic_read_dir, + .fsync = noop_fsync, +}; + static struct dentry *find_next_child(struct dentry *parent, struct dentry *prev) { struct dentry *child = NULL; @@ -275,7 +534,7 @@ void simple_recursive_removal(struct dentry *dentry, while ((child = find_next_child(this, victim)) == NULL) { // kill and ascend // update metadata while it's still locked - inode->i_ctime = current_time(inode); + inode_set_ctime_current(inode); clear_nlink(inode); inode_unlock(inode); victim = this; @@ -293,8 +552,8 @@ void simple_recursive_removal(struct dentry *dentry, dput(victim); // unpin it } if (victim == dentry) { - inode->i_ctime = inode->i_mtime = - current_time(inode); + inode_set_mtime_to_ts(inode, + inode_set_ctime_current(inode)); if (d_is_dir(dentry)) drop_nlink(inode); inode_unlock(inode); @@ -335,7 +594,7 @@ static int pseudo_fs_fill_super(struct super_block *s, struct fs_context *fc) */ root->i_ino = 1; root->i_mode = S_IFDIR | S_IRUSR | S_IWUSR; - root->i_atime = root->i_mtime = root->i_ctime = current_time(root); + simple_inode_init_ts(root); s->s_root = d_make_root(root); if (!s->s_root) return -ENOMEM; @@ -391,7 +650,8 @@ int simple_link(struct dentry *old_dentry, struct inode *dir, struct dentry *den { struct inode *inode = d_inode(old_dentry); - inode->i_ctime = dir->i_ctime = dir->i_mtime = current_time(inode); + inode_set_mtime_to_ts(dir, + inode_set_ctime_to_ts(dir, inode_set_ctime_current(inode))); inc_nlink(inode); ihold(inode); dget(dentry); @@ -425,7 +685,8 @@ int simple_unlink(struct inode *dir, struct dentry *dentry) { struct inode *inode = d_inode(dentry); - inode->i_ctime = dir->i_ctime = dir->i_mtime = current_time(inode); + inode_set_mtime_to_ts(dir, + inode_set_ctime_to_ts(dir, inode_set_ctime_current(inode))); drop_nlink(inode); dput(dentry); return 0; @@ -444,6 +705,32 @@ int simple_rmdir(struct inode *dir, struct dentry *dentry) } EXPORT_SYMBOL(simple_rmdir); +/** + * simple_rename_timestamp - update the various inode timestamps for rename + * @old_dir: old parent directory + * @old_dentry: dentry that is being renamed + * @new_dir: new parent directory + * @new_dentry: target for rename + * + * POSIX mandates that the old and new parent directories have their ctime and + * mtime updated, and that inodes of @old_dentry and @new_dentry (if any), have + * their ctime updated. + */ +void simple_rename_timestamp(struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry) +{ + struct inode *newino = d_inode(new_dentry); + + inode_set_mtime_to_ts(old_dir, inode_set_ctime_current(old_dir)); + if (new_dir != old_dir) + inode_set_mtime_to_ts(new_dir, + inode_set_ctime_current(new_dir)); + inode_set_ctime_current(d_inode(old_dentry)); + if (newino) + inode_set_ctime_current(newino); +} +EXPORT_SYMBOL_GPL(simple_rename_timestamp); + int simple_rename_exchange(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry) { @@ -459,11 +746,7 @@ int simple_rename_exchange(struct inode *old_dir, struct dentry *old_dentry, inc_nlink(old_dir); } } - old_dir->i_ctime = old_dir->i_mtime = - new_dir->i_ctime = new_dir->i_mtime = - d_inode(old_dentry)->i_ctime = - d_inode(new_dentry)->i_ctime = current_time(old_dir); - + simple_rename_timestamp(old_dir, old_dentry, new_dir, new_dentry); return 0; } EXPORT_SYMBOL_GPL(simple_rename_exchange); @@ -472,7 +755,6 @@ int simple_rename(struct mnt_idmap *idmap, struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags) { - struct inode *inode = d_inode(old_dentry); int they_are_dirs = d_is_dir(old_dentry); if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE)) @@ -495,9 +777,7 @@ int simple_rename(struct mnt_idmap *idmap, struct inode *old_dir, inc_nlink(new_dir); } - old_dir->i_ctime = old_dir->i_mtime = new_dir->i_ctime = - new_dir->i_mtime = inode->i_ctime = current_time(old_dir); - + simple_rename_timestamp(old_dir, old_dentry, new_dir, new_dentry); return 0; } EXPORT_SYMBOL(simple_rename); @@ -548,21 +828,20 @@ int simple_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, struct page **pagep, void **fsdata) { - struct page *page; - pgoff_t index; + struct folio *folio; - index = pos >> PAGE_SHIFT; + folio = __filemap_get_folio(mapping, pos / PAGE_SIZE, FGP_WRITEBEGIN, + mapping_gfp_mask(mapping)); + if (IS_ERR(folio)) + return PTR_ERR(folio); - page = grab_cache_page_write_begin(mapping, index); - if (!page) - return -ENOMEM; + *pagep = &folio->page; - *pagep = page; + if (!folio_test_uptodate(folio) && (len != folio_size(folio))) { + size_t from = offset_in_folio(folio, pos); - if (!PageUptodate(page) && (len != PAGE_SIZE)) { - unsigned from = pos & (PAGE_SIZE - 1); - - zero_user_segments(page, 0, from, from + len, PAGE_SIZE); + folio_zero_segments(folio, 0, from, + from + len, folio_size(folio)); } return 0; } @@ -594,17 +873,18 @@ static int simple_write_end(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, struct page *page, void *fsdata) { - struct inode *inode = page->mapping->host; + struct folio *folio = page_folio(page); + struct inode *inode = folio->mapping->host; loff_t last_pos = pos + copied; - /* zero the stale part of the page if we did a short copy */ - if (!PageUptodate(page)) { + /* zero the stale part of the folio if we did a short copy */ + if (!folio_test_uptodate(folio)) { if (copied < len) { - unsigned from = pos & (PAGE_SIZE - 1); + size_t from = offset_in_folio(folio, pos); - zero_user(page, from + copied, len - copied); + folio_zero_range(folio, from + copied, len - copied); } - SetPageUptodate(page); + folio_mark_uptodate(folio); } /* * No need to use i_size_read() here, the i_size @@ -613,9 +893,9 @@ static int simple_write_end(struct file *file, struct address_space *mapping, if (last_pos > inode->i_size) i_size_write(inode, last_pos); - set_page_dirty(page); - unlock_page(page); - put_page(page); + folio_mark_dirty(folio); + folio_unlock(folio); + folio_put(folio); return copied; } @@ -659,7 +939,7 @@ int simple_fill_super(struct super_block *s, unsigned long magic, */ inode->i_ino = 1; inode->i_mode = S_IFDIR | 0755; - inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode); + simple_inode_init_ts(inode); inode->i_op = &simple_dir_inode_operations; inode->i_fop = &simple_dir_operations; set_nlink(inode, 2); @@ -685,7 +965,7 @@ int simple_fill_super(struct super_block *s, unsigned long magic, goto out; } inode->i_mode = S_IFREG | files->mode; - inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode); + simple_inode_init_ts(inode); inode->i_fop = files->ops; inode->i_ino = i; d_add(dentry, inode); @@ -1041,6 +1321,47 @@ ssize_t simple_attr_write_signed(struct file *file, const char __user *buf, EXPORT_SYMBOL_GPL(simple_attr_write_signed); /** + * generic_encode_ino32_fh - generic export_operations->encode_fh function + * @inode: the object to encode + * @fh: where to store the file handle fragment + * @max_len: maximum length to store there (in 4 byte units) + * @parent: parent directory inode, if wanted + * + * This generic encode_fh function assumes that the 32 inode number + * is suitable for locating an inode, and that the generation number + * can be used to check that it is still valid. It places them in the + * filehandle fragment where export_decode_fh expects to find them. + */ +int generic_encode_ino32_fh(struct inode *inode, __u32 *fh, int *max_len, + struct inode *parent) +{ + struct fid *fid = (void *)fh; + int len = *max_len; + int type = FILEID_INO32_GEN; + + if (parent && (len < 4)) { + *max_len = 4; + return FILEID_INVALID; + } else if (len < 2) { + *max_len = 2; + return FILEID_INVALID; + } + + len = 2; + fid->i32.ino = inode->i_ino; + fid->i32.gen = inode->i_generation; + if (parent) { + fid->i32.parent_ino = parent->i_ino; + fid->i32.parent_gen = parent->i_generation; + len = 4; + type = FILEID_INO32_GEN_PARENT; + } + *max_len = len; + return type; +} +EXPORT_SYMBOL_GPL(generic_encode_ino32_fh); + +/** * generic_fh_to_dentry - generic helper for the fh_to_dentry export operation * @sb: filesystem to do the file handle conversion on * @fid: file handle to convert @@ -1253,7 +1574,7 @@ struct inode *alloc_anon_inode(struct super_block *s) inode->i_uid = current_fsuid(); inode->i_gid = current_fsgid(); inode->i_flags |= S_PRIVATE; - inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode); + simple_inode_init_ts(inode); return inode; } EXPORT_SYMBOL(alloc_anon_inode); @@ -1269,7 +1590,7 @@ EXPORT_SYMBOL(alloc_anon_inode); * All arguments are ignored and it just returns -EINVAL. */ int -simple_nosetlease(struct file *filp, long arg, struct file_lock **flp, +simple_nosetlease(struct file *filp, int arg, struct file_lock **flp, void **priv) { return -EINVAL; @@ -1315,7 +1636,7 @@ static int empty_dir_getattr(struct mnt_idmap *idmap, u32 request_mask, unsigned int query_flags) { struct inode *inode = d_inode(path->dentry); - generic_fillattr(&nop_mnt_idmap, inode, stat); + generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat); return 0; } @@ -1381,16 +1702,6 @@ bool is_empty_dir_inode(struct inode *inode) } #if IS_ENABLED(CONFIG_UNICODE) -/* - * Determine if the name of a dentry should be casefolded. - * - * Return: if names will need casefolding - */ -static bool needs_casefold(const struct inode *dir) -{ - return IS_CASEFOLDED(dir) && dir->i_sb->s_encoding; -} - /** * generic_ci_d_compare - generic d_compare implementation for casefolding filesystems * @dentry: dentry whose name we are checking against @@ -1411,7 +1722,7 @@ static int generic_ci_d_compare(const struct dentry *dentry, unsigned int len, char strbuf[DNAME_INLINE_LEN]; int ret; - if (!dir || !needs_casefold(dir)) + if (!dir || !IS_CASEFOLDED(dir)) goto fallback; /* * If the dentry name is stored in-line, then it may be concurrently @@ -1453,7 +1764,7 @@ static int generic_ci_d_hash(const struct dentry *dentry, struct qstr *str) const struct unicode_map *um = sb->s_encoding; int ret = 0; - if (!dir || !needs_casefold(dir)) + if (!dir || !IS_CASEFOLDED(dir)) return 0; ret = utf8_casefold_hash(um, dentry, str); @@ -1646,6 +1957,7 @@ ssize_t direct_write_fallback(struct kiocb *iocb, struct iov_iter *iter, * We don't know how much we wrote, so just return the number of * bytes which were direct-written */ + iocb->ki_pos -= buffered_written; if (direct_written) return direct_written; return err; @@ -1654,3 +1966,20 @@ ssize_t direct_write_fallback(struct kiocb *iocb, struct iov_iter *iter, return direct_written + buffered_written; } EXPORT_SYMBOL_GPL(direct_write_fallback); + +/** + * simple_inode_init_ts - initialize the timestamps for a new inode + * @inode: inode to be initialized + * + * When a new inode is created, most filesystems set the timestamps to the + * current time. Add a helper to do this. + */ +struct timespec64 simple_inode_init_ts(struct inode *inode) +{ + struct timespec64 ts = inode_set_ctime_current(inode); + + inode_set_atime_to_ts(inode, ts); + inode_set_mtime_to_ts(inode, ts); + return ts; +} +EXPORT_SYMBOL(simple_inode_init_ts); diff --git a/fs/lockd/mon.c b/fs/lockd/mon.c index 1d9488cf0534..87a0f207df0b 100644 --- a/fs/lockd/mon.c +++ b/fs/lockd/mon.c @@ -276,6 +276,9 @@ static struct nsm_handle *nsm_create_handle(const struct sockaddr *sap, { struct nsm_handle *new; + if (!hostname) + return NULL; + new = kzalloc(sizeof(*new) + hostname_len + 1, GFP_KERNEL); if (unlikely(new == NULL)) return NULL; diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c index 22d3ff3818f5..81be07c1d3d1 100644 --- a/fs/lockd/svc.c +++ b/fs/lockd/svc.c @@ -24,7 +24,6 @@ #include <linux/uio.h> #include <linux/smp.h> #include <linux/mutex.h> -#include <linux/kthread.h> #include <linux/freezer.h> #include <linux/inetdevice.h> @@ -45,7 +44,6 @@ #define NLMDBG_FACILITY NLMDBG_SVC #define LOCKD_BUFSIZE (1024 + NLMSVC_XDRSIZE) -#define ALLOWED_SIGS (sigmask(SIGKILL)) static struct svc_program nlmsvc_program; @@ -57,6 +55,12 @@ static unsigned int nlmsvc_users; static struct svc_serv *nlmsvc_serv; unsigned long nlmsvc_timeout; +static void nlmsvc_request_retry(struct timer_list *tl) +{ + svc_wake_up(nlmsvc_serv); +} +DEFINE_TIMER(nlmsvc_retry, nlmsvc_request_retry); + unsigned int lockd_net_id; /* @@ -111,26 +115,12 @@ static void set_grace_period(struct net *net) schedule_delayed_work(&ln->grace_period_end, grace_period); } -static void restart_grace(void) -{ - if (nlmsvc_ops) { - struct net *net = &init_net; - struct lockd_net *ln = net_generic(net, lockd_net_id); - - cancel_delayed_work_sync(&ln->grace_period_end); - locks_end_grace(&ln->lockd_manager); - nlmsvc_invalidate_all(); - set_grace_period(net); - } -} - /* * This is the lockd kernel thread */ static int lockd(void *vrqstp) { - int err = 0; struct svc_rqst *rqstp = vrqstp; struct net *net = &init_net; struct lockd_net *ln = net_generic(net, lockd_net_id); @@ -138,43 +128,19 @@ lockd(void *vrqstp) /* try_to_freeze() is called from svc_recv() */ set_freezable(); - /* Allow SIGKILL to tell lockd to drop all of its locks */ - allow_signal(SIGKILL); - dprintk("NFS locking service started (ver " LOCKD_VERSION ").\n"); /* * The main request loop. We don't terminate until the last * NFS mount or NFS daemon has gone away. */ - while (!kthread_should_stop()) { - long timeout = MAX_SCHEDULE_TIMEOUT; - RPC_IFDEBUG(char buf[RPC_MAX_ADDRBUFLEN]); - + while (!svc_thread_should_stop(rqstp)) { /* update sv_maxconn if it has changed */ rqstp->rq_server->sv_maxconn = nlm_max_connections; - if (signalled()) { - flush_signals(current); - restart_grace(); - continue; - } - - timeout = nlmsvc_retry_blocked(); - - /* - * Find a socket with data available and call its - * recvfrom routine. - */ - err = svc_recv(rqstp, timeout); - if (err == -EAGAIN || err == -EINTR) - continue; - dprintk("lockd: request from %s\n", - svc_print_addr(rqstp, buf, sizeof(buf))); - - svc_process(rqstp); + nlmsvc_retry_blocked(rqstp); + svc_recv(rqstp); } - flush_signals(current); if (nlmsvc_ops) nlmsvc_invalidate_all(); nlm_shutdown_hosts(); @@ -406,7 +372,10 @@ static void lockd_put(void) unregister_inet6addr_notifier(&lockd_inet6addr_notifier); #endif + svc_get(nlmsvc_serv); svc_set_num_threads(nlmsvc_serv, NULL, 0); + svc_put(nlmsvc_serv); + timer_delete_sync(&nlmsvc_retry); nlmsvc_serv = NULL; dprintk("lockd_down: service destroyed\n"); } @@ -538,7 +507,7 @@ static inline int is_callback(u32 proc) } -static int lockd_authenticate(struct svc_rqst *rqstp) +static enum svc_auth_status lockd_authenticate(struct svc_rqst *rqstp) { rqstp->rq_client = NULL; switch (rqstp->rq_authop->flavour) { diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c index c43ccdf28ed9..2dc10900ad1c 100644 --- a/fs/lockd/svclock.c +++ b/fs/lockd/svclock.c @@ -30,7 +30,6 @@ #include <linux/sunrpc/svc_xprt.h> #include <linux/lockd/nlm.h> #include <linux/lockd/lockd.h> -#include <linux/kthread.h> #include <linux/exportfs.h> #define NLMDBG_FACILITY NLMDBG_SVCLOCK @@ -131,12 +130,14 @@ static void nlmsvc_insert_block(struct nlm_block *block, unsigned long when) static inline void nlmsvc_remove_block(struct nlm_block *block) { + spin_lock(&nlm_blocked_lock); if (!list_empty(&block->b_list)) { - spin_lock(&nlm_blocked_lock); list_del_init(&block->b_list); spin_unlock(&nlm_blocked_lock); nlmsvc_release_block(block); + return; } + spin_unlock(&nlm_blocked_lock); } /* @@ -152,6 +153,7 @@ nlmsvc_lookup_block(struct nlm_file *file, struct nlm_lock *lock) file, lock->fl.fl_pid, (long long)lock->fl.fl_start, (long long)lock->fl.fl_end, lock->fl.fl_type); + spin_lock(&nlm_blocked_lock); list_for_each_entry(block, &nlm_blocked, b_list) { fl = &block->b_call->a_args.lock.fl; dprintk("lockd: check f=%p pd=%d %Ld-%Ld ty=%d cookie=%s\n", @@ -161,9 +163,11 @@ nlmsvc_lookup_block(struct nlm_file *file, struct nlm_lock *lock) nlmdbg_cookie2a(&block->b_call->a_args.cookie)); if (block->b_file == file && nlm_compare_locks(fl, &lock->fl)) { kref_get(&block->b_count); + spin_unlock(&nlm_blocked_lock); return block; } } + spin_unlock(&nlm_blocked_lock); return NULL; } @@ -185,16 +189,19 @@ nlmsvc_find_block(struct nlm_cookie *cookie) { struct nlm_block *block; + spin_lock(&nlm_blocked_lock); list_for_each_entry(block, &nlm_blocked, b_list) { if (nlm_cookie_match(&block->b_call->a_args.cookie,cookie)) goto found; } + spin_unlock(&nlm_blocked_lock); return NULL; found: dprintk("nlmsvc_find_block(%s): block=%p\n", nlmdbg_cookie2a(cookie), block); kref_get(&block->b_count); + spin_unlock(&nlm_blocked_lock); return block; } @@ -317,6 +324,7 @@ void nlmsvc_traverse_blocks(struct nlm_host *host, restart: mutex_lock(&file->f_mutex); + spin_lock(&nlm_blocked_lock); list_for_each_entry_safe(block, next, &file->f_blocks, b_flist) { if (!match(block->b_host, host)) continue; @@ -325,11 +333,13 @@ restart: if (list_empty(&block->b_list)) continue; kref_get(&block->b_count); + spin_unlock(&nlm_blocked_lock); mutex_unlock(&file->f_mutex); nlmsvc_unlink_block(block); nlmsvc_release_block(block); goto restart; } + spin_unlock(&nlm_blocked_lock); mutex_unlock(&file->f_mutex); } @@ -470,9 +480,7 @@ nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file, struct nlm_host *host, struct nlm_lock *lock, int wait, struct nlm_cookie *cookie, int reclaim) { -#if IS_ENABLED(CONFIG_SUNRPC_DEBUG) struct inode *inode = nlmsvc_file_inode(file); -#endif struct nlm_block *block = NULL; int error; int mode; @@ -486,7 +494,7 @@ nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file, (long long)lock->fl.fl_end, wait); - if (nlmsvc_file_file(file)->f_op->lock) { + if (!exportfs_lock_op_is_async(inode->i_sb->s_export_op)) { async_block = wait; wait = 0; } @@ -532,6 +540,25 @@ nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file, goto out; } + spin_lock(&nlm_blocked_lock); + /* + * If this is a lock request for an already pending + * lock request we return nlm_lck_blocked without calling + * vfs_lock_file() again. Otherwise we have two pending + * requests on the underlaying ->lock() implementation but + * only one nlm_block to being granted by lm_grant(). + */ + if (exportfs_lock_op_is_async(inode->i_sb->s_export_op) && + !list_empty(&block->b_list)) { + spin_unlock(&nlm_blocked_lock); + ret = nlm_lck_blocked; + goto out; + } + + /* Append to list of blocked */ + nlmsvc_insert_block_locked(block, NLM_NEVER); + spin_unlock(&nlm_blocked_lock); + if (!wait) lock->fl.fl_flags &= ~FL_SLEEP; mode = lock_to_openmode(&lock->fl); @@ -541,16 +568,12 @@ nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file, dprintk("lockd: vfs_lock_file returned %d\n", error); switch (error) { case 0: + nlmsvc_remove_block(block); ret = nlm_granted; goto out; case -EAGAIN: - /* - * If this is a blocking request for an - * already pending lock request then we need - * to put it back on lockd's block list - */ - if (wait) - break; + if (!wait) + nlmsvc_remove_block(block); ret = async_block ? nlm_lck_blocked : nlm_lck_denied; goto out; case FILE_LOCK_DEFERRED: @@ -561,17 +584,16 @@ nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file, ret = nlmsvc_defer_lock_rqst(rqstp, block); goto out; case -EDEADLK: + nlmsvc_remove_block(block); ret = nlm_deadlock; goto out; default: /* includes ENOLCK */ + nlmsvc_remove_block(block); ret = nlm_lck_denied_nolocks; goto out; } ret = nlm_lck_blocked; - - /* Append to list of blocked */ - nlmsvc_insert_block(block, NLM_NEVER); out: mutex_unlock(&file->f_mutex); nlmsvc_release_block(block); @@ -1008,14 +1030,14 @@ retry_deferred_block(struct nlm_block *block) * picks up locks that can be granted, or grant notifications that must * be retransmitted. */ -unsigned long -nlmsvc_retry_blocked(void) +void +nlmsvc_retry_blocked(struct svc_rqst *rqstp) { unsigned long timeout = MAX_SCHEDULE_TIMEOUT; struct nlm_block *block; spin_lock(&nlm_blocked_lock); - while (!list_empty(&nlm_blocked) && !kthread_should_stop()) { + while (!list_empty(&nlm_blocked) && !svc_thread_should_stop(rqstp)) { block = list_entry(nlm_blocked.next, struct nlm_block, b_list); if (block->b_when == NLM_NEVER) @@ -1038,5 +1060,6 @@ nlmsvc_retry_blocked(void) } spin_unlock(&nlm_blocked_lock); - return timeout; + if (timeout < MAX_SCHEDULE_TIMEOUT) + mod_timer(&nlmsvc_retry, jiffies + timeout); } diff --git a/fs/locks.c b/fs/locks.c index df8b26a42524..46d88b9e222c 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -167,8 +167,8 @@ static DEFINE_HASHTABLE(blocked_hash, BLOCKED_HASH_BITS); */ static DEFINE_SPINLOCK(blocked_lock_lock); -static struct kmem_cache *flctx_cache __read_mostly; -static struct kmem_cache *filelock_cache __read_mostly; +static struct kmem_cache *flctx_cache __ro_after_init; +static struct kmem_cache *filelock_cache __ro_after_init; static struct file_lock_context * locks_get_lock_context(struct inode *inode, int type) @@ -438,7 +438,7 @@ static void flock_make_lock(struct file *filp, struct file_lock *fl, int type) fl->fl_end = OFFSET_MAX; } -static int assign_type(struct file_lock *fl, long type) +static int assign_type(struct file_lock *fl, int type) { switch (type) { case F_RDLCK: @@ -549,7 +549,7 @@ static const struct lock_manager_operations lease_manager_ops = { /* * Initialize a lease, use the default lock manager operations */ -static int lease_init(struct file *filp, long type, struct file_lock *fl) +static int lease_init(struct file *filp, int type, struct file_lock *fl) { if (assign_type(fl, type) != 0) return -EINVAL; @@ -567,7 +567,7 @@ static int lease_init(struct file *filp, long type, struct file_lock *fl) } /* Allocate a file_lock initialised to this type of lease */ -static struct file_lock *lease_alloc(struct file *filp, long type) +static struct file_lock *lease_alloc(struct file *filp, int type) { struct file_lock *fl = locks_alloc_lock(); int error = -ENOMEM; @@ -868,6 +868,21 @@ static bool posix_locks_conflict(struct file_lock *caller_fl, return locks_conflict(caller_fl, sys_fl); } +/* Determine if lock sys_fl blocks lock caller_fl. Used on xx_GETLK + * path so checks for additional GETLK-specific things like F_UNLCK. + */ +static bool posix_test_locks_conflict(struct file_lock *caller_fl, + struct file_lock *sys_fl) +{ + /* F_UNLCK checks any locks on the same fd. */ + if (caller_fl->fl_type == F_UNLCK) { + if (!posix_same_owner(caller_fl, sys_fl)) + return false; + return locks_overlap(caller_fl, sys_fl); + } + return posix_locks_conflict(caller_fl, sys_fl); +} + /* Determine if lock sys_fl blocks lock caller_fl. FLOCK specific * checking before calling the locks_conflict(). */ @@ -901,7 +916,7 @@ posix_test_lock(struct file *filp, struct file_lock *fl) retry: spin_lock(&ctx->flc_lock); list_for_each_entry(cfl, &ctx->flc_posix, fl_list) { - if (!posix_locks_conflict(fl, cfl)) + if (!posix_test_locks_conflict(fl, cfl)) continue; if (cfl->fl_lmops && cfl->fl_lmops->lm_lock_expirable && (*cfl->fl_lmops->lm_lock_expirable)(cfl)) { @@ -1301,6 +1316,7 @@ retry: out: spin_unlock(&ctx->flc_lock); percpu_up_read(&file_rwsem); + trace_posix_lock_inode(inode, request, error); /* * Free any unused locks. */ @@ -1309,7 +1325,6 @@ retry: if (new_fl2) locks_free_lock(new_fl2); locks_dispose_list(&dispose); - trace_posix_lock_inode(inode, request, error); return error; } @@ -1666,7 +1681,7 @@ int fcntl_getlease(struct file *filp) * conflict with the lease we're trying to set. */ static int -check_conflicting_open(struct file *filp, const long arg, int flags) +check_conflicting_open(struct file *filp, const int arg, int flags) { struct inode *inode = file_inode(filp); int self_wcount = 0, self_rcount = 0; @@ -1701,7 +1716,7 @@ check_conflicting_open(struct file *filp, const long arg, int flags) } static int -generic_add_lease(struct file *filp, long arg, struct file_lock **flp, void **priv) +generic_add_lease(struct file *filp, int arg, struct file_lock **flp, void **priv) { struct file_lock *fl, *my_fl = NULL, *lease; struct inode *inode = file_inode(filp); @@ -1729,13 +1744,6 @@ generic_add_lease(struct file *filp, long arg, struct file_lock **flp, void **pr if (is_deleg && !inode_trylock(inode)) return -EAGAIN; - if (is_deleg && arg == F_WRLCK) { - /* Write delegations are not currently supported: */ - inode_unlock(inode); - WARN_ON_ONCE(1); - return -EINVAL; - } - percpu_down_read(&file_rwsem); spin_lock(&ctx->flc_lock); time_out_leases(inode, &dispose); @@ -1859,7 +1867,7 @@ static int generic_delete_lease(struct file *filp, void *owner) * The (input) flp->fl_lmops->lm_break function is required * by break_lease(). */ -int generic_setlease(struct file *filp, long arg, struct file_lock **flp, +int generic_setlease(struct file *filp, int arg, struct file_lock **flp, void **priv) { struct inode *inode = file_inode(filp); @@ -1906,7 +1914,7 @@ lease_notifier_chain_init(void) } static inline void -setlease_notifier(long arg, struct file_lock *lease) +setlease_notifier(int arg, struct file_lock *lease) { if (arg != F_UNLCK) srcu_notifier_call_chain(&lease_notifier_chain, arg, lease); @@ -1942,7 +1950,7 @@ EXPORT_SYMBOL_GPL(lease_unregister_notifier); * may be NULL if the lm_setup operation doesn't require it. */ int -vfs_setlease(struct file *filp, long arg, struct file_lock **lease, void **priv) +vfs_setlease(struct file *filp, int arg, struct file_lock **lease, void **priv) { if (lease) setlease_notifier(arg, *lease); @@ -1953,7 +1961,7 @@ vfs_setlease(struct file *filp, long arg, struct file_lock **lease, void **priv) } EXPORT_SYMBOL_GPL(vfs_setlease); -static int do_fcntl_add_lease(unsigned int fd, struct file *filp, long arg) +static int do_fcntl_add_lease(unsigned int fd, struct file *filp, int arg) { struct file_lock *fl; struct fasync_struct *new; @@ -1988,7 +1996,7 @@ static int do_fcntl_add_lease(unsigned int fd, struct file *filp, long arg) * Note that you also need to call %F_SETSIG to * receive a signal when the lease is broken. */ -int fcntl_setlease(unsigned int fd, struct file *filp, long arg) +int fcntl_setlease(unsigned int fd, struct file *filp, int arg) { if (arg == F_UNLCK) return vfs_setlease(filp, F_UNLCK, NULL, (void **)&filp); @@ -2136,7 +2144,7 @@ EXPORT_SYMBOL_GPL(vfs_test_lock); * @fl: The file_lock who's fl_pid should be translated * @ns: The namespace into which the pid should be translated * - * Used to tranlate a fl_pid into a namespace virtual pid number + * Used to translate a fl_pid into a namespace virtual pid number */ static pid_t locks_translate_pid(struct file_lock *fl, struct pid_namespace *ns) { @@ -2207,7 +2215,8 @@ int fcntl_getlk(struct file *filp, unsigned int cmd, struct flock *flock) if (fl == NULL) return -ENOMEM; error = -EINVAL; - if (flock->l_type != F_RDLCK && flock->l_type != F_WRLCK) + if (cmd != F_OFD_GETLK && flock->l_type != F_RDLCK + && flock->l_type != F_WRLCK) goto out; error = flock_to_posix_lock(filp, fl, flock); @@ -2255,11 +2264,13 @@ out: * To avoid blocking kernel daemons, such as lockd, that need to acquire POSIX * locks, the ->lock() interface may return asynchronously, before the lock has * been granted or denied by the underlying filesystem, if (and only if) - * lm_grant is set. Callers expecting ->lock() to return asynchronously - * will only use F_SETLK, not F_SETLKW; they will set FL_SLEEP if (and only if) - * the request is for a blocking lock. When ->lock() does return asynchronously, - * it must return FILE_LOCK_DEFERRED, and call ->lm_grant() when the lock - * request completes. + * lm_grant is set. Additionally EXPORT_OP_ASYNC_LOCK in export_operations + * flags need to be set. + * + * Callers expecting ->lock() to return asynchronously will only use F_SETLK, + * not F_SETLKW; they will set FL_SLEEP if (and only if) the request is for a + * blocking lock. When ->lock() does return asynchronously, it must return + * FILE_LOCK_DEFERRED, and call ->lm_grant() when the lock request completes. * If the request is for non-blocking lock the file system should return * FILE_LOCK_DEFERRED then try to get the lock and call the callback routine * with the result. If the request timed out the callback routine will return a @@ -2414,7 +2425,8 @@ int fcntl_getlk64(struct file *filp, unsigned int cmd, struct flock64 *flock) return -ENOMEM; error = -EINVAL; - if (flock->l_type != F_RDLCK && flock->l_type != F_WRLCK) + if (cmd != F_OFD_GETLK && flock->l_type != F_RDLCK + && flock->l_type != F_WRLCK) goto out; error = flock64_to_posix_lock(filp, fl, flock); diff --git a/fs/mbcache.c b/fs/mbcache.c index 2a4b8b549e93..82aa7a35db26 100644 --- a/fs/mbcache.c +++ b/fs/mbcache.c @@ -37,7 +37,7 @@ struct mb_cache { struct list_head c_list; /* Number of entries in cache */ unsigned long c_entry_count; - struct shrinker c_shrink; + struct shrinker *c_shrink; /* Work for shrinking when the cache has too many entries */ struct work_struct c_shrink_work; }; @@ -293,8 +293,7 @@ EXPORT_SYMBOL(mb_cache_entry_touch); static unsigned long mb_cache_count(struct shrinker *shrink, struct shrink_control *sc) { - struct mb_cache *cache = container_of(shrink, struct mb_cache, - c_shrink); + struct mb_cache *cache = shrink->private_data; return cache->c_entry_count; } @@ -333,8 +332,7 @@ static unsigned long mb_cache_shrink(struct mb_cache *cache, static unsigned long mb_cache_scan(struct shrinker *shrink, struct shrink_control *sc) { - struct mb_cache *cache = container_of(shrink, struct mb_cache, - c_shrink); + struct mb_cache *cache = shrink->private_data; return mb_cache_shrink(cache, sc->nr_to_scan); } @@ -377,15 +375,19 @@ struct mb_cache *mb_cache_create(int bucket_bits) for (i = 0; i < bucket_count; i++) INIT_HLIST_BL_HEAD(&cache->c_hash[i]); - cache->c_shrink.count_objects = mb_cache_count; - cache->c_shrink.scan_objects = mb_cache_scan; - cache->c_shrink.seeks = DEFAULT_SEEKS; - if (register_shrinker(&cache->c_shrink, "mbcache-shrinker")) { + cache->c_shrink = shrinker_alloc(0, "mbcache-shrinker"); + if (!cache->c_shrink) { kfree(cache->c_hash); kfree(cache); goto err_out; } + cache->c_shrink->count_objects = mb_cache_count; + cache->c_shrink->scan_objects = mb_cache_scan; + cache->c_shrink->private_data = cache; + + shrinker_register(cache->c_shrink); + INIT_WORK(&cache->c_shrink_work, mb_cache_shrink_worker); return cache; @@ -406,7 +408,7 @@ void mb_cache_destroy(struct mb_cache *cache) { struct mb_cache_entry *entry, *next; - unregister_shrinker(&cache->c_shrink); + shrinker_free(cache->c_shrink); /* * We don't bother with any locking. Cache must not be used at this diff --git a/fs/minix/Kconfig b/fs/minix/Kconfig index de2003974ff0..90ddfad2a75e 100644 --- a/fs/minix/Kconfig +++ b/fs/minix/Kconfig @@ -2,6 +2,7 @@ config MINIX_FS tristate "Minix file system support" depends on BLOCK + select BUFFER_HEAD help Minix is a simple operating system used in many classes about OS's. The minix file system (method to organize files on a hard disk diff --git a/fs/minix/bitmap.c b/fs/minix/bitmap.c index 870207ba23f1..7da66ca184f4 100644 --- a/fs/minix/bitmap.c +++ b/fs/minix/bitmap.c @@ -251,7 +251,7 @@ struct inode *minix_new_inode(const struct inode *dir, umode_t mode) } inode_init_owner(&nop_mnt_idmap, inode, dir, mode); inode->i_ino = j; - inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode); + simple_inode_init_ts(inode); inode->i_blocks = 0; memset(&minix_i(inode)->u, 0, sizeof(minix_i(inode)->u)); insert_inode_hash(inode); diff --git a/fs/minix/dir.c b/fs/minix/dir.c index bf9858f76b6a..62c313fc9a49 100644 --- a/fs/minix/dir.c +++ b/fs/minix/dir.c @@ -281,7 +281,7 @@ got_it: de->inode = inode->i_ino; } dir_commit_chunk(page, pos, sbi->s_dirsize); - dir->i_mtime = dir->i_ctime = current_time(dir); + inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir)); mark_inode_dirty(dir); err = minix_handle_dirsync(dir); out_put: @@ -313,7 +313,7 @@ int minix_delete_entry(struct minix_dir_entry *de, struct page *page) else de->inode = 0; dir_commit_chunk(page, pos, len); - inode->i_ctime = inode->i_mtime = current_time(inode); + inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); mark_inode_dirty(inode); return minix_handle_dirsync(inode); } @@ -436,7 +436,7 @@ int minix_set_link(struct minix_dir_entry *de, struct page *page, else de->inode = inode->i_ino; dir_commit_chunk(page, pos, sbi->s_dirsize); - dir->i_mtime = dir->i_ctime = current_time(dir); + inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir)); mark_inode_dirty(dir); return minix_handle_dirsync(dir); } diff --git a/fs/minix/inode.c b/fs/minix/inode.c index e9fbb5303a22..f8af6c3ae336 100644 --- a/fs/minix/inode.c +++ b/fs/minix/inode.c @@ -501,10 +501,8 @@ static struct inode *V1_minix_iget(struct inode *inode) i_gid_write(inode, raw_inode->i_gid); set_nlink(inode, raw_inode->i_nlinks); inode->i_size = raw_inode->i_size; - inode->i_mtime.tv_sec = inode->i_atime.tv_sec = inode->i_ctime.tv_sec = raw_inode->i_time; - inode->i_mtime.tv_nsec = 0; - inode->i_atime.tv_nsec = 0; - inode->i_ctime.tv_nsec = 0; + inode_set_mtime_to_ts(inode, + inode_set_atime_to_ts(inode, inode_set_ctime(inode, raw_inode->i_time, 0))); inode->i_blocks = 0; for (i = 0; i < 9; i++) minix_inode->u.i1_data[i] = raw_inode->i_zone[i]; @@ -541,12 +539,9 @@ static struct inode *V2_minix_iget(struct inode *inode) i_gid_write(inode, raw_inode->i_gid); set_nlink(inode, raw_inode->i_nlinks); inode->i_size = raw_inode->i_size; - inode->i_mtime.tv_sec = raw_inode->i_mtime; - inode->i_atime.tv_sec = raw_inode->i_atime; - inode->i_ctime.tv_sec = raw_inode->i_ctime; - inode->i_mtime.tv_nsec = 0; - inode->i_atime.tv_nsec = 0; - inode->i_ctime.tv_nsec = 0; + inode_set_mtime(inode, raw_inode->i_mtime, 0); + inode_set_atime(inode, raw_inode->i_atime, 0); + inode_set_ctime(inode, raw_inode->i_ctime, 0); inode->i_blocks = 0; for (i = 0; i < 10; i++) minix_inode->u.i2_data[i] = raw_inode->i_zone[i]; @@ -593,7 +588,7 @@ static struct buffer_head * V1_minix_update_inode(struct inode * inode) raw_inode->i_gid = fs_high2lowgid(i_gid_read(inode)); raw_inode->i_nlinks = inode->i_nlink; raw_inode->i_size = inode->i_size; - raw_inode->i_time = inode->i_mtime.tv_sec; + raw_inode->i_time = inode_get_mtime_sec(inode); if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) raw_inode->i_zone[0] = old_encode_dev(inode->i_rdev); else for (i = 0; i < 9; i++) @@ -620,9 +615,9 @@ static struct buffer_head * V2_minix_update_inode(struct inode * inode) raw_inode->i_gid = fs_high2lowgid(i_gid_read(inode)); raw_inode->i_nlinks = inode->i_nlink; raw_inode->i_size = inode->i_size; - raw_inode->i_mtime = inode->i_mtime.tv_sec; - raw_inode->i_atime = inode->i_atime.tv_sec; - raw_inode->i_ctime = inode->i_ctime.tv_sec; + raw_inode->i_mtime = inode_get_mtime_sec(inode); + raw_inode->i_atime = inode_get_atime_sec(inode); + raw_inode->i_ctime = inode_get_ctime_sec(inode); if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) raw_inode->i_zone[0] = old_encode_dev(inode->i_rdev); else for (i = 0; i < 10; i++) @@ -660,7 +655,7 @@ int minix_getattr(struct mnt_idmap *idmap, const struct path *path, struct super_block *sb = path->dentry->d_sb; struct inode *inode = d_inode(path->dentry); - generic_fillattr(&nop_mnt_idmap, inode, stat); + generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat); if (INODE_VERSION(inode) == MINIX_V1) stat->blocks = (BLOCK_SIZE / 512) * V1_minix_blocks(stat->size, sb); else diff --git a/fs/minix/itree_common.c b/fs/minix/itree_common.c index 446148792f41..dad131e30c05 100644 --- a/fs/minix/itree_common.c +++ b/fs/minix/itree_common.c @@ -131,7 +131,7 @@ static inline int splice_branch(struct inode *inode, /* We are done with atomic stuff, now do the rest of housekeeping */ - inode->i_ctime = current_time(inode); + inode_set_ctime_current(inode); /* had we spliced it onto indirect block? */ if (where->bh) @@ -350,7 +350,7 @@ do_indirects: } first_whole++; } - inode->i_mtime = inode->i_ctime = current_time(inode); + inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); mark_inode_dirty(inode); } diff --git a/fs/minix/namei.c b/fs/minix/namei.c index 956d5183828d..114084d5636a 100644 --- a/fs/minix/namei.c +++ b/fs/minix/namei.c @@ -98,7 +98,7 @@ static int minix_link(struct dentry * old_dentry, struct inode * dir, { struct inode *inode = d_inode(old_dentry); - inode->i_ctime = current_time(inode); + inode_set_ctime_current(inode); inode_inc_link_count(inode); ihold(inode); return add_nondir(dentry, inode); @@ -154,7 +154,7 @@ static int minix_unlink(struct inode * dir, struct dentry *dentry) if (err) return err; - inode->i_ctime = dir->i_ctime; + inode_set_ctime_to_ts(inode, inode_get_ctime(dir)); inode_dec_link_count(inode); return 0; } @@ -218,7 +218,7 @@ static int minix_rename(struct mnt_idmap *idmap, put_page(new_page); if (err) goto out_dir; - new_inode->i_ctime = current_time(new_inode); + inode_set_ctime_current(new_inode); if (dir_de) drop_nlink(new_inode); inode_dec_link_count(new_inode); diff --git a/fs/mnt_idmapping.c b/fs/mnt_idmapping.c index 4905665c47d0..57d1dedf3f8f 100644 --- a/fs/mnt_idmapping.c +++ b/fs/mnt_idmapping.c @@ -256,6 +256,7 @@ struct mnt_idmap *mnt_idmap_get(struct mnt_idmap *idmap) return idmap; } +EXPORT_SYMBOL_GPL(mnt_idmap_get); /** * mnt_idmap_put - put a reference to an idmapping @@ -271,3 +272,4 @@ void mnt_idmap_put(struct mnt_idmap *idmap) kfree(idmap); } } +EXPORT_SYMBOL_GPL(mnt_idmap_put); diff --git a/fs/mpage.c b/fs/mpage.c index 242e213ee064..ffb064ed9d04 100644 --- a/fs/mpage.c +++ b/fs/mpage.c @@ -119,8 +119,7 @@ static void map_buffer_to_folio(struct folio *folio, struct buffer_head *bh, folio_mark_uptodate(folio); return; } - create_empty_buffers(&folio->page, i_blocksize(inode), 0); - head = folio_buffers(folio); + head = create_empty_buffers(folio, i_blocksize(inode), 0); } page_bh = head; diff --git a/fs/namei.c b/fs/namei.c index e56ff39a79bc..71c13b2990b4 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -188,7 +188,7 @@ getname_flags(const char __user *filename, int flags, int *empty) } } - result->refcnt = 1; + atomic_set(&result->refcnt, 1); /* The empty path is special. */ if (unlikely(!len)) { if (empty) @@ -249,7 +249,7 @@ getname_kernel(const char * filename) memcpy((char *)result->name, filename, len); result->uptr = NULL; result->aname = NULL; - result->refcnt = 1; + atomic_set(&result->refcnt, 1); audit_getname(result); return result; @@ -261,9 +261,10 @@ void putname(struct filename *name) if (IS_ERR(name)) return; - BUG_ON(name->refcnt <= 0); + if (WARN_ON_ONCE(!atomic_read(&name->refcnt))) + return; - if (--name->refcnt > 0) + if (!atomic_dec_and_test(&name->refcnt)) return; if (name->name != name->iname) { @@ -643,6 +644,8 @@ static bool nd_alloc_stack(struct nameidata *nd) /** * path_connected - Verify that a dentry is below mnt.mnt_root + * @mnt: The mountpoint to check. + * @dentry: The dentry to check. * * Rename can sometimes move a file or directory outside of a bind * mount, path_connected allows those cases to be detected. @@ -1083,6 +1086,7 @@ fs_initcall(init_fs_namei_sysctls); /** * may_follow_link - Check symlink following for unsafe situations * @nd: nameidata pathwalk data + * @inode: Used for idmapping. * * In the case of the sysctl_protected_symlinks sysctl being enabled, * CAP_DAC_OVERRIDE needs to be specifically ignored if the symlink is @@ -2890,7 +2894,7 @@ int path_pts(struct path *path) dput(path->dentry); path->dentry = parent; child = d_hash_and_lookup(parent, &this); - if (!child) + if (IS_ERR_OR_NULL(child)) return -ENOENT; path->dentry = child; @@ -3101,25 +3105,6 @@ void unlock_rename(struct dentry *p1, struct dentry *p2) EXPORT_SYMBOL(unlock_rename); /** - * mode_strip_umask - handle vfs umask stripping - * @dir: parent directory of the new inode - * @mode: mode of the new inode to be created in @dir - * - * Umask stripping depends on whether or not the filesystem supports POSIX - * ACLs. If the filesystem doesn't support it umask stripping is done directly - * in here. If the filesystem does support POSIX ACLs umask stripping is - * deferred until the filesystem calls posix_acl_create(). - * - * Returns: mode - */ -static inline umode_t mode_strip_umask(const struct inode *dir, umode_t mode) -{ - if (!IS_POSIXACL(dir)) - mode &= ~current_umask(); - return mode; -} - -/** * vfs_prepare_mode - prepare the mode to be used for a new inode * @idmap: idmap of the mount the inode was found from * @dir: parent directory of the new inode @@ -3532,7 +3517,8 @@ static const char *open_last_lookups(struct nameidata *nd, if (likely(dentry)) goto finish_lookup; - BUG_ON(nd->flags & LOOKUP_RCU); + if (WARN_ON_ONCE(nd->flags & LOOKUP_RCU)) + return ERR_PTR(-ECHILD); } else { /* create side of things */ if (nd->flags & LOOKUP_RCU) { @@ -3799,7 +3785,10 @@ static struct file *path_openat(struct nameidata *nd, WARN_ON(1); error = -EINVAL; } - fput(file); + if (unlikely(file->f_mode & FMODE_OPENED)) + fput(file); + else + release_empty_file(file); if (error == -EOPENSTALE) { if (flags & LOOKUP_RCU) error = -ECHILD; @@ -4383,11 +4372,9 @@ retry_deleg: if (!IS_ERR(dentry)) { /* Why not before? Because we want correct error value */ - if (last.name[last.len]) + if (last.name[last.len] || d_is_negative(dentry)) goto slashes; inode = dentry->d_inode; - if (d_is_negative(dentry)) - goto slashes; ihold(inode); error = security_path_unlink(&path, dentry); if (error) diff --git a/fs/namespace.c b/fs/namespace.c index e157efc54023..fbf0e596fcd3 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -39,10 +39,10 @@ /* Maximum number of mounts in a mount namespace */ static unsigned int sysctl_mount_max __read_mostly = 100000; -static unsigned int m_hash_mask __read_mostly; -static unsigned int m_hash_shift __read_mostly; -static unsigned int mp_hash_mask __read_mostly; -static unsigned int mp_hash_shift __read_mostly; +static unsigned int m_hash_mask __ro_after_init; +static unsigned int m_hash_shift __ro_after_init; +static unsigned int mp_hash_mask __ro_after_init; +static unsigned int mp_hash_shift __ro_after_init; static __initdata unsigned long mhash_entries; static int __init set_mhash_entries(char *str) @@ -68,9 +68,9 @@ static u64 event; static DEFINE_IDA(mnt_id_ida); static DEFINE_IDA(mnt_group_ida); -static struct hlist_head *mount_hashtable __read_mostly; -static struct hlist_head *mountpoint_hashtable __read_mostly; -static struct kmem_cache *mnt_cache __read_mostly; +static struct hlist_head *mount_hashtable __ro_after_init; +static struct hlist_head *mountpoint_hashtable __ro_after_init; +static struct kmem_cache *mnt_cache __ro_after_init; static DECLARE_RWSEM(namespace_sem); static HLIST_HEAD(unmounted); /* protected by namespace_sem */ static LIST_HEAD(ex_mountpoints); /* protected by namespace_sem */ @@ -86,7 +86,7 @@ struct mount_kattr { }; /* /sys/fs */ -struct kobject *fs_kobj; +struct kobject *fs_kobj __ro_after_init; EXPORT_SYMBOL_GPL(fs_kobj); /* @@ -330,16 +330,16 @@ static int mnt_is_readonly(struct vfsmount *mnt) * can determine when writes are able to occur to a filesystem. */ /** - * __mnt_want_write - get write access to a mount without freeze protection + * mnt_get_write_access - get write access to a mount without freeze protection * @m: the mount on which to take a write * * This tells the low-level filesystem that a write is about to be performed to * it, and makes sure that writes are allowed (mnt it read-write) before * returning success. This operation does not protect against filesystem being - * frozen. When the write operation is finished, __mnt_drop_write() must be + * frozen. When the write operation is finished, mnt_put_write_access() must be * called. This is effectively a refcount. */ -int __mnt_want_write(struct vfsmount *m) +int mnt_get_write_access(struct vfsmount *m) { struct mount *mnt = real_mount(m); int ret = 0; @@ -386,6 +386,7 @@ int __mnt_want_write(struct vfsmount *m) return ret; } +EXPORT_SYMBOL_GPL(mnt_get_write_access); /** * mnt_want_write - get write access to a mount @@ -401,7 +402,7 @@ int mnt_want_write(struct vfsmount *m) int ret; sb_start_write(m->mnt_sb); - ret = __mnt_want_write(m); + ret = mnt_get_write_access(m); if (ret) sb_end_write(m->mnt_sb); return ret; @@ -409,15 +410,15 @@ int mnt_want_write(struct vfsmount *m) EXPORT_SYMBOL_GPL(mnt_want_write); /** - * __mnt_want_write_file - get write access to a file's mount + * mnt_get_write_access_file - get write access to a file's mount * @file: the file who's mount on which to take a write * - * This is like __mnt_want_write, but if the file is already open for writing it + * This is like mnt_get_write_access, but if @file is already open for write it * skips incrementing mnt_writers (since the open file already has a reference) * and instead only does the check for emergency r/o remounts. This must be - * paired with __mnt_drop_write_file. + * paired with mnt_put_write_access_file. */ -int __mnt_want_write_file(struct file *file) +int mnt_get_write_access_file(struct file *file) { if (file->f_mode & FMODE_WRITER) { /* @@ -428,7 +429,7 @@ int __mnt_want_write_file(struct file *file) return -EROFS; return 0; } - return __mnt_want_write(file->f_path.mnt); + return mnt_get_write_access(file->f_path.mnt); } /** @@ -445,7 +446,7 @@ int mnt_want_write_file(struct file *file) int ret; sb_start_write(file_inode(file)->i_sb); - ret = __mnt_want_write_file(file); + ret = mnt_get_write_access_file(file); if (ret) sb_end_write(file_inode(file)->i_sb); return ret; @@ -453,19 +454,20 @@ int mnt_want_write_file(struct file *file) EXPORT_SYMBOL_GPL(mnt_want_write_file); /** - * __mnt_drop_write - give up write access to a mount + * mnt_put_write_access - give up write access to a mount * @mnt: the mount on which to give up write access * * Tells the low-level filesystem that we are done * performing writes to it. Must be matched with - * __mnt_want_write() call above. + * mnt_get_write_access() call above. */ -void __mnt_drop_write(struct vfsmount *mnt) +void mnt_put_write_access(struct vfsmount *mnt) { preempt_disable(); mnt_dec_writers(real_mount(mnt)); preempt_enable(); } +EXPORT_SYMBOL_GPL(mnt_put_write_access); /** * mnt_drop_write - give up write access to a mount @@ -477,20 +479,20 @@ void __mnt_drop_write(struct vfsmount *mnt) */ void mnt_drop_write(struct vfsmount *mnt) { - __mnt_drop_write(mnt); + mnt_put_write_access(mnt); sb_end_write(mnt->mnt_sb); } EXPORT_SYMBOL_GPL(mnt_drop_write); -void __mnt_drop_write_file(struct file *file) +void mnt_put_write_access_file(struct file *file) { if (!(file->f_mode & FMODE_WRITER)) - __mnt_drop_write(file->f_path.mnt); + mnt_put_write_access(file->f_path.mnt); } void mnt_drop_write_file(struct file *file) { - __mnt_drop_write_file(file); + mnt_put_write_access_file(file); sb_end_write(file_inode(file)->i_sb); } EXPORT_SYMBOL(mnt_drop_write_file); @@ -1344,9 +1346,9 @@ void mntput(struct vfsmount *mnt) { if (mnt) { struct mount *m = real_mount(mnt); - /* avoid cacheline pingpong, hope gcc doesn't get "smart" */ + /* avoid cacheline pingpong */ if (unlikely(m->mnt_expiry_mark)) - m->mnt_expiry_mark = 0; + WRITE_ONCE(m->mnt_expiry_mark, 0); mntput_no_expire(m); } } diff --git a/fs/netfs/buffered_read.c b/fs/netfs/buffered_read.c index 3404707ddbe7..2cd3ccf4c439 100644 --- a/fs/netfs/buffered_read.c +++ b/fs/netfs/buffered_read.c @@ -47,12 +47,14 @@ void netfs_rreq_unlock_folios(struct netfs_io_request *rreq) xas_for_each(&xas, folio, last_page) { loff_t pg_end; bool pg_failed = false; + bool folio_started; if (xas_retry(&xas, folio)) continue; pg_end = folio_pos(folio) + folio_size(folio) - 1; + folio_started = false; for (;;) { loff_t sreq_end; @@ -60,8 +62,10 @@ void netfs_rreq_unlock_folios(struct netfs_io_request *rreq) pg_failed = true; break; } - if (test_bit(NETFS_SREQ_COPY_TO_CACHE, &subreq->flags)) + if (!folio_started && test_bit(NETFS_SREQ_COPY_TO_CACHE, &subreq->flags)) { folio_start_fscache(folio); + folio_started = true; + } pg_failed |= subreq_failed; sreq_end = subreq->start + subreq->len - 1; if (pg_end < sreq_end) diff --git a/fs/nfs/Kconfig b/fs/nfs/Kconfig index b6fc169be1b1..01ac733a6320 100644 --- a/fs/nfs/Kconfig +++ b/fs/nfs/Kconfig @@ -125,7 +125,7 @@ config PNFS_BLOCK config PNFS_FLEXFILE_LAYOUT tristate - depends on NFS_V4_1 && NFS_V3 + depends on NFS_V4_1 default NFS_V4 config NFS_V4_1_IMPLEMENTATION_ID_DOMAIN @@ -209,8 +209,6 @@ config NFS_DISABLE_UDP_SUPPORT config NFS_V4_2_READ_PLUS bool "NFS: Enable support for the NFSv4.2 READ_PLUS operation" depends on NFS_V4_2 - default n + default y help - This is intended for developers only. The READ_PLUS operation has - been shown to have issues under specific conditions and should not - be used in production. + Choose Y here to enable use of the NFS v4.2 READ_PLUS operation. diff --git a/fs/nfs/blocklayout/blocklayout.h b/fs/nfs/blocklayout/blocklayout.h index 716bc75e9ed2..b4294a8aa2d4 100644 --- a/fs/nfs/blocklayout/blocklayout.h +++ b/fs/nfs/blocklayout/blocklayout.h @@ -108,7 +108,7 @@ struct pnfs_block_dev { struct pnfs_block_dev *children; u64 chunk_size; - struct block_device *bdev; + struct bdev_handle *bdev_handle; u64 disk_offset; u64 pr_key; diff --git a/fs/nfs/blocklayout/dev.c b/fs/nfs/blocklayout/dev.c index 70f5563a8e81..f318a05a80e1 100644 --- a/fs/nfs/blocklayout/dev.c +++ b/fs/nfs/blocklayout/dev.c @@ -25,17 +25,17 @@ bl_free_device(struct pnfs_block_dev *dev) } else { if (dev->pr_registered) { const struct pr_ops *ops = - dev->bdev->bd_disk->fops->pr_ops; + dev->bdev_handle->bdev->bd_disk->fops->pr_ops; int error; - error = ops->pr_register(dev->bdev, dev->pr_key, 0, - false); + error = ops->pr_register(dev->bdev_handle->bdev, + dev->pr_key, 0, false); if (error) pr_err("failed to unregister PR key.\n"); } - if (dev->bdev) - blkdev_put(dev->bdev, NULL); + if (dev->bdev_handle) + bdev_release(dev->bdev_handle); } } @@ -169,7 +169,7 @@ static bool bl_map_simple(struct pnfs_block_dev *dev, u64 offset, map->start = dev->start; map->len = dev->len; map->disk_offset = dev->disk_offset; - map->bdev = dev->bdev; + map->bdev = dev->bdev_handle->bdev; return true; } @@ -236,28 +236,26 @@ bl_parse_simple(struct nfs_server *server, struct pnfs_block_dev *d, struct pnfs_block_volume *volumes, int idx, gfp_t gfp_mask) { struct pnfs_block_volume *v = &volumes[idx]; - struct block_device *bdev; + struct bdev_handle *bdev_handle; dev_t dev; dev = bl_resolve_deviceid(server, v, gfp_mask); if (!dev) return -EIO; - bdev = blkdev_get_by_dev(dev, BLK_OPEN_READ | BLK_OPEN_WRITE, NULL, - NULL); - if (IS_ERR(bdev)) { + bdev_handle = bdev_open_by_dev(dev, BLK_OPEN_READ | BLK_OPEN_WRITE, + NULL, NULL); + if (IS_ERR(bdev_handle)) { printk(KERN_WARNING "pNFS: failed to open device %d:%d (%ld)\n", - MAJOR(dev), MINOR(dev), PTR_ERR(bdev)); - return PTR_ERR(bdev); + MAJOR(dev), MINOR(dev), PTR_ERR(bdev_handle)); + return PTR_ERR(bdev_handle); } - d->bdev = bdev; - - - d->len = bdev_nr_bytes(d->bdev); + d->bdev_handle = bdev_handle; + d->len = bdev_nr_bytes(bdev_handle->bdev); d->map = bl_map_simple; printk(KERN_INFO "pNFS: using block device %s\n", - d->bdev->bd_disk->disk_name); + bdev_handle->bdev->bd_disk->disk_name); return 0; } @@ -302,10 +300,10 @@ bl_validate_designator(struct pnfs_block_volume *v) } } -static struct block_device * +static struct bdev_handle * bl_open_path(struct pnfs_block_volume *v, const char *prefix) { - struct block_device *bdev; + struct bdev_handle *bdev_handle; const char *devname; devname = kasprintf(GFP_KERNEL, "/dev/disk/by-id/%s%*phN", @@ -313,15 +311,15 @@ bl_open_path(struct pnfs_block_volume *v, const char *prefix) if (!devname) return ERR_PTR(-ENOMEM); - bdev = blkdev_get_by_path(devname, BLK_OPEN_READ | BLK_OPEN_WRITE, NULL, - NULL); - if (IS_ERR(bdev)) { + bdev_handle = bdev_open_by_path(devname, BLK_OPEN_READ | BLK_OPEN_WRITE, + NULL, NULL); + if (IS_ERR(bdev_handle)) { pr_warn("pNFS: failed to open device %s (%ld)\n", - devname, PTR_ERR(bdev)); + devname, PTR_ERR(bdev_handle)); } kfree(devname); - return bdev; + return bdev_handle; } static int @@ -329,7 +327,7 @@ bl_parse_scsi(struct nfs_server *server, struct pnfs_block_dev *d, struct pnfs_block_volume *volumes, int idx, gfp_t gfp_mask) { struct pnfs_block_volume *v = &volumes[idx]; - struct block_device *bdev; + struct bdev_handle *bdev_handle; const struct pr_ops *ops; int error; @@ -342,32 +340,32 @@ bl_parse_scsi(struct nfs_server *server, struct pnfs_block_dev *d, * On other distributions like Debian, the default SCSI by-id path will * point to the dm-multipath device if one exists. */ - bdev = bl_open_path(v, "dm-uuid-mpath-0x"); - if (IS_ERR(bdev)) - bdev = bl_open_path(v, "wwn-0x"); - if (IS_ERR(bdev)) - return PTR_ERR(bdev); - d->bdev = bdev; - - d->len = bdev_nr_bytes(d->bdev); + bdev_handle = bl_open_path(v, "dm-uuid-mpath-0x"); + if (IS_ERR(bdev_handle)) + bdev_handle = bl_open_path(v, "wwn-0x"); + if (IS_ERR(bdev_handle)) + return PTR_ERR(bdev_handle); + d->bdev_handle = bdev_handle; + + d->len = bdev_nr_bytes(d->bdev_handle->bdev); d->map = bl_map_simple; d->pr_key = v->scsi.pr_key; pr_info("pNFS: using block device %s (reservation key 0x%llx)\n", - d->bdev->bd_disk->disk_name, d->pr_key); + d->bdev_handle->bdev->bd_disk->disk_name, d->pr_key); - ops = d->bdev->bd_disk->fops->pr_ops; + ops = d->bdev_handle->bdev->bd_disk->fops->pr_ops; if (!ops) { pr_err("pNFS: block device %s does not support reservations.", - d->bdev->bd_disk->disk_name); + d->bdev_handle->bdev->bd_disk->disk_name); error = -EINVAL; goto out_blkdev_put; } - error = ops->pr_register(d->bdev, 0, d->pr_key, true); + error = ops->pr_register(d->bdev_handle->bdev, 0, d->pr_key, true); if (error) { pr_err("pNFS: failed to register key for block device %s.", - d->bdev->bd_disk->disk_name); + d->bdev_handle->bdev->bd_disk->disk_name); goto out_blkdev_put; } @@ -375,7 +373,7 @@ bl_parse_scsi(struct nfs_server *server, struct pnfs_block_dev *d, return 0; out_blkdev_put: - blkdev_put(d->bdev, NULL); + bdev_release(d->bdev_handle); return error; } @@ -404,7 +402,7 @@ bl_parse_concat(struct nfs_server *server, struct pnfs_block_dev *d, int ret, i; d->children = kcalloc(v->concat.volumes_count, - sizeof(struct pnfs_block_dev), GFP_KERNEL); + sizeof(struct pnfs_block_dev), gfp_mask); if (!d->children) return -ENOMEM; @@ -433,7 +431,7 @@ bl_parse_stripe(struct nfs_server *server, struct pnfs_block_dev *d, int ret, i; d->children = kcalloc(v->stripe.volumes_count, - sizeof(struct pnfs_block_dev), GFP_KERNEL); + sizeof(struct pnfs_block_dev), gfp_mask); if (!d->children) return -ENOMEM; diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c index 456af7d230cf..4ffa1f469e90 100644 --- a/fs/nfs/callback.c +++ b/fs/nfs/callback.c @@ -74,72 +74,18 @@ out_err: static int nfs4_callback_svc(void *vrqstp) { - int err; struct svc_rqst *rqstp = vrqstp; set_freezable(); - while (!kthread_freezable_should_stop(NULL)) { - - if (signal_pending(current)) - flush_signals(current); - /* - * Listen for a request on the socket - */ - err = svc_recv(rqstp, MAX_SCHEDULE_TIMEOUT); - if (err == -EAGAIN || err == -EINTR) - continue; - svc_process(rqstp); - } + while (!svc_thread_should_stop(rqstp)) + svc_recv(rqstp); svc_exit_thread(rqstp); return 0; } #if defined(CONFIG_NFS_V4_1) -/* - * The callback service for NFSv4.1 callbacks - */ -static int -nfs41_callback_svc(void *vrqstp) -{ - struct svc_rqst *rqstp = vrqstp; - struct svc_serv *serv = rqstp->rq_server; - struct rpc_rqst *req; - int error; - DEFINE_WAIT(wq); - - set_freezable(); - - while (!kthread_freezable_should_stop(NULL)) { - - if (signal_pending(current)) - flush_signals(current); - - prepare_to_wait(&serv->sv_cb_waitq, &wq, TASK_INTERRUPTIBLE); - spin_lock_bh(&serv->sv_cb_lock); - if (!list_empty(&serv->sv_cb_list)) { - req = list_first_entry(&serv->sv_cb_list, - struct rpc_rqst, rq_bc_list); - list_del(&req->rq_bc_list); - spin_unlock_bh(&serv->sv_cb_lock); - finish_wait(&serv->sv_cb_waitq, &wq); - dprintk("Invoking bc_svc_process()\n"); - error = bc_svc_process(serv, req, rqstp); - dprintk("bc_svc_process() returned w/ error code= %d\n", - error); - } else { - spin_unlock_bh(&serv->sv_cb_lock); - if (!kthread_should_stop()) - schedule(); - finish_wait(&serv->sv_cb_waitq, &wq); - } - } - - svc_exit_thread(rqstp); - return 0; -} - static inline void nfs_callback_bc_serv(u32 minorversion, struct rpc_xprt *xprt, struct svc_serv *serv) { @@ -252,10 +198,7 @@ static struct svc_serv *nfs_callback_create_svc(int minorversion) cb_info->users); threadfn = nfs4_callback_svc; -#if defined(CONFIG_NFS_V4_1) - if (minorversion) - threadfn = nfs41_callback_svc; -#else +#if !defined(CONFIG_NFS_V4_1) if (minorversion) return ERR_PTR(-ENOTSUPP); #endif @@ -387,7 +330,7 @@ check_gss_callback_principal(struct nfs_client *clp, struct svc_rqst *rqstp) * All other checking done after NFS decoding where the nfs_client can be * found in nfs4_callback_compound */ -static int nfs_callback_authenticate(struct svc_rqst *rqstp) +static enum svc_auth_status nfs_callback_authenticate(struct svc_rqst *rqstp) { rqstp->rq_auth_stat = rpc_autherr_badcred; diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c index c1eda73254e1..96a4923080ae 100644 --- a/fs/nfs/callback_proc.c +++ b/fs/nfs/callback_proc.c @@ -59,8 +59,8 @@ __be32 nfs4_callback_getattr(void *argp, void *resp, res->change_attr = delegation->change_attr; if (nfs_have_writebacks(inode)) res->change_attr++; - res->ctime = inode->i_ctime; - res->mtime = inode->i_mtime; + res->ctime = inode_get_ctime(inode); + res->mtime = inode_get_mtime(inode); res->bitmap[0] = (FATTR4_WORD0_CHANGE|FATTR4_WORD0_SIZE) & args->bitmap[0]; res->bitmap[1] = (FATTR4_WORD1_TIME_METADATA|FATTR4_WORD1_TIME_MODIFY) & diff --git a/fs/nfs/client.c b/fs/nfs/client.c index e4c5f193ed5e..44eca51b2808 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -517,6 +517,8 @@ int nfs_create_rpc_client(struct nfs_client *clp, .authflavor = flavor, .cred = cl_init->cred, .xprtsec = cl_init->xprtsec, + .connect_timeout = cl_init->connect_timeout, + .reconnect_timeout = cl_init->reconnect_timeout, }; if (test_bit(NFS_CS_DISCRTRY, &clp->cl_flags)) diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c index cf7365581031..fa1a14def45c 100644 --- a/fs/nfs/delegation.c +++ b/fs/nfs/delegation.c @@ -448,6 +448,7 @@ int nfs_inode_set_delegation(struct inode *inode, const struct cred *cred, delegation->cred = get_cred(cred); delegation->inode = inode; delegation->flags = 1<<NFS_DELEGATION_REFERENCED; + delegation->test_gen = 0; spin_lock_init(&delegation->lock); spin_lock(&clp->cl_lock); @@ -1294,6 +1295,8 @@ static int nfs_server_reap_expired_delegations(struct nfs_server *server, struct inode *inode; const struct cred *cred; nfs4_stateid stateid; + unsigned long gen = ++server->delegation_gen; + restart: rcu_read_lock(); restart_locked: @@ -1303,7 +1306,8 @@ restart_locked: test_bit(NFS_DELEGATION_RETURNING, &delegation->flags) || test_bit(NFS_DELEGATION_TEST_EXPIRED, - &delegation->flags) == 0) + &delegation->flags) == 0 || + delegation->test_gen == gen) continue; inode = nfs_delegation_grab_inode(delegation); if (inode == NULL) @@ -1312,6 +1316,7 @@ restart_locked: cred = get_cred_rcu(delegation->cred); nfs4_stateid_copy(&stateid, &delegation->stateid); spin_unlock(&delegation->lock); + delegation->test_gen = gen; clear_bit(NFS_DELEGATION_TEST_EXPIRED, &delegation->flags); rcu_read_unlock(); nfs_delegation_test_free_expired(inode, &stateid, cred); diff --git a/fs/nfs/delegation.h b/fs/nfs/delegation.h index 1c378992b7c0..a6f495d012cf 100644 --- a/fs/nfs/delegation.h +++ b/fs/nfs/delegation.h @@ -21,6 +21,7 @@ struct nfs_delegation { fmode_t type; unsigned long pagemod_limit; __u64 change_attr; + unsigned long test_gen; unsigned long flags; refcount_t refcount; spinlock_t lock; diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 8f3112e71a6a..13dffe4201e6 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -1089,6 +1089,17 @@ static void nfs_do_filldir(struct nfs_readdir_descriptor *desc, for (i = desc->cache_entry_index; i < array->size; i++) { struct nfs_cache_array_entry *ent; + /* + * nfs_readdir_handle_cache_misses return force clear at + * (cache_misses > NFS_READDIR_CACHE_MISS_THRESHOLD) for + * readdir heuristic, NFS_READDIR_CACHE_MISS_THRESHOLD + 1 + * entries need be emitted here. + */ + if (first_emit && i > NFS_READDIR_CACHE_MISS_THRESHOLD + 2) { + desc->eob = true; + break; + } + ent = &array->array[i]; if (!dir_emit(desc->ctx, ent->name, ent->name_len, nfs_compat_user_ino64(ent->ino), ent->d_type)) { @@ -1107,10 +1118,6 @@ static void nfs_do_filldir(struct nfs_readdir_descriptor *desc, desc->ctx->pos = desc->dir_cookie; else desc->ctx->pos++; - if (first_emit && i > NFS_READDIR_CACHE_MISS_THRESHOLD + 1) { - desc->eob = true; - break; - } } if (array->folio_is_eof) desc->eof = !desc->eob; @@ -2525,7 +2532,7 @@ EXPORT_SYMBOL_GPL(nfs_unlink); int nfs_symlink(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, const char *symname) { - struct page *page; + struct folio *folio; char *kaddr; struct iattr attr; unsigned int pathlen = strlen(symname); @@ -2540,24 +2547,24 @@ int nfs_symlink(struct mnt_idmap *idmap, struct inode *dir, attr.ia_mode = S_IFLNK | S_IRWXUGO; attr.ia_valid = ATTR_MODE; - page = alloc_page(GFP_USER); - if (!page) + folio = folio_alloc(GFP_USER, 0); + if (!folio) return -ENOMEM; - kaddr = page_address(page); + kaddr = folio_address(folio); memcpy(kaddr, symname, pathlen); if (pathlen < PAGE_SIZE) memset(kaddr + pathlen, 0, PAGE_SIZE - pathlen); trace_nfs_symlink_enter(dir, dentry); - error = NFS_PROTO(dir)->symlink(dir, dentry, page, pathlen, &attr); + error = NFS_PROTO(dir)->symlink(dir, dentry, folio, pathlen, &attr); trace_nfs_symlink_exit(dir, dentry, error); if (error != 0) { dfprintk(VFS, "NFS: symlink(%s/%lu, %pd, %s) error %d\n", dir->i_sb->s_id, dir->i_ino, dentry, symname, error); d_drop(dentry); - __free_page(page); + folio_put(folio); return error; } @@ -2567,18 +2574,13 @@ int nfs_symlink(struct mnt_idmap *idmap, struct inode *dir, * No big deal if we can't add this page to the page cache here. * READLINK will get the missing page from the server if needed. */ - if (!add_to_page_cache_lru(page, d_inode(dentry)->i_mapping, 0, - GFP_KERNEL)) { - SetPageUptodate(page); - unlock_page(page); - /* - * add_to_page_cache_lru() grabs an extra page refcount. - * Drop it here to avoid leaking this page later. - */ - put_page(page); - } else - __free_page(page); + if (filemap_add_folio(d_inode(dentry)->i_mapping, folio, 0, + GFP_KERNEL) == 0) { + folio_mark_uptodate(folio); + folio_unlock(folio); + } + folio_put(folio); return 0; } EXPORT_SYMBOL_GPL(nfs_symlink); diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index aaffaaa336cc..f6c74f424691 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -93,12 +93,10 @@ nfs_direct_handle_truncated(struct nfs_direct_req *dreq, dreq->max_count = dreq_len; if (dreq->count > dreq_len) dreq->count = dreq_len; - - if (test_bit(NFS_IOHDR_ERROR, &hdr->flags)) - dreq->error = hdr->error; - else /* Clear outstanding error if this is EOF */ - dreq->error = 0; } + + if (test_bit(NFS_IOHDR_ERROR, &hdr->flags) && !dreq->error) + dreq->error = hdr->error; } static void @@ -120,6 +118,18 @@ nfs_direct_count_bytes(struct nfs_direct_req *dreq, dreq->count = dreq_len; } +static void nfs_direct_truncate_request(struct nfs_direct_req *dreq, + struct nfs_page *req) +{ + loff_t offs = req_offset(req); + size_t req_start = (size_t)(offs - dreq->io_start); + + if (req_start < dreq->max_count) + dreq->max_count = req_start; + if (req_start < dreq->count) + dreq->count = req_start; +} + /** * nfs_swap_rw - NFS address space operation for swap I/O * @iocb: target I/O control block @@ -472,13 +482,33 @@ out: return result; } -static void nfs_direct_join_group(struct list_head *list, struct inode *inode) +static void nfs_direct_add_page_head(struct list_head *list, + struct nfs_page *req) +{ + struct nfs_page *head = req->wb_head; + + if (!list_empty(&head->wb_list) || !nfs_lock_request(head)) + return; + if (!list_empty(&head->wb_list)) { + nfs_unlock_request(head); + return; + } + list_add(&head->wb_list, list); + kref_get(&head->wb_kref); + kref_get(&head->wb_kref); +} + +static void nfs_direct_join_group(struct list_head *list, + struct nfs_commit_info *cinfo, + struct inode *inode) { struct nfs_page *req, *subreq; list_for_each_entry(req, list, wb_list) { - if (req->wb_head != req) + if (req->wb_head != req) { + nfs_direct_add_page_head(&req->wb_list, req); continue; + } subreq = req->wb_this_page; if (subreq == req) continue; @@ -492,7 +522,7 @@ static void nfs_direct_join_group(struct list_head *list, struct inode *inode) nfs_release_request(subreq); } } while ((subreq = subreq->wb_this_page) != req); - nfs_join_page_group(req, inode); + nfs_join_page_group(req, cinfo, inode); } } @@ -510,20 +540,15 @@ nfs_direct_write_scan_commit_list(struct inode *inode, static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq) { struct nfs_pageio_descriptor desc; - struct nfs_page *req, *tmp; + struct nfs_page *req; LIST_HEAD(reqs); struct nfs_commit_info cinfo; - LIST_HEAD(failed); nfs_init_cinfo_from_dreq(&cinfo, dreq); nfs_direct_write_scan_commit_list(dreq->inode, &reqs, &cinfo); - nfs_direct_join_group(&reqs, dreq->inode); + nfs_direct_join_group(&reqs, &cinfo, dreq->inode); - dreq->count = 0; - dreq->max_count = 0; - list_for_each_entry(req, &reqs, wb_list) - dreq->max_count += req->wb_bytes; nfs_clear_pnfs_ds_commit_verifiers(&dreq->ds_cinfo); get_dreq(dreq); @@ -531,27 +556,40 @@ static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq) &nfs_direct_write_completion_ops); desc.pg_dreq = dreq; - list_for_each_entry_safe(req, tmp, &reqs, wb_list) { + while (!list_empty(&reqs)) { + req = nfs_list_entry(reqs.next); /* Bump the transmission count */ req->wb_nio++; if (!nfs_pageio_add_request(&desc, req)) { - nfs_list_move_request(req, &failed); - spin_lock(&cinfo.inode->i_lock); - dreq->flags = 0; - if (desc.pg_error < 0) + spin_lock(&dreq->lock); + if (dreq->error < 0) { + desc.pg_error = dreq->error; + } else if (desc.pg_error != -EAGAIN) { + dreq->flags = 0; + if (!desc.pg_error) + desc.pg_error = -EIO; dreq->error = desc.pg_error; - else - dreq->error = -EIO; - spin_unlock(&cinfo.inode->i_lock); + } else + dreq->flags = NFS_ODIRECT_RESCHED_WRITES; + spin_unlock(&dreq->lock); + break; } nfs_release_request(req); } nfs_pageio_complete(&desc); - while (!list_empty(&failed)) { - req = nfs_list_entry(failed.next); + while (!list_empty(&reqs)) { + req = nfs_list_entry(reqs.next); nfs_list_remove_request(req); nfs_unlock_and_release_request(req); + if (desc.pg_error == -EAGAIN) { + nfs_mark_request_commit(req, NULL, &cinfo, 0); + } else { + spin_lock(&dreq->lock); + nfs_direct_truncate_request(dreq, req); + spin_unlock(&dreq->lock); + nfs_release_request(req); + } } if (put_dreq(dreq)) @@ -571,8 +609,6 @@ static void nfs_direct_commit_complete(struct nfs_commit_data *data) if (status < 0) { /* Errors in commit are fatal */ dreq->error = status; - dreq->max_count = 0; - dreq->count = 0; dreq->flags = NFS_ODIRECT_DONE; } else { status = dreq->error; @@ -583,7 +619,12 @@ static void nfs_direct_commit_complete(struct nfs_commit_data *data) while (!list_empty(&data->pages)) { req = nfs_list_entry(data->pages.next); nfs_list_remove_request(req); - if (status >= 0 && !nfs_write_match_verf(verf, req)) { + if (status < 0) { + spin_lock(&dreq->lock); + nfs_direct_truncate_request(dreq, req); + spin_unlock(&dreq->lock); + nfs_release_request(req); + } else if (!nfs_write_match_verf(verf, req)) { dreq->flags = NFS_ODIRECT_RESCHED_WRITES; /* * Despite the reboot, the write was successful, @@ -591,7 +632,7 @@ static void nfs_direct_commit_complete(struct nfs_commit_data *data) */ req->wb_nio = 0; nfs_mark_request_commit(req, NULL, &cinfo, 0); - } else /* Error or match */ + } else nfs_release_request(req); nfs_unlock_and_release_request(req); } @@ -644,6 +685,7 @@ static void nfs_direct_write_clear_reqs(struct nfs_direct_req *dreq) while (!list_empty(&reqs)) { req = nfs_list_entry(reqs.next); nfs_list_remove_request(req); + nfs_direct_truncate_request(dreq, req); nfs_release_request(req); nfs_unlock_and_release_request(req); } @@ -693,7 +735,8 @@ static void nfs_direct_write_completion(struct nfs_pgio_header *hdr) } nfs_direct_count_bytes(dreq, hdr); - if (test_bit(NFS_IOHDR_UNSTABLE_WRITES, &hdr->flags)) { + if (test_bit(NFS_IOHDR_UNSTABLE_WRITES, &hdr->flags) && + !test_bit(NFS_IOHDR_ERROR, &hdr->flags)) { if (!dreq->flags) dreq->flags = NFS_ODIRECT_DO_COMMIT; flags = dreq->flags; @@ -737,18 +780,23 @@ static void nfs_write_sync_pgio_error(struct list_head *head, int error) static void nfs_direct_write_reschedule_io(struct nfs_pgio_header *hdr) { struct nfs_direct_req *dreq = hdr->dreq; + struct nfs_page *req; + struct nfs_commit_info cinfo; trace_nfs_direct_write_reschedule_io(dreq); + nfs_init_cinfo_from_dreq(&cinfo, dreq); spin_lock(&dreq->lock); - if (dreq->error == 0) { + if (dreq->error == 0) dreq->flags = NFS_ODIRECT_RESCHED_WRITES; - /* fake unstable write to let common nfs resend pages */ - hdr->verf.committed = NFS_UNSTABLE; - hdr->good_bytes = hdr->args.offset + hdr->args.count - - hdr->io_start; - } + set_bit(NFS_IOHDR_REDO, &hdr->flags); spin_unlock(&dreq->lock); + while (!list_empty(&hdr->pages)) { + req = nfs_list_entry(hdr->pages.next); + nfs_list_remove_request(req); + nfs_unlock_request(req); + nfs_mark_request_commit(req, NULL, &cinfo, 0); + } } static const struct nfs_pgio_completion_ops nfs_direct_write_completion_ops = { @@ -776,9 +824,11 @@ static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq, { struct nfs_pageio_descriptor desc; struct inode *inode = dreq->inode; + struct nfs_commit_info cinfo; ssize_t result = 0; size_t requested_bytes = 0; size_t wsize = max_t(size_t, NFS_SERVER(inode)->wsize, PAGE_SIZE); + bool defer = false; trace_nfs_direct_write_schedule_iovec(dreq); @@ -819,17 +869,37 @@ static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq, break; } - nfs_lock_request(req); - if (!nfs_pageio_add_request(&desc, req)) { - result = desc.pg_error; - nfs_unlock_and_release_request(req); - break; - } pgbase = 0; bytes -= req_len; requested_bytes += req_len; pos += req_len; dreq->bytes_left -= req_len; + + if (defer) { + nfs_mark_request_commit(req, NULL, &cinfo, 0); + continue; + } + + nfs_lock_request(req); + if (nfs_pageio_add_request(&desc, req)) + continue; + + /* Exit on hard errors */ + if (desc.pg_error < 0 && desc.pg_error != -EAGAIN) { + result = desc.pg_error; + nfs_unlock_and_release_request(req); + break; + } + + /* If the error is soft, defer remaining requests */ + nfs_init_cinfo_from_dreq(&cinfo, dreq); + spin_lock(&dreq->lock); + dreq->flags = NFS_ODIRECT_RESCHED_WRITES; + spin_unlock(&dreq->lock); + nfs_unlock_request(req); + nfs_mark_request_commit(req, NULL, &cinfo, 0); + desc.pg_error = 0; + defer = true; } nfs_direct_release_pages(pagevec, npages); kvfree(pagevec); diff --git a/fs/nfs/dns_resolve.c b/fs/nfs/dns_resolve.c index 6603b5cee029..714975e5c0db 100644 --- a/fs/nfs/dns_resolve.c +++ b/fs/nfs/dns_resolve.c @@ -7,14 +7,16 @@ * Resolves DNS hostnames into valid ip addresses */ -#ifdef CONFIG_NFS_USE_KERNEL_DNS - #include <linux/module.h> #include <linux/sunrpc/clnt.h> #include <linux/sunrpc/addr.h> -#include <linux/dns_resolver.h> + #include "dns_resolve.h" +#ifdef CONFIG_NFS_USE_KERNEL_DNS + +#include <linux/dns_resolver.h> + ssize_t nfs_dns_resolve_name(struct net *net, char *name, size_t namelen, struct sockaddr_storage *ss, size_t salen) { @@ -35,7 +37,6 @@ ssize_t nfs_dns_resolve_name(struct net *net, char *name, size_t namelen, #else -#include <linux/module.h> #include <linux/hash.h> #include <linux/string.h> #include <linux/kmod.h> @@ -43,15 +44,12 @@ ssize_t nfs_dns_resolve_name(struct net *net, char *name, size_t namelen, #include <linux/socket.h> #include <linux/seq_file.h> #include <linux/inet.h> -#include <linux/sunrpc/clnt.h> -#include <linux/sunrpc/addr.h> #include <linux/sunrpc/cache.h> #include <linux/sunrpc/svcauth.h> #include <linux/sunrpc/rpc_pipe_fs.h> #include <linux/nfs_fs.h> #include "nfs4_fs.h" -#include "dns_resolve.h" #include "cache_lib.h" #include "netns.h" diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 79b1b3fcd3fc..3f9768810427 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -200,7 +200,7 @@ nfs_file_splice_read(struct file *in, loff_t *ppos, struct pipe_inode_info *pipe EXPORT_SYMBOL_GPL(nfs_file_splice_read); int -nfs_file_mmap(struct file * file, struct vm_area_struct * vma) +nfs_file_mmap(struct file *file, struct vm_area_struct *vma) { struct inode *inode = file_inode(file); int status; diff --git a/fs/nfs/filelayout/filelayout.h b/fs/nfs/filelayout/filelayout.h index aed0748fd6ec..c7bb5da93307 100644 --- a/fs/nfs/filelayout/filelayout.h +++ b/fs/nfs/filelayout/filelayout.h @@ -51,7 +51,7 @@ struct nfs4_file_layout_dsaddr { u32 stripe_count; u8 *stripe_indices; u32 ds_num; - struct nfs4_pnfs_ds *ds_list[]; + struct nfs4_pnfs_ds *ds_list[] __counted_by(ds_num); }; struct nfs4_filelayout_segment { diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c index 7deb3cd76abe..ef817a0475ff 100644 --- a/fs/nfs/flexfilelayout/flexfilelayout.c +++ b/fs/nfs/flexfilelayout/flexfilelayout.c @@ -1235,6 +1235,7 @@ static void ff_layout_io_track_ds_error(struct pnfs_layout_segment *lseg, case -EPFNOSUPPORT: case -EPROTONOSUPPORT: case -EOPNOTSUPP: + case -EINVAL: case -ECONNREFUSED: case -ECONNRESET: case -EHOSTDOWN: @@ -2519,9 +2520,9 @@ ff_layout_mirror_prepare_stats(struct pnfs_layout_hdr *lo, return i; } -static int -ff_layout_prepare_layoutstats(struct nfs42_layoutstat_args *args) +static int ff_layout_prepare_layoutstats(struct nfs42_layoutstat_args *args) { + struct pnfs_layout_hdr *lo; struct nfs4_flexfile_layout *ff_layout; const int dev_count = PNFS_LAYOUTSTATS_MAXDEV; @@ -2532,11 +2533,14 @@ ff_layout_prepare_layoutstats(struct nfs42_layoutstat_args *args) return -ENOMEM; spin_lock(&args->inode->i_lock); - ff_layout = FF_LAYOUT_FROM_HDR(NFS_I(args->inode)->layout); - args->num_dev = ff_layout_mirror_prepare_stats(&ff_layout->generic_hdr, - &args->devinfo[0], - dev_count, - NFS4_FF_OP_LAYOUTSTATS); + lo = NFS_I(args->inode)->layout; + if (lo && pnfs_layout_is_valid(lo)) { + ff_layout = FF_LAYOUT_FROM_HDR(lo); + args->num_dev = ff_layout_mirror_prepare_stats( + &ff_layout->generic_hdr, &args->devinfo[0], dev_count, + NFS4_FF_OP_LAYOUTSTATS); + } else + args->num_dev = 0; spin_unlock(&args->inode->i_lock); if (!args->num_dev) { kfree(args->devinfo); diff --git a/fs/nfs/flexfilelayout/flexfilelayout.h b/fs/nfs/flexfilelayout/flexfilelayout.h index 354a031c69b1..f84b3fb0dddd 100644 --- a/fs/nfs/flexfilelayout/flexfilelayout.h +++ b/fs/nfs/flexfilelayout/flexfilelayout.h @@ -99,7 +99,7 @@ struct nfs4_ff_layout_segment { u64 stripe_unit; u32 flags; u32 mirror_array_cnt; - struct nfs4_ff_layout_mirror *mirror_array[]; + struct nfs4_ff_layout_mirror *mirror_array[] __counted_by(mirror_array_cnt); }; struct nfs4_flexfile_layout { diff --git a/fs/nfs/fscache.c b/fs/nfs/fscache.c index 8c35d88a84b1..b05717fe0d4e 100644 --- a/fs/nfs/fscache.c +++ b/fs/nfs/fscache.c @@ -180,6 +180,9 @@ void nfs_fscache_init_inode(struct inode *inode) &auxdata, /* aux_data */ sizeof(auxdata), i_size_read(inode)); + + if (netfs_inode(inode)->cache) + mapping_set_release_always(inode->i_mapping); } /* diff --git a/fs/nfs/fscache.h b/fs/nfs/fscache.h index e1706e736c64..5407ab8c8783 100644 --- a/fs/nfs/fscache.h +++ b/fs/nfs/fscache.h @@ -114,10 +114,10 @@ static inline void nfs_fscache_update_auxdata(struct nfs_fscache_inode_auxdata * struct inode *inode) { memset(auxdata, 0, sizeof(*auxdata)); - auxdata->mtime_sec = inode->i_mtime.tv_sec; - auxdata->mtime_nsec = inode->i_mtime.tv_nsec; - auxdata->ctime_sec = inode->i_ctime.tv_sec; - auxdata->ctime_nsec = inode->i_ctime.tv_nsec; + auxdata->mtime_sec = inode_get_mtime(inode).tv_sec; + auxdata->mtime_nsec = inode_get_mtime(inode).tv_nsec; + auxdata->ctime_sec = inode_get_ctime(inode).tv_sec; + auxdata->ctime_nsec = inode_get_ctime(inode).tv_nsec; if (NFS_SERVER(inode)->nfs_client->rpc_ops->version == 4) auxdata->change_attr = inode_peek_iversion_raw(inode); diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 8172dd4135a1..ebb8d60e1152 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -512,9 +512,9 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr) } else init_special_inode(inode, inode->i_mode, fattr->rdev); - memset(&inode->i_atime, 0, sizeof(inode->i_atime)); - memset(&inode->i_mtime, 0, sizeof(inode->i_mtime)); - memset(&inode->i_ctime, 0, sizeof(inode->i_ctime)); + inode_set_atime(inode, 0, 0); + inode_set_mtime(inode, 0, 0); + inode_set_ctime(inode, 0, 0); inode_set_iversion_raw(inode, 0); inode->i_size = 0; clear_nlink(inode); @@ -527,15 +527,15 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr) nfsi->read_cache_jiffies = fattr->time_start; nfsi->attr_gencount = fattr->gencount; if (fattr->valid & NFS_ATTR_FATTR_ATIME) - inode->i_atime = fattr->atime; + inode_set_atime_to_ts(inode, fattr->atime); else if (fattr_supported & NFS_ATTR_FATTR_ATIME) nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATIME); if (fattr->valid & NFS_ATTR_FATTR_MTIME) - inode->i_mtime = fattr->mtime; + inode_set_mtime_to_ts(inode, fattr->mtime); else if (fattr_supported & NFS_ATTR_FATTR_MTIME) nfs_set_cache_invalid(inode, NFS_INO_INVALID_MTIME); if (fattr->valid & NFS_ATTR_FATTR_CTIME) - inode->i_ctime = fattr->ctime; + inode_set_ctime_to_ts(inode, fattr->ctime); else if (fattr_supported & NFS_ATTR_FATTR_CTIME) nfs_set_cache_invalid(inode, NFS_INO_INVALID_CTIME); if (fattr->valid & NFS_ATTR_FATTR_CHANGE) @@ -731,7 +731,7 @@ void nfs_setattr_update_inode(struct inode *inode, struct iattr *attr, if ((attr->ia_valid & ATTR_GID) != 0) inode->i_gid = attr->ia_gid; if (fattr->valid & NFS_ATTR_FATTR_CTIME) - inode->i_ctime = fattr->ctime; + inode_set_ctime_to_ts(inode, fattr->ctime); else nfs_set_cache_invalid(inode, NFS_INO_INVALID_CHANGE | NFS_INO_INVALID_CTIME); @@ -742,14 +742,14 @@ void nfs_setattr_update_inode(struct inode *inode, struct iattr *attr, NFS_I(inode)->cache_validity &= ~(NFS_INO_INVALID_ATIME | NFS_INO_INVALID_CTIME); if (fattr->valid & NFS_ATTR_FATTR_ATIME) - inode->i_atime = fattr->atime; + inode_set_atime_to_ts(inode, fattr->atime); else if (attr->ia_valid & ATTR_ATIME_SET) - inode->i_atime = attr->ia_atime; + inode_set_atime_to_ts(inode, attr->ia_atime); else nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATIME); if (fattr->valid & NFS_ATTR_FATTR_CTIME) - inode->i_ctime = fattr->ctime; + inode_set_ctime_to_ts(inode, fattr->ctime); else nfs_set_cache_invalid(inode, NFS_INO_INVALID_CHANGE | NFS_INO_INVALID_CTIME); @@ -758,14 +758,14 @@ void nfs_setattr_update_inode(struct inode *inode, struct iattr *attr, NFS_I(inode)->cache_validity &= ~(NFS_INO_INVALID_MTIME | NFS_INO_INVALID_CTIME); if (fattr->valid & NFS_ATTR_FATTR_MTIME) - inode->i_mtime = fattr->mtime; + inode_set_mtime_to_ts(inode, fattr->mtime); else if (attr->ia_valid & ATTR_MTIME_SET) - inode->i_mtime = attr->ia_mtime; + inode_set_mtime_to_ts(inode, attr->ia_mtime); else nfs_set_cache_invalid(inode, NFS_INO_INVALID_MTIME); if (fattr->valid & NFS_ATTR_FATTR_CTIME) - inode->i_ctime = fattr->ctime; + inode_set_ctime_to_ts(inode, fattr->ctime); else nfs_set_cache_invalid(inode, NFS_INO_INVALID_CHANGE | NFS_INO_INVALID_CTIME); @@ -912,7 +912,7 @@ out_no_revalidate: /* Only return attributes that were revalidated. */ stat->result_mask = nfs_get_valid_attrmask(inode) | request_mask; - generic_fillattr(&nop_mnt_idmap, inode, stat); + generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat); stat->ino = nfs_compat_user_ino64(NFS_FILEID(inode)); stat->change_cookie = inode_peek_iversion_raw(inode); stat->attributes_mask |= STATX_ATTR_CHANGE_MONOTONIC; @@ -1444,18 +1444,18 @@ static void nfs_wcc_update_inode(struct inode *inode, struct nfs_fattr *fattr) nfs_set_cache_invalid(inode, NFS_INO_INVALID_XATTR); } /* If we have atomic WCC data, we may update some attributes */ - ts = inode->i_ctime; + ts = inode_get_ctime(inode); if ((fattr->valid & NFS_ATTR_FATTR_PRECTIME) && (fattr->valid & NFS_ATTR_FATTR_CTIME) && timespec64_equal(&ts, &fattr->pre_ctime)) { - inode->i_ctime = fattr->ctime; + inode_set_ctime_to_ts(inode, fattr->ctime); } - ts = inode->i_mtime; + ts = inode_get_mtime(inode); if ((fattr->valid & NFS_ATTR_FATTR_PREMTIME) && (fattr->valid & NFS_ATTR_FATTR_MTIME) && timespec64_equal(&ts, &fattr->pre_mtime)) { - inode->i_mtime = fattr->mtime; + inode_set_mtime_to_ts(inode, fattr->mtime); } if ((fattr->valid & NFS_ATTR_FATTR_PRESIZE) && (fattr->valid & NFS_ATTR_FATTR_SIZE) @@ -1506,11 +1506,11 @@ static int nfs_check_inode_attributes(struct inode *inode, struct nfs_fattr *fat if ((fattr->valid & NFS_ATTR_FATTR_CHANGE) != 0 && !inode_eq_iversion_raw(inode, fattr->change_attr)) invalid |= NFS_INO_INVALID_CHANGE; - ts = inode->i_mtime; + ts = inode_get_mtime(inode); if ((fattr->valid & NFS_ATTR_FATTR_MTIME) && !timespec64_equal(&ts, &fattr->mtime)) invalid |= NFS_INO_INVALID_MTIME; - ts = inode->i_ctime; + ts = inode_get_ctime(inode); if ((fattr->valid & NFS_ATTR_FATTR_CTIME) && !timespec64_equal(&ts, &fattr->ctime)) invalid |= NFS_INO_INVALID_CTIME; @@ -1534,7 +1534,7 @@ static int nfs_check_inode_attributes(struct inode *inode, struct nfs_fattr *fat if ((fattr->valid & NFS_ATTR_FATTR_NLINK) && inode->i_nlink != fattr->nlink) invalid |= NFS_INO_INVALID_NLINK; - ts = inode->i_atime; + ts = inode_get_atime(inode); if ((fattr->valid & NFS_ATTR_FATTR_ATIME) && !timespec64_equal(&ts, &fattr->atime)) invalid |= NFS_INO_INVALID_ATIME; @@ -1997,12 +1997,12 @@ int nfs_post_op_update_inode_force_wcc_locked(struct inode *inode, struct nfs_fa } if ((fattr->valid & NFS_ATTR_FATTR_CTIME) != 0 && (fattr->valid & NFS_ATTR_FATTR_PRECTIME) == 0) { - fattr->pre_ctime = inode->i_ctime; + fattr->pre_ctime = inode_get_ctime(inode); fattr->valid |= NFS_ATTR_FATTR_PRECTIME; } if ((fattr->valid & NFS_ATTR_FATTR_MTIME) != 0 && (fattr->valid & NFS_ATTR_FATTR_PREMTIME) == 0) { - fattr->pre_mtime = inode->i_mtime; + fattr->pre_mtime = inode_get_mtime(inode); fattr->valid |= NFS_ATTR_FATTR_PREMTIME; } if ((fattr->valid & NFS_ATTR_FATTR_SIZE) != 0 && @@ -2184,13 +2184,13 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) } if (fattr->valid & NFS_ATTR_FATTR_MTIME) - inode->i_mtime = fattr->mtime; + inode_set_mtime_to_ts(inode, fattr->mtime); else if (fattr_supported & NFS_ATTR_FATTR_MTIME) nfsi->cache_validity |= save_cache_validity & NFS_INO_INVALID_MTIME; if (fattr->valid & NFS_ATTR_FATTR_CTIME) - inode->i_ctime = fattr->ctime; + inode_set_ctime_to_ts(inode, fattr->ctime); else if (fattr_supported & NFS_ATTR_FATTR_CTIME) nfsi->cache_validity |= save_cache_validity & NFS_INO_INVALID_CTIME; @@ -2220,7 +2220,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) save_cache_validity & NFS_INO_INVALID_SIZE; if (fattr->valid & NFS_ATTR_FATTR_ATIME) - inode->i_atime = fattr->atime; + inode_set_atime_to_ts(inode, fattr->atime); else if (fattr_supported & NFS_ATTR_FATTR_ATIME) nfsi->cache_validity |= save_cache_validity & NFS_INO_INVALID_ATIME; diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index 913c09806c7f..9c9cf764f600 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -82,6 +82,8 @@ struct nfs_client_initdata { const struct rpc_timeout *timeparms; const struct cred *cred; struct xprtsec_parms xprtsec; + unsigned long connect_timeout; + unsigned long reconnect_timeout; }; /* @@ -493,6 +495,7 @@ extern const struct nfs_pgio_completion_ops nfs_async_read_completion_ops; extern void nfs_pageio_init_read(struct nfs_pageio_descriptor *pgio, struct inode *inode, bool force_mds, const struct nfs_pgio_completion_ops *compl_ops); +extern bool nfs_read_alloc_scratch(struct nfs_pgio_header *hdr, size_t size); extern int nfs_read_add_folio(struct nfs_pageio_descriptor *pgio, struct nfs_open_context *ctx, struct folio *folio); diff --git a/fs/nfs/namespace.c b/fs/nfs/namespace.c index 19d51ebf842c..e7494cdd957e 100644 --- a/fs/nfs/namespace.c +++ b/fs/nfs/namespace.c @@ -215,7 +215,8 @@ nfs_namespace_getattr(struct mnt_idmap *idmap, if (NFS_FH(d_inode(path->dentry))->size != 0) return nfs_getattr(idmap, path, stat, request_mask, query_flags); - generic_fillattr(&nop_mnt_idmap, d_inode(path->dentry), stat); + generic_fillattr(&nop_mnt_idmap, request_mask, d_inode(path->dentry), + stat); return 0; } diff --git a/fs/nfs/nfs.h b/fs/nfs/nfs.h index 5ba00610aede..0d3ce0460e35 100644 --- a/fs/nfs/nfs.h +++ b/fs/nfs/nfs.h @@ -18,7 +18,7 @@ struct nfs_subversion { const struct rpc_version *rpc_vers; /* NFS version information */ const struct nfs_rpc_ops *rpc_ops; /* NFS operations */ const struct super_operations *sops; /* NFS Super operations */ - const struct xattr_handler **xattr; /* NFS xattr handlers */ + const struct xattr_handler * const *xattr; /* NFS xattr handlers */ struct list_head list; /* List of NFS versions */ }; diff --git a/fs/nfs/nfs2xdr.c b/fs/nfs/nfs2xdr.c index 05c3b4b2b3dd..c19093814296 100644 --- a/fs/nfs/nfs2xdr.c +++ b/fs/nfs/nfs2xdr.c @@ -949,7 +949,7 @@ int nfs2_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry, error = decode_filename_inline(xdr, &entry->name, &entry->len); if (unlikely(error)) - return -EAGAIN; + return error == -ENAMETOOLONG ? -ENAMETOOLONG : -EAGAIN; /* * The type (size and byte order) of nfscookie isn't defined in diff --git a/fs/nfs/nfs3client.c b/fs/nfs/nfs3client.c index eff3802c5e03..674c012868b1 100644 --- a/fs/nfs/nfs3client.c +++ b/fs/nfs/nfs3client.c @@ -86,6 +86,7 @@ struct nfs_client *nfs3_set_ds_client(struct nfs_server *mds_srv, int ds_proto, unsigned int ds_timeo, unsigned int ds_retrans) { struct rpc_timeout ds_timeout; + unsigned long connect_timeout = ds_timeo * (ds_retrans + 1) * HZ / 10; struct nfs_client *mds_clp = mds_srv->nfs_client; struct nfs_client_initdata cl_init = { .addr = ds_addr, @@ -98,6 +99,8 @@ struct nfs_client *nfs3_set_ds_client(struct nfs_server *mds_srv, .timeparms = &ds_timeout, .cred = mds_srv->cred, .xprtsec = mds_clp->cl_xprtsec, + .connect_timeout = connect_timeout, + .reconnect_timeout = connect_timeout, }; struct nfs_client *clp; char buf[INET6_ADDRSTRLEN + 1]; diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c index 4bf208a0a8e9..2de66e4e8280 100644 --- a/fs/nfs/nfs3proc.c +++ b/fs/nfs/nfs3proc.c @@ -543,9 +543,10 @@ out: } static int -nfs3_proc_symlink(struct inode *dir, struct dentry *dentry, struct page *page, +nfs3_proc_symlink(struct inode *dir, struct dentry *dentry, struct folio *folio, unsigned int len, struct iattr *sattr) { + struct page *page = &folio->page; struct nfs3_createdata *data; struct dentry *d_alias; int status = -ENOMEM; diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c index 3b0b650c9c5a..60f032be805a 100644 --- a/fs/nfs/nfs3xdr.c +++ b/fs/nfs/nfs3xdr.c @@ -1991,7 +1991,7 @@ int nfs3_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry, error = decode_inline_filename3(xdr, &entry->name, &entry->len); if (unlikely(error)) - return -EAGAIN; + return error == -ENAMETOOLONG ? -ENAMETOOLONG : -EAGAIN; error = decode_cookie3(xdr, &new_cookie); if (unlikely(error)) diff --git a/fs/nfs/nfs42.h b/fs/nfs/nfs42.h index 0fe5aacbcfdf..b59876b01a1e 100644 --- a/fs/nfs/nfs42.h +++ b/fs/nfs/nfs42.h @@ -13,6 +13,7 @@ * more? Need to consider not to pre-alloc too much for a compound. */ #define PNFS_LAYOUTSTATS_MAXDEV (4) +#define READ_PLUS_SCRATCH_SIZE (16) /* nfs4.2proc.c */ #ifdef CONFIG_NFS_V4_2 diff --git a/fs/nfs/nfs42proc.c b/fs/nfs/nfs42proc.c index 49f78e23b34c..28704f924612 100644 --- a/fs/nfs/nfs42proc.c +++ b/fs/nfs/nfs42proc.c @@ -81,7 +81,8 @@ static int _nfs42_proc_fallocate(struct rpc_message *msg, struct file *filep, if (status == 0) { if (nfs_should_remove_suid(inode)) { spin_lock(&inode->i_lock); - nfs_set_cache_invalid(inode, NFS_INO_INVALID_MODE); + nfs_set_cache_invalid(inode, + NFS_INO_REVAL_FORCED | NFS_INO_INVALID_MODE); spin_unlock(&inode->i_lock); } status = nfs_post_op_update_inode_force_wcc(inode, @@ -471,8 +472,9 @@ ssize_t nfs42_proc_copy(struct file *src, loff_t pos_src, continue; } break; - } else if (err == -NFS4ERR_OFFLOAD_NO_REQS && !args.sync) { - args.sync = true; + } else if (err == -NFS4ERR_OFFLOAD_NO_REQS && + args.sync != res.synchronous) { + args.sync = res.synchronous; dst_exception.retry = 1; continue; } else if ((err == -ESTALE || diff --git a/fs/nfs/nfs42xattr.c b/fs/nfs/nfs42xattr.c index 911f634ba3da..2ad66a8922f4 100644 --- a/fs/nfs/nfs42xattr.c +++ b/fs/nfs/nfs42xattr.c @@ -796,28 +796,9 @@ static unsigned long nfs4_xattr_cache_scan(struct shrinker *shrink, static unsigned long nfs4_xattr_entry_scan(struct shrinker *shrink, struct shrink_control *sc); -static struct shrinker nfs4_xattr_cache_shrinker = { - .count_objects = nfs4_xattr_cache_count, - .scan_objects = nfs4_xattr_cache_scan, - .seeks = DEFAULT_SEEKS, - .flags = SHRINKER_MEMCG_AWARE, -}; - -static struct shrinker nfs4_xattr_entry_shrinker = { - .count_objects = nfs4_xattr_entry_count, - .scan_objects = nfs4_xattr_entry_scan, - .seeks = DEFAULT_SEEKS, - .batch = 512, - .flags = SHRINKER_MEMCG_AWARE, -}; - -static struct shrinker nfs4_xattr_large_entry_shrinker = { - .count_objects = nfs4_xattr_entry_count, - .scan_objects = nfs4_xattr_entry_scan, - .seeks = 1, - .batch = 512, - .flags = SHRINKER_MEMCG_AWARE, -}; +static struct shrinker *nfs4_xattr_cache_shrinker; +static struct shrinker *nfs4_xattr_entry_shrinker; +static struct shrinker *nfs4_xattr_large_entry_shrinker; static enum lru_status cache_lru_isolate(struct list_head *item, @@ -943,7 +924,7 @@ nfs4_xattr_entry_scan(struct shrinker *shrink, struct shrink_control *sc) struct nfs4_xattr_entry *entry; struct list_lru *lru; - lru = (shrink == &nfs4_xattr_large_entry_shrinker) ? + lru = (shrink == nfs4_xattr_large_entry_shrinker) ? &nfs4_xattr_large_entry_lru : &nfs4_xattr_entry_lru; freed = list_lru_shrink_walk(lru, sc, entry_lru_isolate, &dispose); @@ -971,7 +952,7 @@ nfs4_xattr_entry_count(struct shrinker *shrink, struct shrink_control *sc) unsigned long count; struct list_lru *lru; - lru = (shrink == &nfs4_xattr_large_entry_shrinker) ? + lru = (shrink == nfs4_xattr_large_entry_shrinker) ? &nfs4_xattr_large_entry_lru : &nfs4_xattr_entry_lru; count = list_lru_shrink_count(lru, sc); @@ -991,18 +972,34 @@ static void nfs4_xattr_cache_init_once(void *p) INIT_LIST_HEAD(&cache->dispose); } -static int nfs4_xattr_shrinker_init(struct shrinker *shrinker, - struct list_lru *lru, const char *name) +typedef unsigned long (*count_objects_cb)(struct shrinker *s, + struct shrink_control *sc); +typedef unsigned long (*scan_objects_cb)(struct shrinker *s, + struct shrink_control *sc); + +static int __init nfs4_xattr_shrinker_init(struct shrinker **shrinker, + struct list_lru *lru, const char *name, + count_objects_cb count, + scan_objects_cb scan, long batch, int seeks) { - int ret = 0; + int ret; - ret = register_shrinker(shrinker, name); - if (ret) + *shrinker = shrinker_alloc(SHRINKER_MEMCG_AWARE, name); + if (!*shrinker) + return -ENOMEM; + + ret = list_lru_init_memcg(lru, *shrinker); + if (ret) { + shrinker_free(*shrinker); return ret; + } - ret = list_lru_init_memcg(lru, shrinker); - if (ret) - unregister_shrinker(shrinker); + (*shrinker)->count_objects = count; + (*shrinker)->scan_objects = scan; + (*shrinker)->batch = batch; + (*shrinker)->seeks = seeks; + + shrinker_register(*shrinker); return ret; } @@ -1010,7 +1007,7 @@ static int nfs4_xattr_shrinker_init(struct shrinker *shrinker, static void nfs4_xattr_shrinker_destroy(struct shrinker *shrinker, struct list_lru *lru) { - unregister_shrinker(shrinker); + shrinker_free(shrinker); list_lru_destroy(lru); } @@ -1026,27 +1023,31 @@ int __init nfs4_xattr_cache_init(void) return -ENOMEM; ret = nfs4_xattr_shrinker_init(&nfs4_xattr_cache_shrinker, - &nfs4_xattr_cache_lru, - "nfs-xattr_cache"); + &nfs4_xattr_cache_lru, "nfs-xattr_cache", + nfs4_xattr_cache_count, + nfs4_xattr_cache_scan, 0, DEFAULT_SEEKS); if (ret) goto out1; ret = nfs4_xattr_shrinker_init(&nfs4_xattr_entry_shrinker, - &nfs4_xattr_entry_lru, - "nfs-xattr_entry"); + &nfs4_xattr_entry_lru, "nfs-xattr_entry", + nfs4_xattr_entry_count, + nfs4_xattr_entry_scan, 512, DEFAULT_SEEKS); if (ret) goto out2; ret = nfs4_xattr_shrinker_init(&nfs4_xattr_large_entry_shrinker, &nfs4_xattr_large_entry_lru, - "nfs-xattr_large_entry"); + "nfs-xattr_large_entry", + nfs4_xattr_entry_count, + nfs4_xattr_entry_scan, 512, 1); if (!ret) return 0; - nfs4_xattr_shrinker_destroy(&nfs4_xattr_entry_shrinker, + nfs4_xattr_shrinker_destroy(nfs4_xattr_entry_shrinker, &nfs4_xattr_entry_lru); out2: - nfs4_xattr_shrinker_destroy(&nfs4_xattr_cache_shrinker, + nfs4_xattr_shrinker_destroy(nfs4_xattr_cache_shrinker, &nfs4_xattr_cache_lru); out1: kmem_cache_destroy(nfs4_xattr_cache_cachep); @@ -1056,11 +1057,11 @@ out1: void nfs4_xattr_cache_exit(void) { - nfs4_xattr_shrinker_destroy(&nfs4_xattr_large_entry_shrinker, + nfs4_xattr_shrinker_destroy(nfs4_xattr_large_entry_shrinker, &nfs4_xattr_large_entry_lru); - nfs4_xattr_shrinker_destroy(&nfs4_xattr_entry_shrinker, + nfs4_xattr_shrinker_destroy(nfs4_xattr_entry_shrinker, &nfs4_xattr_entry_lru); - nfs4_xattr_shrinker_destroy(&nfs4_xattr_cache_shrinker, + nfs4_xattr_shrinker_destroy(nfs4_xattr_cache_shrinker, &nfs4_xattr_cache_lru); kmem_cache_destroy(nfs4_xattr_cache_cachep); } diff --git a/fs/nfs/nfs42xdr.c b/fs/nfs/nfs42xdr.c index 95234208dc9e..9e3ae53e2205 100644 --- a/fs/nfs/nfs42xdr.c +++ b/fs/nfs/nfs42xdr.c @@ -54,10 +54,16 @@ (1 /* data_content4 */ + \ 2 /* data_info4.di_offset */ + \ 1 /* data_info4.di_length */) +#define NFS42_READ_PLUS_HOLE_SEGMENT_SIZE \ + (1 /* data_content4 */ + \ + 2 /* data_info4.di_offset */ + \ + 2 /* data_info4.di_length */) +#define READ_PLUS_SEGMENT_SIZE_DIFF (NFS42_READ_PLUS_HOLE_SEGMENT_SIZE - \ + NFS42_READ_PLUS_DATA_SEGMENT_SIZE) #define decode_read_plus_maxsz (op_decode_hdr_maxsz + \ 1 /* rpr_eof */ + \ 1 /* rpr_contents count */ + \ - NFS42_READ_PLUS_DATA_SEGMENT_SIZE) + NFS42_READ_PLUS_HOLE_SEGMENT_SIZE) #define encode_seek_maxsz (op_encode_hdr_maxsz + \ encode_stateid_maxsz + \ 2 /* offset */ + \ @@ -617,8 +623,8 @@ static void nfs4_xdr_enc_read_plus(struct rpc_rqst *req, encode_putfh(xdr, args->fh, &hdr); encode_read_plus(xdr, args, &hdr); - rpc_prepare_reply_pages(req, args->pages, args->pgbase, - args->count, hdr.replen); + rpc_prepare_reply_pages(req, args->pages, args->pgbase, args->count, + hdr.replen - READ_PLUS_SEGMENT_SIZE_DIFF); encode_nops(&hdr); } @@ -1056,13 +1062,12 @@ static int decode_read_plus(struct xdr_stream *xdr, struct nfs_pgio_res *res) res->eof = be32_to_cpup(p++); segments = be32_to_cpup(p++); if (segments == 0) - return status; + return 0; segs = kmalloc_array(segments, sizeof(*segs), GFP_KERNEL); if (!segs) return -ENOMEM; - status = -EIO; for (i = 0; i < segments; i++) { status = decode_read_plus_segment(xdr, &segs[i]); if (status < 0) @@ -1428,7 +1433,7 @@ static int nfs4_xdr_dec_read_plus(struct rpc_rqst *rqstp, struct compound_hdr hdr; int status; - xdr_set_scratch_buffer(xdr, res->scratch, sizeof(res->scratch)); + xdr_set_scratch_buffer(xdr, res->scratch, READ_PLUS_SCRATCH_SIZE); status = decode_compound_hdr(xdr, &hdr); if (status) diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h index 4c9f8bd866ab..581698f1b7b2 100644 --- a/fs/nfs/nfs4_fs.h +++ b/fs/nfs/nfs4_fs.h @@ -209,6 +209,7 @@ struct nfs4_exception { struct inode *inode; nfs4_stateid *stateid; long timeout; + unsigned short retrans; unsigned char task_is_privileged : 1; unsigned char delay : 1, recovering : 1, @@ -315,7 +316,7 @@ extern struct rpc_clnt *nfs4_proc_lookup_mountpoint(struct inode *, struct nfs_fh *, struct nfs_fattr *); extern int nfs4_proc_secinfo(struct inode *, const struct qstr *, struct nfs4_secinfo_flavors *); -extern const struct xattr_handler *nfs4_xattr_handlers[]; +extern const struct xattr_handler * const nfs4_xattr_handlers[]; extern int nfs4_set_rw_stateid(nfs4_stateid *stateid, const struct nfs_open_context *ctx, const struct nfs_lock_context *l_ctx, @@ -328,7 +329,7 @@ extern int update_open_stateid(struct nfs4_state *state, const nfs4_stateid *open_stateid, const nfs4_stateid *deleg_stateid, fmode_t fmode); -extern int nfs4_proc_setlease(struct file *file, long arg, +extern int nfs4_proc_setlease(struct file *file, int arg, struct file_lock **lease, void **priv); extern int nfs4_proc_get_lease_time(struct nfs_client *clp, struct nfs_fsinfo *fsinfo); @@ -546,6 +547,7 @@ extern unsigned short max_session_slots; extern unsigned short max_session_cb_slots; extern unsigned short send_implementation_id; extern bool recover_lost_locks; +extern short nfs_delay_retrans; #define NFS4_CLIENT_ID_UNIQ_LEN (64) extern char nfs4_client_id_uniquifier[NFS4_CLIENT_ID_UNIQ_LEN]; diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c index d9114a754db7..11e3a285594c 100644 --- a/fs/nfs/nfs4client.c +++ b/fs/nfs/nfs4client.c @@ -232,6 +232,8 @@ struct nfs_client *nfs4_alloc_client(const struct nfs_client_initdata *cl_init) __set_bit(NFS_CS_DISCRTRY, &clp->cl_flags); __set_bit(NFS_CS_NO_RETRANS_TIMEOUT, &clp->cl_flags); + if (test_bit(NFS_CS_DS, &cl_init->init_flags)) + __set_bit(NFS_CS_DS, &clp->cl_flags); /* * Set up the connection to the server before we add add to the * global list. @@ -415,6 +417,8 @@ static void nfs4_add_trunk(struct nfs_client *clp, struct nfs_client *old) .net = old->cl_net, .servername = old->cl_hostname, }; + int max_connect = test_bit(NFS_CS_PNFS, &clp->cl_flags) ? + clp->cl_max_connect : old->cl_max_connect; if (clp->cl_proto != old->cl_proto) return; @@ -428,7 +432,7 @@ static void nfs4_add_trunk(struct nfs_client *clp, struct nfs_client *old) xprt_args.addrlen = clp_salen; rpc_clnt_add_xprt(old->cl_rpcclient, &xprt_args, - rpc_clnt_test_and_add_xprt, NULL); + rpc_clnt_test_and_add_xprt, &max_connect); } /** @@ -1007,6 +1011,9 @@ struct nfs_client *nfs4_set_ds_client(struct nfs_server *mds_srv, if (mds_srv->flags & NFS_MOUNT_NORESVPORT) __set_bit(NFS_CS_NORESVPORT, &cl_init.init_flags); + __set_bit(NFS_CS_DS, &cl_init.init_flags); + __set_bit(NFS_CS_PNFS, &cl_init.init_flags); + cl_init.max_connect = NFS_MAX_TRANSPORTS; /* * Set an authflavor equual to the MDS value. Use the MDS nfs_client * cl_ipaddr so as to use the same EXCHANGE_ID co_ownerid as the MDS diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c index 4aeadd6e1a6d..02788c3c85e5 100644 --- a/fs/nfs/nfs4file.c +++ b/fs/nfs/nfs4file.c @@ -438,7 +438,7 @@ void nfs42_ssc_unregister_ops(void) } #endif /* CONFIG_NFS_V4_2 */ -static int nfs4_setlease(struct file *file, long arg, struct file_lock **lease, +static int nfs4_setlease(struct file *file, int arg, struct file_lock **lease, void **priv) { return nfs4_proc_setlease(file, arg, lease, priv); diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 832fa226b8f2..8a943fffaad5 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -585,6 +585,21 @@ wait_on_recovery: return 0; } +/* + * Track the number of NFS4ERR_DELAY related retransmissions and return + * EAGAIN if the 'softerr' mount option is set, and we've exceeded the limit + * set by 'nfs_delay_retrans'. + */ +static int nfs4_exception_should_retrans(const struct nfs_server *server, + struct nfs4_exception *exception) +{ + if (server->flags & NFS_MOUNT_SOFTERR && nfs_delay_retrans >= 0) { + if (exception->retrans++ >= (unsigned short)nfs_delay_retrans) + return -EAGAIN; + } + return 0; +} + /* This is the error handling routine for processes that are allowed * to sleep. */ @@ -595,6 +610,11 @@ int nfs4_handle_exception(struct nfs_server *server, int errorcode, struct nfs4_ ret = nfs4_do_handle_exception(server, errorcode, exception); if (exception->delay) { + int ret2 = nfs4_exception_should_retrans(server, exception); + if (ret2 < 0) { + exception->retry = 0; + return ret2; + } ret = nfs4_delay(&exception->timeout, exception->interruptible); goto out_retry; @@ -623,6 +643,11 @@ nfs4_async_handle_exception(struct rpc_task *task, struct nfs_server *server, ret = nfs4_do_handle_exception(server, errorcode, exception); if (exception->delay) { + int ret2 = nfs4_exception_should_retrans(server, exception); + if (ret2 < 0) { + exception->retry = 0; + return ret2; + } rpc_delay(task, nfs4_update_delay(&exception->timeout)); goto out_retry; } @@ -2703,8 +2728,12 @@ static int _nfs4_proc_open(struct nfs4_opendata *data, return status; } if (!(o_res->f_attr->valid & NFS_ATTR_FATTR)) { + struct nfs_fh *fh = &o_res->fh; + nfs4_sequence_free_slot(&o_res->seq_res); - nfs4_proc_getattr(server, &o_res->fh, o_res->f_attr, NULL); + if (o_arg->claim == NFS4_OPEN_CLAIM_FH) + fh = NFS_FH(d_inode(data->dentry)); + nfs4_proc_getattr(server, fh, o_res->f_attr, NULL); } return 0; } @@ -5007,9 +5036,10 @@ static void nfs4_free_createdata(struct nfs4_createdata *data) } static int _nfs4_proc_symlink(struct inode *dir, struct dentry *dentry, - struct page *page, unsigned int len, struct iattr *sattr, + struct folio *folio, unsigned int len, struct iattr *sattr, struct nfs4_label *label) { + struct page *page = &folio->page; struct nfs4_createdata *data; int status = -ENAMETOOLONG; @@ -5034,7 +5064,7 @@ out: } static int nfs4_proc_symlink(struct inode *dir, struct dentry *dentry, - struct page *page, unsigned int len, struct iattr *sattr) + struct folio *folio, unsigned int len, struct iattr *sattr) { struct nfs4_exception exception = { .interruptible = true, @@ -5045,7 +5075,7 @@ static int nfs4_proc_symlink(struct inode *dir, struct dentry *dentry, label = nfs4_label_init_security(dir, dentry, sattr, &l); do { - err = _nfs4_proc_symlink(dir, dentry, page, len, sattr, label); + err = _nfs4_proc_symlink(dir, dentry, folio, len, sattr, label); trace_nfs4_symlink(dir, &dentry->d_name, err); err = nfs4_handle_exception(NFS_SERVER(dir), err, &exception); @@ -5438,18 +5468,8 @@ static bool nfs4_read_plus_not_supported(struct rpc_task *task, return false; } -static inline void nfs4_read_plus_scratch_free(struct nfs_pgio_header *hdr) -{ - if (hdr->res.scratch) { - kfree(hdr->res.scratch); - hdr->res.scratch = NULL; - } -} - static int nfs4_read_done(struct rpc_task *task, struct nfs_pgio_header *hdr) { - nfs4_read_plus_scratch_free(hdr); - if (!nfs4_sequence_done(task, &hdr->res.seq_res)) return -EAGAIN; if (nfs4_read_stateid_changed(task, &hdr->args)) @@ -5469,8 +5489,7 @@ static bool nfs42_read_plus_support(struct nfs_pgio_header *hdr, /* Note: We don't use READ_PLUS with pNFS yet */ if (nfs_server_capable(hdr->inode, NFS_CAP_READ_PLUS) && !hdr->ds_clp) { msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_READ_PLUS]; - hdr->res.scratch = kmalloc(32, GFP_KERNEL); - return hdr->res.scratch != NULL; + return nfs_read_alloc_scratch(hdr, READ_PLUS_SCRATCH_SIZE); } return false; } @@ -5629,7 +5648,7 @@ static void nfs4_proc_write_setup(struct nfs_pgio_header *hdr, msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_WRITE]; nfs4_init_sequence(&hdr->args.seq_args, &hdr->res.seq_res, 0, 0); - nfs4_state_protect_write(server->nfs_client, clnt, msg, hdr); + nfs4_state_protect_write(hdr->ds_clp ? hdr->ds_clp : server->nfs_client, clnt, msg, hdr); } static void nfs4_proc_commit_rpc_prepare(struct rpc_task *task, struct nfs_commit_data *data) @@ -5670,7 +5689,8 @@ static void nfs4_proc_commit_setup(struct nfs_commit_data *data, struct rpc_mess data->res.server = server; msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_COMMIT]; nfs4_init_sequence(&data->args.seq_args, &data->res.seq_res, 1, 0); - nfs4_state_protect(server->nfs_client, NFS_SP4_MACH_CRED_COMMIT, clnt, msg); + nfs4_state_protect(data->ds_clp ? data->ds_clp : server->nfs_client, + NFS_SP4_MACH_CRED_COMMIT, clnt, msg); } static int _nfs4_proc_commit(struct file *dst, struct nfs_commitargs *args, @@ -7579,7 +7599,7 @@ static int nfs4_delete_lease(struct file *file, void **priv) return generic_setlease(file, F_UNLCK, NULL, priv); } -static int nfs4_add_lease(struct file *file, long arg, struct file_lock **lease, +static int nfs4_add_lease(struct file *file, int arg, struct file_lock **lease, void **priv) { struct inode *inode = file_inode(file); @@ -7597,7 +7617,7 @@ static int nfs4_add_lease(struct file *file, long arg, struct file_lock **lease, return -EAGAIN; } -int nfs4_proc_setlease(struct file *file, long arg, struct file_lock **lease, +int nfs4_proc_setlease(struct file *file, int arg, struct file_lock **lease, void **priv) { switch (arg) { @@ -8798,6 +8818,8 @@ nfs4_run_exchange_id(struct nfs_client *clp, const struct cred *cred, #ifdef CONFIG_NFS_V4_1_MIGRATION calldata->args.flags |= EXCHGID4_FLAG_SUPP_MOVED_MIGR; #endif + if (test_bit(NFS_CS_DS, &clp->cl_flags)) + calldata->args.flags |= EXCHGID4_FLAG_USE_PNFS_DS; msg.rpc_argp = &calldata->args; msg.rpc_resp = &calldata->res; task_setup_data.callback_data = calldata; @@ -8939,6 +8961,7 @@ void nfs4_test_session_trunk(struct rpc_clnt *clnt, struct rpc_xprt *xprt, sp4_how = (adata->clp->cl_sp4_flags == 0 ? SP4_NONE : SP4_MACH_CRED); +try_again: /* Test connection for session trunking. Async exchange_id call */ task = nfs4_run_exchange_id(adata->clp, adata->cred, sp4_how, xprt); if (IS_ERR(task)) @@ -8951,11 +8974,15 @@ void nfs4_test_session_trunk(struct rpc_clnt *clnt, struct rpc_xprt *xprt, if (status == 0) rpc_clnt_xprt_switch_add_xprt(clnt, xprt); - else if (rpc_clnt_xprt_switch_has_addr(clnt, + else if (status != -NFS4ERR_DELAY && rpc_clnt_xprt_switch_has_addr(clnt, (struct sockaddr *)&xprt->addr)) rpc_clnt_xprt_switch_remove_xprt(clnt, xprt); rpc_put_task(task); + if (status == -NFS4ERR_DELAY) { + ssleep(1); + goto try_again; + } } EXPORT_SYMBOL_GPL(nfs4_test_session_trunk); @@ -9626,6 +9653,9 @@ nfs4_layoutget_handle_exception(struct rpc_task *task, nfs4_sequence_free_slot(&lgp->res.seq_res); + exception->state = NULL; + exception->stateid = NULL; + switch (nfs4err) { case 0: goto out; @@ -9721,7 +9751,8 @@ static const struct rpc_call_ops nfs4_layoutget_call_ops = { }; struct pnfs_layout_segment * -nfs4_proc_layoutget(struct nfs4_layoutget *lgp, long *timeout) +nfs4_proc_layoutget(struct nfs4_layoutget *lgp, + struct nfs4_exception *exception) { struct inode *inode = lgp->args.inode; struct nfs_server *server = NFS_SERVER(inode); @@ -9741,13 +9772,10 @@ nfs4_proc_layoutget(struct nfs4_layoutget *lgp, long *timeout) RPC_TASK_MOVEABLE, }; struct pnfs_layout_segment *lseg = NULL; - struct nfs4_exception exception = { - .inode = inode, - .timeout = *timeout, - }; int status = 0; nfs4_init_sequence(&lgp->args.seq_args, &lgp->res.seq_res, 0, 0); + exception->retry = 0; task = rpc_run_task(&task_setup_data); if (IS_ERR(task)) @@ -9758,11 +9786,12 @@ nfs4_proc_layoutget(struct nfs4_layoutget *lgp, long *timeout) goto out; if (task->tk_status < 0) { - status = nfs4_layoutget_handle_exception(task, lgp, &exception); - *timeout = exception.timeout; + exception->retry = 1; + status = nfs4_layoutget_handle_exception(task, lgp, exception); } else if (lgp->res.layoutp->len == 0) { + exception->retry = 1; status = -EAGAIN; - *timeout = nfs4_update_delay(&exception.timeout); + nfs4_update_delay(&exception->timeout); } else lseg = pnfs_layout_process(lgp); out: @@ -10625,7 +10654,9 @@ static void nfs4_disable_swap(struct inode *inode) */ struct nfs_client *clp = NFS_SERVER(inode)->nfs_client; - nfs4_schedule_state_manager(clp); + set_bit(NFS4CLNT_RUN_MANAGER, &clp->cl_state); + clear_bit(NFS4CLNT_MANAGER_AVAILABLE, &clp->cl_state); + wake_up_var(&clp->cl_state); } static const struct inode_operations nfs4_dir_inode_operations = { @@ -10740,7 +10771,7 @@ static const struct xattr_handler nfs4_xattr_nfs4_user_handler = { }; #endif -const struct xattr_handler *nfs4_xattr_handlers[] = { +const struct xattr_handler * const nfs4_xattr_handlers[] = { &nfs4_xattr_nfs4_acl_handler, #if defined(CONFIG_NFS_V4_1) &nfs4_xattr_nfs4_dacl_handler, diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index e079987af4a3..9a5d911a7edc 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -1209,16 +1209,26 @@ void nfs4_schedule_state_manager(struct nfs_client *clp) { struct task_struct *task; char buf[INET6_ADDRSTRLEN + sizeof("-manager") + 1]; + struct rpc_clnt *clnt = clp->cl_rpcclient; + bool swapon = false; - if (clp->cl_rpcclient->cl_shutdown) + if (clnt->cl_shutdown) return; set_bit(NFS4CLNT_RUN_MANAGER, &clp->cl_state); - if (test_and_set_bit(NFS4CLNT_MANAGER_AVAILABLE, &clp->cl_state) != 0) { - wake_up_var(&clp->cl_state); - return; + + if (atomic_read(&clnt->cl_swapper)) { + swapon = !test_and_set_bit(NFS4CLNT_MANAGER_AVAILABLE, + &clp->cl_state); + if (!swapon) { + wake_up_var(&clp->cl_state); + return; + } } - set_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state); + + if (test_and_set_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state) != 0) + return; + __module_get(THIS_MODULE); refcount_inc(&clp->cl_count); @@ -1235,8 +1245,9 @@ void nfs4_schedule_state_manager(struct nfs_client *clp) __func__, PTR_ERR(task)); if (!nfs_client_init_is_complete(clp)) nfs_mark_client_ready(clp, PTR_ERR(task)); + if (swapon) + clear_bit(NFS4CLNT_MANAGER_AVAILABLE, &clp->cl_state); nfs4_clear_state_manager_bit(clp); - clear_bit(NFS4CLNT_MANAGER_AVAILABLE, &clp->cl_state); nfs_put_client(clp); module_put(THIS_MODULE); } @@ -2703,6 +2714,13 @@ static void nfs4_state_manager(struct nfs_client *clp) nfs4_end_drain_session(clp); nfs4_clear_state_manager_bit(clp); + if (test_bit(NFS4CLNT_RUN_MANAGER, &clp->cl_state) && + !test_and_set_bit(NFS4CLNT_MANAGER_RUNNING, + &clp->cl_state)) { + memflags = memalloc_nofs_save(); + continue; + } + if (!test_and_set_bit(NFS4CLNT_RECALL_RUNNING, &clp->cl_state)) { if (test_and_clear_bit(NFS4CLNT_DELEGRETURN, &clp->cl_state)) { nfs_client_return_marked_delegations(clp); @@ -2741,22 +2759,25 @@ static int nfs4_run_state_manager(void *ptr) allow_signal(SIGKILL); again: - set_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state); nfs4_state_manager(clp); - if (atomic_read(&cl->cl_swapper)) { + + if (test_bit(NFS4CLNT_MANAGER_AVAILABLE, &clp->cl_state) && + !test_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state)) { wait_var_event_interruptible(&clp->cl_state, test_bit(NFS4CLNT_RUN_MANAGER, &clp->cl_state)); - if (atomic_read(&cl->cl_swapper) && - test_bit(NFS4CLNT_RUN_MANAGER, &clp->cl_state)) + if (!atomic_read(&cl->cl_swapper)) + clear_bit(NFS4CLNT_MANAGER_AVAILABLE, &clp->cl_state); + if (refcount_read(&clp->cl_count) > 1 && !signalled() && + !test_and_set_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state)) goto again; /* Either no longer a swapper, or were signalled */ + clear_bit(NFS4CLNT_MANAGER_AVAILABLE, &clp->cl_state); } - clear_bit(NFS4CLNT_MANAGER_AVAILABLE, &clp->cl_state); if (refcount_read(&clp->cl_count) > 1 && !signalled() && test_bit(NFS4CLNT_RUN_MANAGER, &clp->cl_state) && - !test_and_set_bit(NFS4CLNT_MANAGER_AVAILABLE, &clp->cl_state)) + !test_and_set_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state)) goto again; nfs_put_client(clp); diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index 306cba0b9e69..21a365357629 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -1980,7 +1980,9 @@ pnfs_update_layout(struct inode *ino, struct pnfs_layout_segment *lseg = NULL; struct nfs4_layoutget *lgp; nfs4_stateid stateid; - long timeout = 0; + struct nfs4_exception exception = { + .inode = ino, + }; unsigned long giveup = jiffies + (clp->cl_lease_time << 1); bool first; @@ -2144,7 +2146,7 @@ lookup_again: lgp->lo = lo; pnfs_get_layout_hdr(lo); - lseg = nfs4_proc_layoutget(lgp, &timeout); + lseg = nfs4_proc_layoutget(lgp, &exception); trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg, PNFS_UPDATE_LAYOUT_SEND_LAYOUTGET); nfs_layoutget_end(lo); @@ -2171,6 +2173,8 @@ lookup_again: goto out_put_layout_hdr; } if (lseg) { + if (!exception.retry) + goto out_put_layout_hdr; if (first) pnfs_clear_first_layoutget(lo); trace_pnfs_update_layout(ino, pos, count, @@ -2634,31 +2638,44 @@ pnfs_should_return_unused_layout(struct pnfs_layout_hdr *lo, return mode == 0; } -static int -pnfs_layout_return_unused_byserver(struct nfs_server *server, void *data) +static int pnfs_layout_return_unused_byserver(struct nfs_server *server, + void *data) { const struct pnfs_layout_range *range = data; + const struct cred *cred; struct pnfs_layout_hdr *lo; struct inode *inode; + nfs4_stateid stateid; + enum pnfs_iomode iomode; + restart: rcu_read_lock(); list_for_each_entry_rcu(lo, &server->layouts, plh_layouts) { - if (!pnfs_layout_can_be_returned(lo) || + inode = lo->plh_inode; + if (!inode || !pnfs_layout_can_be_returned(lo) || test_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags)) continue; - inode = lo->plh_inode; spin_lock(&inode->i_lock); - if (!pnfs_should_return_unused_layout(lo, range)) { + if (!lo->plh_inode || + !pnfs_should_return_unused_layout(lo, range)) { spin_unlock(&inode->i_lock); continue; } + pnfs_get_layout_hdr(lo); + pnfs_set_plh_return_info(lo, range->iomode, 0); + if (pnfs_mark_matching_lsegs_return(lo, &lo->plh_return_segs, + range, 0) != 0 || + !pnfs_prepare_layoutreturn(lo, &stateid, &cred, &iomode)) { + spin_unlock(&inode->i_lock); + rcu_read_unlock(); + pnfs_put_layout_hdr(lo); + cond_resched(); + goto restart; + } spin_unlock(&inode->i_lock); - inode = pnfs_grab_inode_layout_hdr(lo); - if (!inode) - continue; rcu_read_unlock(); - pnfs_mark_layout_for_return(inode, range); - iput(inode); + pnfs_send_layoutreturn(lo, &stateid, &cred, iomode, false); + pnfs_put_layout_hdr(lo); cond_resched(); goto restart; } diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h index d886c8226d8f..db57a85500ee 100644 --- a/fs/nfs/pnfs.h +++ b/fs/nfs/pnfs.h @@ -35,6 +35,7 @@ #include <linux/nfs_page.h> #include <linux/workqueue.h> +struct nfs4_exception; struct nfs4_opendata; enum { @@ -245,7 +246,9 @@ extern size_t max_response_pages(struct nfs_server *server); extern int nfs4_proc_getdeviceinfo(struct nfs_server *server, struct pnfs_device *dev, const struct cred *cred); -extern struct pnfs_layout_segment* nfs4_proc_layoutget(struct nfs4_layoutget *lgp, long *timeout); +extern struct pnfs_layout_segment * +nfs4_proc_layoutget(struct nfs4_layoutget *lgp, + struct nfs4_exception *exception); extern int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp, bool sync); /* pnfs.c */ diff --git a/fs/nfs/pnfs_dev.c b/fs/nfs/pnfs_dev.c index ddbbf4fcda86..178001c90156 100644 --- a/fs/nfs/pnfs_dev.c +++ b/fs/nfs/pnfs_dev.c @@ -154,7 +154,7 @@ nfs4_get_device_info(struct nfs_server *server, set_bit(NFS_DEVICEID_NOCACHE, &d->flags); out_free_pages: - for (i = 0; i < max_pages; i++) + while (--i >= 0) __free_page(pages[i]); kfree(pages); out_free_pdev: diff --git a/fs/nfs/pnfs_nfs.c b/fs/nfs/pnfs_nfs.c index a0112ad4937a..afd23910f3bf 100644 --- a/fs/nfs/pnfs_nfs.c +++ b/fs/nfs/pnfs_nfs.c @@ -852,6 +852,7 @@ static int _nfs4_pnfs_v3_ds_connect(struct nfs_server *mds_srv, { struct nfs_client *clp = ERR_PTR(-EIO); struct nfs4_pnfs_ds_addr *da; + unsigned long connect_timeout = timeo * (retrans + 1) * HZ / 10; int status = 0; dprintk("--> %s DS %s\n", __func__, ds->ds_remotestr); @@ -870,6 +871,8 @@ static int _nfs4_pnfs_v3_ds_connect(struct nfs_server *mds_srv, .dstaddr = (struct sockaddr *)&da->da_addr, .addrlen = da->da_addrlen, .servername = clp->cl_hostname, + .connect_timeout = connect_timeout, + .reconnect_timeout = connect_timeout, }; if (da->da_transport != clp->cl_proto) @@ -943,7 +946,7 @@ static int _nfs4_pnfs_v4_ds_connect(struct nfs_server *mds_srv, * Test this address for session trunking and * add as an alias */ - xprtdata.cred = nfs4_get_clid_cred(clp), + xprtdata.cred = nfs4_get_clid_cred(clp); rpc_clnt_add_xprt(clp->cl_rpcclient, &xprt_args, rpc_clnt_setup_test_and_add_xprt, &rpcdata); diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c index e3570c656b0f..ad3a321ae997 100644 --- a/fs/nfs/proc.c +++ b/fs/nfs/proc.c @@ -396,9 +396,10 @@ nfs_proc_link(struct inode *inode, struct inode *dir, const struct qstr *name) } static int -nfs_proc_symlink(struct inode *dir, struct dentry *dentry, struct page *page, +nfs_proc_symlink(struct inode *dir, struct dentry *dentry, struct folio *folio, unsigned int len, struct iattr *sattr) { + struct page *page = &folio->page; struct nfs_fh *fh; struct nfs_fattr *fattr; struct nfs_symlinkargs arg = { diff --git a/fs/nfs/read.c b/fs/nfs/read.c index f71eeee67e20..7dc21a48e3e7 100644 --- a/fs/nfs/read.c +++ b/fs/nfs/read.c @@ -47,6 +47,8 @@ static struct nfs_pgio_header *nfs_readhdr_alloc(void) static void nfs_readhdr_free(struct nfs_pgio_header *rhdr) { + if (rhdr->res.scratch != NULL) + kfree(rhdr->res.scratch); kmem_cache_free(nfs_rdata_cachep, rhdr); } @@ -108,6 +110,14 @@ void nfs_pageio_reset_read_mds(struct nfs_pageio_descriptor *pgio) } EXPORT_SYMBOL_GPL(nfs_pageio_reset_read_mds); +bool nfs_read_alloc_scratch(struct nfs_pgio_header *hdr, size_t size) +{ + WARN_ON(hdr->res.scratch != NULL); + hdr->res.scratch = kmalloc(size, GFP_KERNEL); + return hdr->res.scratch != NULL; +} +EXPORT_SYMBOL_GPL(nfs_read_alloc_scratch); + static void nfs_readpage_release(struct nfs_page *req, int error) { struct folio *folio = nfs_page_to_folio(req); diff --git a/fs/nfs/super.c b/fs/nfs/super.c index 2284f749d892..075b31c93f87 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -129,11 +129,7 @@ static void nfs_ssc_unregister_ops(void) } #endif /* CONFIG_NFS_V4_2 */ -static struct shrinker acl_shrinker = { - .count_objects = nfs_access_cache_count, - .scan_objects = nfs_access_cache_scan, - .seeks = DEFAULT_SEEKS, -}; +static struct shrinker *acl_shrinker; /* * Register the NFS filesystems @@ -153,9 +149,18 @@ int __init register_nfs_fs(void) ret = nfs_register_sysctl(); if (ret < 0) goto error_2; - ret = register_shrinker(&acl_shrinker, "nfs-acl"); - if (ret < 0) + + acl_shrinker = shrinker_alloc(0, "nfs-acl"); + if (!acl_shrinker) { + ret = -ENOMEM; goto error_3; + } + + acl_shrinker->count_objects = nfs_access_cache_count; + acl_shrinker->scan_objects = nfs_access_cache_scan; + + shrinker_register(acl_shrinker); + #ifdef CONFIG_NFS_V4_2 nfs_ssc_register_ops(); #endif @@ -175,7 +180,7 @@ error_0: */ void __exit unregister_nfs_fs(void) { - unregister_shrinker(&acl_shrinker); + shrinker_free(acl_shrinker); nfs_unregister_sysctl(); unregister_nfs4_fs(); #ifdef CONFIG_NFS_V4_2 @@ -1071,7 +1076,7 @@ static void nfs_fill_super(struct super_block *sb, struct nfs_fs_context *ctx) sb->s_export_op = &nfs_export_ops; break; case 4: - sb->s_flags |= SB_POSIXACL; + sb->s_iflags |= SB_I_NOUMASK; sb->s_time_gran = 1; sb->s_time_min = S64_MIN; sb->s_time_max = S64_MAX; @@ -1339,15 +1344,13 @@ error_splat_super: void nfs_kill_super(struct super_block *s) { struct nfs_server *server = NFS_SB(s); - dev_t dev = s->s_dev; nfs_sysfs_move_sb_to_server(server); - generic_shutdown_super(s); + kill_anon_super(s); nfs_fscache_release_super_cookie(s); nfs_free_server(server); - free_anon_bdev(dev); } EXPORT_SYMBOL_GPL(nfs_kill_super); @@ -1368,6 +1371,7 @@ unsigned short max_session_cb_slots = NFS4_DEF_CB_SLOT_TABLE_SIZE; unsigned short send_implementation_id = 1; char nfs4_client_id_uniquifier[NFS4_CLIENT_ID_UNIQ_LEN] = ""; bool recover_lost_locks = false; +short nfs_delay_retrans = -1; EXPORT_SYMBOL_GPL(nfs_callback_nr_threads); EXPORT_SYMBOL_GPL(nfs_callback_set_tcpport); @@ -1378,6 +1382,7 @@ EXPORT_SYMBOL_GPL(max_session_cb_slots); EXPORT_SYMBOL_GPL(send_implementation_id); EXPORT_SYMBOL_GPL(nfs4_client_id_uniquifier); EXPORT_SYMBOL_GPL(recover_lost_locks); +EXPORT_SYMBOL_GPL(nfs_delay_retrans); #define NFS_CALLBACK_MAXPORTNR (65535U) @@ -1426,5 +1431,9 @@ MODULE_PARM_DESC(recover_lost_locks, "If the server reports that a lock might be lost, " "try to recover it risking data corruption."); - +module_param_named(delay_retrans, nfs_delay_retrans, short, 0644); +MODULE_PARM_DESC(delay_retrans, + "Unless negative, specifies the number of times the NFSv4 " + "client retries a request before returning an EAGAIN error, " + "after a reply of NFS4ERR_DELAY from the server."); #endif /* CONFIG_NFS_V4 */ diff --git a/fs/nfs/write.c b/fs/nfs/write.c index f4cca8f00c0c..b664caea8b4e 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -59,7 +59,8 @@ static const struct nfs_pgio_completion_ops nfs_async_write_completion_ops; static const struct nfs_commit_completion_ops nfs_commit_completion_ops; static const struct nfs_rw_ops nfs_rw_write_ops; static void nfs_inode_remove_request(struct nfs_page *req); -static void nfs_clear_request_commit(struct nfs_page *req); +static void nfs_clear_request_commit(struct nfs_commit_info *cinfo, + struct nfs_page *req); static void nfs_init_cinfo_from_inode(struct nfs_commit_info *cinfo, struct inode *inode); static struct nfs_page * @@ -502,8 +503,8 @@ nfs_destroy_unlinked_subrequests(struct nfs_page *destroy_list, * the (former) group. All subrequests are removed from any write or commit * lists, unlinked from the group and destroyed. */ -void -nfs_join_page_group(struct nfs_page *head, struct inode *inode) +void nfs_join_page_group(struct nfs_page *head, struct nfs_commit_info *cinfo, + struct inode *inode) { struct nfs_page *subreq; struct nfs_page *destroy_list = NULL; @@ -533,7 +534,7 @@ nfs_join_page_group(struct nfs_page *head, struct inode *inode) * Commit list removal accounting is done after locks are dropped */ subreq = head; do { - nfs_clear_request_commit(subreq); + nfs_clear_request_commit(cinfo, subreq); subreq = subreq->wb_this_page; } while (subreq != head); @@ -566,8 +567,10 @@ static struct nfs_page *nfs_lock_and_join_requests(struct folio *folio) { struct inode *inode = folio_file_mapping(folio)->host; struct nfs_page *head; + struct nfs_commit_info cinfo; int ret; + nfs_init_cinfo_from_inode(&cinfo, inode); /* * A reference is taken only on the head request which acts as a * reference to the whole page group - the group will not be destroyed @@ -584,7 +587,7 @@ static struct nfs_page *nfs_lock_and_join_requests(struct folio *folio) return ERR_PTR(ret); } - nfs_join_page_group(head, inode); + nfs_join_page_group(head, &cinfo, inode); return head; } @@ -736,6 +739,8 @@ int nfs_writepages(struct address_space *mapping, struct writeback_control *wbc) &pgio); pgio.pg_error = 0; nfs_pageio_complete(&pgio); + if (err == -EAGAIN && mntflags & NFS_MOUNT_SOFTERR) + break; } while (err < 0 && !nfs_error_is_fatal(err)); nfs_io_completion_put(ioc); @@ -785,6 +790,8 @@ static void nfs_inode_add_request(struct nfs_page *req) */ static void nfs_inode_remove_request(struct nfs_page *req) { + struct nfs_inode *nfsi = NFS_I(nfs_page_to_inode(req)); + if (nfs_page_group_sync_on_bit(req, PG_REMOVE)) { struct folio *folio = nfs_page_to_folio(req->wb_head); struct address_space *mapping = folio_file_mapping(folio); @@ -799,8 +806,8 @@ static void nfs_inode_remove_request(struct nfs_page *req) } if (test_and_clear_bit(PG_INODE_REF, &req->wb_flags)) { + atomic_long_dec(&nfsi->nrequests); nfs_release_request(req); - atomic_long_dec(&NFS_I(nfs_page_to_inode(req))->nrequests); } } @@ -955,18 +962,16 @@ static void nfs_folio_clear_commit(struct folio *folio) } /* Called holding the request lock on @req */ -static void -nfs_clear_request_commit(struct nfs_page *req) +static void nfs_clear_request_commit(struct nfs_commit_info *cinfo, + struct nfs_page *req) { if (test_bit(PG_CLEAN, &req->wb_flags)) { struct nfs_open_context *ctx = nfs_req_openctx(req); struct inode *inode = d_inode(ctx->dentry); - struct nfs_commit_info cinfo; - nfs_init_cinfo_from_inode(&cinfo, inode); mutex_lock(&NFS_I(inode)->commit_mutex); - if (!pnfs_clear_request_commit(req, &cinfo)) { - nfs_request_remove_commit_list(req, &cinfo); + if (!pnfs_clear_request_commit(req, cinfo)) { + nfs_request_remove_commit_list(req, cinfo); } mutex_unlock(&NFS_I(inode)->commit_mutex); nfs_folio_clear_commit(nfs_page_to_folio(req)); diff --git a/fs/nfsd/Makefile b/fs/nfsd/Makefile index 6fffc8f03f74..b8736a82e57c 100644 --- a/fs/nfsd/Makefile +++ b/fs/nfsd/Makefile @@ -12,7 +12,8 @@ nfsd-y += trace.o nfsd-y += nfssvc.o nfsctl.o nfsfh.o vfs.o \ export.o auth.o lockd.o nfscache.o \ - stats.o filecache.o nfs3proc.o nfs3xdr.o + stats.o filecache.o nfs3proc.o nfs3xdr.o \ + netlink.o nfsd-$(CONFIG_NFSD_V2) += nfsproc.o nfsxdr.o nfsd-$(CONFIG_NFSD_V2_ACL) += nfs2acl.o nfsd-$(CONFIG_NFSD_V3_ACL) += nfs3acl.o diff --git a/fs/nfsd/auth.c b/fs/nfsd/auth.c index fdf2aad73470..e6beaaf4f170 100644 --- a/fs/nfsd/auth.c +++ b/fs/nfsd/auth.c @@ -26,8 +26,6 @@ int nfsd_setuser(struct svc_rqst *rqstp, struct svc_export *exp) int i; int flags = nfsexp_flags(rqstp, exp); - validate_process_creds(); - /* discard any old override before preparing the new set */ revert_creds(get_cred(current_real_cred())); new = prepare_creds(); @@ -81,10 +79,8 @@ int nfsd_setuser(struct svc_rqst *rqstp, struct svc_export *exp) else new->cap_effective = cap_raise_nfsd_set(new->cap_effective, new->cap_permitted); - validate_process_creds(); put_cred(override_creds(new)); put_cred(new); - validate_process_creds(); return 0; oom: diff --git a/fs/nfsd/blocklayout.c b/fs/nfsd/blocklayout.c index 01d7fd108cf3..46fd74d91ea9 100644 --- a/fs/nfsd/blocklayout.c +++ b/fs/nfsd/blocklayout.c @@ -117,12 +117,13 @@ static __be32 nfsd4_block_commit_blocks(struct inode *inode, struct nfsd4_layoutcommit *lcp, struct iomap *iomaps, int nr_iomaps) { + struct timespec64 mtime = inode_get_mtime(inode); loff_t new_size = lcp->lc_last_wr + 1; struct iattr iattr = { .ia_valid = 0 }; int error; if (lcp->lc_mtime.tv_nsec == UTIME_NOW || - timespec64_compare(&lcp->lc_mtime, &inode->i_mtime) < 0) + timespec64_compare(&lcp->lc_mtime, &mtime) < 0) lcp->lc_mtime = current_time(inode); iattr.ia_valid |= ATTR_ATIME | ATTR_CTIME | ATTR_MTIME; iattr.ia_atime = iattr.ia_ctime = iattr.ia_mtime = lcp->lc_mtime; diff --git a/fs/nfsd/blocklayoutxdr.c b/fs/nfsd/blocklayoutxdr.c index 8e9c1a0f8d38..ce78f74715ee 100644 --- a/fs/nfsd/blocklayoutxdr.c +++ b/fs/nfsd/blocklayoutxdr.c @@ -16,9 +16,9 @@ __be32 nfsd4_block_encode_layoutget(struct xdr_stream *xdr, - struct nfsd4_layoutget *lgp) + const struct nfsd4_layoutget *lgp) { - struct pnfs_block_extent *b = lgp->lg_content; + const struct pnfs_block_extent *b = lgp->lg_content; int len = sizeof(__be32) + 5 * sizeof(__be64) + sizeof(__be32); __be32 *p; @@ -77,12 +77,21 @@ nfsd4_block_encode_volume(struct xdr_stream *xdr, struct pnfs_block_volume *b) __be32 nfsd4_block_encode_getdeviceinfo(struct xdr_stream *xdr, - struct nfsd4_getdeviceinfo *gdp) + const struct nfsd4_getdeviceinfo *gdp) { struct pnfs_block_deviceaddr *dev = gdp->gd_device; int len = sizeof(__be32), ret, i; __be32 *p; + /* + * See paragraph 5 of RFC 8881 S18.40.3. + */ + if (!gdp->gd_maxcount) { + if (xdr_stream_encode_u32(xdr, 0) != XDR_UNIT) + return nfserr_resource; + return nfs_ok; + } + p = xdr_reserve_space(xdr, len + sizeof(__be32)); if (!p) return nfserr_resource; diff --git a/fs/nfsd/blocklayoutxdr.h b/fs/nfsd/blocklayoutxdr.h index bc5166bfe46b..b0361e8aa9a7 100644 --- a/fs/nfsd/blocklayoutxdr.h +++ b/fs/nfsd/blocklayoutxdr.h @@ -51,9 +51,9 @@ struct pnfs_block_deviceaddr { }; __be32 nfsd4_block_encode_getdeviceinfo(struct xdr_stream *xdr, - struct nfsd4_getdeviceinfo *gdp); + const struct nfsd4_getdeviceinfo *gdp); __be32 nfsd4_block_encode_layoutget(struct xdr_stream *xdr, - struct nfsd4_layoutget *lgp); + const struct nfsd4_layoutget *lgp); int nfsd4_block_decode_layoutupdate(__be32 *p, u32 len, struct iomap **iomapp, u32 block_size); int nfsd4_scsi_decode_layoutupdate(__be32 *p, u32 len, struct iomap **iomapp, diff --git a/fs/nfsd/cache.h b/fs/nfsd/cache.h index 4c9b87850ab1..4cbe0434cbb8 100644 --- a/fs/nfsd/cache.h +++ b/fs/nfsd/cache.h @@ -19,7 +19,7 @@ * typical sockaddr_storage. This is for space reasons, since sockaddr_storage * is much larger than a sockaddr_in6. */ -struct svc_cacherep { +struct nfsd_cacherep { struct { /* Keep often-read xid, csum in the same cache line: */ __be32 k_xid; @@ -84,8 +84,10 @@ int nfsd_net_reply_cache_init(struct nfsd_net *nn); void nfsd_net_reply_cache_destroy(struct nfsd_net *nn); int nfsd_reply_cache_init(struct nfsd_net *); void nfsd_reply_cache_shutdown(struct nfsd_net *); -int nfsd_cache_lookup(struct svc_rqst *); -void nfsd_cache_update(struct svc_rqst *, int, __be32 *); +int nfsd_cache_lookup(struct svc_rqst *rqstp, unsigned int start, + unsigned int len, struct nfsd_cacherep **cacherep); +void nfsd_cache_update(struct svc_rqst *rqstp, struct nfsd_cacherep *rp, + int cachetype, __be32 *statp); int nfsd_reply_cache_stats_show(struct seq_file *m, void *v); #endif /* NFSCACHE_H */ diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c index 11a0eaa2f914..7b641095a665 100644 --- a/fs/nfsd/export.c +++ b/fs/nfsd/export.c @@ -339,12 +339,16 @@ static int export_stats_init(struct export_stats *stats) static void export_stats_reset(struct export_stats *stats) { - nfsd_percpu_counters_reset(stats->counter, EXP_STATS_COUNTERS_NUM); + if (stats) + nfsd_percpu_counters_reset(stats->counter, + EXP_STATS_COUNTERS_NUM); } static void export_stats_destroy(struct export_stats *stats) { - nfsd_percpu_counters_destroy(stats->counter, EXP_STATS_COUNTERS_NUM); + if (stats) + nfsd_percpu_counters_destroy(stats->counter, + EXP_STATS_COUNTERS_NUM); } static void svc_export_put(struct kref *ref) @@ -353,7 +357,8 @@ static void svc_export_put(struct kref *ref) path_put(&exp->ex_path); auth_domain_put(exp->ex_client); nfsd4_fslocs_free(&exp->ex_fslocs); - export_stats_destroy(&exp->ex_stats); + export_stats_destroy(exp->ex_stats); + kfree(exp->ex_stats); kfree(exp->ex_uuid); kfree_rcu(exp, ex_rcu); } @@ -421,8 +426,7 @@ static int check_export(struct path *path, int *flags, unsigned char *uuid) return -EINVAL; } - if (!inode->i_sb->s_export_op || - !inode->i_sb->s_export_op->fh_to_dentry) { + if (!exportfs_can_decode_fh(inode->i_sb->s_export_op)) { dprintk("exp_export: export of invalid fs type.\n"); return -EINVAL; } @@ -767,13 +771,15 @@ static int svc_export_show(struct seq_file *m, seq_putc(m, '\t'); seq_escape(m, exp->ex_client->name, " \t\n\\"); if (export_stats) { - seq_printf(m, "\t%lld\n", exp->ex_stats.start_time); + struct percpu_counter *counter = exp->ex_stats->counter; + + seq_printf(m, "\t%lld\n", exp->ex_stats->start_time); seq_printf(m, "\tfh_stale: %lld\n", - percpu_counter_sum_positive(&exp->ex_stats.counter[EXP_STATS_FH_STALE])); + percpu_counter_sum_positive(&counter[EXP_STATS_FH_STALE])); seq_printf(m, "\tio_read: %lld\n", - percpu_counter_sum_positive(&exp->ex_stats.counter[EXP_STATS_IO_READ])); + percpu_counter_sum_positive(&counter[EXP_STATS_IO_READ])); seq_printf(m, "\tio_write: %lld\n", - percpu_counter_sum_positive(&exp->ex_stats.counter[EXP_STATS_IO_WRITE])); + percpu_counter_sum_positive(&counter[EXP_STATS_IO_WRITE])); seq_putc(m, '\n'); return 0; } @@ -819,7 +825,7 @@ static void svc_export_init(struct cache_head *cnew, struct cache_head *citem) new->ex_layout_types = 0; new->ex_uuid = NULL; new->cd = item->cd; - export_stats_reset(&new->ex_stats); + export_stats_reset(new->ex_stats); } static void export_update(struct cache_head *cnew, struct cache_head *citem) @@ -856,7 +862,14 @@ static struct cache_head *svc_export_alloc(void) if (!i) return NULL; - if (export_stats_init(&i->ex_stats)) { + i->ex_stats = kmalloc(sizeof(*(i->ex_stats)), GFP_KERNEL); + if (!i->ex_stats) { + kfree(i); + return NULL; + } + + if (export_stats_init(i->ex_stats)) { + kfree(i->ex_stats); kfree(i); return NULL; } diff --git a/fs/nfsd/export.h b/fs/nfsd/export.h index 2df8ae25aad3..ca9dc230ae3d 100644 --- a/fs/nfsd/export.h +++ b/fs/nfsd/export.h @@ -64,10 +64,10 @@ struct svc_export { struct cache_head h; struct auth_domain * ex_client; int ex_flags; + int ex_fsid; struct path ex_path; kuid_t ex_anon_uid; kgid_t ex_anon_gid; - int ex_fsid; unsigned char * ex_uuid; /* 16 byte fsid */ struct nfsd4_fs_locations ex_fslocs; uint32_t ex_nflavors; @@ -76,8 +76,8 @@ struct svc_export { struct nfsd4_deviceid_map *ex_devid_map; struct cache_detail *cd; struct rcu_head ex_rcu; - struct export_stats ex_stats; unsigned long ex_xprtsec_modes; + struct export_stats *ex_stats; }; /* an "export key" (expkey) maps a filehandlefragement to an diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c index ee9c923192e0..ef063f93fde9 100644 --- a/fs/nfsd/filecache.c +++ b/fs/nfsd/filecache.c @@ -521,11 +521,7 @@ nfsd_file_lru_scan(struct shrinker *s, struct shrink_control *sc) return ret; } -static struct shrinker nfsd_file_shrinker = { - .scan_objects = nfsd_file_lru_scan, - .count_objects = nfsd_file_lru_count, - .seeks = 1, -}; +static struct shrinker *nfsd_file_shrinker; /** * nfsd_file_cond_queue - conditionally unhash and queue a nfsd_file @@ -746,12 +742,19 @@ nfsd_file_cache_init(void) goto out_err; } - ret = register_shrinker(&nfsd_file_shrinker, "nfsd-filecache"); - if (ret) { - pr_err("nfsd: failed to register nfsd_file_shrinker: %d\n", ret); + nfsd_file_shrinker = shrinker_alloc(0, "nfsd-filecache"); + if (!nfsd_file_shrinker) { + ret = -ENOMEM; + pr_err("nfsd: failed to allocate nfsd_file_shrinker\n"); goto out_lru; } + nfsd_file_shrinker->count_objects = nfsd_file_lru_count; + nfsd_file_shrinker->scan_objects = nfsd_file_lru_scan; + nfsd_file_shrinker->seeks = 1; + + shrinker_register(nfsd_file_shrinker); + ret = lease_register_notifier(&nfsd_file_lease_notifier); if (ret) { pr_err("nfsd: unable to register lease notifier: %d\n", ret); @@ -774,7 +777,7 @@ out: out_notifier: lease_unregister_notifier(&nfsd_file_lease_notifier); out_shrinker: - unregister_shrinker(&nfsd_file_shrinker); + shrinker_free(nfsd_file_shrinker); out_lru: list_lru_destroy(&nfsd_file_lru); out_err: @@ -891,7 +894,7 @@ nfsd_file_cache_shutdown(void) return; lease_unregister_notifier(&nfsd_file_lease_notifier); - unregister_shrinker(&nfsd_file_shrinker); + shrinker_free(nfsd_file_shrinker); /* * make sure all callers of nfsd_file_lru_cb are done before * calling nfsd_file_cache_purge @@ -989,22 +992,21 @@ nfsd_file_do_acquire(struct svc_rqst *rqstp, struct svc_fh *fhp, unsigned char need = may_flags & NFSD_FILE_MAY_MASK; struct net *net = SVC_NET(rqstp); struct nfsd_file *new, *nf; - const struct cred *cred; + bool stale_retry = true; bool open_retry = true; struct inode *inode; __be32 status; int ret; +retry: status = fh_verify(rqstp, fhp, S_IFREG, may_flags|NFSD_MAY_OWNER_OVERRIDE); if (status != nfs_ok) return status; inode = d_inode(fhp->fh_dentry); - cred = get_current_cred(); -retry: rcu_read_lock(); - nf = nfsd_file_lookup_locked(net, cred, inode, need, want_gc); + nf = nfsd_file_lookup_locked(net, current_cred(), inode, need, want_gc); rcu_read_unlock(); if (nf) { @@ -1026,7 +1028,7 @@ retry: rcu_read_lock(); spin_lock(&inode->i_lock); - nf = nfsd_file_lookup_locked(net, cred, inode, need, want_gc); + nf = nfsd_file_lookup_locked(net, current_cred(), inode, need, want_gc); if (unlikely(nf)) { spin_unlock(&inode->i_lock); rcu_read_unlock(); @@ -1058,6 +1060,7 @@ wait_for_construction: goto construction_err; } open_retry = false; + fh_put(fhp); goto retry; } this_cpu_inc(nfsd_file_cache_hits); @@ -1074,7 +1077,6 @@ out: nfsd_file_check_write_error(nf); *pnf = nf; } - put_cred(cred); trace_nfsd_file_acquire(rqstp, inode, may_flags, nf, status); return status; @@ -1088,8 +1090,20 @@ open_file: status = nfs_ok; trace_nfsd_file_opened(nf, status); } else { - status = nfsd_open_verified(rqstp, fhp, may_flags, - &nf->nf_file); + ret = nfsd_open_verified(rqstp, fhp, may_flags, + &nf->nf_file); + if (ret == -EOPENSTALE && stale_retry) { + stale_retry = false; + nfsd_file_unhash(nf); + clear_and_wake_up_bit(NFSD_FILE_PENDING, + &nf->nf_flags); + if (refcount_dec_and_test(&nf->nf_ref)) + nfsd_file_free(nf); + nf = NULL; + fh_put(fhp); + goto retry; + } + status = nfserrno(ret); trace_nfsd_file_open(nf, status); } } else diff --git a/fs/nfsd/flexfilelayoutxdr.c b/fs/nfsd/flexfilelayoutxdr.c index e81d2a5cf381..aeb71c10ff1b 100644 --- a/fs/nfsd/flexfilelayoutxdr.c +++ b/fs/nfsd/flexfilelayoutxdr.c @@ -17,9 +17,9 @@ struct ff_idmap { __be32 nfsd4_ff_encode_layoutget(struct xdr_stream *xdr, - struct nfsd4_layoutget *lgp) + const struct nfsd4_layoutget *lgp) { - struct pnfs_ff_layout *fl = lgp->lg_content; + const struct pnfs_ff_layout *fl = lgp->lg_content; int len, mirror_len, ds_len, fh_len; __be32 *p; @@ -77,7 +77,7 @@ nfsd4_ff_encode_layoutget(struct xdr_stream *xdr, __be32 nfsd4_ff_encode_getdeviceinfo(struct xdr_stream *xdr, - struct nfsd4_getdeviceinfo *gdp) + const struct nfsd4_getdeviceinfo *gdp) { struct pnfs_ff_device_addr *da = gdp->gd_device; int len; @@ -85,6 +85,15 @@ nfsd4_ff_encode_getdeviceinfo(struct xdr_stream *xdr, int addr_len; __be32 *p; + /* + * See paragraph 5 of RFC 8881 S18.40.3. + */ + if (!gdp->gd_maxcount) { + if (xdr_stream_encode_u32(xdr, 0) != XDR_UNIT) + return nfserr_resource; + return nfs_ok; + } + /* len + padding for two strings */ addr_len = 16 + da->netaddr.netid_len + da->netaddr.addr_len; ver_len = 20; diff --git a/fs/nfsd/flexfilelayoutxdr.h b/fs/nfsd/flexfilelayoutxdr.h index 8e195aeca023..6d5a1066a903 100644 --- a/fs/nfsd/flexfilelayoutxdr.h +++ b/fs/nfsd/flexfilelayoutxdr.h @@ -43,8 +43,8 @@ struct pnfs_ff_layout { }; __be32 nfsd4_ff_encode_getdeviceinfo(struct xdr_stream *xdr, - struct nfsd4_getdeviceinfo *gdp); + const struct nfsd4_getdeviceinfo *gdp); __be32 nfsd4_ff_encode_layoutget(struct xdr_stream *xdr, - struct nfsd4_layoutget *lgp); + const struct nfsd4_layoutget *lgp); #endif /* _NFSD_FLEXFILELAYOUTXDR_H */ diff --git a/fs/nfsd/netlink.c b/fs/nfsd/netlink.c new file mode 100644 index 000000000000..0e1d635ec5f9 --- /dev/null +++ b/fs/nfsd/netlink.c @@ -0,0 +1,32 @@ +// SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause) +/* Do not edit directly, auto-generated from: */ +/* Documentation/netlink/specs/nfsd.yaml */ +/* YNL-GEN kernel source */ + +#include <net/netlink.h> +#include <net/genetlink.h> + +#include "netlink.h" + +#include <uapi/linux/nfsd_netlink.h> + +/* Ops table for nfsd */ +static const struct genl_split_ops nfsd_nl_ops[] = { + { + .cmd = NFSD_CMD_RPC_STATUS_GET, + .start = nfsd_nl_rpc_status_get_start, + .dumpit = nfsd_nl_rpc_status_get_dumpit, + .done = nfsd_nl_rpc_status_get_done, + .flags = GENL_CMD_CAP_DUMP, + }, +}; + +struct genl_family nfsd_nl_family __ro_after_init = { + .name = NFSD_FAMILY_NAME, + .version = NFSD_FAMILY_VERSION, + .netnsok = true, + .parallel_ops = true, + .module = THIS_MODULE, + .split_ops = nfsd_nl_ops, + .n_split_ops = ARRAY_SIZE(nfsd_nl_ops), +}; diff --git a/fs/nfsd/netlink.h b/fs/nfsd/netlink.h new file mode 100644 index 000000000000..d83dd6bdee92 --- /dev/null +++ b/fs/nfsd/netlink.h @@ -0,0 +1,22 @@ +/* SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause) */ +/* Do not edit directly, auto-generated from: */ +/* Documentation/netlink/specs/nfsd.yaml */ +/* YNL-GEN kernel header */ + +#ifndef _LINUX_NFSD_GEN_H +#define _LINUX_NFSD_GEN_H + +#include <net/netlink.h> +#include <net/genetlink.h> + +#include <uapi/linux/nfsd_netlink.h> + +int nfsd_nl_rpc_status_get_start(struct netlink_callback *cb); +int nfsd_nl_rpc_status_get_done(struct netlink_callback *cb); + +int nfsd_nl_rpc_status_get_dumpit(struct sk_buff *skb, + struct netlink_callback *cb); + +extern struct genl_family nfsd_nl_family; + +#endif /* _LINUX_NFSD_GEN_H */ diff --git a/fs/nfsd/netns.h b/fs/nfsd/netns.h index ec49b200b797..ab303a8b77d5 100644 --- a/fs/nfsd/netns.h +++ b/fs/nfsd/netns.h @@ -177,7 +177,7 @@ struct nfsd_net { /* size of cache when we saw the longest hash chain */ unsigned int longest_chain_cachesize; - struct shrinker nfsd_reply_cache_shrinker; + struct shrinker *nfsd_reply_cache_shrinker; /* tracking server-to-server copy mounts */ spinlock_t nfsd_ssc_lock; @@ -195,7 +195,7 @@ struct nfsd_net { int nfs4_max_clients; atomic_t nfsd_courtesy_clients; - struct shrinker nfsd_client_shrinker; + struct shrinker *nfsd_client_shrinker; struct work_struct nfsd_shrinker_work; }; diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c index fc8d5b7db9f8..b78eceebd945 100644 --- a/fs/nfsd/nfs3proc.c +++ b/fs/nfsd/nfs3proc.c @@ -171,7 +171,8 @@ nfsd3_proc_read(struct svc_rqst *rqstp) * + 1 (xdr opaque byte count) = 26 */ resp->count = argp->count; - svc_reserve_auth(rqstp, ((1 + NFS3_POST_OP_ATTR_WORDS + 3)<<2) + resp->count +4); + svc_reserve_auth(rqstp, ((1 + NFS3_POST_OP_ATTR_WORDS + 3) << 2) + + resp->count + 4); fh_copy(&resp->fh, &argp->fh); resp->status = nfsd_read(rqstp, &resp->fh, argp->offset, @@ -194,7 +195,7 @@ nfsd3_proc_write(struct svc_rqst *rqstp) SVCFH_fmt(&argp->fh), argp->len, (unsigned long long) argp->offset, - argp->stable? " stable" : ""); + argp->stable ? " stable" : ""); resp->status = nfserr_fbig; if (argp->offset > (u64)OFFSET_MAX || @@ -294,8 +295,8 @@ nfsd3_create_file(struct svc_rqst *rqstp, struct svc_fh *fhp, status = nfserr_exist; break; case NFS3_CREATE_EXCLUSIVE: - if (d_inode(child)->i_mtime.tv_sec == v_mtime && - d_inode(child)->i_atime.tv_sec == v_atime && + if (inode_get_mtime_sec(d_inode(child)) == v_mtime && + inode_get_atime_sec(d_inode(child)) == v_atime && d_inode(child)->i_size == 0) { break; } @@ -307,7 +308,9 @@ nfsd3_create_file(struct svc_rqst *rqstp, struct svc_fh *fhp, if (!IS_POSIXACL(inode)) iap->ia_mode &= ~current_umask(); - fh_fill_pre_attrs(fhp); + status = fh_fill_pre_attrs(fhp); + if (status != nfs_ok) + goto out; host_err = vfs_create(&nop_mnt_idmap, inode, child, iap->ia_mode, true); if (host_err < 0) { status = nfserrno(host_err); diff --git a/fs/nfsd/nfs4acl.c b/fs/nfsd/nfs4acl.c index 518203821790..96e786b5e544 100644 --- a/fs/nfsd/nfs4acl.c +++ b/fs/nfsd/nfs4acl.c @@ -441,7 +441,7 @@ struct posix_ace_state_array { * calculated so far: */ struct posix_acl_state { - int empty; + unsigned char valid; struct posix_ace_state owner; struct posix_ace_state group; struct posix_ace_state other; @@ -457,7 +457,6 @@ init_state(struct posix_acl_state *state, int cnt) int alloc; memset(state, 0, sizeof(struct posix_acl_state)); - state->empty = 1; /* * In the worst case, each individual acl could be for a distinct * named user or group, but we don't know which, so we allocate @@ -500,7 +499,7 @@ posix_state_to_acl(struct posix_acl_state *state, unsigned int flags) * and effective cases: when there are no inheritable ACEs, * calls ->set_acl with a NULL ACL structure. */ - if (state->empty && (flags & NFS4_ACL_TYPE_DEFAULT)) + if (!state->valid && (flags & NFS4_ACL_TYPE_DEFAULT)) return NULL; /* @@ -622,11 +621,12 @@ static void process_one_v4_ace(struct posix_acl_state *state, struct nfs4_ace *ace) { u32 mask = ace->access_mask; + short type = ace2type(ace); int i; - state->empty = 0; + state->valid |= type; - switch (ace2type(ace)) { + switch (type) { case ACL_USER_OBJ: if (ace->type == NFS4_ACE_ACCESS_ALLOWED_ACE_TYPE) { allow_bits(&state->owner, mask); @@ -726,6 +726,30 @@ static int nfs4_acl_nfsv4_to_posix(struct nfs4_acl *acl, if (!(ace->flag & NFS4_ACE_INHERIT_ONLY_ACE)) process_one_v4_ace(&effective_acl_state, ace); } + + /* + * At this point, the default ACL may have zeroed-out entries for owner, + * group and other. That usually results in a non-sensical resulting ACL + * that denies all access except to any ACE that was explicitly added. + * + * The setfacl command solves a similar problem with this logic: + * + * "If a Default ACL entry is created, and the Default ACL contains + * no owner, owning group, or others entry, a copy of the ACL + * owner, owning group, or others entry is added to the Default ACL." + * + * Copy any missing ACEs from the effective set, if any ACEs were + * explicitly set. + */ + if (default_acl_state.valid) { + if (!(default_acl_state.valid & ACL_USER_OBJ)) + default_acl_state.owner = effective_acl_state.owner; + if (!(default_acl_state.valid & ACL_GROUP_OBJ)) + default_acl_state.group = effective_acl_state.group; + if (!(default_acl_state.valid & ACL_OTHER)) + default_acl_state.other = effective_acl_state.other; + } + *pacl = posix_state_to_acl(&effective_acl_state, flags); if (IS_ERR(*pacl)) { ret = PTR_ERR(*pacl); diff --git a/fs/nfsd/nfs4layouts.c b/fs/nfsd/nfs4layouts.c index e8a80052cb1b..5e8096bc5eaa 100644 --- a/fs/nfsd/nfs4layouts.c +++ b/fs/nfsd/nfs4layouts.c @@ -515,11 +515,11 @@ nfsd4_return_file_layouts(struct svc_rqst *rqstp, if (!list_empty(&ls->ls_layouts)) { if (found) nfs4_inc_and_copy_stateid(&lrp->lr_sid, &ls->ls_stid); - lrp->lrs_present = 1; + lrp->lrs_present = true; } else { trace_nfsd_layoutstate_unhash(&ls->ls_stid.sc_stateid); nfs4_unhash_stid(&ls->ls_stid); - lrp->lrs_present = 0; + lrp->lrs_present = false; } spin_unlock(&ls->ls_lock); @@ -539,7 +539,7 @@ nfsd4_return_client_layouts(struct svc_rqst *rqstp, struct nfs4_layout *lp, *t; LIST_HEAD(reaplist); - lrp->lrs_present = 0; + lrp->lrs_present = false; spin_lock(&clp->cl_lock); list_for_each_entry_safe(ls, n, &clp->cl_lo_states, ls_perclnt) { diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index 5ae670807449..6f2d4aa4970d 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -297,12 +297,12 @@ nfsd4_create_file(struct svc_rqst *rqstp, struct svc_fh *fhp, } if (d_really_is_positive(child)) { - status = nfs_ok; - /* NFSv4 protocol requires change attributes even though * no change happened. */ - fh_fill_both_attrs(fhp); + status = fh_fill_both_attrs(fhp); + if (status != nfs_ok) + goto out; switch (open->op_createmode) { case NFS4_CREATE_UNCHECKED: @@ -322,8 +322,8 @@ nfsd4_create_file(struct svc_rqst *rqstp, struct svc_fh *fhp, status = nfserr_exist; break; case NFS4_CREATE_EXCLUSIVE: - if (d_inode(child)->i_mtime.tv_sec == v_mtime && - d_inode(child)->i_atime.tv_sec == v_atime && + if (inode_get_mtime_sec(d_inode(child)) == v_mtime && + inode_get_atime_sec(d_inode(child)) == v_atime && d_inode(child)->i_size == 0) { open->op_created = true; break; /* subtle */ @@ -331,8 +331,8 @@ nfsd4_create_file(struct svc_rqst *rqstp, struct svc_fh *fhp, status = nfserr_exist; break; case NFS4_CREATE_EXCLUSIVE4_1: - if (d_inode(child)->i_mtime.tv_sec == v_mtime && - d_inode(child)->i_atime.tv_sec == v_atime && + if (inode_get_mtime_sec(d_inode(child)) == v_mtime && + inode_get_atime_sec(d_inode(child)) == v_atime && d_inode(child)->i_size == 0) { open->op_created = true; goto set_attr; /* subtle */ @@ -345,7 +345,9 @@ nfsd4_create_file(struct svc_rqst *rqstp, struct svc_fh *fhp, if (!IS_POSIXACL(inode)) iap->ia_mode &= ~current_umask(); - fh_fill_pre_attrs(fhp); + status = fh_fill_pre_attrs(fhp); + if (status != nfs_ok) + goto out; status = nfsd4_vfs_create(fhp, child, open); if (status != nfs_ok) goto out; @@ -380,6 +382,38 @@ out: return status; } +/** + * set_change_info - set up the change_info4 for a reply + * @cinfo: pointer to nfsd4_change_info to be populated + * @fhp: pointer to svc_fh to use as source + * + * Many operations in NFSv4 require change_info4 in the reply. This function + * populates that from the info that we (should!) have already collected. In + * the event that we didn't get any pre-attrs, just zero out both. + */ +static void +set_change_info(struct nfsd4_change_info *cinfo, struct svc_fh *fhp) +{ + cinfo->atomic = (u32)(fhp->fh_pre_saved && fhp->fh_post_saved && !fhp->fh_no_atomic_attr); + cinfo->before_change = fhp->fh_pre_change; + cinfo->after_change = fhp->fh_post_change; + + /* + * If fetching the pre-change attributes failed, then we should + * have already failed the whole operation. We could have still + * failed to fetch post-change attributes however. + * + * If we didn't get post-op attrs, just zero-out the after + * field since we don't know what it should be. If the pre_saved + * field isn't set for some reason, throw warning and just copy + * whatever is in the after field. + */ + if (WARN_ON_ONCE(!fhp->fh_pre_saved)) + cinfo->before_change = 0; + if (!fhp->fh_post_saved) + cinfo->after_change = cinfo->before_change + 1; +} + static __be32 do_open_lookup(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_open *open, struct svc_fh **resfh) { @@ -424,11 +458,11 @@ do_open_lookup(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, stru } else { status = nfsd_lookup(rqstp, current_fh, open->op_fname, open->op_fnamelen, *resfh); - if (!status) + if (status == nfs_ok) /* NFSv4 protocol requires change attributes even though * no change happened. */ - fh_fill_both_attrs(current_fh); + status = fh_fill_both_attrs(current_fh); } if (status) goto out; @@ -1024,8 +1058,8 @@ nfsd4_rename(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, rename->rn_tname, rename->rn_tnamelen); if (status) return status; - set_change_info(&rename->rn_sinfo, &cstate->current_fh); - set_change_info(&rename->rn_tinfo, &cstate->save_fh); + set_change_info(&rename->rn_sinfo, &cstate->save_fh); + set_change_info(&rename->rn_tinfo, &cstate->current_fh); return nfs_ok; } @@ -1295,7 +1329,8 @@ extern void nfs_sb_deactive(struct super_block *sb); * setup a work entry in the ssc delayed unmount list. */ static __be32 nfsd4_ssc_setup_dul(struct nfsd_net *nn, char *ipaddr, - struct nfsd4_ssc_umount_item **nsui) + struct nfsd4_ssc_umount_item **nsui, + struct svc_rqst *rqstp) { struct nfsd4_ssc_umount_item *ni = NULL; struct nfsd4_ssc_umount_item *work = NULL; @@ -1313,12 +1348,11 @@ try_again: /* found a match */ if (ni->nsui_busy) { /* wait - and try again */ - prepare_to_wait(&nn->nfsd_ssc_waitq, &wait, - TASK_INTERRUPTIBLE); + prepare_to_wait(&nn->nfsd_ssc_waitq, &wait, TASK_IDLE); spin_unlock(&nn->nfsd_ssc_lock); /* allow 20secs for mount/unmount for now - revisit */ - if (signal_pending(current) || + if (svc_thread_should_stop(rqstp) || (schedule_timeout(20*HZ) == 0)) { finish_wait(&nn->nfsd_ssc_waitq, &wait); kfree(work); @@ -1434,7 +1468,7 @@ nfsd4_interssc_connect(struct nl4_server *nss, struct svc_rqst *rqstp, goto out_free_rawdata; snprintf(dev_name, len + 5, "%s%s%s:/", startsep, ipaddr, endsep); - status = nfsd4_ssc_setup_dul(nn, ipaddr, nsui); + status = nfsd4_ssc_setup_dul(nn, ipaddr, nsui, rqstp); if (status) goto out_free_devname; if ((*nsui)->nsui_vfsmount) @@ -1609,6 +1643,7 @@ static ssize_t _nfsd_copy_file_range(struct nfsd4_copy *copy, if (bytes_total == 0) bytes_total = ULLONG_MAX; do { + /* Only async copies can be stopped here */ if (kthread_should_stop()) break; bytes_copied = nfsd_copy_file_range(src, src_pos, dst, dst_pos, @@ -1727,6 +1762,7 @@ static int nfsd4_do_async_copy(void *data) struct nfsd4_copy *copy = (struct nfsd4_copy *)data; __be32 nfserr; + trace_nfsd_copy_do_async(copy); if (nfsd4_ssc_is_inter(copy)) { struct file *filp; @@ -1765,21 +1801,27 @@ nfsd4_copy(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, __be32 status; struct nfsd4_copy *async_copy = NULL; + copy->cp_clp = cstate->clp; if (nfsd4_ssc_is_inter(copy)) { + trace_nfsd_copy_inter(copy); if (!inter_copy_offload_enable || nfsd4_copy_is_sync(copy)) { status = nfserr_notsupp; goto out; } status = nfsd4_setup_inter_ssc(rqstp, cstate, copy); - if (status) + if (status) { + trace_nfsd_copy_done(copy, status); return nfserr_offload_denied; + } } else { + trace_nfsd_copy_intra(copy); status = nfsd4_setup_intra_ssc(rqstp, cstate, copy); - if (status) + if (status) { + trace_nfsd_copy_done(copy, status); return status; + } } - copy->cp_clp = cstate->clp; memcpy(©->fh, &cstate->current_fh.fh_handle, sizeof(struct knfsd_fh)); if (nfsd4_copy_is_async(copy)) { @@ -1814,6 +1856,7 @@ nfsd4_copy(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, copy->nf_dst->nf_file, true); } out: + trace_nfsd_copy_done(copy, status); release_copy_files(copy); return status; out_err: @@ -1896,8 +1939,8 @@ nfsd4_copy_notify(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, if (status) return status; - cn->cpn_sec = nn->nfsd4_lease; - cn->cpn_nsec = 0; + cn->cpn_lease_time.tv_sec = nn->nfsd4_lease; + cn->cpn_lease_time.tv_nsec = 0; status = nfserrno(-ENOMEM); cps = nfs4_alloc_init_cpntf_state(nn, stid); @@ -2314,10 +2357,10 @@ nfsd4_layoutcommit(struct svc_rqst *rqstp, mutex_unlock(&ls->ls_mutex); if (new_size > i_size_read(inode)) { - lcp->lc_size_chg = 1; + lcp->lc_size_chg = true; lcp->lc_newsize = new_size; } else { - lcp->lc_size_chg = 0; + lcp->lc_size_chg = false; } nfserr = ops->proc_layoutcommit(inode, lcp); @@ -3167,6 +3210,7 @@ static const struct nfsd4_operation nfsd4_ops[] = { }, [OP_LOCK] = { .op_func = nfsd4_lock, + .op_release = nfsd4_lock_release, .op_flags = OP_MODIFIES_SOMETHING | OP_NONTRIVIAL_ERROR_ENCODE, .op_name = "OP_LOCK", @@ -3175,6 +3219,7 @@ static const struct nfsd4_operation nfsd4_ops[] = { }, [OP_LOCKT] = { .op_func = nfsd4_lockt, + .op_release = nfsd4_lockt_release, .op_flags = OP_NONTRIVIAL_ERROR_ENCODE, .op_name = "OP_LOCKT", .op_rsize_bop = nfsd4_lock_rsize, diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index daf305daa751..3edbfa0233e6 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -59,7 +59,7 @@ #define NFSDDBG_FACILITY NFSDDBG_PROC -#define all_ones {{~0,~0},~0} +#define all_ones {{ ~0, ~0}, ~0} static const stateid_t one_stateid = { .si_generation = ~0, .si_opaque = all_ones, @@ -297,7 +297,7 @@ find_or_allocate_block(struct nfs4_lockowner *lo, struct knfsd_fh *fh, nbl = find_blocked_lock(lo, fh, nn); if (!nbl) { - nbl= kmalloc(sizeof(*nbl), GFP_KERNEL); + nbl = kmalloc(sizeof(*nbl), GFP_KERNEL); if (nbl) { INIT_LIST_HEAD(&nbl->nbl_list); INIT_LIST_HEAD(&nbl->nbl_lru); @@ -649,6 +649,18 @@ find_readable_file(struct nfs4_file *f) return ret; } +static struct nfsd_file * +find_rw_file(struct nfs4_file *f) +{ + struct nfsd_file *ret; + + spin_lock(&f->fi_lock); + ret = nfsd_file_get(f->fi_fds[O_RDWR]); + spin_unlock(&f->fi_lock); + + return ret; +} + struct nfsd_file * find_any_file(struct nfs4_file *f) { @@ -1144,9 +1156,10 @@ static void block_delegations(struct knfsd_fh *fh) static struct nfs4_delegation * alloc_init_deleg(struct nfs4_client *clp, struct nfs4_file *fp, - struct nfs4_clnt_odstate *odstate) + struct nfs4_clnt_odstate *odstate, u32 dl_type) { struct nfs4_delegation *dp; + struct nfs4_stid *stid; long n; dprintk("NFSD alloc_init_deleg\n"); @@ -1155,9 +1168,10 @@ alloc_init_deleg(struct nfs4_client *clp, struct nfs4_file *fp, goto out_dec; if (delegation_blocked(&fp->fi_fhandle)) goto out_dec; - dp = delegstateid(nfs4_alloc_stid(clp, deleg_slab, nfs4_free_deleg)); - if (dp == NULL) + stid = nfs4_alloc_stid(clp, deleg_slab, nfs4_free_deleg); + if (stid == NULL) goto out_dec; + dp = delegstateid(stid); /* * delegation seqid's are never incremented. The 4.1 special @@ -1170,7 +1184,7 @@ alloc_init_deleg(struct nfs4_client *clp, struct nfs4_file *fp, INIT_LIST_HEAD(&dp->dl_recall_lru); dp->dl_clnt_odstate = odstate; get_clnt_odstate(odstate); - dp->dl_type = NFS4_OPEN_DELEGATE_READ; + dp->dl_type = dl_type; dp->dl_retries = 1; dp->dl_recalled = false; nfsd4_init_cb(&dp->dl_recall, dp->dl_stid.sc_client, @@ -2785,7 +2799,7 @@ static int client_opens_release(struct inode *inode, struct file *file) /* XXX: alternatively, we could get/drop in seq start/stop */ drop_client(clp); - return 0; + return seq_release(inode, file); } static const struct file_operations client_states_fops = { @@ -4388,8 +4402,7 @@ static unsigned long nfsd4_state_shrinker_count(struct shrinker *shrink, struct shrink_control *sc) { int count; - struct nfsd_net *nn = container_of(shrink, - struct nfsd_net, nfsd_client_shrinker); + struct nfsd_net *nn = shrink->private_data; count = atomic_read(&nn->nfsd_courtesy_clients); if (!count) @@ -5449,8 +5462,9 @@ nfs4_set_delegation(struct nfsd4_open *open, struct nfs4_ol_stateid *stp, struct nfs4_file *fp = stp->st_stid.sc_file; struct nfs4_clnt_odstate *odstate = stp->st_clnt_odstate; struct nfs4_delegation *dp; - struct nfsd_file *nf; + struct nfsd_file *nf = NULL; struct file_lock *fl; + u32 dl_type; /* * The fi_had_conflict and nfs_get_existing_delegation checks @@ -5460,15 +5474,35 @@ nfs4_set_delegation(struct nfsd4_open *open, struct nfs4_ol_stateid *stp, if (fp->fi_had_conflict) return ERR_PTR(-EAGAIN); - nf = find_readable_file(fp); - if (!nf) { - /* - * We probably could attempt another open and get a read - * delegation, but for now, don't bother until the - * client actually sends us one. - */ - return ERR_PTR(-EAGAIN); + /* + * Try for a write delegation first. RFC8881 section 10.4 says: + * + * "An OPEN_DELEGATE_WRITE delegation allows the client to handle, + * on its own, all opens." + * + * Furthermore the client can use a write delegation for most READ + * operations as well, so we require a O_RDWR file here. + * + * Offer a write delegation in the case of a BOTH open, and ensure + * we get the O_RDWR descriptor. + */ + if ((open->op_share_access & NFS4_SHARE_ACCESS_BOTH) == NFS4_SHARE_ACCESS_BOTH) { + nf = find_rw_file(fp); + dl_type = NFS4_OPEN_DELEGATE_WRITE; + } + + /* + * If the file is being opened O_RDONLY or we couldn't get a O_RDWR + * file for some reason, then try for a read delegation instead. + */ + if (!nf && (open->op_share_access & NFS4_SHARE_ACCESS_READ)) { + nf = find_readable_file(fp); + dl_type = NFS4_OPEN_DELEGATE_READ; } + + if (!nf) + return ERR_PTR(-EAGAIN); + spin_lock(&state_lock); spin_lock(&fp->fi_lock); if (nfs4_delegation_exists(clp, fp)) @@ -5491,11 +5525,11 @@ nfs4_set_delegation(struct nfsd4_open *open, struct nfs4_ol_stateid *stp, return ERR_PTR(status); status = -ENOMEM; - dp = alloc_init_deleg(clp, fp, odstate); + dp = alloc_init_deleg(clp, fp, odstate, dl_type); if (!dp) goto out_delegees; - fl = nfs4_alloc_init_lease(dp, NFS4_OPEN_DELEGATE_READ); + fl = nfs4_alloc_init_lease(dp, dl_type); if (!fl) goto out_clnt_odstate; @@ -5568,10 +5602,28 @@ static void nfsd4_open_deleg_none_ext(struct nfsd4_open *open, int status) } /* - * Attempt to hand out a delegation. + * The Linux NFS server does not offer write delegations to NFSv4.0 + * clients in order to avoid conflicts between write delegations and + * GETATTRs requesting CHANGE or SIZE attributes. + * + * With NFSv4.1 and later minorversions, the SEQUENCE operation that + * begins each COMPOUND contains a client ID. Delegation recall can + * be avoided when the server recognizes the client sending a + * GETATTR also holds write delegation it conflicts with. + * + * However, the NFSv4.0 protocol does not enable a server to + * determine that a GETATTR originated from the client holding the + * conflicting delegation versus coming from some other client. Per + * RFC 7530 Section 16.7.5, the server must recall or send a + * CB_GETATTR even when the GETATTR originates from the client that + * holds the conflicting delegation. * - * Note we don't support write delegations, and won't until the vfs has - * proper support for them. + * An NFSv4.0 client can trigger a pathological situation if it + * always sends a DELEGRETURN preceded by a conflicting GETATTR in + * the same COMPOUND. COMPOUND execution will always stop at the + * GETATTR and the DELEGRETURN will never get executed. The server + * eventually revokes the delegation, which can result in loss of + * open or lock state. */ static void nfs4_open_delegation(struct nfsd4_open *open, struct nfs4_ol_stateid *stp, @@ -5585,13 +5637,11 @@ nfs4_open_delegation(struct nfsd4_open *open, struct nfs4_ol_stateid *stp, int status = 0; cb_up = nfsd4_cb_channel_good(oo->oo_owner.so_client); - open->op_recall = 0; + open->op_recall = false; switch (open->op_claim_type) { case NFS4_OPEN_CLAIM_PREVIOUS: if (!cb_up) - open->op_recall = 1; - if (open->op_delegate_type != NFS4_OPEN_DELEGATE_READ) - goto out_no_deleg; + open->op_recall = true; break; case NFS4_OPEN_CLAIM_NULL: parent = currentfh; @@ -5606,6 +5656,9 @@ nfs4_open_delegation(struct nfsd4_open *open, struct nfs4_ol_stateid *stp, goto out_no_deleg; if (!cb_up || !(oo->oo_flags & NFS4_OO_CONFIRMED)) goto out_no_deleg; + if (open->op_share_access & NFS4_SHARE_ACCESS_WRITE && + !clp->cl_minorversion) + goto out_no_deleg; break; default: goto out_no_deleg; @@ -5616,8 +5669,13 @@ nfs4_open_delegation(struct nfsd4_open *open, struct nfs4_ol_stateid *stp, memcpy(&open->op_delegate_stateid, &dp->dl_stid.sc_stateid, sizeof(dp->dl_stid.sc_stateid)); - trace_nfsd_deleg_read(&dp->dl_stid.sc_stateid); - open->op_delegate_type = NFS4_OPEN_DELEGATE_READ; + if (open->op_share_access & NFS4_SHARE_ACCESS_WRITE) { + open->op_delegate_type = NFS4_OPEN_DELEGATE_WRITE; + trace_nfsd_deleg_write(&dp->dl_stid.sc_stateid); + } else { + open->op_delegate_type = NFS4_OPEN_DELEGATE_READ; + trace_nfsd_deleg_read(&dp->dl_stid.sc_stateid); + } nfs4_put_stid(&dp->dl_stid); return; out_no_deleg: @@ -5625,7 +5683,7 @@ out_no_deleg: if (open->op_claim_type == NFS4_OPEN_CLAIM_PREVIOUS && open->op_delegate_type != NFS4_OPEN_DELEGATE_NONE) { dprintk("NFSD: WARNING: refusing delegation reclaim\n"); - open->op_recall = 1; + open->op_recall = true; } /* 4.1 client asking for a delegation? */ @@ -7430,6 +7488,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_blocked_lock *nbl = NULL; struct file_lock *file_lock = NULL; struct file_lock *conflock = NULL; + struct super_block *sb; __be32 status = 0; int lkflg; int err; @@ -7451,6 +7510,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, dprintk("NFSD: nfsd4_lock: permission denied!\n"); return status; } + sb = cstate->current_fh.fh_dentry->d_sb; if (lock->lk_is_new) { if (nfsd4_has_session(cstate)) @@ -7502,7 +7562,8 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, fp = lock_stp->st_stid.sc_file; switch (lock->lk_type) { case NFS4_READW_LT: - if (nfsd4_has_session(cstate)) + if (nfsd4_has_session(cstate) || + exportfs_lock_op_is_async(sb->s_export_op)) fl_flags |= FL_SLEEP; fallthrough; case NFS4_READ_LT: @@ -7514,7 +7575,8 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, fl_type = F_RDLCK; break; case NFS4_WRITEW_LT: - if (nfsd4_has_session(cstate)) + if (nfsd4_has_session(cstate) || + exportfs_lock_op_is_async(sb->s_export_op)) fl_flags |= FL_SLEEP; fallthrough; case NFS4_WRITE_LT: @@ -7542,7 +7604,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, * for file locks), so don't attempt blocking lock notifications * on those filesystems: */ - if (nf->nf_file->f_op->lock) + if (!exportfs_lock_op_is_async(sb->s_export_op)) fl_flags &= ~FL_SLEEP; nbl = find_or_allocate_block(lock_sop, &fp->fi_fhandle, nn); @@ -7648,6 +7710,14 @@ out: return status; } +void nfsd4_lock_release(union nfsd4_op_u *u) +{ + struct nfsd4_lock *lock = &u->lock; + struct nfsd4_lock_denied *deny = &lock->lk_denied; + + kfree(deny->ld_owner.data); +} + /* * The NFSv4 spec allows a client to do a LOCKT without holding an OPEN, * so we do a temporary open here just to get an open file to pass to @@ -7753,6 +7823,14 @@ out: return status; } +void nfsd4_lockt_release(union nfsd4_op_u *u) +{ + struct nfsd4_lockt *lockt = &u->lockt; + struct nfsd4_lock_denied *deny = &lockt->lt_denied; + + kfree(deny->ld_owner.data); +} + __be32 nfsd4_locku(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, union nfsd4_op_u *u) @@ -8092,12 +8170,16 @@ static int nfs4_state_create_net(struct net *net) INIT_WORK(&nn->nfsd_shrinker_work, nfsd4_state_shrinker_worker); get_net(net); - nn->nfsd_client_shrinker.scan_objects = nfsd4_state_shrinker_scan; - nn->nfsd_client_shrinker.count_objects = nfsd4_state_shrinker_count; - nn->nfsd_client_shrinker.seeks = DEFAULT_SEEKS; - - if (register_shrinker(&nn->nfsd_client_shrinker, "nfsd-client")) + nn->nfsd_client_shrinker = shrinker_alloc(0, "nfsd-client"); + if (!nn->nfsd_client_shrinker) goto err_shrinker; + + nn->nfsd_client_shrinker->scan_objects = nfsd4_state_shrinker_scan; + nn->nfsd_client_shrinker->count_objects = nfsd4_state_shrinker_count; + nn->nfsd_client_shrinker->private_data = nn; + + shrinker_register(nn->nfsd_client_shrinker); + return 0; err_shrinker: @@ -8195,7 +8277,7 @@ nfs4_state_shutdown_net(struct net *net) struct list_head *pos, *next, reaplist; struct nfsd_net *nn = net_generic(net, nfsd_net_id); - unregister_shrinker(&nn->nfsd_client_shrinker); + shrinker_free(nn->nfsd_client_shrinker); cancel_work(&nn->nfsd_shrinker_work); cancel_delayed_work_sync(&nn->laundromat_work); locks_end_grace(&nn->nfsd4_manager); @@ -8341,3 +8423,68 @@ nfsd4_get_writestateid(struct nfsd4_compound_state *cstate, { get_stateid(cstate, &u->write.wr_stateid); } + +/** + * nfsd4_deleg_getattr_conflict - Recall if GETATTR causes conflict + * @rqstp: RPC transaction context + * @inode: file to be checked for a conflict + * + * This function is called when there is a conflict between a write + * delegation and a change/size GETATTR from another client. The server + * must either use the CB_GETATTR to get the current values of the + * attributes from the client that holds the delegation or recall the + * delegation before replying to the GETATTR. See RFC 8881 section + * 18.7.4. + * + * The current implementation does not support CB_GETATTR yet. However + * this can avoid recalling the delegation could be added in follow up + * work. + * + * Returns 0 if there is no conflict; otherwise an nfs_stat + * code is returned. + */ +__be32 +nfsd4_deleg_getattr_conflict(struct svc_rqst *rqstp, struct inode *inode) +{ + __be32 status; + struct file_lock_context *ctx; + struct file_lock *fl; + struct nfs4_delegation *dp; + + ctx = locks_inode_context(inode); + if (!ctx) + return 0; + spin_lock(&ctx->flc_lock); + list_for_each_entry(fl, &ctx->flc_lease, fl_list) { + if (fl->fl_flags == FL_LAYOUT) + continue; + if (fl->fl_lmops != &nfsd_lease_mng_ops) { + /* + * non-nfs lease, if it's a lease with F_RDLCK then + * we are done; there isn't any write delegation + * on this inode + */ + if (fl->fl_type == F_RDLCK) + break; + goto break_lease; + } + if (fl->fl_type == F_WRLCK) { + dp = fl->fl_owner; + if (dp->dl_recall.cb_clp == *(rqstp->rq_lease_breaker)) { + spin_unlock(&ctx->flc_lock); + return 0; + } +break_lease: + spin_unlock(&ctx->flc_lock); + nfsd_stats_wdeleg_getattr_inc(); + status = nfserrno(nfsd_open_break_lease(inode, NFSD_MAY_READ)); + if (status != nfserr_jukebox || + !nfsd_wait_for_delegreturn(rqstp, inode)) + return status; + return 0; + } + break; + } + spin_unlock(&ctx->flc_lock); + return 0; +} diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index b30dca7de8cc..b499fe9caa32 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -2530,66 +2530,62 @@ nfsd4_decode_compound(struct nfsd4_compoundargs *argp) return true; } -static __be32 *encode_change(__be32 *p, struct kstat *stat, struct inode *inode, - struct svc_export *exp) +static __be32 nfsd4_encode_nfs_fh4(struct xdr_stream *xdr, + struct knfsd_fh *fh_handle) { - if (exp->ex_flags & NFSEXP_V4ROOT) { - *p++ = cpu_to_be32(convert_to_wallclock(exp->cd->flush_time)); - *p++ = 0; - } else - p = xdr_encode_hyper(p, nfsd4_change_attribute(stat, inode)); - return p; + return nfsd4_encode_opaque(xdr, fh_handle->fh_raw, fh_handle->fh_size); } +/* This is a frequently-encoded type; open-coded for speed */ static __be32 nfsd4_encode_nfstime4(struct xdr_stream *xdr, - struct timespec64 *tv) + const struct timespec64 *tv) { __be32 *p; p = xdr_reserve_space(xdr, XDR_UNIT * 3); if (!p) return nfserr_resource; - - p = xdr_encode_hyper(p, (s64)tv->tv_sec); + p = xdr_encode_hyper(p, tv->tv_sec); *p = cpu_to_be32(tv->tv_nsec); return nfs_ok; } -/* - * ctime (in NFSv4, time_metadata) is not writeable, and the client - * doesn't really care what resolution could theoretically be stored by - * the filesystem. - * - * The client cares how close together changes can be while still - * guaranteeing ctime changes. For most filesystems (which have - * timestamps with nanosecond fields) that is limited by the resolution - * of the time returned from current_time() (which I'm assuming to be - * 1/HZ). - */ -static __be32 *encode_time_delta(__be32 *p, struct inode *inode) +static __be32 nfsd4_encode_specdata4(struct xdr_stream *xdr, + unsigned int major, unsigned int minor) { - struct timespec64 ts; - u32 ns; + __be32 status; - ns = max_t(u32, NSEC_PER_SEC/HZ, inode->i_sb->s_time_gran); - ts = ns_to_timespec64(ns); + status = nfsd4_encode_uint32_t(xdr, major); + if (status != nfs_ok) + return status; + return nfsd4_encode_uint32_t(xdr, minor); +} - p = xdr_encode_hyper(p, ts.tv_sec); - *p++ = cpu_to_be32(ts.tv_nsec); +static __be32 +nfsd4_encode_change_info4(struct xdr_stream *xdr, const struct nfsd4_change_info *c) +{ + __be32 status; - return p; + status = nfsd4_encode_bool(xdr, c->atomic); + if (status != nfs_ok) + return status; + status = nfsd4_encode_changeid4(xdr, c->before_change); + if (status != nfs_ok) + return status; + return nfsd4_encode_changeid4(xdr, c->after_change); } -static __be32 -nfsd4_encode_change_info4(struct xdr_stream *xdr, struct nfsd4_change_info *c) +static __be32 nfsd4_encode_netaddr4(struct xdr_stream *xdr, + const struct nfs42_netaddr *addr) { - if (xdr_stream_encode_bool(xdr, c->atomic) < 0) - return nfserr_resource; - if (xdr_stream_encode_u64(xdr, c->before_change) < 0) - return nfserr_resource; - if (xdr_stream_encode_u64(xdr, c->after_change) < 0) - return nfserr_resource; - return nfs_ok; + __be32 status; + + /* na_r_netid */ + status = nfsd4_encode_opaque(xdr, addr->netid, addr->netid_len); + if (status != nfs_ok) + return status; + /* na_r_addr */ + return nfsd4_encode_opaque(xdr, addr->addr, addr->addr_len); } /* Encode as an array of strings the string given with components @@ -2661,9 +2657,6 @@ static __be32 nfsd4_encode_components(struct xdr_stream *xdr, char sep, return nfsd4_encode_components_esc(xdr, sep, components, 0, 0); } -/* - * encode a location element of a fs_locations structure - */ static __be32 nfsd4_encode_fs_location4(struct xdr_stream *xdr, struct nfsd4_fs_location *location) { @@ -2676,15 +2669,12 @@ static __be32 nfsd4_encode_fs_location4(struct xdr_stream *xdr, status = nfsd4_encode_components(xdr, '/', location->path); if (status) return status; - return 0; + return nfs_ok; } -/* - * Encode a path in RFC3530 'pathname4' format - */ -static __be32 nfsd4_encode_path(struct xdr_stream *xdr, - const struct path *root, - const struct path *path) +static __be32 nfsd4_encode_pathname4(struct xdr_stream *xdr, + const struct path *root, + const struct path *path) { struct path cur = *path; __be32 *p; @@ -2752,89 +2742,59 @@ out_free: return err; } -static __be32 nfsd4_encode_fsloc_fsroot(struct xdr_stream *xdr, - struct svc_rqst *rqstp, const struct path *path) +static __be32 nfsd4_encode_fs_locations4(struct xdr_stream *xdr, + struct svc_rqst *rqstp, + struct svc_export *exp) { + struct nfsd4_fs_locations *fslocs = &exp->ex_fslocs; struct svc_export *exp_ps; - __be32 res; + unsigned int i; + __be32 status; + /* fs_root */ exp_ps = rqst_find_fsidzero_export(rqstp); if (IS_ERR(exp_ps)) return nfserrno(PTR_ERR(exp_ps)); - res = nfsd4_encode_path(xdr, &exp_ps->ex_path, path); + status = nfsd4_encode_pathname4(xdr, &exp_ps->ex_path, &exp->ex_path); exp_put(exp_ps); - return res; -} - -/* - * encode a fs_locations structure - */ -static __be32 nfsd4_encode_fs_locations(struct xdr_stream *xdr, - struct svc_rqst *rqstp, struct svc_export *exp) -{ - __be32 status; - int i; - __be32 *p; - struct nfsd4_fs_locations *fslocs = &exp->ex_fslocs; - - status = nfsd4_encode_fsloc_fsroot(xdr, rqstp, &exp->ex_path); - if (status) + if (status != nfs_ok) return status; - p = xdr_reserve_space(xdr, 4); - if (!p) + + /* locations<> */ + if (xdr_stream_encode_u32(xdr, fslocs->locations_count) != XDR_UNIT) return nfserr_resource; - *p++ = cpu_to_be32(fslocs->locations_count); - for (i=0; i<fslocs->locations_count; i++) { + for (i = 0; i < fslocs->locations_count; i++) { status = nfsd4_encode_fs_location4(xdr, &fslocs->locations[i]); - if (status) + if (status != nfs_ok) return status; } - return 0; -} -static u32 nfs4_file_type(umode_t mode) -{ - switch (mode & S_IFMT) { - case S_IFIFO: return NF4FIFO; - case S_IFCHR: return NF4CHR; - case S_IFDIR: return NF4DIR; - case S_IFBLK: return NF4BLK; - case S_IFLNK: return NF4LNK; - case S_IFREG: return NF4REG; - case S_IFSOCK: return NF4SOCK; - default: return NF4BAD; - } + return nfs_ok; } -static inline __be32 -nfsd4_encode_aclname(struct xdr_stream *xdr, struct svc_rqst *rqstp, - struct nfs4_ace *ace) +static __be32 nfsd4_encode_nfsace4(struct xdr_stream *xdr, struct svc_rqst *rqstp, + struct nfs4_ace *ace) { + __be32 status; + + /* type */ + status = nfsd4_encode_acetype4(xdr, ace->type); + if (status != nfs_ok) + return nfserr_resource; + /* flag */ + status = nfsd4_encode_aceflag4(xdr, ace->flag); + if (status != nfs_ok) + return nfserr_resource; + /* access mask */ + status = nfsd4_encode_acemask4(xdr, ace->access_mask & NFS4_ACE_MASK_ALL); + if (status != nfs_ok) + return nfserr_resource; + /* who */ if (ace->whotype != NFS4_ACL_WHO_NAMED) return nfs4_acl_write_who(xdr, ace->whotype); - else if (ace->flag & NFS4_ACE_IDENTIFIER_GROUP) + if (ace->flag & NFS4_ACE_IDENTIFIER_GROUP) return nfsd4_encode_group(xdr, rqstp, ace->who_gid); - else - return nfsd4_encode_user(xdr, rqstp, ace->who_uid); -} - -static inline __be32 -nfsd4_encode_layout_types(struct xdr_stream *xdr, u32 layout_types) -{ - __be32 *p; - unsigned long i = hweight_long(layout_types); - - p = xdr_reserve_space(xdr, 4 + 4 * i); - if (!p) - return nfserr_resource; - - *p++ = cpu_to_be32(i); - - for (i = LAYOUT_NFSV4_1_FILES; i < LAYOUT_TYPE_MAX; ++i) - if (layout_types & (1 << i)) - *p++ = cpu_to_be32(i); - - return 0; + return nfsd4_encode_user(xdr, rqstp, ace->who_uid); } #define WORD0_ABSENT_FS_ATTRS (FATTR4_WORD0_FS_LOCATIONS | FATTR4_WORD0_FSID | \ @@ -2906,12 +2866,12 @@ static int nfsd4_get_mounted_on_ino(struct svc_export *exp, u64 *pino) } static __be32 -nfsd4_encode_bitmap(struct xdr_stream *xdr, u32 bmval0, u32 bmval1, u32 bmval2) +nfsd4_encode_bitmap4(struct xdr_stream *xdr, u32 bmval0, u32 bmval1, u32 bmval2) { __be32 *p; if (bmval2) { - p = xdr_reserve_space(xdr, 16); + p = xdr_reserve_space(xdr, XDR_UNIT * 4); if (!p) goto out_resource; *p++ = cpu_to_be32(3); @@ -2919,89 +2879,684 @@ nfsd4_encode_bitmap(struct xdr_stream *xdr, u32 bmval0, u32 bmval1, u32 bmval2) *p++ = cpu_to_be32(bmval1); *p++ = cpu_to_be32(bmval2); } else if (bmval1) { - p = xdr_reserve_space(xdr, 12); + p = xdr_reserve_space(xdr, XDR_UNIT * 3); if (!p) goto out_resource; *p++ = cpu_to_be32(2); *p++ = cpu_to_be32(bmval0); *p++ = cpu_to_be32(bmval1); } else { - p = xdr_reserve_space(xdr, 8); + p = xdr_reserve_space(xdr, XDR_UNIT * 2); if (!p) goto out_resource; *p++ = cpu_to_be32(1); *p++ = cpu_to_be32(bmval0); } - return 0; + return nfs_ok; out_resource: return nfserr_resource; } +struct nfsd4_fattr_args { + struct svc_rqst *rqstp; + struct svc_fh *fhp; + struct svc_export *exp; + struct dentry *dentry; + struct kstat stat; + struct kstatfs statfs; + struct nfs4_acl *acl; + u64 size; +#ifdef CONFIG_NFSD_V4_SECURITY_LABEL + void *context; + int contextlen; +#endif + u32 rdattr_err; + bool contextsupport; + bool ignore_crossmnt; +}; + +typedef __be32(*nfsd4_enc_attr)(struct xdr_stream *xdr, + const struct nfsd4_fattr_args *args); + +static __be32 nfsd4_encode_fattr4__noop(struct xdr_stream *xdr, + const struct nfsd4_fattr_args *args) +{ + return nfs_ok; +} + +static __be32 nfsd4_encode_fattr4__true(struct xdr_stream *xdr, + const struct nfsd4_fattr_args *args) +{ + return nfsd4_encode_bool(xdr, true); +} + +static __be32 nfsd4_encode_fattr4__false(struct xdr_stream *xdr, + const struct nfsd4_fattr_args *args) +{ + return nfsd4_encode_bool(xdr, false); +} + +static __be32 nfsd4_encode_fattr4_supported_attrs(struct xdr_stream *xdr, + const struct nfsd4_fattr_args *args) +{ + struct nfsd4_compoundres *resp = args->rqstp->rq_resp; + u32 minorversion = resp->cstate.minorversion; + u32 supp[3]; + + memcpy(supp, nfsd_suppattrs[minorversion], sizeof(supp)); + if (!IS_POSIXACL(d_inode(args->dentry))) + supp[0] &= ~FATTR4_WORD0_ACL; + if (!args->contextsupport) + supp[2] &= ~FATTR4_WORD2_SECURITY_LABEL; + + return nfsd4_encode_bitmap4(xdr, supp[0], supp[1], supp[2]); +} + +static __be32 nfsd4_encode_fattr4_type(struct xdr_stream *xdr, + const struct nfsd4_fattr_args *args) +{ + __be32 *p; + + p = xdr_reserve_space(xdr, XDR_UNIT); + if (!p) + return nfserr_resource; + + switch (args->stat.mode & S_IFMT) { + case S_IFIFO: + *p = cpu_to_be32(NF4FIFO); + break; + case S_IFCHR: + *p = cpu_to_be32(NF4CHR); + break; + case S_IFDIR: + *p = cpu_to_be32(NF4DIR); + break; + case S_IFBLK: + *p = cpu_to_be32(NF4BLK); + break; + case S_IFLNK: + *p = cpu_to_be32(NF4LNK); + break; + case S_IFREG: + *p = cpu_to_be32(NF4REG); + break; + case S_IFSOCK: + *p = cpu_to_be32(NF4SOCK); + break; + default: + return nfserr_serverfault; + } + + return nfs_ok; +} + +static __be32 nfsd4_encode_fattr4_fh_expire_type(struct xdr_stream *xdr, + const struct nfsd4_fattr_args *args) +{ + u32 mask; + + mask = NFS4_FH_PERSISTENT; + if (!(args->exp->ex_flags & NFSEXP_NOSUBTREECHECK)) + mask |= NFS4_FH_VOL_RENAME; + return nfsd4_encode_uint32_t(xdr, mask); +} + +static __be32 nfsd4_encode_fattr4_change(struct xdr_stream *xdr, + const struct nfsd4_fattr_args *args) +{ + const struct svc_export *exp = args->exp; + u64 c; + + if (unlikely(exp->ex_flags & NFSEXP_V4ROOT)) { + u32 flush_time = convert_to_wallclock(exp->cd->flush_time); + + if (xdr_stream_encode_u32(xdr, flush_time) != XDR_UNIT) + return nfserr_resource; + if (xdr_stream_encode_u32(xdr, 0) != XDR_UNIT) + return nfserr_resource; + return nfs_ok; + } + + c = nfsd4_change_attribute(&args->stat, d_inode(args->dentry)); + return nfsd4_encode_changeid4(xdr, c); +} + +static __be32 nfsd4_encode_fattr4_size(struct xdr_stream *xdr, + const struct nfsd4_fattr_args *args) +{ + return nfsd4_encode_uint64_t(xdr, args->size); +} + +static __be32 nfsd4_encode_fattr4_fsid(struct xdr_stream *xdr, + const struct nfsd4_fattr_args *args) +{ + __be32 *p; + + p = xdr_reserve_space(xdr, XDR_UNIT * 2 + XDR_UNIT * 2); + if (!p) + return nfserr_resource; + + if (unlikely(args->exp->ex_fslocs.migrated)) { + p = xdr_encode_hyper(p, NFS4_REFERRAL_FSID_MAJOR); + xdr_encode_hyper(p, NFS4_REFERRAL_FSID_MINOR); + return nfs_ok; + } + switch (fsid_source(args->fhp)) { + case FSIDSOURCE_FSID: + p = xdr_encode_hyper(p, (u64)args->exp->ex_fsid); + xdr_encode_hyper(p, (u64)0); + break; + case FSIDSOURCE_DEV: + *p++ = xdr_zero; + *p++ = cpu_to_be32(MAJOR(args->stat.dev)); + *p++ = xdr_zero; + *p = cpu_to_be32(MINOR(args->stat.dev)); + break; + case FSIDSOURCE_UUID: + xdr_encode_opaque_fixed(p, args->exp->ex_uuid, EX_UUID_LEN); + break; + } + + return nfs_ok; +} + +static __be32 nfsd4_encode_fattr4_lease_time(struct xdr_stream *xdr, + const struct nfsd4_fattr_args *args) +{ + struct nfsd_net *nn = net_generic(SVC_NET(args->rqstp), nfsd_net_id); + + return nfsd4_encode_nfs_lease4(xdr, nn->nfsd4_lease); +} + +static __be32 nfsd4_encode_fattr4_rdattr_error(struct xdr_stream *xdr, + const struct nfsd4_fattr_args *args) +{ + return nfsd4_encode_uint32_t(xdr, args->rdattr_err); +} + +static __be32 nfsd4_encode_fattr4_aclsupport(struct xdr_stream *xdr, + const struct nfsd4_fattr_args *args) +{ + u32 mask; + + mask = 0; + if (IS_POSIXACL(d_inode(args->dentry))) + mask = ACL4_SUPPORT_ALLOW_ACL | ACL4_SUPPORT_DENY_ACL; + return nfsd4_encode_uint32_t(xdr, mask); +} + +static __be32 nfsd4_encode_fattr4_acl(struct xdr_stream *xdr, + const struct nfsd4_fattr_args *args) +{ + struct nfs4_acl *acl = args->acl; + struct nfs4_ace *ace; + __be32 status; + + /* nfsace4<> */ + if (!acl) { + if (xdr_stream_encode_u32(xdr, 0) != XDR_UNIT) + return nfserr_resource; + } else { + if (xdr_stream_encode_u32(xdr, acl->naces) != XDR_UNIT) + return nfserr_resource; + for (ace = acl->aces; ace < acl->aces + acl->naces; ace++) { + status = nfsd4_encode_nfsace4(xdr, args->rqstp, ace); + if (status != nfs_ok) + return status; + } + } + return nfs_ok; +} + +static __be32 nfsd4_encode_fattr4_filehandle(struct xdr_stream *xdr, + const struct nfsd4_fattr_args *args) +{ + return nfsd4_encode_nfs_fh4(xdr, &args->fhp->fh_handle); +} + +static __be32 nfsd4_encode_fattr4_fileid(struct xdr_stream *xdr, + const struct nfsd4_fattr_args *args) +{ + return nfsd4_encode_uint64_t(xdr, args->stat.ino); +} + +static __be32 nfsd4_encode_fattr4_files_avail(struct xdr_stream *xdr, + const struct nfsd4_fattr_args *args) +{ + return nfsd4_encode_uint64_t(xdr, args->statfs.f_ffree); +} + +static __be32 nfsd4_encode_fattr4_files_free(struct xdr_stream *xdr, + const struct nfsd4_fattr_args *args) +{ + return nfsd4_encode_uint64_t(xdr, args->statfs.f_ffree); +} + +static __be32 nfsd4_encode_fattr4_files_total(struct xdr_stream *xdr, + const struct nfsd4_fattr_args *args) +{ + return nfsd4_encode_uint64_t(xdr, args->statfs.f_files); +} + +static __be32 nfsd4_encode_fattr4_fs_locations(struct xdr_stream *xdr, + const struct nfsd4_fattr_args *args) +{ + return nfsd4_encode_fs_locations4(xdr, args->rqstp, args->exp); +} + +static __be32 nfsd4_encode_fattr4_maxfilesize(struct xdr_stream *xdr, + const struct nfsd4_fattr_args *args) +{ + struct super_block *sb = args->exp->ex_path.mnt->mnt_sb; + + return nfsd4_encode_uint64_t(xdr, sb->s_maxbytes); +} + +static __be32 nfsd4_encode_fattr4_maxlink(struct xdr_stream *xdr, + const struct nfsd4_fattr_args *args) +{ + return nfsd4_encode_uint32_t(xdr, 255); +} + +static __be32 nfsd4_encode_fattr4_maxname(struct xdr_stream *xdr, + const struct nfsd4_fattr_args *args) +{ + return nfsd4_encode_uint32_t(xdr, args->statfs.f_namelen); +} + +static __be32 nfsd4_encode_fattr4_maxread(struct xdr_stream *xdr, + const struct nfsd4_fattr_args *args) +{ + return nfsd4_encode_uint64_t(xdr, svc_max_payload(args->rqstp)); +} + +static __be32 nfsd4_encode_fattr4_maxwrite(struct xdr_stream *xdr, + const struct nfsd4_fattr_args *args) +{ + return nfsd4_encode_uint64_t(xdr, svc_max_payload(args->rqstp)); +} + +static __be32 nfsd4_encode_fattr4_mode(struct xdr_stream *xdr, + const struct nfsd4_fattr_args *args) +{ + return nfsd4_encode_mode4(xdr, args->stat.mode & S_IALLUGO); +} + +static __be32 nfsd4_encode_fattr4_numlinks(struct xdr_stream *xdr, + const struct nfsd4_fattr_args *args) +{ + return nfsd4_encode_uint32_t(xdr, args->stat.nlink); +} + +static __be32 nfsd4_encode_fattr4_owner(struct xdr_stream *xdr, + const struct nfsd4_fattr_args *args) +{ + return nfsd4_encode_user(xdr, args->rqstp, args->stat.uid); +} + +static __be32 nfsd4_encode_fattr4_owner_group(struct xdr_stream *xdr, + const struct nfsd4_fattr_args *args) +{ + return nfsd4_encode_group(xdr, args->rqstp, args->stat.gid); +} + +static __be32 nfsd4_encode_fattr4_rawdev(struct xdr_stream *xdr, + const struct nfsd4_fattr_args *args) +{ + return nfsd4_encode_specdata4(xdr, MAJOR(args->stat.rdev), + MINOR(args->stat.rdev)); +} + +static __be32 nfsd4_encode_fattr4_space_avail(struct xdr_stream *xdr, + const struct nfsd4_fattr_args *args) +{ + u64 avail = (u64)args->statfs.f_bavail * (u64)args->statfs.f_bsize; + + return nfsd4_encode_uint64_t(xdr, avail); +} + +static __be32 nfsd4_encode_fattr4_space_free(struct xdr_stream *xdr, + const struct nfsd4_fattr_args *args) +{ + u64 free = (u64)args->statfs.f_bfree * (u64)args->statfs.f_bsize; + + return nfsd4_encode_uint64_t(xdr, free); +} + +static __be32 nfsd4_encode_fattr4_space_total(struct xdr_stream *xdr, + const struct nfsd4_fattr_args *args) +{ + u64 total = (u64)args->statfs.f_blocks * (u64)args->statfs.f_bsize; + + return nfsd4_encode_uint64_t(xdr, total); +} + +static __be32 nfsd4_encode_fattr4_space_used(struct xdr_stream *xdr, + const struct nfsd4_fattr_args *args) +{ + return nfsd4_encode_uint64_t(xdr, (u64)args->stat.blocks << 9); +} + +static __be32 nfsd4_encode_fattr4_time_access(struct xdr_stream *xdr, + const struct nfsd4_fattr_args *args) +{ + return nfsd4_encode_nfstime4(xdr, &args->stat.atime); +} + +static __be32 nfsd4_encode_fattr4_time_create(struct xdr_stream *xdr, + const struct nfsd4_fattr_args *args) +{ + return nfsd4_encode_nfstime4(xdr, &args->stat.btime); +} + +/* + * ctime (in NFSv4, time_metadata) is not writeable, and the client + * doesn't really care what resolution could theoretically be stored by + * the filesystem. + * + * The client cares how close together changes can be while still + * guaranteeing ctime changes. For most filesystems (which have + * timestamps with nanosecond fields) that is limited by the resolution + * of the time returned from current_time() (which I'm assuming to be + * 1/HZ). + */ +static __be32 nfsd4_encode_fattr4_time_delta(struct xdr_stream *xdr, + const struct nfsd4_fattr_args *args) +{ + const struct inode *inode = d_inode(args->dentry); + u32 ns = max_t(u32, NSEC_PER_SEC/HZ, inode->i_sb->s_time_gran); + struct timespec64 ts = ns_to_timespec64(ns); + + return nfsd4_encode_nfstime4(xdr, &ts); +} + +static __be32 nfsd4_encode_fattr4_time_metadata(struct xdr_stream *xdr, + const struct nfsd4_fattr_args *args) +{ + return nfsd4_encode_nfstime4(xdr, &args->stat.ctime); +} + +static __be32 nfsd4_encode_fattr4_time_modify(struct xdr_stream *xdr, + const struct nfsd4_fattr_args *args) +{ + return nfsd4_encode_nfstime4(xdr, &args->stat.mtime); +} + +static __be32 nfsd4_encode_fattr4_mounted_on_fileid(struct xdr_stream *xdr, + const struct nfsd4_fattr_args *args) +{ + u64 ino; + int err; + + if (!args->ignore_crossmnt && + args->dentry == args->exp->ex_path.mnt->mnt_root) { + err = nfsd4_get_mounted_on_ino(args->exp, &ino); + if (err) + return nfserrno(err); + } else + ino = args->stat.ino; + + return nfsd4_encode_uint64_t(xdr, ino); +} + +#ifdef CONFIG_NFSD_PNFS + +static __be32 nfsd4_encode_fattr4_fs_layout_types(struct xdr_stream *xdr, + const struct nfsd4_fattr_args *args) +{ + unsigned long mask = args->exp->ex_layout_types; + int i; + + /* Hamming weight of @mask is the number of layout types to return */ + if (xdr_stream_encode_u32(xdr, hweight_long(mask)) != XDR_UNIT) + return nfserr_resource; + for (i = LAYOUT_NFSV4_1_FILES; i < LAYOUT_TYPE_MAX; ++i) + if (mask & BIT(i)) { + /* layouttype4 */ + if (xdr_stream_encode_u32(xdr, i) != XDR_UNIT) + return nfserr_resource; + } + return nfs_ok; +} + +static __be32 nfsd4_encode_fattr4_layout_types(struct xdr_stream *xdr, + const struct nfsd4_fattr_args *args) +{ + unsigned long mask = args->exp->ex_layout_types; + int i; + + /* Hamming weight of @mask is the number of layout types to return */ + if (xdr_stream_encode_u32(xdr, hweight_long(mask)) != XDR_UNIT) + return nfserr_resource; + for (i = LAYOUT_NFSV4_1_FILES; i < LAYOUT_TYPE_MAX; ++i) + if (mask & BIT(i)) { + /* layouttype4 */ + if (xdr_stream_encode_u32(xdr, i) != XDR_UNIT) + return nfserr_resource; + } + return nfs_ok; +} + +static __be32 nfsd4_encode_fattr4_layout_blksize(struct xdr_stream *xdr, + const struct nfsd4_fattr_args *args) +{ + return nfsd4_encode_uint32_t(xdr, args->stat.blksize); +} + +#endif + +static __be32 nfsd4_encode_fattr4_suppattr_exclcreat(struct xdr_stream *xdr, + const struct nfsd4_fattr_args *args) +{ + struct nfsd4_compoundres *resp = args->rqstp->rq_resp; + u32 supp[3]; + + memcpy(supp, nfsd_suppattrs[resp->cstate.minorversion], sizeof(supp)); + supp[0] &= NFSD_SUPPATTR_EXCLCREAT_WORD0; + supp[1] &= NFSD_SUPPATTR_EXCLCREAT_WORD1; + supp[2] &= NFSD_SUPPATTR_EXCLCREAT_WORD2; + + return nfsd4_encode_bitmap4(xdr, supp[0], supp[1], supp[2]); +} + +#ifdef CONFIG_NFSD_V4_SECURITY_LABEL +static __be32 nfsd4_encode_fattr4_sec_label(struct xdr_stream *xdr, + const struct nfsd4_fattr_args *args) +{ + return nfsd4_encode_security_label(xdr, args->rqstp, + args->context, args->contextlen); +} +#endif + +static __be32 nfsd4_encode_fattr4_xattr_support(struct xdr_stream *xdr, + const struct nfsd4_fattr_args *args) +{ + int err = xattr_supports_user_prefix(d_inode(args->dentry)); + + return nfsd4_encode_bool(xdr, err == 0); +} + +static const nfsd4_enc_attr nfsd4_enc_fattr4_encode_ops[] = { + [FATTR4_SUPPORTED_ATTRS] = nfsd4_encode_fattr4_supported_attrs, + [FATTR4_TYPE] = nfsd4_encode_fattr4_type, + [FATTR4_FH_EXPIRE_TYPE] = nfsd4_encode_fattr4_fh_expire_type, + [FATTR4_CHANGE] = nfsd4_encode_fattr4_change, + [FATTR4_SIZE] = nfsd4_encode_fattr4_size, + [FATTR4_LINK_SUPPORT] = nfsd4_encode_fattr4__true, + [FATTR4_SYMLINK_SUPPORT] = nfsd4_encode_fattr4__true, + [FATTR4_NAMED_ATTR] = nfsd4_encode_fattr4__false, + [FATTR4_FSID] = nfsd4_encode_fattr4_fsid, + [FATTR4_UNIQUE_HANDLES] = nfsd4_encode_fattr4__true, + [FATTR4_LEASE_TIME] = nfsd4_encode_fattr4_lease_time, + [FATTR4_RDATTR_ERROR] = nfsd4_encode_fattr4_rdattr_error, + [FATTR4_ACL] = nfsd4_encode_fattr4_acl, + [FATTR4_ACLSUPPORT] = nfsd4_encode_fattr4_aclsupport, + [FATTR4_ARCHIVE] = nfsd4_encode_fattr4__noop, + [FATTR4_CANSETTIME] = nfsd4_encode_fattr4__true, + [FATTR4_CASE_INSENSITIVE] = nfsd4_encode_fattr4__false, + [FATTR4_CASE_PRESERVING] = nfsd4_encode_fattr4__true, + [FATTR4_CHOWN_RESTRICTED] = nfsd4_encode_fattr4__true, + [FATTR4_FILEHANDLE] = nfsd4_encode_fattr4_filehandle, + [FATTR4_FILEID] = nfsd4_encode_fattr4_fileid, + [FATTR4_FILES_AVAIL] = nfsd4_encode_fattr4_files_avail, + [FATTR4_FILES_FREE] = nfsd4_encode_fattr4_files_free, + [FATTR4_FILES_TOTAL] = nfsd4_encode_fattr4_files_total, + [FATTR4_FS_LOCATIONS] = nfsd4_encode_fattr4_fs_locations, + [FATTR4_HIDDEN] = nfsd4_encode_fattr4__noop, + [FATTR4_HOMOGENEOUS] = nfsd4_encode_fattr4__true, + [FATTR4_MAXFILESIZE] = nfsd4_encode_fattr4_maxfilesize, + [FATTR4_MAXLINK] = nfsd4_encode_fattr4_maxlink, + [FATTR4_MAXNAME] = nfsd4_encode_fattr4_maxname, + [FATTR4_MAXREAD] = nfsd4_encode_fattr4_maxread, + [FATTR4_MAXWRITE] = nfsd4_encode_fattr4_maxwrite, + [FATTR4_MIMETYPE] = nfsd4_encode_fattr4__noop, + [FATTR4_MODE] = nfsd4_encode_fattr4_mode, + [FATTR4_NO_TRUNC] = nfsd4_encode_fattr4__true, + [FATTR4_NUMLINKS] = nfsd4_encode_fattr4_numlinks, + [FATTR4_OWNER] = nfsd4_encode_fattr4_owner, + [FATTR4_OWNER_GROUP] = nfsd4_encode_fattr4_owner_group, + [FATTR4_QUOTA_AVAIL_HARD] = nfsd4_encode_fattr4__noop, + [FATTR4_QUOTA_AVAIL_SOFT] = nfsd4_encode_fattr4__noop, + [FATTR4_QUOTA_USED] = nfsd4_encode_fattr4__noop, + [FATTR4_RAWDEV] = nfsd4_encode_fattr4_rawdev, + [FATTR4_SPACE_AVAIL] = nfsd4_encode_fattr4_space_avail, + [FATTR4_SPACE_FREE] = nfsd4_encode_fattr4_space_free, + [FATTR4_SPACE_TOTAL] = nfsd4_encode_fattr4_space_total, + [FATTR4_SPACE_USED] = nfsd4_encode_fattr4_space_used, + [FATTR4_SYSTEM] = nfsd4_encode_fattr4__noop, + [FATTR4_TIME_ACCESS] = nfsd4_encode_fattr4_time_access, + [FATTR4_TIME_ACCESS_SET] = nfsd4_encode_fattr4__noop, + [FATTR4_TIME_BACKUP] = nfsd4_encode_fattr4__noop, + [FATTR4_TIME_CREATE] = nfsd4_encode_fattr4_time_create, + [FATTR4_TIME_DELTA] = nfsd4_encode_fattr4_time_delta, + [FATTR4_TIME_METADATA] = nfsd4_encode_fattr4_time_metadata, + [FATTR4_TIME_MODIFY] = nfsd4_encode_fattr4_time_modify, + [FATTR4_TIME_MODIFY_SET] = nfsd4_encode_fattr4__noop, + [FATTR4_MOUNTED_ON_FILEID] = nfsd4_encode_fattr4_mounted_on_fileid, + [FATTR4_DIR_NOTIF_DELAY] = nfsd4_encode_fattr4__noop, + [FATTR4_DIRENT_NOTIF_DELAY] = nfsd4_encode_fattr4__noop, + [FATTR4_DACL] = nfsd4_encode_fattr4__noop, + [FATTR4_SACL] = nfsd4_encode_fattr4__noop, + [FATTR4_CHANGE_POLICY] = nfsd4_encode_fattr4__noop, + [FATTR4_FS_STATUS] = nfsd4_encode_fattr4__noop, + +#ifdef CONFIG_NFSD_PNFS + [FATTR4_FS_LAYOUT_TYPES] = nfsd4_encode_fattr4_fs_layout_types, + [FATTR4_LAYOUT_HINT] = nfsd4_encode_fattr4__noop, + [FATTR4_LAYOUT_TYPES] = nfsd4_encode_fattr4_layout_types, + [FATTR4_LAYOUT_BLKSIZE] = nfsd4_encode_fattr4_layout_blksize, + [FATTR4_LAYOUT_ALIGNMENT] = nfsd4_encode_fattr4__noop, +#else + [FATTR4_FS_LAYOUT_TYPES] = nfsd4_encode_fattr4__noop, + [FATTR4_LAYOUT_HINT] = nfsd4_encode_fattr4__noop, + [FATTR4_LAYOUT_TYPES] = nfsd4_encode_fattr4__noop, + [FATTR4_LAYOUT_BLKSIZE] = nfsd4_encode_fattr4__noop, + [FATTR4_LAYOUT_ALIGNMENT] = nfsd4_encode_fattr4__noop, +#endif + + [FATTR4_FS_LOCATIONS_INFO] = nfsd4_encode_fattr4__noop, + [FATTR4_MDSTHRESHOLD] = nfsd4_encode_fattr4__noop, + [FATTR4_RETENTION_GET] = nfsd4_encode_fattr4__noop, + [FATTR4_RETENTION_SET] = nfsd4_encode_fattr4__noop, + [FATTR4_RETENTEVT_GET] = nfsd4_encode_fattr4__noop, + [FATTR4_RETENTEVT_SET] = nfsd4_encode_fattr4__noop, + [FATTR4_RETENTION_HOLD] = nfsd4_encode_fattr4__noop, + [FATTR4_MODE_SET_MASKED] = nfsd4_encode_fattr4__noop, + [FATTR4_SUPPATTR_EXCLCREAT] = nfsd4_encode_fattr4_suppattr_exclcreat, + [FATTR4_FS_CHARSET_CAP] = nfsd4_encode_fattr4__noop, + [FATTR4_CLONE_BLKSIZE] = nfsd4_encode_fattr4__noop, + [FATTR4_SPACE_FREED] = nfsd4_encode_fattr4__noop, + [FATTR4_CHANGE_ATTR_TYPE] = nfsd4_encode_fattr4__noop, + +#ifdef CONFIG_NFSD_V4_SECURITY_LABEL + [FATTR4_SEC_LABEL] = nfsd4_encode_fattr4_sec_label, +#else + [FATTR4_SEC_LABEL] = nfsd4_encode_fattr4__noop, +#endif + + [FATTR4_MODE_UMASK] = nfsd4_encode_fattr4__noop, + [FATTR4_XATTR_SUPPORT] = nfsd4_encode_fattr4_xattr_support, +}; + /* * Note: @fhp can be NULL; in this case, we might have to compose the filehandle * ourselves. */ static __be32 -nfsd4_encode_fattr(struct xdr_stream *xdr, struct svc_fh *fhp, - struct svc_export *exp, - struct dentry *dentry, u32 *bmval, - struct svc_rqst *rqstp, int ignore_crossmnt) +nfsd4_encode_fattr4(struct svc_rqst *rqstp, struct xdr_stream *xdr, + struct svc_fh *fhp, struct svc_export *exp, + struct dentry *dentry, const u32 *bmval, + int ignore_crossmnt) { - u32 bmval0 = bmval[0]; - u32 bmval1 = bmval[1]; - u32 bmval2 = bmval[2]; - struct kstat stat; + struct nfsd4_fattr_args args; struct svc_fh *tempfh = NULL; - struct kstatfs statfs; - __be32 *p, *attrlen_p; int starting_len = xdr->buf->len; + __be32 *attrlen_p, status; int attrlen_offset; - u32 dummy; - u64 dummy64; - u32 rdattr_err = 0; - __be32 status; int err; - struct nfs4_acl *acl = NULL; -#ifdef CONFIG_NFSD_V4_SECURITY_LABEL - void *context = NULL; - int contextlen; -#endif - bool contextsupport = false; struct nfsd4_compoundres *resp = rqstp->rq_resp; u32 minorversion = resp->cstate.minorversion; struct path path = { .mnt = exp->ex_path.mnt, .dentry = dentry, }; - struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id); + union { + u32 attrmask[3]; + unsigned long mask[2]; + } u; + unsigned long bit; + + WARN_ON_ONCE(bmval[1] & NFSD_WRITEONLY_ATTRS_WORD1); + WARN_ON_ONCE(!nfsd_attrs_supported(minorversion, bmval)); - BUG_ON(bmval1 & NFSD_WRITEONLY_ATTRS_WORD1); - BUG_ON(!nfsd_attrs_supported(minorversion, bmval)); + args.rqstp = rqstp; + args.exp = exp; + args.dentry = dentry; + args.ignore_crossmnt = (ignore_crossmnt != 0); + /* + * Make a local copy of the attribute bitmap that can be modified. + */ + memset(&u, 0, sizeof(u)); + u.attrmask[0] = bmval[0]; + u.attrmask[1] = bmval[1]; + u.attrmask[2] = bmval[2]; + + args.rdattr_err = 0; if (exp->ex_fslocs.migrated) { - status = fattr_handle_absent_fs(&bmval0, &bmval1, &bmval2, &rdattr_err); + status = fattr_handle_absent_fs(&u.attrmask[0], &u.attrmask[1], + &u.attrmask[2], &args.rdattr_err); + if (status) + goto out; + } + args.size = 0; + if (u.attrmask[0] & (FATTR4_WORD0_CHANGE | FATTR4_WORD0_SIZE)) { + status = nfsd4_deleg_getattr_conflict(rqstp, d_inode(dentry)); if (status) goto out; } - err = vfs_getattr(&path, &stat, + err = vfs_getattr(&path, &args.stat, STATX_BASIC_STATS | STATX_BTIME | STATX_CHANGE_COOKIE, AT_STATX_SYNC_AS_STAT); if (err) goto out_nfserr; - if (!(stat.result_mask & STATX_BTIME)) + args.size = args.stat.size; + + if (!(args.stat.result_mask & STATX_BTIME)) /* underlying FS does not offer btime so we can't share it */ - bmval1 &= ~FATTR4_WORD1_TIME_CREATE; - if ((bmval0 & (FATTR4_WORD0_FILES_AVAIL | FATTR4_WORD0_FILES_FREE | + u.attrmask[1] &= ~FATTR4_WORD1_TIME_CREATE; + if ((u.attrmask[0] & (FATTR4_WORD0_FILES_AVAIL | FATTR4_WORD0_FILES_FREE | FATTR4_WORD0_FILES_TOTAL | FATTR4_WORD0_MAXNAME)) || - (bmval1 & (FATTR4_WORD1_SPACE_AVAIL | FATTR4_WORD1_SPACE_FREE | + (u.attrmask[1] & (FATTR4_WORD1_SPACE_AVAIL | FATTR4_WORD1_SPACE_FREE | FATTR4_WORD1_SPACE_TOTAL))) { - err = vfs_statfs(&path, &statfs); + err = vfs_statfs(&path, &args.statfs); if (err) goto out_nfserr; } - if ((bmval0 & (FATTR4_WORD0_FILEHANDLE | FATTR4_WORD0_FSID)) && !fhp) { + if ((u.attrmask[0] & (FATTR4_WORD0_FILEHANDLE | FATTR4_WORD0_FSID)) && + !fhp) { tempfh = kmalloc(sizeof(struct svc_fh), GFP_KERNEL); status = nfserr_jukebox; if (!tempfh) @@ -3010,12 +3565,15 @@ nfsd4_encode_fattr(struct xdr_stream *xdr, struct svc_fh *fhp, status = fh_compose(tempfh, exp, dentry, NULL); if (status) goto out; - fhp = tempfh; - } - if (bmval0 & FATTR4_WORD0_ACL) { - err = nfsd4_get_nfs4_acl(rqstp, dentry, &acl); + args.fhp = tempfh; + } else + args.fhp = fhp; + + args.acl = NULL; + if (u.attrmask[0] & FATTR4_WORD0_ACL) { + err = nfsd4_get_nfs4_acl(rqstp, dentry, &args.acl); if (err == -EOPNOTSUPP) - bmval0 &= ~FATTR4_WORD0_ACL; + u.attrmask[0] &= ~FATTR4_WORD0_ACL; else if (err == -EINVAL) { status = nfserr_attrnotsupp; goto out; @@ -3023,452 +3581,53 @@ nfsd4_encode_fattr(struct xdr_stream *xdr, struct svc_fh *fhp, goto out_nfserr; } + args.contextsupport = false; + #ifdef CONFIG_NFSD_V4_SECURITY_LABEL - if ((bmval2 & FATTR4_WORD2_SECURITY_LABEL) || - bmval0 & FATTR4_WORD0_SUPPORTED_ATTRS) { + args.context = NULL; + if ((u.attrmask[2] & FATTR4_WORD2_SECURITY_LABEL) || + u.attrmask[0] & FATTR4_WORD0_SUPPORTED_ATTRS) { if (exp->ex_flags & NFSEXP_SECURITY_LABEL) err = security_inode_getsecctx(d_inode(dentry), - &context, &contextlen); + &args.context, &args.contextlen); else err = -EOPNOTSUPP; - contextsupport = (err == 0); - if (bmval2 & FATTR4_WORD2_SECURITY_LABEL) { + args.contextsupport = (err == 0); + if (u.attrmask[2] & FATTR4_WORD2_SECURITY_LABEL) { if (err == -EOPNOTSUPP) - bmval2 &= ~FATTR4_WORD2_SECURITY_LABEL; + u.attrmask[2] &= ~FATTR4_WORD2_SECURITY_LABEL; else if (err) goto out_nfserr; } } #endif /* CONFIG_NFSD_V4_SECURITY_LABEL */ - status = nfsd4_encode_bitmap(xdr, bmval0, bmval1, bmval2); + /* attrmask */ + status = nfsd4_encode_bitmap4(xdr, u.attrmask[0], + u.attrmask[1], u.attrmask[2]); if (status) goto out; + /* attr_vals */ attrlen_offset = xdr->buf->len; attrlen_p = xdr_reserve_space(xdr, XDR_UNIT); if (!attrlen_p) goto out_resource; - - if (bmval0 & FATTR4_WORD0_SUPPORTED_ATTRS) { - u32 supp[3]; - - memcpy(supp, nfsd_suppattrs[minorversion], sizeof(supp)); - - if (!IS_POSIXACL(dentry->d_inode)) - supp[0] &= ~FATTR4_WORD0_ACL; - if (!contextsupport) - supp[2] &= ~FATTR4_WORD2_SECURITY_LABEL; - if (!supp[2]) { - p = xdr_reserve_space(xdr, 12); - if (!p) - goto out_resource; - *p++ = cpu_to_be32(2); - *p++ = cpu_to_be32(supp[0]); - *p++ = cpu_to_be32(supp[1]); - } else { - p = xdr_reserve_space(xdr, 16); - if (!p) - goto out_resource; - *p++ = cpu_to_be32(3); - *p++ = cpu_to_be32(supp[0]); - *p++ = cpu_to_be32(supp[1]); - *p++ = cpu_to_be32(supp[2]); - } - } - if (bmval0 & FATTR4_WORD0_TYPE) { - p = xdr_reserve_space(xdr, 4); - if (!p) - goto out_resource; - dummy = nfs4_file_type(stat.mode); - if (dummy == NF4BAD) { - status = nfserr_serverfault; - goto out; - } - *p++ = cpu_to_be32(dummy); - } - if (bmval0 & FATTR4_WORD0_FH_EXPIRE_TYPE) { - p = xdr_reserve_space(xdr, 4); - if (!p) - goto out_resource; - if (exp->ex_flags & NFSEXP_NOSUBTREECHECK) - *p++ = cpu_to_be32(NFS4_FH_PERSISTENT); - else - *p++ = cpu_to_be32(NFS4_FH_PERSISTENT| - NFS4_FH_VOL_RENAME); - } - if (bmval0 & FATTR4_WORD0_CHANGE) { - p = xdr_reserve_space(xdr, 8); - if (!p) - goto out_resource; - p = encode_change(p, &stat, d_inode(dentry), exp); - } - if (bmval0 & FATTR4_WORD0_SIZE) { - p = xdr_reserve_space(xdr, 8); - if (!p) - goto out_resource; - p = xdr_encode_hyper(p, stat.size); - } - if (bmval0 & FATTR4_WORD0_LINK_SUPPORT) { - p = xdr_reserve_space(xdr, 4); - if (!p) - goto out_resource; - *p++ = cpu_to_be32(1); - } - if (bmval0 & FATTR4_WORD0_SYMLINK_SUPPORT) { - p = xdr_reserve_space(xdr, 4); - if (!p) - goto out_resource; - *p++ = cpu_to_be32(1); - } - if (bmval0 & FATTR4_WORD0_NAMED_ATTR) { - p = xdr_reserve_space(xdr, 4); - if (!p) - goto out_resource; - *p++ = cpu_to_be32(0); - } - if (bmval0 & FATTR4_WORD0_FSID) { - p = xdr_reserve_space(xdr, 16); - if (!p) - goto out_resource; - if (exp->ex_fslocs.migrated) { - p = xdr_encode_hyper(p, NFS4_REFERRAL_FSID_MAJOR); - p = xdr_encode_hyper(p, NFS4_REFERRAL_FSID_MINOR); - } else switch(fsid_source(fhp)) { - case FSIDSOURCE_FSID: - p = xdr_encode_hyper(p, (u64)exp->ex_fsid); - p = xdr_encode_hyper(p, (u64)0); - break; - case FSIDSOURCE_DEV: - *p++ = cpu_to_be32(0); - *p++ = cpu_to_be32(MAJOR(stat.dev)); - *p++ = cpu_to_be32(0); - *p++ = cpu_to_be32(MINOR(stat.dev)); - break; - case FSIDSOURCE_UUID: - p = xdr_encode_opaque_fixed(p, exp->ex_uuid, - EX_UUID_LEN); - break; - } - } - if (bmval0 & FATTR4_WORD0_UNIQUE_HANDLES) { - p = xdr_reserve_space(xdr, 4); - if (!p) - goto out_resource; - *p++ = cpu_to_be32(0); - } - if (bmval0 & FATTR4_WORD0_LEASE_TIME) { - p = xdr_reserve_space(xdr, 4); - if (!p) - goto out_resource; - *p++ = cpu_to_be32(nn->nfsd4_lease); - } - if (bmval0 & FATTR4_WORD0_RDATTR_ERROR) { - p = xdr_reserve_space(xdr, 4); - if (!p) - goto out_resource; - *p++ = cpu_to_be32(rdattr_err); - } - if (bmval0 & FATTR4_WORD0_ACL) { - struct nfs4_ace *ace; - - if (acl == NULL) { - p = xdr_reserve_space(xdr, 4); - if (!p) - goto out_resource; - - *p++ = cpu_to_be32(0); - goto out_acl; - } - p = xdr_reserve_space(xdr, 4); - if (!p) - goto out_resource; - *p++ = cpu_to_be32(acl->naces); - - for (ace = acl->aces; ace < acl->aces + acl->naces; ace++) { - p = xdr_reserve_space(xdr, 4*3); - if (!p) - goto out_resource; - *p++ = cpu_to_be32(ace->type); - *p++ = cpu_to_be32(ace->flag); - *p++ = cpu_to_be32(ace->access_mask & - NFS4_ACE_MASK_ALL); - status = nfsd4_encode_aclname(xdr, rqstp, ace); - if (status) - goto out; - } - } -out_acl: - if (bmval0 & FATTR4_WORD0_ACLSUPPORT) { - p = xdr_reserve_space(xdr, 4); - if (!p) - goto out_resource; - *p++ = cpu_to_be32(IS_POSIXACL(dentry->d_inode) ? - ACL4_SUPPORT_ALLOW_ACL|ACL4_SUPPORT_DENY_ACL : 0); - } - if (bmval0 & FATTR4_WORD0_CANSETTIME) { - p = xdr_reserve_space(xdr, 4); - if (!p) - goto out_resource; - *p++ = cpu_to_be32(1); - } - if (bmval0 & FATTR4_WORD0_CASE_INSENSITIVE) { - p = xdr_reserve_space(xdr, 4); - if (!p) - goto out_resource; - *p++ = cpu_to_be32(0); - } - if (bmval0 & FATTR4_WORD0_CASE_PRESERVING) { - p = xdr_reserve_space(xdr, 4); - if (!p) - goto out_resource; - *p++ = cpu_to_be32(1); - } - if (bmval0 & FATTR4_WORD0_CHOWN_RESTRICTED) { - p = xdr_reserve_space(xdr, 4); - if (!p) - goto out_resource; - *p++ = cpu_to_be32(1); - } - if (bmval0 & FATTR4_WORD0_FILEHANDLE) { - p = xdr_reserve_space(xdr, fhp->fh_handle.fh_size + 4); - if (!p) - goto out_resource; - p = xdr_encode_opaque(p, &fhp->fh_handle.fh_raw, - fhp->fh_handle.fh_size); - } - if (bmval0 & FATTR4_WORD0_FILEID) { - p = xdr_reserve_space(xdr, 8); - if (!p) - goto out_resource; - p = xdr_encode_hyper(p, stat.ino); - } - if (bmval0 & FATTR4_WORD0_FILES_AVAIL) { - p = xdr_reserve_space(xdr, 8); - if (!p) - goto out_resource; - p = xdr_encode_hyper(p, (u64) statfs.f_ffree); - } - if (bmval0 & FATTR4_WORD0_FILES_FREE) { - p = xdr_reserve_space(xdr, 8); - if (!p) - goto out_resource; - p = xdr_encode_hyper(p, (u64) statfs.f_ffree); - } - if (bmval0 & FATTR4_WORD0_FILES_TOTAL) { - p = xdr_reserve_space(xdr, 8); - if (!p) - goto out_resource; - p = xdr_encode_hyper(p, (u64) statfs.f_files); - } - if (bmval0 & FATTR4_WORD0_FS_LOCATIONS) { - status = nfsd4_encode_fs_locations(xdr, rqstp, exp); - if (status) - goto out; - } - if (bmval0 & FATTR4_WORD0_HOMOGENEOUS) { - p = xdr_reserve_space(xdr, 4); - if (!p) - goto out_resource; - *p++ = cpu_to_be32(1); - } - if (bmval0 & FATTR4_WORD0_MAXFILESIZE) { - p = xdr_reserve_space(xdr, 8); - if (!p) - goto out_resource; - p = xdr_encode_hyper(p, exp->ex_path.mnt->mnt_sb->s_maxbytes); - } - if (bmval0 & FATTR4_WORD0_MAXLINK) { - p = xdr_reserve_space(xdr, 4); - if (!p) - goto out_resource; - *p++ = cpu_to_be32(255); - } - if (bmval0 & FATTR4_WORD0_MAXNAME) { - p = xdr_reserve_space(xdr, 4); - if (!p) - goto out_resource; - *p++ = cpu_to_be32(statfs.f_namelen); - } - if (bmval0 & FATTR4_WORD0_MAXREAD) { - p = xdr_reserve_space(xdr, 8); - if (!p) - goto out_resource; - p = xdr_encode_hyper(p, (u64) svc_max_payload(rqstp)); - } - if (bmval0 & FATTR4_WORD0_MAXWRITE) { - p = xdr_reserve_space(xdr, 8); - if (!p) - goto out_resource; - p = xdr_encode_hyper(p, (u64) svc_max_payload(rqstp)); - } - if (bmval1 & FATTR4_WORD1_MODE) { - p = xdr_reserve_space(xdr, 4); - if (!p) - goto out_resource; - *p++ = cpu_to_be32(stat.mode & S_IALLUGO); - } - if (bmval1 & FATTR4_WORD1_NO_TRUNC) { - p = xdr_reserve_space(xdr, 4); - if (!p) - goto out_resource; - *p++ = cpu_to_be32(1); - } - if (bmval1 & FATTR4_WORD1_NUMLINKS) { - p = xdr_reserve_space(xdr, 4); - if (!p) - goto out_resource; - *p++ = cpu_to_be32(stat.nlink); - } - if (bmval1 & FATTR4_WORD1_OWNER) { - status = nfsd4_encode_user(xdr, rqstp, stat.uid); - if (status) - goto out; - } - if (bmval1 & FATTR4_WORD1_OWNER_GROUP) { - status = nfsd4_encode_group(xdr, rqstp, stat.gid); - if (status) - goto out; - } - if (bmval1 & FATTR4_WORD1_RAWDEV) { - p = xdr_reserve_space(xdr, 8); - if (!p) - goto out_resource; - *p++ = cpu_to_be32((u32) MAJOR(stat.rdev)); - *p++ = cpu_to_be32((u32) MINOR(stat.rdev)); - } - if (bmval1 & FATTR4_WORD1_SPACE_AVAIL) { - p = xdr_reserve_space(xdr, 8); - if (!p) - goto out_resource; - dummy64 = (u64)statfs.f_bavail * (u64)statfs.f_bsize; - p = xdr_encode_hyper(p, dummy64); - } - if (bmval1 & FATTR4_WORD1_SPACE_FREE) { - p = xdr_reserve_space(xdr, 8); - if (!p) - goto out_resource; - dummy64 = (u64)statfs.f_bfree * (u64)statfs.f_bsize; - p = xdr_encode_hyper(p, dummy64); - } - if (bmval1 & FATTR4_WORD1_SPACE_TOTAL) { - p = xdr_reserve_space(xdr, 8); - if (!p) - goto out_resource; - dummy64 = (u64)statfs.f_blocks * (u64)statfs.f_bsize; - p = xdr_encode_hyper(p, dummy64); - } - if (bmval1 & FATTR4_WORD1_SPACE_USED) { - p = xdr_reserve_space(xdr, 8); - if (!p) - goto out_resource; - dummy64 = (u64)stat.blocks << 9; - p = xdr_encode_hyper(p, dummy64); - } - if (bmval1 & FATTR4_WORD1_TIME_ACCESS) { - status = nfsd4_encode_nfstime4(xdr, &stat.atime); - if (status) + for_each_set_bit(bit, (const unsigned long *)&u.mask, + ARRAY_SIZE(nfsd4_enc_fattr4_encode_ops)) { + status = nfsd4_enc_fattr4_encode_ops[bit](xdr, &args); + if (status != nfs_ok) goto out; } - if (bmval1 & FATTR4_WORD1_TIME_CREATE) { - status = nfsd4_encode_nfstime4(xdr, &stat.btime); - if (status) - goto out; - } - if (bmval1 & FATTR4_WORD1_TIME_DELTA) { - p = xdr_reserve_space(xdr, 12); - if (!p) - goto out_resource; - p = encode_time_delta(p, d_inode(dentry)); - } - if (bmval1 & FATTR4_WORD1_TIME_METADATA) { - status = nfsd4_encode_nfstime4(xdr, &stat.ctime); - if (status) - goto out; - } - if (bmval1 & FATTR4_WORD1_TIME_MODIFY) { - status = nfsd4_encode_nfstime4(xdr, &stat.mtime); - if (status) - goto out; - } - if (bmval1 & FATTR4_WORD1_MOUNTED_ON_FILEID) { - u64 ino = stat.ino; - - p = xdr_reserve_space(xdr, 8); - if (!p) - goto out_resource; - /* - * Get ino of mountpoint in parent filesystem, if not ignoring - * crossmount and this is the root of a cross-mounted - * filesystem. - */ - if (ignore_crossmnt == 0 && - dentry == exp->ex_path.mnt->mnt_root) { - err = nfsd4_get_mounted_on_ino(exp, &ino); - if (err) - goto out_nfserr; - } - p = xdr_encode_hyper(p, ino); - } -#ifdef CONFIG_NFSD_PNFS - if (bmval1 & FATTR4_WORD1_FS_LAYOUT_TYPES) { - status = nfsd4_encode_layout_types(xdr, exp->ex_layout_types); - if (status) - goto out; - } - - if (bmval2 & FATTR4_WORD2_LAYOUT_TYPES) { - status = nfsd4_encode_layout_types(xdr, exp->ex_layout_types); - if (status) - goto out; - } - - if (bmval2 & FATTR4_WORD2_LAYOUT_BLKSIZE) { - p = xdr_reserve_space(xdr, 4); - if (!p) - goto out_resource; - *p++ = cpu_to_be32(stat.blksize); - } -#endif /* CONFIG_NFSD_PNFS */ - if (bmval2 & FATTR4_WORD2_SUPPATTR_EXCLCREAT) { - u32 supp[3]; - - memcpy(supp, nfsd_suppattrs[minorversion], sizeof(supp)); - supp[0] &= NFSD_SUPPATTR_EXCLCREAT_WORD0; - supp[1] &= NFSD_SUPPATTR_EXCLCREAT_WORD1; - supp[2] &= NFSD_SUPPATTR_EXCLCREAT_WORD2; - - status = nfsd4_encode_bitmap(xdr, supp[0], supp[1], supp[2]); - if (status) - goto out; - } - -#ifdef CONFIG_NFSD_V4_SECURITY_LABEL - if (bmval2 & FATTR4_WORD2_SECURITY_LABEL) { - status = nfsd4_encode_security_label(xdr, rqstp, context, - contextlen); - if (status) - goto out; - } -#endif - - if (bmval2 & FATTR4_WORD2_XATTR_SUPPORT) { - p = xdr_reserve_space(xdr, 4); - if (!p) - goto out_resource; - err = xattr_supports_user_prefix(d_inode(dentry)); - *p++ = cpu_to_be32(err == 0); - } - *attrlen_p = cpu_to_be32(xdr->buf->len - attrlen_offset - XDR_UNIT); status = nfs_ok; out: #ifdef CONFIG_NFSD_V4_SECURITY_LABEL - if (context) - security_release_secctx(context, contextlen); + if (args.context) + security_release_secctx(args.context, args.contextlen); #endif /* CONFIG_NFSD_V4_SECURITY_LABEL */ - kfree(acl); + kfree(args.acl); if (tempfh) { fh_put(tempfh); kfree(tempfh); @@ -3509,12 +3668,28 @@ __be32 nfsd4_encode_fattr_to_buf(__be32 **p, int words, __be32 ret; svcxdr_init_encode_from_buffer(&xdr, &dummy, *p, words << 2); - ret = nfsd4_encode_fattr(&xdr, fhp, exp, dentry, bmval, rqstp, - ignore_crossmnt); + ret = nfsd4_encode_fattr4(rqstp, &xdr, fhp, exp, dentry, bmval, + ignore_crossmnt); *p = xdr.p; return ret; } +/* + * The buffer space for this field was reserved during a previous + * call to nfsd4_encode_entry4(). + */ +static void nfsd4_encode_entry4_nfs_cookie4(const struct nfsd4_readdir *readdir, + u64 offset) +{ + __be64 cookie = cpu_to_be64(offset); + struct xdr_stream *xdr = readdir->xdr; + + if (!readdir->cookie_offset) + return; + write_bytes_to_xdr_buf(xdr->buf, readdir->cookie_offset, &cookie, + sizeof(cookie)); +} + static inline int attributes_need_mount(u32 *bmval) { if (bmval[0] & ~(FATTR4_WORD0_RDATTR_ERROR | FATTR4_WORD0_LEASE_TIME)) @@ -3525,8 +3700,8 @@ static inline int attributes_need_mount(u32 *bmval) } static __be32 -nfsd4_encode_dirent_fattr(struct xdr_stream *xdr, struct nfsd4_readdir *cd, - const char *name, int namlen) +nfsd4_encode_entry4_fattr(struct nfsd4_readdir *cd, const char *name, + int namlen) { struct svc_export *exp = cd->rd_fhp->fh_export; struct dentry *dentry; @@ -3569,33 +3744,34 @@ nfsd4_encode_dirent_fattr(struct xdr_stream *xdr, struct nfsd4_readdir *cd, } out_encode: - nfserr = nfsd4_encode_fattr(xdr, NULL, exp, dentry, cd->rd_bmval, - cd->rd_rqstp, ignore_crossmnt); + nfserr = nfsd4_encode_fattr4(cd->rd_rqstp, cd->xdr, NULL, exp, dentry, + cd->rd_bmval, ignore_crossmnt); out_put: dput(dentry); exp_put(exp); return nfserr; } -static __be32 * -nfsd4_encode_rdattr_error(struct xdr_stream *xdr, __be32 nfserr) +static __be32 +nfsd4_encode_entry4_rdattr_error(struct xdr_stream *xdr, __be32 nfserr) { - __be32 *p; - - p = xdr_reserve_space(xdr, 20); - if (!p) - return NULL; - *p++ = htonl(2); - *p++ = htonl(FATTR4_WORD0_RDATTR_ERROR); /* bmval0 */ - *p++ = htonl(0); /* bmval1 */ + __be32 status; - *p++ = htonl(4); /* attribute length */ - *p++ = nfserr; /* no htonl */ - return p; + /* attrmask */ + status = nfsd4_encode_bitmap4(xdr, FATTR4_WORD0_RDATTR_ERROR, 0, 0); + if (status != nfs_ok) + return status; + /* attr_vals */ + if (xdr_stream_encode_u32(xdr, XDR_UNIT) != XDR_UNIT) + return nfserr_resource; + /* rdattr_error */ + if (xdr_stream_encode_be32(xdr, nfserr) != XDR_UNIT) + return nfserr_resource; + return nfs_ok; } static int -nfsd4_encode_dirent(void *ccdv, const char *name, int namlen, +nfsd4_encode_entry4(void *ccdv, const char *name, int namlen, loff_t offset, u64 ino, unsigned int d_type) { struct readdir_cd *ccd = ccdv; @@ -3606,8 +3782,6 @@ nfsd4_encode_dirent(void *ccdv, const char *name, int namlen, u32 name_and_cookie; int entry_bytes; __be32 nfserr = nfserr_toosmall; - __be64 wire_offset; - __be32 *p; /* In nfsv4, "." and ".." never make it onto the wire.. */ if (name && isdotent(name, namlen)) { @@ -3615,24 +3789,19 @@ nfsd4_encode_dirent(void *ccdv, const char *name, int namlen, return 0; } - if (cd->cookie_offset) { - wire_offset = cpu_to_be64(offset); - write_bytes_to_xdr_buf(xdr->buf, cd->cookie_offset, - &wire_offset, 8); - } + /* Encode the previous entry's cookie value */ + nfsd4_encode_entry4_nfs_cookie4(cd, offset); - p = xdr_reserve_space(xdr, 4); - if (!p) + if (xdr_stream_encode_item_present(xdr) != XDR_UNIT) goto fail; - *p++ = xdr_one; /* mark entry present */ + + /* Reserve send buffer space for this entry's cookie value. */ cookie_offset = xdr->buf->len; - p = xdr_reserve_space(xdr, 3*4 + namlen); - if (!p) + if (nfsd4_encode_nfs_cookie4(xdr, OFFSET_MAX) != nfs_ok) goto fail; - p = xdr_encode_hyper(p, OFFSET_MAX); /* offset of next entry */ - p = xdr_encode_array(p, name, namlen); /* name length & name */ - - nfserr = nfsd4_encode_dirent_fattr(xdr, cd, name, namlen); + if (nfsd4_encode_component4(xdr, name, namlen) != nfs_ok) + goto fail; + nfserr = nfsd4_encode_entry4_fattr(cd, name, namlen); switch (nfserr) { case nfs_ok: break; @@ -3663,8 +3832,7 @@ nfsd4_encode_dirent(void *ccdv, const char *name, int namlen, */ if (!(cd->rd_bmval[0] & FATTR4_WORD0_RDATTR_ERROR)) goto fail; - p = nfsd4_encode_rdattr_error(xdr, nfserr); - if (p == NULL) { + if (nfsd4_encode_entry4_rdattr_error(xdr, nfserr)) { nfserr = nfserr_toosmall; goto fail; } @@ -3722,18 +3890,26 @@ nfsd4_encode_clientid4(struct xdr_stream *xdr, const clientid_t *clientid) return nfs_ok; } +/* This is a frequently-encoded item; open-coded for speed */ static __be32 -nfsd4_encode_stateid(struct xdr_stream *xdr, stateid_t *sid) +nfsd4_encode_stateid4(struct xdr_stream *xdr, const stateid_t *sid) { __be32 *p; - p = xdr_reserve_space(xdr, sizeof(stateid_t)); + p = xdr_reserve_space(xdr, NFS4_STATEID_SIZE); if (!p) return nfserr_resource; *p++ = cpu_to_be32(sid->si_generation); - p = xdr_encode_opaque_fixed(p, &sid->si_opaque, - sizeof(stateid_opaque_t)); - return 0; + memcpy(p, &sid->si_opaque, sizeof(sid->si_opaque)); + return nfs_ok; +} + +static __be32 +nfsd4_encode_sessionid4(struct xdr_stream *xdr, + const struct nfs4_sessionid *sessionid) +{ + return nfsd4_encode_opaque_fixed(xdr, sessionid->data, + NFS4_MAX_SESSIONID_LEN); } static __be32 @@ -3742,14 +3918,14 @@ nfsd4_encode_access(struct nfsd4_compoundres *resp, __be32 nfserr, { struct nfsd4_access *access = &u->access; struct xdr_stream *xdr = resp->xdr; - __be32 *p; + __be32 status; - p = xdr_reserve_space(xdr, 8); - if (!p) - return nfserr_resource; - *p++ = cpu_to_be32(access->ac_supported); - *p++ = cpu_to_be32(access->ac_resp_access); - return 0; + /* supported */ + status = nfsd4_encode_uint32_t(xdr, access->ac_supported); + if (status != nfs_ok) + return status; + /* access */ + return nfsd4_encode_uint32_t(xdr, access->ac_resp_access); } static __be32 nfsd4_encode_bind_conn_to_session(struct nfsd4_compoundres *resp, __be32 nfserr, @@ -3757,17 +3933,16 @@ static __be32 nfsd4_encode_bind_conn_to_session(struct nfsd4_compoundres *resp, { struct nfsd4_bind_conn_to_session *bcts = &u->bind_conn_to_session; struct xdr_stream *xdr = resp->xdr; - __be32 *p; - p = xdr_reserve_space(xdr, NFS4_MAX_SESSIONID_LEN + 8); - if (!p) + /* bctsr_sessid */ + nfserr = nfsd4_encode_sessionid4(xdr, &bcts->sessionid); + if (nfserr != nfs_ok) + return nfserr; + /* bctsr_dir */ + if (xdr_stream_encode_u32(xdr, bcts->dir) != XDR_UNIT) return nfserr_resource; - p = xdr_encode_opaque_fixed(p, bcts->sessionid.data, - NFS4_MAX_SESSIONID_LEN); - *p++ = cpu_to_be32(bcts->dir); - /* Upshifting from TCP to RDMA is not supported */ - *p++ = cpu_to_be32(0); - return 0; + /* bctsr_use_conn_in_rdma_mode */ + return nfsd4_encode_bool(xdr, false); } static __be32 @@ -3777,7 +3952,8 @@ nfsd4_encode_close(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_close *close = &u->close; struct xdr_stream *xdr = resp->xdr; - return nfsd4_encode_stateid(xdr, &close->cl_stateid); + /* open_stateid */ + return nfsd4_encode_stateid4(xdr, &close->cl_stateid); } @@ -3797,11 +3973,13 @@ nfsd4_encode_create(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_create *create = &u->create; struct xdr_stream *xdr = resp->xdr; + /* cinfo */ nfserr = nfsd4_encode_change_info4(xdr, &create->cr_cinfo); if (nfserr) return nfserr; - return nfsd4_encode_bitmap(xdr, create->cr_bmval[0], - create->cr_bmval[1], create->cr_bmval[2]); + /* attrset */ + return nfsd4_encode_bitmap4(xdr, create->cr_bmval[0], + create->cr_bmval[1], create->cr_bmval[2]); } static __be32 @@ -3812,65 +3990,56 @@ nfsd4_encode_getattr(struct nfsd4_compoundres *resp, __be32 nfserr, struct svc_fh *fhp = getattr->ga_fhp; struct xdr_stream *xdr = resp->xdr; - return nfsd4_encode_fattr(xdr, fhp, fhp->fh_export, fhp->fh_dentry, - getattr->ga_bmval, resp->rqstp, 0); + /* obj_attributes */ + return nfsd4_encode_fattr4(resp->rqstp, xdr, fhp, fhp->fh_export, + fhp->fh_dentry, getattr->ga_bmval, 0); } static __be32 nfsd4_encode_getfh(struct nfsd4_compoundres *resp, __be32 nfserr, union nfsd4_op_u *u) { - struct svc_fh **fhpp = &u->getfh; struct xdr_stream *xdr = resp->xdr; - struct svc_fh *fhp = *fhpp; - unsigned int len; - __be32 *p; + struct svc_fh *fhp = u->getfh; - len = fhp->fh_handle.fh_size; - p = xdr_reserve_space(xdr, len + 4); - if (!p) - return nfserr_resource; - p = xdr_encode_opaque(p, &fhp->fh_handle.fh_raw, len); - return 0; + /* object */ + return nfsd4_encode_nfs_fh4(xdr, &fhp->fh_handle); } -/* -* Including all fields other than the name, a LOCK4denied structure requires -* 8(clientid) + 4(namelen) + 8(offset) + 8(length) + 4(type) = 32 bytes. -*/ static __be32 -nfsd4_encode_lock_denied(struct xdr_stream *xdr, struct nfsd4_lock_denied *ld) +nfsd4_encode_lock_owner4(struct xdr_stream *xdr, const clientid_t *clientid, + const struct xdr_netobj *owner) { - struct xdr_netobj *conf = &ld->ld_owner; - __be32 *p; + __be32 status; -again: - p = xdr_reserve_space(xdr, 32 + XDR_LEN(conf->len)); - if (!p) { - /* - * Don't fail to return the result just because we can't - * return the conflicting open: - */ - if (conf->len) { - kfree(conf->data); - conf->len = 0; - conf->data = NULL; - goto again; - } + /* clientid */ + status = nfsd4_encode_clientid4(xdr, clientid); + if (status != nfs_ok) + return status; + /* owner */ + return nfsd4_encode_opaque(xdr, owner->data, owner->len); +} + +static __be32 +nfsd4_encode_lock4denied(struct xdr_stream *xdr, + const struct nfsd4_lock_denied *ld) +{ + __be32 status; + + /* offset */ + status = nfsd4_encode_offset4(xdr, ld->ld_start); + if (status != nfs_ok) + return status; + /* length */ + status = nfsd4_encode_length4(xdr, ld->ld_length); + if (status != nfs_ok) + return status; + /* locktype */ + if (xdr_stream_encode_u32(xdr, ld->ld_type) != XDR_UNIT) return nfserr_resource; - } - p = xdr_encode_hyper(p, ld->ld_start); - p = xdr_encode_hyper(p, ld->ld_length); - *p++ = cpu_to_be32(ld->ld_type); - if (conf->len) { - p = xdr_encode_opaque_fixed(p, &ld->ld_clientid, 8); - p = xdr_encode_opaque(p, conf->data, conf->len); - kfree(conf->data); - } else { /* non - nfsv4 lock in conflict, no clientid nor owner */ - p = xdr_encode_hyper(p, (u64)0); /* clientid */ - *p++ = cpu_to_be32(0); /* length of owner name */ - } - return nfserr_denied; + /* owner */ + return nfsd4_encode_lock_owner4(xdr, &ld->ld_clientid, + &ld->ld_owner); } static __be32 @@ -3879,13 +4048,21 @@ nfsd4_encode_lock(struct nfsd4_compoundres *resp, __be32 nfserr, { struct nfsd4_lock *lock = &u->lock; struct xdr_stream *xdr = resp->xdr; + __be32 status; - if (!nfserr) - nfserr = nfsd4_encode_stateid(xdr, &lock->lk_resp_stateid); - else if (nfserr == nfserr_denied) - nfserr = nfsd4_encode_lock_denied(xdr, &lock->lk_denied); - - return nfserr; + switch (nfserr) { + case nfs_ok: + /* resok4 */ + status = nfsd4_encode_stateid4(xdr, &lock->lk_resp_stateid); + break; + case nfserr_denied: + /* denied */ + status = nfsd4_encode_lock4denied(xdr, &lock->lk_denied); + break; + default: + return nfserr; + } + return status != nfs_ok ? status : nfserr; } static __be32 @@ -3894,9 +4071,14 @@ nfsd4_encode_lockt(struct nfsd4_compoundres *resp, __be32 nfserr, { struct nfsd4_lockt *lockt = &u->lockt; struct xdr_stream *xdr = resp->xdr; + __be32 status; - if (nfserr == nfserr_denied) - nfsd4_encode_lock_denied(xdr, &lockt->lt_denied); + if (nfserr == nfserr_denied) { + /* denied */ + status = nfsd4_encode_lock4denied(xdr, &lockt->lt_denied); + if (status != nfs_ok) + return status; + } return nfserr; } @@ -3907,7 +4089,8 @@ nfsd4_encode_locku(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_locku *locku = &u->locku; struct xdr_stream *xdr = resp->xdr; - return nfsd4_encode_stateid(xdr, &locku->lu_stateid); + /* lock_stateid */ + return nfsd4_encode_stateid4(xdr, &locku->lu_stateid); } @@ -3921,101 +4104,159 @@ nfsd4_encode_link(struct nfsd4_compoundres *resp, __be32 nfserr, return nfsd4_encode_change_info4(xdr, &link->li_cinfo); } +/* + * This implementation does not yet support returning an ACE in an + * OPEN that offers a delegation. + */ +static __be32 +nfsd4_encode_open_nfsace4(struct xdr_stream *xdr) +{ + __be32 status; + + /* type */ + status = nfsd4_encode_acetype4(xdr, NFS4_ACE_ACCESS_ALLOWED_ACE_TYPE); + if (status != nfs_ok) + return nfserr_resource; + /* flag */ + status = nfsd4_encode_aceflag4(xdr, 0); + if (status != nfs_ok) + return nfserr_resource; + /* access mask */ + status = nfsd4_encode_acemask4(xdr, 0); + if (status != nfs_ok) + return nfserr_resource; + /* who - empty for now */ + if (xdr_stream_encode_u32(xdr, 0) != XDR_UNIT) + return nfserr_resource; + return nfs_ok; +} static __be32 -nfsd4_encode_open(struct nfsd4_compoundres *resp, __be32 nfserr, - union nfsd4_op_u *u) +nfsd4_encode_open_read_delegation4(struct xdr_stream *xdr, struct nfsd4_open *open) { - struct nfsd4_open *open = &u->open; - struct xdr_stream *xdr = resp->xdr; - __be32 *p; + __be32 status; - nfserr = nfsd4_encode_stateid(xdr, &open->op_stateid); - if (nfserr) - return nfserr; - nfserr = nfsd4_encode_change_info4(xdr, &open->op_cinfo); - if (nfserr) - return nfserr; - if (xdr_stream_encode_u32(xdr, open->op_rflags) < 0) + /* stateid */ + status = nfsd4_encode_stateid4(xdr, &open->op_delegate_stateid); + if (status != nfs_ok) + return status; + /* recall */ + status = nfsd4_encode_bool(xdr, open->op_recall); + if (status != nfs_ok) + return status; + /* permissions */ + return nfsd4_encode_open_nfsace4(xdr); +} + +static __be32 +nfsd4_encode_nfs_space_limit4(struct xdr_stream *xdr, u64 filesize) +{ + /* limitby */ + if (xdr_stream_encode_u32(xdr, NFS4_LIMIT_SIZE) != XDR_UNIT) return nfserr_resource; + /* filesize */ + return nfsd4_encode_uint64_t(xdr, filesize); +} - nfserr = nfsd4_encode_bitmap(xdr, open->op_bmval[0], open->op_bmval[1], - open->op_bmval[2]); - if (nfserr) - return nfserr; +static __be32 +nfsd4_encode_open_write_delegation4(struct xdr_stream *xdr, + struct nfsd4_open *open) +{ + __be32 status; - p = xdr_reserve_space(xdr, 4); - if (!p) + /* stateid */ + status = nfsd4_encode_stateid4(xdr, &open->op_delegate_stateid); + if (status != nfs_ok) + return status; + /* recall */ + status = nfsd4_encode_bool(xdr, open->op_recall); + if (status != nfs_ok) + return status; + /* space_limit */ + status = nfsd4_encode_nfs_space_limit4(xdr, 0); + if (status != nfs_ok) + return status; + return nfsd4_encode_open_nfsace4(xdr); +} + +static __be32 +nfsd4_encode_open_none_delegation4(struct xdr_stream *xdr, + struct nfsd4_open *open) +{ + __be32 status = nfs_ok; + + /* ond_why */ + if (xdr_stream_encode_u32(xdr, open->op_why_no_deleg) != XDR_UNIT) return nfserr_resource; + switch (open->op_why_no_deleg) { + case WND4_CONTENTION: + /* ond_server_will_push_deleg */ + status = nfsd4_encode_bool(xdr, false); + break; + case WND4_RESOURCE: + /* ond_server_will_signal_avail */ + status = nfsd4_encode_bool(xdr, false); + } + return status; +} + +static __be32 +nfsd4_encode_open_delegation4(struct xdr_stream *xdr, struct nfsd4_open *open) +{ + __be32 status; - *p++ = cpu_to_be32(open->op_delegate_type); + /* delegation_type */ + if (xdr_stream_encode_u32(xdr, open->op_delegate_type) != XDR_UNIT) + return nfserr_resource; switch (open->op_delegate_type) { case NFS4_OPEN_DELEGATE_NONE: + status = nfs_ok; break; case NFS4_OPEN_DELEGATE_READ: - nfserr = nfsd4_encode_stateid(xdr, &open->op_delegate_stateid); - if (nfserr) - return nfserr; - p = xdr_reserve_space(xdr, 20); - if (!p) - return nfserr_resource; - *p++ = cpu_to_be32(open->op_recall); - - /* - * TODO: ACE's in delegations - */ - *p++ = cpu_to_be32(NFS4_ACE_ACCESS_ALLOWED_ACE_TYPE); - *p++ = cpu_to_be32(0); - *p++ = cpu_to_be32(0); - *p++ = cpu_to_be32(0); /* XXX: is NULL principal ok? */ + /* read */ + status = nfsd4_encode_open_read_delegation4(xdr, open); break; case NFS4_OPEN_DELEGATE_WRITE: - nfserr = nfsd4_encode_stateid(xdr, &open->op_delegate_stateid); - if (nfserr) - return nfserr; - p = xdr_reserve_space(xdr, 32); - if (!p) - return nfserr_resource; - *p++ = cpu_to_be32(open->op_recall); - - /* - * TODO: space_limit's in delegations - */ - *p++ = cpu_to_be32(NFS4_LIMIT_SIZE); - *p++ = cpu_to_be32(~(u32)0); - *p++ = cpu_to_be32(~(u32)0); - - /* - * TODO: ACE's in delegations - */ - *p++ = cpu_to_be32(NFS4_ACE_ACCESS_ALLOWED_ACE_TYPE); - *p++ = cpu_to_be32(0); - *p++ = cpu_to_be32(0); - *p++ = cpu_to_be32(0); /* XXX: is NULL principal ok? */ + /* write */ + status = nfsd4_encode_open_write_delegation4(xdr, open); break; - case NFS4_OPEN_DELEGATE_NONE_EXT: /* 4.1 */ - switch (open->op_why_no_deleg) { - case WND4_CONTENTION: - case WND4_RESOURCE: - p = xdr_reserve_space(xdr, 8); - if (!p) - return nfserr_resource; - *p++ = cpu_to_be32(open->op_why_no_deleg); - /* deleg signaling not supported yet: */ - *p++ = cpu_to_be32(0); - break; - default: - p = xdr_reserve_space(xdr, 4); - if (!p) - return nfserr_resource; - *p++ = cpu_to_be32(open->op_why_no_deleg); - } + case NFS4_OPEN_DELEGATE_NONE_EXT: + /* od_whynone */ + status = nfsd4_encode_open_none_delegation4(xdr, open); break; default: - BUG(); + status = nfserr_serverfault; } - /* XXX save filehandle here */ - return 0; + + return status; +} + +static __be32 +nfsd4_encode_open(struct nfsd4_compoundres *resp, __be32 nfserr, + union nfsd4_op_u *u) +{ + struct nfsd4_open *open = &u->open; + struct xdr_stream *xdr = resp->xdr; + + /* stateid */ + nfserr = nfsd4_encode_stateid4(xdr, &open->op_stateid); + if (nfserr != nfs_ok) + return nfserr; + /* cinfo */ + nfserr = nfsd4_encode_change_info4(xdr, &open->op_cinfo); + if (nfserr != nfs_ok) + return nfserr; + /* rflags */ + nfserr = nfsd4_encode_uint32_t(xdr, open->op_rflags); + if (nfserr != nfs_ok) + return nfserr; + /* attrset */ + nfserr = nfsd4_encode_bitmap4(xdr, open->op_bmval[0], + open->op_bmval[1], open->op_bmval[2]); + if (nfserr != nfs_ok) + return nfserr; + /* delegation */ + return nfsd4_encode_open_delegation4(xdr, open); } static __be32 @@ -4025,7 +4266,8 @@ nfsd4_encode_open_confirm(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_open_confirm *oc = &u->open_confirm; struct xdr_stream *xdr = resp->xdr; - return nfsd4_encode_stateid(xdr, &oc->oc_resp_stateid); + /* open_stateid */ + return nfsd4_encode_stateid4(xdr, &oc->oc_resp_stateid); } static __be32 @@ -4035,7 +4277,8 @@ nfsd4_encode_open_downgrade(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_open_downgrade *od = &u->open_downgrade; struct xdr_stream *xdr = resp->xdr; - return nfsd4_encode_stateid(xdr, &od->od_stateid); + /* open_stateid */ + return nfsd4_encode_stateid4(xdr, &od->od_stateid); } /* @@ -4105,6 +4348,7 @@ static __be32 nfsd4_encode_readv(struct nfsd4_compoundres *resp, struct file *file, unsigned long maxcount) { struct xdr_stream *xdr = resp->xdr; + unsigned int base = xdr->buf->page_len & ~PAGE_MASK; unsigned int starting_len = xdr->buf->len; __be32 zero = xdr_zero; __be32 nfserr; @@ -4113,8 +4357,7 @@ static __be32 nfsd4_encode_readv(struct nfsd4_compoundres *resp, return nfserr_resource; nfserr = nfsd_iter_read(resp->rqstp, read->rd_fhp, file, - read->rd_offset, &maxcount, - xdr->buf->page_len & ~PAGE_MASK, + read->rd_offset, &maxcount, base, &read->rd_eof); read->rd_length = maxcount; if (nfserr) @@ -4219,90 +4462,83 @@ out_err: return nfserr; } -static __be32 -nfsd4_encode_readdir(struct nfsd4_compoundres *resp, __be32 nfserr, - union nfsd4_op_u *u) +static __be32 nfsd4_encode_dirlist4(struct xdr_stream *xdr, + struct nfsd4_readdir *readdir, + u32 max_payload) { - struct nfsd4_readdir *readdir = &u->readdir; - int maxcount; - int bytes_left; + int bytes_left, maxcount, starting_len = xdr->buf->len; loff_t offset; - __be64 wire_offset; - struct xdr_stream *xdr = resp->xdr; - int starting_len = xdr->buf->len; - __be32 *p; - - nfserr = nfsd4_encode_verifier4(xdr, &readdir->rd_verf); - if (nfserr != nfs_ok) - return nfserr; + __be32 status; /* * Number of bytes left for directory entries allowing for the - * final 8 bytes of the readdir and a following failed op: + * final 8 bytes of the readdir and a following failed op. */ - bytes_left = xdr->buf->buflen - xdr->buf->len - - COMPOUND_ERR_SLACK_SPACE - 8; - if (bytes_left < 0) { - nfserr = nfserr_resource; - goto err_no_verf; - } - maxcount = svc_max_payload(resp->rqstp); - maxcount = min_t(u32, readdir->rd_maxcount, maxcount); + bytes_left = xdr->buf->buflen - xdr->buf->len - + COMPOUND_ERR_SLACK_SPACE - XDR_UNIT * 2; + if (bytes_left < 0) + return nfserr_resource; + maxcount = min_t(u32, readdir->rd_maxcount, max_payload); + /* - * Note the rfc defines rd_maxcount as the size of the - * READDIR4resok structure, which includes the verifier above - * and the 8 bytes encoded at the end of this function: + * The RFC defines rd_maxcount as the size of the + * READDIR4resok structure, which includes the verifier + * and the 8 bytes encoded at the end of this function. */ - if (maxcount < 16) { - nfserr = nfserr_toosmall; - goto err_no_verf; - } - maxcount = min_t(int, maxcount-16, bytes_left); + if (maxcount < XDR_UNIT * 4) + return nfserr_toosmall; + maxcount = min_t(int, maxcount - XDR_UNIT * 4, bytes_left); - /* RFC 3530 14.2.24 allows us to ignore dircount when it's 0: */ + /* RFC 3530 14.2.24 allows us to ignore dircount when it's 0 */ if (!readdir->rd_dircount) - readdir->rd_dircount = svc_max_payload(resp->rqstp); + readdir->rd_dircount = max_payload; + /* *entries */ readdir->xdr = xdr; readdir->rd_maxcount = maxcount; readdir->common.err = 0; readdir->cookie_offset = 0; - offset = readdir->rd_cookie; - nfserr = nfsd_readdir(readdir->rd_rqstp, readdir->rd_fhp, - &offset, - &readdir->common, nfsd4_encode_dirent); - if (nfserr == nfs_ok && - readdir->common.err == nfserr_toosmall && - xdr->buf->len == starting_len + 8) { - /* nothing encoded; which limit did we hit?: */ - if (maxcount - 16 < bytes_left) - /* It was the fault of rd_maxcount: */ - nfserr = nfserr_toosmall; - else - /* We ran out of buffer space: */ - nfserr = nfserr_resource; + status = nfsd_readdir(readdir->rd_rqstp, readdir->rd_fhp, &offset, + &readdir->common, nfsd4_encode_entry4); + if (status) + return status; + if (readdir->common.err == nfserr_toosmall && + xdr->buf->len == starting_len) { + /* No entries were encoded. Which limit did we hit? */ + if (maxcount - XDR_UNIT * 4 < bytes_left) + /* It was the fault of rd_maxcount */ + return nfserr_toosmall; + /* We ran out of buffer space */ + return nfserr_resource; } - if (nfserr) - goto err_no_verf; + /* Encode the final entry's cookie value */ + nfsd4_encode_entry4_nfs_cookie4(readdir, offset); + /* No entries follow */ + if (xdr_stream_encode_item_absent(xdr) != XDR_UNIT) + return nfserr_resource; - if (readdir->cookie_offset) { - wire_offset = cpu_to_be64(offset); - write_bytes_to_xdr_buf(xdr->buf, readdir->cookie_offset, - &wire_offset, 8); - } + /* eof */ + return nfsd4_encode_bool(xdr, readdir->common.err == nfserr_eof); +} - p = xdr_reserve_space(xdr, 8); - if (!p) { - WARN_ON_ONCE(1); - goto err_no_verf; - } - *p++ = 0; /* no more entries */ - *p++ = htonl(readdir->common.err == nfserr_eof); +static __be32 +nfsd4_encode_readdir(struct nfsd4_compoundres *resp, __be32 nfserr, + union nfsd4_op_u *u) +{ + struct nfsd4_readdir *readdir = &u->readdir; + struct xdr_stream *xdr = resp->xdr; + int starting_len = xdr->buf->len; - return 0; -err_no_verf: - xdr_truncate_encode(xdr, starting_len); + /* cookieverf */ + nfserr = nfsd4_encode_verifier4(xdr, &readdir->rd_verf); + if (nfserr != nfs_ok) + return nfserr; + + /* reply */ + nfserr = nfsd4_encode_dirlist4(xdr, readdir, svc_max_payload(resp->rqstp)); + if (nfserr != nfs_ok) + xdr_truncate_encode(xdr, starting_len); return nfserr; } @@ -4330,13 +4566,34 @@ nfsd4_encode_rename(struct nfsd4_compoundres *resp, __be32 nfserr, } static __be32 +nfsd4_encode_rpcsec_gss_info(struct xdr_stream *xdr, + struct rpcsec_gss_info *info) +{ + __be32 status; + + /* oid */ + if (xdr_stream_encode_opaque(xdr, info->oid.data, info->oid.len) < 0) + return nfserr_resource; + /* qop */ + status = nfsd4_encode_qop4(xdr, info->qop); + if (status != nfs_ok) + return status; + /* service */ + if (xdr_stream_encode_u32(xdr, info->service) != XDR_UNIT) + return nfserr_resource; + + return nfs_ok; +} + +static __be32 nfsd4_do_encode_secinfo(struct xdr_stream *xdr, struct svc_export *exp) { u32 i, nflavs, supported; struct exp_flavor_info *flavs; struct exp_flavor_info def_flavs[2]; - __be32 *p, *flavorsp; static bool report = true; + __be32 *flavorsp; + __be32 status; if (exp->ex_nflavors) { flavs = exp->ex_flavors; @@ -4359,10 +4616,9 @@ nfsd4_do_encode_secinfo(struct xdr_stream *xdr, struct svc_export *exp) } supported = 0; - p = xdr_reserve_space(xdr, 4); - if (!p) + flavorsp = xdr_reserve_space(xdr, XDR_UNIT); + if (!flavorsp) return nfserr_resource; - flavorsp = p++; /* to be backfilled later */ for (i = 0; i < nflavs; i++) { rpc_authflavor_t pf = flavs[i].pseudoflavor; @@ -4370,20 +4626,22 @@ nfsd4_do_encode_secinfo(struct xdr_stream *xdr, struct svc_export *exp) if (rpcauth_get_gssinfo(pf, &info) == 0) { supported++; - p = xdr_reserve_space(xdr, 4 + 4 + - XDR_LEN(info.oid.len) + 4 + 4); - if (!p) - return nfserr_resource; - *p++ = cpu_to_be32(RPC_AUTH_GSS); - p = xdr_encode_opaque(p, info.oid.data, info.oid.len); - *p++ = cpu_to_be32(info.qop); - *p++ = cpu_to_be32(info.service); + + /* flavor */ + status = nfsd4_encode_uint32_t(xdr, RPC_AUTH_GSS); + if (status != nfs_ok) + return status; + /* flavor_info */ + status = nfsd4_encode_rpcsec_gss_info(xdr, &info); + if (status != nfs_ok) + return status; } else if (pf < RPC_AUTH_MAXFLAVOR) { supported++; - p = xdr_reserve_space(xdr, 4); - if (!p) - return nfserr_resource; - *p++ = cpu_to_be32(pf); + + /* flavor */ + status = nfsd4_encode_uint32_t(xdr, pf); + if (status != nfs_ok) + return status; } else { if (report) pr_warn("NFS: SECINFO: security flavor %u " @@ -4393,7 +4651,7 @@ nfsd4_do_encode_secinfo(struct xdr_stream *xdr, struct svc_export *exp) if (nflavs != supported) report = false; - *flavorsp = htonl(supported); + *flavorsp = cpu_to_be32(supported); return 0; } @@ -4417,34 +4675,25 @@ nfsd4_encode_secinfo_no_name(struct nfsd4_compoundres *resp, __be32 nfserr, return nfsd4_do_encode_secinfo(xdr, secinfo->sin_exp); } -/* - * The SETATTR encode routine is special -- it always encodes a bitmap, - * regardless of the error status. - */ static __be32 nfsd4_encode_setattr(struct nfsd4_compoundres *resp, __be32 nfserr, union nfsd4_op_u *u) { struct nfsd4_setattr *setattr = &u->setattr; - struct xdr_stream *xdr = resp->xdr; - __be32 *p; + __be32 status; - p = xdr_reserve_space(xdr, 16); - if (!p) - return nfserr_resource; - if (nfserr) { - *p++ = cpu_to_be32(3); - *p++ = cpu_to_be32(0); - *p++ = cpu_to_be32(0); - *p++ = cpu_to_be32(0); - } - else { - *p++ = cpu_to_be32(3); - *p++ = cpu_to_be32(setattr->sa_bmval[0]); - *p++ = cpu_to_be32(setattr->sa_bmval[1]); - *p++ = cpu_to_be32(setattr->sa_bmval[2]); + switch (nfserr) { + case nfs_ok: + /* attrsset */ + status = nfsd4_encode_bitmap4(resp->xdr, setattr->sa_bmval[0], + setattr->sa_bmval[1], + setattr->sa_bmval[2]); + break; + default: + /* attrsset */ + status = nfsd4_encode_bitmap4(resp->xdr, 0, 0, 0); } - return nfserr; + return status != nfs_ok ? status : nfserr; } static __be32 @@ -4480,86 +4729,148 @@ nfsd4_encode_write(struct nfsd4_compoundres *resp, __be32 nfserr, union nfsd4_op_u *u) { struct nfsd4_write *write = &u->write; + struct xdr_stream *xdr = resp->xdr; - if (xdr_stream_encode_u32(resp->xdr, write->wr_bytes_written) < 0) - return nfserr_resource; - if (xdr_stream_encode_u32(resp->xdr, write->wr_how_written) < 0) + /* count */ + nfserr = nfsd4_encode_count4(xdr, write->wr_bytes_written); + if (nfserr) + return nfserr; + /* committed */ + if (xdr_stream_encode_u32(xdr, write->wr_how_written) != XDR_UNIT) return nfserr_resource; - return nfsd4_encode_verifier4(resp->xdr, &write->wr_verifier); + /* writeverf */ + return nfsd4_encode_verifier4(xdr, &write->wr_verifier); } static __be32 -nfsd4_encode_exchange_id(struct nfsd4_compoundres *resp, __be32 nfserr, - union nfsd4_op_u *u) +nfsd4_encode_state_protect_ops4(struct xdr_stream *xdr, + struct nfsd4_exchange_id *exid) { - struct nfsd4_exchange_id *exid = &u->exchange_id; - struct xdr_stream *xdr = resp->xdr; - __be32 *p; - char *major_id; - char *server_scope; - int major_id_sz; - int server_scope_sz; - uint64_t minor_id = 0; - struct nfsd_net *nn = net_generic(SVC_NET(resp->rqstp), nfsd_net_id); + __be32 status; - major_id = nn->nfsd_name; - major_id_sz = strlen(nn->nfsd_name); - server_scope = nn->nfsd_name; - server_scope_sz = strlen(nn->nfsd_name); + /* spo_must_enforce */ + status = nfsd4_encode_bitmap4(xdr, exid->spo_must_enforce[0], + exid->spo_must_enforce[1], + exid->spo_must_enforce[2]); + if (status != nfs_ok) + return status; + /* spo_must_allow */ + return nfsd4_encode_bitmap4(xdr, exid->spo_must_allow[0], + exid->spo_must_allow[1], + exid->spo_must_allow[2]); +} - if (nfsd4_encode_clientid4(xdr, &exid->clientid) != nfs_ok) - return nfserr_resource; - if (xdr_stream_encode_u32(xdr, exid->seqid) < 0) - return nfserr_resource; - if (xdr_stream_encode_u32(xdr, exid->flags) < 0) - return nfserr_resource; +static __be32 +nfsd4_encode_state_protect4_r(struct xdr_stream *xdr, struct nfsd4_exchange_id *exid) +{ + __be32 status; - if (xdr_stream_encode_u32(xdr, exid->spa_how) < 0) + if (xdr_stream_encode_u32(xdr, exid->spa_how) != XDR_UNIT) return nfserr_resource; switch (exid->spa_how) { case SP4_NONE: + status = nfs_ok; break; case SP4_MACH_CRED: - /* spo_must_enforce bitmap: */ - nfserr = nfsd4_encode_bitmap(xdr, - exid->spo_must_enforce[0], - exid->spo_must_enforce[1], - exid->spo_must_enforce[2]); - if (nfserr) - return nfserr; - /* spo_must_allow bitmap: */ - nfserr = nfsd4_encode_bitmap(xdr, - exid->spo_must_allow[0], - exid->spo_must_allow[1], - exid->spo_must_allow[2]); - if (nfserr) - return nfserr; + /* spr_mach_ops */ + status = nfsd4_encode_state_protect_ops4(xdr, exid); break; default: - WARN_ON_ONCE(1); + status = nfserr_serverfault; } + return status; +} - p = xdr_reserve_space(xdr, - 8 /* so_minor_id */ + - 4 /* so_major_id.len */ + - (XDR_QUADLEN(major_id_sz) * 4) + - 4 /* eir_server_scope.len */ + - (XDR_QUADLEN(server_scope_sz) * 4) + - 4 /* eir_server_impl_id.count (0) */); - if (!p) +static __be32 +nfsd4_encode_server_owner4(struct xdr_stream *xdr, struct svc_rqst *rqstp) +{ + struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id); + __be32 status; + + /* so_minor_id */ + status = nfsd4_encode_uint64_t(xdr, 0); + if (status != nfs_ok) + return status; + /* so_major_id */ + return nfsd4_encode_opaque(xdr, nn->nfsd_name, strlen(nn->nfsd_name)); +} + +static __be32 +nfsd4_encode_exchange_id(struct nfsd4_compoundres *resp, __be32 nfserr, + union nfsd4_op_u *u) +{ + struct nfsd_net *nn = net_generic(SVC_NET(resp->rqstp), nfsd_net_id); + struct nfsd4_exchange_id *exid = &u->exchange_id; + struct xdr_stream *xdr = resp->xdr; + + /* eir_clientid */ + nfserr = nfsd4_encode_clientid4(xdr, &exid->clientid); + if (nfserr != nfs_ok) + return nfserr; + /* eir_sequenceid */ + nfserr = nfsd4_encode_sequenceid4(xdr, exid->seqid); + if (nfserr != nfs_ok) + return nfserr; + /* eir_flags */ + nfserr = nfsd4_encode_uint32_t(xdr, exid->flags); + if (nfserr != nfs_ok) + return nfserr; + /* eir_state_protect */ + nfserr = nfsd4_encode_state_protect4_r(xdr, exid); + if (nfserr != nfs_ok) + return nfserr; + /* eir_server_owner */ + nfserr = nfsd4_encode_server_owner4(xdr, resp->rqstp); + if (nfserr != nfs_ok) + return nfserr; + /* eir_server_scope */ + nfserr = nfsd4_encode_opaque(xdr, nn->nfsd_name, + strlen(nn->nfsd_name)); + if (nfserr != nfs_ok) + return nfserr; + /* eir_server_impl_id<1> */ + if (xdr_stream_encode_u32(xdr, 0) != XDR_UNIT) return nfserr_resource; - /* The server_owner struct */ - p = xdr_encode_hyper(p, minor_id); /* Minor id */ - /* major id */ - p = xdr_encode_opaque(p, major_id, major_id_sz); + return nfs_ok; +} - /* Server scope */ - p = xdr_encode_opaque(p, server_scope, server_scope_sz); +static __be32 +nfsd4_encode_channel_attrs4(struct xdr_stream *xdr, + const struct nfsd4_channel_attrs *attrs) +{ + __be32 status; - /* Implementation id */ - *p++ = cpu_to_be32(0); /* zero length nfs_impl_id4 array */ - return 0; + /* ca_headerpadsize */ + status = nfsd4_encode_count4(xdr, 0); + if (status != nfs_ok) + return status; + /* ca_maxrequestsize */ + status = nfsd4_encode_count4(xdr, attrs->maxreq_sz); + if (status != nfs_ok) + return status; + /* ca_maxresponsesize */ + status = nfsd4_encode_count4(xdr, attrs->maxresp_sz); + if (status != nfs_ok) + return status; + /* ca_maxresponsesize_cached */ + status = nfsd4_encode_count4(xdr, attrs->maxresp_cached); + if (status != nfs_ok) + return status; + /* ca_maxoperations */ + status = nfsd4_encode_count4(xdr, attrs->maxops); + if (status != nfs_ok) + return status; + /* ca_maxrequests */ + status = nfsd4_encode_count4(xdr, attrs->maxreqs); + if (status != nfs_ok) + return status; + /* ca_rdma_ird<1> */ + if (xdr_stream_encode_u32(xdr, attrs->nr_rdma_attrs) != XDR_UNIT) + return nfserr_resource; + if (attrs->nr_rdma_attrs) + return nfsd4_encode_uint32_t(xdr, attrs->rdma_attrs); + return nfs_ok; } static __be32 @@ -4568,52 +4879,25 @@ nfsd4_encode_create_session(struct nfsd4_compoundres *resp, __be32 nfserr, { struct nfsd4_create_session *sess = &u->create_session; struct xdr_stream *xdr = resp->xdr; - __be32 *p; - - p = xdr_reserve_space(xdr, 24); - if (!p) - return nfserr_resource; - p = xdr_encode_opaque_fixed(p, sess->sessionid.data, - NFS4_MAX_SESSIONID_LEN); - *p++ = cpu_to_be32(sess->seqid); - *p++ = cpu_to_be32(sess->flags); - - p = xdr_reserve_space(xdr, 28); - if (!p) - return nfserr_resource; - *p++ = cpu_to_be32(0); /* headerpadsz */ - *p++ = cpu_to_be32(sess->fore_channel.maxreq_sz); - *p++ = cpu_to_be32(sess->fore_channel.maxresp_sz); - *p++ = cpu_to_be32(sess->fore_channel.maxresp_cached); - *p++ = cpu_to_be32(sess->fore_channel.maxops); - *p++ = cpu_to_be32(sess->fore_channel.maxreqs); - *p++ = cpu_to_be32(sess->fore_channel.nr_rdma_attrs); - - if (sess->fore_channel.nr_rdma_attrs) { - p = xdr_reserve_space(xdr, 4); - if (!p) - return nfserr_resource; - *p++ = cpu_to_be32(sess->fore_channel.rdma_attrs); - } - p = xdr_reserve_space(xdr, 28); - if (!p) - return nfserr_resource; - *p++ = cpu_to_be32(0); /* headerpadsz */ - *p++ = cpu_to_be32(sess->back_channel.maxreq_sz); - *p++ = cpu_to_be32(sess->back_channel.maxresp_sz); - *p++ = cpu_to_be32(sess->back_channel.maxresp_cached); - *p++ = cpu_to_be32(sess->back_channel.maxops); - *p++ = cpu_to_be32(sess->back_channel.maxreqs); - *p++ = cpu_to_be32(sess->back_channel.nr_rdma_attrs); - - if (sess->back_channel.nr_rdma_attrs) { - p = xdr_reserve_space(xdr, 4); - if (!p) - return nfserr_resource; - *p++ = cpu_to_be32(sess->back_channel.rdma_attrs); - } - return 0; + /* csr_sessionid */ + nfserr = nfsd4_encode_sessionid4(xdr, &sess->sessionid); + if (nfserr != nfs_ok) + return nfserr; + /* csr_sequence */ + nfserr = nfsd4_encode_sequenceid4(xdr, sess->seqid); + if (nfserr != nfs_ok) + return nfserr; + /* csr_flags */ + nfserr = nfsd4_encode_uint32_t(xdr, sess->flags); + if (nfserr != nfs_ok) + return nfserr; + /* csr_fore_chan_attrs */ + nfserr = nfsd4_encode_channel_attrs4(xdr, &sess->fore_channel); + if (nfserr != nfs_ok) + return nfserr; + /* csr_back_chan_attrs */ + return nfsd4_encode_channel_attrs4(xdr, &sess->back_channel); } static __be32 @@ -4622,22 +4906,35 @@ nfsd4_encode_sequence(struct nfsd4_compoundres *resp, __be32 nfserr, { struct nfsd4_sequence *seq = &u->sequence; struct xdr_stream *xdr = resp->xdr; - __be32 *p; - p = xdr_reserve_space(xdr, NFS4_MAX_SESSIONID_LEN + 20); - if (!p) - return nfserr_resource; - p = xdr_encode_opaque_fixed(p, seq->sessionid.data, - NFS4_MAX_SESSIONID_LEN); - *p++ = cpu_to_be32(seq->seqid); - *p++ = cpu_to_be32(seq->slotid); + /* sr_sessionid */ + nfserr = nfsd4_encode_sessionid4(xdr, &seq->sessionid); + if (nfserr != nfs_ok) + return nfserr; + /* sr_sequenceid */ + nfserr = nfsd4_encode_sequenceid4(xdr, seq->seqid); + if (nfserr != nfs_ok) + return nfserr; + /* sr_slotid */ + nfserr = nfsd4_encode_slotid4(xdr, seq->slotid); + if (nfserr != nfs_ok) + return nfserr; /* Note slotid's are numbered from zero: */ - *p++ = cpu_to_be32(seq->maxslots - 1); /* sr_highest_slotid */ - *p++ = cpu_to_be32(seq->maxslots - 1); /* sr_target_highest_slotid */ - *p++ = cpu_to_be32(seq->status_flags); + /* sr_highest_slotid */ + nfserr = nfsd4_encode_slotid4(xdr, seq->maxslots - 1); + if (nfserr != nfs_ok) + return nfserr; + /* sr_target_highest_slotid */ + nfserr = nfsd4_encode_slotid4(xdr, seq->maxslots - 1); + if (nfserr != nfs_ok) + return nfserr; + /* sr_status_flags */ + nfserr = nfsd4_encode_uint32_t(xdr, seq->status_flags); + if (nfserr != nfs_ok) + return nfserr; resp->cstate.data_offset = xdr->buf->len; /* DRC cache data pointer */ - return 0; + return nfs_ok; } static __be32 @@ -4645,128 +4942,132 @@ nfsd4_encode_test_stateid(struct nfsd4_compoundres *resp, __be32 nfserr, union nfsd4_op_u *u) { struct nfsd4_test_stateid *test_stateid = &u->test_stateid; - struct xdr_stream *xdr = resp->xdr; struct nfsd4_test_stateid_id *stateid, *next; - __be32 *p; + struct xdr_stream *xdr = resp->xdr; - p = xdr_reserve_space(xdr, 4 + (4 * test_stateid->ts_num_ids)); - if (!p) + /* tsr_status_codes<> */ + if (xdr_stream_encode_u32(xdr, test_stateid->ts_num_ids) != XDR_UNIT) return nfserr_resource; - *p++ = htonl(test_stateid->ts_num_ids); - - list_for_each_entry_safe(stateid, next, &test_stateid->ts_stateid_list, ts_id_list) { - *p++ = stateid->ts_id_status; + list_for_each_entry_safe(stateid, next, + &test_stateid->ts_stateid_list, ts_id_list) { + if (xdr_stream_encode_be32(xdr, stateid->ts_id_status) != XDR_UNIT) + return nfserr_resource; } - - return 0; + return nfs_ok; } #ifdef CONFIG_NFSD_PNFS static __be32 -nfsd4_encode_getdeviceinfo(struct nfsd4_compoundres *resp, __be32 nfserr, - union nfsd4_op_u *u) +nfsd4_encode_device_addr4(struct xdr_stream *xdr, + const struct nfsd4_getdeviceinfo *gdev) { - struct nfsd4_getdeviceinfo *gdev = &u->getdeviceinfo; - struct xdr_stream *xdr = resp->xdr; + u32 needed_len, starting_len = xdr->buf->len; const struct nfsd4_layout_ops *ops; - u32 starting_len = xdr->buf->len, needed_len; - __be32 *p; + __be32 status; - p = xdr_reserve_space(xdr, 4); - if (!p) + /* da_layout_type */ + if (xdr_stream_encode_u32(xdr, gdev->gd_layout_type) != XDR_UNIT) return nfserr_resource; - - *p++ = cpu_to_be32(gdev->gd_layout_type); - - /* If maxcount is 0 then just update notifications */ - if (gdev->gd_maxcount != 0) { - ops = nfsd4_layout_ops[gdev->gd_layout_type]; - nfserr = ops->encode_getdeviceinfo(xdr, gdev); - if (nfserr) { - /* - * We don't bother to burden the layout drivers with - * enforcing gd_maxcount, just tell the client to - * come back with a bigger buffer if it's not enough. - */ - if (xdr->buf->len + 4 > gdev->gd_maxcount) - goto toosmall; - return nfserr; - } + /* da_addr_body */ + ops = nfsd4_layout_ops[gdev->gd_layout_type]; + status = ops->encode_getdeviceinfo(xdr, gdev); + if (status != nfs_ok) { + /* + * Don't burden the layout drivers with enforcing + * gd_maxcount. Just tell the client to come back + * with a bigger buffer if it's not enough. + */ + if (xdr->buf->len + XDR_UNIT > gdev->gd_maxcount) + goto toosmall; + return status; } - if (gdev->gd_notify_types) { - p = xdr_reserve_space(xdr, 4 + 4); - if (!p) - return nfserr_resource; - *p++ = cpu_to_be32(1); /* bitmap length */ - *p++ = cpu_to_be32(gdev->gd_notify_types); - } else { - p = xdr_reserve_space(xdr, 4); - if (!p) - return nfserr_resource; - *p++ = 0; - } + return nfs_ok; - return 0; toosmall: - dprintk("%s: maxcount too small\n", __func__); - needed_len = xdr->buf->len + 4 /* notifications */; + needed_len = xdr->buf->len + XDR_UNIT; /* notifications */ xdr_truncate_encode(xdr, starting_len); - p = xdr_reserve_space(xdr, 4); - if (!p) - return nfserr_resource; - *p++ = cpu_to_be32(needed_len); + + status = nfsd4_encode_count4(xdr, needed_len); + if (status != nfs_ok) + return status; return nfserr_toosmall; } static __be32 -nfsd4_encode_layoutget(struct nfsd4_compoundres *resp, __be32 nfserr, +nfsd4_encode_getdeviceinfo(struct nfsd4_compoundres *resp, __be32 nfserr, union nfsd4_op_u *u) { - struct nfsd4_layoutget *lgp = &u->layoutget; + struct nfsd4_getdeviceinfo *gdev = &u->getdeviceinfo; struct xdr_stream *xdr = resp->xdr; - const struct nfsd4_layout_ops *ops; - __be32 *p; - - p = xdr_reserve_space(xdr, 36 + sizeof(stateid_opaque_t)); - if (!p) - return nfserr_resource; - *p++ = cpu_to_be32(1); /* we always set return-on-close */ - *p++ = cpu_to_be32(lgp->lg_sid.si_generation); - p = xdr_encode_opaque_fixed(p, &lgp->lg_sid.si_opaque, - sizeof(stateid_opaque_t)); + /* gdir_device_addr */ + nfserr = nfsd4_encode_device_addr4(xdr, gdev); + if (nfserr) + return nfserr; + /* gdir_notification */ + return nfsd4_encode_bitmap4(xdr, gdev->gd_notify_types, 0, 0); +} - *p++ = cpu_to_be32(1); /* we always return a single layout */ - p = xdr_encode_hyper(p, lgp->lg_seg.offset); - p = xdr_encode_hyper(p, lgp->lg_seg.length); - *p++ = cpu_to_be32(lgp->lg_seg.iomode); - *p++ = cpu_to_be32(lgp->lg_layout_type); +static __be32 +nfsd4_encode_layout4(struct xdr_stream *xdr, const struct nfsd4_layoutget *lgp) +{ + const struct nfsd4_layout_ops *ops = nfsd4_layout_ops[lgp->lg_layout_type]; + __be32 status; - ops = nfsd4_layout_ops[lgp->lg_layout_type]; + /* lo_offset */ + status = nfsd4_encode_offset4(xdr, lgp->lg_seg.offset); + if (status != nfs_ok) + return status; + /* lo_length */ + status = nfsd4_encode_length4(xdr, lgp->lg_seg.length); + if (status != nfs_ok) + return status; + /* lo_iomode */ + if (xdr_stream_encode_u32(xdr, lgp->lg_seg.iomode) != XDR_UNIT) + return nfserr_resource; + /* lo_content */ + if (xdr_stream_encode_u32(xdr, lgp->lg_layout_type) != XDR_UNIT) + return nfserr_resource; return ops->encode_layoutget(xdr, lgp); } static __be32 +nfsd4_encode_layoutget(struct nfsd4_compoundres *resp, __be32 nfserr, + union nfsd4_op_u *u) +{ + struct nfsd4_layoutget *lgp = &u->layoutget; + struct xdr_stream *xdr = resp->xdr; + + /* logr_return_on_close */ + nfserr = nfsd4_encode_bool(xdr, true); + if (nfserr != nfs_ok) + return nfserr; + /* logr_stateid */ + nfserr = nfsd4_encode_stateid4(xdr, &lgp->lg_sid); + if (nfserr != nfs_ok) + return nfserr; + /* logr_layout<> */ + if (xdr_stream_encode_u32(xdr, 1) != XDR_UNIT) + return nfserr_resource; + return nfsd4_encode_layout4(xdr, lgp); +} + +static __be32 nfsd4_encode_layoutcommit(struct nfsd4_compoundres *resp, __be32 nfserr, union nfsd4_op_u *u) { struct nfsd4_layoutcommit *lcp = &u->layoutcommit; struct xdr_stream *xdr = resp->xdr; - __be32 *p; - p = xdr_reserve_space(xdr, 4); - if (!p) - return nfserr_resource; - *p++ = cpu_to_be32(lcp->lc_size_chg); - if (lcp->lc_size_chg) { - p = xdr_reserve_space(xdr, 8); - if (!p) - return nfserr_resource; - p = xdr_encode_hyper(p, lcp->lc_newsize); - } - - return 0; + /* ns_sizechanged */ + nfserr = nfsd4_encode_bool(xdr, lcp->lc_size_chg); + if (nfserr != nfs_ok) + return nfserr; + if (lcp->lc_size_chg) + /* ns_size */ + return nfsd4_encode_length4(xdr, lcp->lc_newsize); + return nfs_ok; } static __be32 @@ -4775,103 +5076,108 @@ nfsd4_encode_layoutreturn(struct nfsd4_compoundres *resp, __be32 nfserr, { struct nfsd4_layoutreturn *lrp = &u->layoutreturn; struct xdr_stream *xdr = resp->xdr; - __be32 *p; - p = xdr_reserve_space(xdr, 4); - if (!p) - return nfserr_resource; - *p++ = cpu_to_be32(lrp->lrs_present); + /* lrs_present */ + nfserr = nfsd4_encode_bool(xdr, lrp->lrs_present); + if (nfserr != nfs_ok) + return nfserr; if (lrp->lrs_present) - return nfsd4_encode_stateid(xdr, &lrp->lr_sid); - return 0; + /* lrs_stateid */ + return nfsd4_encode_stateid4(xdr, &lrp->lr_sid); + return nfs_ok; } #endif /* CONFIG_NFSD_PNFS */ static __be32 -nfsd42_encode_write_res(struct nfsd4_compoundres *resp, - struct nfsd42_write_res *write, bool sync) +nfsd4_encode_write_response4(struct xdr_stream *xdr, + const struct nfsd4_copy *copy) { - __be32 *p; - p = xdr_reserve_space(resp->xdr, 4); - if (!p) - return nfserr_resource; + const struct nfsd42_write_res *write = ©->cp_res; + u32 count = nfsd4_copy_is_sync(copy) ? 0 : 1; + __be32 status; - if (sync) - *p++ = cpu_to_be32(0); - else { - __be32 nfserr; - *p++ = cpu_to_be32(1); - nfserr = nfsd4_encode_stateid(resp->xdr, &write->cb_stateid); - if (nfserr) - return nfserr; + /* wr_callback_id<1> */ + if (xdr_stream_encode_u32(xdr, count) != XDR_UNIT) + return nfserr_resource; + if (count) { + status = nfsd4_encode_stateid4(xdr, &write->cb_stateid); + if (status != nfs_ok) + return status; } - p = xdr_reserve_space(resp->xdr, 8 + 4 + NFS4_VERIFIER_SIZE); - if (!p) + + /* wr_count */ + status = nfsd4_encode_length4(xdr, write->wr_bytes_written); + if (status != nfs_ok) + return status; + /* wr_committed */ + if (xdr_stream_encode_u32(xdr, write->wr_stable_how) != XDR_UNIT) return nfserr_resource; + /* wr_writeverf */ + return nfsd4_encode_verifier4(xdr, &write->wr_verifier); +} - p = xdr_encode_hyper(p, write->wr_bytes_written); - *p++ = cpu_to_be32(write->wr_stable_how); - p = xdr_encode_opaque_fixed(p, write->wr_verifier.data, - NFS4_VERIFIER_SIZE); - return nfs_ok; +static __be32 nfsd4_encode_copy_requirements4(struct xdr_stream *xdr, + const struct nfsd4_copy *copy) +{ + __be32 status; + + /* cr_consecutive */ + status = nfsd4_encode_bool(xdr, true); + if (status != nfs_ok) + return status; + /* cr_synchronous */ + return nfsd4_encode_bool(xdr, nfsd4_copy_is_sync(copy)); } static __be32 -nfsd42_encode_nl4_server(struct nfsd4_compoundres *resp, struct nl4_server *ns) +nfsd4_encode_copy(struct nfsd4_compoundres *resp, __be32 nfserr, + union nfsd4_op_u *u) { - struct xdr_stream *xdr = resp->xdr; - struct nfs42_netaddr *addr; - __be32 *p; + struct nfsd4_copy *copy = &u->copy; - p = xdr_reserve_space(xdr, 4); - *p++ = cpu_to_be32(ns->nl4_type); + nfserr = nfsd4_encode_write_response4(resp->xdr, copy); + if (nfserr != nfs_ok) + return nfserr; + return nfsd4_encode_copy_requirements4(resp->xdr, copy); +} +static __be32 +nfsd4_encode_netloc4(struct xdr_stream *xdr, const struct nl4_server *ns) +{ + __be32 status; + + if (xdr_stream_encode_u32(xdr, ns->nl4_type) != XDR_UNIT) + return nfserr_resource; switch (ns->nl4_type) { case NL4_NETADDR: - addr = &ns->u.nl4_addr; - - /* netid_len, netid, uaddr_len, uaddr (port included - * in RPCBIND_MAXUADDRLEN) - */ - p = xdr_reserve_space(xdr, - 4 /* netid len */ + - (XDR_QUADLEN(addr->netid_len) * 4) + - 4 /* uaddr len */ + - (XDR_QUADLEN(addr->addr_len) * 4)); - if (!p) - return nfserr_resource; - - *p++ = cpu_to_be32(addr->netid_len); - p = xdr_encode_opaque_fixed(p, addr->netid, - addr->netid_len); - *p++ = cpu_to_be32(addr->addr_len); - p = xdr_encode_opaque_fixed(p, addr->addr, - addr->addr_len); + /* nl_addr */ + status = nfsd4_encode_netaddr4(xdr, &ns->u.nl4_addr); break; default: - WARN_ON_ONCE(ns->nl4_type != NL4_NETADDR); - return nfserr_inval; + status = nfserr_serverfault; } - - return 0; + return status; } static __be32 -nfsd4_encode_copy(struct nfsd4_compoundres *resp, __be32 nfserr, - union nfsd4_op_u *u) +nfsd4_encode_copy_notify(struct nfsd4_compoundres *resp, __be32 nfserr, + union nfsd4_op_u *u) { - struct nfsd4_copy *copy = &u->copy; - __be32 *p; + struct nfsd4_copy_notify *cn = &u->copy_notify; + struct xdr_stream *xdr = resp->xdr; - nfserr = nfsd42_encode_write_res(resp, ©->cp_res, - nfsd4_copy_is_sync(copy)); + /* cnr_lease_time */ + nfserr = nfsd4_encode_nfstime4(xdr, &cn->cpn_lease_time); if (nfserr) return nfserr; - - p = xdr_reserve_space(resp->xdr, 4 + 4); - *p++ = xdr_one; /* cr_consecutive */ - *p = nfsd4_copy_is_sync(copy) ? xdr_one : xdr_zero; - return 0; + /* cnr_stateid */ + nfserr = nfsd4_encode_stateid4(xdr, &cn->cpn_cnr_stateid); + if (nfserr) + return nfserr; + /* cnr_source_server<> */ + if (xdr_stream_encode_u32(xdr, 1) != XDR_UNIT) + return nfserr_resource; + return nfsd4_encode_netloc4(xdr, cn->cpn_src); } static __be32 @@ -4880,14 +5186,15 @@ nfsd4_encode_offload_status(struct nfsd4_compoundres *resp, __be32 nfserr, { struct nfsd4_offload_status *os = &u->offload_status; struct xdr_stream *xdr = resp->xdr; - __be32 *p; - p = xdr_reserve_space(xdr, 8 + 4); - if (!p) + /* osr_count */ + nfserr = nfsd4_encode_length4(xdr, os->count); + if (nfserr != nfs_ok) + return nfserr; + /* osr_complete<1> */ + if (xdr_stream_encode_u32(xdr, 0) != XDR_UNIT) return nfserr_resource; - p = xdr_encode_hyper(p, os->count); - *p++ = cpu_to_be32(0); - return nfserr; + return nfs_ok; } static __be32 @@ -4965,53 +5272,18 @@ out: } static __be32 -nfsd4_encode_copy_notify(struct nfsd4_compoundres *resp, __be32 nfserr, - union nfsd4_op_u *u) -{ - struct nfsd4_copy_notify *cn = &u->copy_notify; - struct xdr_stream *xdr = resp->xdr; - __be32 *p; - - if (nfserr) - return nfserr; - - /* 8 sec, 4 nsec */ - p = xdr_reserve_space(xdr, 12); - if (!p) - return nfserr_resource; - - /* cnr_lease_time */ - p = xdr_encode_hyper(p, cn->cpn_sec); - *p++ = cpu_to_be32(cn->cpn_nsec); - - /* cnr_stateid */ - nfserr = nfsd4_encode_stateid(xdr, &cn->cpn_cnr_stateid); - if (nfserr) - return nfserr; - - /* cnr_src.nl_nsvr */ - p = xdr_reserve_space(xdr, 4); - if (!p) - return nfserr_resource; - - *p++ = cpu_to_be32(1); - - nfserr = nfsd42_encode_nl4_server(resp, cn->cpn_src); - return nfserr; -} - -static __be32 nfsd4_encode_seek(struct nfsd4_compoundres *resp, __be32 nfserr, union nfsd4_op_u *u) { struct nfsd4_seek *seek = &u->seek; - __be32 *p; - - p = xdr_reserve_space(resp->xdr, 4 + 8); - *p++ = cpu_to_be32(seek->seek_eof); - p = xdr_encode_hyper(p, seek->seek_pos); + struct xdr_stream *xdr = resp->xdr; - return 0; + /* sr_eof */ + nfserr = nfsd4_encode_bool(xdr, seek->seek_eof); + if (nfserr != nfs_ok) + return nfserr; + /* sr_offset */ + return nfsd4_encode_offset4(xdr, seek->seek_pos); } static __be32 diff --git a/fs/nfsd/nfscache.c b/fs/nfsd/nfscache.c index a8eda1c85829..d3273a396659 100644 --- a/fs/nfsd/nfscache.c +++ b/fs/nfsd/nfscache.c @@ -84,11 +84,11 @@ nfsd_hashsize(unsigned int limit) return roundup_pow_of_two(limit / TARGET_BUCKET_SIZE); } -static struct svc_cacherep * -nfsd_reply_cache_alloc(struct svc_rqst *rqstp, __wsum csum, - struct nfsd_net *nn) +static struct nfsd_cacherep * +nfsd_cacherep_alloc(struct svc_rqst *rqstp, __wsum csum, + struct nfsd_net *nn) { - struct svc_cacherep *rp; + struct nfsd_cacherep *rp; rp = kmem_cache_alloc(drc_slab, GFP_KERNEL); if (rp) { @@ -110,36 +110,64 @@ nfsd_reply_cache_alloc(struct svc_rqst *rqstp, __wsum csum, return rp; } -static void -nfsd_reply_cache_free_locked(struct nfsd_drc_bucket *b, struct svc_cacherep *rp, - struct nfsd_net *nn) +static void nfsd_cacherep_free(struct nfsd_cacherep *rp) { - if (rp->c_type == RC_REPLBUFF && rp->c_replvec.iov_base) { - nfsd_stats_drc_mem_usage_sub(nn, rp->c_replvec.iov_len); + if (rp->c_type == RC_REPLBUFF) kfree(rp->c_replvec.iov_base); + kmem_cache_free(drc_slab, rp); +} + +static unsigned long +nfsd_cacherep_dispose(struct list_head *dispose) +{ + struct nfsd_cacherep *rp; + unsigned long freed = 0; + + while (!list_empty(dispose)) { + rp = list_first_entry(dispose, struct nfsd_cacherep, c_lru); + list_del(&rp->c_lru); + nfsd_cacherep_free(rp); + freed++; } + return freed; +} + +static void +nfsd_cacherep_unlink_locked(struct nfsd_net *nn, struct nfsd_drc_bucket *b, + struct nfsd_cacherep *rp) +{ + if (rp->c_type == RC_REPLBUFF && rp->c_replvec.iov_base) + nfsd_stats_drc_mem_usage_sub(nn, rp->c_replvec.iov_len); if (rp->c_state != RC_UNUSED) { rb_erase(&rp->c_node, &b->rb_head); list_del(&rp->c_lru); atomic_dec(&nn->num_drc_entries); nfsd_stats_drc_mem_usage_sub(nn, sizeof(*rp)); } - kmem_cache_free(drc_slab, rp); } static void -nfsd_reply_cache_free(struct nfsd_drc_bucket *b, struct svc_cacherep *rp, +nfsd_reply_cache_free_locked(struct nfsd_drc_bucket *b, struct nfsd_cacherep *rp, + struct nfsd_net *nn) +{ + nfsd_cacherep_unlink_locked(nn, b, rp); + nfsd_cacherep_free(rp); +} + +static void +nfsd_reply_cache_free(struct nfsd_drc_bucket *b, struct nfsd_cacherep *rp, struct nfsd_net *nn) { spin_lock(&b->cache_lock); - nfsd_reply_cache_free_locked(b, rp, nn); + nfsd_cacherep_unlink_locked(nn, b, rp); spin_unlock(&b->cache_lock); + nfsd_cacherep_free(rp); } int nfsd_drc_slab_create(void) { drc_slab = kmem_cache_create("nfsd_drc", - sizeof(struct svc_cacherep), 0, 0, NULL); + sizeof(struct nfsd_cacherep), 0, 0, NULL); return drc_slab ? 0: -ENOMEM; } @@ -173,26 +201,29 @@ int nfsd_reply_cache_init(struct nfsd_net *nn) { unsigned int hashsize; unsigned int i; - int status = 0; nn->max_drc_entries = nfsd_cache_size_limit(); atomic_set(&nn->num_drc_entries, 0); hashsize = nfsd_hashsize(nn->max_drc_entries); nn->maskbits = ilog2(hashsize); - nn->nfsd_reply_cache_shrinker.scan_objects = nfsd_reply_cache_scan; - nn->nfsd_reply_cache_shrinker.count_objects = nfsd_reply_cache_count; - nn->nfsd_reply_cache_shrinker.seeks = 1; - status = register_shrinker(&nn->nfsd_reply_cache_shrinker, - "nfsd-reply:%s", nn->nfsd_name); - if (status) - return status; - nn->drc_hashtbl = kvzalloc(array_size(hashsize, sizeof(*nn->drc_hashtbl)), GFP_KERNEL); if (!nn->drc_hashtbl) + return -ENOMEM; + + nn->nfsd_reply_cache_shrinker = shrinker_alloc(0, "nfsd-reply:%s", + nn->nfsd_name); + if (!nn->nfsd_reply_cache_shrinker) goto out_shrinker; + nn->nfsd_reply_cache_shrinker->scan_objects = nfsd_reply_cache_scan; + nn->nfsd_reply_cache_shrinker->count_objects = nfsd_reply_cache_count; + nn->nfsd_reply_cache_shrinker->seeks = 1; + nn->nfsd_reply_cache_shrinker->private_data = nn; + + shrinker_register(nn->nfsd_reply_cache_shrinker); + for (i = 0; i < hashsize; i++) { INIT_LIST_HEAD(&nn->drc_hashtbl[i].lru_head); spin_lock_init(&nn->drc_hashtbl[i].cache_lock); @@ -201,22 +232,22 @@ int nfsd_reply_cache_init(struct nfsd_net *nn) return 0; out_shrinker: - unregister_shrinker(&nn->nfsd_reply_cache_shrinker); + kvfree(nn->drc_hashtbl); printk(KERN_ERR "nfsd: failed to allocate reply cache\n"); return -ENOMEM; } void nfsd_reply_cache_shutdown(struct nfsd_net *nn) { - struct svc_cacherep *rp; + struct nfsd_cacherep *rp; unsigned int i; - unregister_shrinker(&nn->nfsd_reply_cache_shrinker); + shrinker_free(nn->nfsd_reply_cache_shrinker); for (i = 0; i < nn->drc_hashsize; i++) { struct list_head *head = &nn->drc_hashtbl[i].lru_head; while (!list_empty(head)) { - rp = list_first_entry(head, struct svc_cacherep, c_lru); + rp = list_first_entry(head, struct nfsd_cacherep, c_lru); nfsd_reply_cache_free_locked(&nn->drc_hashtbl[i], rp, nn); } @@ -233,7 +264,7 @@ void nfsd_reply_cache_shutdown(struct nfsd_net *nn) * not already scheduled. */ static void -lru_put_end(struct nfsd_drc_bucket *b, struct svc_cacherep *rp) +lru_put_end(struct nfsd_drc_bucket *b, struct nfsd_cacherep *rp) { rp->c_timestamp = jiffies; list_move_tail(&rp->c_lru, &b->lru_head); @@ -247,12 +278,21 @@ nfsd_cache_bucket_find(__be32 xid, struct nfsd_net *nn) return &nn->drc_hashtbl[hash]; } -static long prune_bucket(struct nfsd_drc_bucket *b, struct nfsd_net *nn, - unsigned int max) +/* + * Remove and return no more than @max expired entries in bucket @b. + * If @max is zero, do not limit the number of removed entries. + */ +static void +nfsd_prune_bucket_locked(struct nfsd_net *nn, struct nfsd_drc_bucket *b, + unsigned int max, struct list_head *dispose) { - struct svc_cacherep *rp, *tmp; - long freed = 0; + unsigned long expiry = jiffies - RC_EXPIRE; + struct nfsd_cacherep *rp, *tmp; + unsigned int freed = 0; + + lockdep_assert_held(&b->cache_lock); + /* The bucket LRU is ordered oldest-first. */ list_for_each_entry_safe(rp, tmp, &b->lru_head, c_lru) { /* * Don't free entries attached to calls that are still @@ -260,87 +300,121 @@ static long prune_bucket(struct nfsd_drc_bucket *b, struct nfsd_net *nn, */ if (rp->c_state == RC_INPROG) continue; + if (atomic_read(&nn->num_drc_entries) <= nn->max_drc_entries && - time_before(jiffies, rp->c_timestamp + RC_EXPIRE)) + time_before(expiry, rp->c_timestamp)) break; - nfsd_reply_cache_free_locked(b, rp, nn); - if (max && freed++ > max) + + nfsd_cacherep_unlink_locked(nn, b, rp); + list_add(&rp->c_lru, dispose); + + if (max && ++freed > max) break; } - return freed; } -static long nfsd_prune_bucket(struct nfsd_drc_bucket *b, struct nfsd_net *nn) +/** + * nfsd_reply_cache_count - count_objects method for the DRC shrinker + * @shrink: our registered shrinker context + * @sc: garbage collection parameters + * + * Returns the total number of entries in the duplicate reply cache. To + * keep things simple and quick, this is not the number of expired entries + * in the cache (ie, the number that would be removed by a call to + * nfsd_reply_cache_scan). + */ +static unsigned long +nfsd_reply_cache_count(struct shrinker *shrink, struct shrink_control *sc) { - return prune_bucket(b, nn, 3); + struct nfsd_net *nn = shrink->private_data; + + return atomic_read(&nn->num_drc_entries); } -/* - * Walk the LRU list and prune off entries that are older than RC_EXPIRE. - * Also prune the oldest ones when the total exceeds the max number of entries. +/** + * nfsd_reply_cache_scan - scan_objects method for the DRC shrinker + * @shrink: our registered shrinker context + * @sc: garbage collection parameters + * + * Free expired entries on each bucket's LRU list until we've released + * nr_to_scan freed objects. Nothing will be released if the cache + * has not exceeded it's max_drc_entries limit. + * + * Returns the number of entries released by this call. */ -static long -prune_cache_entries(struct nfsd_net *nn) +static unsigned long +nfsd_reply_cache_scan(struct shrinker *shrink, struct shrink_control *sc) { + struct nfsd_net *nn = shrink->private_data; + unsigned long freed = 0; + LIST_HEAD(dispose); unsigned int i; - long freed = 0; for (i = 0; i < nn->drc_hashsize; i++) { struct nfsd_drc_bucket *b = &nn->drc_hashtbl[i]; if (list_empty(&b->lru_head)) continue; + spin_lock(&b->cache_lock); - freed += prune_bucket(b, nn, 0); + nfsd_prune_bucket_locked(nn, b, 0, &dispose); spin_unlock(&b->cache_lock); - } - return freed; -} -static unsigned long -nfsd_reply_cache_count(struct shrinker *shrink, struct shrink_control *sc) -{ - struct nfsd_net *nn = container_of(shrink, - struct nfsd_net, nfsd_reply_cache_shrinker); + freed += nfsd_cacherep_dispose(&dispose); + if (freed > sc->nr_to_scan) + break; + } - return atomic_read(&nn->num_drc_entries); + trace_nfsd_drc_gc(nn, freed); + return freed; } -static unsigned long -nfsd_reply_cache_scan(struct shrinker *shrink, struct shrink_control *sc) -{ - struct nfsd_net *nn = container_of(shrink, - struct nfsd_net, nfsd_reply_cache_shrinker); - - return prune_cache_entries(nn); -} -/* - * Walk an xdr_buf and get a CRC for at most the first RC_CSUMLEN bytes +/** + * nfsd_cache_csum - Checksum incoming NFS Call arguments + * @buf: buffer containing a whole RPC Call message + * @start: starting byte of the NFS Call header + * @remaining: size of the NFS Call header, in bytes + * + * Compute a weak checksum of the leading bytes of an NFS procedure + * call header to help verify that a retransmitted Call matches an + * entry in the duplicate reply cache. + * + * To avoid assumptions about how the RPC message is laid out in + * @buf and what else it might contain (eg, a GSS MIC suffix), the + * caller passes us the exact location and length of the NFS Call + * header. + * + * Returns a 32-bit checksum value, as defined in RFC 793. */ -static __wsum -nfsd_cache_csum(struct svc_rqst *rqstp) +static __wsum nfsd_cache_csum(struct xdr_buf *buf, unsigned int start, + unsigned int remaining) { + unsigned int base, len; + struct xdr_buf subbuf; + __wsum csum = 0; + void *p; int idx; - unsigned int base; - __wsum csum; - struct xdr_buf *buf = &rqstp->rq_arg; - const unsigned char *p = buf->head[0].iov_base; - size_t csum_len = min_t(size_t, buf->head[0].iov_len + buf->page_len, - RC_CSUMLEN); - size_t len = min(buf->head[0].iov_len, csum_len); + + if (remaining > RC_CSUMLEN) + remaining = RC_CSUMLEN; + if (xdr_buf_subsegment(buf, &subbuf, start, remaining)) + return csum; /* rq_arg.head first */ - csum = csum_partial(p, len, 0); - csum_len -= len; + if (subbuf.head[0].iov_len) { + len = min_t(unsigned int, subbuf.head[0].iov_len, remaining); + csum = csum_partial(subbuf.head[0].iov_base, len, csum); + remaining -= len; + } /* Continue into page array */ - idx = buf->page_base / PAGE_SIZE; - base = buf->page_base & ~PAGE_MASK; - while (csum_len) { - p = page_address(buf->pages[idx]) + base; - len = min_t(size_t, PAGE_SIZE - base, csum_len); + idx = subbuf.page_base / PAGE_SIZE; + base = subbuf.page_base & ~PAGE_MASK; + while (remaining) { + p = page_address(subbuf.pages[idx]) + base; + len = min_t(unsigned int, PAGE_SIZE - base, remaining); csum = csum_partial(p, len, csum); - csum_len -= len; + remaining -= len; base = 0; ++idx; } @@ -348,8 +422,8 @@ nfsd_cache_csum(struct svc_rqst *rqstp) } static int -nfsd_cache_key_cmp(const struct svc_cacherep *key, - const struct svc_cacherep *rp, struct nfsd_net *nn) +nfsd_cache_key_cmp(const struct nfsd_cacherep *key, + const struct nfsd_cacherep *rp, struct nfsd_net *nn) { if (key->c_key.k_xid == rp->c_key.k_xid && key->c_key.k_csum != rp->c_key.k_csum) { @@ -365,11 +439,11 @@ nfsd_cache_key_cmp(const struct svc_cacherep *key, * Must be called with cache_lock held. Returns the found entry or * inserts an empty key on failure. */ -static struct svc_cacherep * -nfsd_cache_insert(struct nfsd_drc_bucket *b, struct svc_cacherep *key, +static struct nfsd_cacherep * +nfsd_cache_insert(struct nfsd_drc_bucket *b, struct nfsd_cacherep *key, struct nfsd_net *nn) { - struct svc_cacherep *rp, *ret = key; + struct nfsd_cacherep *rp, *ret = key; struct rb_node **p = &b->rb_head.rb_node, *parent = NULL; unsigned int entries = 0; @@ -378,7 +452,7 @@ nfsd_cache_insert(struct nfsd_drc_bucket *b, struct svc_cacherep *key, while (*p != NULL) { ++entries; parent = *p; - rp = rb_entry(parent, struct svc_cacherep, c_node); + rp = rb_entry(parent, struct nfsd_cacherep, c_node); cmp = nfsd_cache_key_cmp(key, rp, nn); if (cmp < 0) @@ -411,6 +485,9 @@ out: /** * nfsd_cache_lookup - Find an entry in the duplicate reply cache * @rqstp: Incoming Call to find + * @start: starting byte in @rqstp->rq_arg of the NFS Call header + * @len: size of the NFS Call header, in bytes + * @cacherep: OUT: DRC entry for this request * * Try to find an entry matching the current call in the cache. When none * is found, we try to grab the oldest expired entry off the LRU list. If @@ -423,29 +500,31 @@ out: * %RC_REPLY: Reply from cache * %RC_DROPIT: Do not process the request further */ -int nfsd_cache_lookup(struct svc_rqst *rqstp) +int nfsd_cache_lookup(struct svc_rqst *rqstp, unsigned int start, + unsigned int len, struct nfsd_cacherep **cacherep) { struct nfsd_net *nn; - struct svc_cacherep *rp, *found; + struct nfsd_cacherep *rp, *found; __wsum csum; struct nfsd_drc_bucket *b; int type = rqstp->rq_cachetype; + unsigned long freed; + LIST_HEAD(dispose); int rtn = RC_DOIT; - rqstp->rq_cacherep = NULL; if (type == RC_NOCACHE) { nfsd_stats_rc_nocache_inc(); goto out; } - csum = nfsd_cache_csum(rqstp); + csum = nfsd_cache_csum(&rqstp->rq_arg, start, len); /* * Since the common case is a cache miss followed by an insert, * preallocate an entry. */ nn = net_generic(SVC_NET(rqstp), nfsd_net_id); - rp = nfsd_reply_cache_alloc(rqstp, csum, nn); + rp = nfsd_cacherep_alloc(rqstp, csum, nn); if (!rp) goto out; @@ -454,20 +533,18 @@ int nfsd_cache_lookup(struct svc_rqst *rqstp) found = nfsd_cache_insert(b, rp, nn); if (found != rp) goto found_entry; - - nfsd_stats_rc_misses_inc(); - rqstp->rq_cacherep = rp; + *cacherep = rp; rp->c_state = RC_INPROG; + nfsd_prune_bucket_locked(nn, b, 3, &dispose); + spin_unlock(&b->cache_lock); + freed = nfsd_cacherep_dispose(&dispose); + trace_nfsd_drc_gc(nn, freed); + + nfsd_stats_rc_misses_inc(); atomic_inc(&nn->num_drc_entries); nfsd_stats_drc_mem_usage_add(nn, sizeof(*rp)); - - nfsd_prune_bucket(b, nn); - -out_unlock: - spin_unlock(&b->cache_lock); -out: - return rtn; + goto out; found_entry: /* We found a matching entry which is either in progress or done. */ @@ -505,12 +582,16 @@ found_entry: out_trace: trace_nfsd_drc_found(nn, rqstp, rtn); - goto out_unlock; +out_unlock: + spin_unlock(&b->cache_lock); +out: + return rtn; } /** * nfsd_cache_update - Update an entry in the duplicate reply cache. * @rqstp: svc_rqst with a finished Reply + * @rp: IN: DRC entry for this request * @cachetype: which cache to update * @statp: pointer to Reply's NFS status code, or NULL * @@ -528,10 +609,10 @@ out_trace: * nfsd failed to encode a reply that otherwise would have been cached. * In this case, nfsd_cache_update is called with statp == NULL. */ -void nfsd_cache_update(struct svc_rqst *rqstp, int cachetype, __be32 *statp) +void nfsd_cache_update(struct svc_rqst *rqstp, struct nfsd_cacherep *rp, + int cachetype, __be32 *statp) { struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id); - struct svc_cacherep *rp = rqstp->rq_cacherep; struct kvec *resv = &rqstp->rq_res.head[0], *cachv; struct nfsd_drc_bucket *b; int len; @@ -582,24 +663,17 @@ void nfsd_cache_update(struct svc_rqst *rqstp, int cachetype, __be32 *statp) return; } -/* - * Copy cached reply to current reply buffer. Should always fit. - * FIXME as reply is in a page, we should just attach the page, and - * keep a refcount.... - */ static int nfsd_cache_append(struct svc_rqst *rqstp, struct kvec *data) { - struct kvec *vec = &rqstp->rq_res.head[0]; - - if (vec->iov_len + data->iov_len > PAGE_SIZE) { - printk(KERN_WARNING "nfsd: cached reply too large (%zd).\n", - data->iov_len); - return 0; - } - memcpy((char*)vec->iov_base + vec->iov_len, data->iov_base, data->iov_len); - vec->iov_len += data->iov_len; - return 1; + __be32 *p; + + p = xdr_reserve_space(&rqstp->rq_res_stream, data->iov_len); + if (unlikely(!p)) + return false; + memcpy(p, data->iov_base, data->iov_len); + xdr_commit_encode(&rqstp->rq_res_stream); + return true; } /* diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index 4302ca0ff6ed..7cd513e59305 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -26,6 +26,7 @@ #include "pnfs.h" #include "filecache.h" #include "trace.h" +#include "netlink.h" /* * We have a single directory with several nodes in it. @@ -704,8 +705,10 @@ static ssize_t __write_ports_addfd(char *buf, struct net *net, const struct cred err = svc_addsock(nn->nfsd_serv, net, fd, buf, SIMPLE_TRANSACTION_LIMIT, cred); - if (err >= 0 && - !nn->nfsd_serv->sv_nrthreads && !xchg(&nn->keep_active, 1)) + if (err < 0 && !nn->nfsd_serv->sv_nrthreads && !nn->keep_active) + nfsd_last_thread(net); + else if (err >= 0 && + !nn->nfsd_serv->sv_nrthreads && !xchg(&nn->keep_active, 1)) svc_get(nn->nfsd_serv); nfsd_put(net); @@ -756,6 +759,9 @@ out_close: svc_xprt_put(xprt); } out_err: + if (!nn->nfsd_serv->sv_nrthreads && !nn->keep_active) + nfsd_last_thread(net); + nfsd_put(net); return err; } @@ -1132,7 +1138,7 @@ static struct inode *nfsd_get_inode(struct super_block *sb, umode_t mode) /* Following advice from simple_fill_super documentation: */ inode->i_ino = iunique(sb, NFSD_MaxReserved); inode->i_mode = mode; - inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode); + simple_inode_init_ts(inode); switch (mode & S_IFMT) { case S_IFDIR: inode->i_fop = &simple_dir_operations; @@ -1496,6 +1502,200 @@ static int create_proc_exports_entry(void) unsigned int nfsd_net_id; /** + * nfsd_nl_rpc_status_get_start - Prepare rpc_status_get dumpit + * @cb: netlink metadata and command arguments + * + * Return values: + * %0: The rpc_status_get command may proceed + * %-ENODEV: There is no NFSD running in this namespace + */ +int nfsd_nl_rpc_status_get_start(struct netlink_callback *cb) +{ + struct nfsd_net *nn = net_generic(sock_net(cb->skb->sk), nfsd_net_id); + int ret = -ENODEV; + + mutex_lock(&nfsd_mutex); + if (nn->nfsd_serv) + ret = 0; + else + mutex_unlock(&nfsd_mutex); + + return ret; +} + +static int nfsd_genl_rpc_status_compose_msg(struct sk_buff *skb, + struct netlink_callback *cb, + struct nfsd_genl_rqstp *rqstp) +{ + void *hdr; + u32 i; + + hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, + &nfsd_nl_family, 0, NFSD_CMD_RPC_STATUS_GET); + if (!hdr) + return -ENOBUFS; + + if (nla_put_be32(skb, NFSD_A_RPC_STATUS_XID, rqstp->rq_xid) || + nla_put_u32(skb, NFSD_A_RPC_STATUS_FLAGS, rqstp->rq_flags) || + nla_put_u32(skb, NFSD_A_RPC_STATUS_PROG, rqstp->rq_prog) || + nla_put_u32(skb, NFSD_A_RPC_STATUS_PROC, rqstp->rq_proc) || + nla_put_u8(skb, NFSD_A_RPC_STATUS_VERSION, rqstp->rq_vers) || + nla_put_s64(skb, NFSD_A_RPC_STATUS_SERVICE_TIME, + ktime_to_us(rqstp->rq_stime), + NFSD_A_RPC_STATUS_PAD)) + return -ENOBUFS; + + switch (rqstp->rq_saddr.sa_family) { + case AF_INET: { + const struct sockaddr_in *s_in, *d_in; + + s_in = (const struct sockaddr_in *)&rqstp->rq_saddr; + d_in = (const struct sockaddr_in *)&rqstp->rq_daddr; + if (nla_put_in_addr(skb, NFSD_A_RPC_STATUS_SADDR4, + s_in->sin_addr.s_addr) || + nla_put_in_addr(skb, NFSD_A_RPC_STATUS_DADDR4, + d_in->sin_addr.s_addr) || + nla_put_be16(skb, NFSD_A_RPC_STATUS_SPORT, + s_in->sin_port) || + nla_put_be16(skb, NFSD_A_RPC_STATUS_DPORT, + d_in->sin_port)) + return -ENOBUFS; + break; + } + case AF_INET6: { + const struct sockaddr_in6 *s_in, *d_in; + + s_in = (const struct sockaddr_in6 *)&rqstp->rq_saddr; + d_in = (const struct sockaddr_in6 *)&rqstp->rq_daddr; + if (nla_put_in6_addr(skb, NFSD_A_RPC_STATUS_SADDR6, + &s_in->sin6_addr) || + nla_put_in6_addr(skb, NFSD_A_RPC_STATUS_DADDR6, + &d_in->sin6_addr) || + nla_put_be16(skb, NFSD_A_RPC_STATUS_SPORT, + s_in->sin6_port) || + nla_put_be16(skb, NFSD_A_RPC_STATUS_DPORT, + d_in->sin6_port)) + return -ENOBUFS; + break; + } + } + + for (i = 0; i < rqstp->rq_opcnt; i++) + if (nla_put_u32(skb, NFSD_A_RPC_STATUS_COMPOUND_OPS, + rqstp->rq_opnum[i])) + return -ENOBUFS; + + genlmsg_end(skb, hdr); + return 0; +} + +/** + * nfsd_nl_rpc_status_get_dumpit - Handle rpc_status_get dumpit + * @skb: reply buffer + * @cb: netlink metadata and command arguments + * + * Returns the size of the reply or a negative errno. + */ +int nfsd_nl_rpc_status_get_dumpit(struct sk_buff *skb, + struct netlink_callback *cb) +{ + struct nfsd_net *nn = net_generic(sock_net(skb->sk), nfsd_net_id); + int i, ret, rqstp_index = 0; + + rcu_read_lock(); + + for (i = 0; i < nn->nfsd_serv->sv_nrpools; i++) { + struct svc_rqst *rqstp; + + if (i < cb->args[0]) /* already consumed */ + continue; + + rqstp_index = 0; + list_for_each_entry_rcu(rqstp, + &nn->nfsd_serv->sv_pools[i].sp_all_threads, + rq_all) { + struct nfsd_genl_rqstp genl_rqstp; + unsigned int status_counter; + + if (rqstp_index++ < cb->args[1]) /* already consumed */ + continue; + /* + * Acquire rq_status_counter before parsing the rqst + * fields. rq_status_counter is set to an odd value in + * order to notify the consumers the rqstp fields are + * meaningful. + */ + status_counter = + smp_load_acquire(&rqstp->rq_status_counter); + if (!(status_counter & 1)) + continue; + + genl_rqstp.rq_xid = rqstp->rq_xid; + genl_rqstp.rq_flags = rqstp->rq_flags; + genl_rqstp.rq_vers = rqstp->rq_vers; + genl_rqstp.rq_prog = rqstp->rq_prog; + genl_rqstp.rq_proc = rqstp->rq_proc; + genl_rqstp.rq_stime = rqstp->rq_stime; + genl_rqstp.rq_opcnt = 0; + memcpy(&genl_rqstp.rq_daddr, svc_daddr(rqstp), + sizeof(struct sockaddr)); + memcpy(&genl_rqstp.rq_saddr, svc_addr(rqstp), + sizeof(struct sockaddr)); + +#ifdef CONFIG_NFSD_V4 + if (rqstp->rq_vers == NFS4_VERSION && + rqstp->rq_proc == NFSPROC4_COMPOUND) { + /* NFSv4 compound */ + struct nfsd4_compoundargs *args; + int j; + + args = rqstp->rq_argp; + genl_rqstp.rq_opcnt = args->opcnt; + for (j = 0; j < genl_rqstp.rq_opcnt; j++) + genl_rqstp.rq_opnum[j] = + args->ops[j].opnum; + } +#endif /* CONFIG_NFSD_V4 */ + + /* + * Acquire rq_status_counter before reporting the rqst + * fields to the user. + */ + if (smp_load_acquire(&rqstp->rq_status_counter) != + status_counter) + continue; + + ret = nfsd_genl_rpc_status_compose_msg(skb, cb, + &genl_rqstp); + if (ret) + goto out; + } + } + + cb->args[0] = i; + cb->args[1] = rqstp_index; + ret = skb->len; +out: + rcu_read_unlock(); + + return ret; +} + +/** + * nfsd_nl_rpc_status_get_done - rpc_status_get dumpit post-processing + * @cb: netlink metadata and command arguments + * + * Return values: + * %0: Success + */ +int nfsd_nl_rpc_status_get_done(struct netlink_callback *cb) +{ + mutex_unlock(&nfsd_mutex); + + return 0; +} + +/** * nfsd_net_init - Prepare the nfsd_net portion of a new net namespace * @net: a freshly-created network namespace * @@ -1589,6 +1789,10 @@ static int __init init_nfsd(void) retval = register_filesystem(&nfsd_fs_type); if (retval) goto out_free_all; + retval = genl_register_family(&nfsd_nl_family); + if (retval) + goto out_free_all; + return 0; out_free_all: nfsd4_destroy_laundry_wq(); @@ -1613,6 +1817,7 @@ out_free_slabs: static void __exit exit_nfsd(void) { + genl_unregister_family(&nfsd_nl_family); unregister_filesystem(&nfsd_fs_type); nfsd4_destroy_laundry_wq(); unregister_cld_notifier(); @@ -1627,6 +1832,7 @@ static void __exit exit_nfsd(void) } MODULE_AUTHOR("Olaf Kirch <okir@monad.swb.de>"); +MODULE_DESCRIPTION("In-kernel NFS server"); MODULE_LICENSE("GPL"); module_init(init_nfsd) module_exit(exit_nfsd) diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h index d88498f8b275..3286ffacbc56 100644 --- a/fs/nfsd/nfsd.h +++ b/fs/nfsd/nfsd.h @@ -62,6 +62,23 @@ struct readdir_cd { __be32 err; /* 0, nfserr, or nfserr_eof */ }; +/* Maximum number of operations per session compound */ +#define NFSD_MAX_OPS_PER_COMPOUND 50 + +struct nfsd_genl_rqstp { + struct sockaddr rq_daddr; + struct sockaddr rq_saddr; + unsigned long rq_flags; + ktime_t rq_stime; + __be32 rq_xid; + u32 rq_vers; + u32 rq_prog; + u32 rq_proc; + + /* NFSv4 compound */ + u32 rq_opcnt; + u32 rq_opnum[NFSD_MAX_OPS_PER_COMPOUND]; +}; extern struct svc_program nfsd_program; extern const struct svc_version nfsd_version2, nfsd_version3, nfsd_version4; @@ -96,7 +113,12 @@ int nfsd_pool_stats_open(struct inode *, struct file *); int nfsd_pool_stats_release(struct inode *, struct file *); void nfsd_shutdown_threads(struct net *net); -void nfsd_put(struct net *net); +static inline void nfsd_put(struct net *net) +{ + struct nfsd_net *nn = net_generic(net, nfsd_net_id); + + svc_put(nn->nfsd_serv); +} bool i_am_nfsd(void); @@ -133,6 +155,7 @@ int nfsd_vers(struct nfsd_net *nn, int vers, enum vers_op change); int nfsd_minorversion(struct nfsd_net *nn, u32 minorversion, enum vers_op change); void nfsd_reset_versions(struct nfsd_net *nn); int nfsd_create_serv(struct net *net); +void nfsd_last_thread(struct net *net); extern int nfsd_max_blksize; diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c index c291389a1d71..dbfa0ac13564 100644 --- a/fs/nfsd/nfsfh.c +++ b/fs/nfsd/nfsfh.c @@ -614,7 +614,7 @@ out_negative: * @fhp: file handle to be updated * */ -void fh_fill_pre_attrs(struct svc_fh *fhp) +__be32 __must_check fh_fill_pre_attrs(struct svc_fh *fhp) { bool v4 = (fhp->fh_maxsize == NFS4_FHSIZE); struct inode *inode; @@ -622,12 +622,12 @@ void fh_fill_pre_attrs(struct svc_fh *fhp) __be32 err; if (fhp->fh_no_wcc || fhp->fh_pre_saved) - return; + return nfs_ok; inode = d_inode(fhp->fh_dentry); err = fh_getattr(fhp, &stat); if (err) - return; + return err; if (v4) fhp->fh_pre_change = nfsd4_change_attribute(&stat, inode); @@ -636,6 +636,7 @@ void fh_fill_pre_attrs(struct svc_fh *fhp) fhp->fh_pre_ctime = stat.ctime; fhp->fh_pre_size = stat.size; fhp->fh_pre_saved = true; + return nfs_ok; } /** @@ -643,26 +644,27 @@ void fh_fill_pre_attrs(struct svc_fh *fhp) * @fhp: file handle to be updated * */ -void fh_fill_post_attrs(struct svc_fh *fhp) +__be32 fh_fill_post_attrs(struct svc_fh *fhp) { bool v4 = (fhp->fh_maxsize == NFS4_FHSIZE); struct inode *inode = d_inode(fhp->fh_dentry); __be32 err; if (fhp->fh_no_wcc) - return; + return nfs_ok; if (fhp->fh_post_saved) printk("nfsd: inode locked twice during operation.\n"); err = fh_getattr(fhp, &fhp->fh_post_attr); if (err) - return; + return err; fhp->fh_post_saved = true; if (v4) fhp->fh_post_change = nfsd4_change_attribute(&fhp->fh_post_attr, inode); + return nfs_ok; } /** @@ -672,16 +674,20 @@ void fh_fill_post_attrs(struct svc_fh *fhp) * This is used when the directory wasn't changed, but wcc attributes * are needed anyway. */ -void fh_fill_both_attrs(struct svc_fh *fhp) +__be32 __must_check fh_fill_both_attrs(struct svc_fh *fhp) { - fh_fill_post_attrs(fhp); - if (!fhp->fh_post_saved) - return; + __be32 err; + + err = fh_fill_post_attrs(fhp); + if (err) + return err; + fhp->fh_pre_change = fhp->fh_post_change; fhp->fh_pre_mtime = fhp->fh_post_attr.mtime; fhp->fh_pre_ctime = fhp->fh_post_attr.ctime; fhp->fh_pre_size = fhp->fh_post_attr.size; fhp->fh_pre_saved = true; + return nfs_ok; } /* @@ -765,7 +771,7 @@ enum fsid_source fsid_source(const struct svc_fh *fhp) * assume that the new change attr is always logged to stable storage in some * fashion before the results can be seen. */ -u64 nfsd4_change_attribute(struct kstat *stat, struct inode *inode) +u64 nfsd4_change_attribute(const struct kstat *stat, const struct inode *inode) { u64 chattr; diff --git a/fs/nfsd/nfsfh.h b/fs/nfsd/nfsfh.h index 4e0ecf0ae2cf..6ebdf7ea27bf 100644 --- a/fs/nfsd/nfsfh.h +++ b/fs/nfsd/nfsfh.h @@ -293,8 +293,9 @@ static inline void fh_clear_pre_post_attrs(struct svc_fh *fhp) fhp->fh_pre_saved = false; } -u64 nfsd4_change_attribute(struct kstat *stat, struct inode *inode); -extern void fh_fill_pre_attrs(struct svc_fh *fhp); -extern void fh_fill_post_attrs(struct svc_fh *fhp); -extern void fh_fill_both_attrs(struct svc_fh *fhp); +u64 nfsd4_change_attribute(const struct kstat *stat, + const struct inode *inode); +__be32 __must_check fh_fill_pre_attrs(struct svc_fh *fhp); +__be32 fh_fill_post_attrs(struct svc_fh *fhp); +__be32 __must_check fh_fill_both_attrs(struct svc_fh *fhp); #endif /* _LINUX_NFSD_NFSFH_H */ diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index 2154fa63c5f2..7a2bc8e82a63 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -542,9 +542,14 @@ static struct notifier_block nfsd_inet6addr_notifier = { /* Only used under nfsd_mutex, so this atomic may be overkill: */ static atomic_t nfsd_notifier_refcount = ATOMIC_INIT(0); -static void nfsd_last_thread(struct svc_serv *serv, struct net *net) +void nfsd_last_thread(struct net *net) { struct nfsd_net *nn = net_generic(net, nfsd_net_id); + struct svc_serv *serv = nn->nfsd_serv; + + spin_lock(&nfsd_notifier_lock); + nn->nfsd_serv = NULL; + spin_unlock(&nfsd_notifier_lock); /* check if the notifier still has clients */ if (atomic_dec_return(&nfsd_notifier_refcount) == 0) { @@ -554,6 +559,8 @@ static void nfsd_last_thread(struct svc_serv *serv, struct net *net) #endif } + svc_xprt_destroy_all(serv, net); + /* * write_ports can create the server without actually starting * any threads--if we get shut down before any threads are @@ -565,7 +572,6 @@ static void nfsd_last_thread(struct svc_serv *serv, struct net *net) return; nfsd_shutdown_net(net); - pr_info("nfsd: last server has exited, flushing export cache\n"); nfsd_export_flush(net); } @@ -644,7 +650,8 @@ void nfsd_shutdown_threads(struct net *net) svc_get(serv); /* Kill outstanding nfsd threads */ svc_set_num_threads(serv, NULL, 0); - nfsd_put(net); + nfsd_last_thread(net); + svc_put(serv); mutex_unlock(&nfsd_mutex); } @@ -674,9 +681,6 @@ int nfsd_create_serv(struct net *net) serv->sv_maxconn = nn->max_connections; error = svc_bind(serv, net); if (error < 0) { - /* NOT nfsd_put() as notifiers (see below) haven't - * been set up yet. - */ svc_put(serv); return error; } @@ -708,40 +712,16 @@ int nfsd_nrpools(struct net *net) int nfsd_get_nrthreads(int n, int *nthreads, struct net *net) { - int i = 0; struct nfsd_net *nn = net_generic(net, nfsd_net_id); + struct svc_serv *serv = nn->nfsd_serv; + int i; - if (nn->nfsd_serv != NULL) { - for (i = 0; i < nn->nfsd_serv->sv_nrpools && i < n; i++) - nthreads[i] = nn->nfsd_serv->sv_pools[i].sp_nrthreads; - } - + if (serv) + for (i = 0; i < serv->sv_nrpools && i < n; i++) + nthreads[i] = atomic_read(&serv->sv_pools[i].sp_nrthreads); return 0; } -/* This is the callback for kref_put() below. - * There is no code here as the first thing to be done is - * call svc_shutdown_net(), but we cannot get the 'net' from - * the kref. So do all the work when kref_put returns true. - */ -static void nfsd_noop(struct kref *ref) -{ -} - -void nfsd_put(struct net *net) -{ - struct nfsd_net *nn = net_generic(net, nfsd_net_id); - - if (kref_put(&nn->nfsd_serv->sv_refcnt, nfsd_noop)) { - svc_xprt_destroy_all(nn->nfsd_serv, net); - nfsd_last_thread(nn->nfsd_serv, net); - svc_destroy(&nn->nfsd_serv->sv_refcnt); - spin_lock(&nfsd_notifier_lock); - nn->nfsd_serv = NULL; - spin_unlock(&nfsd_notifier_lock); - } -} - int nfsd_set_nrthreads(int n, int *nthreads, struct net *net) { int i = 0; @@ -792,7 +772,7 @@ int nfsd_set_nrthreads(int n, int *nthreads, struct net *net) if (err) break; } - nfsd_put(net); + svc_put(nn->nfsd_serv); return err; } @@ -805,8 +785,8 @@ int nfsd_svc(int nrservs, struct net *net, const struct cred *cred) { int error; - bool nfsd_up_before; struct nfsd_net *nn = net_generic(net, nfsd_net_id); + struct svc_serv *serv; mutex_lock(&nfsd_mutex); dprintk("nfsd: creating service\n"); @@ -824,24 +804,23 @@ nfsd_svc(int nrservs, struct net *net, const struct cred *cred) error = nfsd_create_serv(net); if (error) goto out; - - nfsd_up_before = nn->nfsd_net_up; + serv = nn->nfsd_serv; error = nfsd_startup_net(net, cred); if (error) goto out_put; - error = svc_set_num_threads(nn->nfsd_serv, NULL, nrservs); + error = svc_set_num_threads(serv, NULL, nrservs); if (error) - goto out_shutdown; - error = nn->nfsd_serv->sv_nrthreads; -out_shutdown: - if (error < 0 && !nfsd_up_before) - nfsd_shutdown_net(net); + goto out_put; + error = serv->sv_nrthreads; out_put: /* Threads now hold service active */ if (xchg(&nn->keep_active, 0)) - nfsd_put(net); - nfsd_put(net); + svc_put(serv); + + if (serv->sv_nrthreads == 0) + nfsd_last_thread(net); + svc_put(serv); out: mutex_unlock(&nfsd_mutex); return error; @@ -953,7 +932,6 @@ nfsd(void *vrqstp) struct svc_xprt *perm_sock = list_entry(rqstp->rq_server->sv_permsocks.next, typeof(struct svc_xprt), xpt_list); struct net *net = perm_sock->xpt_net; struct nfsd_net *nn = net_generic(net, nfsd_net_id); - int err; /* At this point, the thread shares current->fs * with the init process. We need to create files with the @@ -965,15 +943,6 @@ nfsd(void *vrqstp) current->fs->umask = 0; - /* - * thread is spawned with all signals set to SIG_IGN, re-enable - * the ones that will bring down the thread - */ - allow_signal(SIGKILL); - allow_signal(SIGHUP); - allow_signal(SIGINT); - allow_signal(SIGQUIT); - atomic_inc(&nfsdstats.th_cnt); set_freezable(); @@ -981,54 +950,18 @@ nfsd(void *vrqstp) /* * The main request loop */ - for (;;) { + while (!svc_thread_should_stop(rqstp)) { /* Update sv_maxconn if it has changed */ rqstp->rq_server->sv_maxconn = nn->max_connections; - /* - * Find a socket with data available and call its - * recvfrom routine. - */ - while ((err = svc_recv(rqstp, 60*60*HZ)) == -EAGAIN) - ; - if (err == -EINTR) - break; - validate_process_creds(); - svc_process(rqstp); - validate_process_creds(); + svc_recv(rqstp); } - /* Clear signals before calling svc_exit_thread() */ - flush_signals(current); - atomic_dec(&nfsdstats.th_cnt); out: - /* Take an extra ref so that the svc_put in svc_exit_thread() - * doesn't call svc_destroy() - */ - svc_get(nn->nfsd_serv); - /* Release the thread */ svc_exit_thread(rqstp); - - /* We need to drop a ref, but may not drop the last reference - * without holding nfsd_mutex, and we cannot wait for nfsd_mutex as that - * could deadlock with nfsd_shutdown_threads() waiting for us. - * So three options are: - * - drop a non-final reference, - * - get the mutex without waiting - * - sleep briefly andd try the above again - */ - while (!svc_put_not_last(nn->nfsd_serv)) { - if (mutex_trylock(&nfsd_mutex)) { - nfsd_put(net); - mutex_unlock(&nfsd_mutex); - break; - } - msleep(20); - } - return 0; } @@ -1046,6 +979,9 @@ int nfsd_dispatch(struct svc_rqst *rqstp) { const struct svc_procedure *proc = rqstp->rq_procinfo; __be32 *statp = rqstp->rq_accept_statp; + struct nfsd_cacherep *rp; + unsigned int start, len; + __be32 *nfs_reply; /* * Give the xdr decoder a chance to change this if it wants @@ -1053,10 +989,27 @@ int nfsd_dispatch(struct svc_rqst *rqstp) */ rqstp->rq_cachetype = proc->pc_cachetype; + /* + * ->pc_decode advances the argument stream past the NFS + * Call header, so grab the header's starting location and + * size now for the call to nfsd_cache_lookup(). + */ + start = xdr_stream_pos(&rqstp->rq_arg_stream); + len = xdr_stream_remaining(&rqstp->rq_arg_stream); if (!proc->pc_decode(rqstp, &rqstp->rq_arg_stream)) goto out_decode_err; - switch (nfsd_cache_lookup(rqstp)) { + /* + * Release rq_status_counter setting it to an odd value after the rpc + * request has been properly parsed. rq_status_counter is used to + * notify the consumers if the rqstp fields are stable + * (rq_status_counter is odd) or not meaningful (rq_status_counter + * is even). + */ + smp_store_release(&rqstp->rq_status_counter, rqstp->rq_status_counter | 1); + + rp = NULL; + switch (nfsd_cache_lookup(rqstp, start, len, &rp)) { case RC_DOIT: break; case RC_REPLY: @@ -1065,6 +1018,7 @@ int nfsd_dispatch(struct svc_rqst *rqstp) goto out_dropit; } + nfs_reply = xdr_inline_decode(&rqstp->rq_res_stream, 0); *statp = proc->pc_func(rqstp); if (test_bit(RQ_DROPME, &rqstp->rq_flags)) goto out_update_drop; @@ -1072,7 +1026,13 @@ int nfsd_dispatch(struct svc_rqst *rqstp) if (!proc->pc_encode(rqstp, &rqstp->rq_res_stream)) goto out_encode_err; - nfsd_cache_update(rqstp, rqstp->rq_cachetype, statp + 1); + /* + * Release rq_status_counter setting it to an even value after the rpc + * request has been properly processed. + */ + smp_store_release(&rqstp->rq_status_counter, rqstp->rq_status_counter + 1); + + nfsd_cache_update(rqstp, rp, rqstp->rq_cachetype, nfs_reply); out_cached_reply: return 1; @@ -1082,13 +1042,13 @@ out_decode_err: return 1; out_update_drop: - nfsd_cache_update(rqstp, RC_NOCACHE, NULL); + nfsd_cache_update(rqstp, rp, RC_NOCACHE, NULL); out_dropit: return 0; out_encode_err: trace_nfsd_cant_encode_err(rqstp); - nfsd_cache_update(rqstp, RC_NOCACHE, NULL); + nfsd_cache_update(rqstp, rp, RC_NOCACHE, NULL); *statp = rpc_system_err; return 1; } @@ -1139,11 +1099,12 @@ int nfsd_pool_stats_open(struct inode *inode, struct file *file) int nfsd_pool_stats_release(struct inode *inode, struct file *file) { + struct seq_file *seq = file->private_data; + struct svc_serv *serv = seq->private; int ret = seq_release(inode, file); - struct net *net = inode->i_sb->s_fs_info; mutex_lock(&nfsd_mutex); - nfsd_put(net); + svc_put(serv); mutex_unlock(&nfsd_mutex); return ret; } diff --git a/fs/nfsd/pnfs.h b/fs/nfsd/pnfs.h index 4f4282d4eeca..de1e0dfed06a 100644 --- a/fs/nfsd/pnfs.h +++ b/fs/nfsd/pnfs.h @@ -27,12 +27,12 @@ struct nfsd4_layout_ops { struct nfs4_client *clp, struct nfsd4_getdeviceinfo *gdevp); __be32 (*encode_getdeviceinfo)(struct xdr_stream *xdr, - struct nfsd4_getdeviceinfo *gdevp); + const struct nfsd4_getdeviceinfo *gdevp); __be32 (*proc_layoutget)(struct inode *, const struct svc_fh *fhp, struct nfsd4_layoutget *lgp); - __be32 (*encode_layoutget)(struct xdr_stream *, - struct nfsd4_layoutget *lgp); + __be32 (*encode_layoutget)(struct xdr_stream *xdr, + const struct nfsd4_layoutget *lgp); __be32 (*proc_layoutcommit)(struct inode *inode, struct nfsd4_layoutcommit *lcp); diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h index d49d3060ed4f..41bdc913fa71 100644 --- a/fs/nfsd/state.h +++ b/fs/nfsd/state.h @@ -174,8 +174,6 @@ static inline struct nfs4_delegation *delegstateid(struct nfs4_stid *s) /* Maximum number of slots per session. 160 is useful for long haul TCP */ #define NFSD_MAX_SLOTS_PER_SESSION 160 -/* Maximum number of operations per session compound */ -#define NFSD_MAX_OPS_PER_COMPOUND 50 /* Maximum session per slot cache size */ #define NFSD_SLOT_CACHE_SIZE 2048 /* Maximum number of NFSD_SLOT_CACHE_SIZE slots per session */ @@ -732,4 +730,7 @@ static inline bool try_to_expire_client(struct nfs4_client *clp) cmpxchg(&clp->cl_state, NFSD4_COURTESY, NFSD4_EXPIRABLE); return clp->cl_state == NFSD4_EXPIRABLE; } + +extern __be32 nfsd4_deleg_getattr_conflict(struct svc_rqst *rqstp, + struct inode *inode); #endif /* NFSD4_STATE_H */ diff --git a/fs/nfsd/stats.c b/fs/nfsd/stats.c index 777e24e5da33..12d79f5d4eb1 100644 --- a/fs/nfsd/stats.c +++ b/fs/nfsd/stats.c @@ -60,11 +60,13 @@ static int nfsd_show(struct seq_file *seq, void *v) #ifdef CONFIG_NFSD_V4 /* Show count for individual nfsv4 operations */ /* Writing operation numbers 0 1 2 also for maintaining uniformity */ - seq_printf(seq,"proc4ops %u", LAST_NFS4_OP + 1); + seq_printf(seq, "proc4ops %u", LAST_NFS4_OP + 1); for (i = 0; i <= LAST_NFS4_OP; i++) { seq_printf(seq, " %lld", percpu_counter_sum_positive(&nfsdstats.counter[NFSD_STATS_NFS4_OP(i)])); } + seq_printf(seq, "\nwdeleg_getattr %lld", + percpu_counter_sum_positive(&nfsdstats.counter[NFSD_STATS_WDELEG_GETATTR])); seq_putc(seq, '\n'); #endif @@ -74,7 +76,7 @@ static int nfsd_show(struct seq_file *seq, void *v) DEFINE_PROC_SHOW_ATTRIBUTE(nfsd); -int nfsd_percpu_counters_init(struct percpu_counter counters[], int num) +int nfsd_percpu_counters_init(struct percpu_counter *counters, int num) { int i, err = 0; diff --git a/fs/nfsd/stats.h b/fs/nfsd/stats.h index 9b43dc3d9991..14f50c660b61 100644 --- a/fs/nfsd/stats.h +++ b/fs/nfsd/stats.h @@ -22,6 +22,7 @@ enum { NFSD_STATS_FIRST_NFS4_OP, /* count of individual nfsv4 operations */ NFSD_STATS_LAST_NFS4_OP = NFSD_STATS_FIRST_NFS4_OP + LAST_NFS4_OP, #define NFSD_STATS_NFS4_OP(op) (NFSD_STATS_FIRST_NFS4_OP + (op)) + NFSD_STATS_WDELEG_GETATTR, /* count of getattr conflict with wdeleg */ #endif NFSD_STATS_COUNTERS_NUM }; @@ -36,9 +37,9 @@ extern struct nfsd_stats nfsdstats; extern struct svc_stat nfsd_svcstats; -int nfsd_percpu_counters_init(struct percpu_counter counters[], int num); -void nfsd_percpu_counters_reset(struct percpu_counter counters[], int num); -void nfsd_percpu_counters_destroy(struct percpu_counter counters[], int num); +int nfsd_percpu_counters_init(struct percpu_counter *counters, int num); +void nfsd_percpu_counters_reset(struct percpu_counter *counters, int num); +void nfsd_percpu_counters_destroy(struct percpu_counter *counters, int num); int nfsd_stat_init(void); void nfsd_stat_shutdown(void); @@ -60,22 +61,22 @@ static inline void nfsd_stats_rc_nocache_inc(void) static inline void nfsd_stats_fh_stale_inc(struct svc_export *exp) { percpu_counter_inc(&nfsdstats.counter[NFSD_STATS_FH_STALE]); - if (exp) - percpu_counter_inc(&exp->ex_stats.counter[EXP_STATS_FH_STALE]); + if (exp && exp->ex_stats) + percpu_counter_inc(&exp->ex_stats->counter[EXP_STATS_FH_STALE]); } static inline void nfsd_stats_io_read_add(struct svc_export *exp, s64 amount) { percpu_counter_add(&nfsdstats.counter[NFSD_STATS_IO_READ], amount); - if (exp) - percpu_counter_add(&exp->ex_stats.counter[EXP_STATS_IO_READ], amount); + if (exp && exp->ex_stats) + percpu_counter_add(&exp->ex_stats->counter[EXP_STATS_IO_READ], amount); } static inline void nfsd_stats_io_write_add(struct svc_export *exp, s64 amount) { percpu_counter_add(&nfsdstats.counter[NFSD_STATS_IO_WRITE], amount); - if (exp) - percpu_counter_add(&exp->ex_stats.counter[EXP_STATS_IO_WRITE], amount); + if (exp && exp->ex_stats) + percpu_counter_add(&exp->ex_stats->counter[EXP_STATS_IO_WRITE], amount); } static inline void nfsd_stats_payload_misses_inc(struct nfsd_net *nn) @@ -93,4 +94,10 @@ static inline void nfsd_stats_drc_mem_usage_sub(struct nfsd_net *nn, s64 amount) percpu_counter_sub(&nn->counter[NFSD_NET_DRC_MEM_USAGE], amount); } +#ifdef CONFIG_NFSD_V4 +static inline void nfsd_stats_wdeleg_getattr_inc(void) +{ + percpu_counter_inc(&nfsdstats.counter[NFSD_STATS_WDELEG_GETATTR]); +} +#endif #endif /* _NFSD_STATS_H */ diff --git a/fs/nfsd/trace.h b/fs/nfsd/trace.h index 2af74983f146..fbc0ccb40424 100644 --- a/fs/nfsd/trace.h +++ b/fs/nfsd/trace.h @@ -607,6 +607,7 @@ DEFINE_STATEID_EVENT(layout_recall_release); DEFINE_STATEID_EVENT(open); DEFINE_STATEID_EVENT(deleg_read); +DEFINE_STATEID_EVENT(deleg_write); DEFINE_STATEID_EVENT(deleg_return); DEFINE_STATEID_EVENT(deleg_recall); @@ -1240,8 +1241,8 @@ TRACE_EVENT(nfsd_drc_found, TRACE_EVENT(nfsd_drc_mismatch, TP_PROTO( const struct nfsd_net *nn, - const struct svc_cacherep *key, - const struct svc_cacherep *rp + const struct nfsd_cacherep *key, + const struct nfsd_cacherep *rp ), TP_ARGS(nn, key, rp), TP_STRUCT__entry( @@ -1261,6 +1262,28 @@ TRACE_EVENT(nfsd_drc_mismatch, __entry->ingress) ); +TRACE_EVENT_CONDITION(nfsd_drc_gc, + TP_PROTO( + const struct nfsd_net *nn, + unsigned long freed + ), + TP_ARGS(nn, freed), + TP_CONDITION(freed > 0), + TP_STRUCT__entry( + __field(unsigned long long, boot_time) + __field(unsigned long, freed) + __field(int, total) + ), + TP_fast_assign( + __entry->boot_time = nn->boot_time; + __entry->freed = freed; + __entry->total = atomic_read(&nn->num_drc_entries); + ), + TP_printk("boot_time=%16llx total=%d freed=%lu", + __entry->boot_time, __entry->total, __entry->freed + ) +); + TRACE_EVENT(nfsd_cb_args, TP_PROTO( const struct nfs4_client *clp, @@ -1840,6 +1863,93 @@ TRACE_EVENT(nfsd_end_grace, ) ); +DECLARE_EVENT_CLASS(nfsd_copy_class, + TP_PROTO( + const struct nfsd4_copy *copy + ), + TP_ARGS(copy), + TP_STRUCT__entry( + __field(bool, intra) + __field(bool, async) + __field(u32, src_cl_boot) + __field(u32, src_cl_id) + __field(u32, src_so_id) + __field(u32, src_si_generation) + __field(u32, dst_cl_boot) + __field(u32, dst_cl_id) + __field(u32, dst_so_id) + __field(u32, dst_si_generation) + __field(u64, src_cp_pos) + __field(u64, dst_cp_pos) + __field(u64, cp_count) + __sockaddr(addr, sizeof(struct sockaddr_in6)) + ), + TP_fast_assign( + const stateid_t *src_stp = ©->cp_src_stateid; + const stateid_t *dst_stp = ©->cp_dst_stateid; + + __entry->intra = test_bit(NFSD4_COPY_F_INTRA, ©->cp_flags); + __entry->async = !test_bit(NFSD4_COPY_F_SYNCHRONOUS, ©->cp_flags); + __entry->src_cl_boot = src_stp->si_opaque.so_clid.cl_boot; + __entry->src_cl_id = src_stp->si_opaque.so_clid.cl_id; + __entry->src_so_id = src_stp->si_opaque.so_id; + __entry->src_si_generation = src_stp->si_generation; + __entry->dst_cl_boot = dst_stp->si_opaque.so_clid.cl_boot; + __entry->dst_cl_id = dst_stp->si_opaque.so_clid.cl_id; + __entry->dst_so_id = dst_stp->si_opaque.so_id; + __entry->dst_si_generation = dst_stp->si_generation; + __entry->src_cp_pos = copy->cp_src_pos; + __entry->dst_cp_pos = copy->cp_dst_pos; + __entry->cp_count = copy->cp_count; + __assign_sockaddr(addr, ©->cp_clp->cl_addr, + sizeof(struct sockaddr_in6)); + ), + TP_printk("client=%pISpc intra=%d async=%d " + "src_stateid[si_generation:0x%x cl_boot:0x%x cl_id:0x%x so_id:0x%x] " + "dst_stateid[si_generation:0x%x cl_boot:0x%x cl_id:0x%x so_id:0x%x] " + "cp_src_pos=%llu cp_dst_pos=%llu cp_count=%llu", + __get_sockaddr(addr), __entry->intra, __entry->async, + __entry->src_si_generation, __entry->src_cl_boot, + __entry->src_cl_id, __entry->src_so_id, + __entry->dst_si_generation, __entry->dst_cl_boot, + __entry->dst_cl_id, __entry->dst_so_id, + __entry->src_cp_pos, __entry->dst_cp_pos, __entry->cp_count + ) +); + +#define DEFINE_COPY_EVENT(name) \ +DEFINE_EVENT(nfsd_copy_class, nfsd_copy_##name, \ + TP_PROTO(const struct nfsd4_copy *copy), \ + TP_ARGS(copy)) + +DEFINE_COPY_EVENT(inter); +DEFINE_COPY_EVENT(intra); +DEFINE_COPY_EVENT(do_async); + +TRACE_EVENT(nfsd_copy_done, + TP_PROTO( + const struct nfsd4_copy *copy, + __be32 status + ), + TP_ARGS(copy, status), + TP_STRUCT__entry( + __field(int, status) + __field(bool, intra) + __field(bool, async) + __sockaddr(addr, sizeof(struct sockaddr_in6)) + ), + TP_fast_assign( + __entry->status = be32_to_cpu(status); + __entry->intra = test_bit(NFSD4_COPY_F_INTRA, ©->cp_flags); + __entry->async = !test_bit(NFSD4_COPY_F_SYNCHRONOUS, ©->cp_flags); + __assign_sockaddr(addr, ©->cp_clp->cl_addr, + sizeof(struct sockaddr_in6)); + ), + TP_printk("addr=%pISpc status=%d intra=%d async=%d ", + __get_sockaddr(addr), __entry->status, __entry->intra, __entry->async + ) +); + #endif /* _NFSD_TRACE_H */ #undef TRACE_INCLUDE_PATH diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 2c9074ab2315..e01e4e2acbd9 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -337,6 +337,24 @@ out: return err; } +static void +commit_reset_write_verifier(struct nfsd_net *nn, struct svc_rqst *rqstp, + int err) +{ + switch (err) { + case -EAGAIN: + case -ESTALE: + /* + * Neither of these are the result of a problem with + * durable storage, so avoid a write verifier reset. + */ + break; + default: + nfsd_reset_write_verifier(nn); + trace_nfsd_writeverf_reset(nn, rqstp, err); + } +} + /* * Commit metadata changes to stable storage. */ @@ -520,7 +538,7 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, nfsd_sanitize_attrs(inode, iap); - if (check_guard && guardtime != inode->i_ctime.tv_sec) + if (check_guard && guardtime != inode_get_ctime_sec(inode)) return nfserr_notsync; /* @@ -647,8 +665,7 @@ __be32 nfsd4_clone_file_range(struct svc_rqst *rqstp, &nfsd4_get_cstate(rqstp)->current_fh, dst_pos, count, status); - nfsd_reset_write_verifier(nn); - trace_nfsd_writeverf_reset(nn, rqstp, status); + commit_reset_write_verifier(nn, rqstp, status); ret = nfserrno(status); } } @@ -823,7 +840,7 @@ int nfsd_open_break_lease(struct inode *inode, int access) * and additional flags. * N.B. After this call fhp needs an fh_put */ -static __be32 +static int __nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, umode_t type, int may_flags, struct file **filp) { @@ -831,14 +848,12 @@ __nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, umode_t type, struct inode *inode; struct file *file; int flags = O_RDONLY|O_LARGEFILE; - __be32 err; - int host_err = 0; + int host_err = -EPERM; path.mnt = fhp->fh_export->ex_path.mnt; path.dentry = fhp->fh_dentry; inode = d_inode(path.dentry); - err = nfserr_perm; if (IS_APPEND(inode) && (may_flags & NFSD_MAY_WRITE)) goto out; @@ -847,7 +862,7 @@ __nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, umode_t type, host_err = nfsd_open_break_lease(inode, may_flags); if (host_err) /* NOMEM or WOULDBLOCK */ - goto out_nfserr; + goto out; if (may_flags & NFSD_MAY_WRITE) { if (may_flags & NFSD_MAY_READ) @@ -859,13 +874,13 @@ __nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, umode_t type, file = dentry_open(&path, flags, current_cred()); if (IS_ERR(file)) { host_err = PTR_ERR(file); - goto out_nfserr; + goto out; } host_err = ima_file_check(file, may_flags); if (host_err) { fput(file); - goto out_nfserr; + goto out; } if (may_flags & NFSD_MAY_64BIT_COOKIE) @@ -874,10 +889,8 @@ __nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, umode_t type, file->f_mode |= FMODE_32BITHASH; *filp = file; -out_nfserr: - err = nfserrno(host_err); out: - return err; + return host_err; } __be32 @@ -885,9 +898,9 @@ nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, umode_t type, int may_flags, struct file **filp) { __be32 err; + int host_err; bool retried = false; - validate_process_creds(); /* * If we get here, then the client has already done an "open", * and (hopefully) checked permission - so allow OWNER_OVERRIDE @@ -904,14 +917,14 @@ nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, umode_t type, retry: err = fh_verify(rqstp, fhp, type, may_flags); if (!err) { - err = __nfsd_open(rqstp, fhp, type, may_flags, filp); - if (err == nfserr_stale && !retried) { + host_err = __nfsd_open(rqstp, fhp, type, may_flags, filp); + if (host_err == -EOPENSTALE && !retried) { retried = true; fh_put(fhp); goto retry; } + err = nfserrno(host_err); } - validate_process_creds(); return err; } @@ -922,18 +935,13 @@ retry: * @may_flags: internal permission flags * @filp: OUT: open "struct file *" * - * Returns an nfsstat value in network byte order. + * Returns zero on success, or a negative errno value. */ -__be32 +int nfsd_open_verified(struct svc_rqst *rqstp, struct svc_fh *fhp, int may_flags, struct file **filp) { - __be32 err; - - validate_process_creds(); - err = __nfsd_open(rqstp, fhp, S_IFREG, may_flags, filp); - validate_process_creds(); - return err; + return __nfsd_open(rqstp, fhp, S_IFREG, may_flags, filp); } /* @@ -1172,8 +1180,7 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct nfsd_file *nf, host_err = vfs_iter_write(file, &iter, &pos, flags); file_end_write(file); if (host_err < 0) { - nfsd_reset_write_verifier(nn); - trace_nfsd_writeverf_reset(nn, rqstp, host_err); + commit_reset_write_verifier(nn, rqstp, host_err); goto out_nfserr; } *cnt = host_err; @@ -1185,10 +1192,8 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct nfsd_file *nf, if (stable && use_wgather) { host_err = wait_for_concurrent_writes(file); - if (host_err < 0) { - nfsd_reset_write_verifier(nn); - trace_nfsd_writeverf_reset(nn, rqstp, host_err); - } + if (host_err < 0) + commit_reset_write_verifier(nn, rqstp, host_err); } out_nfserr: @@ -1331,8 +1336,7 @@ nfsd_commit(struct svc_rqst *rqstp, struct svc_fh *fhp, struct nfsd_file *nf, err = nfserr_notsupp; break; default: - nfsd_reset_write_verifier(nn); - trace_nfsd_writeverf_reset(nn, rqstp, err2); + commit_reset_write_verifier(nn, rqstp, err2); err = nfserrno(err2); } } else @@ -1540,7 +1544,9 @@ nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp, dput(dchild); if (err) goto out_unlock; - fh_fill_pre_attrs(fhp); + err = fh_fill_pre_attrs(fhp); + if (err != nfs_ok) + goto out_unlock; err = nfsd_create_locked(rqstp, fhp, attrs, type, rdev, resfhp); fh_fill_post_attrs(fhp); out_unlock: @@ -1635,13 +1641,16 @@ nfsd_symlink(struct svc_rqst *rqstp, struct svc_fh *fhp, inode_unlock(dentry->d_inode); goto out_drop_write; } - fh_fill_pre_attrs(fhp); + err = fh_fill_pre_attrs(fhp); + if (err != nfs_ok) + goto out_unlock; host_err = vfs_symlink(&nop_mnt_idmap, d_inode(dentry), dnew, path); err = nfserrno(host_err); cerr = fh_compose(resfhp, fhp->fh_export, dnew, fhp); if (!err) nfsd_create_setattr(rqstp, fhp, resfhp, attrs); fh_fill_post_attrs(fhp); +out_unlock: inode_unlock(dentry->d_inode); if (!err) err = nfserrno(commit_metadata(fhp)); @@ -1703,7 +1712,9 @@ nfsd_link(struct svc_rqst *rqstp, struct svc_fh *ffhp, err = nfserr_noent; if (d_really_is_negative(dold)) goto out_dput; - fh_fill_pre_attrs(ffhp); + err = fh_fill_pre_attrs(ffhp); + if (err != nfs_ok) + goto out_dput; host_err = vfs_link(dold, &nop_mnt_idmap, dirp, dnew, NULL); fh_fill_post_attrs(ffhp); inode_unlock(dirp); @@ -1781,6 +1792,12 @@ nfsd_rename(struct svc_rqst *rqstp, struct svc_fh *ffhp, char *fname, int flen, if (!flen || isdotent(fname, flen) || !tlen || isdotent(tname, tlen)) goto out; + err = (rqstp->rq_vers == 2) ? nfserr_acces : nfserr_xdev; + if (ffhp->fh_export->ex_path.mnt != tfhp->fh_export->ex_path.mnt) + goto out; + if (ffhp->fh_export->ex_path.dentry != tfhp->fh_export->ex_path.dentry) + goto out; + retry: host_err = fh_want_write(ffhp); if (host_err) { @@ -1789,8 +1806,12 @@ retry: } trap = lock_rename(tdentry, fdentry); - fh_fill_pre_attrs(ffhp); - fh_fill_pre_attrs(tfhp); + err = fh_fill_pre_attrs(ffhp); + if (err != nfs_ok) + goto out_unlock; + err = fh_fill_pre_attrs(tfhp); + if (err != nfs_ok) + goto out_unlock; odentry = lookup_one_len(fname, fdentry, flen); host_err = PTR_ERR(odentry); @@ -1812,12 +1833,6 @@ retry: if (ndentry == trap) goto out_dput_new; - host_err = -EXDEV; - if (ffhp->fh_export->ex_path.mnt != tfhp->fh_export->ex_path.mnt) - goto out_dput_new; - if (ffhp->fh_export->ex_path.dentry != tfhp->fh_export->ex_path.dentry) - goto out_dput_new; - if ((ndentry->d_sb->s_export_op->flags & EXPORT_OP_CLOSE_BEFORE_UNLINK) && nfsd_has_cached_files(ndentry)) { close_cached = true; @@ -1857,6 +1872,7 @@ retry: fh_fill_post_attrs(ffhp); fh_fill_post_attrs(tfhp); } +out_unlock: unlock_rename(tdentry, fdentry); fh_drop_write(ffhp); @@ -1916,12 +1932,14 @@ nfsd_unlink(struct svc_rqst *rqstp, struct svc_fh *fhp, int type, goto out_unlock; } rinode = d_inode(rdentry); - ihold(rinode); + err = fh_fill_pre_attrs(fhp); + if (err != nfs_ok) + goto out_unlock; + ihold(rinode); if (!type) type = d_inode(rdentry)->i_mode & S_IFMT; - fh_fill_pre_attrs(fhp); if (type != S_IFDIR) { int retries; @@ -2341,16 +2359,18 @@ nfsd_removexattr(struct svc_rqst *rqstp, struct svc_fh *fhp, char *name) return nfserrno(ret); inode_lock(fhp->fh_dentry->d_inode); - fh_fill_pre_attrs(fhp); - + err = fh_fill_pre_attrs(fhp); + if (err != nfs_ok) + goto out_unlock; ret = __vfs_removexattr_locked(&nop_mnt_idmap, fhp->fh_dentry, name, NULL); - + err = nfsd_xattr_errno(ret); fh_fill_post_attrs(fhp); +out_unlock: inode_unlock(fhp->fh_dentry->d_inode); fh_drop_write(fhp); - return nfsd_xattr_errno(ret); + return err; } __be32 @@ -2368,15 +2388,17 @@ nfsd_setxattr(struct svc_rqst *rqstp, struct svc_fh *fhp, char *name, if (ret) return nfserrno(ret); inode_lock(fhp->fh_dentry->d_inode); - fh_fill_pre_attrs(fhp); - - ret = __vfs_setxattr_locked(&nop_mnt_idmap, fhp->fh_dentry, name, buf, - len, flags, NULL); + err = fh_fill_pre_attrs(fhp); + if (err != nfs_ok) + goto out_unlock; + ret = __vfs_setxattr_locked(&nop_mnt_idmap, fhp->fh_dentry, + name, buf, len, flags, NULL); fh_fill_post_attrs(fhp); + err = nfsd_xattr_errno(ret); +out_unlock: inode_unlock(fhp->fh_dentry->d_inode); fh_drop_write(fhp); - - return nfsd_xattr_errno(ret); + return err; } #endif diff --git a/fs/nfsd/vfs.h b/fs/nfsd/vfs.h index a6890ea7b765..e3c29596f4df 100644 --- a/fs/nfsd/vfs.h +++ b/fs/nfsd/vfs.h @@ -104,8 +104,8 @@ __be32 nfsd_setxattr(struct svc_rqst *rqstp, struct svc_fh *fhp, int nfsd_open_break_lease(struct inode *, int); __be32 nfsd_open(struct svc_rqst *, struct svc_fh *, umode_t, int, struct file **); -__be32 nfsd_open_verified(struct svc_rqst *, struct svc_fh *, - int, struct file **); +int nfsd_open_verified(struct svc_rqst *rqstp, struct svc_fh *fhp, + int may_flags, struct file **filp); __be32 nfsd_splice_read(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, loff_t offset, unsigned long *count, diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h index 510978e602da..80e859dc84d8 100644 --- a/fs/nfsd/xdr4.h +++ b/fs/nfsd/xdr4.h @@ -50,6 +50,134 @@ #define HAS_CSTATE_FLAG(c, f) ((c)->sid_flags & (f)) #define CLEAR_CSTATE_FLAG(c, f) ((c)->sid_flags &= ~(f)) +/** + * nfsd4_encode_bool - Encode an XDR bool type result + * @xdr: target XDR stream + * @val: boolean value to encode + * + * Return values: + * %nfs_ok: @val encoded; @xdr advanced to next position + * %nfserr_resource: stream buffer space exhausted + */ +static __always_inline __be32 +nfsd4_encode_bool(struct xdr_stream *xdr, bool val) +{ + __be32 *p = xdr_reserve_space(xdr, XDR_UNIT); + + if (unlikely(p == NULL)) + return nfserr_resource; + *p = val ? xdr_one : xdr_zero; + return nfs_ok; +} + +/** + * nfsd4_encode_uint32_t - Encode an XDR uint32_t type result + * @xdr: target XDR stream + * @val: integer value to encode + * + * Return values: + * %nfs_ok: @val encoded; @xdr advanced to next position + * %nfserr_resource: stream buffer space exhausted + */ +static __always_inline __be32 +nfsd4_encode_uint32_t(struct xdr_stream *xdr, u32 val) +{ + __be32 *p = xdr_reserve_space(xdr, XDR_UNIT); + + if (unlikely(p == NULL)) + return nfserr_resource; + *p = cpu_to_be32(val); + return nfs_ok; +} + +#define nfsd4_encode_aceflag4(x, v) nfsd4_encode_uint32_t(x, v) +#define nfsd4_encode_acemask4(x, v) nfsd4_encode_uint32_t(x, v) +#define nfsd4_encode_acetype4(x, v) nfsd4_encode_uint32_t(x, v) +#define nfsd4_encode_count4(x, v) nfsd4_encode_uint32_t(x, v) +#define nfsd4_encode_mode4(x, v) nfsd4_encode_uint32_t(x, v) +#define nfsd4_encode_nfs_lease4(x, v) nfsd4_encode_uint32_t(x, v) +#define nfsd4_encode_qop4(x, v) nfsd4_encode_uint32_t(x, v) +#define nfsd4_encode_sequenceid4(x, v) nfsd4_encode_uint32_t(x, v) +#define nfsd4_encode_slotid4(x, v) nfsd4_encode_uint32_t(x, v) + +/** + * nfsd4_encode_uint64_t - Encode an XDR uint64_t type result + * @xdr: target XDR stream + * @val: integer value to encode + * + * Return values: + * %nfs_ok: @val encoded; @xdr advanced to next position + * %nfserr_resource: stream buffer space exhausted + */ +static __always_inline __be32 +nfsd4_encode_uint64_t(struct xdr_stream *xdr, u64 val) +{ + __be32 *p = xdr_reserve_space(xdr, XDR_UNIT * 2); + + if (unlikely(p == NULL)) + return nfserr_resource; + put_unaligned_be64(val, p); + return nfs_ok; +} + +#define nfsd4_encode_changeid4(x, v) nfsd4_encode_uint64_t(x, v) +#define nfsd4_encode_nfs_cookie4(x, v) nfsd4_encode_uint64_t(x, v) +#define nfsd4_encode_length4(x, v) nfsd4_encode_uint64_t(x, v) +#define nfsd4_encode_offset4(x, v) nfsd4_encode_uint64_t(x, v) + +/** + * nfsd4_encode_opaque_fixed - Encode a fixed-length XDR opaque type result + * @xdr: target XDR stream + * @data: pointer to data + * @size: length of data in bytes + * + * Return values: + * %nfs_ok: @data encoded; @xdr advanced to next position + * %nfserr_resource: stream buffer space exhausted + */ +static __always_inline __be32 +nfsd4_encode_opaque_fixed(struct xdr_stream *xdr, const void *data, + size_t size) +{ + __be32 *p = xdr_reserve_space(xdr, xdr_align_size(size)); + size_t pad = xdr_pad_size(size); + + if (unlikely(p == NULL)) + return nfserr_resource; + memcpy(p, data, size); + if (pad) + memset((char *)p + size, 0, pad); + return nfs_ok; +} + +/** + * nfsd4_encode_opaque - Encode a variable-length XDR opaque type result + * @xdr: target XDR stream + * @data: pointer to data + * @size: length of data in bytes + * + * Return values: + * %nfs_ok: @data encoded; @xdr advanced to next position + * %nfserr_resource: stream buffer space exhausted + */ +static __always_inline __be32 +nfsd4_encode_opaque(struct xdr_stream *xdr, const void *data, size_t size) +{ + size_t pad = xdr_pad_size(size); + __be32 *p; + + p = xdr_reserve_space(xdr, XDR_UNIT + xdr_align_size(size)); + if (unlikely(p == NULL)) + return nfserr_resource; + *p++ = cpu_to_be32(size); + memcpy(p, data, size); + if (pad) + memset((char *)p + size, 0, pad); + return nfs_ok; +} + +#define nfsd4_encode_component4(x, d, s) nfsd4_encode_opaque(x, d, s) + struct nfsd4_compound_state { struct svc_fh current_fh; struct svc_fh save_fh; @@ -170,12 +298,8 @@ struct nfsd4_lock { } v; /* response */ - union { - struct { - stateid_t stateid; - } ok; - struct nfsd4_lock_denied denied; - } u; + stateid_t lk_resp_stateid; + struct nfsd4_lock_denied lk_denied; }; #define lk_new_open_seqid v.new.open_seqid #define lk_new_open_stateid v.new.open_stateid @@ -185,20 +309,15 @@ struct nfsd4_lock { #define lk_old_lock_stateid v.old.lock_stateid #define lk_old_lock_seqid v.old.lock_seqid -#define lk_resp_stateid u.ok.stateid -#define lk_denied u.denied - - struct nfsd4_lockt { u32 lt_type; clientid_t lt_clientid; struct xdr_netobj lt_owner; u64 lt_offset; u64 lt_length; - struct nfsd4_lock_denied lt_denied; + struct nfsd4_lock_denied lt_denied; }; - struct nfsd4_locku { u32 lu_type; u32 lu_seqid; @@ -267,9 +386,9 @@ struct nfsd4_open { u32 op_deleg_want; /* request */ stateid_t op_stateid; /* response */ __be32 op_xdr_error; /* see nfsd4_open_omfg() */ - u32 op_recall; /* recall */ struct nfsd4_change_info op_cinfo; /* response */ u32 op_rflags; /* response */ + bool op_recall; /* response */ bool op_truncate; /* used during processing */ bool op_created; /* used during processing */ struct nfs4_openowner *op_openowner; /* used during processing */ @@ -496,7 +615,7 @@ struct nfsd4_layoutcommit { u32 lc_layout_type; /* request */ u32 lc_up_len; /* layout length */ void *lc_up_layout; /* decoded by callback */ - u32 lc_size_chg; /* boolean for response */ + bool lc_size_chg; /* response */ u64 lc_newsize; /* response */ }; @@ -508,7 +627,7 @@ struct nfsd4_layoutreturn { u32 lrf_body_len; /* request */ void *lrf_body; /* request */ stateid_t lr_sid; /* request/response */ - u32 lrs_present; /* response */ + bool lrs_present; /* response */ }; struct nfsd4_fallocate { @@ -626,8 +745,7 @@ struct nfsd4_copy_notify { /* response */ stateid_t cpn_cnr_stateid; - u64 cpn_sec; - u32 cpn_nsec; + struct timespec64 cpn_lease_time; struct nl4_server *cpn_src; }; @@ -774,17 +892,6 @@ void warn_on_nonidempotent_op(struct nfsd4_op *op); #define NFS4_SVC_XDRSIZE sizeof(struct nfsd4_compoundargs) -static inline void -set_change_info(struct nfsd4_change_info *cinfo, struct svc_fh *fhp) -{ - BUG_ON(!fhp->fh_pre_saved); - cinfo->atomic = (u32)(fhp->fh_post_saved && !fhp->fh_no_atomic_attr); - - cinfo->before_change = fhp->fh_pre_change; - cinfo->after_change = fhp->fh_post_change; -} - - bool nfsd4_mach_creds_match(struct nfs4_client *cl, struct svc_rqst *rqstp); bool nfs4svc_decode_compoundargs(struct svc_rqst *rqstp, struct xdr_stream *xdr); bool nfs4svc_encode_compoundres(struct svc_rqst *rqstp, struct xdr_stream *xdr); @@ -831,8 +938,10 @@ extern __be32 nfsd4_open_downgrade(struct svc_rqst *rqstp, struct nfsd4_compound_state *, union nfsd4_op_u *u); extern __be32 nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *, union nfsd4_op_u *u); +extern void nfsd4_lock_release(union nfsd4_op_u *u); extern __be32 nfsd4_lockt(struct svc_rqst *rqstp, struct nfsd4_compound_state *, union nfsd4_op_u *u); +extern void nfsd4_lockt_release(union nfsd4_op_u *u); extern __be32 nfsd4_locku(struct svc_rqst *rqstp, struct nfsd4_compound_state *, union nfsd4_op_u *u); extern __be32 diff --git a/fs/nilfs2/Kconfig b/fs/nilfs2/Kconfig index 7d59567465e1..7dae168e346e 100644 --- a/fs/nilfs2/Kconfig +++ b/fs/nilfs2/Kconfig @@ -1,6 +1,7 @@ # SPDX-License-Identifier: GPL-2.0-only config NILFS2_FS tristate "NILFS2 file system support" + select BUFFER_HEAD select CRC32 select LEGACY_DIRECT_IO help diff --git a/fs/nilfs2/alloc.c b/fs/nilfs2/alloc.c index 6ce8617b562d..7342de296ec3 100644 --- a/fs/nilfs2/alloc.c +++ b/fs/nilfs2/alloc.c @@ -205,7 +205,8 @@ static int nilfs_palloc_get_block(struct inode *inode, unsigned long blkoff, int ret; spin_lock(lock); - if (prev->bh && blkoff == prev->blkoff) { + if (prev->bh && blkoff == prev->blkoff && + likely(buffer_uptodate(prev->bh))) { get_bh(prev->bh); *bhp = prev->bh; spin_unlock(lock); diff --git a/fs/nilfs2/dir.c b/fs/nilfs2/dir.c index decd6471300b..de2073c47651 100644 --- a/fs/nilfs2/dir.c +++ b/fs/nilfs2/dir.c @@ -429,7 +429,7 @@ void nilfs_set_link(struct inode *dir, struct nilfs_dir_entry *de, nilfs_set_de_type(de, inode); nilfs_commit_chunk(page, mapping, from, to); nilfs_put_page(page); - dir->i_mtime = dir->i_ctime = current_time(dir); + inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir)); } /* @@ -519,7 +519,7 @@ got_it: de->inode = cpu_to_le64(inode->i_ino); nilfs_set_de_type(de, inode); nilfs_commit_chunk(page, page->mapping, from, to); - dir->i_mtime = dir->i_ctime = current_time(dir); + inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir)); nilfs_mark_inode_dirty(dir); /* OFFSET_CACHE */ out_put: @@ -567,7 +567,7 @@ int nilfs_delete_entry(struct nilfs_dir_entry *dir, struct page *page) pde->rec_len = nilfs_rec_len_to_disk(to - from); dir->inode = 0; nilfs_commit_chunk(page, mapping, from, to); - inode->i_ctime = inode->i_mtime = current_time(inode); + inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); out: nilfs_put_page(page); return err; diff --git a/fs/nilfs2/file.c b/fs/nilfs2/file.c index a9eb3487efb2..740ce26d1e76 100644 --- a/fs/nilfs2/file.c +++ b/fs/nilfs2/file.c @@ -108,7 +108,7 @@ static vm_fault_t nilfs_page_mkwrite(struct vm_fault *vmf) wait_for_stable_page(page); out: sb_end_pagefault(inode->i_sb); - return block_page_mkwrite_return(ret); + return vmf_fs_error(ret); } static const struct vm_operations_struct nilfs_file_vm_ops = { diff --git a/fs/nilfs2/gcinode.c b/fs/nilfs2/gcinode.c index 48fe71d309cb..8beb2730929d 100644 --- a/fs/nilfs2/gcinode.c +++ b/fs/nilfs2/gcinode.c @@ -73,10 +73,8 @@ int nilfs_gccache_submit_read_data(struct inode *inode, sector_t blkoff, struct the_nilfs *nilfs = inode->i_sb->s_fs_info; err = nilfs_dat_translate(nilfs->ns_dat, vbn, &pbn); - if (unlikely(err)) { /* -EIO, -ENOMEM, -ENOENT */ - brelse(bh); + if (unlikely(err)) /* -EIO, -ENOMEM, -ENOENT */ goto failed; - } } lock_buffer(bh); @@ -102,6 +100,8 @@ int nilfs_gccache_submit_read_data(struct inode *inode, sector_t blkoff, failed: unlock_page(bh->b_page); put_page(bh->b_page); + if (unlikely(err)) + brelse(bh); return err; } diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c index 35bc79305318..f861f3a0bf5c 100644 --- a/fs/nilfs2/inode.c +++ b/fs/nilfs2/inode.c @@ -366,7 +366,7 @@ struct inode *nilfs_new_inode(struct inode *dir, umode_t mode) atomic64_inc(&root->inodes_count); inode_init_owner(&nop_mnt_idmap, inode, dir, mode); inode->i_ino = ino; - inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode); + simple_inode_init_ts(inode); if (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)) { err = nilfs_bmap_read(ii->i_bmap, NULL); @@ -449,12 +449,12 @@ int nilfs_read_inode_common(struct inode *inode, i_gid_write(inode, le32_to_cpu(raw_inode->i_gid)); set_nlink(inode, le16_to_cpu(raw_inode->i_links_count)); inode->i_size = le64_to_cpu(raw_inode->i_size); - inode->i_atime.tv_sec = le64_to_cpu(raw_inode->i_mtime); - inode->i_ctime.tv_sec = le64_to_cpu(raw_inode->i_ctime); - inode->i_mtime.tv_sec = le64_to_cpu(raw_inode->i_mtime); - inode->i_atime.tv_nsec = le32_to_cpu(raw_inode->i_mtime_nsec); - inode->i_ctime.tv_nsec = le32_to_cpu(raw_inode->i_ctime_nsec); - inode->i_mtime.tv_nsec = le32_to_cpu(raw_inode->i_mtime_nsec); + inode_set_atime(inode, le64_to_cpu(raw_inode->i_mtime), + le32_to_cpu(raw_inode->i_mtime_nsec)); + inode_set_ctime(inode, le64_to_cpu(raw_inode->i_ctime), + le32_to_cpu(raw_inode->i_ctime_nsec)); + inode_set_mtime(inode, le64_to_cpu(raw_inode->i_mtime), + le32_to_cpu(raw_inode->i_mtime_nsec)); if (nilfs_is_metadata_file_inode(inode) && !S_ISREG(inode->i_mode)) return -EIO; /* this inode is for metadata and corrupted */ if (inode->i_nlink == 0) @@ -768,10 +768,10 @@ void nilfs_write_inode_common(struct inode *inode, raw_inode->i_gid = cpu_to_le32(i_gid_read(inode)); raw_inode->i_links_count = cpu_to_le16(inode->i_nlink); raw_inode->i_size = cpu_to_le64(inode->i_size); - raw_inode->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec); - raw_inode->i_mtime = cpu_to_le64(inode->i_mtime.tv_sec); - raw_inode->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec); - raw_inode->i_mtime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec); + raw_inode->i_ctime = cpu_to_le64(inode_get_ctime_sec(inode)); + raw_inode->i_mtime = cpu_to_le64(inode_get_mtime_sec(inode)); + raw_inode->i_ctime_nsec = cpu_to_le32(inode_get_ctime_nsec(inode)); + raw_inode->i_mtime_nsec = cpu_to_le32(inode_get_mtime_nsec(inode)); raw_inode->i_blocks = cpu_to_le64(inode->i_blocks); raw_inode->i_flags = cpu_to_le32(ii->i_flags); @@ -875,7 +875,7 @@ void nilfs_truncate(struct inode *inode) nilfs_truncate_bmap(ii, blkoff); - inode->i_mtime = inode->i_ctime = current_time(inode); + inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); if (IS_SYNC(inode)) nilfs_set_transaction_flag(NILFS_TI_SYNC); @@ -1025,7 +1025,7 @@ int nilfs_load_inode_block(struct inode *inode, struct buffer_head **pbh) int err; spin_lock(&nilfs->ns_inode_lock); - if (ii->i_bh == NULL) { + if (ii->i_bh == NULL || unlikely(!buffer_uptodate(ii->i_bh))) { spin_unlock(&nilfs->ns_inode_lock); err = nilfs_ifile_get_inode_block(ii->i_root->ifile, inode->i_ino, pbh); @@ -1034,7 +1034,10 @@ int nilfs_load_inode_block(struct inode *inode, struct buffer_head **pbh) spin_lock(&nilfs->ns_inode_lock); if (ii->i_bh == NULL) ii->i_bh = *pbh; - else { + else if (unlikely(!buffer_uptodate(ii->i_bh))) { + __brelse(ii->i_bh); + ii->i_bh = *pbh; + } else { brelse(*pbh); *pbh = ii->i_bh; } diff --git a/fs/nilfs2/ioctl.c b/fs/nilfs2/ioctl.c index 1dfbc0c34513..40ffade49f38 100644 --- a/fs/nilfs2/ioctl.c +++ b/fs/nilfs2/ioctl.c @@ -149,7 +149,7 @@ int nilfs_fileattr_set(struct mnt_idmap *idmap, NILFS_I(inode)->i_flags = oldflags | (flags & FS_FL_USER_MODIFIABLE); nilfs_set_inode_flags(inode); - inode->i_ctime = current_time(inode); + inode_set_ctime_current(inode); if (IS_SYNC(inode)) nilfs_set_transaction_flag(NILFS_TI_SYNC); diff --git a/fs/nilfs2/mdt.c b/fs/nilfs2/mdt.c index 19c8158605ed..c97c77a39668 100644 --- a/fs/nilfs2/mdt.c +++ b/fs/nilfs2/mdt.c @@ -356,30 +356,28 @@ int nilfs_mdt_delete_block(struct inode *inode, unsigned long block) */ int nilfs_mdt_forget_block(struct inode *inode, unsigned long block) { - pgoff_t index = (pgoff_t)block >> - (PAGE_SHIFT - inode->i_blkbits); - struct page *page; - unsigned long first_block; + pgoff_t index = block >> (PAGE_SHIFT - inode->i_blkbits); + struct folio *folio; + struct buffer_head *bh; int ret = 0; int still_dirty; - page = find_lock_page(inode->i_mapping, index); - if (!page) + folio = filemap_lock_folio(inode->i_mapping, index); + if (IS_ERR(folio)) return -ENOENT; - wait_on_page_writeback(page); - - first_block = (unsigned long)index << - (PAGE_SHIFT - inode->i_blkbits); - if (page_has_buffers(page)) { - struct buffer_head *bh; + folio_wait_writeback(folio); - bh = nilfs_page_get_nth_block(page, block - first_block); + bh = folio_buffers(folio); + if (bh) { + unsigned long first_block = index << + (PAGE_SHIFT - inode->i_blkbits); + bh = get_nth_bh(bh, block - first_block); nilfs_forget_buffer(bh); } - still_dirty = PageDirty(page); - unlock_page(page); - put_page(page); + still_dirty = folio_test_dirty(folio); + folio_unlock(folio); + folio_put(folio); if (still_dirty || invalidate_inode_pages2_range(inode->i_mapping, index, index) != 0) @@ -560,17 +558,19 @@ int nilfs_mdt_freeze_buffer(struct inode *inode, struct buffer_head *bh) { struct nilfs_shadow_map *shadow = NILFS_MDT(inode)->mi_shadow; struct buffer_head *bh_frozen; - struct page *page; + struct folio *folio; int blkbits = inode->i_blkbits; - page = grab_cache_page(shadow->inode->i_mapping, bh->b_folio->index); - if (!page) - return -ENOMEM; + folio = filemap_grab_folio(shadow->inode->i_mapping, + bh->b_folio->index); + if (IS_ERR(folio)) + return PTR_ERR(folio); - if (!page_has_buffers(page)) - create_empty_buffers(page, 1 << blkbits, 0); + bh_frozen = folio_buffers(folio); + if (!bh_frozen) + bh_frozen = create_empty_buffers(folio, 1 << blkbits, 0); - bh_frozen = nilfs_page_get_nth_block(page, bh_offset(bh) >> blkbits); + bh_frozen = get_nth_bh(bh_frozen, bh_offset(bh) >> blkbits); if (!buffer_uptodate(bh_frozen)) nilfs_copy_buffer(bh_frozen, bh); @@ -582,8 +582,8 @@ int nilfs_mdt_freeze_buffer(struct inode *inode, struct buffer_head *bh) brelse(bh_frozen); /* already frozen */ } - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); return 0; } @@ -592,17 +592,19 @@ nilfs_mdt_get_frozen_buffer(struct inode *inode, struct buffer_head *bh) { struct nilfs_shadow_map *shadow = NILFS_MDT(inode)->mi_shadow; struct buffer_head *bh_frozen = NULL; - struct page *page; + struct folio *folio; int n; - page = find_lock_page(shadow->inode->i_mapping, bh->b_folio->index); - if (page) { - if (page_has_buffers(page)) { + folio = filemap_lock_folio(shadow->inode->i_mapping, + bh->b_folio->index); + if (!IS_ERR(folio)) { + bh_frozen = folio_buffers(folio); + if (bh_frozen) { n = bh_offset(bh) >> inode->i_blkbits; - bh_frozen = nilfs_page_get_nth_block(page, n); + bh_frozen = get_nth_bh(bh_frozen, n); } - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); } return bh_frozen; } diff --git a/fs/nilfs2/namei.c b/fs/nilfs2/namei.c index c7024da8f1e2..2a4e7f4a8102 100644 --- a/fs/nilfs2/namei.c +++ b/fs/nilfs2/namei.c @@ -185,7 +185,7 @@ static int nilfs_link(struct dentry *old_dentry, struct inode *dir, if (err) return err; - inode->i_ctime = current_time(inode); + inode_set_ctime_current(inode); inode_inc_link_count(inode); ihold(inode); @@ -283,7 +283,7 @@ static int nilfs_do_unlink(struct inode *dir, struct dentry *dentry) if (err) goto out; - inode->i_ctime = dir->i_ctime; + inode_set_ctime_to_ts(inode, inode_get_ctime(dir)); drop_nlink(inode); err = 0; out: @@ -387,7 +387,7 @@ static int nilfs_rename(struct mnt_idmap *idmap, goto out_dir; nilfs_set_link(new_dir, new_de, new_page, old_inode); nilfs_mark_inode_dirty(new_dir); - new_inode->i_ctime = current_time(new_inode); + inode_set_ctime_current(new_inode); if (dir_de) drop_nlink(new_inode); drop_nlink(new_inode); @@ -406,7 +406,7 @@ static int nilfs_rename(struct mnt_idmap *idmap, * Like most other Unix systems, set the ctime for inodes on a * rename. */ - old_inode->i_ctime = current_time(old_inode); + inode_set_ctime_current(old_inode); nilfs_delete_entry(old_de, old_page); diff --git a/fs/nilfs2/page.c b/fs/nilfs2/page.c index b4e54d079b7d..06b04758f289 100644 --- a/fs/nilfs2/page.c +++ b/fs/nilfs2/page.c @@ -25,19 +25,19 @@ (BIT(BH_Uptodate) | BIT(BH_Mapped) | BIT(BH_NILFS_Node) | \ BIT(BH_NILFS_Volatile) | BIT(BH_NILFS_Checked)) -static struct buffer_head * -__nilfs_get_page_block(struct page *page, unsigned long block, pgoff_t index, - int blkbits, unsigned long b_state) +static struct buffer_head *__nilfs_get_folio_block(struct folio *folio, + unsigned long block, pgoff_t index, int blkbits, + unsigned long b_state) { unsigned long first_block; - struct buffer_head *bh; + struct buffer_head *bh = folio_buffers(folio); - if (!page_has_buffers(page)) - create_empty_buffers(page, 1 << blkbits, b_state); + if (!bh) + bh = create_empty_buffers(folio, 1 << blkbits, b_state); first_block = (unsigned long)index << (PAGE_SHIFT - blkbits); - bh = nilfs_page_get_nth_block(page, block - first_block); + bh = get_nth_bh(bh, block - first_block); touch_buffer(bh); wait_on_buffer(bh); @@ -51,17 +51,17 @@ struct buffer_head *nilfs_grab_buffer(struct inode *inode, { int blkbits = inode->i_blkbits; pgoff_t index = blkoff >> (PAGE_SHIFT - blkbits); - struct page *page; + struct folio *folio; struct buffer_head *bh; - page = grab_cache_page(mapping, index); - if (unlikely(!page)) + folio = filemap_grab_folio(mapping, index); + if (IS_ERR(folio)) return NULL; - bh = __nilfs_get_page_block(page, blkoff, index, blkbits, b_state); + bh = __nilfs_get_folio_block(folio, blkoff, index, blkbits, b_state); if (unlikely(!bh)) { - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); return NULL; } return bh; @@ -184,30 +184,32 @@ void nilfs_page_bug(struct page *page) } /** - * nilfs_copy_page -- copy the page with buffers - * @dst: destination page - * @src: source page - * @copy_dirty: flag whether to copy dirty states on the page's buffer heads. + * nilfs_copy_folio -- copy the folio with buffers + * @dst: destination folio + * @src: source folio + * @copy_dirty: flag whether to copy dirty states on the folio's buffer heads. * - * This function is for both data pages and btnode pages. The dirty flag - * should be treated by caller. The page must not be under i/o. - * Both src and dst page must be locked + * This function is for both data folios and btnode folios. The dirty flag + * should be treated by caller. The folio must not be under i/o. + * Both src and dst folio must be locked */ -static void nilfs_copy_page(struct page *dst, struct page *src, int copy_dirty) +static void nilfs_copy_folio(struct folio *dst, struct folio *src, + bool copy_dirty) { struct buffer_head *dbh, *dbufs, *sbh; unsigned long mask = NILFS_BUFFER_INHERENT_BITS; - BUG_ON(PageWriteback(dst)); + BUG_ON(folio_test_writeback(dst)); - sbh = page_buffers(src); - if (!page_has_buffers(dst)) - create_empty_buffers(dst, sbh->b_size, 0); + sbh = folio_buffers(src); + dbh = folio_buffers(dst); + if (!dbh) + dbh = create_empty_buffers(dst, sbh->b_size, 0); if (copy_dirty) mask |= BIT(BH_Dirty); - dbh = dbufs = page_buffers(dst); + dbufs = dbh; do { lock_buffer(sbh); lock_buffer(dbh); @@ -218,16 +220,16 @@ static void nilfs_copy_page(struct page *dst, struct page *src, int copy_dirty) dbh = dbh->b_this_page; } while (dbh != dbufs); - copy_highpage(dst, src); + folio_copy(dst, src); - if (PageUptodate(src) && !PageUptodate(dst)) - SetPageUptodate(dst); - else if (!PageUptodate(src) && PageUptodate(dst)) - ClearPageUptodate(dst); - if (PageMappedToDisk(src) && !PageMappedToDisk(dst)) - SetPageMappedToDisk(dst); - else if (!PageMappedToDisk(src) && PageMappedToDisk(dst)) - ClearPageMappedToDisk(dst); + if (folio_test_uptodate(src) && !folio_test_uptodate(dst)) + folio_mark_uptodate(dst); + else if (!folio_test_uptodate(src) && folio_test_uptodate(dst)) + folio_clear_uptodate(dst); + if (folio_test_mappedtodisk(src) && !folio_test_mappedtodisk(dst)) + folio_set_mappedtodisk(dst); + else if (!folio_test_mappedtodisk(src) && folio_test_mappedtodisk(dst)) + folio_clear_mappedtodisk(dst); do { unlock_buffer(sbh); @@ -269,7 +271,7 @@ repeat: NILFS_PAGE_BUG(&folio->page, "found empty page in dat page cache"); - nilfs_copy_page(&dfolio->page, &folio->page, 1); + nilfs_copy_folio(dfolio, folio, true); filemap_dirty_folio(folio_mapping(dfolio), dfolio); folio_unlock(dfolio); @@ -314,7 +316,7 @@ repeat: if (!IS_ERR(dfolio)) { /* overwrite existing folio in the destination cache */ WARN_ON(folio_test_dirty(dfolio)); - nilfs_copy_page(&dfolio->page, &folio->page, 0); + nilfs_copy_folio(dfolio, folio, false); folio_unlock(dfolio); folio_put(dfolio); /* Do we not need to remove folio from smap here? */ diff --git a/fs/nilfs2/page.h b/fs/nilfs2/page.h index 21ddcdd4d63e..d249ea1cefff 100644 --- a/fs/nilfs2/page.h +++ b/fs/nilfs2/page.h @@ -52,15 +52,4 @@ unsigned long nilfs_find_uncommitted_extent(struct inode *inode, #define NILFS_PAGE_BUG(page, m, a...) \ do { nilfs_page_bug(page); BUG(); } while (0) -static inline struct buffer_head * -nilfs_page_get_nth_block(struct page *page, unsigned int count) -{ - struct buffer_head *bh = page_buffers(page); - - while (count-- > 0) - bh = bh->b_this_page; - get_bh(bh); - return bh; -} - #endif /* _NILFS_PAGE_H */ diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c index 7ec16879756e..55e31cc903d1 100644 --- a/fs/nilfs2/segment.c +++ b/fs/nilfs2/segment.c @@ -731,10 +731,9 @@ static size_t nilfs_lookup_dirty_data_buffers(struct inode *inode, continue; } head = folio_buffers(folio); - if (!head) { - create_empty_buffers(&folio->page, i_blocksize(inode), 0); - head = folio_buffers(folio); - } + if (!head) + head = create_empty_buffers(folio, + i_blocksize(inode), 0); folio_unlock(folio); bh = head; diff --git a/fs/nilfs2/sufile.c b/fs/nilfs2/sufile.c index 2c6078a6b8ec..58ca7c936393 100644 --- a/fs/nilfs2/sufile.c +++ b/fs/nilfs2/sufile.c @@ -501,15 +501,38 @@ int nilfs_sufile_mark_dirty(struct inode *sufile, __u64 segnum) down_write(&NILFS_MDT(sufile)->mi_sem); ret = nilfs_sufile_get_segment_usage_block(sufile, segnum, 0, &bh); - if (!ret) { - mark_buffer_dirty(bh); - nilfs_mdt_mark_dirty(sufile); - kaddr = kmap_atomic(bh->b_page); - su = nilfs_sufile_block_get_segment_usage(sufile, segnum, bh, kaddr); + if (ret) + goto out_sem; + + kaddr = kmap_atomic(bh->b_page); + su = nilfs_sufile_block_get_segment_usage(sufile, segnum, bh, kaddr); + if (unlikely(nilfs_segment_usage_error(su))) { + struct the_nilfs *nilfs = sufile->i_sb->s_fs_info; + + kunmap_atomic(kaddr); + brelse(bh); + if (nilfs_segment_is_active(nilfs, segnum)) { + nilfs_error(sufile->i_sb, + "active segment %llu is erroneous", + (unsigned long long)segnum); + } else { + /* + * Segments marked erroneous are never allocated by + * nilfs_sufile_alloc(); only active segments, ie, + * the segments indexed by ns_segnum or ns_nextnum, + * can be erroneous here. + */ + WARN_ON_ONCE(1); + } + ret = -EIO; + } else { nilfs_segment_usage_set_dirty(su); kunmap_atomic(kaddr); + mark_buffer_dirty(bh); + nilfs_mdt_mark_dirty(sufile); brelse(bh); } +out_sem: up_write(&NILFS_MDT(sufile)->mi_sem); return ret; } @@ -536,9 +559,14 @@ int nilfs_sufile_set_segment_usage(struct inode *sufile, __u64 segnum, kaddr = kmap_atomic(bh->b_page); su = nilfs_sufile_block_get_segment_usage(sufile, segnum, bh, kaddr); - WARN_ON(nilfs_segment_usage_error(su)); - if (modtime) + if (modtime) { + /* + * Check segusage error and set su_lastmod only when updating + * this entry with a valid timestamp, not for cancellation. + */ + WARN_ON_ONCE(nilfs_segment_usage_error(su)); su->su_lastmod = cpu_to_le64(modtime); + } su->su_nblocks = cpu_to_le32(nblocks); kunmap_atomic(kaddr); diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c index 0ef8c71bde8e..a5d1fa4e7552 100644 --- a/fs/nilfs2/super.c +++ b/fs/nilfs2/super.c @@ -35,6 +35,7 @@ #include <linux/writeback.h> #include <linux/seq_file.h> #include <linux/mount.h> +#include <linux/fs_context.h> #include "nilfs.h" #include "export.h" #include "mdt.h" @@ -1216,7 +1217,6 @@ static int nilfs_remount(struct super_block *sb, int *flags, char *data) } struct nilfs_super_data { - struct block_device *bdev; __u64 cno; int flags; }; @@ -1283,64 +1283,49 @@ static int nilfs_identify(char *data, struct nilfs_super_data *sd) static int nilfs_set_bdev_super(struct super_block *s, void *data) { - s->s_bdev = data; - s->s_dev = s->s_bdev->bd_dev; + s->s_dev = *(dev_t *)data; return 0; } static int nilfs_test_bdev_super(struct super_block *s, void *data) { - return (void *)s->s_bdev == data; + return !(s->s_iflags & SB_I_RETIRED) && s->s_dev == *(dev_t *)data; } static struct dentry * nilfs_mount(struct file_system_type *fs_type, int flags, const char *dev_name, void *data) { - struct nilfs_super_data sd; + struct nilfs_super_data sd = { .flags = flags }; struct super_block *s; - struct dentry *root_dentry; - int err, s_new = false; + dev_t dev; + int err; - sd.bdev = blkdev_get_by_path(dev_name, sb_open_mode(flags), fs_type, - NULL); - if (IS_ERR(sd.bdev)) - return ERR_CAST(sd.bdev); + if (nilfs_identify(data, &sd)) + return ERR_PTR(-EINVAL); - sd.cno = 0; - sd.flags = flags; - if (nilfs_identify((char *)data, &sd)) { - err = -EINVAL; - goto failed; - } + err = lookup_bdev(dev_name, &dev); + if (err) + return ERR_PTR(err); - /* - * once the super is inserted into the list by sget, s_umount - * will protect the lockfs code from trying to start a snapshot - * while we are mounting - */ - mutex_lock(&sd.bdev->bd_fsfreeze_mutex); - if (sd.bdev->bd_fsfreeze_count > 0) { - mutex_unlock(&sd.bdev->bd_fsfreeze_mutex); - err = -EBUSY; - goto failed; - } s = sget(fs_type, nilfs_test_bdev_super, nilfs_set_bdev_super, flags, - sd.bdev); - mutex_unlock(&sd.bdev->bd_fsfreeze_mutex); - if (IS_ERR(s)) { - err = PTR_ERR(s); - goto failed; - } + &dev); + if (IS_ERR(s)) + return ERR_CAST(s); if (!s->s_root) { - s_new = true; - - /* New superblock instance created */ - snprintf(s->s_id, sizeof(s->s_id), "%pg", sd.bdev); - sb_set_blocksize(s, block_size(sd.bdev)); - - err = nilfs_fill_super(s, data, flags & SB_SILENT ? 1 : 0); + /* + * We drop s_umount here because we need to open the bdev and + * bdev->open_mutex ranks above s_umount (blkdev_put() -> + * __invalidate_device()). It is safe because we have active sb + * reference and SB_BORN is not set yet. + */ + up_write(&s->s_umount); + err = setup_bdev_super(s, flags, NULL); + down_write(&s->s_umount); + if (!err) + err = nilfs_fill_super(s, data, + flags & SB_SILENT ? 1 : 0); if (err) goto failed_super; @@ -1366,24 +1351,18 @@ nilfs_mount(struct file_system_type *fs_type, int flags, } if (sd.cno) { + struct dentry *root_dentry; + err = nilfs_attach_snapshot(s, sd.cno, &root_dentry); if (err) goto failed_super; - } else { - root_dentry = dget(s->s_root); + return root_dentry; } - if (!s_new) - blkdev_put(sd.bdev, fs_type); - - return root_dentry; + return dget(s->s_root); failed_super: deactivate_locked_super(s); - - failed: - if (!s_new) - blkdev_put(sd.bdev, fs_type); return ERR_PTR(err); } diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c index 0f0667957c81..71400496ed36 100644 --- a/fs/nilfs2/the_nilfs.c +++ b/fs/nilfs2/the_nilfs.c @@ -716,7 +716,11 @@ int init_nilfs(struct the_nilfs *nilfs, struct super_block *sb, char *data) goto failed_sbh; } nilfs_release_super_block(nilfs); - sb_set_blocksize(sb, blocksize); + if (!sb_set_blocksize(sb, blocksize)) { + nilfs_err(sb, "bad blocksize %d", blocksize); + err = -EINVAL; + goto out; + } err = nilfs_load_super_block(nilfs, sb, blocksize, &sbp); if (err) diff --git a/fs/nls/Kconfig b/fs/nls/Kconfig index c7857e36adbb..2a601af6f3bd 100644 --- a/fs/nls/Kconfig +++ b/fs/nls/Kconfig @@ -617,4 +617,7 @@ config NLS_UTF8 input/output character sets. Say Y here for the UTF-8 encoding of the Unicode/ISO9646 universal character set. +config NLS_UCS2_UTILS + tristate + endif # NLS diff --git a/fs/nls/Makefile b/fs/nls/Makefile index ac54db297128..5062c699d041 100644 --- a/fs/nls/Makefile +++ b/fs/nls/Makefile @@ -54,3 +54,4 @@ obj-$(CONFIG_NLS_MAC_INUIT) += mac-inuit.o obj-$(CONFIG_NLS_MAC_ROMANIAN) += mac-romanian.o obj-$(CONFIG_NLS_MAC_ROMAN) += mac-roman.o obj-$(CONFIG_NLS_MAC_TURKISH) += mac-turkish.o +obj-$(CONFIG_NLS_UCS2_UTILS) += nls_ucs2_utils.o diff --git a/fs/nls/nls_ucs2_data.h b/fs/nls/nls_ucs2_data.h new file mode 100644 index 000000000000..1f454dc0f4e0 --- /dev/null +++ b/fs/nls/nls_ucs2_data.h @@ -0,0 +1,15 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ + +#ifndef _NLS_UCS2_DATA_H +#define _NLS_UCS2_DATA_H + +struct UniCaseRange { + wchar_t start; + wchar_t end; + signed char *table; +}; + +extern signed char NlsUniUpperTable[512]; +extern const struct UniCaseRange NlsUniUpperRange[]; + +#endif /* _NLS_UCS2_DATA_H */ diff --git a/fs/smb/server/uniupr.h b/fs/nls/nls_ucs2_utils.c index 26583b776897..a69781c54dd8 100644 --- a/fs/smb/server/uniupr.h +++ b/fs/nls/nls_ucs2_utils.c @@ -1,19 +1,27 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ +// SPDX-License-Identifier: GPL-2.0-or-later /* * Some of the source code in this file came from fs/cifs/uniupr.h * Copyright (c) International Business Machines Corp., 2000,2002 * - * uniupr.h - Unicode compressed case ranges + * Some of the source code in this file came from fs/cifs/cifs_unicode.c + * + * Copyright (c) International Business Machines Corp., 2000,2009 + * Modified by Steve French (sfrench@us.ibm.com) + * Modified by Namjae Jeon (linkinjeon@kernel.org) * */ -#ifndef __KSMBD_UNIUPR_H -#define __KSMBD_UNIUPR_H +#include <linux/fs.h> +#include <linux/module.h> +#include <linux/slab.h> +#include <asm/unaligned.h> +#include "nls_ucs2_utils.h" + +MODULE_LICENSE("GPL"); -#ifndef UNIUPR_NOUPPER /* * Latin upper case */ -signed char SmbUniUpperTable[512] = { +signed char NlsUniUpperTable[512] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 000-00f */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 010-01f */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 020-02f */ @@ -51,6 +59,7 @@ signed char SmbUniUpperTable[512] = { 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 1e0-1ef */ 0, 0, -1, -2, 0, -1, 0, 0, 0, -1, 0, -1, 0, -1, 0, -1, /* 1f0-1ff */ }; +EXPORT_SYMBOL_GPL(NlsUniUpperTable); /* Upper case range - Greek */ static signed char UniCaseRangeU03a0[47] = { @@ -126,7 +135,7 @@ static signed char UniCaseRangeUff40[27] = { /* * Upper Case Range */ -const struct UniCaseRange SmbUniUpperRange[] = { +const struct UniCaseRange NlsUniUpperRange[] = { {0x03a0, 0x03ce, UniCaseRangeU03a0}, {0x0430, 0x045f, UniCaseRangeU0430}, {0x0490, 0x04cc, UniCaseRangeU0490}, @@ -134,135 +143,4 @@ const struct UniCaseRange SmbUniUpperRange[] = { {0xff40, 0xff5a, UniCaseRangeUff40}, {0} }; -#endif - -#ifndef UNIUPR_NOLOWER -/* - * Latin lower case - */ -signed char CifsUniLowerTable[512] = { - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 000-00f */ - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 010-01f */ - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 020-02f */ - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 030-03f */ - 0, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, /* 040-04f */ - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 0, 0, - 0, 0, 0, /* 050-05f */ - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 060-06f */ - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 070-07f */ - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 080-08f */ - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 090-09f */ - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0a0-0af */ - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0b0-0bf */ - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, /* 0c0-0cf */ - 32, 32, 32, 32, 32, 32, 32, 0, 32, 32, 32, 32, - 32, 32, 32, 0, /* 0d0-0df */ - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0e0-0ef */ - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0f0-0ff */ - 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, /* 100-10f */ - 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, /* 110-11f */ - 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, /* 120-12f */ - 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, /* 130-13f */ - 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, /* 140-14f */ - 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, /* 150-15f */ - 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, /* 160-16f */ - 1, 0, 1, 0, 1, 0, 1, 0, -121, 1, 0, 1, 0, 1, 0, - 0, /* 170-17f */ - 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 79, - 0, /* 180-18f */ - 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, /* 190-19f */ - 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, /* 1a0-1af */ - 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, /* 1b0-1bf */ - 0, 0, 0, 0, 2, 1, 0, 2, 1, 0, 2, 1, 0, 1, 0, 1, /* 1c0-1cf */ - 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, /* 1d0-1df */ - 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, /* 1e0-1ef */ - 0, 2, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, /* 1f0-1ff */ -}; - -/* Lower case range - Greek */ -static signed char UniCaseRangeL0380[44] = { - 0, 0, 0, 0, 0, 0, 38, 0, 37, 37, 37, 0, 64, 0, 63, 63, /* 380-38f */ - 0, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, /* 390-39f */ - 32, 32, 0, 32, 32, 32, 32, 32, 32, 32, 32, 32, -}; - -/* Lower case range - Cyrillic */ -static signed char UniCaseRangeL0400[48] = { - 0, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, - 0, 80, 80, /* 400-40f */ - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, /* 410-41f */ - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, /* 420-42f */ -}; - -/* Lower case range - Extended cyrillic */ -static signed char UniCaseRangeL0490[60] = { - 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, /* 490-49f */ - 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, /* 4a0-4af */ - 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, /* 4b0-4bf */ - 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, -}; - -/* Lower case range - Extended latin and greek */ -static signed char UniCaseRangeL1e00[504] = { - 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, /* 1e00-1e0f */ - 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, /* 1e10-1e1f */ - 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, /* 1e20-1e2f */ - 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, /* 1e30-1e3f */ - 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, /* 1e40-1e4f */ - 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, /* 1e50-1e5f */ - 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, /* 1e60-1e6f */ - 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, /* 1e70-1e7f */ - 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, /* 1e80-1e8f */ - 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, /* 1e90-1e9f */ - 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, /* 1ea0-1eaf */ - 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, /* 1eb0-1ebf */ - 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, /* 1ec0-1ecf */ - 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, /* 1ed0-1edf */ - 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, /* 1ee0-1eef */ - 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, /* 1ef0-1eff */ - 0, 0, 0, 0, 0, 0, 0, 0, -8, -8, -8, -8, -8, -8, -8, -8, /* 1f00-1f0f */ - 0, 0, 0, 0, 0, 0, 0, 0, -8, -8, -8, -8, -8, -8, 0, 0, /* 1f10-1f1f */ - 0, 0, 0, 0, 0, 0, 0, 0, -8, -8, -8, -8, -8, -8, -8, -8, /* 1f20-1f2f */ - 0, 0, 0, 0, 0, 0, 0, 0, -8, -8, -8, -8, -8, -8, -8, -8, /* 1f30-1f3f */ - 0, 0, 0, 0, 0, 0, 0, 0, -8, -8, -8, -8, -8, -8, 0, 0, /* 1f40-1f4f */ - 0, 0, 0, 0, 0, 0, 0, 0, 0, -8, 0, -8, 0, -8, 0, -8, /* 1f50-1f5f */ - 0, 0, 0, 0, 0, 0, 0, 0, -8, -8, -8, -8, -8, -8, -8, -8, /* 1f60-1f6f */ - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 1f70-1f7f */ - 0, 0, 0, 0, 0, 0, 0, 0, -8, -8, -8, -8, -8, -8, -8, -8, /* 1f80-1f8f */ - 0, 0, 0, 0, 0, 0, 0, 0, -8, -8, -8, -8, -8, -8, -8, -8, /* 1f90-1f9f */ - 0, 0, 0, 0, 0, 0, 0, 0, -8, -8, -8, -8, -8, -8, -8, -8, /* 1fa0-1faf */ - 0, 0, 0, 0, 0, 0, 0, 0, -8, -8, -74, -74, -9, 0, 0, 0, /* 1fb0-1fbf */ - 0, 0, 0, 0, 0, 0, 0, 0, -86, -86, -86, -86, -9, 0, - 0, 0, /* 1fc0-1fcf */ - 0, 0, 0, 0, 0, 0, 0, 0, -8, -8, -100, -100, 0, 0, 0, 0, /* 1fd0-1fdf */ - 0, 0, 0, 0, 0, 0, 0, 0, -8, -8, -112, -112, -7, 0, - 0, 0, /* 1fe0-1fef */ - 0, 0, 0, 0, 0, 0, 0, 0, -}; - -/* Lower case range - Wide latin */ -static signed char UniCaseRangeLff20[27] = { - 0, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, - 32, /* ff20-ff2f */ - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, -}; - -/* - * Lower Case Range - */ -const struct UniCaseRange CifsUniLowerRange[] = { - {0x0380, 0x03ab, UniCaseRangeL0380}, - {0x0400, 0x042f, UniCaseRangeL0400}, - {0x0490, 0x04cb, UniCaseRangeL0490}, - {0x1e00, 0x1ff7, UniCaseRangeL1e00}, - {0xff20, 0xff3a, UniCaseRangeLff20}, - {0} -}; -#endif - -#endif /* __KSMBD_UNIUPR_H */ +EXPORT_SYMBOL_GPL(NlsUniUpperRange); diff --git a/fs/nls/nls_ucs2_utils.h b/fs/nls/nls_ucs2_utils.h new file mode 100644 index 000000000000..ef18d30db1d0 --- /dev/null +++ b/fs/nls/nls_ucs2_utils.h @@ -0,0 +1,285 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Some of the source code in this file came from fs/cifs/cifs_unicode.c + * and then via server/unicode.c + * cifs_unicode: Unicode kernel case support + * + * Function: + * Convert a unicode character to upper or lower case using + * compressed tables. + * + * Copyright (c) International Business Machines Corp., 2000,2009 + * + * + * Notes: + * These APIs are based on the C library functions. The semantics + * should match the C functions but with expanded size operands. + * + * The upper/lower functions are based on a table created by mkupr. + * This is a compressed table of upper and lower case conversion. + * + */ +#ifndef _NLS_UCS2_UTILS_H +#define _NLS_UCS2_UTILS_H + +#include <asm/byteorder.h> +#include <linux/types.h> +#include <linux/nls.h> +#include <linux/unicode.h> +#include "nls_ucs2_data.h" + +/* + * Windows maps these to the user defined 16 bit Unicode range since they are + * reserved symbols (along with \ and /), otherwise illegal to store + * in filenames in NTFS + */ +#define UNI_ASTERISK ((__u16)('*' + 0xF000)) +#define UNI_QUESTION ((__u16)('?' + 0xF000)) +#define UNI_COLON ((__u16)(':' + 0xF000)) +#define UNI_GRTRTHAN ((__u16)('>' + 0xF000)) +#define UNI_LESSTHAN ((__u16)('<' + 0xF000)) +#define UNI_PIPE ((__u16)('|' + 0xF000)) +#define UNI_SLASH ((__u16)('\\' + 0xF000)) + +/* + * UniStrcat: Concatenate the second string to the first + * + * Returns: + * Address of the first string + */ +static inline wchar_t *UniStrcat(wchar_t *ucs1, const wchar_t *ucs2) +{ + wchar_t *anchor = ucs1; /* save a pointer to start of ucs1 */ + + while (*ucs1++) + /*NULL*/; /* To end of first string */ + ucs1--; /* Return to the null */ + while ((*ucs1++ = *ucs2++)) + /*NULL*/; /* copy string 2 over */ + return anchor; +} + +/* + * UniStrchr: Find a character in a string + * + * Returns: + * Address of first occurrence of character in string + * or NULL if the character is not in the string + */ +static inline wchar_t *UniStrchr(const wchar_t *ucs, wchar_t uc) +{ + while ((*ucs != uc) && *ucs) + ucs++; + + if (*ucs == uc) + return (wchar_t *)ucs; + return NULL; +} + +/* + * UniStrcmp: Compare two strings + * + * Returns: + * < 0: First string is less than second + * = 0: Strings are equal + * > 0: First string is greater than second + */ +static inline int UniStrcmp(const wchar_t *ucs1, const wchar_t *ucs2) +{ + while ((*ucs1 == *ucs2) && *ucs1) { + ucs1++; + ucs2++; + } + return (int)*ucs1 - (int)*ucs2; +} + +/* + * UniStrcpy: Copy a string + */ +static inline wchar_t *UniStrcpy(wchar_t *ucs1, const wchar_t *ucs2) +{ + wchar_t *anchor = ucs1; /* save the start of result string */ + + while ((*ucs1++ = *ucs2++)) + /*NULL*/; + return anchor; +} + +/* + * UniStrlen: Return the length of a string (in 16 bit Unicode chars not bytes) + */ +static inline size_t UniStrlen(const wchar_t *ucs1) +{ + int i = 0; + + while (*ucs1++) + i++; + return i; +} + +/* + * UniStrnlen: Return the length (in 16 bit Unicode chars not bytes) of a + * string (length limited) + */ +static inline size_t UniStrnlen(const wchar_t *ucs1, int maxlen) +{ + int i = 0; + + while (*ucs1++) { + i++; + if (i >= maxlen) + break; + } + return i; +} + +/* + * UniStrncat: Concatenate length limited string + */ +static inline wchar_t *UniStrncat(wchar_t *ucs1, const wchar_t *ucs2, size_t n) +{ + wchar_t *anchor = ucs1; /* save pointer to string 1 */ + + while (*ucs1++) + /*NULL*/; + ucs1--; /* point to null terminator of s1 */ + while (n-- && (*ucs1 = *ucs2)) { /* copy s2 after s1 */ + ucs1++; + ucs2++; + } + *ucs1 = 0; /* Null terminate the result */ + return anchor; +} + +/* + * UniStrncmp: Compare length limited string + */ +static inline int UniStrncmp(const wchar_t *ucs1, const wchar_t *ucs2, size_t n) +{ + if (!n) + return 0; /* Null strings are equal */ + while ((*ucs1 == *ucs2) && *ucs1 && --n) { + ucs1++; + ucs2++; + } + return (int)*ucs1 - (int)*ucs2; +} + +/* + * UniStrncmp_le: Compare length limited string - native to little-endian + */ +static inline int +UniStrncmp_le(const wchar_t *ucs1, const wchar_t *ucs2, size_t n) +{ + if (!n) + return 0; /* Null strings are equal */ + while ((*ucs1 == __le16_to_cpu(*ucs2)) && *ucs1 && --n) { + ucs1++; + ucs2++; + } + return (int)*ucs1 - (int)__le16_to_cpu(*ucs2); +} + +/* + * UniStrncpy: Copy length limited string with pad + */ +static inline wchar_t *UniStrncpy(wchar_t *ucs1, const wchar_t *ucs2, size_t n) +{ + wchar_t *anchor = ucs1; + + while (n-- && *ucs2) /* Copy the strings */ + *ucs1++ = *ucs2++; + + n++; + while (n--) /* Pad with nulls */ + *ucs1++ = 0; + return anchor; +} + +/* + * UniStrncpy_le: Copy length limited string with pad to little-endian + */ +static inline wchar_t *UniStrncpy_le(wchar_t *ucs1, const wchar_t *ucs2, size_t n) +{ + wchar_t *anchor = ucs1; + + while (n-- && *ucs2) /* Copy the strings */ + *ucs1++ = __le16_to_cpu(*ucs2++); + + n++; + while (n--) /* Pad with nulls */ + *ucs1++ = 0; + return anchor; +} + +/* + * UniStrstr: Find a string in a string + * + * Returns: + * Address of first match found + * NULL if no matching string is found + */ +static inline wchar_t *UniStrstr(const wchar_t *ucs1, const wchar_t *ucs2) +{ + const wchar_t *anchor1 = ucs1; + const wchar_t *anchor2 = ucs2; + + while (*ucs1) { + if (*ucs1 == *ucs2) { + /* Partial match found */ + ucs1++; + ucs2++; + } else { + if (!*ucs2) /* Match found */ + return (wchar_t *)anchor1; + ucs1 = ++anchor1; /* No match */ + ucs2 = anchor2; + } + } + + if (!*ucs2) /* Both end together */ + return (wchar_t *)anchor1; /* Match found */ + return NULL; /* No match */ +} + +#ifndef UNIUPR_NOUPPER +/* + * UniToupper: Convert a unicode character to upper case + */ +static inline wchar_t UniToupper(register wchar_t uc) +{ + register const struct UniCaseRange *rp; + + if (uc < sizeof(NlsUniUpperTable)) { + /* Latin characters */ + return uc + NlsUniUpperTable[uc]; /* Use base tables */ + } + + rp = NlsUniUpperRange; /* Use range tables */ + while (rp->start) { + if (uc < rp->start) /* Before start of range */ + return uc; /* Uppercase = input */ + if (uc <= rp->end) /* In range */ + return uc + rp->table[uc - rp->start]; + rp++; /* Try next range */ + } + return uc; /* Past last range */ +} + +/* + * UniStrupr: Upper case a unicode string + */ +static inline __le16 *UniStrupr(register __le16 *upin) +{ + register __le16 *up; + + up = upin; + while (*up) { /* For all characters */ + *up = cpu_to_le16(UniToupper(le16_to_cpu(*up))); + up++; + } + return upin; /* Return input pointer */ +} +#endif /* UNIUPR_NOUPPER */ + +#endif /* _NLS_UCS2_UTILS_H */ diff --git a/fs/notify/dnotify/dnotify.c b/fs/notify/dnotify/dnotify.c index 190aa717fa32..1cb9ad7e884e 100644 --- a/fs/notify/dnotify/dnotify.c +++ b/fs/notify/dnotify/dnotify.c @@ -39,9 +39,9 @@ static void __init dnotify_sysctl_init(void) #define dnotify_sysctl_init() do { } while (0) #endif -static struct kmem_cache *dnotify_struct_cache __read_mostly; -static struct kmem_cache *dnotify_mark_cache __read_mostly; -static struct fsnotify_group *dnotify_group __read_mostly; +static struct kmem_cache *dnotify_struct_cache __ro_after_init; +static struct kmem_cache *dnotify_mark_cache __ro_after_init; +static struct fsnotify_group *dnotify_group __ro_after_init; /* * dnotify will attach one of these to each inode (i_fsnotify_marks) which @@ -199,7 +199,7 @@ void dnotify_flush(struct file *filp, fl_owner_t id) } /* this conversion is done only at watch creation */ -static __u32 convert_arg(unsigned long arg) +static __u32 convert_arg(unsigned int arg) { __u32 new_mask = FS_EVENT_ON_CHILD; @@ -258,14 +258,14 @@ static int attach_dn(struct dnotify_struct *dn, struct dnotify_mark *dn_mark, * up here. Allocate both a mark for fsnotify to add and a dnotify_struct to be * attached to the fsnotify_mark. */ -int fcntl_dirnotify(int fd, struct file *filp, unsigned long arg) +int fcntl_dirnotify(int fd, struct file *filp, unsigned int arg) { struct dnotify_mark *new_dn_mark, *dn_mark; struct fsnotify_mark *new_fsn_mark, *fsn_mark; struct dnotify_struct *dn; struct inode *inode; fl_owner_t id = current->files; - struct file *f; + struct file *f = NULL; int destroy = 0, error = 0; __u32 mask; @@ -345,7 +345,7 @@ int fcntl_dirnotify(int fd, struct file *filp, unsigned long arg) } rcu_read_lock(); - f = lookup_fd_rcu(fd); + f = lookup_fdget_rcu(fd); rcu_read_unlock(); /* if (f != filp) means that we lost a race and another task/thread @@ -392,6 +392,8 @@ out_err: fsnotify_put_mark(new_fsn_mark); if (dn) kmem_cache_free(dnotify_struct_cache, dn); + if (f) + fput(f); return error; } diff --git a/fs/notify/fanotify/fanotify.h b/fs/notify/fanotify/fanotify.h index e8a3c28c5d12..6936671e148d 100644 --- a/fs/notify/fanotify/fanotify.h +++ b/fs/notify/fanotify/fanotify.h @@ -275,9 +275,9 @@ static inline void fanotify_init_event(struct fanotify_event *event, #define FANOTIFY_INLINE_FH(name, size) \ struct { \ - struct fanotify_fh (name); \ + struct fanotify_fh name; \ /* Space for object_fh.buf[] - access with fanotify_fh_buf() */ \ - unsigned char _inline_fh_buf[(size)]; \ + unsigned char _inline_fh_buf[size]; \ } struct fanotify_fid_event { diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c index f69c451018e3..4d765c72496f 100644 --- a/fs/notify/fanotify/fanotify_user.c +++ b/fs/notify/fanotify/fanotify_user.c @@ -112,10 +112,10 @@ static void __init fanotify_sysctls_init(void) extern const struct fsnotify_ops fanotify_fsnotify_ops; -struct kmem_cache *fanotify_mark_cache __read_mostly; -struct kmem_cache *fanotify_fid_event_cachep __read_mostly; -struct kmem_cache *fanotify_path_event_cachep __read_mostly; -struct kmem_cache *fanotify_perm_event_cachep __read_mostly; +struct kmem_cache *fanotify_mark_cache __ro_after_init; +struct kmem_cache *fanotify_fid_event_cachep __ro_after_init; +struct kmem_cache *fanotify_path_event_cachep __ro_after_init; +struct kmem_cache *fanotify_perm_event_cachep __ro_after_init; #define FANOTIFY_EVENT_ALIGN 4 #define FANOTIFY_FID_INFO_HDR_LEN \ @@ -1585,16 +1585,25 @@ static int fanotify_test_fsid(struct dentry *dentry, __kernel_fsid_t *fsid) } /* Check if filesystem can encode a unique fid */ -static int fanotify_test_fid(struct dentry *dentry) +static int fanotify_test_fid(struct dentry *dentry, unsigned int flags) { + unsigned int mark_type = flags & FANOTIFY_MARK_TYPE_BITS; + const struct export_operations *nop = dentry->d_sb->s_export_op; + + /* + * We need to make sure that the filesystem supports encoding of + * file handles so user can use name_to_handle_at() to compare fids + * reported with events to the file handle of watched objects. + */ + if (!exportfs_can_encode_fid(nop)) + return -EOPNOTSUPP; + /* - * We need to make sure that the file system supports at least - * encoding a file handle so user can use name_to_handle_at() to - * compare fid returned with event to the file handle of watched - * objects. However, even the relaxed AT_HANDLE_FID flag requires - * at least empty export_operations for ecoding unique file ids. + * For sb/mount mark, we also need to make sure that the filesystem + * supports decoding file handles, so user has a way to map back the + * reported fids to filesystem objects. */ - if (!dentry->d_sb->s_export_op) + if (mark_type != FAN_MARK_INODE && !exportfs_can_decode_fh(nop)) return -EOPNOTSUPP; return 0; @@ -1812,7 +1821,7 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask, if (ret) goto path_put_and_out; - ret = fanotify_test_fid(path.dentry); + ret = fanotify_test_fid(path.dentry, flags); if (ret) goto path_put_and_out; diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c index 1c4bfdab008d..a3809ae92170 100644 --- a/fs/notify/inotify/inotify_user.c +++ b/fs/notify/inotify/inotify_user.c @@ -49,7 +49,7 @@ /* configurable via /proc/sys/fs/inotify/ */ static int inotify_max_queued_events __read_mostly; -struct kmem_cache *inotify_inode_mark_cachep __read_mostly; +struct kmem_cache *inotify_inode_mark_cachep __ro_after_init; #ifdef CONFIG_SYSCTL diff --git a/fs/nsfs.c b/fs/nsfs.c index f602a96a1afe..9a4b228d42fa 100644 --- a/fs/nsfs.c +++ b/fs/nsfs.c @@ -84,7 +84,7 @@ slow: return -ENOMEM; } inode->i_ino = ns->inum; - inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode); + simple_inode_init_ts(inode); inode->i_flags |= S_IMMUTABLE; inode->i_mode = S_IFREG | S_IRUGO; inode->i_fop = &ns_file_operations; diff --git a/fs/ntfs/Kconfig b/fs/ntfs/Kconfig index f93e69a61283..7b2509741735 100644 --- a/fs/ntfs/Kconfig +++ b/fs/ntfs/Kconfig @@ -1,6 +1,7 @@ # SPDX-License-Identifier: GPL-2.0-only config NTFS_FS tristate "NTFS file system support" + select BUFFER_HEAD select NLS help NTFS is the file system of Microsoft Windows NT, 2000, XP and 2003. diff --git a/fs/ntfs/aops.c b/fs/ntfs/aops.c index 4e158bce4192..71e31e789b29 100644 --- a/fs/ntfs/aops.c +++ b/fs/ntfs/aops.c @@ -145,13 +145,12 @@ still_busy: } /** - * ntfs_read_block - fill a @page of an address space with data - * @page: page cache page to fill with data + * ntfs_read_block - fill a @folio of an address space with data + * @folio: page cache folio to fill with data * - * Fill the page @page of the address space belonging to the @page->host inode. * We read each buffer asynchronously and when all buffers are read in, our io * completion handler ntfs_end_buffer_read_async(), if required, automatically - * applies the mst fixups to the page before finally marking it uptodate and + * applies the mst fixups to the folio before finally marking it uptodate and * unlocking it. * * We only enforce allocated_size limit because i_size is checked for in @@ -161,7 +160,7 @@ still_busy: * * Contains an adapted version of fs/buffer.c::block_read_full_folio(). */ -static int ntfs_read_block(struct page *page) +static int ntfs_read_block(struct folio *folio) { loff_t i_size; VCN vcn; @@ -178,7 +177,7 @@ static int ntfs_read_block(struct page *page) int i, nr; unsigned char blocksize_bits; - vi = page->mapping->host; + vi = folio->mapping->host; ni = NTFS_I(vi); vol = ni->vol; @@ -188,15 +187,10 @@ static int ntfs_read_block(struct page *page) blocksize = vol->sb->s_blocksize; blocksize_bits = vol->sb->s_blocksize_bits; - if (!page_has_buffers(page)) { - create_empty_buffers(page, blocksize, 0); - if (unlikely(!page_has_buffers(page))) { - unlock_page(page); - return -ENOMEM; - } - } - bh = head = page_buffers(page); - BUG_ON(!bh); + head = folio_buffers(folio); + if (!head) + head = create_empty_buffers(folio, blocksize, 0); + bh = head; /* * We may be racing with truncate. To avoid some of the problems we @@ -205,11 +199,11 @@ static int ntfs_read_block(struct page *page) * may leave some buffers unmapped which are now allocated. This is * not a problem since these buffers will just get mapped when a write * occurs. In case of a shrinking truncate, we will detect this later - * on due to the runlist being incomplete and if the page is being + * on due to the runlist being incomplete and if the folio is being * fully truncated, truncate will throw it away as soon as we unlock * it so no need to worry what we do with it. */ - iblock = (s64)page->index << (PAGE_SHIFT - blocksize_bits); + iblock = (s64)folio->index << (PAGE_SHIFT - blocksize_bits); read_lock_irqsave(&ni->size_lock, flags); lblock = (ni->allocated_size + blocksize - 1) >> blocksize_bits; init_size = ni->initialized_size; @@ -221,7 +215,7 @@ static int ntfs_read_block(struct page *page) } zblock = (init_size + blocksize - 1) >> blocksize_bits; - /* Loop through all the buffers in the page. */ + /* Loop through all the buffers in the folio. */ rl = NULL; nr = i = 0; do { @@ -299,7 +293,7 @@ lock_retry_remap: if (!err) err = -EIO; bh->b_blocknr = -1; - SetPageError(page); + folio_set_error(folio); ntfs_error(vol->sb, "Failed to read from inode 0x%lx, " "attribute type 0x%x, vcn 0x%llx, " "offset 0x%x because its location on " @@ -312,13 +306,13 @@ lock_retry_remap: /* * Either iblock was outside lblock limits or * ntfs_rl_vcn_to_lcn() returned error. Just zero that portion - * of the page and set the buffer uptodate. + * of the folio and set the buffer uptodate. */ handle_hole: bh->b_blocknr = -1UL; clear_buffer_mapped(bh); handle_zblock: - zero_user(page, i * blocksize, blocksize); + folio_zero_range(folio, i * blocksize, blocksize); if (likely(!err)) set_buffer_uptodate(bh); } while (i++, iblock++, (bh = bh->b_this_page) != head); @@ -349,11 +343,11 @@ handle_zblock: return 0; } /* No i/o was scheduled on any of the buffers. */ - if (likely(!PageError(page))) - SetPageUptodate(page); + if (likely(!folio_test_error(folio))) + folio_mark_uptodate(folio); else /* Signal synchronous i/o error. */ nr = -EIO; - unlock_page(page); + folio_unlock(folio); return nr; } @@ -433,7 +427,7 @@ retry_readpage: /* NInoNonResident() == NInoIndexAllocPresent() */ if (NInoNonResident(ni)) { /* Normal, non-resident data stream. */ - return ntfs_read_block(page); + return ntfs_read_block(folio); } /* * Attribute is resident, implying it is not compressed or encrypted. @@ -507,28 +501,29 @@ err_out: #ifdef NTFS_RW /** - * ntfs_write_block - write a @page to the backing store - * @page: page cache page to write out + * ntfs_write_block - write a @folio to the backing store + * @folio: page cache folio to write out * @wbc: writeback control structure * - * This function is for writing pages belonging to non-resident, non-mst + * This function is for writing folios belonging to non-resident, non-mst * protected attributes to their backing store. * - * For a page with buffers, map and write the dirty buffers asynchronously - * under page writeback. For a page without buffers, create buffers for the - * page, then proceed as above. + * For a folio with buffers, map and write the dirty buffers asynchronously + * under folio writeback. For a folio without buffers, create buffers for the + * folio, then proceed as above. * - * If a page doesn't have buffers the page dirty state is definitive. If a page - * does have buffers, the page dirty state is just a hint, and the buffer dirty - * state is definitive. (A hint which has rules: dirty buffers against a clean - * page is illegal. Other combinations are legal and need to be handled. In - * particular a dirty page containing clean buffers for example.) + * If a folio doesn't have buffers the folio dirty state is definitive. If + * a folio does have buffers, the folio dirty state is just a hint, + * and the buffer dirty state is definitive. (A hint which has rules: + * dirty buffers against a clean folio is illegal. Other combinations are + * legal and need to be handled. In particular a dirty folio containing + * clean buffers for example.) * * Return 0 on success and -errno on error. * * Based on ntfs_read_block() and __block_write_full_folio(). */ -static int ntfs_write_block(struct page *page, struct writeback_control *wbc) +static int ntfs_write_block(struct folio *folio, struct writeback_control *wbc) { VCN vcn; LCN lcn; @@ -546,41 +541,29 @@ static int ntfs_write_block(struct page *page, struct writeback_control *wbc) bool need_end_writeback; unsigned char blocksize_bits; - vi = page->mapping->host; + vi = folio->mapping->host; ni = NTFS_I(vi); vol = ni->vol; ntfs_debug("Entering for inode 0x%lx, attribute type 0x%x, page index " - "0x%lx.", ni->mft_no, ni->type, page->index); + "0x%lx.", ni->mft_no, ni->type, folio->index); BUG_ON(!NInoNonResident(ni)); BUG_ON(NInoMstProtected(ni)); blocksize = vol->sb->s_blocksize; blocksize_bits = vol->sb->s_blocksize_bits; - if (!page_has_buffers(page)) { - BUG_ON(!PageUptodate(page)); - create_empty_buffers(page, blocksize, + head = folio_buffers(folio); + if (!head) { + BUG_ON(!folio_test_uptodate(folio)); + head = create_empty_buffers(folio, blocksize, (1 << BH_Uptodate) | (1 << BH_Dirty)); - if (unlikely(!page_has_buffers(page))) { - ntfs_warning(vol->sb, "Error allocating page " - "buffers. Redirtying page so we try " - "again later."); - /* - * Put the page back on mapping->dirty_pages, but leave - * its buffers' dirty state as-is. - */ - redirty_page_for_writepage(wbc, page); - unlock_page(page); - return 0; - } } - bh = head = page_buffers(page); - BUG_ON(!bh); + bh = head; /* NOTE: Different naming scheme to ntfs_read_block()! */ - /* The first block in the page. */ - block = (s64)page->index << (PAGE_SHIFT - blocksize_bits); + /* The first block in the folio. */ + block = (s64)folio->index << (PAGE_SHIFT - blocksize_bits); read_lock_irqsave(&ni->size_lock, flags); i_size = i_size_read(vi); @@ -597,14 +580,14 @@ static int ntfs_write_block(struct page *page, struct writeback_control *wbc) * Be very careful. We have no exclusion from block_dirty_folio * here, and the (potentially unmapped) buffers may become dirty at * any time. If a buffer becomes dirty here after we've inspected it - * then we just miss that fact, and the page stays dirty. + * then we just miss that fact, and the folio stays dirty. * * Buffers outside i_size may be dirtied by block_dirty_folio; * handle that here by just cleaning them. */ /* - * Loop through all the buffers in the page, mapping all the dirty + * Loop through all the buffers in the folio, mapping all the dirty * buffers to disk addresses and handling any aliases from the * underlying block device's mapping. */ @@ -616,13 +599,13 @@ static int ntfs_write_block(struct page *page, struct writeback_control *wbc) if (unlikely(block >= dblock)) { /* * Mapped buffers outside i_size will occur, because - * this page can be outside i_size when there is a + * this folio can be outside i_size when there is a * truncate in progress. The contents of such buffers * were zeroed by ntfs_writepage(). * * FIXME: What about the small race window where * ntfs_writepage() has not done any clearing because - * the page was within i_size but before we get here, + * the folio was within i_size but before we get here, * vmtruncate() modifies i_size? */ clear_buffer_dirty(bh); @@ -638,38 +621,38 @@ static int ntfs_write_block(struct page *page, struct writeback_control *wbc) if (unlikely((block >= iblock) && (initialized_size < i_size))) { /* - * If this page is fully outside initialized - * size, zero out all pages between the current - * initialized size and the current page. Just + * If this folio is fully outside initialized + * size, zero out all folios between the current + * initialized size and the current folio. Just * use ntfs_read_folio() to do the zeroing * transparently. */ if (block > iblock) { // TODO: - // For each page do: - // - read_cache_page() - // Again for each page do: - // - wait_on_page_locked() - // - Check (PageUptodate(page) && - // !PageError(page)) + // For each folio do: + // - read_cache_folio() + // Again for each folio do: + // - wait_on_folio_locked() + // - Check (folio_test_uptodate(folio) && + // !folio_test_error(folio)) // Update initialized size in the attribute and // in the inode. - // Again, for each page do: + // Again, for each folio do: // block_dirty_folio(); - // put_page() + // folio_put() // We don't need to wait on the writes. // Update iblock. } /* - * The current page straddles initialized size. Zero + * The current folio straddles initialized size. Zero * all non-uptodate buffers and set them uptodate (and * dirty?). Note, there aren't any non-uptodate buffers - * if the page is uptodate. - * FIXME: For an uptodate page, the buffers may need to + * if the folio is uptodate. + * FIXME: For an uptodate folio, the buffers may need to * be written out because they were not initialized on * disk before. */ - if (!PageUptodate(page)) { + if (!folio_test_uptodate(folio)) { // TODO: // Zero any non-uptodate buffers up to i_size. // Set them uptodate and dirty. @@ -727,14 +710,14 @@ lock_retry_remap: unsigned long *bpos, *bend; /* Check if the buffer is zero. */ - kaddr = kmap_atomic(page); - bpos = (unsigned long *)(kaddr + bh_offset(bh)); - bend = (unsigned long *)((u8*)bpos + blocksize); + kaddr = kmap_local_folio(folio, bh_offset(bh)); + bpos = (unsigned long *)kaddr; + bend = (unsigned long *)(kaddr + blocksize); do { if (unlikely(*bpos)) break; } while (likely(++bpos < bend)); - kunmap_atomic(kaddr); + kunmap_local(kaddr); if (bpos == bend) { /* * Buffer is zero and sparse, no need to write @@ -774,7 +757,7 @@ lock_retry_remap: if (err == -ENOENT || lcn == LCN_ENOENT) { bh->b_blocknr = -1; clear_buffer_dirty(bh); - zero_user(page, bh_offset(bh), blocksize); + folio_zero_range(folio, bh_offset(bh), blocksize); set_buffer_uptodate(bh); err = 0; continue; @@ -801,7 +784,7 @@ lock_retry_remap: bh = head; /* Just an optimization, so ->read_folio() is not called later. */ - if (unlikely(!PageUptodate(page))) { + if (unlikely(!folio_test_uptodate(folio))) { int uptodate = 1; do { if (!buffer_uptodate(bh)) { @@ -811,7 +794,7 @@ lock_retry_remap: } } while ((bh = bh->b_this_page) != head); if (uptodate) - SetPageUptodate(page); + folio_mark_uptodate(folio); } /* Setup all mapped, dirty buffers for async write i/o. */ @@ -826,7 +809,7 @@ lock_retry_remap: } else if (unlikely(err)) { /* * For the error case. The buffer may have been set - * dirty during attachment to a dirty page. + * dirty during attachment to a dirty folio. */ if (err != -ENOMEM) clear_buffer_dirty(bh); @@ -839,20 +822,20 @@ lock_retry_remap: err = 0; else if (err == -ENOMEM) { ntfs_warning(vol->sb, "Error allocating memory. " - "Redirtying page so we try again " + "Redirtying folio so we try again " "later."); /* - * Put the page back on mapping->dirty_pages, but + * Put the folio back on mapping->dirty_pages, but * leave its buffer's dirty state as-is. */ - redirty_page_for_writepage(wbc, page); + folio_redirty_for_writepage(wbc, folio); err = 0; } else - SetPageError(page); + folio_set_error(folio); } - BUG_ON(PageWriteback(page)); - set_page_writeback(page); /* Keeps try_to_free_buffers() away. */ + BUG_ON(folio_test_writeback(folio)); + folio_start_writeback(folio); /* Keeps try_to_free_buffers() away. */ /* Submit the prepared buffers for i/o. */ need_end_writeback = true; @@ -864,11 +847,11 @@ lock_retry_remap: } bh = next; } while (bh != head); - unlock_page(page); + folio_unlock(folio); - /* If no i/o was started, need to end_page_writeback(). */ + /* If no i/o was started, need to end writeback here. */ if (unlikely(need_end_writeback)) - end_page_writeback(page); + folio_end_writeback(folio); ntfs_debug("Done."); return err; @@ -1337,8 +1320,9 @@ done: */ static int ntfs_writepage(struct page *page, struct writeback_control *wbc) { + struct folio *folio = page_folio(page); loff_t i_size; - struct inode *vi = page->mapping->host; + struct inode *vi = folio->mapping->host; ntfs_inode *base_ni = NULL, *ni = NTFS_I(vi); char *addr; ntfs_attr_search_ctx *ctx = NULL; @@ -1347,14 +1331,13 @@ static int ntfs_writepage(struct page *page, struct writeback_control *wbc) int err; retry_writepage: - BUG_ON(!PageLocked(page)); + BUG_ON(!folio_test_locked(folio)); i_size = i_size_read(vi); - /* Is the page fully outside i_size? (truncate in progress) */ - if (unlikely(page->index >= (i_size + PAGE_SIZE - 1) >> + /* Is the folio fully outside i_size? (truncate in progress) */ + if (unlikely(folio->index >= (i_size + PAGE_SIZE - 1) >> PAGE_SHIFT)) { - struct folio *folio = page_folio(page); /* - * The page may have dirty, unmapped buffers. Make them + * The folio may have dirty, unmapped buffers. Make them * freeable here, so the page does not leak. */ block_invalidate_folio(folio, 0, folio_size(folio)); @@ -1373,7 +1356,7 @@ retry_writepage: if (ni->type != AT_INDEX_ALLOCATION) { /* If file is encrypted, deny access, just like NT4. */ if (NInoEncrypted(ni)) { - unlock_page(page); + folio_unlock(folio); BUG_ON(ni->type != AT_DATA); ntfs_debug("Denying write access to encrypted file."); return -EACCES; @@ -1384,14 +1367,14 @@ retry_writepage: BUG_ON(ni->name_len); // TODO: Implement and replace this with // return ntfs_write_compressed_block(page); - unlock_page(page); + folio_unlock(folio); ntfs_error(vi->i_sb, "Writing to compressed files is " "not supported yet. Sorry."); return -EOPNOTSUPP; } // TODO: Implement and remove this check. if (NInoNonResident(ni) && NInoSparse(ni)) { - unlock_page(page); + folio_unlock(folio); ntfs_error(vi->i_sb, "Writing to sparse files is not " "supported yet. Sorry."); return -EOPNOTSUPP; @@ -1400,34 +1383,34 @@ retry_writepage: /* NInoNonResident() == NInoIndexAllocPresent() */ if (NInoNonResident(ni)) { /* We have to zero every time due to mmap-at-end-of-file. */ - if (page->index >= (i_size >> PAGE_SHIFT)) { - /* The page straddles i_size. */ - unsigned int ofs = i_size & ~PAGE_MASK; - zero_user_segment(page, ofs, PAGE_SIZE); + if (folio->index >= (i_size >> PAGE_SHIFT)) { + /* The folio straddles i_size. */ + unsigned int ofs = i_size & (folio_size(folio) - 1); + folio_zero_segment(folio, ofs, folio_size(folio)); } /* Handle mst protected attributes. */ if (NInoMstProtected(ni)) return ntfs_write_mst_block(page, wbc); /* Normal, non-resident data stream. */ - return ntfs_write_block(page, wbc); + return ntfs_write_block(folio, wbc); } /* * Attribute is resident, implying it is not compressed, encrypted, or * mst protected. This also means the attribute is smaller than an mft - * record and hence smaller than a page, so can simply return error on - * any pages with index above 0. Note the attribute can actually be + * record and hence smaller than a folio, so can simply return error on + * any folios with index above 0. Note the attribute can actually be * marked compressed but if it is resident the actual data is not * compressed so we are ok to ignore the compressed flag here. */ - BUG_ON(page_has_buffers(page)); - BUG_ON(!PageUptodate(page)); - if (unlikely(page->index > 0)) { - ntfs_error(vi->i_sb, "BUG()! page->index (0x%lx) > 0. " - "Aborting write.", page->index); - BUG_ON(PageWriteback(page)); - set_page_writeback(page); - unlock_page(page); - end_page_writeback(page); + BUG_ON(folio_buffers(folio)); + BUG_ON(!folio_test_uptodate(folio)); + if (unlikely(folio->index > 0)) { + ntfs_error(vi->i_sb, "BUG()! folio->index (0x%lx) > 0. " + "Aborting write.", folio->index); + BUG_ON(folio_test_writeback(folio)); + folio_start_writeback(folio); + folio_unlock(folio); + folio_end_writeback(folio); return -EIO; } if (!NInoAttr(ni)) @@ -1460,12 +1443,12 @@ retry_writepage: if (unlikely(err)) goto err_out; /* - * Keep the VM happy. This must be done otherwise the radix-tree tag - * PAGECACHE_TAG_DIRTY remains set even though the page is clean. + * Keep the VM happy. This must be done otherwise + * PAGECACHE_TAG_DIRTY remains set even though the folio is clean. */ - BUG_ON(PageWriteback(page)); - set_page_writeback(page); - unlock_page(page); + BUG_ON(folio_test_writeback(folio)); + folio_start_writeback(folio); + folio_unlock(folio); attr_len = le32_to_cpu(ctx->attr->data.resident.value_length); i_size = i_size_read(vi); if (unlikely(attr_len > i_size)) { @@ -1480,18 +1463,18 @@ retry_writepage: /* Shrinking cannot fail. */ BUG_ON(err); } - addr = kmap_atomic(page); - /* Copy the data from the page to the mft record. */ + addr = kmap_local_folio(folio, 0); + /* Copy the data from the folio to the mft record. */ memcpy((u8*)ctx->attr + le16_to_cpu(ctx->attr->data.resident.value_offset), addr, attr_len); - /* Zero out of bounds area in the page cache page. */ - memset(addr + attr_len, 0, PAGE_SIZE - attr_len); - kunmap_atomic(addr); - flush_dcache_page(page); + /* Zero out of bounds area in the page cache folio. */ + memset(addr + attr_len, 0, folio_size(folio) - attr_len); + kunmap_local(addr); + flush_dcache_folio(folio); flush_dcache_mft_record_page(ctx->ntfs_ino); - /* We are done with the page. */ - end_page_writeback(page); + /* We are done with the folio. */ + folio_end_writeback(folio); /* Finally, mark the mft record dirty, so it gets written back. */ mark_mft_record_dirty(ctx->ntfs_ino); ntfs_attr_put_search_ctx(ctx); @@ -1502,18 +1485,18 @@ err_out: ntfs_warning(vi->i_sb, "Error allocating memory. Redirtying " "page so we try again later."); /* - * Put the page back on mapping->dirty_pages, but leave its + * Put the folio back on mapping->dirty_pages, but leave its * buffers' dirty state as-is. */ - redirty_page_for_writepage(wbc, page); + folio_redirty_for_writepage(wbc, folio); err = 0; } else { ntfs_error(vi->i_sb, "Resident attribute write failed with " "error %i.", err); - SetPageError(page); + folio_set_error(folio); NVolSetErrors(ni->vol); } - unlock_page(page); + folio_unlock(folio); if (ctx) ntfs_attr_put_search_ctx(ctx); if (m) diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c index cbc545999cfe..297c0b9db621 100644 --- a/fs/ntfs/file.c +++ b/fs/ntfs/file.c @@ -567,7 +567,7 @@ static int ntfs_prepare_pages_for_non_resident_write(struct page **pages, LCN lcn; s64 bh_pos, vcn_len, end, initialized_size; sector_t lcn_block; - struct page *page; + struct folio *folio; struct inode *vi; ntfs_inode *ni, *base_ni = NULL; ntfs_volume *vol; @@ -601,20 +601,6 @@ static int ntfs_prepare_pages_for_non_resident_write(struct page **pages, (long long)pos, bytes); blocksize = vol->sb->s_blocksize; blocksize_bits = vol->sb->s_blocksize_bits; - u = 0; - do { - page = pages[u]; - BUG_ON(!page); - /* - * create_empty_buffers() will create uptodate/dirty buffers if - * the page is uptodate/dirty. - */ - if (!page_has_buffers(page)) { - create_empty_buffers(page, blocksize, 0); - if (unlikely(!page_has_buffers(page))) - return -ENOMEM; - } - } while (++u < nr_pages); rl_write_locked = false; rl = NULL; err = 0; @@ -626,14 +612,21 @@ static int ntfs_prepare_pages_for_non_resident_write(struct page **pages, end = pos + bytes; cend = (end + vol->cluster_size - 1) >> vol->cluster_size_bits; /* - * Loop over each page and for each page over each buffer. Use goto to + * Loop over each buffer in each folio. Use goto to * reduce indentation. */ u = 0; -do_next_page: - page = pages[u]; - bh_pos = (s64)page->index << PAGE_SHIFT; - bh = head = page_buffers(page); +do_next_folio: + folio = page_folio(pages[u]); + bh_pos = folio_pos(folio); + head = folio_buffers(folio); + if (!head) + /* + * create_empty_buffers() will create uptodate/dirty + * buffers if the folio is uptodate/dirty. + */ + head = create_empty_buffers(folio, blocksize, 0); + bh = head; do { VCN cdelta; s64 bh_end; @@ -653,15 +646,15 @@ do_next_page: if (buffer_uptodate(bh)) continue; /* - * The buffer is not uptodate. If the page is uptodate + * The buffer is not uptodate. If the folio is uptodate * set the buffer uptodate and otherwise ignore it. */ - if (PageUptodate(page)) { + if (folio_test_uptodate(folio)) { set_buffer_uptodate(bh); continue; } /* - * Neither the page nor the buffer are uptodate. If + * Neither the folio nor the buffer are uptodate. If * the buffer is only partially being written to, we * need to read it in before the write, i.e. now. */ @@ -679,7 +672,7 @@ do_next_page: ntfs_submit_bh_for_read(bh); *wait_bh++ = bh; } else { - zero_user(page, bh_offset(bh), + folio_zero_range(folio, bh_offset(bh), blocksize); set_buffer_uptodate(bh); } @@ -706,7 +699,7 @@ map_buffer_cached: (bh_cofs >> blocksize_bits); set_buffer_mapped(bh); /* - * If the page is uptodate so is the buffer. If the + * If the folio is uptodate so is the buffer. If the * buffer is fully outside the write, we ignore it if * it was already allocated and we mark it dirty so it * gets written out if we allocated it. On the other @@ -714,7 +707,7 @@ map_buffer_cached: * marking it dirty we set buffer_new so we can do * error recovery. */ - if (PageUptodate(page)) { + if (folio_test_uptodate(folio)) { if (!buffer_uptodate(bh)) set_buffer_uptodate(bh); if (unlikely(was_hole)) { @@ -754,7 +747,8 @@ map_buffer_cached: ntfs_submit_bh_for_read(bh); *wait_bh++ = bh; } else { - zero_user(page, bh_offset(bh), + folio_zero_range(folio, + bh_offset(bh), blocksize); set_buffer_uptodate(bh); } @@ -773,7 +767,7 @@ map_buffer_cached: */ if (bh_end <= pos || bh_pos >= end) { if (!buffer_uptodate(bh)) { - zero_user(page, bh_offset(bh), + folio_zero_range(folio, bh_offset(bh), blocksize); set_buffer_uptodate(bh); } @@ -786,7 +780,7 @@ map_buffer_cached: u8 *kaddr; unsigned pofs; - kaddr = kmap_atomic(page); + kaddr = kmap_local_folio(folio, 0); if (bh_pos < pos) { pofs = bh_pos & ~PAGE_MASK; memset(kaddr + pofs, 0, pos - bh_pos); @@ -795,8 +789,8 @@ map_buffer_cached: pofs = end & ~PAGE_MASK; memset(kaddr + pofs, 0, bh_end - end); } - kunmap_atomic(kaddr); - flush_dcache_page(page); + kunmap_local(kaddr); + flush_dcache_folio(folio); } continue; } @@ -809,11 +803,12 @@ map_buffer_cached: initialized_size = ni->allocated_size; read_unlock_irqrestore(&ni->size_lock, flags); if (bh_pos > initialized_size) { - if (PageUptodate(page)) { + if (folio_test_uptodate(folio)) { if (!buffer_uptodate(bh)) set_buffer_uptodate(bh); } else if (!buffer_uptodate(bh)) { - zero_user(page, bh_offset(bh), blocksize); + folio_zero_range(folio, bh_offset(bh), + blocksize); set_buffer_uptodate(bh); } continue; @@ -927,17 +922,17 @@ rl_not_mapped_enoent: bh->b_blocknr = -1; /* * If the buffer is uptodate we skip it. If it - * is not but the page is uptodate, we can set - * the buffer uptodate. If the page is not + * is not but the folio is uptodate, we can set + * the buffer uptodate. If the folio is not * uptodate, we can clear the buffer and set it * uptodate. Whether this is worthwhile is * debatable and this could be removed. */ - if (PageUptodate(page)) { + if (folio_test_uptodate(folio)) { if (!buffer_uptodate(bh)) set_buffer_uptodate(bh); } else if (!buffer_uptodate(bh)) { - zero_user(page, bh_offset(bh), + folio_zero_range(folio, bh_offset(bh), blocksize); set_buffer_uptodate(bh); } @@ -1167,7 +1162,7 @@ rl_not_mapped_enoent: } while (bh_pos += blocksize, (bh = bh->b_this_page) != head); /* If there are no errors, do the next page. */ if (likely(!err && ++u < nr_pages)) - goto do_next_page; + goto do_next_folio; /* If there are no errors, release the runlist lock if we took it. */ if (likely(!err)) { if (unlikely(rl_write_locked)) { @@ -1185,9 +1180,8 @@ rl_not_mapped_enoent: bh = *--wait_bh; wait_on_buffer(bh); if (likely(buffer_uptodate(bh))) { - page = bh->b_page; - bh_pos = ((s64)page->index << PAGE_SHIFT) + - bh_offset(bh); + folio = bh->b_folio; + bh_pos = folio_pos(folio) + bh_offset(bh); /* * If the buffer overflows the initialized size, need * to zero the overflowing region. @@ -1197,7 +1191,7 @@ rl_not_mapped_enoent: if (likely(bh_pos < initialized_size)) ofs = initialized_size - bh_pos; - zero_user_segment(page, bh_offset(bh) + ofs, + folio_zero_segment(folio, bh_offset(bh) + ofs, blocksize); } } else /* if (unlikely(!buffer_uptodate(bh))) */ @@ -1324,21 +1318,20 @@ rl_not_mapped_enoent: u = 0; end = bh_cpos << vol->cluster_size_bits; do { - page = pages[u]; - bh = head = page_buffers(page); + folio = page_folio(pages[u]); + bh = head = folio_buffers(folio); do { if (u == nr_pages && - ((s64)page->index << PAGE_SHIFT) + - bh_offset(bh) >= end) + folio_pos(folio) + bh_offset(bh) >= end) break; if (!buffer_new(bh)) continue; clear_buffer_new(bh); if (!buffer_uptodate(bh)) { - if (PageUptodate(page)) + if (folio_test_uptodate(folio)) set_buffer_uptodate(bh); else { - zero_user(page, bh_offset(bh), + folio_zero_range(folio, bh_offset(bh), blocksize); set_buffer_uptodate(bh); } diff --git a/fs/ntfs/inode.c b/fs/ntfs/inode.c index 6c3f38d66579..aba1e22db4e9 100644 --- a/fs/ntfs/inode.c +++ b/fs/ntfs/inode.c @@ -648,18 +648,18 @@ static int ntfs_read_locked_inode(struct inode *vi) * mtime is the last change of the data within the file. Not changed * when only metadata is changed, e.g. a rename doesn't affect mtime. */ - vi->i_mtime = ntfs2utc(si->last_data_change_time); + inode_set_mtime_to_ts(vi, ntfs2utc(si->last_data_change_time)); /* * ctime is the last change of the metadata of the file. This obviously * always changes, when mtime is changed. ctime can be changed on its * own, mtime is then not changed, e.g. when a file is renamed. */ - vi->i_ctime = ntfs2utc(si->last_mft_change_time); + inode_set_ctime_to_ts(vi, ntfs2utc(si->last_mft_change_time)); /* * Last access to the data within the file. Not changed during a rename * for example but changed whenever the file is written to. */ - vi->i_atime = ntfs2utc(si->last_access_time); + inode_set_atime_to_ts(vi, ntfs2utc(si->last_access_time)); /* Find the attribute list attribute if present. */ ntfs_attr_reinit_search_ctx(ctx); @@ -1217,9 +1217,9 @@ static int ntfs_read_locked_attr_inode(struct inode *base_vi, struct inode *vi) vi->i_uid = base_vi->i_uid; vi->i_gid = base_vi->i_gid; set_nlink(vi, base_vi->i_nlink); - vi->i_mtime = base_vi->i_mtime; - vi->i_ctime = base_vi->i_ctime; - vi->i_atime = base_vi->i_atime; + inode_set_mtime_to_ts(vi, inode_get_mtime(base_vi)); + inode_set_ctime_to_ts(vi, inode_get_ctime(base_vi)); + inode_set_atime_to_ts(vi, inode_get_atime(base_vi)); vi->i_generation = ni->seq_no = base_ni->seq_no; /* Set inode type to zero but preserve permissions. */ @@ -1483,9 +1483,9 @@ static int ntfs_read_locked_index_inode(struct inode *base_vi, struct inode *vi) vi->i_uid = base_vi->i_uid; vi->i_gid = base_vi->i_gid; set_nlink(vi, base_vi->i_nlink); - vi->i_mtime = base_vi->i_mtime; - vi->i_ctime = base_vi->i_ctime; - vi->i_atime = base_vi->i_atime; + inode_set_mtime_to_ts(vi, inode_get_mtime(base_vi)); + inode_set_ctime_to_ts(vi, inode_get_ctime(base_vi)); + inode_set_atime_to_ts(vi, inode_get_atime(base_vi)); vi->i_generation = ni->seq_no = base_ni->seq_no; /* Set inode type to zero but preserve permissions. */ vi->i_mode = base_vi->i_mode & ~S_IFMT; @@ -2804,13 +2804,15 @@ done: */ if (!IS_NOCMTIME(VFS_I(base_ni)) && !IS_RDONLY(VFS_I(base_ni))) { struct timespec64 now = current_time(VFS_I(base_ni)); + struct timespec64 ctime = inode_get_ctime(VFS_I(base_ni)); + struct timespec64 mtime = inode_get_mtime(VFS_I(base_ni)); int sync_it = 0; - if (!timespec64_equal(&VFS_I(base_ni)->i_mtime, &now) || - !timespec64_equal(&VFS_I(base_ni)->i_ctime, &now)) + if (!timespec64_equal(&mtime, &now) || + !timespec64_equal(&ctime, &now)) sync_it = 1; - VFS_I(base_ni)->i_mtime = now; - VFS_I(base_ni)->i_ctime = now; + inode_set_ctime_to_ts(VFS_I(base_ni), now); + inode_set_mtime_to_ts(VFS_I(base_ni), now); if (sync_it) mark_inode_dirty_sync(VFS_I(base_ni)); @@ -2924,11 +2926,11 @@ int ntfs_setattr(struct mnt_idmap *idmap, struct dentry *dentry, } } if (ia_valid & ATTR_ATIME) - vi->i_atime = attr->ia_atime; + inode_set_atime_to_ts(vi, attr->ia_atime); if (ia_valid & ATTR_MTIME) - vi->i_mtime = attr->ia_mtime; + inode_set_mtime_to_ts(vi, attr->ia_mtime); if (ia_valid & ATTR_CTIME) - vi->i_ctime = attr->ia_ctime; + inode_set_ctime_to_ts(vi, attr->ia_ctime); mark_inode_dirty(vi); out: return err; @@ -2995,7 +2997,7 @@ int __ntfs_write_inode(struct inode *vi, int sync) si = (STANDARD_INFORMATION*)((u8*)ctx->attr + le16_to_cpu(ctx->attr->data.resident.value_offset)); /* Update the access times if they have changed. */ - nt = utc2ntfs(vi->i_mtime); + nt = utc2ntfs(inode_get_mtime(vi)); if (si->last_data_change_time != nt) { ntfs_debug("Updating mtime for inode 0x%lx: old = 0x%llx, " "new = 0x%llx", vi->i_ino, (long long) @@ -3004,7 +3006,7 @@ int __ntfs_write_inode(struct inode *vi, int sync) si->last_data_change_time = nt; modified = true; } - nt = utc2ntfs(vi->i_ctime); + nt = utc2ntfs(inode_get_ctime(vi)); if (si->last_mft_change_time != nt) { ntfs_debug("Updating ctime for inode 0x%lx: old = 0x%llx, " "new = 0x%llx", vi->i_ino, (long long) @@ -3013,7 +3015,7 @@ int __ntfs_write_inode(struct inode *vi, int sync) si->last_mft_change_time = nt; modified = true; } - nt = utc2ntfs(vi->i_atime); + nt = utc2ntfs(inode_get_atime(vi)); if (si->last_access_time != nt) { ntfs_debug("Updating atime for inode 0x%lx: old = 0x%llx, " "new = 0x%llx", vi->i_ino, diff --git a/fs/ntfs/mft.c b/fs/ntfs/mft.c index 0155f106ec34..6fd1dc4b08c8 100644 --- a/fs/ntfs/mft.c +++ b/fs/ntfs/mft.c @@ -2682,8 +2682,7 @@ mft_rec_already_initialized: vi->i_mode &= ~S_IWUGO; /* Set the inode times to the current time. */ - vi->i_atime = vi->i_mtime = vi->i_ctime = - current_time(vi); + simple_inode_init_ts(vi); /* * Set the file size to 0, the ntfs inode sizes are set to 0 by * the call to ntfs_init_big_inode() below. diff --git a/fs/ntfs/namei.c b/fs/ntfs/namei.c index ab44f2db533b..d7498ddc4a72 100644 --- a/fs/ntfs/namei.c +++ b/fs/ntfs/namei.c @@ -384,6 +384,7 @@ static struct dentry *ntfs_fh_to_parent(struct super_block *sb, struct fid *fid, * and due to using iget() whereas NTFS needs ntfs_iget(). */ const struct export_operations ntfs_export_ops = { + .encode_fh = generic_encode_ino32_fh, .get_parent = ntfs_get_parent, /* Find the parent of a given directory. */ .fh_to_dentry = ntfs_fh_to_dentry, diff --git a/fs/ntfs3/Kconfig b/fs/ntfs3/Kconfig index 96cc236f7f7b..cdfdf51e55d7 100644 --- a/fs/ntfs3/Kconfig +++ b/fs/ntfs3/Kconfig @@ -1,6 +1,7 @@ # SPDX-License-Identifier: GPL-2.0-only config NTFS3_FS tristate "NTFS Read-Write file system support" + select BUFFER_HEAD select NLS select LEGACY_DIRECT_IO help diff --git a/fs/ntfs3/attrib.c b/fs/ntfs3/attrib.c index a9d82bbb4729..63f70259edc0 100644 --- a/fs/ntfs3/attrib.c +++ b/fs/ntfs3/attrib.c @@ -1106,10 +1106,10 @@ repack: } } - /* + /* * The code below may require additional cluster (to extend attribute list) - * and / or one MFT record - * It is too complex to undo operations if -ENOSPC occurs deep inside + * and / or one MFT record + * It is too complex to undo operations if -ENOSPC occurs deep inside * in 'ni_insert_nonresident'. * Return in advance -ENOSPC here if there are no free cluster and no free MFT. */ @@ -1736,10 +1736,8 @@ repack: le_b = NULL; attr_b = ni_find_attr(ni, NULL, &le_b, ATTR_DATA, NULL, 0, NULL, &mi_b); - if (!attr_b) { - err = -ENOENT; - goto out; - } + if (!attr_b) + return -ENOENT; attr = attr_b; le = le_b; diff --git a/fs/ntfs3/attrlist.c b/fs/ntfs3/attrlist.c index 42631b31adf1..7c01735d1219 100644 --- a/fs/ntfs3/attrlist.c +++ b/fs/ntfs3/attrlist.c @@ -52,7 +52,8 @@ int ntfs_load_attr_list(struct ntfs_inode *ni, struct ATTRIB *attr) if (!attr->non_res) { lsize = le32_to_cpu(attr->res.data_size); - le = kmalloc(al_aligned(lsize), GFP_NOFS | __GFP_NOWARN); + /* attr is resident: lsize < record_size (1K or 4K) */ + le = kvmalloc(al_aligned(lsize), GFP_KERNEL); if (!le) { err = -ENOMEM; goto out; @@ -80,7 +81,17 @@ int ntfs_load_attr_list(struct ntfs_inode *ni, struct ATTRIB *attr) if (err < 0) goto out; - le = kmalloc(al_aligned(lsize), GFP_NOFS | __GFP_NOWARN); + /* attr is nonresident. + * The worst case: + * 1T (2^40) extremely fragmented file. + * cluster = 4K (2^12) => 2^28 fragments + * 2^9 fragments per one record => 2^19 records + * 2^5 bytes of ATTR_LIST_ENTRY per one record => 2^24 bytes. + * + * the result is 16M bytes per attribute list. + * Use kvmalloc to allocate in range [several Kbytes - dozen Mbytes] + */ + le = kvmalloc(al_aligned(lsize), GFP_KERNEL); if (!le) { err = -ENOMEM; goto out; diff --git a/fs/ntfs3/bitmap.c b/fs/ntfs3/bitmap.c index 107e808e06ea..63f14a0232f6 100644 --- a/fs/ntfs3/bitmap.c +++ b/fs/ntfs3/bitmap.c @@ -125,6 +125,7 @@ void wnd_close(struct wnd_bitmap *wnd) struct rb_node *node, *next; kfree(wnd->free_bits); + wnd->free_bits = NULL; run_close(&wnd->run); node = rb_first(&wnd->start_tree); @@ -659,7 +660,8 @@ int wnd_init(struct wnd_bitmap *wnd, struct super_block *sb, size_t nbits) wnd->bits_last = wbits; wnd->free_bits = - kcalloc(wnd->nwnd, sizeof(u16), GFP_NOFS | __GFP_NOWARN); + kvmalloc_array(wnd->nwnd, sizeof(u16), GFP_KERNEL | __GFP_ZERO); + if (!wnd->free_bits) return -ENOMEM; diff --git a/fs/ntfs3/dir.c b/fs/ntfs3/dir.c index 063a6654199b..ec0566b322d5 100644 --- a/fs/ntfs3/dir.c +++ b/fs/ntfs3/dir.c @@ -309,7 +309,11 @@ static inline int ntfs_filldir(struct ntfs_sb_info *sbi, struct ntfs_inode *ni, return 0; } - dt_type = (fname->dup.fa & FILE_ATTRIBUTE_DIRECTORY) ? DT_DIR : DT_REG; + /* NTFS: symlinks are "dir + reparse" or "file + reparse" */ + if (fname->dup.fa & FILE_ATTRIBUTE_REPARSE_POINT) + dt_type = DT_LNK; + else + dt_type = (fname->dup.fa & FILE_ATTRIBUTE_DIRECTORY) ? DT_DIR : DT_REG; return !dir_emit(ctx, (s8 *)name, name_len, ino, dt_type); } diff --git a/fs/ntfs3/file.c b/fs/ntfs3/file.c index 1d6c824246c4..a5a30a24ce5d 100644 --- a/fs/ntfs3/file.c +++ b/fs/ntfs3/file.c @@ -85,7 +85,7 @@ int ntfs_getattr(struct mnt_idmap *idmap, const struct path *path, stat->attributes_mask |= STATX_ATTR_COMPRESSED | STATX_ATTR_ENCRYPTED; - generic_fillattr(idmap, inode, stat); + generic_fillattr(idmap, request_mask, inode, stat); stat->result_mask |= STATX_BTIME; stat->btime = ni->i_crtime; @@ -187,7 +187,7 @@ static int ntfs_zero_range(struct inode *inode, u64 vbo, u64 vbo_to) struct buffer_head *head, *bh; u32 bh_next, bh_off, to; sector_t iblock; - struct page *page; + struct folio *folio; for (; idx < idx_end; idx += 1, from = 0) { page_off = (loff_t)idx << PAGE_SHIFT; @@ -195,16 +195,17 @@ static int ntfs_zero_range(struct inode *inode, u64 vbo, u64 vbo_to) PAGE_SIZE; iblock = page_off >> inode->i_blkbits; - page = find_or_create_page(mapping, idx, - mapping_gfp_constraint(mapping, - ~__GFP_FS)); - if (!page) - return -ENOMEM; + folio = __filemap_get_folio(mapping, idx, + FGP_LOCK | FGP_ACCESSED | FGP_CREAT, + mapping_gfp_constraint(mapping, ~__GFP_FS)); + if (IS_ERR(folio)) + return PTR_ERR(folio); - if (!page_has_buffers(page)) - create_empty_buffers(page, blocksize, 0); + head = folio_buffers(folio); + if (!head) + head = create_empty_buffers(folio, blocksize, 0); - bh = head = page_buffers(page); + bh = head; bh_off = 0; do { bh_next = bh_off + blocksize; @@ -220,14 +221,14 @@ static int ntfs_zero_range(struct inode *inode, u64 vbo, u64 vbo_to) } /* Ok, it's mapped. Make sure it's up-to-date. */ - if (PageUptodate(page)) + if (folio_test_uptodate(folio)) set_buffer_uptodate(bh); if (!buffer_uptodate(bh)) { err = bh_read(bh, 0); if (err < 0) { - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); goto out; } } @@ -237,10 +238,10 @@ static int ntfs_zero_range(struct inode *inode, u64 vbo, u64 vbo_to) } while (bh_off = bh_next, iblock += 1, head != (bh = bh->b_this_page)); - zero_user_segment(page, from, to); + folio_zero_segment(folio, from, to); - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); cond_resched(); } out: @@ -342,7 +343,7 @@ static int ntfs_extend(struct inode *inode, loff_t pos, size_t count, err = 0; } - inode->i_ctime = inode->i_mtime = current_time(inode); + inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); mark_inode_dirty(inode); if (IS_SYNC(inode)) { @@ -400,7 +401,7 @@ static int ntfs_truncate(struct inode *inode, loff_t new_size) ni_unlock(ni); ni->std_fa |= FILE_ATTRIBUTE_ARCHIVE; - inode->i_ctime = inode->i_mtime = current_time(inode); + inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); if (!IS_DIRSYNC(inode)) { dirty = 1; } else { @@ -642,7 +643,7 @@ out: filemap_invalidate_unlock(mapping); if (!err) { - inode->i_ctime = inode->i_mtime = current_time(inode); + inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); mark_inode_dirty(inode); } @@ -745,8 +746,8 @@ static ssize_t ntfs_file_read_iter(struct kiocb *iocb, struct iov_iter *iter) } static ssize_t ntfs_file_splice_read(struct file *in, loff_t *ppos, - struct pipe_inode_info *pipe, - size_t len, unsigned int flags) + struct pipe_inode_info *pipe, size_t len, + unsigned int flags) { struct inode *inode = in->f_mapping->host; struct ntfs_inode *ni = ntfs_i(inode); diff --git a/fs/ntfs3/frecord.c b/fs/ntfs3/frecord.c index 16bd9faa2d28..3df2d9e34b91 100644 --- a/fs/ntfs3/frecord.c +++ b/fs/ntfs3/frecord.c @@ -2148,7 +2148,7 @@ out1: for (i = 0; i < pages_per_frame; i++) { pg = pages[i]; - if (i == idx) + if (i == idx || !pg) continue; unlock_page(pg); put_page(pg); @@ -3208,6 +3208,12 @@ static bool ni_update_parent(struct ntfs_inode *ni, struct NTFS_DUP_INFO *dup, if (!fname || !memcmp(&fname->dup, dup, sizeof(fname->dup))) continue; + /* Check simple case when parent inode equals current inode. */ + if (ino_get(&fname->home) == ni->vfs_inode.i_ino) { + ntfs_set_state(sbi, NTFS_DIRTY_ERROR); + continue; + } + /* ntfs_iget5 may sleep. */ dir = ntfs_iget5(sb, &fname->home, NULL); if (IS_ERR(dir)) { @@ -3265,6 +3271,7 @@ int ni_write_inode(struct inode *inode, int sync, const char *hint) if (is_rec_inuse(ni->mi.mrec) && !(sbi->flags & NTFS_FLAGS_LOG_REPLAYING) && inode->i_nlink) { bool modified = false; + struct timespec64 ts; /* Update times in standard attribute. */ std = ni_std(ni); @@ -3274,19 +3281,22 @@ int ni_write_inode(struct inode *inode, int sync, const char *hint) } /* Update the access times if they have changed. */ - dup.m_time = kernel2nt(&inode->i_mtime); + ts = inode_get_mtime(inode); + dup.m_time = kernel2nt(&ts); if (std->m_time != dup.m_time) { std->m_time = dup.m_time; modified = true; } - dup.c_time = kernel2nt(&inode->i_ctime); + ts = inode_get_mtime(inode); + dup.c_time = kernel2nt(&ts); if (std->c_time != dup.c_time) { std->c_time = dup.c_time; modified = true; } - dup.a_time = kernel2nt(&inode->i_atime); + ts = inode_get_atime(inode); + dup.a_time = kernel2nt(&ts); if (std->a_time != dup.a_time) { std->a_time = dup.a_time; modified = true; diff --git a/fs/ntfs3/fslog.c b/fs/ntfs3/fslog.c index 12f28cdf5c83..98ccb6650858 100644 --- a/fs/ntfs3/fslog.c +++ b/fs/ntfs3/fslog.c @@ -2168,8 +2168,10 @@ file_is_valid: if (!page) { page = kmalloc(log->page_size, GFP_NOFS); - if (!page) - return -ENOMEM; + if (!page) { + err = -ENOMEM; + goto out; + } } /* diff --git a/fs/ntfs3/fsntfs.c b/fs/ntfs3/fsntfs.c index 33afee0f5559..fbfe21dbb425 100644 --- a/fs/ntfs3/fsntfs.c +++ b/fs/ntfs3/fsntfs.c @@ -983,18 +983,11 @@ out: if (err) return err; - mark_inode_dirty(&ni->vfs_inode); + mark_inode_dirty_sync(&ni->vfs_inode); /* verify(!ntfs_update_mftmirr()); */ - /* - * If we used wait=1, sync_inode_metadata waits for the io for the - * inode to finish. It hangs when media is removed. - * So wait=0 is sent down to sync_inode_metadata - * and filemap_fdatawrite is used for the data blocks. - */ - err = sync_inode_metadata(&ni->vfs_inode, 0); - if (!err) - err = filemap_fdatawrite(ni->vfs_inode.i_mapping); + /* write mft record on disk. */ + err = _ni_write_inode(&ni->vfs_inode, 1); return err; } @@ -2461,10 +2454,12 @@ void mark_as_free_ex(struct ntfs_sb_info *sbi, CLST lcn, CLST len, bool trim) { CLST end, i, zone_len, zlen; struct wnd_bitmap *wnd = &sbi->used.bitmap; + bool dirty = false; down_write_nested(&wnd->rw_lock, BITMAP_MUTEX_CLUSTERS); if (!wnd_is_used(wnd, lcn, len)) { - ntfs_set_state(sbi, NTFS_DIRTY_ERROR); + /* mark volume as dirty out of wnd->rw_lock */ + dirty = true; end = lcn + len; len = 0; @@ -2518,6 +2513,8 @@ void mark_as_free_ex(struct ntfs_sb_info *sbi, CLST lcn, CLST len, bool trim) out: up_write(&wnd->rw_lock); + if (dirty) + ntfs_set_state(sbi, NTFS_DIRTY_ERROR); } /* diff --git a/fs/ntfs3/index.c b/fs/ntfs3/index.c index 124c6e822623..cf92b2433f7a 100644 --- a/fs/ntfs3/index.c +++ b/fs/ntfs3/index.c @@ -729,6 +729,9 @@ static struct NTFS_DE *hdr_find_e(const struct ntfs_index *indx, u32 total = le32_to_cpu(hdr->total); u16 offs[128]; + if (unlikely(!cmp)) + return NULL; + fill_table: if (end > total) return NULL; diff --git a/fs/ntfs3/inode.c b/fs/ntfs3/inode.c index dc7e7ab701c6..5e3d71374918 100644 --- a/fs/ntfs3/inode.c +++ b/fs/ntfs3/inode.c @@ -44,6 +44,7 @@ static struct inode *ntfs_read_mft(struct inode *inode, u64 t64; struct MFT_REC *rec; struct runs_tree *run; + struct timespec64 ts; inode->i_op = NULL; /* Setup 'uid' and 'gid' */ @@ -168,9 +169,12 @@ next_attr: #ifdef STATX_BTIME nt2kernel(std5->cr_time, &ni->i_crtime); #endif - nt2kernel(std5->a_time, &inode->i_atime); - nt2kernel(std5->c_time, &inode->i_ctime); - nt2kernel(std5->m_time, &inode->i_mtime); + nt2kernel(std5->a_time, &ts); + inode_set_atime_to_ts(inode, ts); + nt2kernel(std5->c_time, &ts); + inode_set_ctime_to_ts(inode, ts); + nt2kernel(std5->m_time, &ts); + inode_set_mtime_to_ts(inode, ts); ni->std_fa = std5->fa; @@ -554,7 +558,7 @@ static noinline int ntfs_get_block_vbo(struct inode *inode, u64 vbo, struct super_block *sb = inode->i_sb; struct ntfs_sb_info *sbi = sb->s_fs_info; struct ntfs_inode *ni = ntfs_i(inode); - struct page *page = bh->b_page; + struct folio *folio = bh->b_folio; u8 cluster_bits = sbi->cluster_bits; u32 block_size = sb->s_blocksize; u64 bytes, lbo, valid; @@ -569,7 +573,7 @@ static noinline int ntfs_get_block_vbo(struct inode *inode, u64 vbo, if (is_resident(ni)) { ni_lock(ni); - err = attr_data_read_resident(ni, page); + err = attr_data_read_resident(ni, &folio->page); ni_unlock(ni); if (!err) @@ -642,17 +646,17 @@ static noinline int ntfs_get_block_vbo(struct inode *inode, u64 vbo, */ bytes = block_size; - if (page) { + if (folio) { u32 voff = valid - vbo; bh->b_size = block_size; off = vbo & (PAGE_SIZE - 1); - set_bh_page(bh, page, off); + folio_set_bh(bh, folio, off); err = bh_read(bh, 0); if (err < 0) goto out; - zero_user_segment(page, off + voff, off + block_size); + folio_zero_segment(folio, off + voff, off + block_size); } } @@ -958,7 +962,8 @@ int ntfs_write_end(struct file *file, struct address_space *mapping, loff_t pos, if (err >= 0) { if (!(ni->std_fa & FILE_ATTRIBUTE_ARCHIVE)) { - inode->i_ctime = inode->i_mtime = current_time(inode); + inode_set_mtime_to_ts(inode, + inode_set_ctime_current(inode)); ni->std_fa |= FILE_ATTRIBUTE_ARCHIVE; dirty = true; } @@ -1658,8 +1663,11 @@ struct inode *ntfs_create_inode(struct mnt_idmap *idmap, struct inode *dir, d_instantiate(dentry, inode); /* Set original time. inode times (i_ctime) may be changed in ntfs_init_acl. */ - inode->i_atime = inode->i_mtime = inode->i_ctime = dir->i_mtime = - dir->i_ctime = ni->i_crtime; + inode_set_atime_to_ts(inode, ni->i_crtime); + inode_set_ctime_to_ts(inode, ni->i_crtime); + inode_set_mtime_to_ts(inode, ni->i_crtime); + inode_set_mtime_to_ts(dir, ni->i_crtime); + inode_set_ctime_to_ts(dir, ni->i_crtime); mark_inode_dirty(dir); mark_inode_dirty(inode); @@ -1765,9 +1773,9 @@ int ntfs_unlink_inode(struct inode *dir, const struct dentry *dentry) if (!err) { drop_nlink(inode); - dir->i_mtime = dir->i_ctime = current_time(dir); + inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir)); mark_inode_dirty(dir); - inode->i_ctime = dir->i_ctime; + inode_set_ctime_to_ts(inode, inode_get_ctime(dir)); if (inode->i_nlink) mark_inode_dirty(inode); } else if (!ni_remove_name_undo(dir_ni, ni, de, de2, undo_remove)) { diff --git a/fs/ntfs3/namei.c b/fs/ntfs3/namei.c index 70f8c859e0ad..ee3093be5170 100644 --- a/fs/ntfs3/namei.c +++ b/fs/ntfs3/namei.c @@ -156,8 +156,8 @@ static int ntfs_link(struct dentry *ode, struct inode *dir, struct dentry *de) err = ntfs_link_inode(inode, de); if (!err) { - dir->i_ctime = dir->i_mtime = inode->i_ctime = - current_time(dir); + inode_set_ctime_current(inode); + inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir)); mark_inode_dirty(inode); mark_inode_dirty(dir); d_instantiate(de, inode); @@ -324,14 +324,11 @@ static int ntfs_rename(struct mnt_idmap *idmap, struct inode *dir, /* Restore after failed rename failed too. */ _ntfs_bad_inode(inode); } else if (!err) { - inode->i_ctime = dir->i_ctime = dir->i_mtime = - current_time(dir); + simple_rename_timestamp(dir, dentry, new_dir, new_dentry); mark_inode_dirty(inode); mark_inode_dirty(dir); - if (dir != new_dir) { - new_dir->i_mtime = new_dir->i_ctime = dir->i_ctime; + if (dir != new_dir) mark_inode_dirty(new_dir); - } if (IS_DIRSYNC(dir)) ntfs_sync_inode(dir); @@ -376,7 +373,7 @@ static int ntfs_atomic_open(struct inode *dir, struct dentry *dentry, #ifdef CONFIG_NTFS3_FS_POSIX_ACL if (IS_POSIXACL(dir)) { - /* + /* * Load in cache current acl to avoid ni_lock(dir): * ntfs_create_inode -> ntfs_init_acl -> posix_acl_create -> * ntfs_get_acl -> ntfs_get_acl_ex -> ni_lock diff --git a/fs/ntfs3/ntfs.h b/fs/ntfs3/ntfs.h index 98b76d1b09e7..86aecbb01a92 100644 --- a/fs/ntfs3/ntfs.h +++ b/fs/ntfs3/ntfs.h @@ -847,7 +847,7 @@ struct OBJECT_ID { // Birth Volume Id is the Object Id of the Volume on. // which the Object Id was allocated. It never changes. struct GUID BirthVolumeId; //0x10: - + // Birth Object Id is the first Object Id that was // ever assigned to this MFT Record. I.e. If the Object Id // is changed for some reason, this field will reflect the diff --git a/fs/ntfs3/ntfs_fs.h b/fs/ntfs3/ntfs_fs.h index 629403ede6e5..f6706143d14b 100644 --- a/fs/ntfs3/ntfs_fs.h +++ b/fs/ntfs3/ntfs_fs.h @@ -42,9 +42,11 @@ enum utf16_endian; #define MINUS_ONE_T ((size_t)(-1)) /* Biggest MFT / smallest cluster */ #define MAXIMUM_BYTES_PER_MFT 4096 +#define MAXIMUM_SHIFT_BYTES_PER_MFT 12 #define NTFS_BLOCKS_PER_MFT_RECORD (MAXIMUM_BYTES_PER_MFT / 512) #define MAXIMUM_BYTES_PER_INDEX 4096 +#define MAXIMUM_SHIFT_BYTES_PER_INDEX 12 #define NTFS_BLOCKS_PER_INODE (MAXIMUM_BYTES_PER_INDEX / 512) /* NTFS specific error code when fixup failed. */ @@ -495,8 +497,6 @@ int ntfs_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, u32 flags); int ntfs3_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *attr); -void ntfs_sparse_cluster(struct inode *inode, struct page *page0, CLST vcn, - CLST len); int ntfs_file_open(struct inode *inode, struct file *file); int ntfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, __u64 start, __u64 len); @@ -872,7 +872,7 @@ int ntfs_init_acl(struct mnt_idmap *idmap, struct inode *inode, int ntfs_acl_chmod(struct mnt_idmap *idmap, struct dentry *dentry); ssize_t ntfs_listxattr(struct dentry *dentry, char *buffer, size_t size); -extern const struct xattr_handler *ntfs_xattr_handlers[]; +extern const struct xattr_handler * const ntfs_xattr_handlers[]; int ntfs_save_wsl_perm(struct inode *inode, __le16 *ea_size); void ntfs_get_wsl_perm(struct inode *inode); diff --git a/fs/ntfs3/record.c b/fs/ntfs3/record.c index c12ebffc94da..53629b1f65e9 100644 --- a/fs/ntfs3/record.c +++ b/fs/ntfs3/record.c @@ -189,12 +189,19 @@ out: return err; } +/* + * mi_enum_attr - start/continue attributes enumeration in record. + * + * NOTE: mi->mrec - memory of size sbi->record_size + * here we sure that mi->mrec->total == sbi->record_size (see mi_read) + */ struct ATTRIB *mi_enum_attr(struct mft_inode *mi, struct ATTRIB *attr) { const struct MFT_REC *rec = mi->mrec; u32 used = le32_to_cpu(rec->used); - u32 t32, off, asize; + u32 t32, off, asize, prev_type; u16 t16; + u64 data_size, alloc_size, tot_size; if (!attr) { u32 total = le32_to_cpu(rec->total); @@ -213,6 +220,7 @@ struct ATTRIB *mi_enum_attr(struct mft_inode *mi, struct ATTRIB *attr) if (!is_rec_inuse(rec)) return NULL; + prev_type = 0; attr = Add2Ptr(rec, off); } else { /* Check if input attr inside record. */ @@ -226,11 +234,11 @@ struct ATTRIB *mi_enum_attr(struct mft_inode *mi, struct ATTRIB *attr) return NULL; } - if (off + asize < off) { - /* Overflow check. */ + /* Overflow check. */ + if (off + asize < off) return NULL; - } + prev_type = le32_to_cpu(attr->type); attr = Add2Ptr(attr, asize); off += asize; } @@ -250,7 +258,11 @@ struct ATTRIB *mi_enum_attr(struct mft_inode *mi, struct ATTRIB *attr) /* 0x100 is last known attribute for now. */ t32 = le32_to_cpu(attr->type); - if ((t32 & 0xf) || (t32 > 0x100)) + if (!t32 || (t32 & 0xf) || (t32 > 0x100)) + return NULL; + + /* attributes in record must be ordered by type */ + if (t32 < prev_type) return NULL; /* Check overflow and boundary. */ @@ -259,16 +271,15 @@ struct ATTRIB *mi_enum_attr(struct mft_inode *mi, struct ATTRIB *attr) /* Check size of attribute. */ if (!attr->non_res) { + /* Check resident fields. */ if (asize < SIZEOF_RESIDENT) return NULL; t16 = le16_to_cpu(attr->res.data_off); - if (t16 > asize) return NULL; - t32 = le32_to_cpu(attr->res.data_size); - if (t16 + t32 > asize) + if (t16 + le32_to_cpu(attr->res.data_size) > asize) return NULL; t32 = sizeof(short) * attr->name_len; @@ -278,21 +289,52 @@ struct ATTRIB *mi_enum_attr(struct mft_inode *mi, struct ATTRIB *attr) return attr; } - /* Check some nonresident fields. */ - if (attr->name_len && - le16_to_cpu(attr->name_off) + sizeof(short) * attr->name_len > - le16_to_cpu(attr->nres.run_off)) { + /* Check nonresident fields. */ + if (attr->non_res != 1) + return NULL; + + t16 = le16_to_cpu(attr->nres.run_off); + if (t16 > asize) + return NULL; + + t32 = sizeof(short) * attr->name_len; + if (t32 && le16_to_cpu(attr->name_off) + t32 > t16) + return NULL; + + /* Check start/end vcn. */ + if (le64_to_cpu(attr->nres.svcn) > le64_to_cpu(attr->nres.evcn) + 1) return NULL; - } - if (attr->nres.svcn || !is_attr_ext(attr)) { + data_size = le64_to_cpu(attr->nres.data_size); + if (le64_to_cpu(attr->nres.valid_size) > data_size) + return NULL; + + alloc_size = le64_to_cpu(attr->nres.alloc_size); + if (data_size > alloc_size) + return NULL; + + t32 = mi->sbi->cluster_mask; + if (alloc_size & t32) + return NULL; + + if (!attr->nres.svcn && is_attr_ext(attr)) { + /* First segment of sparse/compressed attribute */ + if (asize + 8 < SIZEOF_NONRESIDENT_EX) + return NULL; + + tot_size = le64_to_cpu(attr->nres.total_size); + if (tot_size & t32) + return NULL; + + if (tot_size > alloc_size) + return NULL; + } else { if (asize + 8 < SIZEOF_NONRESIDENT) return NULL; if (attr->nres.c_unit) return NULL; - } else if (asize + 8 < SIZEOF_NONRESIDENT_EX) - return NULL; + } return attr; } diff --git a/fs/ntfs3/super.c b/fs/ntfs3/super.c index 1a02072b6b0e..9153dffde950 100644 --- a/fs/ntfs3/super.c +++ b/fs/ntfs3/super.c @@ -453,15 +453,23 @@ static struct proc_dir_entry *proc_info_root; * ntfs3.1 * cluster size * number of clusters + * total number of mft records + * number of used mft records ~= number of files + folders + * real state of ntfs "dirty"/"clean" + * current state of ntfs "dirty"/"clean" */ static int ntfs3_volinfo(struct seq_file *m, void *o) { struct super_block *sb = m->private; struct ntfs_sb_info *sbi = sb->s_fs_info; - seq_printf(m, "ntfs%d.%d\n%u\n%zu\n", sbi->volume.major_ver, - sbi->volume.minor_ver, sbi->cluster_size, - sbi->used.bitmap.nbits); + seq_printf(m, "ntfs%d.%d\n%u\n%zu\n\%zu\n%zu\n%s\n%s\n", + sbi->volume.major_ver, sbi->volume.minor_ver, + sbi->cluster_size, sbi->used.bitmap.nbits, + sbi->mft.bitmap.nbits, + sbi->mft.bitmap.nbits - wnd_zeroes(&sbi->mft.bitmap), + sbi->volume.real_dirty ? "dirty" : "clean", + (sbi->volume.flags & VOLUME_FLAG_DIRTY) ? "dirty" : "clean"); return 0; } @@ -488,9 +496,13 @@ static ssize_t ntfs3_label_write(struct file *file, const char __user *buffer, { int err; struct super_block *sb = pde_data(file_inode(file)); - struct ntfs_sb_info *sbi = sb->s_fs_info; ssize_t ret = count; - u8 *label = kmalloc(count, GFP_NOFS); + u8 *label; + + if (sb_rdonly(sb)) + return -EROFS; + + label = kmalloc(count, GFP_NOFS); if (!label) return -ENOMEM; @@ -502,7 +514,7 @@ static ssize_t ntfs3_label_write(struct file *file, const char __user *buffer, while (ret > 0 && label[ret - 1] == '\n') ret -= 1; - err = ntfs_set_label(sbi, label, ret); + err = ntfs_set_label(sb->s_fs_info, label, ret); if (err < 0) { ntfs_err(sb, "failed (%d) to write label", err); @@ -569,31 +581,37 @@ static void init_once(void *foo) } /* - * put_ntfs - Noinline to reduce binary size. + * Noinline to reduce binary size. */ -static noinline void put_ntfs(struct ntfs_sb_info *sbi) +static noinline void ntfs3_put_sbi(struct ntfs_sb_info *sbi) { - kfree(sbi->new_rec); - kvfree(ntfs_put_shared(sbi->upcase)); - kfree(sbi->def_table); - wnd_close(&sbi->mft.bitmap); wnd_close(&sbi->used.bitmap); - if (sbi->mft.ni) + if (sbi->mft.ni) { iput(&sbi->mft.ni->vfs_inode); + sbi->mft.ni = NULL; + } - if (sbi->security.ni) + if (sbi->security.ni) { iput(&sbi->security.ni->vfs_inode); + sbi->security.ni = NULL; + } - if (sbi->reparse.ni) + if (sbi->reparse.ni) { iput(&sbi->reparse.ni->vfs_inode); + sbi->reparse.ni = NULL; + } - if (sbi->objid.ni) + if (sbi->objid.ni) { iput(&sbi->objid.ni->vfs_inode); + sbi->objid.ni = NULL; + } - if (sbi->volume.ni) + if (sbi->volume.ni) { iput(&sbi->volume.ni->vfs_inode); + sbi->volume.ni = NULL; + } ntfs_update_mftmirr(sbi, 0); @@ -601,6 +619,13 @@ static noinline void put_ntfs(struct ntfs_sb_info *sbi) indx_clear(&sbi->security.index_sdh); indx_clear(&sbi->reparse.index_r); indx_clear(&sbi->objid.index_o); +} + +static void ntfs3_free_sbi(struct ntfs_sb_info *sbi) +{ + kfree(sbi->new_rec); + kvfree(ntfs_put_shared(sbi->upcase)); + kfree(sbi->def_table); kfree(sbi->compress.lznt); #ifdef CONFIG_NTFS3_LZX_XPRESS xpress_free_decompressor(sbi->compress.xpress); @@ -625,12 +650,7 @@ static void ntfs_put_super(struct super_block *sb) /* Mark rw ntfs as clear, if possible. */ ntfs_set_state(sbi, NTFS_DIRTY_CLEAR); - - put_mount_options(sbi->options); - put_ntfs(sbi); - sb->s_fs_info = NULL; - - sync_blockdev(sb->s_bdev); + ntfs3_put_sbi(sbi); } static int ntfs_statfs(struct dentry *dentry, struct kstatfs *buf) @@ -791,6 +811,7 @@ static int ntfs_nfs_commit_metadata(struct inode *inode) } static const struct export_operations ntfs_export_ops = { + .encode_fh = generic_encode_ino32_fh, .fh_to_dentry = ntfs_fh_to_dentry, .fh_to_parent = ntfs_fh_to_parent, .get_parent = ntfs3_get_parent, @@ -838,7 +859,7 @@ static int ntfs_init_from_boot(struct super_block *sb, u32 sector_size, struct ntfs_sb_info *sbi = sb->s_fs_info; int err; u32 mb, gb, boot_sector_size, sct_per_clst, record_size; - u64 sectors, clusters, mlcn, mlcn2; + u64 sectors, clusters, mlcn, mlcn2, dev_size0; struct NTFS_BOOT *boot; struct buffer_head *bh; struct MFT_REC *rec; @@ -847,6 +868,9 @@ static int ntfs_init_from_boot(struct super_block *sb, u32 sector_size, u32 boot_off = 0; const char *hint = "Primary boot"; + /* Save original dev_size. Used with alternative boot. */ + dev_size0 = dev_size; + sbi->volume.blocks = dev_size >> PAGE_SHIFT; bh = ntfs_bread(sb, 0); @@ -855,6 +879,11 @@ static int ntfs_init_from_boot(struct super_block *sb, u32 sector_size, check_boot: err = -EINVAL; + + /* Corrupted image; do not read OOB */ + if (bh->b_size - sizeof(*boot) < boot_off) + goto out; + boot = (struct NTFS_BOOT *)Add2Ptr(bh->b_data, boot_off); if (memcmp(boot->system_id, "NTFS ", sizeof("NTFS ") - 1)) { @@ -901,9 +930,17 @@ check_boot: goto out; } - sbi->record_size = record_size = - boot->record_size < 0 ? 1 << (-boot->record_size) : - (u32)boot->record_size << cluster_bits; + if (boot->record_size >= 0) { + record_size = (u32)boot->record_size << cluster_bits; + } else if (-boot->record_size <= MAXIMUM_SHIFT_BYTES_PER_MFT) { + record_size = 1u << (-boot->record_size); + } else { + ntfs_err(sb, "%s: invalid record size %d.", hint, + boot->record_size); + goto out; + } + + sbi->record_size = record_size; sbi->record_bits = blksize_bits(record_size); sbi->attr_size_tr = (5 * record_size >> 4); // ~320 bytes @@ -920,9 +957,15 @@ check_boot: goto out; } - sbi->index_size = boot->index_size < 0 ? - 1u << (-boot->index_size) : - (u32)boot->index_size << cluster_bits; + if (boot->index_size >= 0) { + sbi->index_size = (u32)boot->index_size << cluster_bits; + } else if (-boot->index_size <= MAXIMUM_SHIFT_BYTES_PER_INDEX) { + sbi->index_size = 1u << (-boot->index_size); + } else { + ntfs_err(sb, "%s: invalid index size %d.", hint, + boot->index_size); + goto out; + } /* Check index record size. */ if (sbi->index_size < SECTOR_SIZE || !is_power_of_2(sbi->index_size)) { @@ -1057,17 +1100,17 @@ check_boot: if (bh->b_blocknr && !sb_rdonly(sb)) { /* - * Alternative boot is ok but primary is not ok. - * Do not update primary boot here 'cause it may be faked boot. - * Let ntfs to be mounted and update boot later. - */ + * Alternative boot is ok but primary is not ok. + * Do not update primary boot here 'cause it may be faked boot. + * Let ntfs to be mounted and update boot later. + */ *boot2 = kmemdup(boot, sizeof(*boot), GFP_NOFS | __GFP_NOWARN); } out: - if (err == -EINVAL && !bh->b_blocknr && dev_size > PAGE_SHIFT) { + if (err == -EINVAL && !bh->b_blocknr && dev_size0 > PAGE_SHIFT) { u32 block_size = min_t(u32, sector_size, PAGE_SIZE); - u64 lbo = dev_size - sizeof(*boot); + u64 lbo = dev_size0 - sizeof(*boot); /* * Try alternative boot (last sector) @@ -1081,6 +1124,7 @@ out: boot_off = lbo & (block_size - 1); hint = "Alternative boot"; + dev_size = dev_size0; /* restore original size. */ goto check_boot; } brelse(bh); @@ -1369,7 +1413,7 @@ static int ntfs_fill_super(struct super_block *sb, struct fs_context *fc) } bytes = inode->i_size; - sbi->def_table = t = kmalloc(bytes, GFP_NOFS | __GFP_NOWARN); + sbi->def_table = t = kvmalloc(bytes, GFP_KERNEL); if (!t) { err = -ENOMEM; goto put_inode_out; @@ -1523,9 +1567,9 @@ load_root: if (boot2) { /* - * Alternative boot is ok but primary is not ok. - * Volume is recognized as NTFS. Update primary boot. - */ + * Alternative boot is ok but primary is not ok. + * Volume is recognized as NTFS. Update primary boot. + */ struct buffer_head *bh0 = sb_getblk(sb, 0); if (bh0) { if (buffer_locked(bh0)) @@ -1564,15 +1608,9 @@ load_root: put_inode_out: iput(inode); out: - /* - * Free resources here. - * ntfs_fs_free will be called with fc->s_fs_info = NULL - */ - put_mount_options(sbi->options); - put_ntfs(sbi); - sb->s_fs_info = NULL; + ntfs3_put_sbi(sbi); kfree(boot2); - + ntfs3_put_sbi(sbi); return err; } @@ -1658,8 +1696,10 @@ static void ntfs_fs_free(struct fs_context *fc) struct ntfs_mount_options *opts = fc->fs_private; struct ntfs_sb_info *sbi = fc->s_fs_info; - if (sbi) - put_ntfs(sbi); + if (sbi) { + ntfs3_put_sbi(sbi); + ntfs3_free_sbi(sbi); + } if (opts) put_mount_options(opts); @@ -1728,13 +1768,24 @@ free_opts: return -ENOMEM; } +static void ntfs3_kill_sb(struct super_block *sb) +{ + struct ntfs_sb_info *sbi = sb->s_fs_info; + + kill_block_super(sb); + + if (sbi->options) + put_mount_options(sbi->options); + ntfs3_free_sbi(sbi); +} + // clang-format off static struct file_system_type ntfs_fs_type = { .owner = THIS_MODULE, .name = "ntfs3", .init_fs_context = ntfs_init_fs_context, .parameters = ntfs_fs_parameters, - .kill_sb = kill_block_super, + .kill_sb = ntfs3_kill_sb, .fs_flags = FS_REQUIRES_DEV | FS_ALLOW_IDMAP, }; // clang-format on @@ -1753,7 +1804,6 @@ static int __init init_ntfs_fs(void) if (IS_ENABLED(CONFIG_NTFS3_LZX_XPRESS)) pr_info("ntfs3: Read-only LZX/Xpress compression included\n"); - #ifdef CONFIG_PROC_FS /* Create "/proc/fs/ntfs3" */ proc_info_root = proc_mkdir("fs/ntfs3", NULL); @@ -1795,7 +1845,6 @@ static void __exit exit_ntfs_fs(void) if (proc_info_root) remove_proc_entry("fs/ntfs3", NULL); #endif - } MODULE_LICENSE("GPL"); diff --git a/fs/ntfs3/xattr.c b/fs/ntfs3/xattr.c index 023f314e8950..4274b6f31cfa 100644 --- a/fs/ntfs3/xattr.c +++ b/fs/ntfs3/xattr.c @@ -211,7 +211,8 @@ static ssize_t ntfs_list_ea(struct ntfs_inode *ni, char *buffer, size = le32_to_cpu(info->size); /* Enumerate all xattrs. */ - for (ret = 0, off = 0; off < size; off += ea_size) { + ret = 0; + for (off = 0; off + sizeof(struct EA_FULL) < size; off += ea_size) { ea = Add2Ptr(ea_all, off); ea_size = unpacked_ea_size(ea); @@ -219,6 +220,10 @@ static ssize_t ntfs_list_ea(struct ntfs_inode *ni, char *buffer, break; if (buffer) { + /* Check if we can use field ea->name */ + if (off + ea_size > size) + break; + if (ret + ea->name_len + 1 > bytes_per_buffer) { err = -ERANGE; goto out; @@ -637,7 +642,7 @@ static noinline int ntfs_set_acl_ex(struct mnt_idmap *idmap, if (!err) { set_cached_acl(inode, type, acl); inode->i_mode = mode; - inode->i_ctime = current_time(inode); + inode_set_ctime_current(inode); mark_inode_dirty(inode); } @@ -924,7 +929,7 @@ set_new_fa: NULL); out: - inode->i_ctime = current_time(inode); + inode_set_ctime_current(inode); mark_inode_dirty(inode); return err; @@ -1016,7 +1021,7 @@ static const struct xattr_handler ntfs_other_xattr_handler = { .list = ntfs_xattr_user_list, }; -const struct xattr_handler *ntfs_xattr_handlers[] = { +const struct xattr_handler * const ntfs_xattr_handlers[] = { &ntfs_other_xattr_handler, NULL, }; diff --git a/fs/ocfs2/Kconfig b/fs/ocfs2/Kconfig index 3123da7cfb30..2514d36cbe01 100644 --- a/fs/ocfs2/Kconfig +++ b/fs/ocfs2/Kconfig @@ -2,6 +2,7 @@ config OCFS2_FS tristate "OCFS2 file system support" depends on INET && SYSFS && CONFIGFS_FS + select BUFFER_HEAD select JBD2 select CRC32 select QUOTA diff --git a/fs/ocfs2/acl.c b/fs/ocfs2/acl.c index 9fd03eaf15f8..62464d194da3 100644 --- a/fs/ocfs2/acl.c +++ b/fs/ocfs2/acl.c @@ -191,10 +191,10 @@ static int ocfs2_acl_set_mode(struct inode *inode, struct buffer_head *di_bh, } inode->i_mode = new_mode; - inode->i_ctime = current_time(inode); + inode_set_ctime_current(inode); di->i_mode = cpu_to_le16(inode->i_mode); - di->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec); - di->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec); + di->i_ctime = cpu_to_le64(inode_get_ctime_sec(inode)); + di->i_ctime_nsec = cpu_to_le32(inode_get_ctime_nsec(inode)); ocfs2_update_inode_fsync_trans(handle, inode, 0); ocfs2_journal_dirty(handle, di_bh); diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index 51c93929a146..91b32b2377ac 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c @@ -967,7 +967,14 @@ int ocfs2_num_free_extents(struct ocfs2_extent_tree *et) el = &eb->h_list; } - BUG_ON(el->l_tree_depth != 0); + if (el->l_tree_depth != 0) { + retval = ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci), + "Owner %llu has leaf extent block %llu with an invalid l_tree_depth of %u\n", + (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci), + (unsigned long long)last_eb_blk, + le16_to_cpu(el->l_tree_depth)); + goto bail; + } retval = le16_to_cpu(el->l_count) - le16_to_cpu(el->l_next_free_rec); bail: @@ -7436,10 +7443,10 @@ int ocfs2_truncate_inline(struct inode *inode, struct buffer_head *di_bh, } inode->i_blocks = ocfs2_inode_sector_count(inode); - inode->i_ctime = inode->i_mtime = current_time(inode); + inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); - di->i_ctime = di->i_mtime = cpu_to_le64(inode->i_ctime.tv_sec); - di->i_ctime_nsec = di->i_mtime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec); + di->i_ctime = di->i_mtime = cpu_to_le64(inode_get_ctime_sec(inode)); + di->i_ctime_nsec = di->i_mtime_nsec = cpu_to_le32(inode_get_ctime_nsec(inode)); ocfs2_update_inode_fsync_trans(handle, inode, 1); ocfs2_journal_dirty(handle, di_bh); @@ -7642,7 +7649,7 @@ out_mutex: goto next_group; } out: - range->len = trimmed * sb->s_blocksize; + range->len = trimmed * osb->s_clustersize; return ret; } diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index 8dfc284e85f0..ba790219d528 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -568,10 +568,10 @@ static void ocfs2_clear_page_regions(struct page *page, * read-in the blocks at the tail of our file. Avoid reading them by * testing i_size against each block offset. */ -static int ocfs2_should_read_blk(struct inode *inode, struct page *page, +static int ocfs2_should_read_blk(struct inode *inode, struct folio *folio, unsigned int block_start) { - u64 offset = page_offset(page) + block_start; + u64 offset = folio_pos(folio) + block_start; if (ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb))) return 1; @@ -593,15 +593,16 @@ int ocfs2_map_page_blocks(struct page *page, u64 *p_blkno, struct inode *inode, unsigned int from, unsigned int to, int new) { + struct folio *folio = page_folio(page); int ret = 0; struct buffer_head *head, *bh, *wait[2], **wait_bh = wait; unsigned int block_end, block_start; unsigned int bsize = i_blocksize(inode); - if (!page_has_buffers(page)) - create_empty_buffers(page, bsize, 0); + head = folio_buffers(folio); + if (!head) + head = create_empty_buffers(folio, bsize, 0); - head = page_buffers(page); for (bh = head, block_start = 0; bh != head || !block_start; bh = bh->b_this_page, block_start += bsize) { block_end = block_start + bsize; @@ -613,7 +614,7 @@ int ocfs2_map_page_blocks(struct page *page, u64 *p_blkno, * they may belong to unallocated clusters. */ if (block_start >= to || block_end <= from) { - if (PageUptodate(page)) + if (folio_test_uptodate(folio)) set_buffer_uptodate(bh); continue; } @@ -630,11 +631,11 @@ int ocfs2_map_page_blocks(struct page *page, u64 *p_blkno, clean_bdev_bh_alias(bh); } - if (PageUptodate(page)) { + if (folio_test_uptodate(folio)) { set_buffer_uptodate(bh); } else if (!buffer_uptodate(bh) && !buffer_delay(bh) && !buffer_new(bh) && - ocfs2_should_read_blk(inode, page, block_start) && + ocfs2_should_read_blk(inode, folio, block_start) && (block_start < from || block_end > to)) { bh_read_nowait(bh, 0); *wait_bh++=bh; @@ -668,7 +669,7 @@ int ocfs2_map_page_blocks(struct page *page, u64 *p_blkno, if (block_start >= to) break; - zero_user(page, block_start, bh->b_size); + folio_zero_range(folio, block_start, bh->b_size); set_buffer_uptodate(bh); mark_buffer_dirty(bh); @@ -2048,9 +2049,9 @@ out_write_size: } inode->i_blocks = ocfs2_inode_sector_count(inode); di->i_size = cpu_to_le64((u64)i_size_read(inode)); - inode->i_mtime = inode->i_ctime = current_time(inode); - di->i_mtime = di->i_ctime = cpu_to_le64(inode->i_mtime.tv_sec); - di->i_mtime_nsec = di->i_ctime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec); + inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); + di->i_mtime = di->i_ctime = cpu_to_le64(inode_get_mtime_sec(inode)); + di->i_mtime_nsec = di->i_ctime_nsec = cpu_to_le32(inode_get_mtime_nsec(inode)); if (handle) ocfs2_update_inode_fsync_trans(handle, inode, 1); } diff --git a/fs/ocfs2/buffer_head_io.c b/fs/ocfs2/buffer_head_io.c index 196638a22b48..cdb9b9bdea1f 100644 --- a/fs/ocfs2/buffer_head_io.c +++ b/fs/ocfs2/buffer_head_io.c @@ -158,7 +158,7 @@ read_failure: if (new_bh && bh) { /* If middle bh fails, let previous bh * finish its read and then put it to - * aovoid bh leak + * avoid bh leak */ if (!buffer_jbd(bh)) wait_on_buffer(bh); @@ -345,7 +345,7 @@ read_failure: if (new_bh && bh) { /* If middle bh fails, let previous bh * finish its read and then put it to - * aovoid bh leak + * avoid bh leak */ if (!buffer_jbd(bh)) wait_on_buffer(bh); diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c index 21472e3ed182..4d7efefa98c5 100644 --- a/fs/ocfs2/cluster/heartbeat.c +++ b/fs/ocfs2/cluster/heartbeat.c @@ -213,7 +213,7 @@ struct o2hb_region { unsigned int hr_num_pages; struct page **hr_slot_data; - struct block_device *hr_bdev; + struct bdev_handle *hr_bdev_handle; struct o2hb_disk_slot *hr_slots; /* live node map of this region */ @@ -261,6 +261,11 @@ struct o2hb_region { int hr_last_hb_status; }; +static inline struct block_device *reg_bdev(struct o2hb_region *reg) +{ + return reg->hr_bdev_handle ? reg->hr_bdev_handle->bdev : NULL; +} + struct o2hb_bio_wait_ctxt { atomic_t wc_num_reqs; struct completion wc_io_complete; @@ -286,7 +291,7 @@ static void o2hb_write_timeout(struct work_struct *work) hr_write_timeout_work.work); mlog(ML_ERROR, "Heartbeat write timeout to device %pg after %u " - "milliseconds\n", reg->hr_bdev, + "milliseconds\n", reg_bdev(reg), jiffies_to_msecs(jiffies - reg->hr_last_timeout_start)); if (o2hb_global_heartbeat_active()) { @@ -383,7 +388,7 @@ static void o2hb_nego_timeout(struct work_struct *work) if (!test_bit(master_node, reg->hr_nego_node_bitmap)) { printk(KERN_NOTICE "o2hb: node %d hb write hung for %ds on region %s (%pg).\n", o2nm_this_node(), O2HB_NEGO_TIMEOUT_MS/1000, - config_item_name(®->hr_item), reg->hr_bdev); + config_item_name(®->hr_item), reg_bdev(reg)); set_bit(master_node, reg->hr_nego_node_bitmap); } if (!bitmap_equal(reg->hr_nego_node_bitmap, live_node_bitmap, @@ -398,7 +403,8 @@ static void o2hb_nego_timeout(struct work_struct *work) } printk(KERN_NOTICE "o2hb: all nodes hb write hung, maybe region %s (%pg) is down.\n", - config_item_name(®->hr_item), reg->hr_bdev); + config_item_name(®->hr_item), + reg_bdev(reg)); /* approve negotiate timeout request. */ o2hb_arm_timeout(reg); @@ -419,7 +425,7 @@ static void o2hb_nego_timeout(struct work_struct *work) /* negotiate timeout with master node. */ printk(KERN_NOTICE "o2hb: node %d hb write hung for %ds on region %s (%pg), negotiate timeout with node %d.\n", o2nm_this_node(), O2HB_NEGO_TIMEOUT_MS/1000, config_item_name(®->hr_item), - reg->hr_bdev, master_node); + reg_bdev(reg), master_node); ret = o2hb_send_nego_msg(reg->hr_key, O2HB_NEGO_TIMEOUT_MSG, master_node); if (ret) @@ -436,7 +442,8 @@ static int o2hb_nego_timeout_handler(struct o2net_msg *msg, u32 len, void *data, nego_msg = (struct o2hb_nego_msg *)msg->buf; printk(KERN_NOTICE "o2hb: receive negotiate timeout message from node %d on region %s (%pg).\n", - nego_msg->node_num, config_item_name(®->hr_item), reg->hr_bdev); + nego_msg->node_num, config_item_name(®->hr_item), + reg_bdev(reg)); if (nego_msg->node_num < O2NM_MAX_NODES) set_bit(nego_msg->node_num, reg->hr_nego_node_bitmap); else @@ -451,7 +458,7 @@ static int o2hb_nego_approve_handler(struct o2net_msg *msg, u32 len, void *data, struct o2hb_region *reg = data; printk(KERN_NOTICE "o2hb: negotiate timeout approved by master node on region %s (%pg).\n", - config_item_name(®->hr_item), reg->hr_bdev); + config_item_name(®->hr_item), reg_bdev(reg)); o2hb_arm_timeout(reg); return 0; } @@ -515,7 +522,7 @@ static struct bio *o2hb_setup_one_bio(struct o2hb_region *reg, * GFP_KERNEL that the local node can get fenced. It would be * nicest if we could pre-allocate these bios and avoid this * all together. */ - bio = bio_alloc(reg->hr_bdev, 16, opf, GFP_ATOMIC); + bio = bio_alloc(reg_bdev(reg), 16, opf, GFP_ATOMIC); if (!bio) { mlog(ML_ERROR, "Could not alloc slots BIO!\n"); bio = ERR_PTR(-ENOMEM); @@ -687,7 +694,7 @@ static int o2hb_check_own_slot(struct o2hb_region *reg) errstr = ERRSTR3; mlog(ML_ERROR, "%s (%pg): expected(%u:0x%llx, 0x%llx), " - "ondisk(%u:0x%llx, 0x%llx)\n", errstr, reg->hr_bdev, + "ondisk(%u:0x%llx, 0x%llx)\n", errstr, reg_bdev(reg), slot->ds_node_num, (unsigned long long)slot->ds_last_generation, (unsigned long long)slot->ds_last_time, hb_block->hb_node, (unsigned long long)le64_to_cpu(hb_block->hb_generation), @@ -861,7 +868,7 @@ static void o2hb_set_quorum_device(struct o2hb_region *reg) goto unlock; printk(KERN_NOTICE "o2hb: Region %s (%pg) is now a quorum device\n", - config_item_name(®->hr_item), reg->hr_bdev); + config_item_name(®->hr_item), reg_bdev(reg)); set_bit(reg->hr_region_num, o2hb_quorum_region_bitmap); @@ -920,7 +927,7 @@ static int o2hb_check_slot(struct o2hb_region *reg, * consider it a transient miss but don't populate any * other values as they may be junk. */ mlog(ML_ERROR, "Node %d has written a bad crc to %pg\n", - slot->ds_node_num, reg->hr_bdev); + slot->ds_node_num, reg_bdev(reg)); o2hb_dump_slot(hb_block); slot->ds_equal_samples++; @@ -1003,8 +1010,8 @@ fire_callbacks: "of %u ms, but our count is %u ms.\n" "Please double check your configuration values " "for 'O2CB_HEARTBEAT_THRESHOLD'\n", - slot->ds_node_num, reg->hr_bdev, slot_dead_ms, - dead_ms); + slot->ds_node_num, reg_bdev(reg), + slot_dead_ms, dead_ms); } goto out; } @@ -1143,7 +1150,7 @@ static int o2hb_do_disk_heartbeat(struct o2hb_region *reg) * can't be sure that the new block ever made it to * disk */ mlog(ML_ERROR, "Write error %d on device \"%pg\"\n", - write_wc.wc_error, reg->hr_bdev); + write_wc.wc_error, reg_bdev(reg)); ret = write_wc.wc_error; goto bail; } @@ -1169,7 +1176,7 @@ bail: printk(KERN_NOTICE "o2hb: Unable to stabilize " "heartbeat on region %s (%pg)\n", config_item_name(®->hr_item), - reg->hr_bdev); + reg_bdev(reg)); atomic_set(®->hr_steady_iterations, 0); reg->hr_aborted_start = 1; wake_up(&o2hb_steady_queue); @@ -1489,7 +1496,7 @@ static void o2hb_region_release(struct config_item *item) struct page *page; struct o2hb_region *reg = to_o2hb_region(item); - mlog(ML_HEARTBEAT, "hb region release (%pg)\n", reg->hr_bdev); + mlog(ML_HEARTBEAT, "hb region release (%pg)\n", reg_bdev(reg)); kfree(reg->hr_tmp_block); @@ -1502,8 +1509,8 @@ static void o2hb_region_release(struct config_item *item) kfree(reg->hr_slot_data); } - if (reg->hr_bdev) - blkdev_put(reg->hr_bdev, NULL); + if (reg->hr_bdev_handle) + bdev_release(reg->hr_bdev_handle); kfree(reg->hr_slots); @@ -1562,7 +1569,7 @@ static ssize_t o2hb_region_block_bytes_store(struct config_item *item, unsigned long block_bytes; unsigned int block_bits; - if (reg->hr_bdev) + if (reg->hr_bdev_handle) return -EINVAL; status = o2hb_read_block_input(reg, page, &block_bytes, @@ -1591,7 +1598,7 @@ static ssize_t o2hb_region_start_block_store(struct config_item *item, char *p = (char *)page; ssize_t ret; - if (reg->hr_bdev) + if (reg->hr_bdev_handle) return -EINVAL; ret = kstrtoull(p, 0, &tmp); @@ -1616,7 +1623,7 @@ static ssize_t o2hb_region_blocks_store(struct config_item *item, unsigned long tmp; char *p = (char *)page; - if (reg->hr_bdev) + if (reg->hr_bdev_handle) return -EINVAL; tmp = simple_strtoul(p, &p, 0); @@ -1635,8 +1642,8 @@ static ssize_t o2hb_region_dev_show(struct config_item *item, char *page) { unsigned int ret = 0; - if (to_o2hb_region(item)->hr_bdev) - ret = sprintf(page, "%pg\n", to_o2hb_region(item)->hr_bdev); + if (to_o2hb_region(item)->hr_bdev_handle) + ret = sprintf(page, "%pg\n", reg_bdev(to_o2hb_region(item))); return ret; } @@ -1745,7 +1752,10 @@ out: return ret; } -/* this is acting as commit; we set up all of hr_bdev and hr_task or nothing */ +/* + * this is acting as commit; we set up all of hr_bdev_handle and hr_task or + * nothing + */ static ssize_t o2hb_region_dev_store(struct config_item *item, const char *page, size_t count) @@ -1759,7 +1769,7 @@ static ssize_t o2hb_region_dev_store(struct config_item *item, ssize_t ret = -EINVAL; int live_threshold; - if (reg->hr_bdev) + if (reg->hr_bdev_handle) goto out; /* We can't heartbeat without having had our node number @@ -1785,16 +1795,15 @@ static ssize_t o2hb_region_dev_store(struct config_item *item, if (!S_ISBLK(f.file->f_mapping->host->i_mode)) goto out2; - reg->hr_bdev = blkdev_get_by_dev(f.file->f_mapping->host->i_rdev, - BLK_OPEN_WRITE | BLK_OPEN_READ, NULL, - NULL); - if (IS_ERR(reg->hr_bdev)) { - ret = PTR_ERR(reg->hr_bdev); - reg->hr_bdev = NULL; + reg->hr_bdev_handle = bdev_open_by_dev(f.file->f_mapping->host->i_rdev, + BLK_OPEN_WRITE | BLK_OPEN_READ, NULL, NULL); + if (IS_ERR(reg->hr_bdev_handle)) { + ret = PTR_ERR(reg->hr_bdev_handle); + reg->hr_bdev_handle = NULL; goto out2; } - sectsize = bdev_logical_block_size(reg->hr_bdev); + sectsize = bdev_logical_block_size(reg_bdev(reg)); if (sectsize != reg->hr_block_bytes) { mlog(ML_ERROR, "blocksize %u incorrect for device, expected %d", @@ -1890,12 +1899,12 @@ static ssize_t o2hb_region_dev_store(struct config_item *item, if (hb_task && o2hb_global_heartbeat_active()) printk(KERN_NOTICE "o2hb: Heartbeat started on region %s (%pg)\n", - config_item_name(®->hr_item), reg->hr_bdev); + config_item_name(®->hr_item), reg_bdev(reg)); out3: if (ret < 0) { - blkdev_put(reg->hr_bdev, NULL); - reg->hr_bdev = NULL; + bdev_release(reg->hr_bdev_handle); + reg->hr_bdev_handle = NULL; } out2: fdput(f); @@ -2085,7 +2094,7 @@ static void o2hb_heartbeat_group_drop_item(struct config_group *group, printk(KERN_NOTICE "o2hb: Heartbeat %s on region %s (%pg)\n", ((atomic_read(®->hr_steady_iterations) == 0) ? "stopped" : "start aborted"), config_item_name(item), - reg->hr_bdev); + reg_bdev(reg)); } /* diff --git a/fs/ocfs2/cluster/netdebug.c b/fs/ocfs2/cluster/netdebug.c index 35c05c18de59..bc27301eab6d 100644 --- a/fs/ocfs2/cluster/netdebug.c +++ b/fs/ocfs2/cluster/netdebug.c @@ -44,17 +44,17 @@ static LIST_HEAD(send_tracking); void o2net_debug_add_nst(struct o2net_send_tracking *nst) { - spin_lock(&o2net_debug_lock); + spin_lock_bh(&o2net_debug_lock); list_add(&nst->st_net_debug_item, &send_tracking); - spin_unlock(&o2net_debug_lock); + spin_unlock_bh(&o2net_debug_lock); } void o2net_debug_del_nst(struct o2net_send_tracking *nst) { - spin_lock(&o2net_debug_lock); + spin_lock_bh(&o2net_debug_lock); if (!list_empty(&nst->st_net_debug_item)) list_del_init(&nst->st_net_debug_item); - spin_unlock(&o2net_debug_lock); + spin_unlock_bh(&o2net_debug_lock); } static struct o2net_send_tracking @@ -84,9 +84,9 @@ static void *nst_seq_start(struct seq_file *seq, loff_t *pos) { struct o2net_send_tracking *nst, *dummy_nst = seq->private; - spin_lock(&o2net_debug_lock); + spin_lock_bh(&o2net_debug_lock); nst = next_nst(dummy_nst); - spin_unlock(&o2net_debug_lock); + spin_unlock_bh(&o2net_debug_lock); return nst; } @@ -95,13 +95,13 @@ static void *nst_seq_next(struct seq_file *seq, void *v, loff_t *pos) { struct o2net_send_tracking *nst, *dummy_nst = seq->private; - spin_lock(&o2net_debug_lock); + spin_lock_bh(&o2net_debug_lock); nst = next_nst(dummy_nst); list_del_init(&dummy_nst->st_net_debug_item); if (nst) list_add(&dummy_nst->st_net_debug_item, &nst->st_net_debug_item); - spin_unlock(&o2net_debug_lock); + spin_unlock_bh(&o2net_debug_lock); return nst; /* unused, just needs to be null when done */ } @@ -112,7 +112,7 @@ static int nst_seq_show(struct seq_file *seq, void *v) ktime_t now; s64 sock, send, status; - spin_lock(&o2net_debug_lock); + spin_lock_bh(&o2net_debug_lock); nst = next_nst(dummy_nst); if (!nst) goto out; @@ -145,7 +145,7 @@ static int nst_seq_show(struct seq_file *seq, void *v) (long long)status); out: - spin_unlock(&o2net_debug_lock); + spin_unlock_bh(&o2net_debug_lock); return 0; } @@ -191,16 +191,16 @@ static const struct file_operations nst_seq_fops = { void o2net_debug_add_sc(struct o2net_sock_container *sc) { - spin_lock(&o2net_debug_lock); + spin_lock_bh(&o2net_debug_lock); list_add(&sc->sc_net_debug_item, &sock_containers); - spin_unlock(&o2net_debug_lock); + spin_unlock_bh(&o2net_debug_lock); } void o2net_debug_del_sc(struct o2net_sock_container *sc) { - spin_lock(&o2net_debug_lock); + spin_lock_bh(&o2net_debug_lock); list_del_init(&sc->sc_net_debug_item); - spin_unlock(&o2net_debug_lock); + spin_unlock_bh(&o2net_debug_lock); } struct o2net_sock_debug { @@ -236,9 +236,9 @@ static void *sc_seq_start(struct seq_file *seq, loff_t *pos) struct o2net_sock_debug *sd = seq->private; struct o2net_sock_container *sc, *dummy_sc = sd->dbg_sock; - spin_lock(&o2net_debug_lock); + spin_lock_bh(&o2net_debug_lock); sc = next_sc(dummy_sc); - spin_unlock(&o2net_debug_lock); + spin_unlock_bh(&o2net_debug_lock); return sc; } @@ -248,12 +248,12 @@ static void *sc_seq_next(struct seq_file *seq, void *v, loff_t *pos) struct o2net_sock_debug *sd = seq->private; struct o2net_sock_container *sc, *dummy_sc = sd->dbg_sock; - spin_lock(&o2net_debug_lock); + spin_lock_bh(&o2net_debug_lock); sc = next_sc(dummy_sc); list_del_init(&dummy_sc->sc_net_debug_item); if (sc) list_add(&dummy_sc->sc_net_debug_item, &sc->sc_net_debug_item); - spin_unlock(&o2net_debug_lock); + spin_unlock_bh(&o2net_debug_lock); return sc; /* unused, just needs to be null when done */ } @@ -349,7 +349,7 @@ static int sc_seq_show(struct seq_file *seq, void *v) struct o2net_sock_debug *sd = seq->private; struct o2net_sock_container *sc, *dummy_sc = sd->dbg_sock; - spin_lock(&o2net_debug_lock); + spin_lock_bh(&o2net_debug_lock); sc = next_sc(dummy_sc); if (sc) { @@ -359,7 +359,7 @@ static int sc_seq_show(struct seq_file *seq, void *v) sc_show_sock_stats(seq, sc); } - spin_unlock(&o2net_debug_lock); + spin_unlock_bh(&o2net_debug_lock); return 0; } diff --git a/fs/ocfs2/cluster/quorum.c b/fs/ocfs2/cluster/quorum.c index 189c111bc371..15d0ed9c13e5 100644 --- a/fs/ocfs2/cluster/quorum.c +++ b/fs/ocfs2/cluster/quorum.c @@ -93,7 +93,7 @@ static void o2quo_make_decision(struct work_struct *work) int lowest_hb, lowest_reachable = 0, fence = 0; struct o2quo_state *qs = &o2quo_state; - spin_lock(&qs->qs_lock); + spin_lock_bh(&qs->qs_lock); lowest_hb = find_first_bit(qs->qs_hb_bm, O2NM_MAX_NODES); if (lowest_hb != O2NM_MAX_NODES) @@ -146,14 +146,14 @@ static void o2quo_make_decision(struct work_struct *work) out: if (fence) { - spin_unlock(&qs->qs_lock); + spin_unlock_bh(&qs->qs_lock); o2quo_fence_self(); } else { mlog(ML_NOTICE, "not fencing this node, heartbeating: %d, " "connected: %d, lowest: %d (%sreachable)\n", qs->qs_heartbeating, qs->qs_connected, lowest_hb, lowest_reachable ? "" : "un"); - spin_unlock(&qs->qs_lock); + spin_unlock_bh(&qs->qs_lock); } @@ -196,7 +196,7 @@ void o2quo_hb_up(u8 node) { struct o2quo_state *qs = &o2quo_state; - spin_lock(&qs->qs_lock); + spin_lock_bh(&qs->qs_lock); qs->qs_heartbeating++; mlog_bug_on_msg(qs->qs_heartbeating == O2NM_MAX_NODES, @@ -211,7 +211,7 @@ void o2quo_hb_up(u8 node) else o2quo_clear_hold(qs, node); - spin_unlock(&qs->qs_lock); + spin_unlock_bh(&qs->qs_lock); } /* hb going down releases any holds we might have had due to this node from @@ -220,7 +220,7 @@ void o2quo_hb_down(u8 node) { struct o2quo_state *qs = &o2quo_state; - spin_lock(&qs->qs_lock); + spin_lock_bh(&qs->qs_lock); qs->qs_heartbeating--; mlog_bug_on_msg(qs->qs_heartbeating < 0, @@ -233,7 +233,7 @@ void o2quo_hb_down(u8 node) o2quo_clear_hold(qs, node); - spin_unlock(&qs->qs_lock); + spin_unlock_bh(&qs->qs_lock); } /* this tells us that we've decided that the node is still heartbeating @@ -245,14 +245,14 @@ void o2quo_hb_still_up(u8 node) { struct o2quo_state *qs = &o2quo_state; - spin_lock(&qs->qs_lock); + spin_lock_bh(&qs->qs_lock); mlog(0, "node %u\n", node); qs->qs_pending = 1; o2quo_clear_hold(qs, node); - spin_unlock(&qs->qs_lock); + spin_unlock_bh(&qs->qs_lock); } /* This is analogous to hb_up. as a node's connection comes up we delay the @@ -264,7 +264,7 @@ void o2quo_conn_up(u8 node) { struct o2quo_state *qs = &o2quo_state; - spin_lock(&qs->qs_lock); + spin_lock_bh(&qs->qs_lock); qs->qs_connected++; mlog_bug_on_msg(qs->qs_connected == O2NM_MAX_NODES, @@ -279,7 +279,7 @@ void o2quo_conn_up(u8 node) else o2quo_clear_hold(qs, node); - spin_unlock(&qs->qs_lock); + spin_unlock_bh(&qs->qs_lock); } /* we've decided that we won't ever be connecting to the node again. if it's @@ -290,7 +290,7 @@ void o2quo_conn_err(u8 node) { struct o2quo_state *qs = &o2quo_state; - spin_lock(&qs->qs_lock); + spin_lock_bh(&qs->qs_lock); if (test_bit(node, qs->qs_conn_bm)) { qs->qs_connected--; @@ -307,7 +307,7 @@ void o2quo_conn_err(u8 node) mlog(0, "node %u, %d total\n", node, qs->qs_connected); - spin_unlock(&qs->qs_lock); + spin_unlock_bh(&qs->qs_lock); } void o2quo_init(void) diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c index 694471fc46b8..a14c8fee6ee5 100644 --- a/fs/ocfs2/dir.c +++ b/fs/ocfs2/dir.c @@ -1658,7 +1658,8 @@ int __ocfs2_add_entry(handle_t *handle, offset, ocfs2_dir_trailer_blk_off(dir->i_sb)); if (ocfs2_dirent_would_fit(de, rec_len)) { - dir->i_mtime = dir->i_ctime = current_time(dir); + inode_set_mtime_to_ts(dir, + inode_set_ctime_current(dir)); retval = ocfs2_mark_inode_dirty(handle, dir, parent_fe_bh); if (retval < 0) { mlog_errno(retval); @@ -2962,11 +2963,11 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh, ocfs2_dinode_new_extent_list(dir, di); i_size_write(dir, sb->s_blocksize); - dir->i_mtime = dir->i_ctime = current_time(dir); + inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir)); di->i_size = cpu_to_le64(sb->s_blocksize); - di->i_ctime = di->i_mtime = cpu_to_le64(dir->i_ctime.tv_sec); - di->i_ctime_nsec = di->i_mtime_nsec = cpu_to_le32(dir->i_ctime.tv_nsec); + di->i_ctime = di->i_mtime = cpu_to_le64(inode_get_ctime_sec(dir)); + di->i_ctime_nsec = di->i_mtime_nsec = cpu_to_le32(inode_get_ctime_nsec(dir)); ocfs2_update_inode_fsync_trans(handle, dir, 1); /* diff --git a/fs/ocfs2/dlmfs/dlmfs.c b/fs/ocfs2/dlmfs/dlmfs.c index ba26c5567cff..85215162c9dd 100644 --- a/fs/ocfs2/dlmfs/dlmfs.c +++ b/fs/ocfs2/dlmfs/dlmfs.c @@ -80,8 +80,7 @@ static int param_set_dlmfs_capabilities(const char *val, static int param_get_dlmfs_capabilities(char *buffer, const struct kernel_param *kp) { - return strlcpy(buffer, DLMFS_CAPABILITIES, - strlen(DLMFS_CAPABILITIES) + 1); + return sysfs_emit(buffer, DLMFS_CAPABILITIES); } module_param_call(capabilities, param_set_dlmfs_capabilities, param_get_dlmfs_capabilities, NULL, 0444); @@ -337,7 +336,7 @@ static struct inode *dlmfs_get_root_inode(struct super_block *sb) if (inode) { inode->i_ino = get_next_ino(); inode_init_owner(&nop_mnt_idmap, inode, NULL, mode); - inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode); + simple_inode_init_ts(inode); inc_nlink(inode); inode->i_fop = &simple_dir_operations; @@ -360,7 +359,7 @@ static struct inode *dlmfs_get_inode(struct inode *parent, inode->i_ino = get_next_ino(); inode_init_owner(&nop_mnt_idmap, inode, parent, mode); - inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode); + simple_inode_init_ts(inode); ip = DLMFS_I(inode); ip->ip_conn = DLMFS_I(parent)->ip_conn; diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c index c28bc983a7b1..64a6ef638495 100644 --- a/fs/ocfs2/dlmglue.c +++ b/fs/ocfs2/dlmglue.c @@ -2162,6 +2162,7 @@ static void __ocfs2_stuff_meta_lvb(struct inode *inode) struct ocfs2_inode_info *oi = OCFS2_I(inode); struct ocfs2_lock_res *lockres = &oi->ip_inode_lockres; struct ocfs2_meta_lvb *lvb; + struct timespec64 ts; lvb = ocfs2_dlm_lvb(&lockres->l_lksb); @@ -2182,12 +2183,12 @@ static void __ocfs2_stuff_meta_lvb(struct inode *inode) lvb->lvb_igid = cpu_to_be32(i_gid_read(inode)); lvb->lvb_imode = cpu_to_be16(inode->i_mode); lvb->lvb_inlink = cpu_to_be16(inode->i_nlink); - lvb->lvb_iatime_packed = - cpu_to_be64(ocfs2_pack_timespec(&inode->i_atime)); - lvb->lvb_ictime_packed = - cpu_to_be64(ocfs2_pack_timespec(&inode->i_ctime)); - lvb->lvb_imtime_packed = - cpu_to_be64(ocfs2_pack_timespec(&inode->i_mtime)); + ts = inode_get_atime(inode); + lvb->lvb_iatime_packed = cpu_to_be64(ocfs2_pack_timespec(&ts)); + ts = inode_get_ctime(inode); + lvb->lvb_ictime_packed = cpu_to_be64(ocfs2_pack_timespec(&ts)); + ts = inode_get_mtime(inode); + lvb->lvb_imtime_packed = cpu_to_be64(ocfs2_pack_timespec(&ts)); lvb->lvb_iattr = cpu_to_be32(oi->ip_attr); lvb->lvb_idynfeatures = cpu_to_be16(oi->ip_dyn_features); lvb->lvb_igeneration = cpu_to_be32(inode->i_generation); @@ -2208,6 +2209,7 @@ static int ocfs2_refresh_inode_from_lvb(struct inode *inode) struct ocfs2_inode_info *oi = OCFS2_I(inode); struct ocfs2_lock_res *lockres = &oi->ip_inode_lockres; struct ocfs2_meta_lvb *lvb; + struct timespec64 ts; mlog_meta_lvb(0, lockres); @@ -2234,12 +2236,12 @@ static int ocfs2_refresh_inode_from_lvb(struct inode *inode) i_gid_write(inode, be32_to_cpu(lvb->lvb_igid)); inode->i_mode = be16_to_cpu(lvb->lvb_imode); set_nlink(inode, be16_to_cpu(lvb->lvb_inlink)); - ocfs2_unpack_timespec(&inode->i_atime, - be64_to_cpu(lvb->lvb_iatime_packed)); - ocfs2_unpack_timespec(&inode->i_mtime, - be64_to_cpu(lvb->lvb_imtime_packed)); - ocfs2_unpack_timespec(&inode->i_ctime, - be64_to_cpu(lvb->lvb_ictime_packed)); + ocfs2_unpack_timespec(&ts, be64_to_cpu(lvb->lvb_iatime_packed)); + inode_set_atime_to_ts(inode, ts); + ocfs2_unpack_timespec(&ts, be64_to_cpu(lvb->lvb_imtime_packed)); + inode_set_mtime_to_ts(inode, ts); + ocfs2_unpack_timespec(&ts, be64_to_cpu(lvb->lvb_ictime_packed)); + inode_set_ctime_to_ts(inode, ts); spin_unlock(&oi->ip_lock); return 0; } diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index bf2c17ea96a0..94e2a1244442 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -232,15 +232,19 @@ int ocfs2_should_update_atime(struct inode *inode, return 0; if (vfsmnt->mnt_flags & MNT_RELATIME) { - if ((timespec64_compare(&inode->i_atime, &inode->i_mtime) <= 0) || - (timespec64_compare(&inode->i_atime, &inode->i_ctime) <= 0)) + struct timespec64 ctime = inode_get_ctime(inode); + struct timespec64 atime = inode_get_atime(inode); + struct timespec64 mtime = inode_get_mtime(inode); + + if ((timespec64_compare(&atime, &mtime) <= 0) || + (timespec64_compare(&atime, &ctime) <= 0)) return 1; return 0; } now = current_time(inode); - if ((now.tv_sec - inode->i_atime.tv_sec <= osb->s_atime_quantum)) + if ((now.tv_sec - inode_get_atime_sec(inode) <= osb->s_atime_quantum)) return 0; else return 1; @@ -273,9 +277,9 @@ int ocfs2_update_inode_atime(struct inode *inode, * have i_rwsem to guard against concurrent changes to other * inode fields. */ - inode->i_atime = current_time(inode); - di->i_atime = cpu_to_le64(inode->i_atime.tv_sec); - di->i_atime_nsec = cpu_to_le32(inode->i_atime.tv_nsec); + inode_set_atime_to_ts(inode, current_time(inode)); + di->i_atime = cpu_to_le64(inode_get_atime_sec(inode)); + di->i_atime_nsec = cpu_to_le32(inode_get_atime_nsec(inode)); ocfs2_update_inode_fsync_trans(handle, inode, 0); ocfs2_journal_dirty(handle, bh); @@ -294,7 +298,7 @@ int ocfs2_set_inode_size(handle_t *handle, i_size_write(inode, new_i_size); inode->i_blocks = ocfs2_inode_sector_count(inode); - inode->i_ctime = inode->i_mtime = current_time(inode); + inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); status = ocfs2_mark_inode_dirty(handle, inode, fe_bh); if (status < 0) { @@ -415,12 +419,12 @@ static int ocfs2_orphan_for_truncate(struct ocfs2_super *osb, } i_size_write(inode, new_i_size); - inode->i_ctime = inode->i_mtime = current_time(inode); + inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); di = (struct ocfs2_dinode *) fe_bh->b_data; di->i_size = cpu_to_le64(new_i_size); - di->i_ctime = di->i_mtime = cpu_to_le64(inode->i_ctime.tv_sec); - di->i_ctime_nsec = di->i_mtime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec); + di->i_ctime = di->i_mtime = cpu_to_le64(inode_get_ctime_sec(inode)); + di->i_ctime_nsec = di->i_mtime_nsec = cpu_to_le32(inode_get_ctime_nsec(inode)); ocfs2_update_inode_fsync_trans(handle, inode, 0); ocfs2_journal_dirty(handle, fe_bh); @@ -808,12 +812,7 @@ static int ocfs2_write_zero_page(struct inode *inode, u64 abs_from, /* must not update i_size! */ - ret = block_commit_write(page, block_start + 1, - block_start + 1); - if (ret < 0) - mlog_errno(ret); - else - ret = 0; + block_commit_write(page, block_start + 1, block_start + 1); } /* @@ -824,9 +823,9 @@ static int ocfs2_write_zero_page(struct inode *inode, u64 abs_from, i_size_write(inode, abs_to); inode->i_blocks = ocfs2_inode_sector_count(inode); di->i_size = cpu_to_le64((u64)i_size_read(inode)); - inode->i_mtime = inode->i_ctime = current_time(inode); - di->i_mtime = di->i_ctime = cpu_to_le64(inode->i_mtime.tv_sec); - di->i_ctime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec); + inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); + di->i_mtime = di->i_ctime = cpu_to_le64(inode_get_mtime_sec(inode)); + di->i_ctime_nsec = cpu_to_le32(inode_get_mtime_nsec(inode)); di->i_mtime_nsec = di->i_ctime_nsec; if (handle) { ocfs2_journal_dirty(handle, di_bh); @@ -1317,7 +1316,7 @@ int ocfs2_getattr(struct mnt_idmap *idmap, const struct path *path, goto bail; } - generic_fillattr(&nop_mnt_idmap, inode, stat); + generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat); /* * If there is inline data in the inode, the inode will normally not * have data blocks allocated (it may have an external xattr block). @@ -2043,7 +2042,7 @@ static int __ocfs2_change_file_space(struct file *file, struct inode *inode, goto out_inode_unlock; } - inode->i_ctime = inode->i_mtime = current_time(inode); + inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); ret = ocfs2_mark_inode_dirty(handle, inode, di_bh); if (ret < 0) mlog_errno(ret); diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c index bb116c39b581..999111bfc271 100644 --- a/fs/ocfs2/inode.c +++ b/fs/ocfs2/inode.c @@ -302,12 +302,12 @@ void ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe, inode->i_blocks = ocfs2_inode_sector_count(inode); inode->i_mapping->a_ops = &ocfs2_aops; } - inode->i_atime.tv_sec = le64_to_cpu(fe->i_atime); - inode->i_atime.tv_nsec = le32_to_cpu(fe->i_atime_nsec); - inode->i_mtime.tv_sec = le64_to_cpu(fe->i_mtime); - inode->i_mtime.tv_nsec = le32_to_cpu(fe->i_mtime_nsec); - inode->i_ctime.tv_sec = le64_to_cpu(fe->i_ctime); - inode->i_ctime.tv_nsec = le32_to_cpu(fe->i_ctime_nsec); + inode_set_atime(inode, le64_to_cpu(fe->i_atime), + le32_to_cpu(fe->i_atime_nsec)); + inode_set_mtime(inode, le64_to_cpu(fe->i_mtime), + le32_to_cpu(fe->i_mtime_nsec)); + inode_set_ctime(inode, le64_to_cpu(fe->i_ctime), + le32_to_cpu(fe->i_ctime_nsec)); if (OCFS2_I(inode)->ip_blkno != le64_to_cpu(fe->i_blkno)) mlog(ML_ERROR, @@ -1312,12 +1312,12 @@ int ocfs2_mark_inode_dirty(handle_t *handle, fe->i_uid = cpu_to_le32(i_uid_read(inode)); fe->i_gid = cpu_to_le32(i_gid_read(inode)); fe->i_mode = cpu_to_le16(inode->i_mode); - fe->i_atime = cpu_to_le64(inode->i_atime.tv_sec); - fe->i_atime_nsec = cpu_to_le32(inode->i_atime.tv_nsec); - fe->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec); - fe->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec); - fe->i_mtime = cpu_to_le64(inode->i_mtime.tv_sec); - fe->i_mtime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec); + fe->i_atime = cpu_to_le64(inode_get_atime_sec(inode)); + fe->i_atime_nsec = cpu_to_le32(inode_get_atime_nsec(inode)); + fe->i_ctime = cpu_to_le64(inode_get_ctime_sec(inode)); + fe->i_ctime_nsec = cpu_to_le32(inode_get_ctime_nsec(inode)); + fe->i_mtime = cpu_to_le64(inode_get_mtime_sec(inode)); + fe->i_mtime_nsec = cpu_to_le32(inode_get_mtime_nsec(inode)); ocfs2_journal_dirty(handle, bh); ocfs2_update_inode_fsync_trans(handle, inode, 1); @@ -1348,12 +1348,12 @@ void ocfs2_refresh_inode(struct inode *inode, inode->i_blocks = 0; else inode->i_blocks = ocfs2_inode_sector_count(inode); - inode->i_atime.tv_sec = le64_to_cpu(fe->i_atime); - inode->i_atime.tv_nsec = le32_to_cpu(fe->i_atime_nsec); - inode->i_mtime.tv_sec = le64_to_cpu(fe->i_mtime); - inode->i_mtime.tv_nsec = le32_to_cpu(fe->i_mtime_nsec); - inode->i_ctime.tv_sec = le64_to_cpu(fe->i_ctime); - inode->i_ctime.tv_nsec = le32_to_cpu(fe->i_ctime_nsec); + inode_set_atime(inode, le64_to_cpu(fe->i_atime), + le32_to_cpu(fe->i_atime_nsec)); + inode_set_mtime(inode, le64_to_cpu(fe->i_mtime), + le32_to_cpu(fe->i_mtime_nsec)); + inode_set_ctime(inode, le64_to_cpu(fe->i_ctime), + le32_to_cpu(fe->i_ctime_nsec)); spin_unlock(&OCFS2_I(inode)->ip_lock); } diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c index 25d8072ccfce..604fea3a26ff 100644 --- a/fs/ocfs2/journal.c +++ b/fs/ocfs2/journal.c @@ -90,7 +90,7 @@ enum ocfs2_replay_state { struct ocfs2_replay_map { unsigned int rm_slots; enum ocfs2_replay_state rm_state; - unsigned char rm_replay_slots[]; + unsigned char rm_replay_slots[] __counted_by(rm_slots); }; static void ocfs2_replay_map_set_state(struct ocfs2_super *osb, int state) @@ -114,9 +114,9 @@ int ocfs2_compute_replay_slots(struct ocfs2_super *osb) if (osb->replay_map) return 0; - replay_map = kzalloc(sizeof(struct ocfs2_replay_map) + - (osb->max_slots * sizeof(char)), GFP_KERNEL); - + replay_map = kzalloc(struct_size(replay_map, rm_replay_slots, + osb->max_slots), + GFP_KERNEL); if (!replay_map) { mlog_errno(-ENOMEM); return -ENOMEM; @@ -178,16 +178,13 @@ int ocfs2_recovery_init(struct ocfs2_super *osb) osb->recovery_thread_task = NULL; init_waitqueue_head(&osb->recovery_event); - rm = kzalloc(sizeof(struct ocfs2_recovery_map) + - osb->max_slots * sizeof(unsigned int), + rm = kzalloc(struct_size(rm, rm_entries, osb->max_slots), GFP_KERNEL); if (!rm) { mlog_errno(-ENOMEM); return -ENOMEM; } - rm->rm_entries = (unsigned int *)((char *)rm + - sizeof(struct ocfs2_recovery_map)); osb->recovery_map = rm; return 0; @@ -557,7 +554,7 @@ static void ocfs2_abort_trigger(struct jbd2_buffer_trigger_type *triggers, (unsigned long)bh, (unsigned long long)bh->b_blocknr); - ocfs2_error(bh->b_bdev->bd_super, + ocfs2_error(bh->b_assoc_map->host->i_sb, "JBD2 has aborted our journal, ocfs2 cannot continue\n"); } @@ -780,14 +777,14 @@ void ocfs2_journal_dirty(handle_t *handle, struct buffer_head *bh) mlog_errno(status); if (!is_handle_aborted(handle)) { journal_t *journal = handle->h_transaction->t_journal; - struct super_block *sb = bh->b_bdev->bd_super; mlog(ML_ERROR, "jbd2_journal_dirty_metadata failed. " "Aborting transaction and journal.\n"); handle->h_err = status; jbd2_journal_abort_handle(handle); jbd2_journal_abort(journal, status); - ocfs2_abort(sb, "Journal already aborted.\n"); + ocfs2_abort(bh->b_assoc_map->host->i_sb, + "Journal already aborted.\n"); } } } @@ -911,9 +908,9 @@ int ocfs2_journal_init(struct ocfs2_super *osb, int *dirty) /* call the kernels journal init function now */ j_journal = jbd2_journal_init_inode(inode); - if (j_journal == NULL) { + if (IS_ERR(j_journal)) { mlog(ML_ERROR, "Linux journal layer error\n"); - status = -EINVAL; + status = PTR_ERR(j_journal); goto done; } @@ -1687,9 +1684,9 @@ static int ocfs2_replay_journal(struct ocfs2_super *osb, } journal = jbd2_journal_init_inode(inode); - if (journal == NULL) { + if (IS_ERR(journal)) { mlog(ML_ERROR, "Linux journal layer error\n"); - status = -EIO; + status = PTR_ERR(journal); goto done; } diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h index 41c382f68529..41c9fe7e62f9 100644 --- a/fs/ocfs2/journal.h +++ b/fs/ocfs2/journal.h @@ -29,7 +29,7 @@ struct ocfs2_dinode; struct ocfs2_recovery_map { unsigned int rm_used; - unsigned int *rm_entries; + unsigned int rm_entries[]; }; diff --git a/fs/ocfs2/move_extents.c b/fs/ocfs2/move_extents.c index b1e32ec4a9d4..1f9ed117e78b 100644 --- a/fs/ocfs2/move_extents.c +++ b/fs/ocfs2/move_extents.c @@ -950,9 +950,9 @@ static int ocfs2_move_extents(struct ocfs2_move_extents_context *context) } di = (struct ocfs2_dinode *)di_bh->b_data; - inode->i_ctime = current_time(inode); - di->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec); - di->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec); + inode_set_ctime_current(inode); + di->i_ctime = cpu_to_le64(inode_get_ctime_sec(inode)); + di->i_ctime_nsec = cpu_to_le32(inode_get_ctime_nsec(inode)); ocfs2_update_inode_fsync_trans(handle, inode, 0); ocfs2_journal_dirty(handle, di_bh); diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c index 17c52225b87d..814733ba2f4b 100644 --- a/fs/ocfs2/namei.c +++ b/fs/ocfs2/namei.c @@ -793,10 +793,10 @@ static int ocfs2_link(struct dentry *old_dentry, } inc_nlink(inode); - inode->i_ctime = current_time(inode); + inode_set_ctime_current(inode); ocfs2_set_links_count(fe, inode->i_nlink); - fe->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec); - fe->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec); + fe->i_ctime = cpu_to_le64(inode_get_ctime_sec(inode)); + fe->i_ctime_nsec = cpu_to_le32(inode_get_ctime_nsec(inode)); ocfs2_journal_dirty(handle, fe_bh); err = ocfs2_add_entry(handle, dentry, inode, @@ -995,7 +995,7 @@ static int ocfs2_unlink(struct inode *dir, ocfs2_set_links_count(fe, inode->i_nlink); ocfs2_journal_dirty(handle, fe_bh); - dir->i_ctime = dir->i_mtime = current_time(dir); + inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir)); if (S_ISDIR(inode->i_mode)) drop_nlink(dir); @@ -1535,9 +1535,13 @@ static int ocfs2_rename(struct mnt_idmap *idmap, status = ocfs2_add_entry(handle, new_dentry, old_inode, OCFS2_I(old_inode)->ip_blkno, new_dir_bh, &target_insert); + if (status < 0) { + mlog_errno(status); + goto bail; + } } - old_inode->i_ctime = current_time(old_inode); + inode_set_ctime_current(old_inode); mark_inode_dirty(old_inode); status = ocfs2_journal_access_di(handle, INODE_CACHE(old_inode), @@ -1546,8 +1550,8 @@ static int ocfs2_rename(struct mnt_idmap *idmap, if (status >= 0) { old_di = (struct ocfs2_dinode *) old_inode_bh->b_data; - old_di->i_ctime = cpu_to_le64(old_inode->i_ctime.tv_sec); - old_di->i_ctime_nsec = cpu_to_le32(old_inode->i_ctime.tv_nsec); + old_di->i_ctime = cpu_to_le64(inode_get_ctime_sec(old_inode)); + old_di->i_ctime_nsec = cpu_to_le32(inode_get_ctime_nsec(old_inode)); ocfs2_journal_dirty(handle, old_inode_bh); } else mlog_errno(status); @@ -1586,13 +1590,17 @@ static int ocfs2_rename(struct mnt_idmap *idmap, if (new_inode) { drop_nlink(new_inode); - new_inode->i_ctime = current_time(new_inode); + inode_set_ctime_current(new_inode); } - old_dir->i_ctime = old_dir->i_mtime = current_time(old_dir); + inode_set_mtime_to_ts(old_dir, inode_set_ctime_current(old_dir)); if (update_dot_dot) { status = ocfs2_update_entry(old_inode, handle, &old_inode_dot_dot_res, new_dir); + if (status < 0) { + mlog_errno(status); + goto bail; + } drop_nlink(old_dir); if (new_inode) { drop_nlink(new_inode); @@ -1610,7 +1618,8 @@ static int ocfs2_rename(struct mnt_idmap *idmap, if (old_dir != new_dir) { /* Keep the same times on both directories.*/ - new_dir->i_ctime = new_dir->i_mtime = old_dir->i_ctime; + inode_set_mtime_to_ts(new_dir, + inode_set_ctime_to_ts(new_dir, inode_get_ctime(old_dir))); /* * This will also pick up the i_nlink change from the @@ -1631,6 +1640,10 @@ static int ocfs2_rename(struct mnt_idmap *idmap, INODE_CACHE(old_dir), old_dir_bh, OCFS2_JOURNAL_ACCESS_WRITE); + if (status < 0) { + mlog_errno(status); + goto bail; + } fe = (struct ocfs2_dinode *) old_dir_bh->b_data; ocfs2_set_links_count(fe, old_dir->i_nlink); ocfs2_journal_dirty(handle, old_dir_bh); diff --git a/fs/ocfs2/quota_local.c b/fs/ocfs2/quota_local.c index dfaae1e52412..e09842fc9d4d 100644 --- a/fs/ocfs2/quota_local.c +++ b/fs/ocfs2/quota_local.c @@ -1240,6 +1240,10 @@ int ocfs2_create_local_dquot(struct dquot *dquot) &od->dq_local_phys_blk, &pcount, NULL); + if (status < 0) { + mlog_errno(status); + goto out; + } /* Initialize dquot structure on disk */ status = ocfs2_local_write_dquot(dquot); diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c index 564ab48d03ef..3f80a56d0d60 100644 --- a/fs/ocfs2/refcounttree.c +++ b/fs/ocfs2/refcounttree.c @@ -3750,9 +3750,9 @@ static int ocfs2_change_ctime(struct inode *inode, goto out_commit; } - inode->i_ctime = current_time(inode); - di->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec); - di->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec); + inode_set_ctime_current(inode); + di->i_ctime = cpu_to_le64(inode_get_ctime_sec(inode)); + di->i_ctime_nsec = cpu_to_le32(inode_get_ctime_nsec(inode)); ocfs2_journal_dirty(handle, di_bh); @@ -4073,12 +4073,12 @@ static int ocfs2_complete_reflink(struct inode *s_inode, * we want mtime to appear identical to the source and * update ctime. */ - t_inode->i_ctime = current_time(t_inode); + inode_set_ctime_current(t_inode); - di->i_ctime = cpu_to_le64(t_inode->i_ctime.tv_sec); - di->i_ctime_nsec = cpu_to_le32(t_inode->i_ctime.tv_nsec); + di->i_ctime = cpu_to_le64(inode_get_ctime_sec(t_inode)); + di->i_ctime_nsec = cpu_to_le32(inode_get_ctime_nsec(t_inode)); - t_inode->i_mtime = s_inode->i_mtime; + inode_set_mtime_to_ts(t_inode, inode_get_mtime(s_inode)); di->i_mtime = s_di->i_mtime; di->i_mtime_nsec = s_di->i_mtime_nsec; } @@ -4456,7 +4456,7 @@ int ocfs2_reflink_update_dest(struct inode *dest, if (newlen > i_size_read(dest)) i_size_write(dest, newlen); spin_unlock(&OCFS2_I(dest)->ip_lock); - dest->i_ctime = dest->i_mtime = current_time(dest); + inode_set_mtime_to_ts(dest, inode_set_ctime_current(dest)); ret = ocfs2_mark_inode_dirty(handle, dest, d_bh); if (ret) { diff --git a/fs/ocfs2/slot_map.c b/fs/ocfs2/slot_map.c index da7718cef735..e544c704b583 100644 --- a/fs/ocfs2/slot_map.c +++ b/fs/ocfs2/slot_map.c @@ -37,7 +37,7 @@ struct ocfs2_slot_info { unsigned int si_blocks; struct buffer_head **si_bh; unsigned int si_num_slots; - struct ocfs2_slot si_slots[]; + struct ocfs2_slot si_slots[] __counted_by(si_num_slots); }; diff --git a/fs/ocfs2/stack_user.c b/fs/ocfs2/stack_user.c index 05d4414d0c33..9b76ee66aeb2 100644 --- a/fs/ocfs2/stack_user.c +++ b/fs/ocfs2/stack_user.c @@ -738,18 +738,11 @@ static int user_plock(struct ocfs2_cluster_connection *conn, * * Internally, fs/dlm will pass these to a misc device, which * a userspace daemon will read and write to. - * - * For now, cancel requests (which happen internally only), - * are turned into unlocks. Most of this function taken from - * gfs2_lock. */ - if (cmd == F_CANCELLK) { - cmd = F_SETLK; - fl->fl_type = F_UNLCK; - } - - if (IS_GETLK(cmd)) + if (cmd == F_CANCELLK) + return dlm_posix_cancel(conn->cc_lockspace, ino, file, fl); + else if (IS_GETLK(cmd)) return dlm_posix_get(conn->cc_lockspace, ino, file, fl); else if (fl->fl_type == F_UNLCK) return dlm_posix_unlock(conn->cc_lockspace, ino, file, fl); diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index 988d1c076861..6b906424902b 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -1517,8 +1517,7 @@ static int ocfs2_show_options(struct seq_file *s, struct dentry *root) seq_printf(s, ",localflocks,"); if (osb->osb_cluster_stack[0]) - seq_show_option_n(s, "cluster_stack", osb->osb_cluster_stack, - OCFS2_STACK_LABEL_LEN); + seq_show_option(s, "cluster_stack", osb->osb_cluster_stack); if (opts & OCFS2_MOUNT_USRQUOTA) seq_printf(s, ",usrquota"); if (opts & OCFS2_MOUNT_GRPQUOTA) diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c index 4ac77ff6e676..3b81213ed7b8 100644 --- a/fs/ocfs2/xattr.c +++ b/fs/ocfs2/xattr.c @@ -87,14 +87,14 @@ static struct ocfs2_xattr_def_value_root def_xv = { .xv.xr_list.l_count = cpu_to_le16(1), }; -const struct xattr_handler *ocfs2_xattr_handlers[] = { +const struct xattr_handler * const ocfs2_xattr_handlers[] = { &ocfs2_xattr_user_handler, &ocfs2_xattr_trusted_handler, &ocfs2_xattr_security_handler, NULL }; -static const struct xattr_handler *ocfs2_xattr_handler_map[OCFS2_XATTR_MAX] = { +static const struct xattr_handler * const ocfs2_xattr_handler_map[OCFS2_XATTR_MAX] = { [OCFS2_XATTR_INDEX_USER] = &ocfs2_xattr_user_handler, [OCFS2_XATTR_INDEX_POSIX_ACL_ACCESS] = &nop_posix_acl_access, [OCFS2_XATTR_INDEX_POSIX_ACL_DEFAULT] = &nop_posix_acl_default, @@ -3421,9 +3421,9 @@ static int __ocfs2_xattr_set_handle(struct inode *inode, goto out; } - inode->i_ctime = current_time(inode); - di->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec); - di->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec); + inode_set_ctime_current(inode); + di->i_ctime = cpu_to_le64(inode_get_ctime_sec(inode)); + di->i_ctime_nsec = cpu_to_le32(inode_get_ctime_nsec(inode)); ocfs2_journal_dirty(ctxt->handle, xis->inode_bh); } out: diff --git a/fs/ocfs2/xattr.h b/fs/ocfs2/xattr.h index 00308b57f64f..65e9aa743919 100644 --- a/fs/ocfs2/xattr.h +++ b/fs/ocfs2/xattr.h @@ -30,7 +30,7 @@ struct ocfs2_security_xattr_info { extern const struct xattr_handler ocfs2_xattr_user_handler; extern const struct xattr_handler ocfs2_xattr_trusted_handler; extern const struct xattr_handler ocfs2_xattr_security_handler; -extern const struct xattr_handler *ocfs2_xattr_handlers[]; +extern const struct xattr_handler * const ocfs2_xattr_handlers[]; ssize_t ocfs2_listxattr(struct dentry *, char *, size_t); int ocfs2_xattr_get_nolock(struct inode *, struct buffer_head *, int, diff --git a/fs/omfs/Kconfig b/fs/omfs/Kconfig index 42b2ec35a05b..8470f6c3e64e 100644 --- a/fs/omfs/Kconfig +++ b/fs/omfs/Kconfig @@ -2,6 +2,7 @@ config OMFS_FS tristate "SonicBlue Optimized MPEG File System support" depends on BLOCK + select BUFFER_HEAD select CRC_ITU_T help This is the proprietary file system used by the Rio Karma music diff --git a/fs/omfs/dir.c b/fs/omfs/dir.c index 82cf7e9a665f..6bda275826d6 100644 --- a/fs/omfs/dir.c +++ b/fs/omfs/dir.c @@ -143,7 +143,7 @@ static int omfs_add_link(struct dentry *dentry, struct inode *inode) mark_buffer_dirty(bh); brelse(bh); - dir->i_ctime = current_time(dir); + inode_set_ctime_current(dir); /* mark affected inodes dirty to rebuild checksums */ mark_inode_dirty(dir); @@ -399,7 +399,7 @@ static int omfs_rename(struct mnt_idmap *idmap, struct inode *old_dir, if (err) goto out; - old_inode->i_ctime = current_time(old_inode); + inode_set_ctime_current(old_inode); mark_inode_dirty(old_inode); out: return err; diff --git a/fs/omfs/file.c b/fs/omfs/file.c index de8f57ee39ec..6b580b9da8e3 100644 --- a/fs/omfs/file.c +++ b/fs/omfs/file.c @@ -14,7 +14,7 @@ static u32 omfs_max_extents(struct omfs_sb_info *sbi, int offset) { return (sbi->s_sys_blocksize - offset - sizeof(struct omfs_extent)) / - sizeof(struct omfs_extent_entry) + 1; + sizeof(struct omfs_extent_entry); } void omfs_make_empty_table(struct buffer_head *bh, int offset) @@ -24,8 +24,8 @@ void omfs_make_empty_table(struct buffer_head *bh, int offset) oe->e_next = ~cpu_to_be64(0ULL); oe->e_extent_count = cpu_to_be32(1), oe->e_fill = cpu_to_be32(0x22), - oe->e_entry.e_cluster = ~cpu_to_be64(0ULL); - oe->e_entry.e_blocks = ~cpu_to_be64(0ULL); + oe->e_entry[0].e_cluster = ~cpu_to_be64(0ULL); + oe->e_entry[0].e_blocks = ~cpu_to_be64(0ULL); } int omfs_shrink_inode(struct inode *inode) @@ -68,7 +68,7 @@ int omfs_shrink_inode(struct inode *inode) last = next; next = be64_to_cpu(oe->e_next); - entry = &oe->e_entry; + entry = oe->e_entry; /* ignore last entry as it is the terminator */ for (; extent_count > 1; extent_count--) { @@ -117,7 +117,7 @@ static int omfs_grow_extent(struct inode *inode, struct omfs_extent *oe, u64 *ret_block) { struct omfs_extent_entry *terminator; - struct omfs_extent_entry *entry = &oe->e_entry; + struct omfs_extent_entry *entry = oe->e_entry; struct omfs_sb_info *sbi = OMFS_SB(inode->i_sb); u32 extent_count = be32_to_cpu(oe->e_extent_count); u64 new_block = 0; @@ -245,7 +245,7 @@ static int omfs_get_block(struct inode *inode, sector_t block, extent_count = be32_to_cpu(oe->e_extent_count); next = be64_to_cpu(oe->e_next); - entry = &oe->e_entry; + entry = oe->e_entry; if (extent_count > max_extents) goto out_brelse; diff --git a/fs/omfs/inode.c b/fs/omfs/inode.c index c4c79e07efc7..d6cd81163030 100644 --- a/fs/omfs/inode.c +++ b/fs/omfs/inode.c @@ -51,7 +51,7 @@ struct inode *omfs_new_inode(struct inode *dir, umode_t mode) inode_init_owner(&nop_mnt_idmap, inode, NULL, mode); inode->i_mapping->a_ops = &omfs_aops; - inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode); + simple_inode_init_ts(inode); switch (mode & S_IFMT) { case S_IFDIR: inode->i_op = &omfs_dir_inops; @@ -134,8 +134,8 @@ static int __omfs_write_inode(struct inode *inode, int wait) oi->i_head.h_magic = OMFS_IMAGIC; oi->i_size = cpu_to_be64(inode->i_size); - ctime = inode->i_ctime.tv_sec * 1000LL + - ((inode->i_ctime.tv_nsec + 999)/1000); + ctime = inode_get_ctime_sec(inode) * 1000LL + + ((inode_get_ctime_nsec(inode) + 999)/1000); oi->i_ctime = cpu_to_be64(ctime); omfs_update_checksums(oi); @@ -230,12 +230,9 @@ struct inode *omfs_iget(struct super_block *sb, ino_t ino) ctime = be64_to_cpu(oi->i_ctime); nsecs = do_div(ctime, 1000) * 1000L; - inode->i_atime.tv_sec = ctime; - inode->i_mtime.tv_sec = ctime; - inode->i_ctime.tv_sec = ctime; - inode->i_atime.tv_nsec = nsecs; - inode->i_mtime.tv_nsec = nsecs; - inode->i_ctime.tv_nsec = nsecs; + inode_set_atime(inode, ctime, nsecs); + inode_set_mtime(inode, ctime, nsecs); + inode_set_ctime(inode, ctime, nsecs); inode->i_mapping->a_ops = &omfs_aops; diff --git a/fs/omfs/omfs_fs.h b/fs/omfs/omfs_fs.h index caecb3d5a344..1ff6b9e41297 100644 --- a/fs/omfs/omfs_fs.h +++ b/fs/omfs/omfs_fs.h @@ -77,7 +77,7 @@ struct omfs_extent { __be64 e_next; /* next extent table location */ __be32 e_extent_count; /* total # extents in this table */ __be32 e_fill; - struct omfs_extent_entry e_entry; /* start of extent entries */ + struct omfs_extent_entry e_entry[]; /* start of extent entries */ }; #endif diff --git a/fs/open.c b/fs/open.c index e6ead0f19964..3494a9cd8046 100644 --- a/fs/open.c +++ b/fs/open.c @@ -671,11 +671,20 @@ SYSCALL_DEFINE2(fchmod, unsigned int, fd, umode_t, mode) return err; } -static int do_fchmodat(int dfd, const char __user *filename, umode_t mode) +static int do_fchmodat(int dfd, const char __user *filename, umode_t mode, + unsigned int flags) { struct path path; int error; - unsigned int lookup_flags = LOOKUP_FOLLOW; + unsigned int lookup_flags; + + if (unlikely(flags & ~(AT_SYMLINK_NOFOLLOW | AT_EMPTY_PATH))) + return -EINVAL; + + lookup_flags = (flags & AT_SYMLINK_NOFOLLOW) ? 0 : LOOKUP_FOLLOW; + if (flags & AT_EMPTY_PATH) + lookup_flags |= LOOKUP_EMPTY; + retry: error = user_path_at(dfd, filename, lookup_flags, &path); if (!error) { @@ -689,15 +698,21 @@ retry: return error; } +SYSCALL_DEFINE4(fchmodat2, int, dfd, const char __user *, filename, + umode_t, mode, unsigned int, flags) +{ + return do_fchmodat(dfd, filename, mode, flags); +} + SYSCALL_DEFINE3(fchmodat, int, dfd, const char __user *, filename, umode_t, mode) { - return do_fchmodat(dfd, filename, mode); + return do_fchmodat(dfd, filename, mode, 0); } SYSCALL_DEFINE2(chmod, const char __user *, filename, umode_t, mode) { - return do_fchmodat(AT_FDCWD, filename, mode); + return do_fchmodat(AT_FDCWD, filename, mode, 0); } /* @@ -855,6 +870,30 @@ SYSCALL_DEFINE3(fchown, unsigned int, fd, uid_t, user, gid_t, group) return ksys_fchown(fd, user, group); } +static inline int file_get_write_access(struct file *f) +{ + int error; + + error = get_write_access(f->f_inode); + if (unlikely(error)) + return error; + error = mnt_get_write_access(f->f_path.mnt); + if (unlikely(error)) + goto cleanup_inode; + if (unlikely(f->f_mode & FMODE_BACKING)) { + error = mnt_get_write_access(backing_file_user_path(f)->mnt); + if (unlikely(error)) + goto cleanup_mnt; + } + return 0; + +cleanup_mnt: + mnt_put_write_access(f->f_path.mnt); +cleanup_inode: + put_write_access(f->f_inode); + return error; +} + static int do_dentry_open(struct file *f, struct inode *inode, int (*open)(struct inode *, struct file *)) @@ -877,14 +916,9 @@ static int do_dentry_open(struct file *f, if ((f->f_mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ) { i_readcount_inc(inode); } else if (f->f_mode & FMODE_WRITE && !special_file(inode->i_mode)) { - error = get_write_access(inode); + error = file_get_write_access(f); if (unlikely(error)) goto cleanup_file; - error = __mnt_want_write(f->f_path.mnt); - if (unlikely(error)) { - put_write_access(inode); - goto cleanup_file; - } f->f_mode |= FMODE_WRITER; } @@ -1054,8 +1088,6 @@ struct file *dentry_open(const struct path *path, int flags, int error; struct file *f; - validate_creds(cred); - /* We must always pass in a valid mount pointer. */ BUG_ON(!path->mnt); @@ -1094,7 +1126,6 @@ struct file *dentry_create(const struct path *path, int flags, umode_t mode, struct file *f; int error; - validate_creds(cred); f = alloc_empty_file(flags, cred); if (IS_ERR(f)) return f; @@ -1148,20 +1179,19 @@ EXPORT_SYMBOL_GPL(kernel_file_open); /** * backing_file_open - open a backing file for kernel internal use - * @path: path of the file to open + * @user_path: path that the user reuqested to open * @flags: open flags - * @path: path of the backing file + * @real_path: path of the backing file * @cred: credentials for open * * Open a backing file for a stackable filesystem (e.g., overlayfs). - * @path may be on the stackable filesystem and backing inode on the - * underlying filesystem. In this case, we want to be able to return - * the @real_path of the backing inode. This is done by embedding the - * returned file into a container structure that also stores the path of - * the backing inode on the underlying filesystem, which can be - * retrieved using backing_file_real_path(). + * @user_path may be on the stackable filesystem and @real_path on the + * underlying filesystem. In this case, we want to be able to return the + * @user_path of the stackable filesystem. This is done by embedding the + * returned file into a container structure that also stores the stacked + * file's path, which can be retrieved using backing_file_user_path(). */ -struct file *backing_file_open(const struct path *path, int flags, +struct file *backing_file_open(const struct path *user_path, int flags, const struct path *real_path, const struct cred *cred) { @@ -1172,9 +1202,9 @@ struct file *backing_file_open(const struct path *path, int flags, if (IS_ERR(f)) return f; - f->f_path = *path; - path_get(real_path); - *backing_file_real_path(f) = *real_path; + path_get(user_path); + *backing_file_user_path(f) = *user_path; + f->f_path = *real_path; error = do_dentry_open(f, d_inode(real_path->dentry), NULL); if (error) { fput(f); @@ -1503,7 +1533,7 @@ SYSCALL_DEFINE2(creat, const char __user *, pathname, umode_t, mode) * "id" is the POSIX thread ID. We use the * files pointer for this.. */ -int filp_close(struct file *filp, fl_owner_t id) +static int filp_flush(struct file *filp, fl_owner_t id) { int retval = 0; @@ -1520,10 +1550,18 @@ int filp_close(struct file *filp, fl_owner_t id) dnotify_flush(filp, id); locks_remove_posix(filp, id); } - fput(filp); return retval; } +int filp_close(struct file *filp, fl_owner_t id) +{ + int retval; + + retval = filp_flush(filp, id); + fput(filp); + + return retval; +} EXPORT_SYMBOL(filp_close); /* @@ -1533,7 +1571,20 @@ EXPORT_SYMBOL(filp_close); */ SYSCALL_DEFINE1(close, unsigned int, fd) { - int retval = close_fd(fd); + int retval; + struct file *file; + + file = close_fd_get_file(fd); + if (!file) + return -EBADF; + + retval = filp_flush(file, current->files); + + /* + * We're returning to user space. Don't bother + * with any delayed fput() cases. + */ + __fput_sync(file); /* can't restart close syscall because file table entry was cleared */ if (unlikely(retval == -ERESTARTSYS || @@ -1546,7 +1597,7 @@ SYSCALL_DEFINE1(close, unsigned int, fd) } /** - * close_range() - Close all file descriptors in a given range. + * sys_close_range() - Close all file descriptors in a given range. * * @fd: starting file descriptor to close * @max_fd: last file descriptor to close diff --git a/fs/openpromfs/inode.c b/fs/openpromfs/inode.c index f0b7f4d51a17..c4b65a6d41cc 100644 --- a/fs/openpromfs/inode.c +++ b/fs/openpromfs/inode.c @@ -237,7 +237,7 @@ found: if (IS_ERR(inode)) return ERR_CAST(inode); if (inode->i_state & I_NEW) { - inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode); + simple_inode_init_ts(inode); ent_oi = OP_I(inode); ent_oi->type = ent_type; ent_oi->u = ent_data; @@ -387,8 +387,7 @@ static int openprom_fill_super(struct super_block *s, struct fs_context *fc) goto out_no_root; } - root_inode->i_mtime = root_inode->i_atime = - root_inode->i_ctime = current_time(root_inode); + simple_inode_init_ts(root_inode); root_inode->i_op = &openprom_inode_operations; root_inode->i_fop = &openprom_operations; root_inode->i_mode = S_IFDIR | S_IRUGO | S_IXUGO; diff --git a/fs/orangefs/inode.c b/fs/orangefs/inode.c index 9014bbcc8031..085912268442 100644 --- a/fs/orangefs/inode.c +++ b/fs/orangefs/inode.c @@ -871,7 +871,7 @@ int orangefs_getattr(struct mnt_idmap *idmap, const struct path *path, ret = orangefs_inode_getattr(inode, request_mask & STATX_SIZE ? ORANGEFS_GETATTR_SIZE : 0); if (ret == 0) { - generic_fillattr(&nop_mnt_idmap, inode, stat); + generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat); /* override block size reported to stat */ if (!(request_mask & STATX_SIZE)) @@ -900,12 +900,13 @@ int orangefs_permission(struct mnt_idmap *idmap, return generic_permission(&nop_mnt_idmap, inode, mask); } -int orangefs_update_time(struct inode *inode, struct timespec64 *time, int flags) +int orangefs_update_time(struct inode *inode, int flags) { struct iattr iattr; + gossip_debug(GOSSIP_INODE_DEBUG, "orangefs_update_time: %pU\n", get_khandle_from_ino(inode)); - generic_update_time(inode, time, flags); + flags = generic_update_time(inode, flags); memset(&iattr, 0, sizeof iattr); if (flags & S_ATIME) iattr.ia_valid |= ATTR_ATIME; diff --git a/fs/orangefs/namei.c b/fs/orangefs/namei.c index 77518e248cf7..c9dfd5c6a097 100644 --- a/fs/orangefs/namei.c +++ b/fs/orangefs/namei.c @@ -421,7 +421,7 @@ static int orangefs_rename(struct mnt_idmap *idmap, ret); if (new_dentry->d_inode) - new_dentry->d_inode->i_ctime = current_time(new_dentry->d_inode); + inode_set_ctime_current(d_inode(new_dentry)); op_release(new_op); return ret; diff --git a/fs/orangefs/orangefs-kernel.h b/fs/orangefs/orangefs-kernel.h index ce20d3443869..926d9c0a428a 100644 --- a/fs/orangefs/orangefs-kernel.h +++ b/fs/orangefs/orangefs-kernel.h @@ -103,7 +103,7 @@ enum orangefs_vfs_op_states { #define ORANGEFS_CACHE_CREATE_FLAGS 0 #endif -extern const struct xattr_handler *orangefs_xattr_handlers[]; +extern const struct xattr_handler * const orangefs_xattr_handlers[]; extern struct posix_acl *orangefs_get_acl(struct inode *inode, int type, bool rcu); extern int orangefs_set_acl(struct mnt_idmap *idmap, @@ -370,7 +370,7 @@ int orangefs_getattr(struct mnt_idmap *idmap, const struct path *path, int orangefs_permission(struct mnt_idmap *idmap, struct inode *inode, int mask); -int orangefs_update_time(struct inode *, struct timespec64 *, int); +int orangefs_update_time(struct inode *, int); /* * defined in xattr.c diff --git a/fs/orangefs/orangefs-utils.c b/fs/orangefs/orangefs-utils.c index 46b7dcff18ac..0fdceb00ca07 100644 --- a/fs/orangefs/orangefs-utils.c +++ b/fs/orangefs/orangefs-utils.c @@ -155,14 +155,14 @@ static inline void copy_attributes_from_inode(struct inode *inode, if (orangefs_inode->attr_valid & ATTR_ATIME) { attrs->mask |= ORANGEFS_ATTR_SYS_ATIME; if (orangefs_inode->attr_valid & ATTR_ATIME_SET) { - attrs->atime = (time64_t)inode->i_atime.tv_sec; + attrs->atime = (time64_t) inode_get_atime_sec(inode); attrs->mask |= ORANGEFS_ATTR_SYS_ATIME_SET; } } if (orangefs_inode->attr_valid & ATTR_MTIME) { attrs->mask |= ORANGEFS_ATTR_SYS_MTIME; if (orangefs_inode->attr_valid & ATTR_MTIME_SET) { - attrs->mtime = (time64_t)inode->i_mtime.tv_sec; + attrs->mtime = (time64_t) inode_get_mtime_sec(inode); attrs->mask |= ORANGEFS_ATTR_SYS_MTIME_SET; } } @@ -357,15 +357,15 @@ again2: downcall.resp.getattr.attributes.owner); inode->i_gid = make_kgid(&init_user_ns, new_op-> downcall.resp.getattr.attributes.group); - inode->i_atime.tv_sec = (time64_t)new_op-> - downcall.resp.getattr.attributes.atime; - inode->i_mtime.tv_sec = (time64_t)new_op-> - downcall.resp.getattr.attributes.mtime; - inode->i_ctime.tv_sec = (time64_t)new_op-> - downcall.resp.getattr.attributes.ctime; - inode->i_atime.tv_nsec = 0; - inode->i_mtime.tv_nsec = 0; - inode->i_ctime.tv_nsec = 0; + inode_set_atime(inode, + (time64_t)new_op->downcall.resp.getattr.attributes.atime, + 0); + inode_set_mtime(inode, + (time64_t)new_op->downcall.resp.getattr.attributes.mtime, + 0); + inode_set_ctime(inode, + (time64_t)new_op->downcall.resp.getattr.attributes.ctime, + 0); /* special case: mark the root inode as sticky */ inode->i_mode = type | (is_root_handle(inode) ? S_ISVTX : 0) | diff --git a/fs/orangefs/xattr.c b/fs/orangefs/xattr.c index 68b62689a63e..74ef75586f38 100644 --- a/fs/orangefs/xattr.c +++ b/fs/orangefs/xattr.c @@ -554,7 +554,7 @@ static const struct xattr_handler orangefs_xattr_default_handler = { .set = orangefs_xattr_set_default, }; -const struct xattr_handler *orangefs_xattr_handlers[] = { +const struct xattr_handler * const orangefs_xattr_handlers[] = { &orangefs_xattr_default_handler, NULL }; diff --git a/fs/overlayfs/Kconfig b/fs/overlayfs/Kconfig index 6708e54b0e30..fec5020c3495 100644 --- a/fs/overlayfs/Kconfig +++ b/fs/overlayfs/Kconfig @@ -124,3 +124,12 @@ config OVERLAY_FS_METACOPY that doesn't support this feature will have unexpected results. If unsure, say N. + +config OVERLAY_FS_DEBUG + bool "Overlayfs: turn on extra debugging checks" + default n + depends on OVERLAY_FS + help + Say Y here to enable extra debugging checks in overlayfs. + + If unsure, say N. diff --git a/fs/overlayfs/Makefile b/fs/overlayfs/Makefile index 4e173d56b11f..5648954f8588 100644 --- a/fs/overlayfs/Makefile +++ b/fs/overlayfs/Makefile @@ -6,4 +6,4 @@ obj-$(CONFIG_OVERLAY_FS) += overlay.o overlay-objs := super.o namei.o util.o inode.o file.o dir.o readdir.o \ - copy_up.o export.o params.o + copy_up.o export.o params.o xattrs.o diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c index 568f743a5584..8bea66c97316 100644 --- a/fs/overlayfs/copy_up.c +++ b/fs/overlayfs/copy_up.c @@ -252,7 +252,9 @@ static int ovl_copy_up_file(struct ovl_fs *ofs, struct dentry *dentry, return PTR_ERR(old_file); /* Try to use clone_file_range to clone up within the same fs */ + ovl_start_write(dentry); cloned = do_clone_file_range(old_file, 0, new_file, 0, len, 0); + ovl_end_write(dentry); if (cloned == len) goto out_fput; /* Couldn't clone, so now we try to copy the data */ @@ -287,8 +289,12 @@ static int ovl_copy_up_file(struct ovl_fs *ofs, struct dentry *dentry, * it may not recognize all kind of holes and sometimes * only skips partial of hole area. However, it will be * enough for most of the use cases. + * + * We do not hold upper sb_writers throughout the loop to avert + * lockdep warning with llseek of lower file in nested overlay: + * - upper sb_writers + * -- lower ovl_inode_lock (ovl_llseek) */ - if (skip_hole && data_pos < old_pos) { data_pos = vfs_llseek(old_file, old_pos, SEEK_DATA); if (data_pos > old_pos) { @@ -303,9 +309,11 @@ static int ovl_copy_up_file(struct ovl_fs *ofs, struct dentry *dentry, } } + ovl_start_write(dentry); bytes = do_splice_direct(old_file, &old_pos, new_file, &new_pos, this_len, SPLICE_F_MOVE); + ovl_end_write(dentry); if (bytes <= 0) { error = bytes; break; @@ -337,7 +345,7 @@ static int ovl_set_timestamps(struct ovl_fs *ofs, struct dentry *upperdentry, { struct iattr attr = { .ia_valid = - ATTR_ATIME | ATTR_MTIME | ATTR_ATIME_SET | ATTR_MTIME_SET, + ATTR_ATIME | ATTR_MTIME | ATTR_ATIME_SET | ATTR_MTIME_SET | ATTR_CTIME, .ia_atime = stat->atime, .ia_mtime = stat->mtime, }; @@ -416,7 +424,7 @@ struct ovl_fh *ovl_encode_real_fh(struct ovl_fs *ofs, struct dentry *real, if (is_upper) fh->fb.flags |= OVL_FH_FLAG_PATH_UPPER; fh->fb.len = sizeof(fh->fb) + buflen; - if (ofs->config.uuid) + if (ovl_origin_uuid(ofs)) fh->fb.uuid = *uuid; return fh; @@ -426,29 +434,29 @@ out_err: return ERR_PTR(err); } -int ovl_set_origin(struct ovl_fs *ofs, struct dentry *lower, - struct dentry *upper) +struct ovl_fh *ovl_get_origin_fh(struct ovl_fs *ofs, struct dentry *origin) { - const struct ovl_fh *fh = NULL; - int err; - /* * When lower layer doesn't support export operations store a 'null' fh, * so we can use the overlay.origin xattr to distignuish between a copy * up and a pure upper inode. */ - if (ovl_can_decode_fh(lower->d_sb)) { - fh = ovl_encode_real_fh(ofs, lower, false); - if (IS_ERR(fh)) - return PTR_ERR(fh); - } + if (!ovl_can_decode_fh(origin->d_sb)) + return NULL; + + return ovl_encode_real_fh(ofs, origin, false); +} + +int ovl_set_origin_fh(struct ovl_fs *ofs, const struct ovl_fh *fh, + struct dentry *upper) +{ + int err; /* * Do not fail when upper doesn't support xattrs. */ err = ovl_check_setxattr(ofs, upper, OVL_XATTR_ORIGIN, fh->buf, fh ? fh->fb.len : 0, 0); - kfree(fh); /* Ignore -EPERM from setting "user.*" on symlink/special */ return err == -EPERM ? 0 : err; @@ -476,7 +484,7 @@ static int ovl_set_upper_fh(struct ovl_fs *ofs, struct dentry *upper, * * Caller must hold i_mutex on indexdir. */ -static int ovl_create_index(struct dentry *dentry, struct dentry *origin, +static int ovl_create_index(struct dentry *dentry, const struct ovl_fh *fh, struct dentry *upper) { struct ovl_fs *ofs = OVL_FS(dentry->d_sb); @@ -502,7 +510,7 @@ static int ovl_create_index(struct dentry *dentry, struct dentry *origin, if (WARN_ON(ovl_test_flag(OVL_INDEX, d_inode(dentry)))) return -EIO; - err = ovl_get_index_name(ofs, origin, &name); + err = ovl_get_index_name_fh(fh, &name); if (err) return err; @@ -541,9 +549,11 @@ struct ovl_copy_up_ctx { struct dentry *destdir; struct qstr destname; struct dentry *workdir; + const struct ovl_fh *origin_fh; bool origin; bool indexed; bool metacopy; + bool metacopy_digest; }; static int ovl_link_up(struct ovl_copy_up_ctx *c) @@ -554,14 +564,16 @@ static int ovl_link_up(struct ovl_copy_up_ctx *c) struct ovl_fs *ofs = OVL_FS(c->dentry->d_sb); struct inode *udir = d_inode(upperdir); + ovl_start_write(c->dentry); + /* Mark parent "impure" because it may now contain non-pure upper */ err = ovl_set_impure(c->parent, upperdir); if (err) - return err; + goto out; err = ovl_set_nlink_lower(c->dentry); if (err) - return err; + goto out; inode_lock_nested(udir, I_MUTEX_PARENT); upper = ovl_lookup_upper(ofs, c->dentry->d_name.name, upperdir, @@ -580,10 +592,12 @@ static int ovl_link_up(struct ovl_copy_up_ctx *c) } inode_unlock(udir); if (err) - return err; + goto out; err = ovl_set_nlink_upper(c->dentry); +out: + ovl_end_write(c->dentry); return err; } @@ -617,7 +631,8 @@ static int ovl_copy_up_metadata(struct ovl_copy_up_ctx *c, struct dentry *temp) if (err) return err; - if (inode->i_flags & OVL_COPY_I_FLAGS_MASK) { + if (inode->i_flags & OVL_COPY_I_FLAGS_MASK && + (S_ISREG(c->stat.mode) || S_ISDIR(c->stat.mode))) { /* * Copy the fileattr inode flags that are the source of already * copied i_flags @@ -635,14 +650,26 @@ static int ovl_copy_up_metadata(struct ovl_copy_up_ctx *c, struct dentry *temp) * hard link. */ if (c->origin) { - err = ovl_set_origin(ofs, c->lowerpath.dentry, temp); + err = ovl_set_origin_fh(ofs, c->origin_fh, temp); if (err) return err; } if (c->metacopy) { - err = ovl_check_setxattr(ofs, temp, OVL_XATTR_METACOPY, - NULL, 0, -EOPNOTSUPP); + struct path lowerdatapath; + struct ovl_metacopy metacopy_data = OVL_METACOPY_INIT; + + ovl_path_lowerdata(c->dentry, &lowerdatapath); + if (WARN_ON_ONCE(lowerdatapath.dentry == NULL)) + return -EIO; + err = ovl_get_verity_digest(ofs, &lowerdatapath, &metacopy_data); + if (err) + return err; + + if (metacopy_data.digest_algo) + c->metacopy_digest = true; + + err = ovl_set_metacopy_xattr(ofs, temp, &metacopy_data); if (err) return err; } @@ -705,21 +732,19 @@ static int ovl_copy_up_workdir(struct ovl_copy_up_ctx *c) .link = c->link }; - /* workdir and destdir could be the same when copying up to indexdir */ - err = -EIO; - if (lock_rename(c->workdir, c->destdir) != NULL) - goto unlock; - err = ovl_prep_cu_creds(c->dentry, &cc); if (err) - goto unlock; + return err; + ovl_start_write(c->dentry); + inode_lock(wdir); temp = ovl_create_temp(ofs, c->workdir, &cattr); + inode_unlock(wdir); + ovl_end_write(c->dentry); ovl_revert_cu_creds(&cc); - err = PTR_ERR(temp); if (IS_ERR(temp)) - goto unlock; + return PTR_ERR(temp); /* * Copy up data first and then xattrs. Writing data after @@ -727,15 +752,29 @@ static int ovl_copy_up_workdir(struct ovl_copy_up_ctx *c) */ path.dentry = temp; err = ovl_copy_up_data(c, &path); - if (err) + /* + * We cannot hold lock_rename() throughout this helper, because of + * lock ordering with sb_writers, which shouldn't be held when calling + * ovl_copy_up_data(), so lock workdir and destdir and make sure that + * temp wasn't moved before copy up completion or cleanup. + */ + ovl_start_write(c->dentry); + if (lock_rename(c->workdir, c->destdir) != NULL || + temp->d_parent != c->workdir) { + /* temp or workdir moved underneath us? abort without cleanup */ + dput(temp); + err = -EIO; + goto unlock; + } else if (err) { goto cleanup; + } err = ovl_copy_up_metadata(c, temp); if (err) goto cleanup; if (S_ISDIR(c->stat.mode) && c->indexed) { - err = ovl_create_index(c->dentry, c->lowerpath.dentry, temp); + err = ovl_create_index(c->dentry, c->origin_fh, temp); if (err) goto cleanup; } @@ -751,14 +790,21 @@ static int ovl_copy_up_workdir(struct ovl_copy_up_ctx *c) if (err) goto cleanup; - if (!c->metacopy) - ovl_set_upperdata(d_inode(c->dentry)); inode = d_inode(c->dentry); + if (c->metacopy_digest) + ovl_set_flag(OVL_HAS_DIGEST, inode); + else + ovl_clear_flag(OVL_HAS_DIGEST, inode); + ovl_clear_flag(OVL_VERIFIED_DIGEST, inode); + + if (!c->metacopy) + ovl_set_upperdata(inode); ovl_inode_update(inode, temp); if (S_ISDIR(inode->i_mode)) ovl_set_flag(OVL_WHITEOUTS, inode); unlock: unlock_rename(c->workdir, c->destdir); + ovl_end_write(c->dentry); return err; @@ -782,9 +828,10 @@ static int ovl_copy_up_tmpfile(struct ovl_copy_up_ctx *c) if (err) return err; + ovl_start_write(c->dentry); tmpfile = ovl_do_tmpfile(ofs, c->workdir, c->stat.mode); + ovl_end_write(c->dentry); ovl_revert_cu_creds(&cc); - if (IS_ERR(tmpfile)) return PTR_ERR(tmpfile); @@ -795,9 +842,11 @@ static int ovl_copy_up_tmpfile(struct ovl_copy_up_ctx *c) goto out_fput; } + ovl_start_write(c->dentry); + err = ovl_copy_up_metadata(c, temp); if (err) - goto out_fput; + goto out; inode_lock_nested(udir, I_MUTEX_PARENT); @@ -811,12 +860,20 @@ static int ovl_copy_up_tmpfile(struct ovl_copy_up_ctx *c) inode_unlock(udir); if (err) - goto out_fput; + goto out; + + if (c->metacopy_digest) + ovl_set_flag(OVL_HAS_DIGEST, d_inode(c->dentry)); + else + ovl_clear_flag(OVL_HAS_DIGEST, d_inode(c->dentry)); + ovl_clear_flag(OVL_VERIFIED_DIGEST, d_inode(c->dentry)); if (!c->metacopy) ovl_set_upperdata(d_inode(c->dentry)); ovl_inode_update(d_inode(c->dentry), dget(temp)); +out: + ovl_end_write(c->dentry); out_fput: fput(tmpfile); return err; @@ -835,6 +892,8 @@ static int ovl_do_copy_up(struct ovl_copy_up_ctx *c) { int err; struct ovl_fs *ofs = OVL_FS(c->dentry->d_sb); + struct dentry *origin = c->lowerpath.dentry; + struct ovl_fh *fh = NULL; bool to_index = false; /* @@ -851,25 +910,35 @@ static int ovl_do_copy_up(struct ovl_copy_up_ctx *c) to_index = true; } - if (S_ISDIR(c->stat.mode) || c->stat.nlink == 1 || to_index) + if (S_ISDIR(c->stat.mode) || c->stat.nlink == 1 || to_index) { + fh = ovl_get_origin_fh(ofs, origin); + if (IS_ERR(fh)) + return PTR_ERR(fh); + + /* origin_fh may be NULL */ + c->origin_fh = fh; c->origin = true; + } if (to_index) { c->destdir = ovl_indexdir(c->dentry->d_sb); - err = ovl_get_index_name(ofs, c->lowerpath.dentry, &c->destname); + err = ovl_get_index_name(ofs, origin, &c->destname); if (err) - return err; + goto out_free_fh; } else if (WARN_ON(!c->parent)) { /* Disconnected dentry must be copied up to index dir */ - return -EIO; + err = -EIO; + goto out_free_fh; } else { /* * Mark parent "impure" because it may now contain non-pure * upper */ + ovl_start_write(c->dentry); err = ovl_set_impure(c->parent, c->destdir); + ovl_end_write(c->dentry); if (err) - return err; + goto out_free_fh; } /* Should we copyup with O_TMPFILE or with workdir? */ @@ -883,6 +952,7 @@ static int ovl_do_copy_up(struct ovl_copy_up_ctx *c) if (c->indexed) ovl_set_flag(OVL_INDEX, d_inode(c->dentry)); + ovl_start_write(c->dentry); if (to_index) { /* Initialize nlink for copy up of disconnected dentry */ err = ovl_set_nlink_upper(c->dentry); @@ -897,17 +967,20 @@ static int ovl_do_copy_up(struct ovl_copy_up_ctx *c) ovl_dentry_set_upper_alias(c->dentry); ovl_dentry_update_reval(c->dentry, ovl_dentry_upper(c->dentry)); } + ovl_end_write(c->dentry); out: if (to_index) kfree(c->destname.name); +out_free_fh: + kfree(fh); return err; } static bool ovl_need_meta_copy_up(struct dentry *dentry, umode_t mode, int flags) { - struct ovl_fs *ofs = dentry->d_sb->s_fs_info; + struct ovl_fs *ofs = OVL_FS(dentry->d_sb); if (!ofs->config.metacopy) return false; @@ -918,6 +991,19 @@ static bool ovl_need_meta_copy_up(struct dentry *dentry, umode_t mode, if (flags && ((OPEN_FMODE(flags) & FMODE_WRITE) || (flags & O_TRUNC))) return false; + /* Fall back to full copy if no fsverity on source data and we require verity */ + if (ofs->config.verity_mode == OVL_VERITY_REQUIRE) { + struct path lowerdata; + + ovl_path_lowerdata(dentry, &lowerdata); + + if (WARN_ON_ONCE(lowerdata.dentry == NULL) || + ovl_ensure_verity_loaded(&lowerdata) || + !fsverity_active(d_inode(lowerdata.dentry))) { + return false; + } + } + return true; } @@ -972,18 +1058,21 @@ static int ovl_copy_up_meta_inode_data(struct ovl_copy_up_ctx *c) * Writing to upper file will clear security.capability xattr. We * don't want that to happen for normal copy-up operation. */ + ovl_start_write(c->dentry); if (capability) { err = ovl_do_setxattr(ofs, upperpath.dentry, XATTR_NAME_CAPS, capability, cap_size, 0); - if (err) - goto out_free; } - - - err = ovl_removexattr(ofs, upperpath.dentry, OVL_XATTR_METACOPY); + if (!err) { + err = ovl_removexattr(ofs, upperpath.dentry, + OVL_XATTR_METACOPY); + } + ovl_end_write(c->dentry); if (err) goto out_free; + ovl_clear_flag(OVL_HAS_DIGEST, d_inode(c->dentry)); + ovl_clear_flag(OVL_VERIFIED_DIGEST, d_inode(c->dentry)); ovl_set_upperdata(d_inode(c->dentry)); out_free: kfree(capability); @@ -1078,7 +1167,7 @@ static int ovl_copy_up_flags(struct dentry *dentry, int flags) * not very important to optimize this case, so do lazy lowerdata lookup * before any copy up, so we can do it before taking ovl_inode_lock(). */ - err = ovl_maybe_lookup_lowerdata(dentry); + err = ovl_verify_lowerdata(dentry); if (err) return err; @@ -1129,17 +1218,10 @@ static bool ovl_open_need_copy_up(struct dentry *dentry, int flags) int ovl_maybe_copy_up(struct dentry *dentry, int flags) { - int err = 0; - - if (ovl_open_need_copy_up(dentry, flags)) { - err = ovl_want_write(dentry); - if (!err) { - err = ovl_copy_up_flags(dentry, flags); - ovl_drop_write(dentry); - } - } + if (!ovl_open_need_copy_up(dentry, flags)) + return 0; - return err; + return ovl_copy_up_flags(dentry, flags); } int ovl_copy_up_with_data(struct dentry *dentry) diff --git a/fs/overlayfs/dir.c b/fs/overlayfs/dir.c index 033fc0458a3d..aab3f5d93556 100644 --- a/fs/overlayfs/dir.c +++ b/fs/overlayfs/dir.c @@ -477,7 +477,7 @@ static int ovl_create_over_whiteout(struct dentry *dentry, struct inode *inode, goto out_unlock; err = -ESTALE; - if (d_is_negative(upper) || !IS_WHITEOUT(d_inode(upper))) + if (d_is_negative(upper) || !ovl_upper_is_whiteout(ofs, upper)) goto out_dput; newdentry = ovl_create_temp(ofs, workdir, cattr); @@ -559,10 +559,6 @@ static int ovl_create_or_link(struct dentry *dentry, struct inode *inode, struct cred *override_cred; struct dentry *parent = dentry->d_parent; - err = ovl_copy_up(parent); - if (err) - return err; - old_cred = ovl_override_creds(dentry->d_sb); /* @@ -626,6 +622,10 @@ static int ovl_create_object(struct dentry *dentry, int mode, dev_t rdev, .link = link, }; + err = ovl_copy_up(dentry->d_parent); + if (err) + return err; + err = ovl_want_write(dentry); if (err) goto out; @@ -700,28 +700,24 @@ static int ovl_link(struct dentry *old, struct inode *newdir, int err; struct inode *inode; - err = ovl_want_write(old); + err = ovl_copy_up(old); if (err) goto out; - err = ovl_copy_up(old); + err = ovl_copy_up(new->d_parent); if (err) - goto out_drop_write; + goto out; - err = ovl_copy_up(new->d_parent); + err = ovl_nlink_start(old); if (err) - goto out_drop_write; + goto out; if (ovl_is_metacopy_dentry(old)) { err = ovl_set_link_redirect(old); if (err) - goto out_drop_write; + goto out_nlink_end; } - err = ovl_nlink_start(old); - if (err) - goto out_drop_write; - inode = d_inode(old); ihold(inode); @@ -731,9 +727,8 @@ static int ovl_link(struct dentry *old, struct inode *newdir, if (err) iput(inode); +out_nlink_end: ovl_nlink_end(old); -out_drop_write: - ovl_drop_write(old); out: return err; } @@ -891,17 +886,13 @@ static int ovl_do_remove(struct dentry *dentry, bool is_dir) goto out; } - err = ovl_want_write(dentry); - if (err) - goto out; - err = ovl_copy_up(dentry->d_parent); if (err) - goto out_drop_write; + goto out; err = ovl_nlink_start(dentry); if (err) - goto out_drop_write; + goto out; old_cred = ovl_override_creds(dentry->d_sb); if (!lower_positive) @@ -926,8 +917,6 @@ static int ovl_do_remove(struct dentry *dentry, bool is_dir) if (ovl_dentry_upper(dentry)) ovl_copyattr(d_inode(dentry)); -out_drop_write: - ovl_drop_write(dentry); out: ovl_cache_free(&list); return err; @@ -1131,29 +1120,32 @@ static int ovl_rename(struct mnt_idmap *idmap, struct inode *olddir, } } - err = ovl_want_write(old); - if (err) - goto out; - err = ovl_copy_up(old); if (err) - goto out_drop_write; + goto out; err = ovl_copy_up(new->d_parent); if (err) - goto out_drop_write; + goto out; if (!overwrite) { err = ovl_copy_up(new); if (err) - goto out_drop_write; + goto out; } else if (d_inode(new)) { err = ovl_nlink_start(new); if (err) - goto out_drop_write; + goto out; update_nlink = true; } + if (!update_nlink) { + /* ovl_nlink_start() took ovl_want_write() */ + err = ovl_want_write(old); + if (err) + goto out; + } + old_cred = ovl_override_creds(old->d_sb); if (!list_empty(&list)) { @@ -1219,7 +1211,7 @@ static int ovl_rename(struct mnt_idmap *idmap, struct inode *olddir, } } else { if (!d_is_negative(newdentry)) { - if (!new_opaque || !ovl_is_whiteout(newdentry)) + if (!new_opaque || !ovl_upper_is_whiteout(ofs, newdentry)) goto out_dput; } else { if (flags & RENAME_EXCHANGE) @@ -1286,8 +1278,8 @@ out_revert_creds: revert_creds(old_cred); if (update_nlink) ovl_nlink_end(new); -out_drop_write: - ovl_drop_write(old); + else + ovl_drop_write(old); out: dput(opaquedir); ovl_cache_free(&list); diff --git a/fs/overlayfs/export.c b/fs/overlayfs/export.c index 35680b6e175b..7e16bbcad95e 100644 --- a/fs/overlayfs/export.c +++ b/fs/overlayfs/export.c @@ -23,12 +23,7 @@ static int ovl_encode_maybe_copy_up(struct dentry *dentry) if (ovl_dentry_upper(dentry)) return 0; - err = ovl_want_write(dentry); - if (!err) { - err = ovl_copy_up(dentry); - ovl_drop_write(dentry); - } - + err = ovl_copy_up(dentry); if (err) { pr_warn_ratelimited("failed to copy up on encode (%pd2, err=%i)\n", dentry, err); @@ -174,28 +169,37 @@ static int ovl_connect_layer(struct dentry *dentry) * U = upper file handle * L = lower file handle * - * (*) Connecting an overlay dir from real lower dentry is not always + * (*) Decoding a connected overlay dir from real lower dentry is not always * possible when there are redirects in lower layers and non-indexed merge dirs. * To mitigate those case, we may copy up the lower dir ancestor before encode - * a lower dir file handle. + * of a decodable file handle for non-upper dir. * * Return 0 for upper file handle, > 0 for lower file handle or < 0 on error. */ static int ovl_check_encode_origin(struct dentry *dentry) { - struct ovl_fs *ofs = dentry->d_sb->s_fs_info; + struct ovl_fs *ofs = OVL_FS(dentry->d_sb); + bool decodable = ofs->config.nfs_export; + + /* Lower file handle for non-upper non-decodable */ + if (!ovl_dentry_upper(dentry) && !decodable) + return 1; /* Upper file handle for pure upper */ if (!ovl_dentry_lower(dentry)) return 0; /* - * Upper file handle for non-indexed upper. - * * Root is never indexed, so if there's an upper layer, encode upper for * root. */ - if (ovl_dentry_upper(dentry) && + if (dentry == dentry->d_sb->s_root) + return 0; + + /* + * Upper decodable file handle for non-indexed upper. + */ + if (ovl_dentry_upper(dentry) && decodable && !ovl_test_flag(OVL_INDEX, d_inode(dentry))) return 0; @@ -205,7 +209,7 @@ static int ovl_check_encode_origin(struct dentry *dentry) * ovl_connect_layer() will try to make origin's layer "connected" by * copying up a "connectable" ancestor. */ - if (d_is_dir(dentry) && ovl_upper_mnt(ofs)) + if (d_is_dir(dentry) && ovl_upper_mnt(ofs) && decodable) return ovl_connect_layer(dentry); /* Lower file handle for indexed and non-upper dir/non-dir */ @@ -435,7 +439,7 @@ static struct dentry *ovl_lookup_real_inode(struct super_block *sb, struct dentry *real, const struct ovl_layer *layer) { - struct ovl_fs *ofs = sb->s_fs_info; + struct ovl_fs *ofs = OVL_FS(sb); struct dentry *index = NULL; struct dentry *this = NULL; struct inode *inode; @@ -656,7 +660,7 @@ static struct dentry *ovl_get_dentry(struct super_block *sb, struct ovl_path *lowerpath, struct dentry *index) { - struct ovl_fs *ofs = sb->s_fs_info; + struct ovl_fs *ofs = OVL_FS(sb); const struct ovl_layer *layer = upper ? &ofs->layers[0] : lowerpath->layer; struct dentry *real = upper ?: (index ?: lowerpath->dentry); @@ -681,7 +685,7 @@ static struct dentry *ovl_get_dentry(struct super_block *sb, static struct dentry *ovl_upper_fh_to_d(struct super_block *sb, struct ovl_fh *fh) { - struct ovl_fs *ofs = sb->s_fs_info; + struct ovl_fs *ofs = OVL_FS(sb); struct dentry *dentry; struct dentry *upper; @@ -701,7 +705,7 @@ static struct dentry *ovl_upper_fh_to_d(struct super_block *sb, static struct dentry *ovl_lower_fh_to_d(struct super_block *sb, struct ovl_fh *fh) { - struct ovl_fs *ofs = sb->s_fs_info; + struct ovl_fs *ofs = OVL_FS(sb); struct ovl_path origin = { }; struct ovl_path *stack = &origin; struct dentry *dentry = NULL; @@ -876,3 +880,8 @@ const struct export_operations ovl_export_operations = { .get_name = ovl_get_name, .get_parent = ovl_get_parent, }; + +/* encode_fh() encodes non-decodable file handles with nfs_export=off */ +const struct export_operations ovl_export_fid_operations = { + .encode_fh = ovl_encode_fh, +}; diff --git a/fs/overlayfs/file.c b/fs/overlayfs/file.c index 21245b00722a..131621daeb13 100644 --- a/fs/overlayfs/file.c +++ b/fs/overlayfs/file.c @@ -15,11 +15,15 @@ #include <linux/fs.h> #include "overlayfs.h" +#include "../internal.h" /* for sb_init_dio_done_wq */ + struct ovl_aio_req { struct kiocb iocb; refcount_t ref; struct kiocb *orig_iocb; - struct fd fd; + /* used for aio completion */ + struct work_struct work; + long res; }; static struct kmem_cache *ovl_aio_request_cachep; @@ -115,8 +119,8 @@ static int ovl_real_fdget_meta(const struct file *file, struct fd *real, if (allow_meta) { ovl_path_real(dentry, &realpath); } else { - /* lazy lookup of lowerdata */ - err = ovl_maybe_lookup_lowerdata(dentry); + /* lazy lookup and verify of lowerdata */ + err = ovl_verify_lowerdata(dentry); if (err) return err; @@ -159,8 +163,8 @@ static int ovl_open(struct inode *inode, struct file *file) struct path realpath; int err; - /* lazy lookup of lowerdata */ - err = ovl_maybe_lookup_lowerdata(dentry); + /* lazy lookup and verify lowerdata */ + err = ovl_verify_lowerdata(dentry); if (err) return err; @@ -236,9 +240,17 @@ static loff_t ovl_llseek(struct file *file, loff_t offset, int whence) return ret; } +static void ovl_file_modified(struct file *file) +{ + /* Update size/mtime */ + ovl_copyattr(file_inode(file)); +} + static void ovl_file_accessed(struct file *file) { struct inode *inode, *upperinode; + struct timespec64 ctime, uctime; + struct timespec64 mtime, umtime; if (file->f_flags & O_NOATIME) return; @@ -249,35 +261,31 @@ static void ovl_file_accessed(struct file *file) if (!upperinode) return; - if ((!timespec64_equal(&inode->i_mtime, &upperinode->i_mtime) || - !timespec64_equal(&inode->i_ctime, &upperinode->i_ctime))) { - inode->i_mtime = upperinode->i_mtime; - inode->i_ctime = upperinode->i_ctime; + ctime = inode_get_ctime(inode); + uctime = inode_get_ctime(upperinode); + mtime = inode_get_mtime(inode); + umtime = inode_get_mtime(upperinode); + if ((!timespec64_equal(&mtime, &umtime)) || + !timespec64_equal(&ctime, &uctime)) { + inode_set_mtime_to_ts(inode, inode_get_mtime(upperinode)); + inode_set_ctime_to_ts(inode, uctime); } touch_atime(&file->f_path); } -static rwf_t ovl_iocb_to_rwf(int ifl) +#define OVL_IOCB_MASK \ + (IOCB_NOWAIT | IOCB_HIPRI | IOCB_DSYNC | IOCB_SYNC | IOCB_APPEND) + +static rwf_t iocb_to_rw_flags(int flags) { - rwf_t flags = 0; - - if (ifl & IOCB_NOWAIT) - flags |= RWF_NOWAIT; - if (ifl & IOCB_HIPRI) - flags |= RWF_HIPRI; - if (ifl & IOCB_DSYNC) - flags |= RWF_DSYNC; - if (ifl & IOCB_SYNC) - flags |= RWF_SYNC; - - return flags; + return (__force rwf_t)(flags & OVL_IOCB_MASK); } static inline void ovl_aio_put(struct ovl_aio_req *aio_req) { if (refcount_dec_and_test(&aio_req->ref)) { - fdput(aio_req->fd); + fput(aio_req->iocb.ki_filp); kmem_cache_free(ovl_aio_request_cachep, aio_req); } } @@ -288,13 +296,8 @@ static void ovl_aio_cleanup_handler(struct ovl_aio_req *aio_req) struct kiocb *orig_iocb = aio_req->orig_iocb; if (iocb->ki_flags & IOCB_WRITE) { - struct inode *inode = file_inode(orig_iocb->ki_filp); - - /* Actually acquired in ovl_write_iter() */ - __sb_writers_acquired(file_inode(iocb->ki_filp)->i_sb, - SB_FREEZE_WRITE); - file_end_write(iocb->ki_filp); - ovl_copyattr(inode); + kiocb_end_write(iocb); + ovl_file_modified(orig_iocb->ki_filp); } orig_iocb->ki_pos = iocb->ki_pos; @@ -311,6 +314,37 @@ static void ovl_aio_rw_complete(struct kiocb *iocb, long res) orig_iocb->ki_complete(orig_iocb, res); } +static void ovl_aio_complete_work(struct work_struct *work) +{ + struct ovl_aio_req *aio_req = container_of(work, + struct ovl_aio_req, work); + + ovl_aio_rw_complete(&aio_req->iocb, aio_req->res); +} + +static void ovl_aio_queue_completion(struct kiocb *iocb, long res) +{ + struct ovl_aio_req *aio_req = container_of(iocb, + struct ovl_aio_req, iocb); + struct kiocb *orig_iocb = aio_req->orig_iocb; + + /* + * Punt to a work queue to serialize updates of mtime/size. + */ + aio_req->res = res; + INIT_WORK(&aio_req->work, ovl_aio_complete_work); + queue_work(file_inode(orig_iocb->ki_filp)->i_sb->s_dio_done_wq, + &aio_req->work); +} + +static int ovl_init_aio_done_wq(struct super_block *sb) +{ + if (sb->s_dio_done_wq) + return 0; + + return sb_init_dio_done_wq(sb); +} + static ssize_t ovl_read_iter(struct kiocb *iocb, struct iov_iter *iter) { struct file *file = iocb->ki_filp; @@ -332,8 +366,9 @@ static ssize_t ovl_read_iter(struct kiocb *iocb, struct iov_iter *iter) old_cred = ovl_override_creds(file_inode(file)->i_sb); if (is_sync_kiocb(iocb)) { - ret = vfs_iter_read(real.file, iter, &iocb->ki_pos, - ovl_iocb_to_rwf(iocb->ki_flags)); + rwf_t rwf = iocb_to_rw_flags(iocb->ki_flags); + + ret = vfs_iter_read(real.file, iter, &iocb->ki_pos, rwf); } else { struct ovl_aio_req *aio_req; @@ -342,10 +377,8 @@ static ssize_t ovl_read_iter(struct kiocb *iocb, struct iov_iter *iter) if (!aio_req) goto out; - aio_req->fd = real; - real.flags = 0; aio_req->orig_iocb = iocb; - kiocb_clone(&aio_req->iocb, iocb, real.file); + kiocb_clone(&aio_req->iocb, iocb, get_file(real.file)); aio_req->iocb.ki_complete = ovl_aio_rw_complete; refcount_set(&aio_req->ref, 2); ret = vfs_iocb_iter_read(real.file, &aio_req->iocb, iter); @@ -393,33 +426,39 @@ static ssize_t ovl_write_iter(struct kiocb *iocb, struct iov_iter *iter) if (!ovl_should_sync(OVL_FS(inode->i_sb))) ifl &= ~(IOCB_DSYNC | IOCB_SYNC); + /* + * Overlayfs doesn't support deferred completions, don't copy + * this property in case it is set by the issuer. + */ + ifl &= ~IOCB_DIO_CALLER_COMP; + old_cred = ovl_override_creds(file_inode(file)->i_sb); if (is_sync_kiocb(iocb)) { + rwf_t rwf = iocb_to_rw_flags(ifl); + file_start_write(real.file); - ret = vfs_iter_write(real.file, iter, &iocb->ki_pos, - ovl_iocb_to_rwf(ifl)); + ret = vfs_iter_write(real.file, iter, &iocb->ki_pos, rwf); file_end_write(real.file); /* Update size */ - ovl_copyattr(inode); + ovl_file_modified(file); } else { struct ovl_aio_req *aio_req; + ret = ovl_init_aio_done_wq(inode->i_sb); + if (ret) + goto out; + ret = -ENOMEM; aio_req = kmem_cache_zalloc(ovl_aio_request_cachep, GFP_KERNEL); if (!aio_req) goto out; - file_start_write(real.file); - /* Pacify lockdep, same trick as done in aio_write() */ - __sb_writers_release(file_inode(real.file)->i_sb, - SB_FREEZE_WRITE); - aio_req->fd = real; - real.flags = 0; aio_req->orig_iocb = iocb; - kiocb_clone(&aio_req->iocb, iocb, real.file); + kiocb_clone(&aio_req->iocb, iocb, get_file(real.file)); aio_req->iocb.ki_flags = ifl; - aio_req->iocb.ki_complete = ovl_aio_rw_complete; + aio_req->iocb.ki_complete = ovl_aio_queue_completion; refcount_set(&aio_req->ref, 2); + kiocb_start_write(&aio_req->iocb); ret = vfs_iocb_iter_write(real.file, &aio_req->iocb, iter); ovl_aio_put(aio_req); if (ret != -EIOCBQUEUED) @@ -491,7 +530,7 @@ static ssize_t ovl_splice_write(struct pipe_inode_info *pipe, struct file *out, file_end_write(real.file); /* Update size */ - ovl_copyattr(inode); + ovl_file_modified(out); revert_creds(old_cred); fdput(real); @@ -572,7 +611,7 @@ static long ovl_fallocate(struct file *file, int mode, loff_t offset, loff_t len revert_creds(old_cred); /* Update size */ - ovl_copyattr(inode); + ovl_file_modified(file); fdput(real); @@ -656,7 +695,7 @@ static loff_t ovl_copyfile(struct file *file_in, loff_t pos_in, revert_creds(old_cred); /* Update size */ - ovl_copyattr(inode_out); + ovl_file_modified(file_out); fdput(real_in); fdput(real_out); diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c index a63e57447be9..c63b31a460be 100644 --- a/fs/overlayfs/inode.c +++ b/fs/overlayfs/inode.c @@ -32,10 +32,6 @@ int ovl_setattr(struct mnt_idmap *idmap, struct dentry *dentry, if (err) return err; - err = ovl_want_write(dentry); - if (err) - goto out; - if (attr->ia_valid & ATTR_SIZE) { /* Truncate should trigger data copy up as well */ full_copy_up = true; @@ -54,7 +50,7 @@ int ovl_setattr(struct mnt_idmap *idmap, struct dentry *dentry, winode = d_inode(upperdentry); err = get_write_access(winode); if (err) - goto out_drop_write; + goto out; } if (attr->ia_valid & (ATTR_KILL_SUID|ATTR_KILL_SGID)) @@ -78,6 +74,10 @@ int ovl_setattr(struct mnt_idmap *idmap, struct dentry *dentry, */ attr->ia_valid &= ~ATTR_OPEN; + err = ovl_want_write(dentry); + if (err) + goto out_put_write; + inode_lock(upperdentry->d_inode); old_cred = ovl_override_creds(dentry->d_sb); err = ovl_do_notify_change(ofs, upperdentry, attr); @@ -85,12 +85,12 @@ int ovl_setattr(struct mnt_idmap *idmap, struct dentry *dentry, if (!err) ovl_copyattr(dentry->d_inode); inode_unlock(upperdentry->d_inode); + ovl_drop_write(dentry); +out_put_write: if (winode) put_write_access(winode); } -out_drop_write: - ovl_drop_write(dentry); out: return err; } @@ -171,7 +171,7 @@ int ovl_getattr(struct mnt_idmap *idmap, const struct path *path, type = ovl_path_real(dentry, &realpath); old_cred = ovl_override_creds(dentry->d_sb); - err = vfs_getattr(&realpath, stat, request_mask, flags); + err = ovl_do_getattr(&realpath, stat, request_mask, flags); if (err) goto out; @@ -196,8 +196,8 @@ int ovl_getattr(struct mnt_idmap *idmap, const struct path *path, (!is_dir ? STATX_NLINK : 0); ovl_path_lower(dentry, &realpath); - err = vfs_getattr(&realpath, &lowerstat, - lowermask, flags); + err = ovl_do_getattr(&realpath, &lowerstat, lowermask, + flags); if (err) goto out; @@ -249,8 +249,8 @@ int ovl_getattr(struct mnt_idmap *idmap, const struct path *path, ovl_path_lowerdata(dentry, &realpath); if (realpath.dentry) { - err = vfs_getattr(&realpath, &lowerdatastat, - lowermask, flags); + err = ovl_do_getattr(&realpath, &lowerdatastat, + lowermask, flags); if (err) goto out; } else { @@ -339,130 +339,6 @@ static const char *ovl_get_link(struct dentry *dentry, return p; } -bool ovl_is_private_xattr(struct super_block *sb, const char *name) -{ - struct ovl_fs *ofs = sb->s_fs_info; - - if (ofs->config.userxattr) - return strncmp(name, OVL_XATTR_USER_PREFIX, - sizeof(OVL_XATTR_USER_PREFIX) - 1) == 0; - else - return strncmp(name, OVL_XATTR_TRUSTED_PREFIX, - sizeof(OVL_XATTR_TRUSTED_PREFIX) - 1) == 0; -} - -int ovl_xattr_set(struct dentry *dentry, struct inode *inode, const char *name, - const void *value, size_t size, int flags) -{ - int err; - struct ovl_fs *ofs = OVL_FS(dentry->d_sb); - struct dentry *upperdentry = ovl_i_dentry_upper(inode); - struct dentry *realdentry = upperdentry ?: ovl_dentry_lower(dentry); - struct path realpath; - const struct cred *old_cred; - - err = ovl_want_write(dentry); - if (err) - goto out; - - if (!value && !upperdentry) { - ovl_path_lower(dentry, &realpath); - old_cred = ovl_override_creds(dentry->d_sb); - err = vfs_getxattr(mnt_idmap(realpath.mnt), realdentry, name, NULL, 0); - revert_creds(old_cred); - if (err < 0) - goto out_drop_write; - } - - if (!upperdentry) { - err = ovl_copy_up(dentry); - if (err) - goto out_drop_write; - - realdentry = ovl_dentry_upper(dentry); - } - - old_cred = ovl_override_creds(dentry->d_sb); - if (value) { - err = ovl_do_setxattr(ofs, realdentry, name, value, size, - flags); - } else { - WARN_ON(flags != XATTR_REPLACE); - err = ovl_do_removexattr(ofs, realdentry, name); - } - revert_creds(old_cred); - - /* copy c/mtime */ - ovl_copyattr(inode); - -out_drop_write: - ovl_drop_write(dentry); -out: - return err; -} - -int ovl_xattr_get(struct dentry *dentry, struct inode *inode, const char *name, - void *value, size_t size) -{ - ssize_t res; - const struct cred *old_cred; - struct path realpath; - - ovl_i_path_real(inode, &realpath); - old_cred = ovl_override_creds(dentry->d_sb); - res = vfs_getxattr(mnt_idmap(realpath.mnt), realpath.dentry, name, value, size); - revert_creds(old_cred); - return res; -} - -static bool ovl_can_list(struct super_block *sb, const char *s) -{ - /* Never list private (.overlay) */ - if (ovl_is_private_xattr(sb, s)) - return false; - - /* List all non-trusted xattrs */ - if (strncmp(s, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) != 0) - return true; - - /* list other trusted for superuser only */ - return ns_capable_noaudit(&init_user_ns, CAP_SYS_ADMIN); -} - -ssize_t ovl_listxattr(struct dentry *dentry, char *list, size_t size) -{ - struct dentry *realdentry = ovl_dentry_real(dentry); - ssize_t res; - size_t len; - char *s; - const struct cred *old_cred; - - old_cred = ovl_override_creds(dentry->d_sb); - res = vfs_listxattr(realdentry, list, size); - revert_creds(old_cred); - if (res <= 0 || size == 0) - return res; - - /* filter out private xattrs */ - for (s = list, len = res; len;) { - size_t slen = strnlen(s, len) + 1; - - /* underlying fs providing us with an broken xattr list? */ - if (WARN_ON(slen > len)) - return -EIO; - - len -= slen; - if (!ovl_can_list(dentry->d_sb, s)) { - res -= slen; - memmove(s, s + slen, len); - } else { - s += slen; - } - } - - return res; -} - #ifdef CONFIG_FS_POSIX_ACL /* * Apply the idmapping of the layer to POSIX ACLs. The caller must pass a clone @@ -611,10 +487,6 @@ static int ovl_set_or_remove_acl(struct dentry *dentry, struct inode *inode, struct dentry *upperdentry = ovl_dentry_upper(dentry); struct dentry *realdentry = upperdentry ?: ovl_dentry_lower(dentry); - err = ovl_want_write(dentry); - if (err) - return err; - /* * If ACL is to be removed from a lower file, check if it exists in * the first place before copying it up. @@ -630,7 +502,7 @@ static int ovl_set_or_remove_acl(struct dentry *dentry, struct inode *inode, revert_creds(old_cred); if (IS_ERR(real_acl)) { err = PTR_ERR(real_acl); - goto out_drop_write; + goto out; } posix_acl_release(real_acl); } @@ -638,23 +510,26 @@ static int ovl_set_or_remove_acl(struct dentry *dentry, struct inode *inode, if (!upperdentry) { err = ovl_copy_up(dentry); if (err) - goto out_drop_write; + goto out; realdentry = ovl_dentry_upper(dentry); } + err = ovl_want_write(dentry); + if (err) + goto out; + old_cred = ovl_override_creds(dentry->d_sb); if (acl) err = ovl_do_set_acl(ofs, realdentry, acl_name, acl); else err = ovl_do_remove_acl(ofs, realdentry, acl_name); revert_creds(old_cred); + ovl_drop_write(dentry); /* copy c/mtime */ ovl_copyattr(inode); - -out_drop_write: - ovl_drop_write(dentry); +out: return err; } @@ -693,10 +568,10 @@ int ovl_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, } #endif -int ovl_update_time(struct inode *inode, struct timespec64 *ts, int flags) +int ovl_update_time(struct inode *inode, int flags) { if (flags & S_ATIME) { - struct ovl_fs *ofs = inode->i_sb->s_fs_info; + struct ovl_fs *ofs = OVL_FS(inode->i_sb); struct path upperpath = { .mnt = ovl_upper_mnt(ofs), .dentry = ovl_upperdentry_dereference(OVL_I(inode)), @@ -704,7 +579,8 @@ int ovl_update_time(struct inode *inode, struct timespec64 *ts, int flags) if (upperpath.dentry) { touch_atime(&upperpath); - inode->i_atime = d_inode(upperpath.dentry)->i_atime; + inode_set_atime_to_ts(inode, + inode_get_atime(d_inode(upperpath.dentry))); } } return 0; @@ -777,14 +653,14 @@ int ovl_fileattr_set(struct mnt_idmap *idmap, unsigned int flags; int err; - err = ovl_want_write(dentry); - if (err) - goto out; - err = ovl_copy_up(dentry); if (!err) { ovl_path_real(dentry, &upperpath); + err = ovl_want_write(dentry); + if (err) + goto out; + old_cred = ovl_override_creds(inode->i_sb); /* * Store immutable/append-only flags in xattr and clear them @@ -797,6 +673,7 @@ int ovl_fileattr_set(struct mnt_idmap *idmap, if (!err) err = ovl_real_fileattr_set(&upperpath, fa); revert_creds(old_cred); + ovl_drop_write(dentry); /* * Merge real inode flags with inode flags read from @@ -811,7 +688,6 @@ int ovl_fileattr_set(struct mnt_idmap *idmap, /* Update ctime */ ovl_copyattr(inode); } - ovl_drop_write(dentry); out: return err; } @@ -1291,7 +1167,7 @@ struct inode *ovl_get_trap_inode(struct super_block *sb, struct dentry *dir) static bool ovl_hash_bylower(struct super_block *sb, struct dentry *upper, struct dentry *lower, bool index) { - struct ovl_fs *ofs = sb->s_fs_info; + struct ovl_fs *ofs = OVL_FS(sb); /* No, if pure upper */ if (!lower) @@ -1311,7 +1187,7 @@ static bool ovl_hash_bylower(struct super_block *sb, struct dentry *upper, return false; /* No, if non-indexed upper with NFS export */ - if (sb->s_export_op && upper) + if (ofs->config.nfs_export && upper) return false; /* Otherwise, hash by lower inode for fsnotify */ diff --git a/fs/overlayfs/namei.c b/fs/overlayfs/namei.c index 57adf911735f..03bc8d5dfa31 100644 --- a/fs/overlayfs/namei.c +++ b/fs/overlayfs/namei.c @@ -25,7 +25,7 @@ struct ovl_lookup_data { bool stop; bool last; char *redirect; - bool metacopy; + int metacopy; /* Referring to last redirect xattr */ bool absolute_redirect; }; @@ -171,8 +171,9 @@ struct dentry *ovl_decode_real_fh(struct ovl_fs *ofs, struct ovl_fh *fh, * layer where file handle will be decoded. * In case of uuid=off option just make sure that stored uuid is null. */ - if (ofs->config.uuid ? !uuid_equal(&fh->fb.uuid, &mnt->mnt_sb->s_uuid) : - !uuid_is_null(&fh->fb.uuid)) + if (ovl_origin_uuid(ofs) ? + !uuid_equal(&fh->fb.uuid, &mnt->mnt_sb->s_uuid) : + !uuid_is_null(&fh->fb.uuid)) return NULL; bytes = (fh->fb.len - offsetof(struct ovl_fb, fid)); @@ -250,7 +251,10 @@ static int ovl_lookup_single(struct dentry *base, struct ovl_lookup_data *d, err = -EREMOTE; goto out_err; } - if (ovl_is_whiteout(this)) { + + path.dentry = this; + path.mnt = d->mnt; + if (ovl_path_is_whiteout(OVL_FS(d->sb), &path)) { d->stop = d->opaque = true; goto put_and_out; } @@ -263,14 +267,12 @@ static int ovl_lookup_single(struct dentry *base, struct ovl_lookup_data *d, goto put_and_out; } - path.dentry = this; - path.mnt = d->mnt; if (!d_can_lookup(this)) { if (d->is_dir || !last_element) { d->stop = true; goto put_and_out; } - err = ovl_check_metacopy_xattr(OVL_FS(d->sb), &path); + err = ovl_check_metacopy_xattr(OVL_FS(d->sb), &path, NULL); if (err < 0) goto out_err; @@ -437,7 +439,7 @@ int ovl_check_origin_fh(struct ovl_fs *ofs, struct ovl_fh *fh, bool connected, else if (IS_ERR(origin)) return PTR_ERR(origin); - if (upperdentry && !ovl_is_whiteout(upperdentry) && + if (upperdentry && !ovl_upper_is_whiteout(ofs, upperdentry) && inode_wrong_type(d_inode(upperdentry), d_inode(origin)->i_mode)) goto invalid; @@ -506,6 +508,19 @@ static int ovl_verify_fh(struct ovl_fs *ofs, struct dentry *dentry, return err; } +int ovl_verify_set_fh(struct ovl_fs *ofs, struct dentry *dentry, + enum ovl_xattr ox, const struct ovl_fh *fh, + bool is_upper, bool set) +{ + int err; + + err = ovl_verify_fh(ofs, dentry, ox, fh); + if (set && err == -ENODATA) + err = ovl_setxattr(ofs, dentry, ox, fh->buf, fh->fb.len); + + return err; +} + /* * Verify that @real dentry matches the file handle stored in xattr @name. * @@ -514,9 +529,9 @@ static int ovl_verify_fh(struct ovl_fs *ofs, struct dentry *dentry, * * Return 0 on match, -ESTALE on mismatch, -ENODATA on no xattr, < 0 on error. */ -int ovl_verify_set_fh(struct ovl_fs *ofs, struct dentry *dentry, - enum ovl_xattr ox, struct dentry *real, bool is_upper, - bool set) +int ovl_verify_origin_xattr(struct ovl_fs *ofs, struct dentry *dentry, + enum ovl_xattr ox, struct dentry *real, + bool is_upper, bool set) { struct inode *inode; struct ovl_fh *fh; @@ -529,9 +544,7 @@ int ovl_verify_set_fh(struct ovl_fs *ofs, struct dentry *dentry, goto fail; } - err = ovl_verify_fh(ofs, dentry, ox, fh); - if (set && err == -ENODATA) - err = ovl_setxattr(ofs, dentry, ox, fh->buf, fh->fb.len); + err = ovl_verify_set_fh(ofs, dentry, ox, fh, is_upper, set); if (err) goto fail; @@ -547,6 +560,7 @@ fail: goto out; } + /* Get upper dentry from index */ struct dentry *ovl_index_upper(struct ovl_fs *ofs, struct dentry *index, bool connected) @@ -683,7 +697,7 @@ orphan: goto out; } -static int ovl_get_index_name_fh(struct ovl_fh *fh, struct qstr *name) +int ovl_get_index_name_fh(const struct ovl_fh *fh, struct qstr *name) { char *n, *s; @@ -872,25 +886,82 @@ int ovl_path_next(int idx, struct dentry *dentry, struct path *path) static int ovl_fix_origin(struct ovl_fs *ofs, struct dentry *dentry, struct dentry *lower, struct dentry *upper) { + const struct ovl_fh *fh; int err; if (ovl_check_origin_xattr(ofs, upper)) return 0; + fh = ovl_get_origin_fh(ofs, lower); + if (IS_ERR(fh)) + return PTR_ERR(fh); + err = ovl_want_write(dentry); if (err) - return err; + goto out; - err = ovl_set_origin(ofs, lower, upper); + err = ovl_set_origin_fh(ofs, fh, upper); if (!err) err = ovl_set_impure(dentry->d_parent, upper->d_parent); ovl_drop_write(dentry); +out: + kfree(fh); + return err; +} + +static int ovl_maybe_validate_verity(struct dentry *dentry) +{ + struct ovl_fs *ofs = OVL_FS(dentry->d_sb); + struct inode *inode = d_inode(dentry); + struct path datapath, metapath; + int err; + + if (!ofs->config.verity_mode || + !ovl_is_metacopy_dentry(dentry) || + ovl_test_flag(OVL_VERIFIED_DIGEST, inode)) + return 0; + + if (!ovl_test_flag(OVL_HAS_DIGEST, inode)) { + if (ofs->config.verity_mode == OVL_VERITY_REQUIRE) { + pr_warn_ratelimited("metacopy file '%pd' has no digest specified\n", + dentry); + return -EIO; + } + return 0; + } + + ovl_path_lowerdata(dentry, &datapath); + if (!datapath.dentry) + return -EIO; + + ovl_path_real(dentry, &metapath); + if (!metapath.dentry) + return -EIO; + + err = ovl_inode_lock_interruptible(inode); + if (err) + return err; + + if (!ovl_test_flag(OVL_VERIFIED_DIGEST, inode)) { + const struct cred *old_cred; + + old_cred = ovl_override_creds(dentry->d_sb); + + err = ovl_validate_verity(ofs, &metapath, &datapath); + if (err == 0) + ovl_set_flag(OVL_VERIFIED_DIGEST, inode); + + revert_creds(old_cred); + } + + ovl_inode_unlock(inode); + return err; } /* Lazy lookup of lowerdata */ -int ovl_maybe_lookup_lowerdata(struct dentry *dentry) +static int ovl_maybe_lookup_lowerdata(struct dentry *dentry) { struct inode *inode = d_inode(dentry); const char *redirect = ovl_lowerdata_redirect(inode); @@ -935,12 +1006,23 @@ out_err: goto out; } +int ovl_verify_lowerdata(struct dentry *dentry) +{ + int err; + + err = ovl_maybe_lookup_lowerdata(dentry); + if (err) + return err; + + return ovl_maybe_validate_verity(dentry); +} + struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) { struct ovl_entry *oe = NULL; const struct cred *old_cred; - struct ovl_fs *ofs = dentry->d_sb->s_fs_info; + struct ovl_fs *ofs = OVL_FS(dentry->d_sb); struct ovl_entry *poe = OVL_E(dentry->d_parent); struct ovl_entry *roe = OVL_E(dentry->d_sb->s_root); struct ovl_path *stack = NULL, *origin_path = NULL; @@ -955,6 +1037,7 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry, unsigned int i; int err; bool uppermetacopy = false; + int metacopy_size = 0; struct ovl_lookup_data d = { .sb = dentry->d_sb, .name = dentry->d_name, @@ -963,7 +1046,7 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry, .stop = false, .last = ovl_redirect_follow(ofs) ? false : !ovl_numlower(poe), .redirect = NULL, - .metacopy = false, + .metacopy = 0, }; if (dentry->d_name.len > ofs->namelen) @@ -999,6 +1082,7 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry, if (d.metacopy) uppermetacopy = true; + metacopy_size = d.metacopy; } if (d.redirect) { @@ -1076,6 +1160,9 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry, origin = this; } + if (!upperdentry && !d.is_dir && !ctr && d.metacopy) + metacopy_size = d.metacopy; + if (d.metacopy && ctr) { /* * Do not store intermediate metacopy dentries in @@ -1120,7 +1207,7 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry, /* Defer lookup of lowerdata in data-only layers to first access */ if (d.metacopy && ctr && ofs->numdatalayer && d.absolute_redirect) { - d.metacopy = false; + d.metacopy = 0; ctr++; } @@ -1211,10 +1298,11 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry, upperredirect = NULL; goto out_free_oe; } - err = ovl_check_metacopy_xattr(ofs, &upperpath); + err = ovl_check_metacopy_xattr(ofs, &upperpath, NULL); if (err < 0) goto out_free_oe; uppermetacopy = err; + metacopy_size = err; } if (upperdentry || ctr) { @@ -1236,6 +1324,9 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry, goto out_free_oe; if (upperdentry && !uppermetacopy) ovl_set_flag(OVL_UPPERDATA, inode); + + if (metacopy_size > OVL_METACOPY_MIN_SIZE) + ovl_set_flag(OVL_HAS_DIGEST, inode); } ovl_dentry_init_reval(dentry, upperdentry, OVL_I_E(inode)); @@ -1312,7 +1403,11 @@ bool ovl_lower_positive(struct dentry *dentry) break; } } else { - positive = !ovl_is_whiteout(this); + struct path path = { + .dentry = this, + .mnt = parentpath->layer->mnt, + }; + positive = !ovl_path_is_whiteout(OVL_FS(dentry->d_sb), &path); done = true; dput(this); } diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h index 9402591f12aa..05c3dd597fa8 100644 --- a/fs/overlayfs/overlayfs.h +++ b/fs/overlayfs/overlayfs.h @@ -7,6 +7,7 @@ #include <linux/kernel.h> #include <linux/uuid.h> #include <linux/fs.h> +#include <linux/fsverity.h> #include <linux/namei.h> #include <linux/posix_acl.h> #include <linux/posix_acl_xattr.h> @@ -27,7 +28,16 @@ enum ovl_path_type { #define OVL_XATTR_NAMESPACE "overlay." #define OVL_XATTR_TRUSTED_PREFIX XATTR_TRUSTED_PREFIX OVL_XATTR_NAMESPACE +#define OVL_XATTR_TRUSTED_PREFIX_LEN (sizeof(OVL_XATTR_TRUSTED_PREFIX) - 1) #define OVL_XATTR_USER_PREFIX XATTR_USER_PREFIX OVL_XATTR_NAMESPACE +#define OVL_XATTR_USER_PREFIX_LEN (sizeof(OVL_XATTR_USER_PREFIX) - 1) + +#define OVL_XATTR_ESCAPE_PREFIX OVL_XATTR_NAMESPACE +#define OVL_XATTR_ESCAPE_PREFIX_LEN (sizeof(OVL_XATTR_ESCAPE_PREFIX) - 1) +#define OVL_XATTR_ESCAPE_TRUSTED_PREFIX OVL_XATTR_TRUSTED_PREFIX OVL_XATTR_ESCAPE_PREFIX +#define OVL_XATTR_ESCAPE_TRUSTED_PREFIX_LEN (sizeof(OVL_XATTR_ESCAPE_TRUSTED_PREFIX) - 1) +#define OVL_XATTR_ESCAPE_USER_PREFIX OVL_XATTR_USER_PREFIX OVL_XATTR_ESCAPE_PREFIX +#define OVL_XATTR_ESCAPE_USER_PREFIX_LEN (sizeof(OVL_XATTR_ESCAPE_USER_PREFIX) - 1) enum ovl_xattr { OVL_XATTR_OPAQUE, @@ -36,8 +46,11 @@ enum ovl_xattr { OVL_XATTR_IMPURE, OVL_XATTR_NLINK, OVL_XATTR_UPPER, + OVL_XATTR_UUID, OVL_XATTR_METACOPY, OVL_XATTR_PROTATTR, + OVL_XATTR_XWHITEOUT, + OVL_XATTR_XWHITEOUTS, }; enum ovl_inode_flag { @@ -49,6 +62,8 @@ enum ovl_inode_flag { OVL_UPPERDATA, /* Inode number will remain constant over copy up. */ OVL_CONST_INO, + OVL_HAS_DIGEST, + OVL_VERIFIED_DIGEST, }; enum ovl_entry_flag { @@ -65,11 +80,24 @@ enum { }; enum { + OVL_UUID_OFF, + OVL_UUID_NULL, + OVL_UUID_AUTO, + OVL_UUID_ON, +}; + +enum { OVL_XINO_OFF, OVL_XINO_AUTO, OVL_XINO_ON, }; +enum { + OVL_VERITY_OFF, + OVL_VERITY_ON, + OVL_VERITY_REQUIRE, +}; + /* * The tuple (fh,uuid) is a universal unique identifier for a copy up origin, * where: @@ -126,6 +154,26 @@ struct ovl_fh { #define OVL_FH_FID_OFFSET (OVL_FH_WIRE_OFFSET + \ offsetof(struct ovl_fb, fid)) +/* On-disk format for "metacopy" xattr (if non-zero size) */ +struct ovl_metacopy { + u8 version; /* 0 */ + u8 len; /* size of this header + used digest bytes */ + u8 flags; + u8 digest_algo; /* FS_VERITY_HASH_ALG_* constant, 0 for no digest */ + u8 digest[FS_VERITY_MAX_DIGEST_SIZE]; /* Only the used part on disk */ +} __packed; + +#define OVL_METACOPY_MAX_SIZE (sizeof(struct ovl_metacopy)) +#define OVL_METACOPY_MIN_SIZE (OVL_METACOPY_MAX_SIZE - FS_VERITY_MAX_DIGEST_SIZE) +#define OVL_METACOPY_INIT { 0, OVL_METACOPY_MIN_SIZE } + +static inline int ovl_metadata_digest_size(const struct ovl_metacopy *metacopy) +{ + if (metacopy->len < OVL_METACOPY_MIN_SIZE) + return 0; + return (int)metacopy->len - OVL_METACOPY_MIN_SIZE; +} + extern const char *const ovl_xattr_table[][2]; static inline const char *ovl_xattr(struct ovl_fs *ofs, enum ovl_xattr ox) { @@ -360,7 +408,19 @@ static inline bool ovl_open_flags_need_copy_up(int flags) return ((OPEN_FMODE(flags) & FMODE_WRITE) || (flags & O_TRUNC)); } +static inline int ovl_do_getattr(const struct path *path, struct kstat *stat, + u32 request_mask, unsigned int flags) +{ + if (flags & AT_GETATTR_NOSEC) + return vfs_getattr_nosec(path, stat, request_mask, flags); + return vfs_getattr(path, stat, request_mask, flags); +} + /* util.c */ +int ovl_get_write_access(struct dentry *dentry); +void ovl_put_write_access(struct dentry *dentry); +void ovl_start_write(struct dentry *dentry); +void ovl_end_write(struct dentry *dentry); int ovl_want_write(struct dentry *dentry); void ovl_drop_write(struct dentry *dentry); struct dentry *ovl_workdir(struct dentry *dentry); @@ -423,6 +483,7 @@ void ovl_inode_update(struct inode *inode, struct dentry *upperdentry); void ovl_dir_modified(struct dentry *dentry, bool impurity); u64 ovl_inode_version_get(struct inode *inode); bool ovl_is_whiteout(struct dentry *dentry); +bool ovl_path_is_whiteout(struct ovl_fs *ofs, const struct path *path); struct file *ovl_path_open(const struct path *path, int flags); int ovl_copy_up_start(struct dentry *dentry, int flags); void ovl_copy_up_end(struct dentry *dentry); @@ -430,6 +491,20 @@ bool ovl_already_copied_up(struct dentry *dentry, int flags); bool ovl_path_check_dir_xattr(struct ovl_fs *ofs, const struct path *path, enum ovl_xattr ox); bool ovl_path_check_origin_xattr(struct ovl_fs *ofs, const struct path *path); +bool ovl_path_check_xwhiteout_xattr(struct ovl_fs *ofs, const struct path *path); +bool ovl_path_check_xwhiteouts_xattr(struct ovl_fs *ofs, const struct path *path); +bool ovl_init_uuid_xattr(struct super_block *sb, struct ovl_fs *ofs, + const struct path *upperpath); + +static inline bool ovl_upper_is_whiteout(struct ovl_fs *ofs, + struct dentry *upperdentry) +{ + struct path upperpath = { + .dentry = upperdentry, + .mnt = ovl_upper_mnt(ofs), + }; + return ovl_path_is_whiteout(ofs, &upperpath); +} static inline bool ovl_check_origin_xattr(struct ovl_fs *ofs, struct dentry *upperdentry) @@ -452,9 +527,20 @@ bool ovl_need_index(struct dentry *dentry); int ovl_nlink_start(struct dentry *dentry); void ovl_nlink_end(struct dentry *dentry); int ovl_lock_rename_workdir(struct dentry *workdir, struct dentry *upperdir); -int ovl_check_metacopy_xattr(struct ovl_fs *ofs, const struct path *path); +int ovl_check_metacopy_xattr(struct ovl_fs *ofs, const struct path *path, + struct ovl_metacopy *data); +int ovl_set_metacopy_xattr(struct ovl_fs *ofs, struct dentry *d, + struct ovl_metacopy *metacopy); bool ovl_is_metacopy_dentry(struct dentry *dentry); char *ovl_get_redirect_xattr(struct ovl_fs *ofs, const struct path *path, int padding); +int ovl_ensure_verity_loaded(struct path *path); +int ovl_get_verity_xattr(struct ovl_fs *ofs, const struct path *path, + u8 *digest_buf, int *buf_length); +int ovl_validate_verity(struct ovl_fs *ofs, + struct path *metapath, + struct path *datapath); +int ovl_get_verity_digest(struct ovl_fs *ofs, struct path *src, + struct ovl_metacopy *metacopy); int ovl_sync_status(struct ovl_fs *ofs); static inline void ovl_set_flag(unsigned long flag, struct inode *inode) @@ -494,6 +580,17 @@ static inline bool ovl_redirect_dir(struct ovl_fs *ofs) return ofs->config.redirect_mode == OVL_REDIRECT_ON; } +static inline bool ovl_origin_uuid(struct ovl_fs *ofs) +{ + return ofs->config.uuid != OVL_UUID_OFF; +} + +static inline bool ovl_has_fsid(struct ovl_fs *ofs) +{ + return ofs->config.uuid == OVL_UUID_ON || + ofs->config.uuid == OVL_UUID_AUTO; +} + /* * With xino=auto, we do best effort to keep all inodes on same st_dev and * d_ino consistent with st_ino. @@ -563,33 +660,44 @@ struct dentry *ovl_decode_real_fh(struct ovl_fs *ofs, struct ovl_fh *fh, int ovl_check_origin_fh(struct ovl_fs *ofs, struct ovl_fh *fh, bool connected, struct dentry *upperdentry, struct ovl_path **stackp); int ovl_verify_set_fh(struct ovl_fs *ofs, struct dentry *dentry, - enum ovl_xattr ox, struct dentry *real, bool is_upper, - bool set); + enum ovl_xattr ox, const struct ovl_fh *fh, + bool is_upper, bool set); +int ovl_verify_origin_xattr(struct ovl_fs *ofs, struct dentry *dentry, + enum ovl_xattr ox, struct dentry *real, + bool is_upper, bool set); struct dentry *ovl_index_upper(struct ovl_fs *ofs, struct dentry *index, bool connected); int ovl_verify_index(struct ovl_fs *ofs, struct dentry *index); +int ovl_get_index_name_fh(const struct ovl_fh *fh, struct qstr *name); int ovl_get_index_name(struct ovl_fs *ofs, struct dentry *origin, struct qstr *name); struct dentry *ovl_get_index_fh(struct ovl_fs *ofs, struct ovl_fh *fh); struct dentry *ovl_lookup_index(struct ovl_fs *ofs, struct dentry *upper, struct dentry *origin, bool verify); int ovl_path_next(int idx, struct dentry *dentry, struct path *path); -int ovl_maybe_lookup_lowerdata(struct dentry *dentry); +int ovl_verify_lowerdata(struct dentry *dentry); struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags); bool ovl_lower_positive(struct dentry *dentry); +static inline int ovl_verify_origin_fh(struct ovl_fs *ofs, struct dentry *upper, + const struct ovl_fh *fh, bool set) +{ + return ovl_verify_set_fh(ofs, upper, OVL_XATTR_ORIGIN, fh, false, set); +} + static inline int ovl_verify_origin(struct ovl_fs *ofs, struct dentry *upper, struct dentry *origin, bool set) { - return ovl_verify_set_fh(ofs, upper, OVL_XATTR_ORIGIN, origin, - false, set); + return ovl_verify_origin_xattr(ofs, upper, OVL_XATTR_ORIGIN, origin, + false, set); } static inline int ovl_verify_upper(struct ovl_fs *ofs, struct dentry *index, struct dentry *upper, bool set) { - return ovl_verify_set_fh(ofs, index, OVL_XATTR_UPPER, upper, true, set); + return ovl_verify_origin_xattr(ofs, index, OVL_XATTR_UPPER, upper, + true, set); } /* readdir.c */ @@ -623,17 +731,8 @@ int ovl_set_nlink_lower(struct dentry *dentry); unsigned int ovl_get_nlink(struct ovl_fs *ofs, struct dentry *lowerdentry, struct dentry *upperdentry, unsigned int fallback); -int ovl_setattr(struct mnt_idmap *idmap, struct dentry *dentry, - struct iattr *attr); -int ovl_getattr(struct mnt_idmap *idmap, const struct path *path, - struct kstat *stat, u32 request_mask, unsigned int flags); int ovl_permission(struct mnt_idmap *idmap, struct inode *inode, int mask); -int ovl_xattr_set(struct dentry *dentry, struct inode *inode, const char *name, - const void *value, size_t size, int flags); -int ovl_xattr_get(struct dentry *dentry, struct inode *inode, const char *name, - void *value, size_t size); -ssize_t ovl_listxattr(struct dentry *dentry, char *list, size_t size); #ifdef CONFIG_FS_POSIX_ACL struct posix_acl *do_ovl_get_acl(struct mnt_idmap *idmap, @@ -665,7 +764,7 @@ static inline struct posix_acl *ovl_get_acl_path(const struct path *path, } #endif -int ovl_update_time(struct inode *inode, struct timespec64 *ts, int flags); +int ovl_update_time(struct inode *inode, int flags); bool ovl_is_private_xattr(struct super_block *sb, const char *name); struct ovl_inode_params { @@ -754,11 +853,13 @@ int ovl_copy_xattr(struct super_block *sb, const struct path *path, struct dentr int ovl_set_attr(struct ovl_fs *ofs, struct dentry *upper, struct kstat *stat); struct ovl_fh *ovl_encode_real_fh(struct ovl_fs *ofs, struct dentry *real, bool is_upper); -int ovl_set_origin(struct ovl_fs *ofs, struct dentry *lower, - struct dentry *upper); +struct ovl_fh *ovl_get_origin_fh(struct ovl_fs *ofs, struct dentry *origin); +int ovl_set_origin_fh(struct ovl_fs *ofs, const struct ovl_fh *fh, + struct dentry *upper); /* export.c */ extern const struct export_operations ovl_export_operations; +extern const struct export_operations ovl_export_fid_operations; /* super.c */ int ovl_fill_super(struct super_block *sb, struct fs_context *fc); @@ -768,3 +869,12 @@ static inline bool ovl_force_readonly(struct ovl_fs *ofs) { return (!ovl_upper_mnt(ofs) || !ofs->workdir); } + +/* xattr.c */ + +const struct xattr_handler * const *ovl_xattr_handlers(struct ovl_fs *ofs); +int ovl_setattr(struct mnt_idmap *idmap, struct dentry *dentry, + struct iattr *attr); +int ovl_getattr(struct mnt_idmap *idmap, const struct path *path, + struct kstat *stat, u32 request_mask, unsigned int flags); +ssize_t ovl_listxattr(struct dentry *dentry, char *list, size_t size); diff --git a/fs/overlayfs/ovl_entry.h b/fs/overlayfs/ovl_entry.h index 306e1ecdc96d..d82d2a043da2 100644 --- a/fs/overlayfs/ovl_entry.h +++ b/fs/overlayfs/ovl_entry.h @@ -8,10 +8,12 @@ struct ovl_config { char *upperdir; char *workdir; + char **lowerdirs; bool default_permissions; int redirect_mode; + int verity_mode; bool index; - bool uuid; + int uuid; bool nfs_export; int xino; bool metacopy; @@ -38,17 +40,8 @@ struct ovl_layer { int idx; /* One fsid per unique underlying sb (upper fsid == 0) */ int fsid; - char *name; }; -/* - * ovl_free_fs() relies on @mnt being the first member when unmounting - * the private mounts created for each layer. Let's check both the - * offset and type. - */ -static_assert(offsetof(struct ovl_layer, mnt) == 0); -static_assert(__same_type(typeof_member(struct ovl_layer, mnt), struct vfsmount *)); - struct ovl_path { const struct ovl_layer *layer; struct dentry *dentry; @@ -81,6 +74,7 @@ struct ovl_fs { const struct cred *creator_cred; bool tmpfile; bool noxattr; + bool nofh; /* Did we take the inuse lock? */ bool upperdir_locked; bool workdir_locked; @@ -115,8 +109,13 @@ static inline struct mnt_idmap *ovl_upper_mnt_idmap(struct ovl_fs *ofs) return mnt_idmap(ovl_upper_mnt(ofs)); } +extern struct file_system_type ovl_fs_type; + static inline struct ovl_fs *OVL_FS(struct super_block *sb) { + if (IS_ENABLED(CONFIG_OVERLAY_FS_DEBUG)) + WARN_ON_ONCE(sb->s_type != &ovl_fs_type); + return (struct ovl_fs *)sb->s_fs_info; } diff --git a/fs/overlayfs/params.c b/fs/overlayfs/params.c index a63160dbb0f9..3fe2dde1598f 100644 --- a/fs/overlayfs/params.c +++ b/fs/overlayfs/params.c @@ -43,8 +43,10 @@ module_param_named(metacopy, ovl_metacopy_def, bool, 0644); MODULE_PARM_DESC(metacopy, "Default to on or off for the metadata only copy up feature"); -enum { +enum ovl_opt { Opt_lowerdir, + Opt_lowerdir_add, + Opt_datadir_add, Opt_upperdir, Opt_workdir, Opt_default_permissions, @@ -55,6 +57,7 @@ enum { Opt_userxattr, Opt_xino, Opt_metacopy, + Opt_verity, Opt_volatile, }; @@ -64,6 +67,24 @@ static const struct constant_table ovl_parameter_bool[] = { {} }; +static const struct constant_table ovl_parameter_uuid[] = { + { "off", OVL_UUID_OFF }, + { "null", OVL_UUID_NULL }, + { "auto", OVL_UUID_AUTO }, + { "on", OVL_UUID_ON }, + {} +}; + +static const char *ovl_uuid_mode(struct ovl_config *config) +{ + return ovl_parameter_uuid[config->uuid].name; +} + +static int ovl_uuid_def(void) +{ + return OVL_UUID_AUTO; +} + static const struct constant_table ovl_parameter_xino[] = { { "off", OVL_XINO_OFF }, { "auto", OVL_XINO_AUTO }, @@ -101,25 +122,74 @@ static int ovl_redirect_mode_def(void) OVL_REDIRECT_NOFOLLOW; } +static const struct constant_table ovl_parameter_verity[] = { + { "off", OVL_VERITY_OFF }, + { "on", OVL_VERITY_ON }, + { "require", OVL_VERITY_REQUIRE }, + {} +}; + +static const char *ovl_verity_mode(struct ovl_config *config) +{ + return ovl_parameter_verity[config->verity_mode].name; +} + +static int ovl_verity_mode_def(void) +{ + return OVL_VERITY_OFF; +} + #define fsparam_string_empty(NAME, OPT) \ __fsparam(fs_param_is_string, NAME, OPT, fs_param_can_be_empty, NULL) + const struct fs_parameter_spec ovl_parameter_spec[] = { fsparam_string_empty("lowerdir", Opt_lowerdir), + fsparam_string("lowerdir+", Opt_lowerdir_add), + fsparam_string("datadir+", Opt_datadir_add), fsparam_string("upperdir", Opt_upperdir), fsparam_string("workdir", Opt_workdir), fsparam_flag("default_permissions", Opt_default_permissions), fsparam_enum("redirect_dir", Opt_redirect_dir, ovl_parameter_redirect_dir), fsparam_enum("index", Opt_index, ovl_parameter_bool), - fsparam_enum("uuid", Opt_uuid, ovl_parameter_bool), + fsparam_enum("uuid", Opt_uuid, ovl_parameter_uuid), fsparam_enum("nfs_export", Opt_nfs_export, ovl_parameter_bool), fsparam_flag("userxattr", Opt_userxattr), fsparam_enum("xino", Opt_xino, ovl_parameter_xino), fsparam_enum("metacopy", Opt_metacopy, ovl_parameter_bool), + fsparam_enum("verity", Opt_verity, ovl_parameter_verity), fsparam_flag("volatile", Opt_volatile), {} }; +static char *ovl_next_opt(char **s) +{ + char *sbegin = *s; + char *p; + + if (sbegin == NULL) + return NULL; + + for (p = sbegin; *p; p++) { + if (*p == '\\') { + p++; + if (!*p) + break; + } else if (*p == ',') { + *p = '\0'; + *s = p + 1; + return sbegin; + } + } + *s = NULL; + return sbegin; +} + +static int ovl_parse_monolithic(struct fs_context *fc, void *data) +{ + return vfs_parse_monolithic_sep(fc, data, ovl_next_opt); +} + static ssize_t ovl_parse_param_split_lowerdirs(char *str) { ssize_t nr_layers = 1, nr_colons = 0; @@ -127,7 +197,8 @@ static ssize_t ovl_parse_param_split_lowerdirs(char *str) for (s = d = str;; s++, d++) { if (*s == '\\') { - s++; + /* keep esc chars in split lowerdir */ + *d++ = *s++; } else if (*s == ':') { bool next_colon = (*(s + 1) == ':'); @@ -172,19 +243,8 @@ static int ovl_mount_dir_noesc(const char *name, struct path *path) pr_err("failed to resolve '%s': %i\n", name, err); goto out; } - err = -EINVAL; - if (ovl_dentry_weird(path->dentry)) { - pr_err("filesystem on '%s' not supported\n", name); - goto out_put; - } - if (!d_is_dir(path->dentry)) { - pr_err("'%s' not a directory\n", name); - goto out_put; - } return 0; -out_put: - path_put_init(path); out: return err; } @@ -210,68 +270,147 @@ static int ovl_mount_dir(const char *name, struct path *path) if (tmp) { ovl_unescape(tmp); err = ovl_mount_dir_noesc(tmp, path); - - if (!err && path->dentry->d_flags & DCACHE_OP_REAL) { - pr_err("filesystem on '%s' not supported as upperdir\n", - tmp); - path_put_init(path); - err = -EINVAL; - } kfree(tmp); } return err; } -static int ovl_parse_param_upperdir(const char *name, struct fs_context *fc, - bool workdir) +static int ovl_mount_dir_check(struct fs_context *fc, const struct path *path, + enum ovl_opt layer, const char *name, bool upper) { - int err; - struct ovl_fs *ofs = fc->s_fs_info; - struct ovl_config *config = &ofs->config; struct ovl_fs_context *ctx = fc->fs_private; - struct path path; - char *dup; - err = ovl_mount_dir(name, &path); - if (err) - return err; + if (ovl_dentry_weird(path->dentry)) + return invalfc(fc, "filesystem on %s not supported", name); + + if (!d_is_dir(path->dentry)) + return invalfc(fc, "%s is not a directory", name); + /* * Check whether upper path is read-only here to report failures * early. Don't forget to recheck when the superblock is created * as the mount attributes could change. */ - if (__mnt_is_readonly(path.mnt)) { - path_put(&path); - return -EINVAL; + if (upper) { + if (path->dentry->d_flags & DCACHE_OP_REAL) + return invalfc(fc, "filesystem on %s not supported as upperdir", name); + if (__mnt_is_readonly(path->mnt)) + return invalfc(fc, "filesystem on %s is read-only", name); + } else { + if (ctx->lowerdir_all && layer != Opt_lowerdir) + return invalfc(fc, "lowerdir+ and datadir+ cannot follow lowerdir"); + if (ctx->nr_data && layer == Opt_lowerdir_add) + return invalfc(fc, "regular lower layers cannot follow data layers"); + if (ctx->nr == OVL_MAX_STACK) + return invalfc(fc, "too many lower directories, limit is %d", + OVL_MAX_STACK); } + return 0; +} - dup = kstrdup(name, GFP_KERNEL); - if (!dup) { - path_put(&path); +static int ovl_ctx_realloc_lower(struct fs_context *fc) +{ + struct ovl_fs_context *ctx = fc->fs_private; + struct ovl_fs_context_layer *l; + size_t nr; + + if (ctx->nr < ctx->capacity) + return 0; + + nr = min_t(size_t, max(4096 / sizeof(*l), ctx->capacity * 2), + OVL_MAX_STACK); + l = krealloc_array(ctx->lower, nr, sizeof(*l), GFP_KERNEL_ACCOUNT); + if (!l) return -ENOMEM; + + ctx->lower = l; + ctx->capacity = nr; + return 0; +} + +static void ovl_add_layer(struct fs_context *fc, enum ovl_opt layer, + struct path *path, char **pname) +{ + struct ovl_fs *ofs = fc->s_fs_info; + struct ovl_config *config = &ofs->config; + struct ovl_fs_context *ctx = fc->fs_private; + struct ovl_fs_context_layer *l; + + switch (layer) { + case Opt_workdir: + swap(config->workdir, *pname); + swap(ctx->work, *path); + break; + case Opt_upperdir: + swap(config->upperdir, *pname); + swap(ctx->upper, *path); + break; + case Opt_datadir_add: + ctx->nr_data++; + fallthrough; + case Opt_lowerdir_add: + WARN_ON(ctx->nr >= ctx->capacity); + l = &ctx->lower[ctx->nr++]; + memset(l, 0, sizeof(*l)); + swap(l->name, *pname); + swap(l->path, *path); + break; + default: + WARN_ON(1); } +} - if (workdir) { - kfree(config->workdir); - config->workdir = dup; - path_put(&ctx->work); - ctx->work = path; - } else { - kfree(config->upperdir); - config->upperdir = dup; - path_put(&ctx->upper); - ctx->upper = path; +static int ovl_parse_layer(struct fs_context *fc, struct fs_parameter *param, + enum ovl_opt layer) +{ + char *name = kstrdup(param->string, GFP_KERNEL); + bool upper = (layer == Opt_upperdir || layer == Opt_workdir); + struct path path; + int err; + + if (!name) + return -ENOMEM; + + if (upper) + err = ovl_mount_dir(name, &path); + else + err = ovl_mount_dir_noesc(name, &path); + if (err) + goto out_free; + + err = ovl_mount_dir_check(fc, &path, layer, name, upper); + if (err) + goto out_put; + + if (!upper) { + err = ovl_ctx_realloc_lower(fc); + if (err) + goto out_put; } - return 0; + + /* Store the user provided path string in ctx to show in mountinfo */ + ovl_add_layer(fc, layer, &path, &name); + +out_put: + path_put(&path); +out_free: + kfree(name); + return err; } -static void ovl_parse_param_drop_lowerdir(struct ovl_fs_context *ctx) +static void ovl_reset_lowerdirs(struct ovl_fs_context *ctx) { - for (size_t nr = 0; nr < ctx->nr; nr++) { - path_put(&ctx->lower[nr].path); - kfree(ctx->lower[nr].name); - ctx->lower[nr].name = NULL; + struct ovl_fs_context_layer *l = ctx->lower; + + // Reset old user provided lowerdir string + kfree(ctx->lowerdir_all); + ctx->lowerdir_all = NULL; + + for (size_t nr = 0; nr < ctx->nr; nr++, l++) { + path_put(&l->path); + kfree(l->name); + l->name = NULL; } ctx->nr = 0; ctx->nr_data = 0; @@ -280,25 +419,19 @@ static void ovl_parse_param_drop_lowerdir(struct ovl_fs_context *ctx) /* * Parse lowerdir= mount option: * - * (1) lowerdir=/lower1:/lower2:/lower3::/data1::/data2 + * e.g.: lowerdir=/lower1:/lower2:/lower3::/data1::/data2 * Set "/lower1", "/lower2", and "/lower3" as lower layers and * "/data1" and "/data2" as data lower layers. Any existing lower * layers are replaced. - * (2) lowerdir=:/lower4 - * Append "/lower4" to current stack of lower layers. This requires - * that there already is at least one lower layer configured. - * (3) lowerdir=::/lower5 - * Append data "/lower5" as data lower layer. This requires that - * there's at least one regular lower layer present. */ static int ovl_parse_param_lowerdir(const char *name, struct fs_context *fc) { int err; struct ovl_fs_context *ctx = fc->fs_private; struct ovl_fs_context_layer *l; - char *dup = NULL, *dup_iter; - ssize_t nr_lower = 0, nr = 0, nr_data = 0; - bool append = false, data_layer = false; + char *dup = NULL, *iter; + ssize_t nr_lower, nr; + bool data_layer = false; /* * Ensure we're backwards compatible with mount(2) @@ -306,56 +439,21 @@ static int ovl_parse_param_lowerdir(const char *name, struct fs_context *fc) */ /* drop all existing lower layers */ - if (!*name) { - ovl_parse_param_drop_lowerdir(ctx); - return 0; - } + ovl_reset_lowerdirs(ctx); - if (strncmp(name, "::", 2) == 0) { - /* - * This is a data layer. - * There must be at least one regular lower layer - * specified. - */ - if (ctx->nr == 0) { - pr_err("data lower layers without regular lower layers not allowed"); - return -EINVAL; - } - - /* Skip the leading "::". */ - name += 2; - data_layer = true; - /* - * A data layer is automatically an append as there - * must've been at least one regular lower layer. - */ - append = true; - } else if (*name == ':') { - /* - * This is a regular lower layer. - * If users want to append a layer enforce that they - * have already specified a first layer before. It's - * better to be strict. - */ - if (ctx->nr == 0) { - pr_err("cannot append layer if no previous layer has been specified"); - return -EINVAL; - } - - /* - * Once a sequence of data layers has started regular - * lower layers are forbidden. - */ - if (ctx->nr_data > 0) { - pr_err("regular lower layers cannot follow data lower layers"); - return -EINVAL; - } + if (!*name) + return 0; - /* Skip the leading ":". */ - name++; - append = true; + if (*name == ':') { + pr_err("cannot append lower layer"); + return -EINVAL; } + // Store user provided lowerdir string to show in mount options + ctx->lowerdir_all = kstrdup(name, GFP_KERNEL); + if (!ctx->lowerdir_all) + return -ENOMEM; + dup = kstrdup(name, GFP_KERNEL); if (!dup) return -ENOMEM; @@ -365,36 +463,11 @@ static int ovl_parse_param_lowerdir(const char *name, struct fs_context *fc) if (nr_lower < 0) goto out_err; - if ((nr_lower > OVL_MAX_STACK) || - (append && (size_add(ctx->nr, nr_lower) > OVL_MAX_STACK))) { + if (nr_lower > OVL_MAX_STACK) { pr_err("too many lower directories, limit is %d\n", OVL_MAX_STACK); goto out_err; } - if (!append) - ovl_parse_param_drop_lowerdir(ctx); - - /* - * (1) append - * - * We want nr <= nr_lower <= capacity We know nr > 0 and nr <= - * capacity. If nr == 0 this wouldn't be append. If nr + - * nr_lower is <= capacity then nr <= nr_lower <= capacity - * already holds. If nr + nr_lower exceeds capacity, we realloc. - * - * (2) replace - * - * Ensure we're backwards compatible with mount(2) which allows - * "lowerdir=/a:/b:/c,lowerdir=/d:/e:/f" causing the last - * specified lowerdir mount option to win. - * - * We want nr <= nr_lower <= capacity We know either (i) nr == 0 - * or (ii) nr > 0. We also know nr_lower > 0. The capacity - * could've been changed multiple times already so we only know - * nr <= capacity. If nr + nr_lower > capacity we realloc, - * otherwise nr <= nr_lower <= capacity holds already. - */ - nr_lower += ctx->nr; if (nr_lower > ctx->capacity) { err = -ENOMEM; l = krealloc_array(ctx->lower, nr_lower, sizeof(*ctx->lower), @@ -406,59 +479,40 @@ static int ovl_parse_param_lowerdir(const char *name, struct fs_context *fc) ctx->capacity = nr_lower; } - /* - * (3) By (1) and (2) we know nr <= nr_lower <= capacity. - * (4) If ctx->nr == 0 => replace - * We have verified above that the lowerdir mount option - * isn't an append, i.e., the lowerdir mount option - * doesn't start with ":" or "::". - * (4.1) The lowerdir mount options only contains regular lower - * layers ":". - * => Nothing to verify. - * (4.2) The lowerdir mount options contains regular ":" and - * data "::" layers. - * => We need to verify that data lower layers "::" aren't - * followed by regular ":" lower layers - * (5) If ctx->nr > 0 => append - * We know that there's at least one regular layer - * otherwise we would've failed when parsing the previous - * lowerdir mount option. - * (5.1) The lowerdir mount option is a regular layer ":" append - * => We need to verify that no data layers have been - * specified before. - * (5.2) The lowerdir mount option is a data layer "::" append - * We know that there's at least one regular layer or - * other data layers. => There's nothing to verify. - */ - dup_iter = dup; - for (nr = ctx->nr; nr < nr_lower; nr++) { - l = &ctx->lower[nr]; + iter = dup; + l = ctx->lower; + for (nr = 0; nr < nr_lower; nr++, l++) { + ctx->nr++; memset(l, 0, sizeof(*l)); - err = ovl_mount_dir_noesc(dup_iter, &l->path); + err = ovl_mount_dir(iter, &l->path); + if (err) + goto out_put; + + err = ovl_mount_dir_check(fc, &l->path, Opt_lowerdir, iter, false); if (err) goto out_put; err = -ENOMEM; - l->name = kstrdup(dup_iter, GFP_KERNEL_ACCOUNT); + l->name = kstrdup(iter, GFP_KERNEL_ACCOUNT); if (!l->name) goto out_put; if (data_layer) - nr_data++; + ctx->nr_data++; /* Calling strchr() again would overrun. */ - if ((nr + 1) == nr_lower) + if (ctx->nr == nr_lower) break; err = -EINVAL; - dup_iter = strchr(dup_iter, '\0') + 1; - if (*dup_iter) { + iter = strchr(iter, '\0') + 1; + if (*iter) { /* * This is a regular layer so we require that * there are no data layers. */ - if ((ctx->nr_data + nr_data) > 0) { + if (ctx->nr_data > 0) { pr_err("regular lower layers cannot follow data lower layers"); goto out_put; } @@ -469,29 +523,13 @@ static int ovl_parse_param_lowerdir(const char *name, struct fs_context *fc) /* This is a data lower layer. */ data_layer = true; - dup_iter++; + iter++; } - ctx->nr = nr_lower; - ctx->nr_data += nr_data; kfree(dup); return 0; out_put: - /* - * We know nr >= ctx->nr < nr_lower. If we failed somewhere - * we want to undo until nr == ctx->nr. This is correct for - * both ctx->nr == 0 and ctx->nr > 0. - */ - for (; nr >= ctx->nr; nr--) { - l = &ctx->lower[nr]; - kfree(l->name); - l->name = NULL; - path_put(&l->path); - - /* don't overflow */ - if (nr == 0) - break; - } + ovl_reset_lowerdirs(ctx); out_err: kfree(dup); @@ -536,11 +574,11 @@ static int ovl_parse_param(struct fs_context *fc, struct fs_parameter *param) case Opt_lowerdir: err = ovl_parse_param_lowerdir(param->string, fc); break; + case Opt_lowerdir_add: + case Opt_datadir_add: case Opt_upperdir: - fallthrough; case Opt_workdir: - err = ovl_parse_param_upperdir(param->string, fc, - (Opt_workdir == opt)); + err = ovl_parse_layer(fc, param, opt); break; case Opt_default_permissions: config->default_permissions = true; @@ -572,6 +610,9 @@ static int ovl_parse_param(struct fs_context *fc, struct fs_parameter *param) config->metacopy = result.uint_32; ctx->set.metacopy = true; break; + case Opt_verity: + config->verity_mode = result.uint_32; + break; case Opt_volatile: config->ovl_volatile = true; break; @@ -594,7 +635,7 @@ static int ovl_get_tree(struct fs_context *fc) static inline void ovl_fs_context_free(struct ovl_fs_context *ctx) { - ovl_parse_param_drop_lowerdir(ctx); + ovl_reset_lowerdirs(ctx); path_put(&ctx->upper); path_put(&ctx->work); kfree(ctx->lower); @@ -622,7 +663,7 @@ static void ovl_free(struct fs_context *fc) static int ovl_reconfigure(struct fs_context *fc) { struct super_block *sb = fc->root->d_sb; - struct ovl_fs *ofs = sb->s_fs_info; + struct ovl_fs *ofs = OVL_FS(sb); struct super_block *upper_sb; int ret = 0; @@ -642,6 +683,7 @@ static int ovl_reconfigure(struct fs_context *fc) } static const struct fs_context_operations ovl_context_ops = { + .parse_monolithic = ovl_parse_monolithic, .parse_param = ovl_parse_param, .get_tree = ovl_get_tree, .reconfigure = ovl_reconfigure, @@ -679,7 +721,7 @@ int ovl_init_fs_context(struct fs_context *fc) ofs->config.redirect_mode = ovl_redirect_mode_def(); ofs->config.index = ovl_index_def; - ofs->config.uuid = true; + ofs->config.uuid = ovl_uuid_def(); ofs->config.nfs_export = ovl_nfs_export_def; ofs->config.xino = ovl_xino_def(); ofs->config.metacopy = ovl_metacopy_def; @@ -712,12 +754,12 @@ void ovl_free_fs(struct ovl_fs *ofs) if (ofs->upperdir_locked) ovl_inuse_unlock(ovl_upper_mnt(ofs)->mnt_root); - /* Hack! Reuse ofs->layers as a vfsmount array before freeing it */ - mounts = (struct vfsmount **) ofs->layers; + /* Reuse ofs->config.lowerdirs as a vfsmount array before freeing it */ + mounts = (struct vfsmount **) ofs->config.lowerdirs; for (i = 0; i < ofs->numlayer; i++) { iput(ofs->layers[i].trap); + kfree(ofs->config.lowerdirs[i]); mounts[i] = ofs->layers[i].mnt; - kfree(ofs->layers[i].name); } kern_unmount_array(mounts, ofs->numlayer); kfree(ofs->layers); @@ -725,6 +767,7 @@ void ovl_free_fs(struct ovl_fs *ofs) free_anon_bdev(ofs->fs[i].pseudo_dev); kfree(ofs->fs); + kfree(ofs->config.lowerdirs); kfree(ofs->config.upperdir); kfree(ofs->config.workdir); if (ofs->creator_cred) @@ -762,6 +805,23 @@ int ovl_fs_params_verify(const struct ovl_fs_context *ctx, config->ovl_volatile = false; } + if (!config->upperdir && config->uuid == OVL_UUID_ON) { + pr_info("option \"uuid=on\" requires an upper fs, falling back to uuid=null.\n"); + config->uuid = OVL_UUID_NULL; + } + + /* Resolve verity -> metacopy dependency */ + if (config->verity_mode && !config->metacopy) { + /* Don't allow explicit specified conflicting combinations */ + if (set.metacopy) { + pr_err("conflicting options: metacopy=off,verity=%s\n", + ovl_verity_mode(config)); + return -EINVAL; + } + /* Otherwise automatically enable metacopy. */ + config->metacopy = true; + } + /* * This is to make the logic below simpler. It doesn't make any other * difference, since redirect_dir=on is only used for upper. @@ -769,13 +829,18 @@ int ovl_fs_params_verify(const struct ovl_fs_context *ctx, if (!config->upperdir && config->redirect_mode == OVL_REDIRECT_FOLLOW) config->redirect_mode = OVL_REDIRECT_ON; - /* Resolve metacopy -> redirect_dir dependency */ + /* Resolve verity -> metacopy -> redirect_dir dependency */ if (config->metacopy && config->redirect_mode != OVL_REDIRECT_ON) { if (set.metacopy && set.redirect) { pr_err("conflicting options: metacopy=on,redirect_dir=%s\n", ovl_redirect_mode(config)); return -EINVAL; } + if (config->verity_mode && set.redirect) { + pr_err("conflicting options: verity=%s,redirect_dir=%s\n", + ovl_verity_mode(config), ovl_redirect_mode(config)); + return -EINVAL; + } if (set.redirect) { /* * There was an explicit redirect_dir=... that resulted @@ -812,7 +877,7 @@ int ovl_fs_params_verify(const struct ovl_fs_context *ctx, } } - /* Resolve nfs_export -> !metacopy dependency */ + /* Resolve nfs_export -> !metacopy && !verity dependency */ if (config->nfs_export && config->metacopy) { if (set.nfs_export && set.metacopy) { pr_err("conflicting options: nfs_export=on,metacopy=on\n"); @@ -825,6 +890,14 @@ int ovl_fs_params_verify(const struct ovl_fs_context *ctx, */ pr_info("disabling nfs_export due to metacopy=on\n"); config->nfs_export = false; + } else if (config->verity_mode) { + /* + * There was an explicit verity=.. that resulted + * in this conflict. + */ + pr_info("disabling nfs_export due to verity=%s\n", + ovl_verity_mode(config)); + config->nfs_export = false; } else { /* * There was an explicit nfs_export=on that resulted @@ -836,7 +909,7 @@ int ovl_fs_params_verify(const struct ovl_fs_context *ctx, } - /* Resolve userxattr -> !redirect && !metacopy dependency */ + /* Resolve userxattr -> !redirect && !metacopy && !verity dependency */ if (config->userxattr) { if (set.redirect && config->redirect_mode != OVL_REDIRECT_NOFOLLOW) { @@ -848,6 +921,11 @@ int ovl_fs_params_verify(const struct ovl_fs_context *ctx, pr_err("conflicting options: userxattr,metacopy=on\n"); return -EINVAL; } + if (config->verity_mode) { + pr_err("conflicting options: userxattr,verity=%s\n", + ovl_verity_mode(config)); + return -EINVAL; + } /* * Silently disable default setting of redirect and metacopy. * This shall be the default in the future as well: these @@ -872,18 +950,30 @@ int ovl_fs_params_verify(const struct ovl_fs_context *ctx, int ovl_show_options(struct seq_file *m, struct dentry *dentry) { struct super_block *sb = dentry->d_sb; - struct ovl_fs *ofs = sb->s_fs_info; - size_t nr, nr_merged_lower = ofs->numlayer - ofs->numdatalayer; - const struct ovl_layer *data_layers = &ofs->layers[nr_merged_lower]; - - /* ofs->layers[0] is the upper layer */ - seq_printf(m, ",lowerdir=%s", ofs->layers[1].name); - /* dump regular lower layers */ - for (nr = 2; nr < nr_merged_lower; nr++) - seq_printf(m, ":%s", ofs->layers[nr].name); - /* dump data lower layers */ - for (nr = 0; nr < ofs->numdatalayer; nr++) - seq_printf(m, "::%s", data_layers[nr].name); + struct ovl_fs *ofs = OVL_FS(sb); + size_t nr, nr_merged_lower, nr_lower = 0; + char **lowerdirs = ofs->config.lowerdirs; + + /* + * lowerdirs[0] holds the colon separated list that user provided + * with lowerdir mount option. + * lowerdirs[1..numlayer] hold the lowerdir paths that were added + * using the lowerdir+ and datadir+ mount options. + * For now, we do not allow mixing the legacy lowerdir mount option + * with the new lowerdir+ and datadir+ mount options. + */ + if (lowerdirs[0]) { + seq_show_option(m, "lowerdir", lowerdirs[0]); + } else { + nr_lower = ofs->numlayer; + nr_merged_lower = nr_lower - ofs->numdatalayer; + } + for (nr = 1; nr < nr_lower; nr++) { + if (nr < nr_merged_lower) + seq_show_option(m, "lowerdir+", lowerdirs[nr]); + else + seq_show_option(m, "datadir+", lowerdirs[nr]); + } if (ofs->config.upperdir) { seq_show_option(m, "upperdir", ofs->config.upperdir); seq_show_option(m, "workdir", ofs->config.workdir); @@ -895,8 +985,8 @@ int ovl_show_options(struct seq_file *m, struct dentry *dentry) ovl_redirect_mode(&ofs->config)); if (ofs->config.index != ovl_index_def) seq_printf(m, ",index=%s", ofs->config.index ? "on" : "off"); - if (!ofs->config.uuid) - seq_puts(m, ",uuid=off"); + if (ofs->config.uuid != ovl_uuid_def()) + seq_printf(m, ",uuid=%s", ovl_uuid_mode(&ofs->config)); if (ofs->config.nfs_export != ovl_nfs_export_def) seq_printf(m, ",nfs_export=%s", ofs->config.nfs_export ? "on" : "off"); @@ -909,5 +999,8 @@ int ovl_show_options(struct seq_file *m, struct dentry *dentry) seq_puts(m, ",volatile"); if (ofs->config.userxattr) seq_puts(m, ",userxattr"); + if (ofs->config.verity_mode != ovl_verity_mode_def()) + seq_printf(m, ",verity=%s", + ovl_verity_mode(&ofs->config)); return 0; } diff --git a/fs/overlayfs/params.h b/fs/overlayfs/params.h index 8750da68ab2a..c96d93982021 100644 --- a/fs/overlayfs/params.h +++ b/fs/overlayfs/params.h @@ -32,6 +32,7 @@ struct ovl_fs_context { size_t nr_data; struct ovl_opt_set set; struct ovl_fs_context_layer *lower; + char *lowerdir_all; /* user provided lowerdir string */ }; int ovl_init_fs_context(struct fs_context *fc); diff --git a/fs/overlayfs/readdir.c b/fs/overlayfs/readdir.c index de39e067ae65..a490fc47c3e7 100644 --- a/fs/overlayfs/readdir.c +++ b/fs/overlayfs/readdir.c @@ -25,6 +25,7 @@ struct ovl_cache_entry { struct ovl_cache_entry *next_maybe_whiteout; bool is_upper; bool is_whiteout; + bool check_xwhiteout; char name[]; }; @@ -47,6 +48,7 @@ struct ovl_readdir_data { int err; bool is_upper; bool d_type_supported; + bool in_xwhiteouts_dir; }; struct ovl_dir_file { @@ -162,6 +164,8 @@ static struct ovl_cache_entry *ovl_cache_entry_new(struct ovl_readdir_data *rdd, p->ino = 0; p->is_upper = rdd->is_upper; p->is_whiteout = false; + /* Defer check for overlay.whiteout to ovl_iterate() */ + p->check_xwhiteout = rdd->in_xwhiteouts_dir && d_type == DT_REG; if (d_type == DT_CHR) { p->next_maybe_whiteout = rdd->first_maybe_whiteout; @@ -301,6 +305,8 @@ static inline int ovl_dir_read(const struct path *realpath, if (IS_ERR(realfile)) return PTR_ERR(realfile); + rdd->in_xwhiteouts_dir = rdd->dentry && + ovl_path_check_xwhiteouts_xattr(OVL_FS(rdd->dentry->d_sb), realpath); rdd->first_maybe_whiteout = NULL; rdd->ctx.pos = 0; do { @@ -447,7 +453,7 @@ static u64 ovl_remap_lower_ino(u64 ino, int xinobits, int fsid, } /* - * Set d_ino for upper entries. Non-upper entries should always report + * Set d_ino for upper entries if needed. Non-upper entries should always report * the uppermost real inode ino and should not call this function. * * When not all layer are on same fs, report real ino also for upper. @@ -455,8 +461,11 @@ static u64 ovl_remap_lower_ino(u64 ino, int xinobits, int fsid, * When all layers are on the same fs, and upper has a reference to * copy up origin, call vfs_getattr() on the overlay entry to make * sure that d_ino will be consistent with st_ino from stat(2). + * + * Also checks the overlay.whiteout xattr by doing a full lookup which will return + * negative in this case. */ -static int ovl_cache_update_ino(const struct path *path, struct ovl_cache_entry *p) +static int ovl_cache_update(const struct path *path, struct ovl_cache_entry *p, bool update_ino) { struct dentry *dir = path->dentry; @@ -467,7 +476,7 @@ static int ovl_cache_update_ino(const struct path *path, struct ovl_cache_entry int xinobits = ovl_xino_bits(ofs); int err = 0; - if (!ovl_same_dev(ofs)) + if (!ovl_same_dev(ofs) && !p->check_xwhiteout) goto out; if (p->name[0] == '.') { @@ -481,6 +490,7 @@ static int ovl_cache_update_ino(const struct path *path, struct ovl_cache_entry goto get; } } + /* This checks also for xwhiteouts */ this = lookup_one(mnt_idmap(path->mnt), p->name, dir, p->len); if (IS_ERR_OR_NULL(this) || !this->d_inode) { /* Mark a stale entry */ @@ -494,6 +504,9 @@ static int ovl_cache_update_ino(const struct path *path, struct ovl_cache_entry } get: + if (!ovl_same_dev(ofs) || !update_ino) + goto out; + type = ovl_path_type(this); if (OVL_TYPE_ORIGIN(type)) { struct kstat stat; @@ -572,7 +585,7 @@ static int ovl_dir_read_impure(const struct path *path, struct list_head *list, list_for_each_entry_safe(p, n, list, l_node) { if (strcmp(p->name, ".") != 0 && strcmp(p->name, "..") != 0) { - err = ovl_cache_update_ino(path, p); + err = ovl_cache_update(path, p, true); if (err) return err; } @@ -778,13 +791,13 @@ static int ovl_iterate(struct file *file, struct dir_context *ctx) while (od->cursor != &od->cache->entries) { p = list_entry(od->cursor, struct ovl_cache_entry, l_node); if (!p->is_whiteout) { - if (!p->ino) { - err = ovl_cache_update_ino(&file->f_path, p); + if (!p->ino || p->check_xwhiteout) { + err = ovl_cache_update(&file->f_path, p, !p->ino); if (err) goto out; } } - /* ovl_cache_update_ino() sets is_whiteout on stale entry */ + /* ovl_cache_update() sets is_whiteout on stale entry */ if (!p->is_whiteout) { if (!dir_emit(ctx, p->name, p->len, p->ino, p->type)) break; diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c index cc8977498c48..a0967bb25003 100644 --- a/fs/overlayfs/super.c +++ b/fs/overlayfs/super.c @@ -32,15 +32,24 @@ static struct dentry *ovl_d_real(struct dentry *dentry, const struct inode *inode) { struct dentry *real = NULL, *lower; + int err; - /* It's an overlay file */ + /* + * vfs is only expected to call d_real() with NULL from d_real_inode() + * and with overlay inode from file_dentry() on an overlay file. + * + * TODO: remove @inode argument from d_real() API, remove code in this + * function that deals with non-NULL @inode and remove d_real() call + * from file_dentry(). + */ if (inode && d_inode(dentry) == inode) return dentry; + else if (inode) + goto bug; if (!d_is_reg(dentry)) { - if (!inode || inode == d_inode(dentry)) - return dentry; - goto bug; + /* d_real_inode() is only relevant for regular files */ + return dentry; } real = ovl_dentry_upper(dentry); @@ -58,7 +67,9 @@ static struct dentry *ovl_d_real(struct dentry *dentry, * uprobes on offset within the file, so lowerdata should be available * when setting the uprobe. */ - ovl_maybe_lookup_lowerdata(dentry); + err = ovl_verify_lowerdata(dentry); + if (err) + goto bug; lower = ovl_dentry_lowerdata(dentry); if (!lower) goto bug; @@ -101,8 +112,8 @@ static int ovl_revalidate_real(struct dentry *d, unsigned int flags, bool weak) static int ovl_dentry_revalidate_common(struct dentry *dentry, unsigned int flags, bool weak) { - struct ovl_entry *oe = OVL_E(dentry); - struct ovl_path *lowerstack = ovl_lowerstack(oe); + struct ovl_entry *oe; + struct ovl_path *lowerstack; struct inode *inode = d_inode_rcu(dentry); struct dentry *upper; unsigned int i; @@ -112,6 +123,8 @@ static int ovl_dentry_revalidate_common(struct dentry *dentry, if (!inode) return -ECHILD; + oe = OVL_I_E(inode); + lowerstack = ovl_lowerstack(oe); upper = ovl_i_dentry_upper(inode); if (upper) ret = ovl_revalidate_real(upper, flags, weak); @@ -164,6 +177,7 @@ static void ovl_free_inode(struct inode *inode) struct ovl_inode *oi = OVL_I(inode); kfree(oi->redirect); + kfree(oi->oe); mutex_destroy(&oi->lock); kmem_cache_free(ovl_inode_cachep, oi); } @@ -173,7 +187,7 @@ static void ovl_destroy_inode(struct inode *inode) struct ovl_inode *oi = OVL_I(inode); dput(oi->__upperdentry); - ovl_free_entry(oi->oe); + ovl_stack_put(ovl_lowerstack(oi->oe), ovl_numlower(oi->oe)); if (S_ISDIR(inode->i_mode)) ovl_dir_cache_free(inode); else @@ -182,7 +196,7 @@ static void ovl_destroy_inode(struct inode *inode) static void ovl_put_super(struct super_block *sb) { - struct ovl_fs *ofs = sb->s_fs_info; + struct ovl_fs *ofs = OVL_FS(sb); if (ofs) ovl_free_fs(ofs); @@ -191,7 +205,7 @@ static void ovl_put_super(struct super_block *sb) /* Sync real dirty inodes in upper filesystem (if it exists) */ static int ovl_sync_fs(struct super_block *sb, int wait) { - struct ovl_fs *ofs = sb->s_fs_info; + struct ovl_fs *ofs = OVL_FS(sb); struct super_block *upper_sb; int ret; @@ -239,8 +253,9 @@ static int ovl_sync_fs(struct super_block *sb, int wait) */ static int ovl_statfs(struct dentry *dentry, struct kstatfs *buf) { - struct ovl_fs *ofs = dentry->d_sb->s_fs_info; - struct dentry *root_dentry = dentry->d_sb->s_root; + struct super_block *sb = dentry->d_sb; + struct ovl_fs *ofs = OVL_FS(sb); + struct dentry *root_dentry = sb->s_root; struct path path; int err; @@ -250,6 +265,8 @@ static int ovl_statfs(struct dentry *dentry, struct kstatfs *buf) if (!err) { buf->f_namelen = ofs->namelen; buf->f_type = OVERLAYFS_SUPER_MAGIC; + if (ovl_has_fsid(ofs)) + buf->f_fsid = uuid_to_fsid(sb->s_uuid.b); } return err; @@ -397,6 +414,7 @@ static int ovl_lower_dir(const char *name, struct path *path, pr_warn("fs on '%s' does not support file handles, falling back to index=off,nfs_export=off.\n", name); } + ofs->nofh |= !fh_type; /* * Decoding origin file handle is required for persistent st_ino. * Without persistent st_ino, xino=auto falls back to xino=off. @@ -427,68 +445,6 @@ static bool ovl_workdir_ok(struct dentry *workdir, struct dentry *upperdir) return ok; } -static int ovl_own_xattr_get(const struct xattr_handler *handler, - struct dentry *dentry, struct inode *inode, - const char *name, void *buffer, size_t size) -{ - return -EOPNOTSUPP; -} - -static int ovl_own_xattr_set(const struct xattr_handler *handler, - struct mnt_idmap *idmap, - struct dentry *dentry, struct inode *inode, - const char *name, const void *value, - size_t size, int flags) -{ - return -EOPNOTSUPP; -} - -static int ovl_other_xattr_get(const struct xattr_handler *handler, - struct dentry *dentry, struct inode *inode, - const char *name, void *buffer, size_t size) -{ - return ovl_xattr_get(dentry, inode, name, buffer, size); -} - -static int ovl_other_xattr_set(const struct xattr_handler *handler, - struct mnt_idmap *idmap, - struct dentry *dentry, struct inode *inode, - const char *name, const void *value, - size_t size, int flags) -{ - return ovl_xattr_set(dentry, inode, name, value, size, flags); -} - -static const struct xattr_handler ovl_own_trusted_xattr_handler = { - .prefix = OVL_XATTR_TRUSTED_PREFIX, - .get = ovl_own_xattr_get, - .set = ovl_own_xattr_set, -}; - -static const struct xattr_handler ovl_own_user_xattr_handler = { - .prefix = OVL_XATTR_USER_PREFIX, - .get = ovl_own_xattr_get, - .set = ovl_own_xattr_set, -}; - -static const struct xattr_handler ovl_other_xattr_handler = { - .prefix = "", /* catch all */ - .get = ovl_other_xattr_get, - .set = ovl_other_xattr_set, -}; - -static const struct xattr_handler *ovl_trusted_xattr_handlers[] = { - &ovl_own_trusted_xattr_handler, - &ovl_other_xattr_handler, - NULL -}; - -static const struct xattr_handler *ovl_user_xattr_handlers[] = { - &ovl_own_user_xattr_handler, - &ovl_other_xattr_handler, - NULL -}; - static int ovl_setup_trap(struct super_block *sb, struct dentry *dir, struct inode **ptrap, const char *name) { @@ -562,11 +518,6 @@ static int ovl_get_upper(struct super_block *sb, struct ovl_fs *ofs, upper_layer->idx = 0; upper_layer->fsid = 0; - err = -ENOMEM; - upper_layer->name = kstrdup(ofs->config.upperdir, GFP_KERNEL); - if (!upper_layer->name) - goto out; - /* * Inherit SB_NOSEC flag from upperdir. * @@ -634,7 +585,7 @@ static int ovl_check_rename_whiteout(struct ovl_fs *ofs) if (IS_ERR(whiteout)) goto cleanup_temp; - err = ovl_is_whiteout(whiteout); + err = ovl_upper_is_whiteout(ofs, whiteout); /* Best effort cleanup of whiteout and temp file */ if (err) @@ -770,6 +721,10 @@ static int ovl_make_workdir(struct super_block *sb, struct ovl_fs *ofs, ofs->config.index = false; pr_warn("...falling back to index=off.\n"); } + if (ovl_has_fsid(ofs)) { + ofs->config.uuid = OVL_UUID_NULL; + pr_warn("...falling back to uuid=null.\n"); + } /* * xattr support is required for persistent st_ino. * Without persistent st_ino, xino=auto falls back to xino=off. @@ -815,6 +770,7 @@ static int ovl_make_workdir(struct super_block *sb, struct ovl_fs *ofs, ofs->config.index = false; pr_warn("upper fs does not support file handles, falling back to index=off.\n"); } + ofs->nofh |= !fh_type; /* Check if upper fs has 32bit inode numbers */ if (fh_type != FILEID_INO32_GEN) @@ -869,15 +825,20 @@ static int ovl_get_indexdir(struct super_block *sb, struct ovl_fs *ofs, { struct vfsmount *mnt = ovl_upper_mnt(ofs); struct dentry *indexdir; + struct dentry *origin = ovl_lowerstack(oe)->dentry; + const struct ovl_fh *fh; int err; + fh = ovl_get_origin_fh(ofs, origin); + if (IS_ERR(fh)) + return PTR_ERR(fh); + err = mnt_want_write(mnt); if (err) - return err; + goto out_free_fh; /* Verify lower root is upper root origin */ - err = ovl_verify_origin(ofs, upperpath->dentry, - ovl_lowerstack(oe)->dentry, true); + err = ovl_verify_origin_fh(ofs, upperpath->dentry, fh, true); if (err) { pr_err("failed to verify upper root origin\n"); goto out; @@ -909,9 +870,10 @@ static int ovl_get_indexdir(struct super_block *sb, struct ovl_fs *ofs, * directory entries. */ if (ovl_check_origin_xattr(ofs, ofs->indexdir)) { - err = ovl_verify_set_fh(ofs, ofs->indexdir, - OVL_XATTR_ORIGIN, - upperpath->dentry, true, false); + err = ovl_verify_origin_xattr(ofs, ofs->indexdir, + OVL_XATTR_ORIGIN, + upperpath->dentry, true, + false); if (err) pr_err("failed to verify index dir 'origin' xattr\n"); } @@ -929,6 +891,8 @@ static int ovl_get_indexdir(struct super_block *sb, struct ovl_fs *ofs, out: mnt_drop_write(mnt); +out_free_fh: + kfree(fh); return err; } @@ -1110,7 +1074,8 @@ static int ovl_get_layers(struct super_block *sb, struct ovl_fs *ofs, layers[ofs->numlayer].idx = ofs->numlayer; layers[ofs->numlayer].fsid = fsid; layers[ofs->numlayer].fs = &ofs->fs[fsid]; - layers[ofs->numlayer].name = l->name; + /* Store for printing lowerdir=... in ovl_show_options() */ + ofs->config.lowerdirs[ofs->numlayer] = l->name; l->name = NULL; ofs->numlayer++; ofs->fs[fsid].is_lower = true; @@ -1355,8 +1320,19 @@ int ovl_fill_super(struct super_block *sb, struct fs_context *fc) if (!layers) goto out_err; + ofs->config.lowerdirs = kcalloc(ctx->nr + 1, sizeof(char *), GFP_KERNEL); + if (!ofs->config.lowerdirs) { + kfree(layers); + goto out_err; + } ofs->layers = layers; - /* Layer 0 is reserved for upper even if there's no upper */ + /* + * Layer 0 is reserved for upper even if there's no upper. + * config.lowerdirs[0] is used for storing the user provided colon + * separated lowerdir string. + */ + ofs->config.lowerdirs[0] = ctx->lowerdir_all; + ctx->lowerdir_all = NULL; ofs->numlayer = 1; sb->s_stack_depth = 0; @@ -1416,9 +1392,12 @@ int ovl_fill_super(struct super_block *sb, struct fs_context *fc) if (!ovl_upper_mnt(ofs)) sb->s_flags |= SB_RDONLY; - if (!ofs->config.uuid && ofs->numfs > 1) { - pr_warn("The uuid=off requires a single fs for lower and upper, falling back to uuid=on.\n"); - ofs->config.uuid = true; + if (!ovl_origin_uuid(ofs) && ofs->numfs > 1) { + pr_warn("The uuid=off requires a single fs for lower and upper, falling back to uuid=null.\n"); + ofs->config.uuid = OVL_UUID_NULL; + } else if (ovl_has_fsid(ofs) && ovl_upper_mnt(ofs)) { + /* Use per instance persistent uuid/fsid */ + ovl_init_uuid_xattr(sb, ofs, &ctx->upper); } if (!ovl_force_readonly(ofs) && ofs->config.index) { @@ -1449,18 +1428,32 @@ int ovl_fill_super(struct super_block *sb, struct fs_context *fc) ofs->config.nfs_export = false; } + /* + * Support encoding decodable file handles with nfs_export=on + * and encoding non-decodable file handles with nfs_export=off + * if all layers support file handles. + */ if (ofs->config.nfs_export) sb->s_export_op = &ovl_export_operations; + else if (!ofs->nofh) + sb->s_export_op = &ovl_export_fid_operations; /* Never override disk quota limits or use reserved space */ cap_lower(cred->cap_effective, CAP_SYS_RESOURCE); sb->s_magic = OVERLAYFS_SUPER_MAGIC; - sb->s_xattr = ofs->config.userxattr ? ovl_user_xattr_handlers : - ovl_trusted_xattr_handlers; + sb->s_xattr = ovl_xattr_handlers(ofs); sb->s_fs_info = ofs; +#ifdef CONFIG_FS_POSIX_ACL sb->s_flags |= SB_POSIXACL; - sb->s_iflags |= SB_I_SKIP_SYNC | SB_I_IMA_UNVERIFIABLE_SIGNATURE; +#endif + sb->s_iflags |= SB_I_SKIP_SYNC; + /* + * Ensure that umask handling is done by the filesystems used + * for the the upper layer instead of overlayfs as that would + * lead to unexpected results. + */ + sb->s_iflags |= SB_I_NOUMASK; err = -ENOMEM; root_dentry = ovl_get_root(sb, ctx->upper.dentry, oe); @@ -1479,7 +1472,7 @@ out_err: return err; } -static struct file_system_type ovl_fs_type = { +struct file_system_type ovl_fs_type = { .owner = THIS_MODULE, .name = "overlay", .init_fs_context = ovl_init_fs_context, diff --git a/fs/overlayfs/util.c b/fs/overlayfs/util.c index 7ef9e13c404a..c3f020ca13a8 100644 --- a/fs/overlayfs/util.c +++ b/fs/overlayfs/util.c @@ -10,33 +10,60 @@ #include <linux/cred.h> #include <linux/xattr.h> #include <linux/exportfs.h> +#include <linux/file.h> #include <linux/fileattr.h> #include <linux/uuid.h> #include <linux/namei.h> #include <linux/ratelimit.h> #include "overlayfs.h" +/* Get write access to upper mnt - may fail if upper sb was remounted ro */ +int ovl_get_write_access(struct dentry *dentry) +{ + struct ovl_fs *ofs = OVL_FS(dentry->d_sb); + return mnt_get_write_access(ovl_upper_mnt(ofs)); +} + +/* Get write access to upper sb - may block if upper sb is frozen */ +void ovl_start_write(struct dentry *dentry) +{ + struct ovl_fs *ofs = OVL_FS(dentry->d_sb); + sb_start_write(ovl_upper_mnt(ofs)->mnt_sb); +} + int ovl_want_write(struct dentry *dentry) { - struct ovl_fs *ofs = dentry->d_sb->s_fs_info; + struct ovl_fs *ofs = OVL_FS(dentry->d_sb); return mnt_want_write(ovl_upper_mnt(ofs)); } +void ovl_put_write_access(struct dentry *dentry) +{ + struct ovl_fs *ofs = OVL_FS(dentry->d_sb); + mnt_put_write_access(ovl_upper_mnt(ofs)); +} + +void ovl_end_write(struct dentry *dentry) +{ + struct ovl_fs *ofs = OVL_FS(dentry->d_sb); + sb_end_write(ovl_upper_mnt(ofs)->mnt_sb); +} + void ovl_drop_write(struct dentry *dentry) { - struct ovl_fs *ofs = dentry->d_sb->s_fs_info; + struct ovl_fs *ofs = OVL_FS(dentry->d_sb); mnt_drop_write(ovl_upper_mnt(ofs)); } struct dentry *ovl_workdir(struct dentry *dentry) { - struct ovl_fs *ofs = dentry->d_sb->s_fs_info; + struct ovl_fs *ofs = OVL_FS(dentry->d_sb); return ofs->workdir; } const struct cred *ovl_override_creds(struct super_block *sb) { - struct ovl_fs *ofs = sb->s_fs_info; + struct ovl_fs *ofs = OVL_FS(sb); return override_creds(ofs->creator_cred); } @@ -54,7 +81,7 @@ int ovl_can_decode_fh(struct super_block *sb) if (!capable(CAP_DAC_READ_SEARCH)) return 0; - if (!sb->s_export_op || !sb->s_export_op->fh_to_dentry) + if (!exportfs_can_decode_fh(sb->s_export_op)) return 0; return sb->s_export_op->encode_fh ? -1 : FILEID_INO32_GEN; @@ -62,7 +89,7 @@ int ovl_can_decode_fh(struct super_block *sb) struct dentry *ovl_indexdir(struct super_block *sb) { - struct ovl_fs *ofs = sb->s_fs_info; + struct ovl_fs *ofs = OVL_FS(sb); return ofs->indexdir; } @@ -70,7 +97,7 @@ struct dentry *ovl_indexdir(struct super_block *sb) /* Index all files on copy up. For now only enabled for NFS export */ bool ovl_index_all(struct super_block *sb) { - struct ovl_fs *ofs = sb->s_fs_info; + struct ovl_fs *ofs = OVL_FS(sb); return ofs->config.nfs_export && ofs->config.index; } @@ -78,7 +105,7 @@ bool ovl_index_all(struct super_block *sb) /* Verify lower origin on lookup. For now only enabled for NFS export */ bool ovl_verify_lower(struct super_block *sb) { - struct ovl_fs *ofs = sb->s_fs_info; + struct ovl_fs *ofs = OVL_FS(sb); return ofs->config.nfs_export && ofs->config.index; } @@ -203,7 +230,7 @@ enum ovl_path_type ovl_path_type(struct dentry *dentry) void ovl_path_upper(struct dentry *dentry, struct path *path) { - struct ovl_fs *ofs = dentry->d_sb->s_fs_info; + struct ovl_fs *ofs = OVL_FS(dentry->d_sb); path->mnt = ovl_upper_mnt(ofs); path->dentry = ovl_dentry_upper(dentry); @@ -574,6 +601,16 @@ bool ovl_is_whiteout(struct dentry *dentry) return inode && IS_WHITEOUT(inode); } +/* + * Use this over ovl_is_whiteout for upper and lower files, as it also + * handles overlay.whiteout xattr whiteout files. + */ +bool ovl_path_is_whiteout(struct ovl_fs *ofs, const struct path *path) +{ + return ovl_is_whiteout(path->dentry) || + ovl_path_check_xwhiteout_xattr(ofs, path); +} + struct file *ovl_path_open(const struct path *path, int flags) { struct inode *inode = d_inode(path->dentry); @@ -643,22 +680,36 @@ bool ovl_already_copied_up(struct dentry *dentry, int flags) return false; } +/* + * The copy up "transaction" keeps an elevated mnt write count on upper mnt, + * but leaves taking freeze protection on upper sb to lower level helpers. + */ int ovl_copy_up_start(struct dentry *dentry, int flags) { struct inode *inode = d_inode(dentry); int err; err = ovl_inode_lock_interruptible(inode); - if (!err && ovl_already_copied_up_locked(dentry, flags)) { + if (err) + return err; + + if (ovl_already_copied_up_locked(dentry, flags)) err = 1; /* Already copied up */ - ovl_inode_unlock(inode); - } + else + err = ovl_get_write_access(dentry); + if (err) + goto out_unlock; + return 0; + +out_unlock: + ovl_inode_unlock(inode); return err; } void ovl_copy_up_end(struct dentry *dentry) { + ovl_put_write_access(dentry); ovl_inode_unlock(d_inode(dentry)); } @@ -675,6 +726,91 @@ bool ovl_path_check_origin_xattr(struct ovl_fs *ofs, const struct path *path) return false; } +bool ovl_path_check_xwhiteout_xattr(struct ovl_fs *ofs, const struct path *path) +{ + struct dentry *dentry = path->dentry; + int res; + + /* xattr.whiteout must be a zero size regular file */ + if (!d_is_reg(dentry) || i_size_read(d_inode(dentry)) != 0) + return false; + + res = ovl_path_getxattr(ofs, path, OVL_XATTR_XWHITEOUT, NULL, 0); + return res >= 0; +} + +bool ovl_path_check_xwhiteouts_xattr(struct ovl_fs *ofs, const struct path *path) +{ + struct dentry *dentry = path->dentry; + int res; + + /* xattr.whiteouts must be a directory */ + if (!d_is_dir(dentry)) + return false; + + res = ovl_path_getxattr(ofs, path, OVL_XATTR_XWHITEOUTS, NULL, 0); + return res >= 0; +} + +/* + * Load persistent uuid from xattr into s_uuid if found, or store a new + * random generated value in s_uuid and in xattr. + */ +bool ovl_init_uuid_xattr(struct super_block *sb, struct ovl_fs *ofs, + const struct path *upperpath) +{ + bool set = false; + int res; + + /* Try to load existing persistent uuid */ + res = ovl_path_getxattr(ofs, upperpath, OVL_XATTR_UUID, sb->s_uuid.b, + UUID_SIZE); + if (res == UUID_SIZE) + return true; + + if (res != -ENODATA) + goto fail; + + /* + * With uuid=auto, if uuid xattr is found, it will be used. + * If uuid xattrs is not found, generate a persistent uuid only on mount + * of new overlays where upper root dir is not yet marked as impure. + * An upper dir is marked as impure on copy up or lookup of its subdirs. + */ + if (ofs->config.uuid == OVL_UUID_AUTO) { + res = ovl_path_getxattr(ofs, upperpath, OVL_XATTR_IMPURE, NULL, + 0); + if (res > 0) { + /* Any mount of old overlay - downgrade to uuid=null */ + ofs->config.uuid = OVL_UUID_NULL; + return true; + } else if (res == -ENODATA) { + /* First mount of new overlay - upgrade to uuid=on */ + ofs->config.uuid = OVL_UUID_ON; + } else if (res < 0) { + goto fail; + } + + } + + /* Generate overlay instance uuid */ + uuid_gen(&sb->s_uuid); + + /* Try to store persistent uuid */ + set = true; + res = ovl_setxattr(ofs, upperpath->dentry, OVL_XATTR_UUID, sb->s_uuid.b, + UUID_SIZE); + if (res == 0) + return true; + +fail: + memset(sb->s_uuid.b, 0, UUID_SIZE); + ofs->config.uuid = OVL_UUID_NULL; + pr_warn("failed to %s uuid (%pd2, err=%i); falling back to uuid=null.\n", + set ? "set" : "get", upperpath->dentry, res); + return false; +} + bool ovl_path_check_dir_xattr(struct ovl_fs *ofs, const struct path *path, enum ovl_xattr ox) { @@ -697,8 +833,11 @@ bool ovl_path_check_dir_xattr(struct ovl_fs *ofs, const struct path *path, #define OVL_XATTR_IMPURE_POSTFIX "impure" #define OVL_XATTR_NLINK_POSTFIX "nlink" #define OVL_XATTR_UPPER_POSTFIX "upper" +#define OVL_XATTR_UUID_POSTFIX "uuid" #define OVL_XATTR_METACOPY_POSTFIX "metacopy" #define OVL_XATTR_PROTATTR_POSTFIX "protattr" +#define OVL_XATTR_XWHITEOUT_POSTFIX "whiteout" +#define OVL_XATTR_XWHITEOUTS_POSTFIX "whiteouts" #define OVL_XATTR_TAB_ENTRY(x) \ [x] = { [false] = OVL_XATTR_TRUSTED_PREFIX x ## _POSTFIX, \ @@ -711,8 +850,11 @@ const char *const ovl_xattr_table[][2] = { OVL_XATTR_TAB_ENTRY(OVL_XATTR_IMPURE), OVL_XATTR_TAB_ENTRY(OVL_XATTR_NLINK), OVL_XATTR_TAB_ENTRY(OVL_XATTR_UPPER), + OVL_XATTR_TAB_ENTRY(OVL_XATTR_UUID), OVL_XATTR_TAB_ENTRY(OVL_XATTR_METACOPY), OVL_XATTR_TAB_ENTRY(OVL_XATTR_PROTATTR), + OVL_XATTR_TAB_ENTRY(OVL_XATTR_XWHITEOUT), + OVL_XATTR_TAB_ENTRY(OVL_XATTR_XWHITEOUTS), }; int ovl_check_setxattr(struct ovl_fs *ofs, struct dentry *upperdentry, @@ -836,7 +978,7 @@ int ovl_set_protattr(struct inode *inode, struct dentry *upper, return 0; } -/** +/* * Caller must hold a reference to inode to prevent it from being freed while * it is marked inuse. */ @@ -911,12 +1053,18 @@ static void ovl_cleanup_index(struct dentry *dentry) struct dentry *index = NULL; struct inode *inode; struct qstr name = { }; + bool got_write = false; int err; err = ovl_get_index_name(ofs, lowerdentry, &name); if (err) goto fail; + err = ovl_want_write(dentry); + if (err) + goto fail; + + got_write = true; inode = d_inode(upperdentry); if (!S_ISDIR(inode->i_mode) && inode->i_nlink != 1) { pr_warn_ratelimited("cleanup linked index (%pd2, ino=%lu, nlink=%u)\n", @@ -954,6 +1102,8 @@ static void ovl_cleanup_index(struct dentry *dentry) goto fail; out: + if (got_write) + ovl_drop_write(dentry); kfree(name.name); dput(index); return; @@ -1000,8 +1150,12 @@ int ovl_nlink_start(struct dentry *dentry) if (err) return err; + err = ovl_want_write(dentry); + if (err) + goto out_unlock; + if (d_is_dir(dentry) || !ovl_test_flag(OVL_INDEX, inode)) - goto out; + return 0; old_cred = ovl_override_creds(dentry->d_sb); /* @@ -1012,10 +1166,15 @@ int ovl_nlink_start(struct dentry *dentry) */ err = ovl_set_nlink_upper(dentry); revert_creds(old_cred); - -out: if (err) - ovl_inode_unlock(inode); + goto out_drop_write; + + return 0; + +out_drop_write: + ovl_drop_write(dentry); +out_unlock: + ovl_inode_unlock(inode); return err; } @@ -1024,6 +1183,8 @@ void ovl_nlink_end(struct dentry *dentry) { struct inode *inode = d_inode(dentry); + ovl_drop_write(dentry); + if (ovl_test_flag(OVL_INDEX, inode) && inode->i_nlink == 0) { const struct cred *old_cred; @@ -1054,8 +1215,12 @@ err: return -EIO; } -/* err < 0, 0 if no metacopy xattr, 1 if metacopy xattr found */ -int ovl_check_metacopy_xattr(struct ovl_fs *ofs, const struct path *path) +/* + * err < 0, 0 if no metacopy xattr, metacopy data size if xattr found. + * an empty xattr returns OVL_METACOPY_MIN_SIZE to distinguish from no xattr value. + */ +int ovl_check_metacopy_xattr(struct ovl_fs *ofs, const struct path *path, + struct ovl_metacopy *data) { int res; @@ -1063,7 +1228,8 @@ int ovl_check_metacopy_xattr(struct ovl_fs *ofs, const struct path *path) if (!S_ISREG(d_inode(path->dentry)->i_mode)) return 0; - res = ovl_path_getxattr(ofs, path, OVL_XATTR_METACOPY, NULL, 0); + res = ovl_path_getxattr(ofs, path, OVL_XATTR_METACOPY, + data, data ? OVL_METACOPY_MAX_SIZE : 0); if (res < 0) { if (res == -ENODATA || res == -EOPNOTSUPP) return 0; @@ -1077,12 +1243,48 @@ int ovl_check_metacopy_xattr(struct ovl_fs *ofs, const struct path *path) goto out; } - return 1; + if (res == 0) { + /* Emulate empty data for zero size metacopy xattr */ + res = OVL_METACOPY_MIN_SIZE; + if (data) { + memset(data, 0, res); + data->len = res; + } + } else if (res < OVL_METACOPY_MIN_SIZE) { + pr_warn_ratelimited("metacopy file '%pd' has too small xattr\n", + path->dentry); + return -EIO; + } else if (data) { + if (data->version != 0) { + pr_warn_ratelimited("metacopy file '%pd' has unsupported version\n", + path->dentry); + return -EIO; + } + if (res != data->len) { + pr_warn_ratelimited("metacopy file '%pd' has invalid xattr size\n", + path->dentry); + return -EIO; + } + } + + return res; out: pr_warn_ratelimited("failed to get metacopy (%i)\n", res); return res; } +int ovl_set_metacopy_xattr(struct ovl_fs *ofs, struct dentry *d, struct ovl_metacopy *metacopy) +{ + size_t len = metacopy->len; + + /* If no flags or digest fall back to empty metacopy file */ + if (metacopy->version == 0 && metacopy->flags == 0 && metacopy->digest_algo == 0) + len = 0; + + return ovl_check_setxattr(ofs, d, OVL_XATTR_METACOPY, + metacopy, len, -EOPNOTSUPP); +} + bool ovl_is_metacopy_dentry(struct dentry *dentry) { struct ovl_entry *oe = OVL_E(dentry); @@ -1145,6 +1347,112 @@ err_free: return ERR_PTR(res); } +/* Call with mounter creds as it may open the file */ +int ovl_ensure_verity_loaded(struct path *datapath) +{ + struct inode *inode = d_inode(datapath->dentry); + struct file *filp; + + if (!fsverity_active(inode) && IS_VERITY(inode)) { + /* + * If this inode was not yet opened, the verity info hasn't been + * loaded yet, so we need to do that here to force it into memory. + */ + filp = kernel_file_open(datapath, O_RDONLY, inode, current_cred()); + if (IS_ERR(filp)) + return PTR_ERR(filp); + fput(filp); + } + + return 0; +} + +int ovl_validate_verity(struct ovl_fs *ofs, + struct path *metapath, + struct path *datapath) +{ + struct ovl_metacopy metacopy_data; + u8 actual_digest[FS_VERITY_MAX_DIGEST_SIZE]; + int xattr_digest_size, digest_size; + int xattr_size, err; + u8 verity_algo; + + if (!ofs->config.verity_mode || + /* Verity only works on regular files */ + !S_ISREG(d_inode(metapath->dentry)->i_mode)) + return 0; + + xattr_size = ovl_check_metacopy_xattr(ofs, metapath, &metacopy_data); + if (xattr_size < 0) + return xattr_size; + + if (!xattr_size || !metacopy_data.digest_algo) { + if (ofs->config.verity_mode == OVL_VERITY_REQUIRE) { + pr_warn_ratelimited("metacopy file '%pd' has no digest specified\n", + metapath->dentry); + return -EIO; + } + return 0; + } + + xattr_digest_size = ovl_metadata_digest_size(&metacopy_data); + + err = ovl_ensure_verity_loaded(datapath); + if (err < 0) { + pr_warn_ratelimited("lower file '%pd' failed to load fs-verity info\n", + datapath->dentry); + return -EIO; + } + + digest_size = fsverity_get_digest(d_inode(datapath->dentry), actual_digest, + &verity_algo, NULL); + if (digest_size == 0) { + pr_warn_ratelimited("lower file '%pd' has no fs-verity digest\n", datapath->dentry); + return -EIO; + } + + if (xattr_digest_size != digest_size || + metacopy_data.digest_algo != verity_algo || + memcmp(metacopy_data.digest, actual_digest, xattr_digest_size) != 0) { + pr_warn_ratelimited("lower file '%pd' has the wrong fs-verity digest\n", + datapath->dentry); + return -EIO; + } + + return 0; +} + +int ovl_get_verity_digest(struct ovl_fs *ofs, struct path *src, + struct ovl_metacopy *metacopy) +{ + int err, digest_size; + + if (!ofs->config.verity_mode || !S_ISREG(d_inode(src->dentry)->i_mode)) + return 0; + + err = ovl_ensure_verity_loaded(src); + if (err < 0) { + pr_warn_ratelimited("lower file '%pd' failed to load fs-verity info\n", + src->dentry); + return -EIO; + } + + digest_size = fsverity_get_digest(d_inode(src->dentry), + metacopy->digest, &metacopy->digest_algo, NULL); + if (digest_size == 0 || + WARN_ON_ONCE(digest_size > FS_VERITY_MAX_DIGEST_SIZE)) { + if (ofs->config.verity_mode == OVL_VERITY_REQUIRE) { + pr_warn_ratelimited("lower file '%pd' has no fs-verity digest\n", + src->dentry); + return -EIO; + } + return 0; + } + + metacopy->len += digest_size; + return 0; +} + /* * ovl_sync_status() - Check fs sync status for volatile mounts * @@ -1194,14 +1502,16 @@ void ovl_copyattr(struct inode *inode) realinode = ovl_i_path_real(inode, &realpath); real_idmap = mnt_idmap(realpath.mnt); + spin_lock(&inode->i_lock); vfsuid = i_uid_into_vfsuid(real_idmap, realinode); vfsgid = i_gid_into_vfsgid(real_idmap, realinode); inode->i_uid = vfsuid_into_kuid(vfsuid); inode->i_gid = vfsgid_into_kgid(vfsgid); inode->i_mode = realinode->i_mode; - inode->i_atime = realinode->i_atime; - inode->i_mtime = realinode->i_mtime; - inode->i_ctime = realinode->i_ctime; + inode_set_atime_to_ts(inode, inode_get_atime(realinode)); + inode_set_mtime_to_ts(inode, inode_get_mtime(realinode)); + inode_set_ctime_to_ts(inode, inode_get_ctime(realinode)); i_size_write(inode, i_size_read(realinode)); + spin_unlock(&inode->i_lock); } diff --git a/fs/overlayfs/xattrs.c b/fs/overlayfs/xattrs.c new file mode 100644 index 000000000000..383978e4663c --- /dev/null +++ b/fs/overlayfs/xattrs.c @@ -0,0 +1,271 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#include <linux/fs.h> +#include <linux/xattr.h> +#include "overlayfs.h" + +static bool ovl_is_escaped_xattr(struct super_block *sb, const char *name) +{ + struct ovl_fs *ofs = sb->s_fs_info; + + if (ofs->config.userxattr) + return strncmp(name, OVL_XATTR_ESCAPE_USER_PREFIX, + OVL_XATTR_ESCAPE_USER_PREFIX_LEN) == 0; + else + return strncmp(name, OVL_XATTR_ESCAPE_TRUSTED_PREFIX, + OVL_XATTR_ESCAPE_TRUSTED_PREFIX_LEN - 1) == 0; +} + +static bool ovl_is_own_xattr(struct super_block *sb, const char *name) +{ + struct ovl_fs *ofs = OVL_FS(sb); + + if (ofs->config.userxattr) + return strncmp(name, OVL_XATTR_USER_PREFIX, + OVL_XATTR_USER_PREFIX_LEN) == 0; + else + return strncmp(name, OVL_XATTR_TRUSTED_PREFIX, + OVL_XATTR_TRUSTED_PREFIX_LEN) == 0; +} + +bool ovl_is_private_xattr(struct super_block *sb, const char *name) +{ + return ovl_is_own_xattr(sb, name) && !ovl_is_escaped_xattr(sb, name); +} + +static int ovl_xattr_set(struct dentry *dentry, struct inode *inode, const char *name, + const void *value, size_t size, int flags) +{ + int err; + struct ovl_fs *ofs = OVL_FS(dentry->d_sb); + struct dentry *upperdentry = ovl_i_dentry_upper(inode); + struct dentry *realdentry = upperdentry ?: ovl_dentry_lower(dentry); + struct path realpath; + const struct cred *old_cred; + + if (!value && !upperdentry) { + ovl_path_lower(dentry, &realpath); + old_cred = ovl_override_creds(dentry->d_sb); + err = vfs_getxattr(mnt_idmap(realpath.mnt), realdentry, name, NULL, 0); + revert_creds(old_cred); + if (err < 0) + goto out; + } + + if (!upperdentry) { + err = ovl_copy_up(dentry); + if (err) + goto out; + + realdentry = ovl_dentry_upper(dentry); + } + + err = ovl_want_write(dentry); + if (err) + goto out; + + old_cred = ovl_override_creds(dentry->d_sb); + if (value) { + err = ovl_do_setxattr(ofs, realdentry, name, value, size, + flags); + } else { + WARN_ON(flags != XATTR_REPLACE); + err = ovl_do_removexattr(ofs, realdentry, name); + } + revert_creds(old_cred); + ovl_drop_write(dentry); + + /* copy c/mtime */ + ovl_copyattr(inode); +out: + return err; +} + +static int ovl_xattr_get(struct dentry *dentry, struct inode *inode, const char *name, + void *value, size_t size) +{ + ssize_t res; + const struct cred *old_cred; + struct path realpath; + + ovl_i_path_real(inode, &realpath); + old_cred = ovl_override_creds(dentry->d_sb); + res = vfs_getxattr(mnt_idmap(realpath.mnt), realpath.dentry, name, value, size); + revert_creds(old_cred); + return res; +} + +static bool ovl_can_list(struct super_block *sb, const char *s) +{ + /* Never list private (.overlay) */ + if (ovl_is_private_xattr(sb, s)) + return false; + + /* List all non-trusted xattrs */ + if (strncmp(s, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) != 0) + return true; + + /* list other trusted for superuser only */ + return ns_capable_noaudit(&init_user_ns, CAP_SYS_ADMIN); +} + +ssize_t ovl_listxattr(struct dentry *dentry, char *list, size_t size) +{ + struct dentry *realdentry = ovl_dentry_real(dentry); + struct ovl_fs *ofs = OVL_FS(dentry->d_sb); + ssize_t res; + size_t len; + char *s; + const struct cred *old_cred; + size_t prefix_len, name_len; + + old_cred = ovl_override_creds(dentry->d_sb); + res = vfs_listxattr(realdentry, list, size); + revert_creds(old_cred); + if (res <= 0 || size == 0) + return res; + + prefix_len = ofs->config.userxattr ? + OVL_XATTR_USER_PREFIX_LEN : OVL_XATTR_TRUSTED_PREFIX_LEN; + + /* filter out private xattrs */ + for (s = list, len = res; len;) { + size_t slen = strnlen(s, len) + 1; + + /* underlying fs providing us with an broken xattr list? */ + if (WARN_ON(slen > len)) + return -EIO; + + len -= slen; + if (!ovl_can_list(dentry->d_sb, s)) { + res -= slen; + memmove(s, s + slen, len); + } else if (ovl_is_escaped_xattr(dentry->d_sb, s)) { + res -= OVL_XATTR_ESCAPE_PREFIX_LEN; + name_len = slen - prefix_len - OVL_XATTR_ESCAPE_PREFIX_LEN; + s += prefix_len; + memmove(s, s + OVL_XATTR_ESCAPE_PREFIX_LEN, name_len + len); + s += name_len; + } else { + s += slen; + } + } + + return res; +} + +static char *ovl_xattr_escape_name(const char *prefix, const char *name) +{ + size_t prefix_len = strlen(prefix); + size_t name_len = strlen(name); + size_t escaped_len; + char *escaped, *s; + + escaped_len = prefix_len + OVL_XATTR_ESCAPE_PREFIX_LEN + name_len; + if (escaped_len > XATTR_NAME_MAX) + return ERR_PTR(-EOPNOTSUPP); + + escaped = kmalloc(escaped_len + 1, GFP_KERNEL); + if (escaped == NULL) + return ERR_PTR(-ENOMEM); + + s = escaped; + memcpy(s, prefix, prefix_len); + s += prefix_len; + memcpy(s, OVL_XATTR_ESCAPE_PREFIX, OVL_XATTR_ESCAPE_PREFIX_LEN); + s += OVL_XATTR_ESCAPE_PREFIX_LEN; + memcpy(s, name, name_len + 1); + + return escaped; +} + +static int ovl_own_xattr_get(const struct xattr_handler *handler, + struct dentry *dentry, struct inode *inode, + const char *name, void *buffer, size_t size) +{ + char *escaped; + int r; + + escaped = ovl_xattr_escape_name(handler->prefix, name); + if (IS_ERR(escaped)) + return PTR_ERR(escaped); + + r = ovl_xattr_get(dentry, inode, escaped, buffer, size); + + kfree(escaped); + + return r; +} + +static int ovl_own_xattr_set(const struct xattr_handler *handler, + struct mnt_idmap *idmap, + struct dentry *dentry, struct inode *inode, + const char *name, const void *value, + size_t size, int flags) +{ + char *escaped; + int r; + + escaped = ovl_xattr_escape_name(handler->prefix, name); + if (IS_ERR(escaped)) + return PTR_ERR(escaped); + + r = ovl_xattr_set(dentry, inode, escaped, value, size, flags); + + kfree(escaped); + + return r; +} + +static int ovl_other_xattr_get(const struct xattr_handler *handler, + struct dentry *dentry, struct inode *inode, + const char *name, void *buffer, size_t size) +{ + return ovl_xattr_get(dentry, inode, name, buffer, size); +} + +static int ovl_other_xattr_set(const struct xattr_handler *handler, + struct mnt_idmap *idmap, + struct dentry *dentry, struct inode *inode, + const char *name, const void *value, + size_t size, int flags) +{ + return ovl_xattr_set(dentry, inode, name, value, size, flags); +} + +static const struct xattr_handler ovl_own_trusted_xattr_handler = { + .prefix = OVL_XATTR_TRUSTED_PREFIX, + .get = ovl_own_xattr_get, + .set = ovl_own_xattr_set, +}; + +static const struct xattr_handler ovl_own_user_xattr_handler = { + .prefix = OVL_XATTR_USER_PREFIX, + .get = ovl_own_xattr_get, + .set = ovl_own_xattr_set, +}; + +static const struct xattr_handler ovl_other_xattr_handler = { + .prefix = "", /* catch all */ + .get = ovl_other_xattr_get, + .set = ovl_other_xattr_set, +}; + +static const struct xattr_handler * const ovl_trusted_xattr_handlers[] = { + &ovl_own_trusted_xattr_handler, + &ovl_other_xattr_handler, + NULL +}; + +static const struct xattr_handler * const ovl_user_xattr_handlers[] = { + &ovl_own_user_xattr_handler, + &ovl_other_xattr_handler, + NULL +}; + +const struct xattr_handler * const *ovl_xattr_handlers(struct ovl_fs *ofs) +{ + return ofs->config.userxattr ? ovl_user_xattr_handlers : + ovl_trusted_xattr_handlers; +} + diff --git a/fs/pipe.c b/fs/pipe.c index 2d88f73f585a..804a7d789452 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -227,6 +227,36 @@ static inline bool pipe_readable(const struct pipe_inode_info *pipe) return !pipe_empty(head, tail) || !writers; } +static inline unsigned int pipe_update_tail(struct pipe_inode_info *pipe, + struct pipe_buffer *buf, + unsigned int tail) +{ + pipe_buf_release(pipe, buf); + + /* + * If the pipe has a watch_queue, we need additional protection + * by the spinlock because notifications get posted with only + * this spinlock, no mutex + */ + if (pipe_has_watch_queue(pipe)) { + spin_lock_irq(&pipe->rd_wait.lock); +#ifdef CONFIG_WATCH_QUEUE + if (buf->flags & PIPE_BUF_FLAG_LOSS) + pipe->note_loss = true; +#endif + pipe->tail = ++tail; + spin_unlock_irq(&pipe->rd_wait.lock); + return tail; + } + + /* + * Without a watch_queue, we can simply increment the tail + * without the spinlock - the mutex is enough. + */ + pipe->tail = ++tail; + return tail; +} + static ssize_t pipe_read(struct kiocb *iocb, struct iov_iter *to) { @@ -320,17 +350,8 @@ pipe_read(struct kiocb *iocb, struct iov_iter *to) buf->len = 0; } - if (!buf->len) { - pipe_buf_release(pipe, buf); - spin_lock_irq(&pipe->rd_wait.lock); -#ifdef CONFIG_WATCH_QUEUE - if (buf->flags & PIPE_BUF_FLAG_LOSS) - pipe->note_loss = true; -#endif - tail++; - pipe->tail = tail; - spin_unlock_irq(&pipe->rd_wait.lock); - } + if (!buf->len) + tail = pipe_update_tail(pipe, buf, tail); total_len -= chars; if (!total_len) break; /* common path: read succeeded */ @@ -437,12 +458,10 @@ pipe_write(struct kiocb *iocb, struct iov_iter *from) goto out; } -#ifdef CONFIG_WATCH_QUEUE - if (pipe->watch_queue) { + if (pipe_has_watch_queue(pipe)) { ret = -EXDEV; goto out; } -#endif /* * If it wasn't empty we try to merge new data into @@ -489,7 +508,7 @@ pipe_write(struct kiocb *iocb, struct iov_iter *from) head = pipe->head; if (!pipe_full(head, pipe->tail, pipe->max_usage)) { unsigned int mask = pipe->ring_size - 1; - struct pipe_buffer *buf = &pipe->bufs[head & mask]; + struct pipe_buffer *buf; struct page *page = pipe->tmp_page; int copied; @@ -507,16 +526,7 @@ pipe_write(struct kiocb *iocb, struct iov_iter *from) * it, either the reader will consume it or it'll still * be there for the next write. */ - spin_lock_irq(&pipe->rd_wait.lock); - - head = pipe->head; - if (pipe_full(head, pipe->tail, pipe->max_usage)) { - spin_unlock_irq(&pipe->rd_wait.lock); - continue; - } - pipe->head = head + 1; - spin_unlock_irq(&pipe->rd_wait.lock); /* Insert it into the buffer array */ buf = &pipe->bufs[head & mask]; @@ -537,7 +547,6 @@ pipe_write(struct kiocb *iocb, struct iov_iter *from) break; } ret += copied; - buf->offset = 0; buf->len = copied; if (!iov_iter_count(from)) @@ -855,7 +864,7 @@ void free_pipe_info(struct pipe_inode_info *pipe) kfree(pipe); } -static struct vfsmount *pipe_mnt __read_mostly; +static struct vfsmount *pipe_mnt __ro_after_init; /* * pipefs_dname() is called from d_path(). @@ -899,7 +908,7 @@ static struct inode * get_pipe_inode(void) inode->i_mode = S_IFIFO | S_IRUSR | S_IWUSR; inode->i_uid = current_fsuid(); inode->i_gid = current_fsgid(); - inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode); + simple_inode_init_ts(inode); return inode; @@ -1236,7 +1245,7 @@ const struct file_operations pipefifo_fops = { * Currently we rely on the pipe array holding a power-of-2 number * of pages. Returns 0 on error. */ -unsigned int round_pipe_size(unsigned long size) +unsigned int round_pipe_size(unsigned int size) { if (size > (1U << 31)) return 0; @@ -1319,16 +1328,14 @@ int pipe_resize_ring(struct pipe_inode_info *pipe, unsigned int nr_slots) * Allocate a new array of pipe buffers and copy the info over. Returns the * pipe size if successful, or return -ERROR on error. */ -static long pipe_set_size(struct pipe_inode_info *pipe, unsigned long arg) +static long pipe_set_size(struct pipe_inode_info *pipe, unsigned int arg) { unsigned long user_bufs; unsigned int nr_slots, size; long ret = 0; -#ifdef CONFIG_WATCH_QUEUE - if (pipe->watch_queue) + if (pipe_has_watch_queue(pipe)) return -EBUSY; -#endif size = round_pipe_size(arg); nr_slots = size >> PAGE_SHIFT; @@ -1380,14 +1387,12 @@ struct pipe_inode_info *get_pipe_info(struct file *file, bool for_splice) if (file->f_op != &pipefifo_fops || !pipe) return NULL; -#ifdef CONFIG_WATCH_QUEUE - if (for_splice && pipe->watch_queue) + if (for_splice && pipe_has_watch_queue(pipe)) return NULL; -#endif return pipe; } -long pipe_fcntl(struct file *file, unsigned int cmd, unsigned long arg) +long pipe_fcntl(struct file *file, unsigned int cmd, unsigned int arg) { struct pipe_inode_info *pipe; long ret; diff --git a/fs/posix_acl.c b/fs/posix_acl.c index 7fa1b738bbab..a05fe94970ce 100644 --- a/fs/posix_acl.c +++ b/fs/posix_acl.c @@ -1027,7 +1027,7 @@ int simple_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, return error; } - inode->i_ctime = current_time(inode); + inode_set_ctime_current(inode); if (IS_I_VERSION(inode)) inode_inc_iversion(inode); set_cached_acl(inode, type, acl); diff --git a/fs/proc/array.c b/fs/proc/array.c index d35bbf35a874..ff08a8957552 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -431,6 +431,11 @@ static inline void task_untag_mask(struct seq_file *m, struct mm_struct *mm) seq_printf(m, "untag_mask:\t%#lx\n", mm_untag_mask(mm)); } +__weak void arch_proc_pid_thread_features(struct seq_file *m, + struct task_struct *task) +{ +} + int proc_pid_status(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *task) { @@ -455,6 +460,7 @@ int proc_pid_status(struct seq_file *m, struct pid_namespace *ns, task_cpus_allowed(m, task); cpuset_task_status_allowed(m, task); task_context_switch_counts(m, task); + arch_proc_pid_thread_features(m, task); return 0; } @@ -530,12 +536,13 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns, /* add up live thread stats at the group level */ if (whole) { - struct task_struct *t = task; - do { + struct task_struct *t; + + __for_each_thread(sig, t) { min_flt += t->min_flt; maj_flt += t->maj_flt; gtime += task_gtime(t); - } while_each_thread(task, t); + } min_flt += sig->min_flt; maj_flt += sig->maj_flt; diff --git a/fs/proc/base.c b/fs/proc/base.c index 9df3f4839662..dd31e3b6bf77 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -1153,11 +1153,10 @@ err_unlock: static ssize_t oom_adj_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { - char buffer[PROC_NUMBUF]; + char buffer[PROC_NUMBUF] = {}; int oom_adj; int err; - memset(buffer, 0, sizeof(buffer)); if (count > sizeof(buffer) - 1) count = sizeof(buffer) - 1; if (copy_from_user(buffer, buf, count)) { @@ -1213,11 +1212,10 @@ static ssize_t oom_score_adj_read(struct file *file, char __user *buf, static ssize_t oom_score_adj_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { - char buffer[PROC_NUMBUF]; + char buffer[PROC_NUMBUF] = {}; int oom_score_adj; int err; - memset(buffer, 0, sizeof(buffer)); if (count > sizeof(buffer) - 1) count = sizeof(buffer) - 1; if (copy_from_user(buffer, buf, count)) { @@ -1358,13 +1356,13 @@ static ssize_t proc_fault_inject_write(struct file * file, const char __user * buf, size_t count, loff_t *ppos) { struct task_struct *task; - char buffer[PROC_NUMBUF]; + char buffer[PROC_NUMBUF] = {}; int make_it_fail; int rv; if (!capable(CAP_SYS_RESOURCE)) return -EPERM; - memset(buffer, 0, sizeof(buffer)); + if (count > sizeof(buffer) - 1) count = sizeof(buffer) - 1; if (copy_from_user(buffer, buf, count)) @@ -1509,11 +1507,10 @@ sched_autogroup_write(struct file *file, const char __user *buf, { struct inode *inode = file_inode(file); struct task_struct *p; - char buffer[PROC_NUMBUF]; + char buffer[PROC_NUMBUF] = {}; int nice; int err; - memset(buffer, 0, sizeof(buffer)); if (count > sizeof(buffer) - 1) count = sizeof(buffer) - 1; if (copy_from_user(buffer, buf, count)) @@ -1666,10 +1663,9 @@ static ssize_t comm_write(struct file *file, const char __user *buf, { struct inode *inode = file_inode(file); struct task_struct *p; - char buffer[TASK_COMM_LEN]; + char buffer[TASK_COMM_LEN] = {}; const size_t maxlen = sizeof(buffer) - 1; - memset(buffer, 0, sizeof(buffer)); if (copy_from_user(buffer, buf, count > maxlen ? maxlen : count)) return -EFAULT; @@ -1902,7 +1898,7 @@ struct inode *proc_pid_make_inode(struct super_block *sb, ei = PROC_I(inode); inode->i_mode = mode; inode->i_ino = get_next_ino(); - inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode); + simple_inode_init_ts(inode); inode->i_op = &proc_def_inode_operations; /* @@ -1966,7 +1962,7 @@ int pid_getattr(struct mnt_idmap *idmap, const struct path *path, struct proc_fs_info *fs_info = proc_sb_info(inode->i_sb); struct task_struct *task; - generic_fillattr(&nop_mnt_idmap, inode, stat); + generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat); stat->uid = GLOBAL_ROOT_UID; stat->gid = GLOBAL_ROOT_GID; @@ -2218,7 +2214,7 @@ static int map_files_get_link(struct dentry *dentry, struct path *path) rc = -ENOENT; vma = find_exact_vma(mm, vm_start, vm_end); if (vma && vma->vm_file) { - *path = vma->vm_file->f_path; + *path = *file_user_path(vma->vm_file); path_get(path); rc = 0; } @@ -2976,8 +2972,7 @@ static const struct file_operations proc_coredump_filter_operations = { #ifdef CONFIG_TASK_IO_ACCOUNTING static int do_io_accounting(struct task_struct *task, struct seq_file *m, int whole) { - struct task_io_accounting acct = task->ioac; - unsigned long flags; + struct task_io_accounting acct; int result; result = down_read_killable(&task->signal->exec_update_lock); @@ -2989,15 +2984,28 @@ static int do_io_accounting(struct task_struct *task, struct seq_file *m, int wh goto out_unlock; } - if (whole && lock_task_sighand(task, &flags)) { - struct task_struct *t = task; + if (whole) { + struct signal_struct *sig = task->signal; + struct task_struct *t; + unsigned int seq = 1; + unsigned long flags; + + rcu_read_lock(); + do { + seq++; /* 2 on the 1st/lockless path, otherwise odd */ + flags = read_seqbegin_or_lock_irqsave(&sig->stats_lock, &seq); - task_io_accounting_add(&acct, &task->signal->ioac); - while_each_thread(task, t) - task_io_accounting_add(&acct, &t->ioac); + acct = sig->ioac; + __for_each_thread(sig, t) + task_io_accounting_add(&acct, &t->ioac); - unlock_task_sighand(task, &flags); + } while (need_seqretry(&sig->stats_lock, seq)); + done_seqretry_irqrestore(&sig->stats_lock, seq, flags); + rcu_read_unlock(); + } else { + acct = task->ioac; } + seq_printf(m, "rchar: %llu\n" "wchar: %llu\n" @@ -3207,6 +3215,7 @@ static int proc_pid_ksm_stat(struct seq_file *m, struct pid_namespace *ns, mm = get_task_mm(task); if (mm) { seq_printf(m, "ksm_rmap_items %lu\n", mm->ksm_rmap_items); + seq_printf(m, "ksm_zero_pages %lu\n", mm->ksm_zero_pages); seq_printf(m, "ksm_merging_pages %lu\n", mm->ksm_merging_pages); seq_printf(m, "ksm_process_profit %ld\n", ksm_process_profit(mm)); mmput(mm); @@ -3583,7 +3592,8 @@ static int proc_tid_comm_permission(struct mnt_idmap *idmap, } static const struct inode_operations proc_tid_comm_inode_operations = { - .permission = proc_tid_comm_permission, + .setattr = proc_setattr, + .permission = proc_tid_comm_permission, }; /* @@ -3813,11 +3823,10 @@ static struct task_struct *first_tid(struct pid *pid, int tid, loff_t f_pos, /* If we haven't found our starting place yet start * with the leader and walk nr threads forward. */ - pos = task = task->group_leader; - do { + for_each_thread(task, pos) { if (!nr--) goto found; - } while_each_thread(task, pos); + } fail: pos = NULL; goto out; @@ -3839,10 +3848,8 @@ static struct task_struct *next_tid(struct task_struct *start) struct task_struct *pos = NULL; rcu_read_lock(); if (pid_alive(start)) { - pos = next_thread(start); - if (thread_group_leader(pos)) - pos = NULL; - else + pos = __next_thread(start); + if (pos) get_task_struct(pos); } rcu_read_unlock(); @@ -3899,7 +3906,7 @@ static int proc_task_getattr(struct mnt_idmap *idmap, { struct inode *inode = d_inode(path->dentry); struct task_struct *p = get_proc_task(inode); - generic_fillattr(&nop_mnt_idmap, inode, stat); + generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat); if (p) { stat->nlink += get_nr_threads(p); diff --git a/fs/proc/bootconfig.c b/fs/proc/bootconfig.c index 2e244ada1f97..902b326e1e56 100644 --- a/fs/proc/bootconfig.c +++ b/fs/proc/bootconfig.c @@ -62,6 +62,12 @@ static int __init copy_xbc_key_value_list(char *dst, size_t size) break; dst += ret; } + if (ret >= 0 && boot_command_line[0]) { + ret = snprintf(dst, rest(dst, end), "# Parameters from bootloader:\n# %s\n", + boot_command_line); + if (ret > 0) + dst += ret; + } } out: kfree(key); diff --git a/fs/proc/fd.c b/fs/proc/fd.c index b3140deebbbf..6e72e5ad42bc 100644 --- a/fs/proc/fd.c +++ b/fs/proc/fd.c @@ -113,10 +113,12 @@ static bool tid_fd_mode(struct task_struct *task, unsigned fd, fmode_t *mode) struct file *file; rcu_read_lock(); - file = task_lookup_fd_rcu(task, fd); - if (file) - *mode = file->f_mode; + file = task_lookup_fdget_rcu(task, fd); rcu_read_unlock(); + if (file) { + *mode = file->f_mode; + fput(file); + } return !!file; } @@ -259,12 +261,13 @@ static int proc_readfd_common(struct file *file, struct dir_context *ctx, char name[10 + 1]; unsigned int len; - f = task_lookup_next_fd_rcu(p, &fd); + f = task_lookup_next_fdget_rcu(p, &fd); ctx->pos = fd + 2LL; if (!f) break; data.mode = f->f_mode; rcu_read_unlock(); + fput(f); data.fd = fd; len = snprintf(name, sizeof(name), "%u", fd); @@ -352,7 +355,7 @@ static int proc_fd_getattr(struct mnt_idmap *idmap, struct inode *inode = d_inode(path->dentry); int rv = 0; - generic_fillattr(&nop_mnt_idmap, inode, stat); + generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat); /* If it's a directory, put the number of open fds there */ if (S_ISDIR(inode->i_mode)) { diff --git a/fs/proc/generic.c b/fs/proc/generic.c index 42ae38ff6e7e..775ce0bcf08c 100644 --- a/fs/proc/generic.c +++ b/fs/proc/generic.c @@ -146,7 +146,7 @@ static int proc_getattr(struct mnt_idmap *idmap, } } - generic_fillattr(&nop_mnt_idmap, inode, stat); + generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat); return 0; } diff --git a/fs/proc/inode.c b/fs/proc/inode.c index 67b09a1d9433..b33e490e3fd9 100644 --- a/fs/proc/inode.c +++ b/fs/proc/inode.c @@ -110,18 +110,15 @@ void __init proc_init_kmemcache(void) void proc_invalidate_siblings_dcache(struct hlist_head *inodes, spinlock_t *lock) { - struct inode *inode; - struct proc_inode *ei; struct hlist_node *node; struct super_block *old_sb = NULL; rcu_read_lock(); - for (;;) { + while ((node = hlist_first_rcu(inodes))) { + struct proc_inode *ei = hlist_entry(node, struct proc_inode, sibling_inodes); struct super_block *sb; - node = hlist_first_rcu(inodes); - if (!node) - break; - ei = hlist_entry(node, struct proc_inode, sibling_inodes); + struct inode *inode; + spin_lock(lock); hlist_del_init_rcu(&ei->sibling_inodes); spin_unlock(lock); @@ -660,7 +657,7 @@ struct inode *proc_get_inode(struct super_block *sb, struct proc_dir_entry *de) inode->i_private = de->data; inode->i_ino = de->low_ino; - inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode); + simple_inode_init_ts(inode); PROC_I(inode)->pde = de; if (is_empty_pde(de)) { make_empty_dir_inode(inode); diff --git a/fs/proc/internal.h b/fs/proc/internal.h index 9dda7e54b2d0..9a8f32f21ff5 100644 --- a/fs/proc/internal.h +++ b/fs/proc/internal.h @@ -289,9 +289,7 @@ struct proc_maps_private { struct inode *inode; struct task_struct *task; struct mm_struct *mm; -#ifdef CONFIG_MMU struct vma_iterator iter; -#endif #ifdef CONFIG_NUMA struct mempolicy *task_mempolicy; #endif diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c index 23fc24d16b31..6422e569b080 100644 --- a/fs/proc/kcore.c +++ b/fs/proc/kcore.c @@ -546,7 +546,8 @@ static ssize_t read_kcore_iter(struct kiocb *iocb, struct iov_iter *iter) * and explicitly excluded physical ranges. */ if (!page || PageOffline(page) || - is_page_hwpoison(page) || !pfn_is_ram(pfn)) { + is_page_hwpoison(page) || !pfn_is_ram(pfn) || + pfn_is_unaccepted_memory(pfn)) { if (iov_iter_zero(tsz, iter) != tsz) { ret = -EFAULT; goto out; diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c index 8dca4d6d96c7..45af9a989d40 100644 --- a/fs/proc/meminfo.c +++ b/fs/proc/meminfo.c @@ -17,6 +17,7 @@ #ifdef CONFIG_CMA #include <linux/cma.h> #endif +#include <linux/zswap.h> #include <asm/page.h> #include "internal.h" @@ -132,17 +133,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v) show_val_kb(m, "VmallocChunk: ", 0ul); show_val_kb(m, "Percpu: ", pcpu_nr_pages()); -#ifdef CONFIG_MEMTEST - if (early_memtest_done) { - unsigned long early_memtest_bad_size_kb; - - early_memtest_bad_size_kb = early_memtest_bad_size>>10; - if (early_memtest_bad_size && !early_memtest_bad_size_kb) - early_memtest_bad_size_kb = 1; - /* When 0 is reported, it means there actually was a successful test */ - seq_printf(m, "EarlyMemtestBad: %5lu kB\n", early_memtest_bad_size_kb); - } -#endif + memtest_report_meminfo(m); #ifdef CONFIG_MEMORY_FAILURE seq_printf(m, "HardwareCorrupted: %5lu kB\n", diff --git a/fs/proc/nommu.c b/fs/proc/nommu.c index 4d3493579458..c6e7ebc63756 100644 --- a/fs/proc/nommu.c +++ b/fs/proc/nommu.c @@ -58,7 +58,7 @@ static int nommu_region_show(struct seq_file *m, struct vm_region *region) if (file) { seq_pad(m, ' '); - seq_file_path(m, file, ""); + seq_path(m, file_user_path(file), ""); } seq_putc(m, '\n'); diff --git a/fs/proc/proc_net.c b/fs/proc/proc_net.c index a0c0419872e3..2ba31b6d68c0 100644 --- a/fs/proc/proc_net.c +++ b/fs/proc/proc_net.c @@ -308,7 +308,7 @@ static int proc_tgid_net_getattr(struct mnt_idmap *idmap, net = get_proc_task_net(inode); - generic_fillattr(&nop_mnt_idmap, inode, stat); + generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat); if (net != NULL) { stat->nlink = net->proc_net->nlink; @@ -321,6 +321,7 @@ static int proc_tgid_net_getattr(struct mnt_idmap *idmap, const struct inode_operations proc_net_inode_operations = { .lookup = proc_tgid_net_lookup, .getattr = proc_tgid_net_getattr, + .setattr = proc_setattr, }; static int proc_tgid_net_readdir(struct file *file, struct dir_context *ctx) diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index 5ea42653126e..8064ea76f80b 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -19,8 +19,9 @@ #include <linux/kmemleak.h> #include "internal.h" -#define list_for_each_table_entry(entry, table) \ - for ((entry) = (table); (entry)->procname; (entry)++) +#define list_for_each_table_entry(entry, header) \ + entry = header->ctl_table; \ + for (size_t i = 0 ; i < header->ctl_table_size && entry->procname; ++i, entry++) static const struct dentry_operations proc_sys_dentry_operations; static const struct file_operations proc_sys_file_operations; @@ -43,7 +44,7 @@ static struct ctl_table sysctl_mount_point[] = { */ struct ctl_table_header *register_sysctl_mount_point(const char *path) { - return register_sysctl(path, sysctl_mount_point); + return register_sysctl_sz(path, sysctl_mount_point, 0); } EXPORT_SYMBOL(register_sysctl_mount_point); @@ -188,9 +189,10 @@ static void erase_entry(struct ctl_table_header *head, struct ctl_table *entry) static void init_header(struct ctl_table_header *head, struct ctl_table_root *root, struct ctl_table_set *set, - struct ctl_node *node, struct ctl_table *table) + struct ctl_node *node, struct ctl_table *table, size_t table_size) { head->ctl_table = table; + head->ctl_table_size = table_size; head->ctl_table_arg = table; head->used = 0; head->count = 1; @@ -204,7 +206,7 @@ static void init_header(struct ctl_table_header *head, if (node) { struct ctl_table *entry; - list_for_each_table_entry(entry, table) { + list_for_each_table_entry(entry, head) { node->header = head; node++; } @@ -215,7 +217,7 @@ static void erase_header(struct ctl_table_header *head) { struct ctl_table *entry; - list_for_each_table_entry(entry, head->ctl_table) + list_for_each_table_entry(entry, head) erase_entry(head, entry); } @@ -242,7 +244,7 @@ static int insert_header(struct ctl_dir *dir, struct ctl_table_header *header) err = insert_links(header); if (err) goto fail_links; - list_for_each_table_entry(entry, header->ctl_table) { + list_for_each_table_entry(entry, header) { err = insert_entry(header, entry); if (err) goto fail; @@ -463,7 +465,7 @@ static struct inode *proc_sys_make_inode(struct super_block *sb, head->count++; spin_unlock(&sysctl_lock); - inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode); + simple_inode_init_ts(inode); inode->i_mode = table->mode; if (!S_ISDIR(table->mode)) { inode->i_mode |= S_IFREG; @@ -849,7 +851,7 @@ static int proc_sys_getattr(struct mnt_idmap *idmap, if (IS_ERR(head)) return PTR_ERR(head); - generic_fillattr(&nop_mnt_idmap, inode, stat); + generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat); if (table) stat->mode = (stat->mode & S_IFMT) | table->mode; @@ -973,7 +975,7 @@ static struct ctl_dir *new_dir(struct ctl_table_set *set, memcpy(new_name, name, namelen); table[0].procname = new_name; table[0].mode = S_IFDIR|S_IRUGO|S_IXUGO; - init_header(&new->header, set->dir.header.root, set, node, table); + init_header(&new->header, set->dir.header.root, set, node, table, 1); return new; } @@ -1125,11 +1127,11 @@ static int sysctl_check_table_array(const char *path, struct ctl_table *table) return err; } -static int sysctl_check_table(const char *path, struct ctl_table *table) +static int sysctl_check_table(const char *path, struct ctl_table_header *header) { struct ctl_table *entry; int err = 0; - list_for_each_table_entry(entry, table) { + list_for_each_table_entry(entry, header) { if ((entry->proc_handler == proc_dostring) || (entry->proc_handler == proc_dobool) || (entry->proc_handler == proc_dointvec) || @@ -1159,8 +1161,7 @@ static int sysctl_check_table(const char *path, struct ctl_table *table) return err; } -static struct ctl_table_header *new_links(struct ctl_dir *dir, struct ctl_table *table, - struct ctl_table_root *link_root) +static struct ctl_table_header *new_links(struct ctl_dir *dir, struct ctl_table_header *head) { struct ctl_table *link_table, *entry, *link; struct ctl_table_header *links; @@ -1170,7 +1171,7 @@ static struct ctl_table_header *new_links(struct ctl_dir *dir, struct ctl_table name_bytes = 0; nr_entries = 0; - list_for_each_table_entry(entry, table) { + list_for_each_table_entry(entry, head) { nr_entries++; name_bytes += strlen(entry->procname) + 1; } @@ -1189,31 +1190,33 @@ static struct ctl_table_header *new_links(struct ctl_dir *dir, struct ctl_table link_name = (char *)&link_table[nr_entries + 1]; link = link_table; - list_for_each_table_entry(entry, table) { + list_for_each_table_entry(entry, head) { int len = strlen(entry->procname) + 1; memcpy(link_name, entry->procname, len); link->procname = link_name; link->mode = S_IFLNK|S_IRWXUGO; - link->data = link_root; + link->data = head->root; link_name += len; link++; } - init_header(links, dir->header.root, dir->header.set, node, link_table); + init_header(links, dir->header.root, dir->header.set, node, link_table, + head->ctl_table_size); links->nreg = nr_entries; return links; } static bool get_links(struct ctl_dir *dir, - struct ctl_table *table, struct ctl_table_root *link_root) + struct ctl_table_header *header, + struct ctl_table_root *link_root) { - struct ctl_table_header *head; + struct ctl_table_header *tmp_head; struct ctl_table *entry, *link; /* Are there links available for every entry in table? */ - list_for_each_table_entry(entry, table) { + list_for_each_table_entry(entry, header) { const char *procname = entry->procname; - link = find_entry(&head, dir, procname, strlen(procname)); + link = find_entry(&tmp_head, dir, procname, strlen(procname)); if (!link) return false; if (S_ISDIR(link->mode) && S_ISDIR(entry->mode)) @@ -1224,10 +1227,10 @@ static bool get_links(struct ctl_dir *dir, } /* The checks passed. Increase the registration count on the links */ - list_for_each_table_entry(entry, table) { + list_for_each_table_entry(entry, header) { const char *procname = entry->procname; - link = find_entry(&head, dir, procname, strlen(procname)); - head->nreg++; + link = find_entry(&tmp_head, dir, procname, strlen(procname)); + tmp_head->nreg++; } return true; } @@ -1246,13 +1249,13 @@ static int insert_links(struct ctl_table_header *head) if (IS_ERR(core_parent)) return 0; - if (get_links(core_parent, head->ctl_table, head->root)) + if (get_links(core_parent, head, head->root)) return 0; core_parent->header.nreg++; spin_unlock(&sysctl_lock); - links = new_links(core_parent, head->ctl_table, head->root); + links = new_links(core_parent, head); spin_lock(&sysctl_lock); err = -ENOMEM; @@ -1260,7 +1263,7 @@ static int insert_links(struct ctl_table_header *head) goto out; err = 0; - if (get_links(core_parent, head->ctl_table, head->root)) { + if (get_links(core_parent, head, head->root)) { kfree(links); goto out; } @@ -1310,6 +1313,7 @@ static struct ctl_dir *sysctl_mkdir_p(struct ctl_dir *dir, const char *path) * should not be free'd after registration. So it should not be * used on stack. It can either be a global or dynamically allocated * by the caller and free'd later after sysctl unregistration. + * @table_size : The number of elements in table * * Register a sysctl table hierarchy. @table should be a filled in ctl_table * array. A completely 0 filled entry terminates the table. @@ -1352,26 +1356,21 @@ static struct ctl_dir *sysctl_mkdir_p(struct ctl_dir *dir, const char *path) */ struct ctl_table_header *__register_sysctl_table( struct ctl_table_set *set, - const char *path, struct ctl_table *table) + const char *path, struct ctl_table *table, size_t table_size) { struct ctl_table_root *root = set->dir.header.root; struct ctl_table_header *header; struct ctl_dir *dir; - struct ctl_table *entry; struct ctl_node *node; - int nr_entries = 0; - - list_for_each_table_entry(entry, table) - nr_entries++; header = kzalloc(sizeof(struct ctl_table_header) + - sizeof(struct ctl_node)*nr_entries, GFP_KERNEL_ACCOUNT); + sizeof(struct ctl_node)*table_size, GFP_KERNEL_ACCOUNT); if (!header) return NULL; node = (struct ctl_node *)(header + 1); - init_header(header, root, set, node, table); - if (sysctl_check_table(path, table)) + init_header(header, root, set, node, table, table_size); + if (sysctl_check_table(path, header)) goto fail; spin_lock(&sysctl_lock); @@ -1401,7 +1400,7 @@ fail: } /** - * register_sysctl - register a sysctl table + * register_sysctl_sz - register a sysctl table * @path: The path to the directory the sysctl table is in. If the path * doesn't exist we will create it for you. * @table: the table structure. The calller must ensure the life of the @table @@ -1411,18 +1410,20 @@ fail: * to call unregister_sysctl_table() and can instead use something like * register_sysctl_init() which does not care for the result of the syctl * registration. + * @table_size: The number of elements in table. * * Register a sysctl table. @table should be a filled in ctl_table * array. A completely 0 filled entry terminates the table. * * See __register_sysctl_table for more details. */ -struct ctl_table_header *register_sysctl(const char *path, struct ctl_table *table) +struct ctl_table_header *register_sysctl_sz(const char *path, struct ctl_table *table, + size_t table_size) { return __register_sysctl_table(&sysctl_table_root.default_set, - path, table); + path, table, table_size); } -EXPORT_SYMBOL(register_sysctl); +EXPORT_SYMBOL(register_sysctl_sz); /** * __register_sysctl_init() - register sysctl table to path @@ -1433,6 +1434,7 @@ EXPORT_SYMBOL(register_sysctl); * lifetime use of the sysctl. * @table_name: The name of sysctl table, only used for log printing when * registration fails + * @table_size: The number of elements in table * * The sysctl interface is used by userspace to query or modify at runtime * a predefined value set on a variable. These variables however have default @@ -1445,12 +1447,12 @@ EXPORT_SYMBOL(register_sysctl); * Context: if your base directory does not exist it will be created for you. */ void __init __register_sysctl_init(const char *path, struct ctl_table *table, - const char *table_name) + const char *table_name, size_t table_size) { - struct ctl_table_header *hdr = register_sysctl(path, table); + struct ctl_table_header *hdr = register_sysctl_sz(path, table, table_size); if (unlikely(!hdr)) { - pr_err("failed when register_sysctl %s to %s\n", table_name, path); + pr_err("failed when register_sysctl_sz %s to %s\n", table_name, path); return; } kmemleak_not_leak(hdr); @@ -1471,7 +1473,7 @@ static void put_links(struct ctl_table_header *header) if (IS_ERR(core_parent)) return; - list_for_each_table_entry(entry, header->ctl_table) { + list_for_each_table_entry(entry, header) { struct ctl_table_header *link_head; struct ctl_table *link; const char *name = entry->procname; @@ -1535,7 +1537,7 @@ void setup_sysctl_set(struct ctl_table_set *set, { memset(set, 0, sizeof(*set)); set->is_seen = is_seen; - init_header(&set->dir.header, root, set, NULL, root_table); + init_header(&set->dir.header, root, set, NULL, root_table, 1); } void retire_sysctl_set(struct ctl_table_set *set) @@ -1574,7 +1576,6 @@ static const struct sysctl_alias sysctl_aliases[] = { {"hung_task_panic", "kernel.hung_task_panic" }, {"numa_zonelist_order", "vm.numa_zonelist_order" }, {"softlockup_all_cpu_backtrace", "kernel.softlockup_all_cpu_backtrace" }, - {"softlockup_panic", "kernel.softlockup_panic" }, { } }; @@ -1590,6 +1591,13 @@ static const char *sysctl_find_alias(char *param) return NULL; } +bool sysctl_is_alias(char *param) +{ + const char *alias = sysctl_find_alias(param); + + return alias != NULL; +} + /* Set sysctl value passed on kernel command line. */ static int process_sysctl_arg(char *param, char *val, const char *unused, void *arg) diff --git a/fs/proc/root.c b/fs/proc/root.c index a86e65a608da..b55dbc70287b 100644 --- a/fs/proc/root.c +++ b/fs/proc/root.c @@ -188,7 +188,7 @@ static int proc_fill_super(struct super_block *s, struct fs_context *fc) s->s_stack_depth = FILESYSTEM_MAX_STACK_DEPTH; /* procfs dentries and inodes don't require IO to create */ - s->s_shrink.seeks = 0; + s->s_shrink->seeks = 0; pde_get(&proc_root); root_inode = proc_get_inode(s, &proc_root); @@ -314,7 +314,8 @@ static int proc_root_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags) { - generic_fillattr(&nop_mnt_idmap, d_inode(path->dentry), stat); + generic_fillattr(&nop_mnt_idmap, request_mask, d_inode(path->dentry), + stat); stat->nlink = proc_root.nlink + nr_processes(); return 0; } diff --git a/fs/proc/self.c b/fs/proc/self.c index 72cd69bcaf4a..b46fbfd22681 100644 --- a/fs/proc/self.c +++ b/fs/proc/self.c @@ -46,7 +46,7 @@ int proc_setup_self(struct super_block *s) struct inode *inode = new_inode(s); if (inode) { inode->i_ino = self_inum; - inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode); + simple_inode_init_ts(inode); inode->i_mode = S_IFLNK | S_IRWXUGO; inode->i_uid = GLOBAL_ROOT_UID; inode->i_gid = GLOBAL_ROOT_GID; diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index fafff1bd34cd..435b61054b5b 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -4,6 +4,7 @@ #include <linux/hugetlb.h> #include <linux/huge_mm.h> #include <linux/mount.h> +#include <linux/ksm.h> #include <linux/seq_file.h> #include <linux/highmem.h> #include <linux/ptrace.h> @@ -19,6 +20,8 @@ #include <linux/shmem_fs.h> #include <linux/uaccess.h> #include <linux/pkeys.h> +#include <linux/minmax.h> +#include <linux/overflow.h> #include <asm/elf.h> #include <asm/tlb.h> @@ -236,21 +239,6 @@ static int do_maps_open(struct inode *inode, struct file *file, sizeof(struct proc_maps_private)); } -/* - * Indicate if the VMA is a stack for the given task; for - * /proc/PID/maps that is the stack of the main task. - */ -static int is_stack(struct vm_area_struct *vma) -{ - /* - * We make no effort to guess what a given thread considers to be - * its "stack". It's not even well-defined for programs written - * languages like Go. - */ - return vma->vm_start <= vma->vm_mm->start_stack && - vma->vm_end >= vma->vm_mm->start_stack; -} - static void show_vma_header_prefix(struct seq_file *m, unsigned long start, unsigned long end, vm_flags_t flags, unsigned long long pgoff, @@ -310,7 +298,7 @@ show_map_vma(struct seq_file *m, struct vm_area_struct *vma) if (anon_name) seq_printf(m, "[anon_shmem:%s]", anon_name->name); else - seq_file_path(m, file, "\n"); + seq_path(m, file_user_path(file), "\n"); goto done; } @@ -327,13 +315,12 @@ show_map_vma(struct seq_file *m, struct vm_area_struct *vma) goto done; } - if (vma->vm_start <= mm->brk && - vma->vm_end >= mm->start_brk) { + if (vma_is_initial_heap(vma)) { name = "[heap]"; goto done; } - if (is_stack(vma)) { + if (vma_is_initial_stack(vma)) { name = "[stack]"; goto done; } @@ -412,6 +399,7 @@ struct mem_size_stats { unsigned long swap; unsigned long shared_hugetlb; unsigned long private_hugetlb; + unsigned long ksm; u64 pss; u64 pss_anon; u64 pss_file; @@ -468,6 +456,9 @@ static void smaps_account(struct mem_size_stats *mss, struct page *page, mss->lazyfree += size; } + if (PageKsm(page)) + mss->ksm += size; + mss->resident += size; /* Accumulate the size in pages that have been accessed. */ if (young || page_is_young(page) || PageReferenced(page)) @@ -708,6 +699,9 @@ static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma) #ifdef CONFIG_HAVE_ARCH_USERFAULTFD_MINOR [ilog2(VM_UFFD_MINOR)] = "ui", #endif /* CONFIG_HAVE_ARCH_USERFAULTFD_MINOR */ +#ifdef CONFIG_X86_USER_SHADOW_STACK + [ilog2(VM_SHADOW_STACK)] = "ss", +#endif }; size_t i; @@ -838,6 +832,7 @@ static void __show_smap(struct seq_file *m, const struct mem_size_stats *mss, SEQ_PUT_DEC(" kB\nPrivate_Dirty: ", mss->private_dirty); SEQ_PUT_DEC(" kB\nReferenced: ", mss->referenced); SEQ_PUT_DEC(" kB\nAnonymous: ", mss->anonymous); + SEQ_PUT_DEC(" kB\nKSM: ", mss->ksm); SEQ_PUT_DEC(" kB\nLazyFree: ", mss->lazyfree); SEQ_PUT_DEC(" kB\nAnonHugePages: ", mss->anonymous_thp); SEQ_PUT_DEC(" kB\nShmemPmdMapped: ", mss->shmem_thp); @@ -856,9 +851,7 @@ static void __show_smap(struct seq_file *m, const struct mem_size_stats *mss, static int show_smap(struct seq_file *m, void *v) { struct vm_area_struct *vma = v; - struct mem_size_stats mss; - - memset(&mss, 0, sizeof(mss)); + struct mem_size_stats mss = {}; smap_gather_stats(vma, &mss, 0); @@ -871,7 +864,7 @@ static int show_smap(struct seq_file *m, void *v) __show_smap(m, &mss, false); - seq_printf(m, "THPeligible: %d\n", + seq_printf(m, "THPeligible: %8u\n", hugepage_vma_check(vma, vma->vm_flags, true, false, true)); if (arch_pkeys_enabled()) @@ -884,7 +877,7 @@ static int show_smap(struct seq_file *m, void *v) static int show_smaps_rollup(struct seq_file *m, void *v) { struct proc_maps_private *priv = m->private; - struct mem_size_stats mss; + struct mem_size_stats mss = {}; struct mm_struct *mm = priv->mm; struct vm_area_struct *vma; unsigned long vma_start = 0, last_vma_end = 0; @@ -900,8 +893,6 @@ static int show_smaps_rollup(struct seq_file *m, void *v) goto out_put_task; } - memset(&mss, 0, sizeof(mss)); - ret = mmap_read_lock_killable(mm); if (ret) goto out_put_mm; @@ -1253,14 +1244,13 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { struct task_struct *task; - char buffer[PROC_NUMBUF]; + char buffer[PROC_NUMBUF] = {}; struct mm_struct *mm; struct vm_area_struct *vma; enum clear_refs_types type; int itype; int rv; - memset(buffer, 0, sizeof(buffer)); if (count > sizeof(buffer) - 1) count = sizeof(buffer) - 1; if (copy_from_user(buffer, buf, count)) @@ -1768,11 +1758,753 @@ static int pagemap_release(struct inode *inode, struct file *file) return 0; } +#define PM_SCAN_CATEGORIES (PAGE_IS_WPALLOWED | PAGE_IS_WRITTEN | \ + PAGE_IS_FILE | PAGE_IS_PRESENT | \ + PAGE_IS_SWAPPED | PAGE_IS_PFNZERO | \ + PAGE_IS_HUGE) +#define PM_SCAN_FLAGS (PM_SCAN_WP_MATCHING | PM_SCAN_CHECK_WPASYNC) + +struct pagemap_scan_private { + struct pm_scan_arg arg; + unsigned long masks_of_interest, cur_vma_category; + struct page_region *vec_buf; + unsigned long vec_buf_len, vec_buf_index, found_pages; + struct page_region __user *vec_out; +}; + +static unsigned long pagemap_page_category(struct pagemap_scan_private *p, + struct vm_area_struct *vma, + unsigned long addr, pte_t pte) +{ + unsigned long categories = 0; + + if (pte_present(pte)) { + struct page *page; + + categories |= PAGE_IS_PRESENT; + if (!pte_uffd_wp(pte)) + categories |= PAGE_IS_WRITTEN; + + if (p->masks_of_interest & PAGE_IS_FILE) { + page = vm_normal_page(vma, addr, pte); + if (page && !PageAnon(page)) + categories |= PAGE_IS_FILE; + } + + if (is_zero_pfn(pte_pfn(pte))) + categories |= PAGE_IS_PFNZERO; + } else if (is_swap_pte(pte)) { + swp_entry_t swp; + + categories |= PAGE_IS_SWAPPED; + if (!pte_swp_uffd_wp_any(pte)) + categories |= PAGE_IS_WRITTEN; + + if (p->masks_of_interest & PAGE_IS_FILE) { + swp = pte_to_swp_entry(pte); + if (is_pfn_swap_entry(swp) && + !PageAnon(pfn_swap_entry_to_page(swp))) + categories |= PAGE_IS_FILE; + } + } + + return categories; +} + +static void make_uffd_wp_pte(struct vm_area_struct *vma, + unsigned long addr, pte_t *pte) +{ + pte_t ptent = ptep_get(pte); + + if (pte_present(ptent)) { + pte_t old_pte; + + old_pte = ptep_modify_prot_start(vma, addr, pte); + ptent = pte_mkuffd_wp(ptent); + ptep_modify_prot_commit(vma, addr, pte, old_pte, ptent); + } else if (is_swap_pte(ptent)) { + ptent = pte_swp_mkuffd_wp(ptent); + set_pte_at(vma->vm_mm, addr, pte, ptent); + } else { + set_pte_at(vma->vm_mm, addr, pte, + make_pte_marker(PTE_MARKER_UFFD_WP)); + } +} + +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +static unsigned long pagemap_thp_category(struct pagemap_scan_private *p, + struct vm_area_struct *vma, + unsigned long addr, pmd_t pmd) +{ + unsigned long categories = PAGE_IS_HUGE; + + if (pmd_present(pmd)) { + struct page *page; + + categories |= PAGE_IS_PRESENT; + if (!pmd_uffd_wp(pmd)) + categories |= PAGE_IS_WRITTEN; + + if (p->masks_of_interest & PAGE_IS_FILE) { + page = vm_normal_page_pmd(vma, addr, pmd); + if (page && !PageAnon(page)) + categories |= PAGE_IS_FILE; + } + + if (is_zero_pfn(pmd_pfn(pmd))) + categories |= PAGE_IS_PFNZERO; + } else if (is_swap_pmd(pmd)) { + swp_entry_t swp; + + categories |= PAGE_IS_SWAPPED; + if (!pmd_swp_uffd_wp(pmd)) + categories |= PAGE_IS_WRITTEN; + + if (p->masks_of_interest & PAGE_IS_FILE) { + swp = pmd_to_swp_entry(pmd); + if (is_pfn_swap_entry(swp) && + !PageAnon(pfn_swap_entry_to_page(swp))) + categories |= PAGE_IS_FILE; + } + } + + return categories; +} + +static void make_uffd_wp_pmd(struct vm_area_struct *vma, + unsigned long addr, pmd_t *pmdp) +{ + pmd_t old, pmd = *pmdp; + + if (pmd_present(pmd)) { + old = pmdp_invalidate_ad(vma, addr, pmdp); + pmd = pmd_mkuffd_wp(old); + set_pmd_at(vma->vm_mm, addr, pmdp, pmd); + } else if (is_migration_entry(pmd_to_swp_entry(pmd))) { + pmd = pmd_swp_mkuffd_wp(pmd); + set_pmd_at(vma->vm_mm, addr, pmdp, pmd); + } +} +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ + +#ifdef CONFIG_HUGETLB_PAGE +static unsigned long pagemap_hugetlb_category(pte_t pte) +{ + unsigned long categories = PAGE_IS_HUGE; + + /* + * According to pagemap_hugetlb_range(), file-backed HugeTLB + * page cannot be swapped. So PAGE_IS_FILE is not checked for + * swapped pages. + */ + if (pte_present(pte)) { + categories |= PAGE_IS_PRESENT; + if (!huge_pte_uffd_wp(pte)) + categories |= PAGE_IS_WRITTEN; + if (!PageAnon(pte_page(pte))) + categories |= PAGE_IS_FILE; + if (is_zero_pfn(pte_pfn(pte))) + categories |= PAGE_IS_PFNZERO; + } else if (is_swap_pte(pte)) { + categories |= PAGE_IS_SWAPPED; + if (!pte_swp_uffd_wp_any(pte)) + categories |= PAGE_IS_WRITTEN; + } + + return categories; +} + +static void make_uffd_wp_huge_pte(struct vm_area_struct *vma, + unsigned long addr, pte_t *ptep, + pte_t ptent) +{ + unsigned long psize; + + if (is_hugetlb_entry_hwpoisoned(ptent) || is_pte_marker(ptent)) + return; + + psize = huge_page_size(hstate_vma(vma)); + + if (is_hugetlb_entry_migration(ptent)) + set_huge_pte_at(vma->vm_mm, addr, ptep, + pte_swp_mkuffd_wp(ptent), psize); + else if (!huge_pte_none(ptent)) + huge_ptep_modify_prot_commit(vma, addr, ptep, ptent, + huge_pte_mkuffd_wp(ptent)); + else + set_huge_pte_at(vma->vm_mm, addr, ptep, + make_pte_marker(PTE_MARKER_UFFD_WP), psize); +} +#endif /* CONFIG_HUGETLB_PAGE */ + +#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HUGETLB_PAGE) +static void pagemap_scan_backout_range(struct pagemap_scan_private *p, + unsigned long addr, unsigned long end) +{ + struct page_region *cur_buf = &p->vec_buf[p->vec_buf_index]; + + if (cur_buf->start != addr) + cur_buf->end = addr; + else + cur_buf->start = cur_buf->end = 0; + + p->found_pages -= (end - addr) / PAGE_SIZE; +} +#endif + +static bool pagemap_scan_is_interesting_page(unsigned long categories, + const struct pagemap_scan_private *p) +{ + categories ^= p->arg.category_inverted; + if ((categories & p->arg.category_mask) != p->arg.category_mask) + return false; + if (p->arg.category_anyof_mask && !(categories & p->arg.category_anyof_mask)) + return false; + + return true; +} + +static bool pagemap_scan_is_interesting_vma(unsigned long categories, + const struct pagemap_scan_private *p) +{ + unsigned long required = p->arg.category_mask & PAGE_IS_WPALLOWED; + + categories ^= p->arg.category_inverted; + if ((categories & required) != required) + return false; + + return true; +} + +static int pagemap_scan_test_walk(unsigned long start, unsigned long end, + struct mm_walk *walk) +{ + struct pagemap_scan_private *p = walk->private; + struct vm_area_struct *vma = walk->vma; + unsigned long vma_category = 0; + bool wp_allowed = userfaultfd_wp_async(vma) && + userfaultfd_wp_use_markers(vma); + + if (!wp_allowed) { + /* User requested explicit failure over wp-async capability */ + if (p->arg.flags & PM_SCAN_CHECK_WPASYNC) + return -EPERM; + /* + * User requires wr-protect, and allows silently skipping + * unsupported vmas. + */ + if (p->arg.flags & PM_SCAN_WP_MATCHING) + return 1; + /* + * Then the request doesn't involve wr-protects at all, + * fall through to the rest checks, and allow vma walk. + */ + } + + if (vma->vm_flags & VM_PFNMAP) + return 1; + + if (wp_allowed) + vma_category |= PAGE_IS_WPALLOWED; + + if (!pagemap_scan_is_interesting_vma(vma_category, p)) + return 1; + + p->cur_vma_category = vma_category; + + return 0; +} + +static bool pagemap_scan_push_range(unsigned long categories, + struct pagemap_scan_private *p, + unsigned long addr, unsigned long end) +{ + struct page_region *cur_buf = &p->vec_buf[p->vec_buf_index]; + + /* + * When there is no output buffer provided at all, the sentinel values + * won't match here. There is no other way for `cur_buf->end` to be + * non-zero other than it being non-empty. + */ + if (addr == cur_buf->end && categories == cur_buf->categories) { + cur_buf->end = end; + return true; + } + + if (cur_buf->end) { + if (p->vec_buf_index >= p->vec_buf_len - 1) + return false; + + cur_buf = &p->vec_buf[++p->vec_buf_index]; + } + + cur_buf->start = addr; + cur_buf->end = end; + cur_buf->categories = categories; + + return true; +} + +static int pagemap_scan_output(unsigned long categories, + struct pagemap_scan_private *p, + unsigned long addr, unsigned long *end) +{ + unsigned long n_pages, total_pages; + int ret = 0; + + if (!p->vec_buf) + return 0; + + categories &= p->arg.return_mask; + + n_pages = (*end - addr) / PAGE_SIZE; + if (check_add_overflow(p->found_pages, n_pages, &total_pages) || + total_pages > p->arg.max_pages) { + size_t n_too_much = total_pages - p->arg.max_pages; + *end -= n_too_much * PAGE_SIZE; + n_pages -= n_too_much; + ret = -ENOSPC; + } + + if (!pagemap_scan_push_range(categories, p, addr, *end)) { + *end = addr; + n_pages = 0; + ret = -ENOSPC; + } + + p->found_pages += n_pages; + if (ret) + p->arg.walk_end = *end; + + return ret; +} + +static int pagemap_scan_thp_entry(pmd_t *pmd, unsigned long start, + unsigned long end, struct mm_walk *walk) +{ +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + struct pagemap_scan_private *p = walk->private; + struct vm_area_struct *vma = walk->vma; + unsigned long categories; + spinlock_t *ptl; + int ret = 0; + + ptl = pmd_trans_huge_lock(pmd, vma); + if (!ptl) + return -ENOENT; + + categories = p->cur_vma_category | + pagemap_thp_category(p, vma, start, *pmd); + + if (!pagemap_scan_is_interesting_page(categories, p)) + goto out_unlock; + + ret = pagemap_scan_output(categories, p, start, &end); + if (start == end) + goto out_unlock; + + if (~p->arg.flags & PM_SCAN_WP_MATCHING) + goto out_unlock; + if (~categories & PAGE_IS_WRITTEN) + goto out_unlock; + + /* + * Break huge page into small pages if the WP operation + * needs to be performed on a portion of the huge page. + */ + if (end != start + HPAGE_SIZE) { + spin_unlock(ptl); + split_huge_pmd(vma, pmd, start); + pagemap_scan_backout_range(p, start, end); + /* Report as if there was no THP */ + return -ENOENT; + } + + make_uffd_wp_pmd(vma, start, pmd); + flush_tlb_range(vma, start, end); +out_unlock: + spin_unlock(ptl); + return ret; +#else /* !CONFIG_TRANSPARENT_HUGEPAGE */ + return -ENOENT; +#endif +} + +static int pagemap_scan_pmd_entry(pmd_t *pmd, unsigned long start, + unsigned long end, struct mm_walk *walk) +{ + struct pagemap_scan_private *p = walk->private; + struct vm_area_struct *vma = walk->vma; + unsigned long addr, flush_end = 0; + pte_t *pte, *start_pte; + spinlock_t *ptl; + int ret; + + arch_enter_lazy_mmu_mode(); + + ret = pagemap_scan_thp_entry(pmd, start, end, walk); + if (ret != -ENOENT) { + arch_leave_lazy_mmu_mode(); + return ret; + } + + ret = 0; + start_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, start, &ptl); + if (!pte) { + arch_leave_lazy_mmu_mode(); + walk->action = ACTION_AGAIN; + return 0; + } + + if ((p->arg.flags & PM_SCAN_WP_MATCHING) && !p->vec_out) { + /* Fast path for performing exclusive WP */ + for (addr = start; addr != end; pte++, addr += PAGE_SIZE) { + if (pte_uffd_wp(ptep_get(pte))) + continue; + make_uffd_wp_pte(vma, addr, pte); + if (!flush_end) + start = addr; + flush_end = addr + PAGE_SIZE; + } + goto flush_and_return; + } + + if (!p->arg.category_anyof_mask && !p->arg.category_inverted && + p->arg.category_mask == PAGE_IS_WRITTEN && + p->arg.return_mask == PAGE_IS_WRITTEN) { + for (addr = start; addr < end; pte++, addr += PAGE_SIZE) { + unsigned long next = addr + PAGE_SIZE; + + if (pte_uffd_wp(ptep_get(pte))) + continue; + ret = pagemap_scan_output(p->cur_vma_category | PAGE_IS_WRITTEN, + p, addr, &next); + if (next == addr) + break; + if (~p->arg.flags & PM_SCAN_WP_MATCHING) + continue; + make_uffd_wp_pte(vma, addr, pte); + if (!flush_end) + start = addr; + flush_end = next; + } + goto flush_and_return; + } + + for (addr = start; addr != end; pte++, addr += PAGE_SIZE) { + unsigned long categories = p->cur_vma_category | + pagemap_page_category(p, vma, addr, ptep_get(pte)); + unsigned long next = addr + PAGE_SIZE; + + if (!pagemap_scan_is_interesting_page(categories, p)) + continue; + + ret = pagemap_scan_output(categories, p, addr, &next); + if (next == addr) + break; + + if (~p->arg.flags & PM_SCAN_WP_MATCHING) + continue; + if (~categories & PAGE_IS_WRITTEN) + continue; + + make_uffd_wp_pte(vma, addr, pte); + if (!flush_end) + start = addr; + flush_end = next; + } + +flush_and_return: + if (flush_end) + flush_tlb_range(vma, start, addr); + + pte_unmap_unlock(start_pte, ptl); + arch_leave_lazy_mmu_mode(); + + cond_resched(); + return ret; +} + +#ifdef CONFIG_HUGETLB_PAGE +static int pagemap_scan_hugetlb_entry(pte_t *ptep, unsigned long hmask, + unsigned long start, unsigned long end, + struct mm_walk *walk) +{ + struct pagemap_scan_private *p = walk->private; + struct vm_area_struct *vma = walk->vma; + unsigned long categories; + spinlock_t *ptl; + int ret = 0; + pte_t pte; + + if (~p->arg.flags & PM_SCAN_WP_MATCHING) { + /* Go the short route when not write-protecting pages. */ + + pte = huge_ptep_get(ptep); + categories = p->cur_vma_category | pagemap_hugetlb_category(pte); + + if (!pagemap_scan_is_interesting_page(categories, p)) + return 0; + + return pagemap_scan_output(categories, p, start, &end); + } + + i_mmap_lock_write(vma->vm_file->f_mapping); + ptl = huge_pte_lock(hstate_vma(vma), vma->vm_mm, ptep); + + pte = huge_ptep_get(ptep); + categories = p->cur_vma_category | pagemap_hugetlb_category(pte); + + if (!pagemap_scan_is_interesting_page(categories, p)) + goto out_unlock; + + ret = pagemap_scan_output(categories, p, start, &end); + if (start == end) + goto out_unlock; + + if (~categories & PAGE_IS_WRITTEN) + goto out_unlock; + + if (end != start + HPAGE_SIZE) { + /* Partial HugeTLB page WP isn't possible. */ + pagemap_scan_backout_range(p, start, end); + p->arg.walk_end = start; + ret = 0; + goto out_unlock; + } + + make_uffd_wp_huge_pte(vma, start, ptep, pte); + flush_hugetlb_tlb_range(vma, start, end); + +out_unlock: + spin_unlock(ptl); + i_mmap_unlock_write(vma->vm_file->f_mapping); + + return ret; +} +#else +#define pagemap_scan_hugetlb_entry NULL +#endif + +static int pagemap_scan_pte_hole(unsigned long addr, unsigned long end, + int depth, struct mm_walk *walk) +{ + struct pagemap_scan_private *p = walk->private; + struct vm_area_struct *vma = walk->vma; + int ret, err; + + if (!vma || !pagemap_scan_is_interesting_page(p->cur_vma_category, p)) + return 0; + + ret = pagemap_scan_output(p->cur_vma_category, p, addr, &end); + if (addr == end) + return ret; + + if (~p->arg.flags & PM_SCAN_WP_MATCHING) + return ret; + + err = uffd_wp_range(vma, addr, end - addr, true); + if (err < 0) + ret = err; + + return ret; +} + +static const struct mm_walk_ops pagemap_scan_ops = { + .test_walk = pagemap_scan_test_walk, + .pmd_entry = pagemap_scan_pmd_entry, + .pte_hole = pagemap_scan_pte_hole, + .hugetlb_entry = pagemap_scan_hugetlb_entry, +}; + +static int pagemap_scan_get_args(struct pm_scan_arg *arg, + unsigned long uarg) +{ + if (copy_from_user(arg, (void __user *)uarg, sizeof(*arg))) + return -EFAULT; + + if (arg->size != sizeof(struct pm_scan_arg)) + return -EINVAL; + + /* Validate requested features */ + if (arg->flags & ~PM_SCAN_FLAGS) + return -EINVAL; + if ((arg->category_inverted | arg->category_mask | + arg->category_anyof_mask | arg->return_mask) & ~PM_SCAN_CATEGORIES) + return -EINVAL; + + arg->start = untagged_addr((unsigned long)arg->start); + arg->end = untagged_addr((unsigned long)arg->end); + arg->vec = untagged_addr((unsigned long)arg->vec); + + /* Validate memory pointers */ + if (!IS_ALIGNED(arg->start, PAGE_SIZE)) + return -EINVAL; + if (!access_ok((void __user *)(long)arg->start, arg->end - arg->start)) + return -EFAULT; + if (!arg->vec && arg->vec_len) + return -EINVAL; + if (arg->vec && !access_ok((void __user *)(long)arg->vec, + arg->vec_len * sizeof(struct page_region))) + return -EFAULT; + + /* Fixup default values */ + arg->end = ALIGN(arg->end, PAGE_SIZE); + arg->walk_end = 0; + if (!arg->max_pages) + arg->max_pages = ULONG_MAX; + + return 0; +} + +static int pagemap_scan_writeback_args(struct pm_scan_arg *arg, + unsigned long uargl) +{ + struct pm_scan_arg __user *uarg = (void __user *)uargl; + + if (copy_to_user(&uarg->walk_end, &arg->walk_end, sizeof(arg->walk_end))) + return -EFAULT; + + return 0; +} + +static int pagemap_scan_init_bounce_buffer(struct pagemap_scan_private *p) +{ + if (!p->arg.vec_len) + return 0; + + p->vec_buf_len = min_t(size_t, PAGEMAP_WALK_SIZE >> PAGE_SHIFT, + p->arg.vec_len); + p->vec_buf = kmalloc_array(p->vec_buf_len, sizeof(*p->vec_buf), + GFP_KERNEL); + if (!p->vec_buf) + return -ENOMEM; + + p->vec_buf->start = p->vec_buf->end = 0; + p->vec_out = (struct page_region __user *)(long)p->arg.vec; + + return 0; +} + +static long pagemap_scan_flush_buffer(struct pagemap_scan_private *p) +{ + const struct page_region *buf = p->vec_buf; + long n = p->vec_buf_index; + + if (!p->vec_buf) + return 0; + + if (buf[n].end != buf[n].start) + n++; + + if (!n) + return 0; + + if (copy_to_user(p->vec_out, buf, n * sizeof(*buf))) + return -EFAULT; + + p->arg.vec_len -= n; + p->vec_out += n; + + p->vec_buf_index = 0; + p->vec_buf_len = min_t(size_t, p->vec_buf_len, p->arg.vec_len); + p->vec_buf->start = p->vec_buf->end = 0; + + return n; +} + +static long do_pagemap_scan(struct mm_struct *mm, unsigned long uarg) +{ + struct mmu_notifier_range range; + struct pagemap_scan_private p = {0}; + unsigned long walk_start; + size_t n_ranges_out = 0; + int ret; + + ret = pagemap_scan_get_args(&p.arg, uarg); + if (ret) + return ret; + + p.masks_of_interest = p.arg.category_mask | p.arg.category_anyof_mask | + p.arg.return_mask; + ret = pagemap_scan_init_bounce_buffer(&p); + if (ret) + return ret; + + /* Protection change for the range is going to happen. */ + if (p.arg.flags & PM_SCAN_WP_MATCHING) { + mmu_notifier_range_init(&range, MMU_NOTIFY_PROTECTION_VMA, 0, + mm, p.arg.start, p.arg.end); + mmu_notifier_invalidate_range_start(&range); + } + + for (walk_start = p.arg.start; walk_start < p.arg.end; + walk_start = p.arg.walk_end) { + long n_out; + + if (fatal_signal_pending(current)) { + ret = -EINTR; + break; + } + + ret = mmap_read_lock_killable(mm); + if (ret) + break; + ret = walk_page_range(mm, walk_start, p.arg.end, + &pagemap_scan_ops, &p); + mmap_read_unlock(mm); + + n_out = pagemap_scan_flush_buffer(&p); + if (n_out < 0) + ret = n_out; + else + n_ranges_out += n_out; + + if (ret != -ENOSPC) + break; + + if (p.arg.vec_len == 0 || p.found_pages == p.arg.max_pages) + break; + } + + /* ENOSPC signifies early stop (buffer full) from the walk. */ + if (!ret || ret == -ENOSPC) + ret = n_ranges_out; + + /* The walk_end isn't set when ret is zero */ + if (!p.arg.walk_end) + p.arg.walk_end = p.arg.end; + if (pagemap_scan_writeback_args(&p.arg, uarg)) + ret = -EFAULT; + + if (p.arg.flags & PM_SCAN_WP_MATCHING) + mmu_notifier_invalidate_range_end(&range); + + kfree(p.vec_buf); + return ret; +} + +static long do_pagemap_cmd(struct file *file, unsigned int cmd, + unsigned long arg) +{ + struct mm_struct *mm = file->private_data; + + switch (cmd) { + case PAGEMAP_SCAN: + return do_pagemap_scan(mm, arg); + + default: + return -EINVAL; + } +} + const struct file_operations proc_pagemap_operations = { .llseek = mem_lseek, /* borrow this */ .read = pagemap_read, .open = pagemap_open, .release = pagemap_release, + .unlocked_ioctl = do_pagemap_cmd, + .compat_ioctl = do_pagemap_cmd, }; #endif /* CONFIG_PROC_PAGE_MONITOR */ @@ -1952,8 +2684,9 @@ static int show_numa_map(struct seq_file *m, void *v) struct numa_maps *md = &numa_priv->md; struct file *file = vma->vm_file; struct mm_struct *mm = vma->vm_mm; - struct mempolicy *pol; char buffer[64]; + struct mempolicy *pol; + pgoff_t ilx; int nid; if (!mm) @@ -1962,7 +2695,7 @@ static int show_numa_map(struct seq_file *m, void *v) /* Ensure we start with an empty set of numa_maps statistics. */ memset(md, 0, sizeof(*md)); - pol = __get_vma_policy(vma, vma->vm_start); + pol = __get_vma_policy(vma, vma->vm_start, &ilx); if (pol) { mpol_to_str(buffer, sizeof(buffer), pol); mpol_cond_put(pol); @@ -1974,10 +2707,10 @@ static int show_numa_map(struct seq_file *m, void *v) if (file) { seq_puts(m, " file="); - seq_file_path(m, file, "\n\t= "); - } else if (vma->vm_start <= mm->brk && vma->vm_end >= mm->start_brk) { + seq_path(m, file_user_path(file), "\n\t= "); + } else if (vma_is_initial_heap(vma)) { seq_puts(m, " heap"); - } else if (is_stack(vma)) { + } else if (vma_is_initial_stack(vma)) { seq_puts(m, " stack"); } diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c index 2c8b62265981..bce674533000 100644 --- a/fs/proc/task_nommu.c +++ b/fs/proc/task_nommu.c @@ -121,19 +121,6 @@ unsigned long task_statm(struct mm_struct *mm, return size; } -static int is_stack(struct vm_area_struct *vma) -{ - struct mm_struct *mm = vma->vm_mm; - - /* - * We make no effort to guess what a given thread considers to be - * its "stack". It's not even well-defined for programs written - * languages like Go. - */ - return vma->vm_start <= mm->start_stack && - vma->vm_end >= mm->start_stack; -} - /* * display a single VMA to a sequenced file */ @@ -170,8 +157,8 @@ static int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma) if (file) { seq_pad(m, ' '); - seq_file_path(m, file, ""); - } else if (mm && is_stack(vma)) { + seq_path(m, file_user_path(file), ""); + } else if (mm && vma_is_initial_stack(vma)) { seq_pad(m, ' '); seq_puts(m, "[stack]"); } @@ -188,15 +175,28 @@ static int show_map(struct seq_file *m, void *_p) return nommu_vma_show(m, _p); } -static void *m_start(struct seq_file *m, loff_t *pos) +static struct vm_area_struct *proc_get_vma(struct proc_maps_private *priv, + loff_t *ppos) +{ + struct vm_area_struct *vma = vma_next(&priv->iter); + + if (vma) { + *ppos = vma->vm_start; + } else { + *ppos = -1UL; + } + + return vma; +} + +static void *m_start(struct seq_file *m, loff_t *ppos) { struct proc_maps_private *priv = m->private; + unsigned long last_addr = *ppos; struct mm_struct *mm; - struct vm_area_struct *vma; - unsigned long addr = *pos; - /* See m_next(). Zero at the start or after lseek. */ - if (addr == -1UL) + /* See proc_get_vma(). Zero at the start or after lseek. */ + if (last_addr == -1UL) return NULL; /* pin the task and mm whilst we play with them */ @@ -205,44 +205,41 @@ static void *m_start(struct seq_file *m, loff_t *pos) return ERR_PTR(-ESRCH); mm = priv->mm; - if (!mm || !mmget_not_zero(mm)) + if (!mm || !mmget_not_zero(mm)) { + put_task_struct(priv->task); + priv->task = NULL; return NULL; + } if (mmap_read_lock_killable(mm)) { mmput(mm); + put_task_struct(priv->task); + priv->task = NULL; return ERR_PTR(-EINTR); } - /* start the next element from addr */ - vma = find_vma(mm, addr); - if (vma) - return vma; + vma_iter_init(&priv->iter, mm, last_addr); - mmap_read_unlock(mm); - mmput(mm); - return NULL; + return proc_get_vma(priv, ppos); } -static void m_stop(struct seq_file *m, void *_vml) +static void m_stop(struct seq_file *m, void *v) { struct proc_maps_private *priv = m->private; + struct mm_struct *mm = priv->mm; - if (!IS_ERR_OR_NULL(_vml)) { - mmap_read_unlock(priv->mm); - mmput(priv->mm); - } - if (priv->task) { - put_task_struct(priv->task); - priv->task = NULL; - } + if (!priv->task) + return; + + mmap_read_unlock(mm); + mmput(mm); + put_task_struct(priv->task); + priv->task = NULL; } -static void *m_next(struct seq_file *m, void *_p, loff_t *pos) +static void *m_next(struct seq_file *m, void *_p, loff_t *ppos) { - struct vm_area_struct *vma = _p; - - *pos = vma->vm_end; - return find_vma(vma->vm_mm, vma->vm_end); + return proc_get_vma(m->private, ppos); } static const struct seq_operations proc_pid_maps_ops = { diff --git a/fs/proc/thread_self.c b/fs/proc/thread_self.c index a553273fbd41..0e5050d6ab64 100644 --- a/fs/proc/thread_self.c +++ b/fs/proc/thread_self.c @@ -46,7 +46,7 @@ int proc_setup_thread_self(struct super_block *s) struct inode *inode = new_inode(s); if (inode) { inode->i_ino = thread_self_inum; - inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode); + simple_inode_init_ts(inode); inode->i_mode = S_IFLNK | S_IRWXUGO; inode->i_uid = GLOBAL_ROOT_UID; inode->i_gid = GLOBAL_ROOT_GID; diff --git a/fs/pstore/Kconfig b/fs/pstore/Kconfig index c49d554cc9ae..3acc38600cd1 100644 --- a/fs/pstore/Kconfig +++ b/fs/pstore/Kconfig @@ -1,7 +1,6 @@ # SPDX-License-Identifier: GPL-2.0-only config PSTORE tristate "Persistent store support" - select CRYPTO if PSTORE_COMPRESS default n help This option enables generic access to platform level @@ -22,99 +21,18 @@ config PSTORE_DEFAULT_KMSG_BYTES Defines default size of pstore kernel log storage. Can be enlarged if needed, not recommended to shrink it. -config PSTORE_DEFLATE_COMPRESS - tristate "DEFLATE (ZLIB) compression" - default y - depends on PSTORE - select CRYPTO_DEFLATE - help - This option enables DEFLATE (also known as ZLIB) compression - algorithm support. - -config PSTORE_LZO_COMPRESS - tristate "LZO compression" - depends on PSTORE - select CRYPTO_LZO - help - This option enables LZO compression algorithm support. - -config PSTORE_LZ4_COMPRESS - tristate "LZ4 compression" - depends on PSTORE - select CRYPTO_LZ4 - help - This option enables LZ4 compression algorithm support. - -config PSTORE_LZ4HC_COMPRESS - tristate "LZ4HC compression" - depends on PSTORE - select CRYPTO_LZ4HC - help - This option enables LZ4HC (high compression) mode algorithm. - -config PSTORE_842_COMPRESS - bool "842 compression" - depends on PSTORE - select CRYPTO_842 - help - This option enables 842 compression algorithm support. - -config PSTORE_ZSTD_COMPRESS - bool "zstd compression" - depends on PSTORE - select CRYPTO_ZSTD - help - This option enables zstd compression algorithm support. - config PSTORE_COMPRESS - def_bool y + bool "Pstore compression (deflate)" depends on PSTORE - depends on PSTORE_DEFLATE_COMPRESS || PSTORE_LZO_COMPRESS || \ - PSTORE_LZ4_COMPRESS || PSTORE_LZ4HC_COMPRESS || \ - PSTORE_842_COMPRESS || PSTORE_ZSTD_COMPRESS - -choice - prompt "Default pstore compression algorithm" - depends on PSTORE_COMPRESS + select ZLIB_INFLATE + select ZLIB_DEFLATE + default y help - This option chooses the default active compression algorithm. - This change be changed at boot with "pstore.compress=..." on - the kernel command line. - - Currently, pstore has support for 6 compression algorithms: - deflate, lzo, lz4, lz4hc, 842 and zstd. - - The default compression algorithm is deflate. - - config PSTORE_DEFLATE_COMPRESS_DEFAULT - bool "deflate" if PSTORE_DEFLATE_COMPRESS - - config PSTORE_LZO_COMPRESS_DEFAULT - bool "lzo" if PSTORE_LZO_COMPRESS - - config PSTORE_LZ4_COMPRESS_DEFAULT - bool "lz4" if PSTORE_LZ4_COMPRESS - - config PSTORE_LZ4HC_COMPRESS_DEFAULT - bool "lz4hc" if PSTORE_LZ4HC_COMPRESS - - config PSTORE_842_COMPRESS_DEFAULT - bool "842" if PSTORE_842_COMPRESS - - config PSTORE_ZSTD_COMPRESS_DEFAULT - bool "zstd" if PSTORE_ZSTD_COMPRESS - -endchoice - -config PSTORE_COMPRESS_DEFAULT - string - depends on PSTORE_COMPRESS - default "deflate" if PSTORE_DEFLATE_COMPRESS_DEFAULT - default "lzo" if PSTORE_LZO_COMPRESS_DEFAULT - default "lz4" if PSTORE_LZ4_COMPRESS_DEFAULT - default "lz4hc" if PSTORE_LZ4HC_COMPRESS_DEFAULT - default "842" if PSTORE_842_COMPRESS_DEFAULT - default "zstd" if PSTORE_ZSTD_COMPRESS_DEFAULT + Whether pstore records should be compressed before being written to + the backing store. This is implemented using the zlib 'deflate' + algorithm, using the library implementation instead of using the full + blown crypto API. This reduces the risk of secondary oopses or other + problems while pstore is recording panic metadata. config PSTORE_CONSOLE bool "Log kernel console messages" diff --git a/fs/pstore/inode.c b/fs/pstore/inode.c index ffbadb8b3032..d41c20d1b5e8 100644 --- a/fs/pstore/inode.c +++ b/fs/pstore/inode.c @@ -54,7 +54,7 @@ static void free_pstore_private(struct pstore_private *private) if (!private) return; if (private->record) { - kfree(private->record->buf); + kvfree(private->record->buf); kfree(private->record->priv); kfree(private->record); } @@ -223,7 +223,7 @@ static struct inode *pstore_get_inode(struct super_block *sb) struct inode *inode = new_inode(sb); if (inode) { inode->i_ino = get_next_ino(); - inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode); + simple_inode_init_ts(inode); } return inode; } @@ -390,7 +390,8 @@ int pstore_mkfile(struct dentry *root, struct pstore_record *record) inode->i_private = private; if (record->time.tv_sec) - inode->i_mtime = inode->i_ctime = record->time; + inode_set_mtime_to_ts(inode, + inode_set_ctime_to_ts(inode, record->time)); d_add(dentry, inode); diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c index cbc0b468c1ab..03425928d2fb 100644 --- a/fs/pstore/platform.c +++ b/fs/pstore/platform.c @@ -14,24 +14,17 @@ #include <linux/init.h> #include <linux/kmsg_dump.h> #include <linux/console.h> +#include <linux/mm.h> #include <linux/module.h> #include <linux/pstore.h> -#if IS_ENABLED(CONFIG_PSTORE_LZO_COMPRESS) -#include <linux/lzo.h> -#endif -#if IS_ENABLED(CONFIG_PSTORE_LZ4_COMPRESS) || IS_ENABLED(CONFIG_PSTORE_LZ4HC_COMPRESS) -#include <linux/lz4.h> -#endif -#if IS_ENABLED(CONFIG_PSTORE_ZSTD_COMPRESS) -#include <linux/zstd.h> -#endif -#include <linux/crypto.h> #include <linux/string.h> #include <linux/timer.h> #include <linux/slab.h> #include <linux/uaccess.h> #include <linux/jiffies.h> +#include <linux/vmalloc.h> #include <linux/workqueue.h> +#include <linux/zlib.h> #include "internal.h" @@ -80,12 +73,21 @@ static char *backend; module_param(backend, charp, 0444); MODULE_PARM_DESC(backend, "specific backend to use"); -static char *compress = -#ifdef CONFIG_PSTORE_COMPRESS_DEFAULT - CONFIG_PSTORE_COMPRESS_DEFAULT; -#else - NULL; -#endif +/* + * pstore no longer implements compression via the crypto API, and only + * supports zlib deflate compression implemented using the zlib library + * interface. This removes additional complexity which is hard to justify for a + * diagnostic facility that has to operate in conditions where the system may + * have become unstable. Zlib deflate is comparatively small in terms of code + * size, and compresses ASCII text comparatively well. In terms of compression + * speed, deflate is not the best performer but for recording the log output on + * a kernel panic, this is not considered critical. + * + * The only remaining arguments supported by the compress= module parameter are + * 'deflate' and 'none'. To retain compatibility with existing installations, + * all other values are logged and replaced with 'deflate'. + */ +static char *compress = "deflate"; module_param(compress, charp, 0444); MODULE_PARM_DESC(compress, "compression to use"); @@ -94,16 +96,16 @@ unsigned long kmsg_bytes = CONFIG_PSTORE_DEFAULT_KMSG_BYTES; module_param(kmsg_bytes, ulong, 0444); MODULE_PARM_DESC(kmsg_bytes, "amount of kernel log to snapshot (in bytes)"); -/* Compression parameters */ -static struct crypto_comp *tfm; +static void *compress_workspace; -struct pstore_zbackend { - int (*zbufsize)(size_t size); - const char *name; -}; +/* + * Compression is only used for dmesg output, which consists of low-entropy + * ASCII text, and so we can assume worst-case 60%. + */ +#define DMESG_COMP_PERCENT 60 static char *big_oops_buf; -static size_t big_oops_buf_sz; +static size_t max_compressed_size; void pstore_set_kmsg_bytes(int bytes) { @@ -168,206 +170,93 @@ static bool pstore_cannot_block_path(enum kmsg_dump_reason reason) } } -#if IS_ENABLED(CONFIG_PSTORE_DEFLATE_COMPRESS) -static int zbufsize_deflate(size_t size) -{ - size_t cmpr; - - switch (size) { - /* buffer range for efivars */ - case 1000 ... 2000: - cmpr = 56; - break; - case 2001 ... 3000: - cmpr = 54; - break; - case 3001 ... 3999: - cmpr = 52; - break; - /* buffer range for nvram, erst */ - case 4000 ... 10000: - cmpr = 45; - break; - default: - cmpr = 60; - break; - } - - return (size * 100) / cmpr; -} -#endif - -#if IS_ENABLED(CONFIG_PSTORE_LZO_COMPRESS) -static int zbufsize_lzo(size_t size) -{ - return lzo1x_worst_compress(size); -} -#endif - -#if IS_ENABLED(CONFIG_PSTORE_LZ4_COMPRESS) || IS_ENABLED(CONFIG_PSTORE_LZ4HC_COMPRESS) -static int zbufsize_lz4(size_t size) -{ - return LZ4_compressBound(size); -} -#endif - -#if IS_ENABLED(CONFIG_PSTORE_842_COMPRESS) -static int zbufsize_842(size_t size) -{ - return size; -} -#endif - -#if IS_ENABLED(CONFIG_PSTORE_ZSTD_COMPRESS) -static int zbufsize_zstd(size_t size) -{ - return zstd_compress_bound(size); -} -#endif - -static const struct pstore_zbackend *zbackend __ro_after_init; - -static const struct pstore_zbackend zbackends[] = { -#if IS_ENABLED(CONFIG_PSTORE_DEFLATE_COMPRESS) - { - .zbufsize = zbufsize_deflate, - .name = "deflate", - }, -#endif -#if IS_ENABLED(CONFIG_PSTORE_LZO_COMPRESS) - { - .zbufsize = zbufsize_lzo, - .name = "lzo", - }, -#endif -#if IS_ENABLED(CONFIG_PSTORE_LZ4_COMPRESS) - { - .zbufsize = zbufsize_lz4, - .name = "lz4", - }, -#endif -#if IS_ENABLED(CONFIG_PSTORE_LZ4HC_COMPRESS) - { - .zbufsize = zbufsize_lz4, - .name = "lz4hc", - }, -#endif -#if IS_ENABLED(CONFIG_PSTORE_842_COMPRESS) - { - .zbufsize = zbufsize_842, - .name = "842", - }, -#endif -#if IS_ENABLED(CONFIG_PSTORE_ZSTD_COMPRESS) - { - .zbufsize = zbufsize_zstd, - .name = "zstd", - }, -#endif - { } -}; - static int pstore_compress(const void *in, void *out, unsigned int inlen, unsigned int outlen) { + struct z_stream_s zstream = { + .next_in = in, + .avail_in = inlen, + .next_out = out, + .avail_out = outlen, + .workspace = compress_workspace, + }; int ret; if (!IS_ENABLED(CONFIG_PSTORE_COMPRESS)) return -EINVAL; - ret = crypto_comp_compress(tfm, in, inlen, out, &outlen); - if (ret) { - pr_err("crypto_comp_compress failed, ret = %d!\n", ret); - return ret; - } + ret = zlib_deflateInit2(&zstream, Z_DEFAULT_COMPRESSION, Z_DEFLATED, + -MAX_WBITS, DEF_MEM_LEVEL, Z_DEFAULT_STRATEGY); + if (ret != Z_OK) + return -EINVAL; + + ret = zlib_deflate(&zstream, Z_FINISH); + if (ret != Z_STREAM_END) + return -EINVAL; - return outlen; + ret = zlib_deflateEnd(&zstream); + if (ret != Z_OK) + pr_warn_once("zlib_deflateEnd() failed: %d\n", ret); + + return zstream.total_out; } static void allocate_buf_for_compression(void) { - struct crypto_comp *ctx; - int size; + size_t compressed_size; char *buf; - /* Skip if not built-in or compression backend not selected yet. */ - if (!IS_ENABLED(CONFIG_PSTORE_COMPRESS) || !zbackend) - return; - - /* Skip if no pstore backend yet or compression init already done. */ - if (!psinfo || tfm) - return; - - if (!crypto_has_comp(zbackend->name, 0, 0)) { - pr_err("Unknown compression: %s\n", zbackend->name); + /* Skip if not built-in or compression disabled. */ + if (!IS_ENABLED(CONFIG_PSTORE_COMPRESS) || !compress || + !strcmp(compress, "none")) { + compress = NULL; return; } - size = zbackend->zbufsize(psinfo->bufsize); - if (size <= 0) { - pr_err("Invalid compression size for %s: %d\n", - zbackend->name, size); - return; + if (strcmp(compress, "deflate")) { + pr_err("Unsupported compression '%s', falling back to deflate\n", + compress); + compress = "deflate"; } - buf = kmalloc(size, GFP_KERNEL); + /* + * The compression buffer only needs to be as large as the maximum + * uncompressed record size, since any record that would be expanded by + * compression is just stored uncompressed. + */ + compressed_size = (psinfo->bufsize * 100) / DMESG_COMP_PERCENT; + buf = kvzalloc(compressed_size, GFP_KERNEL); if (!buf) { - pr_err("Failed %d byte compression buffer allocation for: %s\n", - size, zbackend->name); + pr_err("Failed %zu byte compression buffer allocation for: %s\n", + psinfo->bufsize, compress); return; } - ctx = crypto_alloc_comp(zbackend->name, 0, 0); - if (IS_ERR_OR_NULL(ctx)) { - kfree(buf); - pr_err("crypto_alloc_comp('%s') failed: %ld\n", zbackend->name, - PTR_ERR(ctx)); + compress_workspace = + vmalloc(zlib_deflate_workspacesize(MAX_WBITS, DEF_MEM_LEVEL)); + if (!compress_workspace) { + pr_err("Failed to allocate zlib deflate workspace\n"); + kvfree(buf); return; } /* A non-NULL big_oops_buf indicates compression is available. */ - tfm = ctx; - big_oops_buf_sz = size; big_oops_buf = buf; + max_compressed_size = compressed_size; - pr_info("Using crash dump compression: %s\n", zbackend->name); + pr_info("Using crash dump compression: %s\n", compress); } static void free_buf_for_compression(void) { - if (IS_ENABLED(CONFIG_PSTORE_COMPRESS) && tfm) { - crypto_free_comp(tfm); - tfm = NULL; + if (IS_ENABLED(CONFIG_PSTORE_COMPRESS) && compress_workspace) { + vfree(compress_workspace); + compress_workspace = NULL; } - kfree(big_oops_buf); - big_oops_buf = NULL; - big_oops_buf_sz = 0; -} -/* - * Called when compression fails, since the printk buffer - * would be fetched for compression calling it again when - * compression fails would have moved the iterator of - * printk buffer which results in fetching old contents. - * Copy the recent messages from big_oops_buf to psinfo->buf - */ -static size_t copy_kmsg_to_buffer(int hsize, size_t len) -{ - size_t total_len; - size_t diff; - - total_len = hsize + len; - - if (total_len > psinfo->bufsize) { - diff = total_len - psinfo->bufsize + hsize; - memcpy(psinfo->buf, big_oops_buf, hsize); - memcpy(psinfo->buf + hsize, big_oops_buf + diff, - psinfo->bufsize - hsize); - total_len = psinfo->bufsize; - } else - memcpy(psinfo->buf, big_oops_buf, total_len); - - return total_len; + kvfree(big_oops_buf); + big_oops_buf = NULL; + max_compressed_size = 0; } void pstore_record_init(struct pstore_record *record, @@ -426,13 +315,8 @@ static void pstore_dump(struct kmsg_dumper *dumper, record.part = part; record.buf = psinfo->buf; - if (big_oops_buf) { - dst = big_oops_buf; - dst_size = big_oops_buf_sz; - } else { - dst = psinfo->buf; - dst_size = psinfo->bufsize; - } + dst = big_oops_buf ?: psinfo->buf; + dst_size = max_compressed_size ?: psinfo->bufsize; /* Write dump header. */ header_size = snprintf(dst, dst_size, "%s#%d Part%u\n", why, @@ -453,8 +337,15 @@ static void pstore_dump(struct kmsg_dumper *dumper, record.compressed = true; record.size = zipped_len; } else { - record.size = copy_kmsg_to_buffer(header_size, - dump_size); + /* + * Compression failed, so the buffer is most + * likely filled with binary data that does not + * compress as well as ASCII text. Copy as much + * of the uncompressed data as possible into + * the pstore record, and discard the rest. + */ + record.size = psinfo->bufsize; + memcpy(psinfo->buf, dst, psinfo->bufsize); } } else { record.size = header_size + dump_size; @@ -549,7 +440,7 @@ static int pstore_write_user_compat(struct pstore_record *record, if (record->buf) return -EINVAL; - record->buf = memdup_user(buf, record->size); + record->buf = vmemdup_user(buf, record->size); if (IS_ERR(record->buf)) { ret = PTR_ERR(record->buf); goto out; @@ -557,7 +448,7 @@ static int pstore_write_user_compat(struct pstore_record *record, ret = record->psi->write(record); - kfree(record->buf); + kvfree(record->buf); out: record->buf = NULL; @@ -573,6 +464,8 @@ out: */ int pstore_register(struct pstore_info *psi) { + char *new_backend; + if (backend && strcmp(backend, psi->name)) { pr_warn("backend '%s' already in use: ignoring '%s'\n", backend, psi->name); @@ -593,11 +486,16 @@ int pstore_register(struct pstore_info *psi) return -EINVAL; } + new_backend = kstrdup(psi->name, GFP_KERNEL); + if (!new_backend) + return -ENOMEM; + mutex_lock(&psinfo_lock); if (psinfo) { pr_warn("backend '%s' already loaded: ignoring '%s'\n", psinfo->name, psi->name); mutex_unlock(&psinfo_lock); + kfree(new_backend); return -EBUSY; } @@ -630,7 +528,7 @@ int pstore_register(struct pstore_info *psi) * Update the module parameter backend, so it is visible * through /sys/module/pstore/parameters/backend */ - backend = kstrdup(psi->name, GFP_KERNEL); + backend = new_backend; pr_info("Registered %s as persistent store backend\n", psi->name); @@ -681,11 +579,13 @@ void pstore_unregister(struct pstore_info *psi) } EXPORT_SYMBOL_GPL(pstore_unregister); -static void decompress_record(struct pstore_record *record) +static void decompress_record(struct pstore_record *record, + struct z_stream_s *zstream) { int ret; int unzipped_len; char *unzipped, *workspace; + size_t max_uncompressed_size; if (!IS_ENABLED(CONFIG_PSTORE_COMPRESS) || !record->compressed) return; @@ -697,40 +597,51 @@ static void decompress_record(struct pstore_record *record) } /* Missing compression buffer means compression was not initialized. */ - if (!big_oops_buf) { + if (!zstream->workspace) { pr_warn("no decompression method initialized!\n"); return; } + ret = zlib_inflateReset(zstream); + if (ret != Z_OK) { + pr_err("zlib_inflateReset() failed, ret = %d!\n", ret); + return; + } + /* Allocate enough space to hold max decompression and ECC. */ - unzipped_len = big_oops_buf_sz; - workspace = kmalloc(unzipped_len + record->ecc_notice_size, - GFP_KERNEL); + max_uncompressed_size = 3 * psinfo->bufsize; + workspace = kvzalloc(max_uncompressed_size + record->ecc_notice_size, + GFP_KERNEL); if (!workspace) return; - /* After decompression "unzipped_len" is almost certainly smaller. */ - ret = crypto_comp_decompress(tfm, record->buf, record->size, - workspace, &unzipped_len); - if (ret) { - pr_err("crypto_comp_decompress failed, ret = %d!\n", ret); - kfree(workspace); + zstream->next_in = record->buf; + zstream->avail_in = record->size; + zstream->next_out = workspace; + zstream->avail_out = max_uncompressed_size; + + ret = zlib_inflate(zstream, Z_FINISH); + if (ret != Z_STREAM_END) { + pr_err_ratelimited("zlib_inflate() failed, ret = %d!\n", ret); + kvfree(workspace); return; } + unzipped_len = zstream->total_out; + /* Append ECC notice to decompressed buffer. */ memcpy(workspace + unzipped_len, record->buf + record->size, record->ecc_notice_size); /* Copy decompressed contents into an minimum-sized allocation. */ - unzipped = kmemdup(workspace, unzipped_len + record->ecc_notice_size, - GFP_KERNEL); - kfree(workspace); + unzipped = kvmemdup(workspace, unzipped_len + record->ecc_notice_size, + GFP_KERNEL); + kvfree(workspace); if (!unzipped) return; /* Swap out compressed contents with decompressed contents. */ - kfree(record->buf); + kvfree(record->buf); record->buf = unzipped; record->size = unzipped_len; record->compressed = false; @@ -747,10 +658,17 @@ void pstore_get_backend_records(struct pstore_info *psi, { int failed = 0; unsigned int stop_loop = 65536; + struct z_stream_s zstream = {}; if (!psi || !root) return; + if (IS_ENABLED(CONFIG_PSTORE_COMPRESS) && compress) { + zstream.workspace = kvmalloc(zlib_inflate_workspacesize(), + GFP_KERNEL); + zlib_inflateInit2(&zstream, -DEF_WBITS); + } + mutex_lock(&psi->read_mutex); if (psi->open && psi->open(psi)) goto out; @@ -779,11 +697,11 @@ void pstore_get_backend_records(struct pstore_info *psi, break; } - decompress_record(record); + decompress_record(record, &zstream); rc = pstore_mkfile(root, record); if (rc) { /* pstore_mkfile() did not take record, so free it. */ - kfree(record->buf); + kvfree(record->buf); kfree(record->priv); kfree(record); if (rc != -EEXIST || !quiet) @@ -795,6 +713,12 @@ void pstore_get_backend_records(struct pstore_info *psi, out: mutex_unlock(&psi->read_mutex); + if (IS_ENABLED(CONFIG_PSTORE_COMPRESS) && compress) { + if (zlib_inflateEnd(&zstream) != Z_OK) + pr_warn("zlib_inflateEnd() failed\n"); + kvfree(zstream.workspace); + } + if (failed) pr_warn("failed to create %d record(s) from '%s'\n", failed, psi->name); @@ -818,34 +742,10 @@ static void pstore_timefunc(struct timer_list *unused) pstore_timer_kick(); } -static void __init pstore_choose_compression(void) -{ - const struct pstore_zbackend *step; - - if (!compress) - return; - - for (step = zbackends; step->name; step++) { - if (!strcmp(compress, step->name)) { - zbackend = step; - return; - } - } -} - static int __init pstore_init(void) { int ret; - pstore_choose_compression(); - - /* - * Check if any pstore backends registered earlier but did not - * initialize compression because crypto was not ready. If so, - * initialize compression now. - */ - allocate_buf_for_compression(); - ret = pstore_init_fs(); if (ret) free_buf_for_compression(); diff --git a/fs/pstore/ram.c b/fs/pstore/ram.c index 2f625e1fa8d8..d36702c7ab3c 100644 --- a/fs/pstore/ram.c +++ b/fs/pstore/ram.c @@ -20,6 +20,7 @@ #include <linux/compiler.h> #include <linux/of.h> #include <linux/of_address.h> +#include <linux/mm.h> #include "internal.h" #include "ram_internal.h" @@ -268,7 +269,7 @@ static ssize_t ramoops_pstore_read(struct pstore_record *record) /* ECC correction notice */ record->ecc_notice_size = persistent_ram_ecc_string(prz, NULL, 0); - record->buf = kmalloc(size + record->ecc_notice_size + 1, GFP_KERNEL); + record->buf = kvzalloc(size + record->ecc_notice_size + 1, GFP_KERNEL); if (record->buf == NULL) { size = -ENOMEM; goto out; @@ -282,7 +283,7 @@ static ssize_t ramoops_pstore_read(struct pstore_record *record) out: if (free_prz) { - kfree(prz->old_log); + kvfree(prz->old_log); kfree(prz); } @@ -833,7 +834,7 @@ static int ramoops_probe(struct platform_device *pdev) */ if (cxt->pstore.flags & PSTORE_FLAGS_DMESG) { cxt->pstore.bufsize = cxt->dprzs[0]->buffer_size; - cxt->pstore.buf = kzalloc(cxt->pstore.bufsize, GFP_KERNEL); + cxt->pstore.buf = kvzalloc(cxt->pstore.bufsize, GFP_KERNEL); if (!cxt->pstore.buf) { pr_err("cannot allocate pstore crash dump buffer\n"); err = -ENOMEM; @@ -866,7 +867,7 @@ static int ramoops_probe(struct platform_device *pdev) return 0; fail_buf: - kfree(cxt->pstore.buf); + kvfree(cxt->pstore.buf); fail_clear: cxt->pstore.bufsize = 0; fail_init: @@ -881,7 +882,7 @@ static void ramoops_remove(struct platform_device *pdev) pstore_unregister(&cxt->pstore); - kfree(cxt->pstore.buf); + kvfree(cxt->pstore.buf); cxt->pstore.bufsize = 0; ramoops_free_przs(cxt); diff --git a/fs/pstore/ram_core.c b/fs/pstore/ram_core.c index 85aaf0fc6d7d..650e437b55e6 100644 --- a/fs/pstore/ram_core.c +++ b/fs/pstore/ram_core.c @@ -17,6 +17,7 @@ #include <linux/slab.h> #include <linux/uaccess.h> #include <linux/vmalloc.h> +#include <linux/mm.h> #include <asm/page.h> #include "ram_internal.h" @@ -24,12 +25,10 @@ /** * struct persistent_ram_buffer - persistent circular RAM buffer * - * @sig: - * signature to indicate header (PERSISTENT_RAM_SIG xor PRZ-type value) - * @start: - * offset into @data where the beginning of the stored bytes begin - * @size: - * number of valid bytes stored in @data + * @sig: Signature to indicate header (PERSISTENT_RAM_SIG xor PRZ-type value) + * @start: First valid byte in the buffer. + * @size: Number of valid bytes in the buffer. + * @data: The contents of the buffer. */ struct persistent_ram_buffer { uint32_t sig; @@ -301,7 +300,7 @@ void persistent_ram_save_old(struct persistent_ram_zone *prz) if (!prz->old_log) { persistent_ram_ecc_old(prz); - prz->old_log = kmalloc(size, GFP_KERNEL); + prz->old_log = kvzalloc(size, GFP_KERNEL); } if (!prz->old_log) { pr_err("failed to allocate buffer\n"); @@ -385,7 +384,7 @@ void *persistent_ram_old(struct persistent_ram_zone *prz) void persistent_ram_free_old(struct persistent_ram_zone *prz) { - kfree(prz->old_log); + kvfree(prz->old_log); prz->old_log = NULL; prz->old_log_size = 0; } @@ -519,7 +518,7 @@ static int persistent_ram_post_init(struct persistent_ram_zone *prz, u32 sig, sig ^= PERSISTENT_RAM_SIG; if (prz->buffer->sig == sig) { - if (buffer_size(prz) == 0) { + if (buffer_size(prz) == 0 && buffer_start(prz) == 0) { pr_debug("found existing empty buffer\n"); return 0; } diff --git a/fs/qnx4/Kconfig b/fs/qnx4/Kconfig index 45b5b98376c4..a2eb826e76c6 100644 --- a/fs/qnx4/Kconfig +++ b/fs/qnx4/Kconfig @@ -2,6 +2,7 @@ config QNX4FS_FS tristate "QNX4 file system support (read only)" depends on BLOCK + select BUFFER_HEAD help This is the file system used by the real-time operating systems QNX 4 and QNX 6 (the latter is also called QNX RTP). diff --git a/fs/qnx4/inode.c b/fs/qnx4/inode.c index 391ea402920d..6eb9bb369b57 100644 --- a/fs/qnx4/inode.c +++ b/fs/qnx4/inode.c @@ -301,12 +301,9 @@ struct inode *qnx4_iget(struct super_block *sb, unsigned long ino) i_gid_write(inode, (gid_t)le16_to_cpu(raw_inode->di_gid)); set_nlink(inode, le16_to_cpu(raw_inode->di_nlink)); inode->i_size = le32_to_cpu(raw_inode->di_size); - inode->i_mtime.tv_sec = le32_to_cpu(raw_inode->di_mtime); - inode->i_mtime.tv_nsec = 0; - inode->i_atime.tv_sec = le32_to_cpu(raw_inode->di_atime); - inode->i_atime.tv_nsec = 0; - inode->i_ctime.tv_sec = le32_to_cpu(raw_inode->di_ctime); - inode->i_ctime.tv_nsec = 0; + inode_set_mtime(inode, le32_to_cpu(raw_inode->di_mtime), 0); + inode_set_atime(inode, le32_to_cpu(raw_inode->di_atime), 0); + inode_set_ctime(inode, le32_to_cpu(raw_inode->di_ctime), 0); inode->i_blocks = le32_to_cpu(raw_inode->di_first_xtnt.xtnt_size); memcpy(qnx4_inode, raw_inode, QNX4_DIR_ENTRY_SIZE); diff --git a/fs/qnx6/Kconfig b/fs/qnx6/Kconfig index 6a9d6bce1586..8e865d72204e 100644 --- a/fs/qnx6/Kconfig +++ b/fs/qnx6/Kconfig @@ -2,6 +2,7 @@ config QNX6FS_FS tristate "QNX6 file system support (read only)" depends on BLOCK && CRC32 + select BUFFER_HEAD help This is the file system used by the real-time operating systems QNX 6 (also called QNX RTP). diff --git a/fs/qnx6/inode.c b/fs/qnx6/inode.c index 85b2fa3b211c..a286c545717f 100644 --- a/fs/qnx6/inode.c +++ b/fs/qnx6/inode.c @@ -558,12 +558,9 @@ struct inode *qnx6_iget(struct super_block *sb, unsigned ino) i_uid_write(inode, (uid_t)fs32_to_cpu(sbi, raw_inode->di_uid)); i_gid_write(inode, (gid_t)fs32_to_cpu(sbi, raw_inode->di_gid)); inode->i_size = fs64_to_cpu(sbi, raw_inode->di_size); - inode->i_mtime.tv_sec = fs32_to_cpu(sbi, raw_inode->di_mtime); - inode->i_mtime.tv_nsec = 0; - inode->i_atime.tv_sec = fs32_to_cpu(sbi, raw_inode->di_atime); - inode->i_atime.tv_nsec = 0; - inode->i_ctime.tv_sec = fs32_to_cpu(sbi, raw_inode->di_ctime); - inode->i_ctime.tv_nsec = 0; + inode_set_mtime(inode, fs32_to_cpu(sbi, raw_inode->di_mtime), 0); + inode_set_atime(inode, fs32_to_cpu(sbi, raw_inode->di_atime), 0); + inode_set_ctime(inode, fs32_to_cpu(sbi, raw_inode->di_ctime), 0); /* calc blocks based on 512 byte blocksize */ inode->i_blocks = (inode->i_size + 511) >> 9; diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c index e3e4f4047657..58b5de081b57 100644 --- a/fs/quota/dquot.c +++ b/fs/quota/dquot.c @@ -225,18 +225,26 @@ static void put_quota_format(struct quota_format_type *fmt) /* * Dquot List Management: - * The quota code uses four lists for dquot management: the inuse_list, - * free_dquots, dqi_dirty_list, and dquot_hash[] array. A single dquot - * structure may be on some of those lists, depending on its current state. + * The quota code uses five lists for dquot management: the inuse_list, + * releasing_dquots, free_dquots, dqi_dirty_list, and dquot_hash[] array. + * A single dquot structure may be on some of those lists, depending on + * its current state. * * All dquots are placed to the end of inuse_list when first created, and this * list is used for invalidate operation, which must look at every dquot. * - * Unused dquots (dq_count == 0) are added to the free_dquots list when freed, - * and this list is searched whenever we need an available dquot. Dquots are - * removed from the list as soon as they are used again, and - * dqstats.free_dquots gives the number of dquots on the list. When - * dquot is invalidated it's completely released from memory. + * When the last reference of a dquot is dropped, the dquot is added to + * releasing_dquots. We'll then queue work item which will call + * synchronize_srcu() and after that perform the final cleanup of all the + * dquots on the list. Each cleaned up dquot is moved to free_dquots list. + * Both releasing_dquots and free_dquots use the dq_free list_head in the dquot + * struct. + * + * Unused and cleaned up dquots are in the free_dquots list and this list is + * searched whenever we need an available dquot. Dquots are removed from the + * list as soon as they are used again and dqstats.free_dquots gives the number + * of dquots on the list. When dquot is invalidated it's completely released + * from memory. * * Dirty dquots are added to the dqi_dirty_list of quota_info when mark * dirtied, and this list is searched when writing dirty dquots back to @@ -250,6 +258,7 @@ static void put_quota_format(struct quota_format_type *fmt) static LIST_HEAD(inuse_list); static LIST_HEAD(free_dquots); +static LIST_HEAD(releasing_dquots); static unsigned int dq_hash_bits, dq_hash_mask; static struct hlist_head *dquot_hash; @@ -260,6 +269,9 @@ static qsize_t inode_get_rsv_space(struct inode *inode); static qsize_t __inode_get_rsv_space(struct inode *inode); static int __dquot_initialize(struct inode *inode, int type); +static void quota_release_workfn(struct work_struct *work); +static DECLARE_DELAYED_WORK(quota_release_work, quota_release_workfn); + static inline unsigned int hashfn(const struct super_block *sb, struct kqid qid) { @@ -305,12 +317,21 @@ static inline void put_dquot_last(struct dquot *dquot) dqstats_inc(DQST_FREE_DQUOTS); } +static inline void put_releasing_dquots(struct dquot *dquot) +{ + list_add_tail(&dquot->dq_free, &releasing_dquots); + set_bit(DQ_RELEASING_B, &dquot->dq_flags); +} + static inline void remove_free_dquot(struct dquot *dquot) { if (list_empty(&dquot->dq_free)) return; list_del_init(&dquot->dq_free); - dqstats_dec(DQST_FREE_DQUOTS); + if (!test_bit(DQ_RELEASING_B, &dquot->dq_flags)) + dqstats_dec(DQST_FREE_DQUOTS); + else + clear_bit(DQ_RELEASING_B, &dquot->dq_flags); } static inline void put_inuse(struct dquot *dquot) @@ -336,6 +357,11 @@ static void wait_on_dquot(struct dquot *dquot) mutex_unlock(&dquot->dq_lock); } +static inline int dquot_active(struct dquot *dquot) +{ + return test_bit(DQ_ACTIVE_B, &dquot->dq_flags); +} + static inline int dquot_dirty(struct dquot *dquot) { return test_bit(DQ_MOD_B, &dquot->dq_flags); @@ -351,14 +377,14 @@ int dquot_mark_dquot_dirty(struct dquot *dquot) { int ret = 1; - if (!test_bit(DQ_ACTIVE_B, &dquot->dq_flags)) + if (!dquot_active(dquot)) return 0; if (sb_dqopt(dquot->dq_sb)->flags & DQUOT_NOLIST_DIRTY) return test_and_set_bit(DQ_MOD_B, &dquot->dq_flags); /* If quota is dirty already, we don't have to acquire dq_list_lock */ - if (test_bit(DQ_MOD_B, &dquot->dq_flags)) + if (dquot_dirty(dquot)) return 1; spin_lock(&dq_list_lock); @@ -440,7 +466,7 @@ int dquot_acquire(struct dquot *dquot) smp_mb__before_atomic(); set_bit(DQ_READ_B, &dquot->dq_flags); /* Instantiate dquot if needed */ - if (!test_bit(DQ_ACTIVE_B, &dquot->dq_flags) && !dquot->dq_off) { + if (!dquot_active(dquot) && !dquot->dq_off) { ret = dqopt->ops[dquot->dq_id.type]->commit_dqblk(dquot); /* Write the info if needed */ if (info_dirty(&dqopt->info[dquot->dq_id.type])) { @@ -482,7 +508,7 @@ int dquot_commit(struct dquot *dquot) goto out_lock; /* Inactive dquot can be only if there was error during read/init * => we have better not writing it */ - if (test_bit(DQ_ACTIVE_B, &dquot->dq_flags)) + if (dquot_active(dquot)) ret = dqopt->ops[dquot->dq_id.type]->commit_dqblk(dquot); else ret = -EIO; @@ -547,6 +573,8 @@ static void invalidate_dquots(struct super_block *sb, int type) struct dquot *dquot, *tmp; restart: + flush_delayed_work("a_release_work); + spin_lock(&dq_list_lock); list_for_each_entry_safe(dquot, tmp, &inuse_list, dq_inuse) { if (dquot->dq_sb != sb) @@ -574,6 +602,15 @@ restart: goto restart; } /* + * The last user already dropped its reference but dquot didn't + * get fully cleaned up yet. Restart the scan which flushes the + * work cleaning up released dquots. + */ + if (test_bit(DQ_RELEASING_B, &dquot->dq_flags)) { + spin_unlock(&dq_list_lock); + goto restart; + } + /* * Quota now has no users and it has been written on last * dqput() */ @@ -597,7 +634,7 @@ int dquot_scan_active(struct super_block *sb, spin_lock(&dq_list_lock); list_for_each_entry(dquot, &inuse_list, dq_inuse) { - if (!test_bit(DQ_ACTIVE_B, &dquot->dq_flags)) + if (!dquot_active(dquot)) continue; if (dquot->dq_sb != sb) continue; @@ -612,7 +649,7 @@ int dquot_scan_active(struct super_block *sb, * outstanding call and recheck the DQ_ACTIVE_B after that. */ wait_on_dquot(dquot); - if (test_bit(DQ_ACTIVE_B, &dquot->dq_flags)) { + if (dquot_active(dquot)) { ret = fn(dquot, priv); if (ret < 0) goto out; @@ -628,6 +665,18 @@ out: } EXPORT_SYMBOL(dquot_scan_active); +static inline int dquot_write_dquot(struct dquot *dquot) +{ + int ret = dquot->dq_sb->dq_op->write_dquot(dquot); + if (ret < 0) { + quota_error(dquot->dq_sb, "Can't write quota structure " + "(error %d). Quota may get out of sync!", ret); + /* Clear dirty bit anyway to avoid infinite loop. */ + clear_dquot_dirty(dquot); + } + return ret; +} + /* Write all dquot structures to quota files */ int dquot_writeback_dquots(struct super_block *sb, int type) { @@ -651,23 +700,23 @@ int dquot_writeback_dquots(struct super_block *sb, int type) dquot = list_first_entry(&dirty, struct dquot, dq_dirty); - WARN_ON(!test_bit(DQ_ACTIVE_B, &dquot->dq_flags)); + WARN_ON(!dquot_active(dquot)); + /* If the dquot is releasing we should not touch it */ + if (test_bit(DQ_RELEASING_B, &dquot->dq_flags)) { + spin_unlock(&dq_list_lock); + flush_delayed_work("a_release_work); + spin_lock(&dq_list_lock); + continue; + } /* Now we have active dquot from which someone is * holding reference so we can safely just increase * use count */ dqgrab(dquot); spin_unlock(&dq_list_lock); - err = sb->dq_op->write_dquot(dquot); - if (err) { - /* - * Clear dirty bit anyway to avoid infinite - * loop here. - */ - clear_dquot_dirty(dquot); - if (!ret) - ret = err; - } + err = dquot_write_dquot(dquot); + if (err && !ret) + ret = err; dqput(dquot); spin_lock(&dq_list_lock); } @@ -754,19 +803,53 @@ dqcache_shrink_count(struct shrinker *shrink, struct shrink_control *sc) percpu_counter_read_positive(&dqstats.counter[DQST_FREE_DQUOTS])); } -static struct shrinker dqcache_shrinker = { - .count_objects = dqcache_shrink_count, - .scan_objects = dqcache_shrink_scan, - .seeks = DEFAULT_SEEKS, -}; +/* + * Safely release dquot and put reference to dquot. + */ +static void quota_release_workfn(struct work_struct *work) +{ + struct dquot *dquot; + struct list_head rls_head; + + spin_lock(&dq_list_lock); + /* Exchange the list head to avoid livelock. */ + list_replace_init(&releasing_dquots, &rls_head); + spin_unlock(&dq_list_lock); + synchronize_srcu(&dquot_srcu); + +restart: + spin_lock(&dq_list_lock); + while (!list_empty(&rls_head)) { + dquot = list_first_entry(&rls_head, struct dquot, dq_free); + WARN_ON_ONCE(atomic_read(&dquot->dq_count)); + /* + * Note that DQ_RELEASING_B protects us from racing with + * invalidate_dquots() calls so we are safe to work with the + * dquot even after we drop dq_list_lock. + */ + if (dquot_dirty(dquot)) { + spin_unlock(&dq_list_lock); + /* Commit dquot before releasing */ + dquot_write_dquot(dquot); + goto restart; + } + if (dquot_active(dquot)) { + spin_unlock(&dq_list_lock); + dquot->dq_sb->dq_op->release_dquot(dquot); + goto restart; + } + /* Dquot is inactive and clean, now move it to free list */ + remove_free_dquot(dquot); + put_dquot_last(dquot); + } + spin_unlock(&dq_list_lock); +} /* * Put reference to dquot */ void dqput(struct dquot *dquot) { - int ret; - if (!dquot) return; #ifdef CONFIG_QUOTA_DEBUG @@ -778,7 +861,7 @@ void dqput(struct dquot *dquot) } #endif dqstats_inc(DQST_DROPS); -we_slept: + spin_lock(&dq_list_lock); if (atomic_read(&dquot->dq_count) > 1) { /* We have more than one user... nothing to do */ @@ -790,35 +873,16 @@ we_slept: spin_unlock(&dq_list_lock); return; } + /* Need to release dquot? */ - if (dquot_dirty(dquot)) { - spin_unlock(&dq_list_lock); - /* Commit dquot before releasing */ - ret = dquot->dq_sb->dq_op->write_dquot(dquot); - if (ret < 0) { - quota_error(dquot->dq_sb, "Can't write quota structure" - " (error %d). Quota may get out of sync!", - ret); - /* - * We clear dirty bit anyway, so that we avoid - * infinite loop here - */ - clear_dquot_dirty(dquot); - } - goto we_slept; - } - if (test_bit(DQ_ACTIVE_B, &dquot->dq_flags)) { - spin_unlock(&dq_list_lock); - dquot->dq_sb->dq_op->release_dquot(dquot); - goto we_slept; - } - atomic_dec(&dquot->dq_count); #ifdef CONFIG_QUOTA_DEBUG /* sanity check */ BUG_ON(!list_empty(&dquot->dq_free)); #endif - put_dquot_last(dquot); + put_releasing_dquots(dquot); + atomic_dec(&dquot->dq_count); spin_unlock(&dq_list_lock); + queue_delayed_work(system_unbound_wq, "a_release_work, 1); } EXPORT_SYMBOL(dqput); @@ -905,10 +969,10 @@ we_slept: dqstats_inc(DQST_LOOKUPS); } /* Wait for dq_lock - after this we know that either dquot_release() is - * already finished or it will be canceled due to dq_count > 1 test */ + * already finished or it will be canceled due to dq_count > 0 test */ wait_on_dquot(dquot); /* Read the dquot / allocate space in quota file */ - if (!test_bit(DQ_ACTIVE_B, &dquot->dq_flags)) { + if (!dquot_active(dquot)) { int err; err = sb->dq_op->acquire_dquot(dquot); @@ -1014,59 +1078,7 @@ out: return err; } -/* - * Remove references to dquots from inode and add dquot to list for freeing - * if we have the last reference to dquot - */ -static void remove_inode_dquot_ref(struct inode *inode, int type, - struct list_head *tofree_head) -{ - struct dquot **dquots = i_dquot(inode); - struct dquot *dquot = dquots[type]; - - if (!dquot) - return; - - dquots[type] = NULL; - if (list_empty(&dquot->dq_free)) { - /* - * The inode still has reference to dquot so it can't be in the - * free list - */ - spin_lock(&dq_list_lock); - list_add(&dquot->dq_free, tofree_head); - spin_unlock(&dq_list_lock); - } else { - /* - * Dquot is already in a list to put so we won't drop the last - * reference here. - */ - dqput(dquot); - } -} - -/* - * Free list of dquots - * Dquots are removed from inodes and no new references can be got so we are - * the only ones holding reference - */ -static void put_dquot_list(struct list_head *tofree_head) -{ - struct list_head *act_head; - struct dquot *dquot; - - act_head = tofree_head->next; - while (act_head != tofree_head) { - dquot = list_entry(act_head, struct dquot, dq_free); - act_head = act_head->next; - /* Remove dquot from the list so we won't have problems... */ - list_del_init(&dquot->dq_free); - dqput(dquot); - } -} - -static void remove_dquot_ref(struct super_block *sb, int type, - struct list_head *tofree_head) +static void remove_dquot_ref(struct super_block *sb, int type) { struct inode *inode; #ifdef CONFIG_QUOTA_DEBUG @@ -1083,11 +1095,16 @@ static void remove_dquot_ref(struct super_block *sb, int type, */ spin_lock(&dq_data_lock); if (!IS_NOQUOTA(inode)) { + struct dquot **dquots = i_dquot(inode); + struct dquot *dquot = dquots[type]; + #ifdef CONFIG_QUOTA_DEBUG if (unlikely(inode_get_rsv_space(inode) > 0)) reserved = 1; #endif - remove_inode_dquot_ref(inode, type, tofree_head); + dquots[type] = NULL; + if (dquot) + dqput(dquot); } spin_unlock(&dq_data_lock); } @@ -1104,13 +1121,8 @@ static void remove_dquot_ref(struct super_block *sb, int type, /* Gather all references from inodes and drop them */ static void drop_dquot_ref(struct super_block *sb, int type) { - LIST_HEAD(tofree_head); - - if (sb->dq_op) { - remove_dquot_ref(sb, type, &tofree_head); - synchronize_srcu(&dquot_srcu); - put_dquot_list(&tofree_head); - } + if (sb->dq_op) + remove_dquot_ref(sb, type); } static inline @@ -1425,7 +1437,7 @@ static int info_bdq_free(struct dquot *dquot, qsize_t space) return QUOTA_NL_NOWARN; } -static int dquot_active(const struct inode *inode) +static int inode_quota_active(const struct inode *inode) { struct super_block *sb = inode->i_sb; @@ -1448,7 +1460,7 @@ static int __dquot_initialize(struct inode *inode, int type) qsize_t rsv; int ret = 0; - if (!dquot_active(inode)) + if (!inode_quota_active(inode)) return 0; dquots = i_dquot(inode); @@ -1556,7 +1568,7 @@ bool dquot_initialize_needed(struct inode *inode) struct dquot **dquots; int i; - if (!dquot_active(inode)) + if (!inode_quota_active(inode)) return false; dquots = i_dquot(inode); @@ -1667,7 +1679,7 @@ int __dquot_alloc_space(struct inode *inode, qsize_t number, int flags) int reserve = flags & DQUOT_SPACE_RESERVE; struct dquot **dquots; - if (!dquot_active(inode)) { + if (!inode_quota_active(inode)) { if (reserve) { spin_lock(&inode->i_lock); *inode_reserved_space(inode) += number; @@ -1737,7 +1749,7 @@ int dquot_alloc_inode(struct inode *inode) struct dquot_warn warn[MAXQUOTAS]; struct dquot * const *dquots; - if (!dquot_active(inode)) + if (!inode_quota_active(inode)) return 0; for (cnt = 0; cnt < MAXQUOTAS; cnt++) warn[cnt].w_type = QUOTA_NL_NOWARN; @@ -1780,7 +1792,7 @@ int dquot_claim_space_nodirty(struct inode *inode, qsize_t number) struct dquot **dquots; int cnt, index; - if (!dquot_active(inode)) { + if (!inode_quota_active(inode)) { spin_lock(&inode->i_lock); *inode_reserved_space(inode) -= number; __inode_add_bytes(inode, number); @@ -1822,7 +1834,7 @@ void dquot_reclaim_space_nodirty(struct inode *inode, qsize_t number) struct dquot **dquots; int cnt, index; - if (!dquot_active(inode)) { + if (!inode_quota_active(inode)) { spin_lock(&inode->i_lock); *inode_reserved_space(inode) += number; __inode_sub_bytes(inode, number); @@ -1866,7 +1878,7 @@ void __dquot_free_space(struct inode *inode, qsize_t number, int flags) struct dquot **dquots; int reserve = flags & DQUOT_SPACE_RESERVE, index; - if (!dquot_active(inode)) { + if (!inode_quota_active(inode)) { if (reserve) { spin_lock(&inode->i_lock); *inode_reserved_space(inode) -= number; @@ -1921,7 +1933,7 @@ void dquot_free_inode(struct inode *inode) struct dquot * const *dquots; int index; - if (!dquot_active(inode)) + if (!inode_quota_active(inode)) return; dquots = i_dquot(inode); @@ -2093,7 +2105,7 @@ int dquot_transfer(struct mnt_idmap *idmap, struct inode *inode, struct super_block *sb = inode->i_sb; int ret; - if (!dquot_active(inode)) + if (!inode_quota_active(inode)) return 0; if (i_uid_needs_update(idmap, iattr, inode)) { @@ -2333,6 +2345,20 @@ static int vfs_setup_quota_inode(struct inode *inode, int type) if (sb_has_quota_loaded(sb, type)) return -EBUSY; + /* + * Quota files should never be encrypted. They should be thought of as + * filesystem metadata, not user data. New-style internal quota files + * cannot be encrypted by users anyway, but old-style external quota + * files could potentially be incorrectly created in an encrypted + * directory, hence this explicit check. Some reasons why encrypted + * quota files don't work include: (1) some filesystems that support + * encryption don't handle it in their quota_read and quota_write, and + * (2) cleaning up encrypted quota files at unmount would need special + * consideration, as quota files are cleaned up later than user files. + */ + if (IS_ENCRYPTED(inode)) + return -EINVAL; + dqopt->files[type] = igrab(inode); if (!dqopt->files[type]) return -EIO; @@ -2359,15 +2385,14 @@ int dquot_load_quota_sb(struct super_block *sb, int type, int format_id, struct quota_info *dqopt = sb_dqopt(sb); int error; + lockdep_assert_held_write(&sb->s_umount); + /* Just unsuspend quotas? */ BUG_ON(flags & DQUOT_SUSPENDED); - /* s_umount should be held in exclusive mode */ - if (WARN_ON_ONCE(down_read_trylock(&sb->s_umount))) - up_read(&sb->s_umount); if (!fmt) return -ESRCH; - if (!sb->s_op->quota_write || !sb->s_op->quota_read || + if (!sb->dq_op || !sb->s_qcop || (type == PRJQUOTA && sb->dq_op->get_projid == NULL)) { error = -EINVAL; goto out_fmt; @@ -2951,6 +2976,7 @@ static int __init dquot_init(void) { int i, ret; unsigned long nr_hash, order; + struct shrinker *dqcache_shrinker; printk(KERN_NOTICE "VFS: Disk quotas %s\n", __DQUOT_VERSION__); @@ -2985,8 +3011,14 @@ static int __init dquot_init(void) pr_info("VFS: Dquot-cache hash table entries: %ld (order %ld," " %ld bytes)\n", nr_hash, order, (PAGE_SIZE << order)); - if (register_shrinker(&dqcache_shrinker, "dquota-cache")) - panic("Cannot register dquot shrinker"); + dqcache_shrinker = shrinker_alloc(0, "dquota-cache"); + if (!dqcache_shrinker) + panic("Cannot allocate dquot shrinker"); + + dqcache_shrinker->count_objects = dqcache_shrink_count; + dqcache_shrinker->scan_objects = dqcache_shrink_scan; + + shrinker_register(dqcache_shrinker); return 0; } diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c index fef477c78107..4ac05a9e25bc 100644 --- a/fs/ramfs/inode.c +++ b/fs/ramfs/inode.c @@ -65,7 +65,7 @@ struct inode *ramfs_get_inode(struct super_block *sb, inode->i_mapping->a_ops = &ram_aops; mapping_set_gfp_mask(inode->i_mapping, GFP_HIGHUSER); mapping_set_unevictable(inode->i_mapping); - inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode); + simple_inode_init_ts(inode); switch (mode & S_IFMT) { default: init_special_inode(inode, mode, dev); @@ -105,7 +105,7 @@ ramfs_mknod(struct mnt_idmap *idmap, struct inode *dir, d_instantiate(dentry, inode); dget(dentry); /* Extra count - pin the dentry in core */ error = 0; - dir->i_mtime = dir->i_ctime = current_time(dir); + inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir)); } return error; } @@ -138,7 +138,8 @@ static int ramfs_symlink(struct mnt_idmap *idmap, struct inode *dir, if (!error) { d_instantiate(dentry, inode); dget(dentry); - dir->i_mtime = dir->i_ctime = current_time(dir); + inode_set_mtime_to_ts(dir, + inode_set_ctime_current(dir)); } else iput(inode); } diff --git a/fs/read_write.c b/fs/read_write.c index b07de77ef126..4771701c896b 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -71,7 +71,7 @@ EXPORT_SYMBOL(vfs_setpos); * @file: file structure to seek on * @offset: file offset to seek to * @whence: type of seek - * @size: max size of this file in file system + * @maxsize: max size of this file in file system * @eof: offset used for SEEK_END position * * This is a variant of generic_file_llseek that allows passing in a custom diff --git a/fs/reiserfs/Kconfig b/fs/reiserfs/Kconfig index 4d22ecfe0fab..0e6fe26458fe 100644 --- a/fs/reiserfs/Kconfig +++ b/fs/reiserfs/Kconfig @@ -1,6 +1,7 @@ # SPDX-License-Identifier: GPL-2.0-only config REISERFS_FS tristate "Reiserfs support (deprecated)" + select BUFFER_HEAD select CRC32 select LEGACY_DIRECT_IO help diff --git a/fs/reiserfs/fix_node.c b/fs/reiserfs/fix_node.c index fefe87e1c099..6c13a8d9a73c 100644 --- a/fs/reiserfs/fix_node.c +++ b/fs/reiserfs/fix_node.c @@ -2252,8 +2252,9 @@ static int get_virtual_node_size(struct super_block *sb, struct buffer_head *bh) return sizeof(struct virtual_node) + max(max_num_of_items * sizeof(struct virtual_item), - sizeof(struct virtual_item) + sizeof(struct direntry_uarea) + - (max_num_of_entries - 1) * sizeof(__u16)); + sizeof(struct virtual_item) + + struct_size_t(struct direntry_uarea, entry_sizes, + max_num_of_entries)); } /* diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c index 77bd3b27059f..1d825459ee6e 100644 --- a/fs/reiserfs/inode.c +++ b/fs/reiserfs/inode.c @@ -1257,12 +1257,9 @@ static void init_inode(struct inode *inode, struct treepath *path) i_uid_write(inode, sd_v1_uid(sd)); i_gid_write(inode, sd_v1_gid(sd)); inode->i_size = sd_v1_size(sd); - inode->i_atime.tv_sec = sd_v1_atime(sd); - inode->i_mtime.tv_sec = sd_v1_mtime(sd); - inode->i_ctime.tv_sec = sd_v1_ctime(sd); - inode->i_atime.tv_nsec = 0; - inode->i_ctime.tv_nsec = 0; - inode->i_mtime.tv_nsec = 0; + inode_set_atime(inode, sd_v1_atime(sd), 0); + inode_set_mtime(inode, sd_v1_mtime(sd), 0); + inode_set_ctime(inode, sd_v1_ctime(sd), 0); inode->i_blocks = sd_v1_blocks(sd); inode->i_generation = le32_to_cpu(INODE_PKEY(inode)->k_dir_id); @@ -1312,12 +1309,9 @@ static void init_inode(struct inode *inode, struct treepath *path) i_uid_write(inode, sd_v2_uid(sd)); inode->i_size = sd_v2_size(sd); i_gid_write(inode, sd_v2_gid(sd)); - inode->i_mtime.tv_sec = sd_v2_mtime(sd); - inode->i_atime.tv_sec = sd_v2_atime(sd); - inode->i_ctime.tv_sec = sd_v2_ctime(sd); - inode->i_ctime.tv_nsec = 0; - inode->i_mtime.tv_nsec = 0; - inode->i_atime.tv_nsec = 0; + inode_set_mtime(inode, sd_v2_mtime(sd), 0); + inode_set_atime(inode, sd_v2_atime(sd), 0); + inode_set_ctime(inode, sd_v2_ctime(sd), 0); inode->i_blocks = sd_v2_blocks(sd); rdev = sd_v2_rdev(sd); if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) @@ -1372,9 +1366,9 @@ static void inode2sd(void *sd, struct inode *inode, loff_t size) set_sd_v2_uid(sd_v2, i_uid_read(inode)); set_sd_v2_size(sd_v2, size); set_sd_v2_gid(sd_v2, i_gid_read(inode)); - set_sd_v2_mtime(sd_v2, inode->i_mtime.tv_sec); - set_sd_v2_atime(sd_v2, inode->i_atime.tv_sec); - set_sd_v2_ctime(sd_v2, inode->i_ctime.tv_sec); + set_sd_v2_mtime(sd_v2, inode_get_mtime_sec(inode)); + set_sd_v2_atime(sd_v2, inode_get_atime_sec(inode)); + set_sd_v2_ctime(sd_v2, inode_get_ctime_sec(inode)); set_sd_v2_blocks(sd_v2, to_fake_used_blocks(inode, SD_V2_SIZE)); if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) set_sd_v2_rdev(sd_v2, new_encode_dev(inode->i_rdev)); @@ -1393,9 +1387,9 @@ static void inode2sd_v1(void *sd, struct inode *inode, loff_t size) set_sd_v1_gid(sd_v1, i_gid_read(inode)); set_sd_v1_nlink(sd_v1, inode->i_nlink); set_sd_v1_size(sd_v1, size); - set_sd_v1_atime(sd_v1, inode->i_atime.tv_sec); - set_sd_v1_ctime(sd_v1, inode->i_ctime.tv_sec); - set_sd_v1_mtime(sd_v1, inode->i_mtime.tv_sec); + set_sd_v1_atime(sd_v1, inode_get_atime_sec(inode)); + set_sd_v1_ctime(sd_v1, inode_get_ctime_sec(inode)); + set_sd_v1_mtime(sd_v1, inode_get_mtime_sec(inode)); if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) set_sd_v1_rdev(sd_v1, new_encode_dev(inode->i_rdev)); @@ -1986,7 +1980,7 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th, /* uid and gid must already be set by the caller for quota init */ - inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode); + simple_inode_init_ts(inode); inode->i_size = i_size; inode->i_blocks = 0; inode->i_bytes = 0; @@ -2509,10 +2503,10 @@ out: * start/recovery path as __block_write_full_folio, along with special * code to handle reiserfs tails. */ -static int reiserfs_write_full_page(struct page *page, +static int reiserfs_write_full_folio(struct folio *folio, struct writeback_control *wbc) { - struct inode *inode = page->mapping->host; + struct inode *inode = folio->mapping->host; unsigned long end_index = inode->i_size >> PAGE_SHIFT; int error = 0; unsigned long block; @@ -2520,7 +2514,7 @@ static int reiserfs_write_full_page(struct page *page, struct buffer_head *head, *bh; int partial = 0; int nr = 0; - int checked = PageChecked(page); + int checked = folio_test_checked(folio); struct reiserfs_transaction_handle th; struct super_block *s = inode->i_sb; int bh_per_page = PAGE_SIZE / s->s_blocksize; @@ -2528,47 +2522,46 @@ static int reiserfs_write_full_page(struct page *page, /* no logging allowed when nonblocking or from PF_MEMALLOC */ if (checked && (current->flags & PF_MEMALLOC)) { - redirty_page_for_writepage(wbc, page); - unlock_page(page); + folio_redirty_for_writepage(wbc, folio); + folio_unlock(folio); return 0; } /* - * The page dirty bit is cleared before writepage is called, which + * The folio dirty bit is cleared before writepage is called, which * means we have to tell create_empty_buffers to make dirty buffers - * The page really should be up to date at this point, so tossing + * The folio really should be up to date at this point, so tossing * in the BH_Uptodate is just a sanity check. */ - if (!page_has_buffers(page)) { - create_empty_buffers(page, s->s_blocksize, + head = folio_buffers(folio); + if (!head) + head = create_empty_buffers(folio, s->s_blocksize, (1 << BH_Dirty) | (1 << BH_Uptodate)); - } - head = page_buffers(page); /* - * last page in the file, zero out any contents past the + * last folio in the file, zero out any contents past the * last byte in the file */ - if (page->index >= end_index) { + if (folio->index >= end_index) { unsigned last_offset; last_offset = inode->i_size & (PAGE_SIZE - 1); - /* no file contents in this page */ - if (page->index >= end_index + 1 || !last_offset) { - unlock_page(page); + /* no file contents in this folio */ + if (folio->index >= end_index + 1 || !last_offset) { + folio_unlock(folio); return 0; } - zero_user_segment(page, last_offset, PAGE_SIZE); + folio_zero_segment(folio, last_offset, folio_size(folio)); } bh = head; - block = page->index << (PAGE_SHIFT - s->s_blocksize_bits); + block = folio->index << (PAGE_SHIFT - s->s_blocksize_bits); last_block = (i_size_read(inode) - 1) >> inode->i_blkbits; /* first map all the buffers, logging any direct items we find */ do { if (block > last_block) { /* * This can happen when the block size is less than - * the page size. The corresponding bytes in the page + * the folio size. The corresponding bytes in the folio * were zero filled above */ clear_buffer_dirty(bh); @@ -2595,7 +2588,7 @@ static int reiserfs_write_full_page(struct page *page, * blocks we're going to log */ if (checked) { - ClearPageChecked(page); + folio_clear_checked(folio); reiserfs_write_lock(s); error = journal_begin(&th, s, bh_per_page + 1); if (error) { @@ -2604,7 +2597,7 @@ static int reiserfs_write_full_page(struct page *page, } reiserfs_update_inode_transaction(inode); } - /* now go through and lock any dirty buffers on the page */ + /* now go through and lock any dirty buffers on the folio */ do { get_bh(bh); if (!buffer_mapped(bh)) @@ -2625,7 +2618,7 @@ static int reiserfs_write_full_page(struct page *page, lock_buffer(bh); } else { if (!trylock_buffer(bh)) { - redirty_page_for_writepage(wbc, page); + folio_redirty_for_writepage(wbc, folio); continue; } } @@ -2642,13 +2635,13 @@ static int reiserfs_write_full_page(struct page *page, if (error) goto fail; } - BUG_ON(PageWriteback(page)); - set_page_writeback(page); - unlock_page(page); + BUG_ON(folio_test_writeback(folio)); + folio_start_writeback(folio); + folio_unlock(folio); /* - * since any buffer might be the only dirty buffer on the page, - * the first submit_bh can bring the page out of writeback. + * since any buffer might be the only dirty buffer on the folio, + * the first submit_bh can bring the folio out of writeback. * be careful with the buffers. */ do { @@ -2665,10 +2658,10 @@ static int reiserfs_write_full_page(struct page *page, done: if (nr == 0) { /* - * if this page only had a direct item, it is very possible for + * if this folio only had a direct item, it is very possible for * no io to be required without there being an error. Or, * someone else could have locked them and sent them down the - * pipe without locking the page + * pipe without locking the folio */ bh = head; do { @@ -2679,18 +2672,18 @@ done: bh = bh->b_this_page; } while (bh != head); if (!partial) - SetPageUptodate(page); - end_page_writeback(page); + folio_mark_uptodate(folio); + folio_end_writeback(folio); } return error; fail: /* * catches various errors, we need to make sure any valid dirty blocks - * get to the media. The page is currently locked and not marked for + * get to the media. The folio is currently locked and not marked for * writeback */ - ClearPageUptodate(page); + folio_clear_uptodate(folio); bh = head; do { get_bh(bh); @@ -2700,16 +2693,16 @@ fail: } else { /* * clear any dirty bits that might have come from - * getting attached to a dirty page + * getting attached to a dirty folio */ clear_buffer_dirty(bh); } bh = bh->b_this_page; } while (bh != head); - SetPageError(page); - BUG_ON(PageWriteback(page)); - set_page_writeback(page); - unlock_page(page); + folio_set_error(folio); + BUG_ON(folio_test_writeback(folio)); + folio_start_writeback(folio); + folio_unlock(folio); do { struct buffer_head *next = bh->b_this_page; if (buffer_async_write(bh)) { @@ -2730,9 +2723,10 @@ static int reiserfs_read_folio(struct file *f, struct folio *folio) static int reiserfs_writepage(struct page *page, struct writeback_control *wbc) { - struct inode *inode = page->mapping->host; + struct folio *folio = page_folio(page); + struct inode *inode = folio->mapping->host; reiserfs_wait_on_write_block(inode->i_sb); - return reiserfs_write_full_page(page, wbc); + return reiserfs_write_full_folio(folio, wbc); } static void reiserfs_truncate_failed_write(struct inode *inode) diff --git a/fs/reiserfs/ioctl.c b/fs/reiserfs/ioctl.c index 6bf9b54e58ca..dd33f8cc6eda 100644 --- a/fs/reiserfs/ioctl.c +++ b/fs/reiserfs/ioctl.c @@ -55,7 +55,7 @@ int reiserfs_fileattr_set(struct mnt_idmap *idmap, } sd_attrs_to_i_attrs(flags, inode); REISERFS_I(inode)->i_attrs = flags; - inode->i_ctime = current_time(inode); + inode_set_ctime_current(inode); mark_inode_dirty(inode); err = 0; unlock: @@ -107,7 +107,7 @@ long reiserfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) err = -EFAULT; goto setversion_out; } - inode->i_ctime = current_time(inode); + inode_set_ctime_current(inode); mark_inode_dirty(inode); setversion_out: mnt_drop_write_file(filp); diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c index 479aa4a57602..171c912af50f 100644 --- a/fs/reiserfs/journal.c +++ b/fs/reiserfs/journal.c @@ -90,8 +90,7 @@ static int flush_commit_list(struct super_block *s, static int can_dirty(struct reiserfs_journal_cnode *cn); static int journal_join(struct reiserfs_transaction_handle *th, struct super_block *sb); -static void release_journal_dev(struct super_block *super, - struct reiserfs_journal *journal); +static void release_journal_dev(struct reiserfs_journal *journal); static void dirty_one_transaction(struct super_block *s, struct reiserfs_journal_list *jl); static void flush_async_commits(struct work_struct *work); @@ -1893,7 +1892,7 @@ static void free_journal_ram(struct super_block *sb) * j_header_bh is on the journal dev, make sure * not to release the journal dev until we brelse j_header_bh */ - release_journal_dev(sb, journal); + release_journal_dev(journal); vfree(journal); } @@ -2326,7 +2325,7 @@ static struct buffer_head *reiserfs_breada(struct block_device *dev, int i, j; bh = __getblk(dev, block, bufsize); - if (buffer_uptodate(bh)) + if (!bh || buffer_uptodate(bh)) return (bh); if (block + BUFNR > max_block) { @@ -2336,6 +2335,8 @@ static struct buffer_head *reiserfs_breada(struct block_device *dev, j = 1; for (i = 1; i < blocks; i++) { bh = __getblk(dev, block + i, bufsize); + if (!bh) + break; if (buffer_uptodate(bh)) { brelse(bh); break; @@ -2385,7 +2386,7 @@ static int journal_read(struct super_block *sb) cur_dblock = SB_ONDISK_JOURNAL_1st_BLOCK(sb); reiserfs_info(sb, "checking transaction log (%pg)\n", - journal->j_dev_bd); + journal->j_bdev_handle->bdev); start = ktime_get_seconds(); /* @@ -2446,7 +2447,7 @@ static int journal_read(struct super_block *sb) * device and journal device to be the same */ d_bh = - reiserfs_breada(journal->j_dev_bd, cur_dblock, + reiserfs_breada(journal->j_bdev_handle->bdev, cur_dblock, sb->s_blocksize, SB_ONDISK_JOURNAL_1st_BLOCK(sb) + SB_ONDISK_JOURNAL_SIZE(sb)); @@ -2585,17 +2586,11 @@ static void journal_list_init(struct super_block *sb) SB_JOURNAL(sb)->j_current_jl = alloc_journal_list(sb); } -static void release_journal_dev(struct super_block *super, - struct reiserfs_journal *journal) +static void release_journal_dev(struct reiserfs_journal *journal) { - if (journal->j_dev_bd != NULL) { - void *holder = NULL; - - if (journal->j_dev_bd->bd_dev != super->s_dev) - holder = journal; - - blkdev_put(journal->j_dev_bd, holder); - journal->j_dev_bd = NULL; + if (journal->j_bdev_handle) { + bdev_release(journal->j_bdev_handle); + journal->j_bdev_handle = NULL; } } @@ -2610,7 +2605,7 @@ static int journal_init_dev(struct super_block *super, result = 0; - journal->j_dev_bd = NULL; + journal->j_bdev_handle = NULL; jdev = SB_ONDISK_JOURNAL_DEVICE(super) ? new_decode_dev(SB_ONDISK_JOURNAL_DEVICE(super)) : super->s_dev; @@ -2621,36 +2616,37 @@ static int journal_init_dev(struct super_block *super, if ((!jdev_name || !jdev_name[0])) { if (jdev == super->s_dev) holder = NULL; - journal->j_dev_bd = blkdev_get_by_dev(jdev, blkdev_mode, holder, - NULL); - if (IS_ERR(journal->j_dev_bd)) { - result = PTR_ERR(journal->j_dev_bd); - journal->j_dev_bd = NULL; + journal->j_bdev_handle = bdev_open_by_dev(jdev, blkdev_mode, + holder, NULL); + if (IS_ERR(journal->j_bdev_handle)) { + result = PTR_ERR(journal->j_bdev_handle); + journal->j_bdev_handle = NULL; reiserfs_warning(super, "sh-458", "cannot init journal device unknown-block(%u,%u): %i", MAJOR(jdev), MINOR(jdev), result); return result; } else if (jdev != super->s_dev) - set_blocksize(journal->j_dev_bd, super->s_blocksize); + set_blocksize(journal->j_bdev_handle->bdev, + super->s_blocksize); return 0; } - journal->j_dev_bd = blkdev_get_by_path(jdev_name, blkdev_mode, holder, - NULL); - if (IS_ERR(journal->j_dev_bd)) { - result = PTR_ERR(journal->j_dev_bd); - journal->j_dev_bd = NULL; + journal->j_bdev_handle = bdev_open_by_path(jdev_name, blkdev_mode, + holder, NULL); + if (IS_ERR(journal->j_bdev_handle)) { + result = PTR_ERR(journal->j_bdev_handle); + journal->j_bdev_handle = NULL; reiserfs_warning(super, "sh-457", "journal_init_dev: Cannot open '%s': %i", jdev_name, result); return result; } - set_blocksize(journal->j_dev_bd, super->s_blocksize); + set_blocksize(journal->j_bdev_handle->bdev, super->s_blocksize); reiserfs_info(super, "journal_init_dev: journal device: %pg\n", - journal->j_dev_bd); + journal->j_bdev_handle->bdev); return 0; } @@ -2808,7 +2804,7 @@ int journal_init(struct super_block *sb, const char *j_dev_name, "journal header magic %x (device %pg) does " "not match to magic found in super block %x", jh->jh_journal.jp_journal_magic, - journal->j_dev_bd, + journal->j_bdev_handle->bdev, sb_jp_journal_magic(rs)); brelse(bhjh); goto free_and_return; @@ -2832,7 +2828,7 @@ int journal_init(struct super_block *sb, const char *j_dev_name, reiserfs_info(sb, "journal params: device %pg, size %u, " "journal first block %u, max trans len %u, max batch %u, " "max commit age %u, max trans age %u\n", - journal->j_dev_bd, + journal->j_bdev_handle->bdev, SB_ONDISK_JOURNAL_SIZE(sb), SB_ONDISK_JOURNAL_1st_BLOCK(sb), journal->j_trans_max, diff --git a/fs/reiserfs/namei.c b/fs/reiserfs/namei.c index 52240cc891cf..994d6e6995ab 100644 --- a/fs/reiserfs/namei.c +++ b/fs/reiserfs/namei.c @@ -572,7 +572,7 @@ static int reiserfs_add_entry(struct reiserfs_transaction_handle *th, } dir->i_size += paste_size; - dir->i_mtime = dir->i_ctime = current_time(dir); + inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir)); if (!S_ISDIR(inode->i_mode) && visible) /* reiserfs_mkdir or reiserfs_rename will do that by itself */ reiserfs_update_sd(th, dir); @@ -966,7 +966,8 @@ static int reiserfs_rmdir(struct inode *dir, struct dentry *dentry) inode->i_nlink); clear_nlink(inode); - inode->i_ctime = dir->i_ctime = dir->i_mtime = current_time(dir); + inode_set_mtime_to_ts(dir, + inode_set_ctime_to_ts(dir, inode_set_ctime_current(inode))); reiserfs_update_sd(&th, inode); DEC_DIR_INODE_NLINK(dir) @@ -1070,11 +1071,11 @@ static int reiserfs_unlink(struct inode *dir, struct dentry *dentry) inc_nlink(inode); goto end_unlink; } - inode->i_ctime = current_time(inode); + inode_set_ctime_current(inode); reiserfs_update_sd(&th, inode); dir->i_size -= (de.de_entrylen + DEH_SIZE); - dir->i_ctime = dir->i_mtime = current_time(dir); + inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir)); reiserfs_update_sd(&th, dir); if (!savelink) @@ -1250,7 +1251,7 @@ static int reiserfs_link(struct dentry *old_dentry, struct inode *dir, return err ? err : retval; } - inode->i_ctime = current_time(inode); + inode_set_ctime_current(inode); reiserfs_update_sd(&th, inode); ihold(inode); @@ -1325,7 +1326,6 @@ static int reiserfs_rename(struct mnt_idmap *idmap, int jbegin_count; umode_t old_inode_mode; unsigned long savelink = 1; - struct timespec64 ctime; if (flags & ~RENAME_NOREPLACE) return -EINVAL; @@ -1576,14 +1576,11 @@ static int reiserfs_rename(struct mnt_idmap *idmap, mark_de_hidden(old_de.de_deh + old_de.de_entry_num); journal_mark_dirty(&th, old_de.de_bh); - ctime = current_time(old_dir); - old_dir->i_ctime = old_dir->i_mtime = ctime; - new_dir->i_ctime = new_dir->i_mtime = ctime; /* * thanks to Alex Adriaanse <alex_a@caltech.edu> for patch * which adds ctime update of renamed object */ - old_inode->i_ctime = ctime; + simple_rename_timestamp(old_dir, old_dentry, new_dir, new_dentry); if (new_dentry_inode) { /* adjust link number of the victim */ @@ -1592,7 +1589,6 @@ static int reiserfs_rename(struct mnt_idmap *idmap, } else { drop_nlink(new_dentry_inode); } - new_dentry_inode->i_ctime = ctime; savelink = new_dentry_inode->i_nlink; } diff --git a/fs/reiserfs/procfs.c b/fs/reiserfs/procfs.c index 3dba8acf4e83..83cb9402e0f9 100644 --- a/fs/reiserfs/procfs.c +++ b/fs/reiserfs/procfs.c @@ -354,7 +354,7 @@ static int show_journal(struct seq_file *m, void *unused) "prepare: \t%12lu\n" "prepare_retry: \t%12lu\n", DJP(jp_journal_1st_block), - SB_JOURNAL(sb)->j_dev_bd, + SB_JOURNAL(sb)->j_bdev_handle->bdev, DJP(jp_journal_dev), DJP(jp_journal_size), DJP(jp_journal_trans_max), diff --git a/fs/reiserfs/reiserfs.h b/fs/reiserfs/reiserfs.h index 55e85256aae8..725667880e62 100644 --- a/fs/reiserfs/reiserfs.h +++ b/fs/reiserfs/reiserfs.h @@ -299,7 +299,7 @@ struct reiserfs_journal { /* oldest journal block. start here for traverse */ struct reiserfs_journal_cnode *j_first; - struct block_device *j_dev_bd; + struct bdev_handle *j_bdev_handle; /* first block on s_dev of reserved area journal */ int j_1st_reserved_block; @@ -1165,7 +1165,7 @@ static inline int bmap_would_wrap(unsigned bmap_nr) return bmap_nr > ((1LL << 16) - 1); } -extern const struct xattr_handler *reiserfs_xattr_handlers[]; +extern const struct xattr_handler * const reiserfs_xattr_handlers[]; /* * this says about version of key of all items (but stat data) the @@ -2373,7 +2373,7 @@ struct virtual_node { struct direntry_uarea { int flags; __u16 entry_count; - __u16 entry_sizes[1]; + __u16 entry_sizes[]; } __attribute__ ((__packed__)); /*************************************************************************** @@ -2699,7 +2699,7 @@ struct reiserfs_iget_args { #define get_journal_desc_magic(bh) (bh->b_data + bh->b_size - 12) #define journal_trans_half(blocksize) \ - ((blocksize - sizeof (struct reiserfs_journal_desc) + sizeof (__u32) - 12) / sizeof (__u32)) + ((blocksize - sizeof(struct reiserfs_journal_desc) - 12) / sizeof(__u32)) /* journal.c see journal.c for all the comments here */ @@ -2711,7 +2711,7 @@ struct reiserfs_journal_desc { __le32 j_len; __le32 j_mount_id; /* mount id of this trans */ - __le32 j_realblock[1]; /* real locations for each block */ + __le32 j_realblock[]; /* real locations for each block */ }; #define get_desc_trans_id(d) le32_to_cpu((d)->j_trans_id) @@ -2726,7 +2726,7 @@ struct reiserfs_journal_desc { struct reiserfs_journal_commit { __le32 j_trans_id; /* must match j_trans_id from the desc block */ __le32 j_len; /* ditto */ - __le32 j_realblock[1]; /* real locations for each block */ + __le32 j_realblock[]; /* real locations for each block */ }; #define get_commit_trans_id(c) le32_to_cpu((c)->j_trans_id) @@ -2809,9 +2809,12 @@ struct reiserfs_journal_header { #define journal_hash(t,sb,block) ((t)[_jhashfn((sb),(block)) & JBH_HASH_MASK]) /* We need these to make journal.c code more readable */ -#define journal_find_get_block(s, block) __find_get_block(SB_JOURNAL(s)->j_dev_bd, block, s->s_blocksize) -#define journal_getblk(s, block) __getblk(SB_JOURNAL(s)->j_dev_bd, block, s->s_blocksize) -#define journal_bread(s, block) __bread(SB_JOURNAL(s)->j_dev_bd, block, s->s_blocksize) +#define journal_find_get_block(s, block) __find_get_block(\ + SB_JOURNAL(s)->j_bdev_handle->bdev, block, s->s_blocksize) +#define journal_getblk(s, block) __getblk(SB_JOURNAL(s)->j_bdev_handle->bdev,\ + block, s->s_blocksize) +#define journal_bread(s, block) __bread(SB_JOURNAL(s)->j_bdev_handle->bdev,\ + block, s->s_blocksize) enum reiserfs_bh_state_bits { BH_JDirty = BH_PrivateStart, /* buffer is in current transaction */ diff --git a/fs/reiserfs/stree.c b/fs/reiserfs/stree.c index ce5003986789..2138ee7d271d 100644 --- a/fs/reiserfs/stree.c +++ b/fs/reiserfs/stree.c @@ -2003,8 +2003,9 @@ int reiserfs_do_truncate(struct reiserfs_transaction_handle *th, pathrelse(&s_search_path); if (update_timestamps) { - inode->i_mtime = current_time(inode); - inode->i_ctime = current_time(inode); + inode_set_mtime_to_ts(inode, + current_time(inode)); + inode_set_ctime_current(inode); } reiserfs_update_sd(th, inode); @@ -2028,8 +2029,8 @@ int reiserfs_do_truncate(struct reiserfs_transaction_handle *th, update_and_out: if (update_timestamps) { /* this is truncate, not file closing */ - inode->i_mtime = current_time(inode); - inode->i_ctime = current_time(inode); + inode_set_mtime_to_ts(inode, current_time(inode)); + inode_set_ctime_current(inode); } reiserfs_update_sd(th, inode); diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c index 929acce6e731..67b5510beded 100644 --- a/fs/reiserfs/super.c +++ b/fs/reiserfs/super.c @@ -2587,7 +2587,7 @@ out: return err; if (inode->i_size < off + len - towrite) i_size_write(inode, off + len - towrite); - inode->i_mtime = inode->i_ctime = current_time(inode); + inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); mark_inode_dirty(inode); return len - towrite; } diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c index 651027967159..998035a6388e 100644 --- a/fs/reiserfs/xattr.c +++ b/fs/reiserfs/xattr.c @@ -466,12 +466,13 @@ int reiserfs_commit_write(struct file *f, struct page *page, static void update_ctime(struct inode *inode) { struct timespec64 now = current_time(inode); + struct timespec64 ctime = inode_get_ctime(inode); if (inode_unhashed(inode) || !inode->i_nlink || - timespec64_equal(&inode->i_ctime, &now)) + timespec64_equal(&ctime, &now)) return; - inode->i_ctime = current_time(inode); + inode_set_ctime_to_ts(inode, now); mark_inode_dirty(inode); } @@ -779,7 +780,7 @@ static inline bool reiserfs_posix_acl_list(const char *name, } /* This is the implementation for the xattr plugin infrastructure */ -static inline bool reiserfs_xattr_list(const struct xattr_handler **handlers, +static inline bool reiserfs_xattr_list(const struct xattr_handler * const *handlers, const char *name, struct dentry *dentry) { if (handlers) { @@ -910,7 +911,7 @@ static int create_privroot(struct dentry *dentry) { return 0; } #endif /* Actual operations that are exported to VFS-land */ -const struct xattr_handler *reiserfs_xattr_handlers[] = { +const struct xattr_handler * const reiserfs_xattr_handlers[] = { #ifdef CONFIG_REISERFS_FS_XATTR &reiserfs_xattr_user_handler, &reiserfs_xattr_trusted_handler, diff --git a/fs/reiserfs/xattr_acl.c b/fs/reiserfs/xattr_acl.c index 138060452678..064264992b49 100644 --- a/fs/reiserfs/xattr_acl.c +++ b/fs/reiserfs/xattr_acl.c @@ -285,7 +285,7 @@ __reiserfs_set_acl(struct reiserfs_transaction_handle *th, struct inode *inode, if (error == -ENODATA) { error = 0; if (type == ACL_TYPE_ACCESS) { - inode->i_ctime = current_time(inode); + inode_set_ctime_current(inode); mark_inode_dirty(inode); } } diff --git a/fs/romfs/Kconfig b/fs/romfs/Kconfig index 8eb87008b55a..f24a96a331af 100644 --- a/fs/romfs/Kconfig +++ b/fs/romfs/Kconfig @@ -57,6 +57,7 @@ endchoice config ROMFS_ON_BLOCK bool default y if ROMFS_BACKED_BY_BLOCK || ROMFS_BACKED_BY_BOTH + select BUFFER_HEAD config ROMFS_ON_MTD bool diff --git a/fs/romfs/super.c b/fs/romfs/super.c index c59b230d55b4..545ad44f96b8 100644 --- a/fs/romfs/super.c +++ b/fs/romfs/super.c @@ -322,8 +322,8 @@ static struct inode *romfs_iget(struct super_block *sb, unsigned long pos) set_nlink(i, 1); /* Hard to decide.. */ i->i_size = be32_to_cpu(ri.size); - i->i_mtime.tv_sec = i->i_atime.tv_sec = i->i_ctime.tv_sec = 0; - i->i_mtime.tv_nsec = i->i_atime.tv_nsec = i->i_ctime.tv_nsec = 0; + inode_set_mtime_to_ts(i, + inode_set_atime_to_ts(i, inode_set_ctime(i, 0, 0))); /* set up mode and ops */ mode = romfs_modemap[nextfh & ROMFH_TYPE]; @@ -583,16 +583,18 @@ static int romfs_init_fs_context(struct fs_context *fc) */ static void romfs_kill_sb(struct super_block *sb) { + generic_shutdown_super(sb); + #ifdef CONFIG_ROMFS_ON_MTD if (sb->s_mtd) { - kill_mtd_super(sb); - return; + put_mtd_device(sb->s_mtd); + sb->s_mtd = NULL; } #endif #ifdef CONFIG_ROMFS_ON_BLOCK if (sb->s_bdev) { - kill_block_super(sb); - return; + sync_blockdev(sb->s_bdev); + bdev_release(sb->s_bdev_handle); } #endif } diff --git a/fs/smb/client/Kconfig b/fs/smb/client/Kconfig index 4c0d53bf931a..2927bd174a88 100644 --- a/fs/smb/client/Kconfig +++ b/fs/smb/client/Kconfig @@ -3,6 +3,7 @@ config CIFS tristate "SMB3 and CIFS support (advanced network filesystem)" depends on INET select NLS + select NLS_UCS2_UTILS select CRYPTO select CRYPTO_MD5 select CRYPTO_SHA256 diff --git a/fs/smb/client/Makefile b/fs/smb/client/Makefile index 304a7f6cc13a..0b07eb94c93b 100644 --- a/fs/smb/client/Makefile +++ b/fs/smb/client/Makefile @@ -11,7 +11,8 @@ cifs-y := trace.o cifsfs.o cifs_debug.o connect.o dir.o file.o \ readdir.o ioctl.o sess.o export.o unc.o winucase.o \ smb2ops.o smb2maperror.o smb2transport.o \ smb2misc.o smb2pdu.o smb2inode.o smb2file.o cifsacl.o fs_context.o \ - dns_resolve.o cifs_spnego_negtokeninit.asn1.o asn1.o + dns_resolve.o cifs_spnego_negtokeninit.asn1.o asn1.o \ + namespace.o $(obj)/asn1.o: $(obj)/cifs_spnego_negtokeninit.asn1.h @@ -21,7 +22,7 @@ cifs-$(CONFIG_CIFS_XATTR) += xattr.o cifs-$(CONFIG_CIFS_UPCALL) += cifs_spnego.o -cifs-$(CONFIG_CIFS_DFS_UPCALL) += cifs_dfs_ref.o dfs_cache.o dfs.o +cifs-$(CONFIG_CIFS_DFS_UPCALL) += dfs_cache.o dfs.o cifs-$(CONFIG_CIFS_SWN_UPCALL) += netlink.o cifs_swn.o diff --git a/fs/smb/client/cached_dir.c b/fs/smb/client/cached_dir.c index fe483f163dbc..d64a306a414b 100644 --- a/fs/smb/client/cached_dir.c +++ b/fs/smb/client/cached_dir.c @@ -15,10 +15,12 @@ static struct cached_fid *init_cached_dir(const char *path); static void free_cached_dir(struct cached_fid *cfid); static void smb2_close_cached_fid(struct kref *ref); +static void cfids_laundromat_worker(struct work_struct *work); static struct cached_fid *find_or_create_cached_dir(struct cached_fids *cfids, const char *path, - bool lookup_only) + bool lookup_only, + __u32 max_cached_dirs) { struct cached_fid *cfid; @@ -30,7 +32,7 @@ static struct cached_fid *find_or_create_cached_dir(struct cached_fids *cfids, * fully cached or it may be in the process of * being deleted due to a lease break. */ - if (!cfid->has_lease) { + if (!cfid->time || !cfid->has_lease) { spin_unlock(&cfids->cfid_list_lock); return NULL; } @@ -43,7 +45,7 @@ static struct cached_fid *find_or_create_cached_dir(struct cached_fids *cfids, spin_unlock(&cfids->cfid_list_lock); return NULL; } - if (cfids->num_entries >= MAX_CACHED_FIDS) { + if (cfids->num_entries >= max_cached_dirs) { spin_unlock(&cfids->cfid_list_lock); return NULL; } @@ -145,7 +147,7 @@ int open_cached_dir(unsigned int xid, struct cifs_tcon *tcon, const char *npath; if (tcon == NULL || tcon->cfids == NULL || tcon->nohandlecache || - is_smb1_server(tcon->ses->server)) + is_smb1_server(tcon->ses->server) || (dir_cache_timeout == 0)) return -EOPNOTSUPP; ses = tcon->ses; @@ -162,21 +164,24 @@ int open_cached_dir(unsigned int xid, struct cifs_tcon *tcon, if (!utf16_path) return -ENOMEM; - cfid = find_or_create_cached_dir(cfids, path, lookup_only); + cfid = find_or_create_cached_dir(cfids, path, lookup_only, tcon->max_cached_dirs); if (cfid == NULL) { kfree(utf16_path); return -ENOENT; } /* - * At this point we either have a lease already and we can just - * return it. If not we are guaranteed to be the only thread accessing - * this cfid. + * Return cached fid if it has a lease. Otherwise, it is either a new + * entry or laundromat worker removed it from @cfids->entries. Caller + * will put last reference if the latter. */ + spin_lock(&cfids->cfid_list_lock); if (cfid->has_lease) { + spin_unlock(&cfids->cfid_list_lock); *ret_cfid = cfid; kfree(utf16_path); return 0; } + spin_unlock(&cfids->cfid_list_lock); /* * Skip any prefix paths in @path as lookup_positive_unlocked() ends up @@ -188,10 +193,20 @@ int open_cached_dir(unsigned int xid, struct cifs_tcon *tcon, npath = path_no_prefix(cifs_sb, path); if (IS_ERR(npath)) { rc = PTR_ERR(npath); - kfree(utf16_path); - return rc; + goto out; } + if (!npath[0]) { + dentry = dget(cifs_sb->root); + } else { + dentry = path_to_dentry(cifs_sb, npath); + if (IS_ERR(dentry)) { + rc = -ENOENT; + goto out; + } + } + cfid->dentry = dentry; + /* * We do not hold the lock for the open because in case * SMB2_open needs to reconnect. @@ -218,7 +233,7 @@ int open_cached_dir(unsigned int xid, struct cifs_tcon *tcon, .tcon = tcon, .path = path, .create_options = cifs_create_options(cifs_sb, CREATE_NOT_FILE), - .desired_access = FILE_READ_ATTRIBUTES, + .desired_access = FILE_READ_DATA | FILE_READ_ATTRIBUTES, .disposition = FILE_OPEN, .fid = pfid, }; @@ -244,6 +259,15 @@ int open_cached_dir(unsigned int xid, struct cifs_tcon *tcon, smb2_set_related(&rqst[1]); + /* + * Set @cfid->has_lease to true before sending out compounded request so + * its lease reference can be put in cached_dir_lease_break() due to a + * potential lease break right after the request is sent or while @cfid + * is still being cached. Concurrent processes won't be to use it yet + * due to @cfid->time being zero. + */ + cfid->has_lease = true; + rc = compound_send_recv(xid, ses, server, flags, 2, rqst, resp_buftype, rsp_iov); @@ -258,6 +282,8 @@ int open_cached_dir(unsigned int xid, struct cifs_tcon *tcon, cfid->tcon = tcon; cfid->is_open = true; + spin_lock(&cfids->cfid_list_lock); + o_rsp = (struct smb2_create_rsp *)rsp_iov[0].iov_base; oparms.fid->persistent_fid = o_rsp->PersistentFileId; oparms.fid->volatile_fid = o_rsp->VolatileFileId; @@ -265,18 +291,32 @@ int open_cached_dir(unsigned int xid, struct cifs_tcon *tcon, oparms.fid->mid = le64_to_cpu(o_rsp->hdr.MessageId); #endif /* CIFS_DEBUG2 */ - if (o_rsp->OplockLevel != SMB2_OPLOCK_LEVEL_LEASE) + + if (o_rsp->OplockLevel != SMB2_OPLOCK_LEVEL_LEASE) { + spin_unlock(&cfids->cfid_list_lock); + rc = -EINVAL; goto oshr_free; + } + + rc = smb2_parse_contexts(server, rsp_iov, + &oparms.fid->epoch, + oparms.fid->lease_key, + &oplock, NULL, NULL); + if (rc) { + spin_unlock(&cfids->cfid_list_lock); + goto oshr_free; + } - smb2_parse_contexts(server, o_rsp, - &oparms.fid->epoch, - oparms.fid->lease_key, &oplock, - NULL, NULL); - if (!(oplock & SMB2_LEASE_READ_CACHING_HE)) + rc = -EINVAL; + if (!(oplock & SMB2_LEASE_READ_CACHING_HE)) { + spin_unlock(&cfids->cfid_list_lock); goto oshr_free; + } qi_rsp = (struct smb2_query_info_rsp *)rsp_iov[1].iov_base; - if (le32_to_cpu(qi_rsp->OutputBufferLength) < sizeof(struct smb2_file_all_info)) + if (le32_to_cpu(qi_rsp->OutputBufferLength) < sizeof(struct smb2_file_all_info)) { + spin_unlock(&cfids->cfid_list_lock); goto oshr_free; + } if (!smb2_validate_and_copy_iov( le16_to_cpu(qi_rsp->OutputBufferOffset), sizeof(struct smb2_file_all_info), @@ -284,57 +324,49 @@ int open_cached_dir(unsigned int xid, struct cifs_tcon *tcon, (char *)&cfid->file_all_info)) cfid->file_all_info_is_valid = true; - if (!npath[0]) - dentry = dget(cifs_sb->root); - else { - dentry = path_to_dentry(cifs_sb, npath); - if (IS_ERR(dentry)) { - rc = -ENOENT; - goto oshr_free; - } - } - cfid->dentry = dentry; cfid->time = jiffies; - cfid->has_lease = true; + spin_unlock(&cfids->cfid_list_lock); + /* At this point the directory handle is fully cached */ + rc = 0; oshr_free: - kfree(utf16_path); SMB2_open_free(&rqst[0]); SMB2_query_info_free(&rqst[1]); free_rsp_buf(resp_buftype[0], rsp_iov[0].iov_base); free_rsp_buf(resp_buftype[1], rsp_iov[1].iov_base); - spin_lock(&cfids->cfid_list_lock); - if (rc && !cfid->has_lease) { + if (rc) { + spin_lock(&cfids->cfid_list_lock); if (cfid->on_list) { list_del(&cfid->entry); cfid->on_list = false; cfids->num_entries--; } - rc = -ENOENT; - } - spin_unlock(&cfids->cfid_list_lock); - if (!rc && !cfid->has_lease) { - /* - * We are guaranteed to have two references at this point. - * One for the caller and one for a potential lease. - * Release the Lease-ref so that the directory will be closed - * when the caller closes the cached handle. - */ - kref_put(&cfid->refcount, smb2_close_cached_fid); + if (cfid->has_lease) { + /* + * We are guaranteed to have two references at this + * point. One for the caller and one for a potential + * lease. Release the Lease-ref so that the directory + * will be closed when the caller closes the cached + * handle. + */ + cfid->has_lease = false; + spin_unlock(&cfids->cfid_list_lock); + kref_put(&cfid->refcount, smb2_close_cached_fid); + goto out; + } + spin_unlock(&cfids->cfid_list_lock); } +out: if (rc) { if (cfid->is_open) SMB2_close(0, cfid->tcon, cfid->fid.persistent_fid, cfid->fid.volatile_fid); free_cached_dir(cfid); - cfid = NULL; - } - - if (rc == 0) { + } else { *ret_cfid = cfid; atomic_inc(&tcon->num_remote_opens); } - + kfree(utf16_path); return rc; } @@ -451,6 +483,9 @@ void invalidate_all_cached_dirs(struct cifs_tcon *tcon) struct cached_fid *cfid, *q; LIST_HEAD(entry); + if (cfids == NULL) + return; + spin_lock(&cfids->cfid_list_lock); list_for_each_entry_safe(cfid, q, &cfids->entries, entry) { list_move(&cfid->entry, &entry); @@ -568,53 +603,51 @@ static void free_cached_dir(struct cached_fid *cfid) kfree(cfid); } -static int -cifs_cfids_laundromat_thread(void *p) +static void cfids_laundromat_worker(struct work_struct *work) { - struct cached_fids *cfids = p; + struct cached_fids *cfids; struct cached_fid *cfid, *q; - struct list_head entry; + LIST_HEAD(entry); - while (!kthread_should_stop()) { - ssleep(1); - INIT_LIST_HEAD(&entry); - if (kthread_should_stop()) - return 0; - spin_lock(&cfids->cfid_list_lock); - list_for_each_entry_safe(cfid, q, &cfids->entries, entry) { - if (time_after(jiffies, cfid->time + HZ * 30)) { - list_del(&cfid->entry); - list_add(&cfid->entry, &entry); - cfids->num_entries--; - } - } - spin_unlock(&cfids->cfid_list_lock); + cfids = container_of(work, struct cached_fids, laundromat_work.work); - list_for_each_entry_safe(cfid, q, &entry, entry) { + spin_lock(&cfids->cfid_list_lock); + list_for_each_entry_safe(cfid, q, &cfids->entries, entry) { + if (cfid->time && + time_after(jiffies, cfid->time + HZ * dir_cache_timeout)) { cfid->on_list = false; - list_del(&cfid->entry); + list_move(&cfid->entry, &entry); + cfids->num_entries--; + /* To prevent race with smb2_cached_lease_break() */ + kref_get(&cfid->refcount); + } + } + spin_unlock(&cfids->cfid_list_lock); + + list_for_each_entry_safe(cfid, q, &entry, entry) { + list_del(&cfid->entry); + /* + * Cancel and wait for the work to finish in case we are racing + * with it. + */ + cancel_work_sync(&cfid->lease_break); + if (cfid->has_lease) { /* - * Cancel, and wait for the work to finish in - * case we are racing with it. + * Our lease has not yet been cancelled from the server + * so we need to drop the reference. */ - cancel_work_sync(&cfid->lease_break); - if (cfid->has_lease) { - /* - * We lease has not yet been cancelled from - * the server so we need to drop the reference. - */ - spin_lock(&cfids->cfid_list_lock); - cfid->has_lease = false; - spin_unlock(&cfids->cfid_list_lock); - kref_put(&cfid->refcount, smb2_close_cached_fid); - } + spin_lock(&cfids->cfid_list_lock); + cfid->has_lease = false; + spin_unlock(&cfids->cfid_list_lock); + kref_put(&cfid->refcount, smb2_close_cached_fid); } + /* Drop the extra reference opened above */ + kref_put(&cfid->refcount, smb2_close_cached_fid); } - - return 0; + queue_delayed_work(cifsiod_wq, &cfids->laundromat_work, + dir_cache_timeout * HZ); } - struct cached_fids *init_cached_dirs(void) { struct cached_fids *cfids; @@ -625,19 +658,10 @@ struct cached_fids *init_cached_dirs(void) spin_lock_init(&cfids->cfid_list_lock); INIT_LIST_HEAD(&cfids->entries); - /* - * since we're in a cifs function already, we know that - * this will succeed. No need for try_module_get(). - */ - __module_get(THIS_MODULE); - cfids->laundromat = kthread_run(cifs_cfids_laundromat_thread, - cfids, "cifsd-cfid-laundromat"); - if (IS_ERR(cfids->laundromat)) { - cifs_dbg(VFS, "Failed to start cfids laundromat thread.\n"); - kfree(cfids); - module_put(THIS_MODULE); - return NULL; - } + INIT_DELAYED_WORK(&cfids->laundromat_work, cfids_laundromat_worker); + queue_delayed_work(cifsiod_wq, &cfids->laundromat_work, + dir_cache_timeout * HZ); + return cfids; } @@ -650,11 +674,10 @@ void free_cached_dirs(struct cached_fids *cfids) struct cached_fid *cfid, *q; LIST_HEAD(entry); - if (cfids->laundromat) { - kthread_stop(cfids->laundromat); - cfids->laundromat = NULL; - module_put(THIS_MODULE); - } + if (cfids == NULL) + return; + + cancel_delayed_work_sync(&cfids->laundromat_work); spin_lock(&cfids->cfid_list_lock); list_for_each_entry_safe(cfid, q, &cfids->entries, entry) { diff --git a/fs/smb/client/cached_dir.h b/fs/smb/client/cached_dir.h index facc9b154d00..81ba0fd5cc16 100644 --- a/fs/smb/client/cached_dir.h +++ b/fs/smb/client/cached_dir.h @@ -49,7 +49,7 @@ struct cached_fid { struct cached_dirents dirents; }; -#define MAX_CACHED_FIDS 16 +/* default MAX_CACHED_FIDS is 16 */ struct cached_fids { /* Must be held when: * - accessing the cfids->entries list @@ -57,7 +57,7 @@ struct cached_fids { spinlock_t cfid_list_lock; int num_entries; struct list_head entries; - struct task_struct *laundromat; + struct delayed_work laundromat_work; }; extern struct cached_fids *init_cached_dirs(void); diff --git a/fs/smb/client/cifs_debug.c b/fs/smb/client/cifs_debug.c index aec6e9137474..60027f5aebe8 100644 --- a/fs/smb/client/cifs_debug.c +++ b/fs/smb/client/cifs_debug.c @@ -40,11 +40,13 @@ void cifs_dump_detail(void *buf, struct TCP_Server_Info *server) #ifdef CONFIG_CIFS_DEBUG2 struct smb_hdr *smb = buf; - cifs_dbg(VFS, "Cmd: %d Err: 0x%x Flags: 0x%x Flgs2: 0x%x Mid: %d Pid: %d\n", - smb->Command, smb->Status.CifsError, - smb->Flags, smb->Flags2, smb->Mid, smb->Pid); - cifs_dbg(VFS, "smb buf %p len %u\n", smb, - server->ops->calc_smb_size(smb)); + cifs_dbg(VFS, "Cmd: %d Err: 0x%x Flags: 0x%x Flgs2: 0x%x Mid: %d Pid: %d Wct: %d\n", + smb->Command, smb->Status.CifsError, smb->Flags, + smb->Flags2, smb->Mid, smb->Pid, smb->WordCount); + if (!server->ops->check_message(buf, server->total_read, server)) { + cifs_dbg(VFS, "smb buf %p len %u\n", smb, + server->ops->calc_smb_size(smb)); + } #endif /* CONFIG_CIFS_DEBUG2 */ } @@ -136,6 +138,11 @@ cifs_dump_channel(struct seq_file *m, int i, struct cifs_chan *chan) { struct TCP_Server_Info *server = chan->server; + if (!server) { + seq_printf(m, "\n\n\t\tChannel: %d DISABLED", i+1); + return; + } + seq_printf(m, "\n\n\t\tChannel: %d ConnectionId: 0x%llx" "\n\t\tNumber of credits: %d,%d,%d Dialect 0x%x" "\n\t\tTCP status: %d Instance: %d" @@ -279,6 +286,8 @@ static int cifs_debug_data_proc_show(struct seq_file *m, void *v) struct cifs_ses *ses; struct cifs_tcon *tcon; struct cifs_server_iface *iface; + size_t iface_weight = 0, iface_min_speed = 0; + struct cifs_server_iface *last_iface = NULL; int c, i, j; seq_puts(m, @@ -331,7 +340,7 @@ static int cifs_debug_data_proc_show(struct seq_file *m, void *v) spin_lock(&cifs_tcp_ses_lock); list_for_each_entry(server, &cifs_tcp_ses_list, tcp_ses_list) { /* channel info will be printed as a part of sessions below */ - if (CIFS_SERVER_IS_CHAN(server)) + if (SERVER_IS_CHAN(server)) continue; c++; @@ -427,6 +436,8 @@ skip_rdma: if (server->nosharesock) seq_printf(m, " nosharesock"); + seq_printf(m, "\nServer capabilities: 0x%x", server->capabilities); + if (server->rdma) seq_printf(m, "\nRDMA "); seq_printf(m, "\nTCP status: %d Instance: %d" @@ -452,6 +463,11 @@ skip_rdma: seq_printf(m, "\n\n\tSessions: "); i = 0; list_for_each_entry(ses, &server->smb_ses_list, smb_ses_list) { + spin_lock(&ses->ses_lock); + if (ses->ses_status == SES_EXITING) { + spin_unlock(&ses->ses_lock); + continue; + } i++; if ((ses->serverDomain == NULL) || (ses->serverOS == NULL) || @@ -472,6 +488,7 @@ skip_rdma: ses->ses_count, ses->serverOS, ses->serverNOS, ses->capabilities, ses->ses_status); } + spin_unlock(&ses->ses_lock); seq_printf(m, "\n\tSecurity type: %s ", get_security_type_str(server->ops->select_sectype(server, ses->sectype))); @@ -536,11 +553,25 @@ skip_rdma: "\tLast updated: %lu seconds ago", ses->iface_count, (jiffies - ses->iface_last_update) / HZ); + + last_iface = list_last_entry(&ses->iface_list, + struct cifs_server_iface, + iface_head); + iface_min_speed = last_iface->speed; + j = 0; list_for_each_entry(iface, &ses->iface_list, iface_head) { seq_printf(m, "\n\t%d)", ++j); cifs_dump_iface(m, iface); + + iface_weight = iface->speed / iface_min_speed; + seq_printf(m, "\t\tWeight (cur,total): (%zu,%zu)" + "\n\t\tAllocated channels: %u\n", + iface->weight_fulfilled, + iface_weight, + iface->num_channels); + if (is_ses_using_iface(ses, iface)) seq_puts(m, "\t\t[CONNECTED]\n"); } @@ -738,14 +769,14 @@ static ssize_t name##_write(struct file *file, const char __user *buffer, \ size_t count, loff_t *ppos) \ { \ int rc; \ - rc = kstrtoint_from_user(buffer, count, 10, & name); \ + rc = kstrtoint_from_user(buffer, count, 10, &name); \ if (rc) \ return rc; \ return count; \ } \ static int name##_proc_show(struct seq_file *m, void *v) \ { \ - seq_printf(m, "%d\n", name ); \ + seq_printf(m, "%d\n", name); \ return 0; \ } \ static int name##_open(struct inode *inode, struct file *file) \ diff --git a/fs/smb/client/cifs_ioctl.h b/fs/smb/client/cifs_ioctl.h index 332588e77c31..26327442e383 100644 --- a/fs/smb/client/cifs_ioctl.h +++ b/fs/smb/client/cifs_ioctl.h @@ -26,6 +26,11 @@ struct smb_mnt_fs_info { __u64 cifs_posix_caps; } __packed; +struct smb_mnt_tcon_info { + __u32 tid; + __u64 session_id; +} __packed; + struct smb_snapshot_array { __u32 number_of_snapshots; __u32 number_of_snapshots_returned; @@ -108,6 +113,7 @@ struct smb3_notify_info { #define CIFS_IOC_NOTIFY _IOW(CIFS_IOCTL_MAGIC, 9, struct smb3_notify) #define CIFS_DUMP_FULL_KEY _IOWR(CIFS_IOCTL_MAGIC, 10, struct smb3_full_key_debug_info) #define CIFS_IOC_NOTIFY_INFO _IOWR(CIFS_IOCTL_MAGIC, 11, struct smb3_notify_info) +#define CIFS_IOC_GET_TCON_INFO _IOR(CIFS_IOCTL_MAGIC, 12, struct smb_mnt_tcon_info) #define CIFS_IOC_SHUTDOWN _IOR('X', 125, __u32) /* diff --git a/fs/smb/client/cifs_spnego.c b/fs/smb/client/cifs_spnego.c index 6f3285f1dfee..af7849e5974f 100644 --- a/fs/smb/client/cifs_spnego.c +++ b/fs/smb/client/cifs_spnego.c @@ -64,8 +64,8 @@ struct key_type cifs_spnego_key_type = { * strlen(";sec=ntlmsspi") */ #define MAX_MECH_STR_LEN 13 -/* strlen of "host=" */ -#define HOST_KEY_LEN 5 +/* strlen of ";host=" */ +#define HOST_KEY_LEN 6 /* strlen of ";ip4=" or ";ip6=" */ #define IP_KEY_LEN 5 diff --git a/fs/smb/client/cifs_unicode.c b/fs/smb/client/cifs_unicode.c index e7582dd79179..79d99a913944 100644 --- a/fs/smb/client/cifs_unicode.c +++ b/fs/smb/client/cifs_unicode.c @@ -8,7 +8,6 @@ #include <linux/slab.h> #include "cifs_fs_sb.h" #include "cifs_unicode.h" -#include "cifs_uniupr.h" #include "cifspdu.h" #include "cifsglob.h" #include "cifs_debug.h" diff --git a/fs/smb/client/cifs_unicode.h b/fs/smb/client/cifs_unicode.h index 80b3d845419f..e137a0dfbbe9 100644 --- a/fs/smb/client/cifs_unicode.h +++ b/fs/smb/client/cifs_unicode.h @@ -21,21 +21,7 @@ #include <asm/byteorder.h> #include <linux/types.h> #include <linux/nls.h> - -#define UNIUPR_NOLOWER /* Example to not expand lower case tables */ - -/* - * Windows maps these to the user defined 16 bit Unicode range since they are - * reserved symbols (along with \ and /), otherwise illegal to store - * in filenames in NTFS - */ -#define UNI_ASTERISK (__u16) ('*' + 0xF000) -#define UNI_QUESTION (__u16) ('?' + 0xF000) -#define UNI_COLON (__u16) (':' + 0xF000) -#define UNI_GRTRTHAN (__u16) ('>' + 0xF000) -#define UNI_LESSTHAN (__u16) ('<' + 0xF000) -#define UNI_PIPE (__u16) ('|' + 0xF000) -#define UNI_SLASH (__u16) ('\\' + 0xF000) +#include "../../nls/nls_ucs2_utils.h" /* * Macs use an older "SFM" mapping of the symbols above. Fortunately it does @@ -68,27 +54,6 @@ #define SFM_MAP_UNI_RSVD 1 #define SFU_MAP_UNI_RSVD 2 -/* Just define what we want from uniupr.h. We don't want to define the tables - * in each source file. - */ -#ifndef UNICASERANGE_DEFINED -struct UniCaseRange { - wchar_t start; - wchar_t end; - signed char *table; -}; -#endif /* UNICASERANGE_DEFINED */ - -#ifndef UNIUPR_NOUPPER -extern signed char CifsUniUpperTable[512]; -extern const struct UniCaseRange CifsUniUpperRange[]; -#endif /* UNIUPR_NOUPPER */ - -#ifndef UNIUPR_NOLOWER -extern signed char CifsUniLowerTable[512]; -extern const struct UniCaseRange CifsUniLowerRange[]; -#endif /* UNIUPR_NOLOWER */ - #ifdef __KERNEL__ int cifs_from_utf16(char *to, const __le16 *from, int tolen, int fromlen, const struct nls_table *cp, int map_type); @@ -108,297 +73,4 @@ extern __le16 *cifs_strndup_to_utf16(const char *src, const int maxlen, wchar_t cifs_toupper(wchar_t in); -/* - * UniStrcat: Concatenate the second string to the first - * - * Returns: - * Address of the first string - */ -static inline __le16 * -UniStrcat(__le16 *ucs1, const __le16 *ucs2) -{ - __le16 *anchor = ucs1; /* save a pointer to start of ucs1 */ - - while (*ucs1++) ; /* To end of first string */ - ucs1--; /* Return to the null */ - while ((*ucs1++ = *ucs2++)) ; /* copy string 2 over */ - return anchor; -} - -/* - * UniStrchr: Find a character in a string - * - * Returns: - * Address of first occurrence of character in string - * or NULL if the character is not in the string - */ -static inline wchar_t * -UniStrchr(const wchar_t *ucs, wchar_t uc) -{ - while ((*ucs != uc) && *ucs) - ucs++; - - if (*ucs == uc) - return (wchar_t *) ucs; - return NULL; -} - -/* - * UniStrcmp: Compare two strings - * - * Returns: - * < 0: First string is less than second - * = 0: Strings are equal - * > 0: First string is greater than second - */ -static inline int -UniStrcmp(const wchar_t *ucs1, const wchar_t *ucs2) -{ - while ((*ucs1 == *ucs2) && *ucs1) { - ucs1++; - ucs2++; - } - return (int) *ucs1 - (int) *ucs2; -} - -/* - * UniStrcpy: Copy a string - */ -static inline wchar_t * -UniStrcpy(wchar_t *ucs1, const wchar_t *ucs2) -{ - wchar_t *anchor = ucs1; /* save the start of result string */ - - while ((*ucs1++ = *ucs2++)) ; - return anchor; -} - -/* - * UniStrlen: Return the length of a string (in 16 bit Unicode chars not bytes) - */ -static inline size_t -UniStrlen(const wchar_t *ucs1) -{ - int i = 0; - - while (*ucs1++) - i++; - return i; -} - -/* - * UniStrnlen: Return the length (in 16 bit Unicode chars not bytes) of a - * string (length limited) - */ -static inline size_t -UniStrnlen(const wchar_t *ucs1, int maxlen) -{ - int i = 0; - - while (*ucs1++) { - i++; - if (i >= maxlen) - break; - } - return i; -} - -/* - * UniStrncat: Concatenate length limited string - */ -static inline wchar_t * -UniStrncat(wchar_t *ucs1, const wchar_t *ucs2, size_t n) -{ - wchar_t *anchor = ucs1; /* save pointer to string 1 */ - - while (*ucs1++) ; - ucs1--; /* point to null terminator of s1 */ - while (n-- && (*ucs1 = *ucs2)) { /* copy s2 after s1 */ - ucs1++; - ucs2++; - } - *ucs1 = 0; /* Null terminate the result */ - return (anchor); -} - -/* - * UniStrncmp: Compare length limited string - */ -static inline int -UniStrncmp(const wchar_t *ucs1, const wchar_t *ucs2, size_t n) -{ - if (!n) - return 0; /* Null strings are equal */ - while ((*ucs1 == *ucs2) && *ucs1 && --n) { - ucs1++; - ucs2++; - } - return (int) *ucs1 - (int) *ucs2; -} - -/* - * UniStrncmp_le: Compare length limited string - native to little-endian - */ -static inline int -UniStrncmp_le(const wchar_t *ucs1, const wchar_t *ucs2, size_t n) -{ - if (!n) - return 0; /* Null strings are equal */ - while ((*ucs1 == __le16_to_cpu(*ucs2)) && *ucs1 && --n) { - ucs1++; - ucs2++; - } - return (int) *ucs1 - (int) __le16_to_cpu(*ucs2); -} - -/* - * UniStrncpy: Copy length limited string with pad - */ -static inline wchar_t * -UniStrncpy(wchar_t *ucs1, const wchar_t *ucs2, size_t n) -{ - wchar_t *anchor = ucs1; - - while (n-- && *ucs2) /* Copy the strings */ - *ucs1++ = *ucs2++; - - n++; - while (n--) /* Pad with nulls */ - *ucs1++ = 0; - return anchor; -} - -/* - * UniStrncpy_le: Copy length limited string with pad to little-endian - */ -static inline wchar_t * -UniStrncpy_le(wchar_t *ucs1, const wchar_t *ucs2, size_t n) -{ - wchar_t *anchor = ucs1; - - while (n-- && *ucs2) /* Copy the strings */ - *ucs1++ = __le16_to_cpu(*ucs2++); - - n++; - while (n--) /* Pad with nulls */ - *ucs1++ = 0; - return anchor; -} - -/* - * UniStrstr: Find a string in a string - * - * Returns: - * Address of first match found - * NULL if no matching string is found - */ -static inline wchar_t * -UniStrstr(const wchar_t *ucs1, const wchar_t *ucs2) -{ - const wchar_t *anchor1 = ucs1; - const wchar_t *anchor2 = ucs2; - - while (*ucs1) { - if (*ucs1 == *ucs2) { - /* Partial match found */ - ucs1++; - ucs2++; - } else { - if (!*ucs2) /* Match found */ - return (wchar_t *) anchor1; - ucs1 = ++anchor1; /* No match */ - ucs2 = anchor2; - } - } - - if (!*ucs2) /* Both end together */ - return (wchar_t *) anchor1; /* Match found */ - return NULL; /* No match */ -} - -#ifndef UNIUPR_NOUPPER -/* - * UniToupper: Convert a unicode character to upper case - */ -static inline wchar_t -UniToupper(register wchar_t uc) -{ - register const struct UniCaseRange *rp; - - if (uc < sizeof(CifsUniUpperTable)) { - /* Latin characters */ - return uc + CifsUniUpperTable[uc]; /* Use base tables */ - } else { - rp = CifsUniUpperRange; /* Use range tables */ - while (rp->start) { - if (uc < rp->start) /* Before start of range */ - return uc; /* Uppercase = input */ - if (uc <= rp->end) /* In range */ - return uc + rp->table[uc - rp->start]; - rp++; /* Try next range */ - } - } - return uc; /* Past last range */ -} - -/* - * UniStrupr: Upper case a unicode string - */ -static inline __le16 * -UniStrupr(register __le16 *upin) -{ - register __le16 *up; - - up = upin; - while (*up) { /* For all characters */ - *up = cpu_to_le16(UniToupper(le16_to_cpu(*up))); - up++; - } - return upin; /* Return input pointer */ -} -#endif /* UNIUPR_NOUPPER */ - -#ifndef UNIUPR_NOLOWER -/* - * UniTolower: Convert a unicode character to lower case - */ -static inline wchar_t -UniTolower(register wchar_t uc) -{ - register const struct UniCaseRange *rp; - - if (uc < sizeof(CifsUniLowerTable)) { - /* Latin characters */ - return uc + CifsUniLowerTable[uc]; /* Use base tables */ - } else { - rp = CifsUniLowerRange; /* Use range tables */ - while (rp->start) { - if (uc < rp->start) /* Before start of range */ - return uc; /* Uppercase = input */ - if (uc <= rp->end) /* In range */ - return uc + rp->table[uc - rp->start]; - rp++; /* Try next range */ - } - } - return uc; /* Past last range */ -} - -/* - * UniStrlwr: Lower case a unicode string - */ -static inline wchar_t * -UniStrlwr(register wchar_t *upin) -{ - register wchar_t *up; - - up = upin; - while (*up) { /* For all characters */ - *up = UniTolower(*up); - up++; - } - return upin; /* Return input pointer */ -} - -#endif - #endif /* _CIFS_UNICODE_H */ diff --git a/fs/smb/client/cifs_uniupr.h b/fs/smb/client/cifs_uniupr.h deleted file mode 100644 index 7b272fcdf0d3..000000000000 --- a/fs/smb/client/cifs_uniupr.h +++ /dev/null @@ -1,239 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -/* - * Copyright (c) International Business Machines Corp., 2000,2002 - * - * uniupr.h - Unicode compressed case ranges -*/ - -#ifndef UNIUPR_NOUPPER -/* - * Latin upper case - */ -signed char CifsUniUpperTable[512] = { - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 000-00f */ - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 010-01f */ - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 020-02f */ - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 030-03f */ - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 040-04f */ - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 050-05f */ - 0, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, /* 060-06f */ - -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, 0, 0, 0, 0, 0, /* 070-07f */ - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 080-08f */ - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 090-09f */ - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0a0-0af */ - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0b0-0bf */ - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0c0-0cf */ - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0d0-0df */ - -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, /* 0e0-0ef */ - -32, -32, -32, -32, -32, -32, -32, 0, -32, -32, -32, -32, -32, -32, -32, 121, /* 0f0-0ff */ - 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 100-10f */ - 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 110-11f */ - 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 120-12f */ - 0, 0, 0, -1, 0, -1, 0, -1, 0, 0, -1, 0, -1, 0, -1, 0, /* 130-13f */ - -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, 0, -1, 0, -1, 0, -1, /* 140-14f */ - 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 150-15f */ - 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 160-16f */ - 0, -1, 0, -1, 0, -1, 0, -1, 0, 0, -1, 0, -1, 0, -1, 0, /* 170-17f */ - 0, 0, 0, -1, 0, -1, 0, 0, -1, 0, 0, 0, -1, 0, 0, 0, /* 180-18f */ - 0, 0, -1, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, /* 190-19f */ - 0, -1, 0, -1, 0, -1, 0, 0, -1, 0, 0, 0, 0, -1, 0, 0, /* 1a0-1af */ - -1, 0, 0, 0, -1, 0, -1, 0, 0, -1, 0, 0, 0, -1, 0, 0, /* 1b0-1bf */ - 0, 0, 0, 0, 0, -1, -2, 0, -1, -2, 0, -1, -2, 0, -1, 0, /* 1c0-1cf */ - -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, -79, 0, -1, /* 1d0-1df */ - 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 1e0-1ef */ - 0, 0, -1, -2, 0, -1, 0, 0, 0, -1, 0, -1, 0, -1, 0, -1, /* 1f0-1ff */ -}; - -/* Upper case range - Greek */ -static signed char UniCaseRangeU03a0[47] = { - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -38, -37, -37, -37, /* 3a0-3af */ - 0, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, /* 3b0-3bf */ - -32, -32, -31, -32, -32, -32, -32, -32, -32, -32, -32, -32, -64, - -63, -63, -}; - -/* Upper case range - Cyrillic */ -static signed char UniCaseRangeU0430[48] = { - -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, /* 430-43f */ - -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, /* 440-44f */ - 0, -80, -80, -80, -80, -80, -80, -80, -80, -80, -80, -80, -80, 0, -80, -80, /* 450-45f */ -}; - -/* Upper case range - Extended cyrillic */ -static signed char UniCaseRangeU0490[61] = { - 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 490-49f */ - 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 4a0-4af */ - 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 4b0-4bf */ - 0, 0, -1, 0, -1, 0, 0, 0, -1, 0, 0, 0, -1, -}; - -/* Upper case range - Extended latin and greek */ -static signed char UniCaseRangeU1e00[509] = { - 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 1e00-1e0f */ - 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 1e10-1e1f */ - 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 1e20-1e2f */ - 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 1e30-1e3f */ - 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 1e40-1e4f */ - 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 1e50-1e5f */ - 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 1e60-1e6f */ - 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 1e70-1e7f */ - 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 1e80-1e8f */ - 0, -1, 0, -1, 0, -1, 0, 0, 0, 0, 0, -59, 0, -1, 0, -1, /* 1e90-1e9f */ - 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 1ea0-1eaf */ - 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 1eb0-1ebf */ - 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 1ec0-1ecf */ - 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 1ed0-1edf */ - 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 1ee0-1eef */ - 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, 0, 0, 0, 0, 0, /* 1ef0-1eff */ - 8, 8, 8, 8, 8, 8, 8, 8, 0, 0, 0, 0, 0, 0, 0, 0, /* 1f00-1f0f */ - 8, 8, 8, 8, 8, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 1f10-1f1f */ - 8, 8, 8, 8, 8, 8, 8, 8, 0, 0, 0, 0, 0, 0, 0, 0, /* 1f20-1f2f */ - 8, 8, 8, 8, 8, 8, 8, 8, 0, 0, 0, 0, 0, 0, 0, 0, /* 1f30-1f3f */ - 8, 8, 8, 8, 8, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 1f40-1f4f */ - 0, 8, 0, 8, 0, 8, 0, 8, 0, 0, 0, 0, 0, 0, 0, 0, /* 1f50-1f5f */ - 8, 8, 8, 8, 8, 8, 8, 8, 0, 0, 0, 0, 0, 0, 0, 0, /* 1f60-1f6f */ - 74, 74, 86, 86, 86, 86, 100, 100, 0, 0, 112, 112, 126, 126, 0, 0, /* 1f70-1f7f */ - 8, 8, 8, 8, 8, 8, 8, 8, 0, 0, 0, 0, 0, 0, 0, 0, /* 1f80-1f8f */ - 8, 8, 8, 8, 8, 8, 8, 8, 0, 0, 0, 0, 0, 0, 0, 0, /* 1f90-1f9f */ - 8, 8, 8, 8, 8, 8, 8, 8, 0, 0, 0, 0, 0, 0, 0, 0, /* 1fa0-1faf */ - 8, 8, 0, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 1fb0-1fbf */ - 0, 0, 0, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 1fc0-1fcf */ - 8, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 1fd0-1fdf */ - 8, 8, 0, 0, 0, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 1fe0-1fef */ - 0, 0, 0, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, -}; - -/* Upper case range - Wide latin */ -static signed char UniCaseRangeUff40[27] = { - 0, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, /* ff40-ff4f */ - -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -}; - -/* - * Upper Case Range - */ -const struct UniCaseRange CifsUniUpperRange[] = { - {0x03a0, 0x03ce, UniCaseRangeU03a0}, - {0x0430, 0x045f, UniCaseRangeU0430}, - {0x0490, 0x04cc, UniCaseRangeU0490}, - {0x1e00, 0x1ffc, UniCaseRangeU1e00}, - {0xff40, 0xff5a, UniCaseRangeUff40}, - {0} -}; -#endif - -#ifndef UNIUPR_NOLOWER -/* - * Latin lower case - */ -signed char CifsUniLowerTable[512] = { - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 000-00f */ - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 010-01f */ - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 020-02f */ - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 030-03f */ - 0, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, /* 040-04f */ - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 0, 0, 0, 0, 0, /* 050-05f */ - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 060-06f */ - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 070-07f */ - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 080-08f */ - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 090-09f */ - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0a0-0af */ - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0b0-0bf */ - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, /* 0c0-0cf */ - 32, 32, 32, 32, 32, 32, 32, 0, 32, 32, 32, 32, 32, 32, 32, 0, /* 0d0-0df */ - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0e0-0ef */ - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0f0-0ff */ - 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, /* 100-10f */ - 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, /* 110-11f */ - 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, /* 120-12f */ - 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, /* 130-13f */ - 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, /* 140-14f */ - 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, /* 150-15f */ - 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, /* 160-16f */ - 1, 0, 1, 0, 1, 0, 1, 0, -121, 1, 0, 1, 0, 1, 0, 0, /* 170-17f */ - 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 79, 0, /* 180-18f */ - 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, /* 190-19f */ - 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, /* 1a0-1af */ - 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, /* 1b0-1bf */ - 0, 0, 0, 0, 2, 1, 0, 2, 1, 0, 2, 1, 0, 1, 0, 1, /* 1c0-1cf */ - 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, /* 1d0-1df */ - 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, /* 1e0-1ef */ - 0, 2, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, /* 1f0-1ff */ -}; - -/* Lower case range - Greek */ -static signed char UniCaseRangeL0380[44] = { - 0, 0, 0, 0, 0, 0, 38, 0, 37, 37, 37, 0, 64, 0, 63, 63, /* 380-38f */ - 0, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, /* 390-39f */ - 32, 32, 0, 32, 32, 32, 32, 32, 32, 32, 32, 32, -}; - -/* Lower case range - Cyrillic */ -static signed char UniCaseRangeL0400[48] = { - 0, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 0, 80, 80, /* 400-40f */ - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, /* 410-41f */ - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, /* 420-42f */ -}; - -/* Lower case range - Extended cyrillic */ -static signed char UniCaseRangeL0490[60] = { - 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, /* 490-49f */ - 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, /* 4a0-4af */ - 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, /* 4b0-4bf */ - 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, -}; - -/* Lower case range - Extended latin and greek */ -static signed char UniCaseRangeL1e00[504] = { - 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, /* 1e00-1e0f */ - 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, /* 1e10-1e1f */ - 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, /* 1e20-1e2f */ - 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, /* 1e30-1e3f */ - 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, /* 1e40-1e4f */ - 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, /* 1e50-1e5f */ - 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, /* 1e60-1e6f */ - 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, /* 1e70-1e7f */ - 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, /* 1e80-1e8f */ - 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, /* 1e90-1e9f */ - 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, /* 1ea0-1eaf */ - 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, /* 1eb0-1ebf */ - 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, /* 1ec0-1ecf */ - 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, /* 1ed0-1edf */ - 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, /* 1ee0-1eef */ - 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, /* 1ef0-1eff */ - 0, 0, 0, 0, 0, 0, 0, 0, -8, -8, -8, -8, -8, -8, -8, -8, /* 1f00-1f0f */ - 0, 0, 0, 0, 0, 0, 0, 0, -8, -8, -8, -8, -8, -8, 0, 0, /* 1f10-1f1f */ - 0, 0, 0, 0, 0, 0, 0, 0, -8, -8, -8, -8, -8, -8, -8, -8, /* 1f20-1f2f */ - 0, 0, 0, 0, 0, 0, 0, 0, -8, -8, -8, -8, -8, -8, -8, -8, /* 1f30-1f3f */ - 0, 0, 0, 0, 0, 0, 0, 0, -8, -8, -8, -8, -8, -8, 0, 0, /* 1f40-1f4f */ - 0, 0, 0, 0, 0, 0, 0, 0, 0, -8, 0, -8, 0, -8, 0, -8, /* 1f50-1f5f */ - 0, 0, 0, 0, 0, 0, 0, 0, -8, -8, -8, -8, -8, -8, -8, -8, /* 1f60-1f6f */ - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 1f70-1f7f */ - 0, 0, 0, 0, 0, 0, 0, 0, -8, -8, -8, -8, -8, -8, -8, -8, /* 1f80-1f8f */ - 0, 0, 0, 0, 0, 0, 0, 0, -8, -8, -8, -8, -8, -8, -8, -8, /* 1f90-1f9f */ - 0, 0, 0, 0, 0, 0, 0, 0, -8, -8, -8, -8, -8, -8, -8, -8, /* 1fa0-1faf */ - 0, 0, 0, 0, 0, 0, 0, 0, -8, -8, -74, -74, -9, 0, 0, 0, /* 1fb0-1fbf */ - 0, 0, 0, 0, 0, 0, 0, 0, -86, -86, -86, -86, -9, 0, 0, 0, /* 1fc0-1fcf */ - 0, 0, 0, 0, 0, 0, 0, 0, -8, -8, -100, -100, 0, 0, 0, 0, /* 1fd0-1fdf */ - 0, 0, 0, 0, 0, 0, 0, 0, -8, -8, -112, -112, -7, 0, 0, 0, /* 1fe0-1fef */ - 0, 0, 0, 0, 0, 0, 0, 0, -}; - -/* Lower case range - Wide latin */ -static signed char UniCaseRangeLff20[27] = { - 0, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, /* ff20-ff2f */ - 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, -}; - -/* - * Lower Case Range - */ -const struct UniCaseRange CifsUniLowerRange[] = { - {0x0380, 0x03ab, UniCaseRangeL0380}, - {0x0400, 0x042f, UniCaseRangeL0400}, - {0x0490, 0x04cb, UniCaseRangeL0490}, - {0x1e00, 0x1ff7, UniCaseRangeL1e00}, - {0xff20, 0xff3a, UniCaseRangeLff20}, - {0} -}; -#endif diff --git a/fs/smb/client/cifsfs.c b/fs/smb/client/cifsfs.c index a4d8b0ea1c8c..2131638f26d0 100644 --- a/fs/smb/client/cifsfs.c +++ b/fs/smb/client/cifsfs.c @@ -117,6 +117,10 @@ module_param(cifs_max_pending, uint, 0444); MODULE_PARM_DESC(cifs_max_pending, "Simultaneous requests to server for " "CIFS/SMB1 dialect (N/A for SMB3) " "Default: 32767 Range: 2 to 32767."); +unsigned int dir_cache_timeout = 30; +module_param(dir_cache_timeout, uint, 0644); +MODULE_PARM_DESC(dir_cache_timeout, "Number of seconds to cache directory contents for which we have a lease. Default: 30 " + "Range: 1 to 65000 seconds, 0 to disable caching dir contents"); #ifdef CONFIG_CIFS_STATS2 unsigned int slow_rsp_threshold = 1; module_param(slow_rsp_threshold, uint, 0644); @@ -695,6 +699,8 @@ cifs_show_options(struct seq_file *s, struct dentry *root) seq_printf(s, ",snapshot=%llu", tcon->snapshot_time); if (tcon->handle_timeout) seq_printf(s, ",handletimeout=%u", tcon->handle_timeout); + if (tcon->max_cached_dirs != MAX_CACHED_FIDS) + seq_printf(s, ",max_cached_dirs=%u", tcon->max_cached_dirs); /* * Display file and directory attribute timeout in seconds. @@ -1077,7 +1083,7 @@ static loff_t cifs_llseek(struct file *file, loff_t offset, int whence) } static int -cifs_setlease(struct file *file, long arg, struct file_lock **lease, void **priv) +cifs_setlease(struct file *file, int arg, struct file_lock **lease, void **priv) { /* * Note that this is called by vfs setlease with i_lock held to @@ -1185,36 +1191,108 @@ const char *cifs_get_link(struct dentry *dentry, struct inode *inode, const struct inode_operations cifs_symlink_inode_ops = { .get_link = cifs_get_link, + .setattr = cifs_setattr, .permission = cifs_permission, .listxattr = cifs_listxattr, }; +/* + * Advance the EOF marker to after the source range. + */ +static int cifs_precopy_set_eof(struct inode *src_inode, struct cifsInodeInfo *src_cifsi, + struct cifs_tcon *src_tcon, + unsigned int xid, loff_t src_end) +{ + struct cifsFileInfo *writeable_srcfile; + int rc = -EINVAL; + + writeable_srcfile = find_writable_file(src_cifsi, FIND_WR_FSUID_ONLY); + if (writeable_srcfile) { + if (src_tcon->ses->server->ops->set_file_size) + rc = src_tcon->ses->server->ops->set_file_size( + xid, src_tcon, writeable_srcfile, + src_inode->i_size, true /* no need to set sparse */); + else + rc = -ENOSYS; + cifsFileInfo_put(writeable_srcfile); + cifs_dbg(FYI, "SetFSize for copychunk rc = %d\n", rc); + } + + if (rc < 0) + goto set_failed; + + netfs_resize_file(&src_cifsi->netfs, src_end); + fscache_resize_cookie(cifs_inode_cookie(src_inode), src_end); + return 0; + +set_failed: + return filemap_write_and_wait(src_inode->i_mapping); +} + +/* + * Flush out either the folio that overlaps the beginning of a range in which + * pos resides or the folio that overlaps the end of a range unless that folio + * is entirely within the range we're going to invalidate. We extend the flush + * bounds to encompass the folio. + */ +static int cifs_flush_folio(struct inode *inode, loff_t pos, loff_t *_fstart, loff_t *_fend, + bool first) +{ + struct folio *folio; + unsigned long long fpos, fend; + pgoff_t index = pos / PAGE_SIZE; + size_t size; + int rc = 0; + + folio = filemap_get_folio(inode->i_mapping, index); + if (IS_ERR(folio)) + return 0; + + size = folio_size(folio); + fpos = folio_pos(folio); + fend = fpos + size - 1; + *_fstart = min_t(unsigned long long, *_fstart, fpos); + *_fend = max_t(unsigned long long, *_fend, fend); + if ((first && pos == fpos) || (!first && pos == fend)) + goto out; + + rc = filemap_write_and_wait_range(inode->i_mapping, fpos, fend); +out: + folio_put(folio); + return rc; +} + static loff_t cifs_remap_file_range(struct file *src_file, loff_t off, struct file *dst_file, loff_t destoff, loff_t len, unsigned int remap_flags) { struct inode *src_inode = file_inode(src_file); struct inode *target_inode = file_inode(dst_file); + struct cifsInodeInfo *src_cifsi = CIFS_I(src_inode); + struct cifsInodeInfo *target_cifsi = CIFS_I(target_inode); struct cifsFileInfo *smb_file_src = src_file->private_data; - struct cifsFileInfo *smb_file_target; - struct cifs_tcon *target_tcon; + struct cifsFileInfo *smb_file_target = dst_file->private_data; + struct cifs_tcon *target_tcon, *src_tcon; + unsigned long long destend, fstart, fend, new_size; unsigned int xid; int rc; - if (remap_flags & ~(REMAP_FILE_DEDUP | REMAP_FILE_ADVISORY)) + if (remap_flags & REMAP_FILE_DEDUP) + return -EOPNOTSUPP; + if (remap_flags & ~REMAP_FILE_ADVISORY) return -EINVAL; cifs_dbg(FYI, "clone range\n"); xid = get_xid(); - if (!src_file->private_data || !dst_file->private_data) { + if (!smb_file_src || !smb_file_target) { rc = -EBADF; cifs_dbg(VFS, "missing cifsFileInfo on copy range src file\n"); goto out; } - smb_file_target = dst_file->private_data; + src_tcon = tlink_tcon(smb_file_src->tlink); target_tcon = tlink_tcon(smb_file_target->tlink); /* @@ -1227,20 +1305,63 @@ static loff_t cifs_remap_file_range(struct file *src_file, loff_t off, if (len == 0) len = src_inode->i_size - off; - cifs_dbg(FYI, "about to flush pages\n"); - /* should we flush first and last page first */ - truncate_inode_pages_range(&target_inode->i_data, destoff, - PAGE_ALIGN(destoff + len)-1); + cifs_dbg(FYI, "clone range\n"); + + /* Flush the source buffer */ + rc = filemap_write_and_wait_range(src_inode->i_mapping, off, + off + len - 1); + if (rc) + goto unlock; + + /* The server-side copy will fail if the source crosses the EOF marker. + * Advance the EOF marker after the flush above to the end of the range + * if it's short of that. + */ + if (src_cifsi->netfs.remote_i_size < off + len) { + rc = cifs_precopy_set_eof(src_inode, src_cifsi, src_tcon, xid, off + len); + if (rc < 0) + goto unlock; + } + + new_size = destoff + len; + destend = destoff + len - 1; + + /* Flush the folios at either end of the destination range to prevent + * accidental loss of dirty data outside of the range. + */ + fstart = destoff; + fend = destend; + + rc = cifs_flush_folio(target_inode, destoff, &fstart, &fend, true); + if (rc) + goto unlock; + rc = cifs_flush_folio(target_inode, destend, &fstart, &fend, false); + if (rc) + goto unlock; + + /* Discard all the folios that overlap the destination region. */ + cifs_dbg(FYI, "about to discard pages %llx-%llx\n", fstart, fend); + truncate_inode_pages_range(&target_inode->i_data, fstart, fend); - if (target_tcon->ses->server->ops->duplicate_extents) + fscache_invalidate(cifs_inode_cookie(target_inode), NULL, + i_size_read(target_inode), 0); + + rc = -EOPNOTSUPP; + if (target_tcon->ses->server->ops->duplicate_extents) { rc = target_tcon->ses->server->ops->duplicate_extents(xid, smb_file_src, smb_file_target, off, len, destoff); - else - rc = -EOPNOTSUPP; + if (rc == 0 && new_size > i_size_read(target_inode)) { + truncate_setsize(target_inode, new_size); + netfs_resize_file(&target_cifsi->netfs, new_size); + fscache_resize_cookie(cifs_inode_cookie(target_inode), + new_size); + } + } /* force revalidate of size and timestamps of target file now that target is updated on the server */ CIFS_I(target_inode)->time = 0; +unlock: /* although unlocking in the reverse order from locking is not strictly necessary here it is a little cleaner to be consistent */ unlock_two_nondirectories(src_inode, target_inode); @@ -1256,10 +1377,12 @@ ssize_t cifs_file_copychunk_range(unsigned int xid, { struct inode *src_inode = file_inode(src_file); struct inode *target_inode = file_inode(dst_file); + struct cifsInodeInfo *src_cifsi = CIFS_I(src_inode); struct cifsFileInfo *smb_file_src; struct cifsFileInfo *smb_file_target; struct cifs_tcon *src_tcon; struct cifs_tcon *target_tcon; + unsigned long long destend, fstart, fend; ssize_t rc; cifs_dbg(FYI, "copychunk range\n"); @@ -1299,13 +1422,41 @@ ssize_t cifs_file_copychunk_range(unsigned int xid, if (rc) goto unlock; - /* should we flush first and last page first */ - truncate_inode_pages(&target_inode->i_data, 0); + /* The server-side copy will fail if the source crosses the EOF marker. + * Advance the EOF marker after the flush above to the end of the range + * if it's short of that. + */ + if (src_cifsi->server_eof < off + len) { + rc = cifs_precopy_set_eof(src_inode, src_cifsi, src_tcon, xid, off + len); + if (rc < 0) + goto unlock; + } + + destend = destoff + len - 1; + + /* Flush the folios at either end of the destination range to prevent + * accidental loss of dirty data outside of the range. + */ + fstart = destoff; + fend = destend; + + rc = cifs_flush_folio(target_inode, destoff, &fstart, &fend, true); + if (rc) + goto unlock; + rc = cifs_flush_folio(target_inode, destend, &fstart, &fend, false); + if (rc) + goto unlock; + + /* Discard all the folios that overlap the destination region. */ + truncate_inode_pages_range(&target_inode->i_data, fstart, fend); rc = file_modified(dst_file); - if (!rc) + if (!rc) { rc = target_tcon->ses->server->ops->copychunk_range(xid, smb_file_src, smb_file_target, off, len, destoff); + if (rc > 0 && destoff + rc > i_size_read(target_inode)) + truncate_setsize(target_inode, destoff + rc); + } file_accessed(src_file); @@ -1679,6 +1830,12 @@ init_cifs(void) CIFS_MAX_REQ); } + /* Limit max to about 18 hours, and setting to zero disables directory entry caching */ + if (dir_cache_timeout > 65000) { + dir_cache_timeout = 65000; + cifs_dbg(VFS, "dir_cache_timeout set to max of 65000 seconds\n"); + } + cifsiod_wq = alloc_workqueue("cifsiod", WQ_FREEZABLE|WQ_MEM_RECLAIM, 0); if (!cifsiod_wq) { rc = -ENOMEM; @@ -1805,7 +1962,7 @@ exit_cifs(void) cifs_dbg(NOISY, "exit_smb3\n"); unregister_filesystem(&cifs_fs_type); unregister_filesystem(&smb3_fs_type); - cifs_dfs_release_automount_timer(); + cifs_release_automount_timer(); exit_cifs_idmap(); #ifdef CONFIG_CIFS_SWN_UPCALL cifs_genl_exit(); diff --git a/fs/smb/client/cifsfs.h b/fs/smb/client/cifsfs.h index 15c8cc4b6680..3adea10aa9da 100644 --- a/fs/smb/client/cifsfs.h +++ b/fs/smb/client/cifsfs.h @@ -81,7 +81,7 @@ extern int cifs_fiemap(struct inode *, struct fiemap_extent_info *, u64 start, extern const struct inode_operations cifs_file_inode_ops; extern const struct inode_operations cifs_symlink_inode_ops; -extern const struct inode_operations cifs_dfs_referral_inode_operations; +extern const struct inode_operations cifs_namespace_inode_operations; /* Functions related to files and directories */ @@ -118,14 +118,7 @@ extern void cifs_pages_write_redirty(struct inode *inode, loff_t start, unsigned extern const struct dentry_operations cifs_dentry_ops; extern const struct dentry_operations cifs_ci_dentry_ops; -#ifdef CONFIG_CIFS_DFS_UPCALL -extern struct vfsmount *cifs_dfs_d_automount(struct path *path); -#else -static inline struct vfsmount *cifs_dfs_d_automount(struct path *path) -{ - return ERR_PTR(-EREMOTE); -} -#endif +extern struct vfsmount *cifs_d_automount(struct path *path); /* Functions related to symlinks */ extern const char *cifs_get_link(struct dentry *, struct inode *, @@ -134,7 +127,7 @@ extern int cifs_symlink(struct mnt_idmap *idmap, struct inode *inode, struct dentry *direntry, const char *symname); #ifdef CONFIG_CIFS_XATTR -extern const struct xattr_handler *cifs_xattr_handlers[]; +extern const struct xattr_handler * const cifs_xattr_handlers[]; extern ssize_t cifs_listxattr(struct dentry *, char *, size_t); #else # define cifs_xattr_handlers NULL @@ -159,6 +152,6 @@ extern const struct export_operations cifs_export_ops; #endif /* CONFIG_CIFS_NFSD_EXPORT */ /* when changing internal version - update following two lines at same time */ -#define SMB3_PRODUCT_BUILD 44 -#define CIFS_VERSION "2.44" +#define SMB3_PRODUCT_BUILD 46 +#define CIFS_VERSION "2.46" #endif /* _CIFSFS_H */ diff --git a/fs/smb/client/cifsglob.h b/fs/smb/client/cifsglob.h index 657dee4b2c8c..55b3ce944022 100644 --- a/fs/smb/client/cifsglob.h +++ b/fs/smb/client/cifsglob.h @@ -186,6 +186,18 @@ struct cifs_cred { }; struct cifs_open_info_data { + bool adjust_tz; + union { + bool reparse_point; + bool symlink; + }; + struct { + __u32 tag; + union { + struct reparse_data_buffer *buf; + struct reparse_posix_data *posix; + }; + } reparse; char *symlink_target; union { struct smb2_file_all_info fi; @@ -193,6 +205,10 @@ struct cifs_open_info_data { }; }; +#define cifs_open_data_reparse(d) \ + ((d)->reparse_point || \ + (le32_to_cpu((d)->fi.Attributes) & ATTR_REPARSE)) + static inline void cifs_free_open_info(struct cifs_open_info_data *data) { kfree(data->symlink_target); @@ -318,16 +334,21 @@ struct smb_version_operations { int (*is_path_accessible)(const unsigned int, struct cifs_tcon *, struct cifs_sb_info *, const char *); /* query path data from the server */ - int (*query_path_info)(const unsigned int xid, struct cifs_tcon *tcon, - struct cifs_sb_info *cifs_sb, const char *full_path, - struct cifs_open_info_data *data, bool *adjust_tz, bool *reparse); + int (*query_path_info)(const unsigned int xid, + struct cifs_tcon *tcon, + struct cifs_sb_info *cifs_sb, + const char *full_path, + struct cifs_open_info_data *data); /* query file data from the server */ int (*query_file_info)(const unsigned int xid, struct cifs_tcon *tcon, struct cifsFileInfo *cfile, struct cifs_open_info_data *data); - /* query reparse tag from srv to determine which type of special file */ - int (*query_reparse_tag)(const unsigned int xid, struct cifs_tcon *tcon, - struct cifs_sb_info *cifs_sb, const char *path, - __u32 *reparse_tag); + /* query reparse point to determine which type of special file */ + int (*query_reparse_point)(const unsigned int xid, + struct cifs_tcon *tcon, + struct cifs_sb_info *cifs_sb, + const char *full_path, + u32 *tag, struct kvec *rsp, + int *rsp_buftype); /* get server index number */ int (*get_srv_inum)(const unsigned int xid, struct cifs_tcon *tcon, struct cifs_sb_info *cifs_sb, const char *full_path, u64 *uniqueid, @@ -376,9 +397,11 @@ struct smb_version_operations { const char *, const char *, struct cifs_sb_info *); /* query symlink target */ - int (*query_symlink)(const unsigned int, struct cifs_tcon *, - struct cifs_sb_info *, const char *, - char **, bool); + int (*query_symlink)(const unsigned int xid, + struct cifs_tcon *tcon, + struct cifs_sb_info *cifs_sb, + const char *full_path, + char **target_path); /* open a file for non-posix mounts */ int (*open)(const unsigned int xid, struct cifs_open_parms *oparms, __u32 *oplock, void *buf); @@ -509,7 +532,8 @@ struct smb_version_operations { struct mid_q_entry **, char **, int *); enum securityEnum (*select_sectype)(struct TCP_Server_Info *, enum securityEnum); - int (*next_header)(char *); + int (*next_header)(struct TCP_Server_Info *server, char *buf, + unsigned int *noff); /* ioctl passthrough for query_info */ int (*ioctl_query_info)(const unsigned int xid, struct cifs_tcon *tcon, @@ -533,6 +557,9 @@ struct smb_version_operations { bool (*is_status_io_timeout)(char *buf); /* Check for STATUS_NETWORK_NAME_DELETED */ bool (*is_network_name_deleted)(char *buf, struct TCP_Server_Info *srv); + int (*parse_reparse_point)(struct cifs_sb_info *cifs_sb, + struct kvec *rsp_iov, + struct cifs_open_info_data *data); }; struct smb_version_values { @@ -632,6 +659,7 @@ struct TCP_Server_Info { bool noautotune; /* do not autotune send buf sizes */ bool nosharesock; bool tcp_nodelay; + bool terminate; unsigned int credits; /* send no more requests at once */ unsigned int max_credits; /* can override large 32000 default at mnt */ unsigned int in_flight; /* number of requests on the wire to server */ @@ -727,8 +755,9 @@ struct TCP_Server_Info { * primary_server holds the ref-counted * pointer to primary channel connection for the session. */ -#define CIFS_SERVER_IS_CHAN(server) (!!(server)->primary_server) +#define SERVER_IS_CHAN(server) (!!(server)->primary_server) struct TCP_Server_Info *primary_server; + __u16 channel_sequence_num; /* incremented on primary channel on each chan reconnect */ #ifdef CONFIG_CIFS_SWN_UPCALL bool use_swn_dstaddr; @@ -950,6 +979,8 @@ struct cifs_server_iface { struct list_head iface_head; struct kref refcount; size_t speed; + size_t weight_fulfilled; + unsigned int num_channels; unsigned int rdma_capable : 1; unsigned int rss_capable : 1; unsigned int is_active : 1; /* unset if non existent */ @@ -1031,6 +1062,7 @@ struct cifs_ses { spinlock_t chan_lock; /* ========= begin: protected by chan_lock ======== */ #define CIFS_MAX_CHANNELS 16 +#define CIFS_INVAL_CHAN_INDEX (-1) #define CIFS_ALL_CHANNELS_SET(ses) \ ((1UL << (ses)->chan_count) - 1) #define CIFS_ALL_CHANS_GOOD(ses) \ @@ -1076,7 +1108,7 @@ cap_unix(struct cifs_ses *ses) * inode with new info */ -#define CIFS_FATTR_DFS_REFERRAL 0x1 +#define CIFS_FATTR_JUNCTION 0x1 #define CIFS_FATTR_DELETE_PENDING 0x2 #define CIFS_FATTR_NEED_REVAL 0x4 #define CIFS_FATTR_INO_COLLISION 0x8 @@ -1191,6 +1223,7 @@ struct cifs_tcon { __u32 max_chunks; __u32 max_bytes_chunk; __u32 max_bytes_copy; + __u32 max_cached_dirs; #ifdef CONFIG_CIFS_FSCACHE u64 resource_id; /* server resource id */ struct fscache_volume *fscache; /* cookie for share */ @@ -1721,11 +1754,23 @@ struct cifs_mount_ctx { struct list_head dfs_ses_list; }; +static inline void __free_dfs_info_param(struct dfs_info3_param *param) +{ + kfree(param->path_name); + kfree(param->node_name); +} + static inline void free_dfs_info_param(struct dfs_info3_param *param) { + if (param) + __free_dfs_info_param(param); +} + +static inline void zfree_dfs_info_param(struct dfs_info3_param *param) +{ if (param) { - kfree(param->path_name); - kfree(param->node_name); + __free_dfs_info_param(param); + memset(param, 0, sizeof(*param)); } } @@ -1775,6 +1820,7 @@ static inline bool is_retryable_error(int error) #define MID_RETRY_NEEDED 8 /* session closed while this request out */ #define MID_RESPONSE_MALFORMED 0x10 #define MID_SHUTDOWN 0x20 +#define MID_RESPONSE_READY 0x40 /* ready for other process handle the rsp */ /* Flags */ #define MID_WAIT_CANCELLED 1 /* Cancelled while waiting for response */ @@ -1911,7 +1957,7 @@ require use of the stronger protocol */ * cifsInodeInfo->lock_sem cifsInodeInfo->llist cifs_init_once * ->can_cache_brlcks * cifsInodeInfo->deferred_lock cifsInodeInfo->deferred_closes cifsInodeInfo_alloc - * cached_fid->fid_mutex cifs_tcon->crfid tconInfoAlloc + * cached_fid->fid_mutex cifs_tcon->crfid tcon_info_alloc * cifsFileInfo->fh_mutex cifsFileInfo cifs_new_fileinfo * cifsFileInfo->file_info_lock cifsFileInfo->count cifs_new_fileinfo * ->invalidHandle initiate_cifs_search @@ -1985,6 +2031,7 @@ extern unsigned int CIFSMaxBufSize; /* max size not including hdr */ extern unsigned int cifs_min_rcv; /* min size of big ntwrk buf pool */ extern unsigned int cifs_min_small; /* min size of small buf pool */ extern unsigned int cifs_max_pending; /* MAX requests at once to server*/ +extern unsigned int dir_cache_timeout; /* max time for directory lease caching of dir */ extern bool disable_legacy_dialects; /* forbid vers=1.0 and vers=2.0 mounts */ extern atomic_t mid_count; @@ -2109,6 +2156,7 @@ static inline int cifs_get_num_sgs(const struct smb_rqst *rqst, unsigned int len, skip; unsigned int nents = 0; unsigned long addr; + size_t data_size; int i, j; /* @@ -2124,17 +2172,21 @@ static inline int cifs_get_num_sgs(const struct smb_rqst *rqst, * rqst[1+].rq_iov[0+] data to be encrypted/decrypted */ for (i = 0; i < num_rqst; i++) { + data_size = iov_iter_count(&rqst[i].rq_iter); + /* We really don't want a mixture of pinned and unpinned pages * in the sglist. It's hard to keep track of which is what. * Instead, we convert to a BVEC-type iterator higher up. */ - if (WARN_ON_ONCE(user_backed_iter(&rqst[i].rq_iter))) + if (data_size && + WARN_ON_ONCE(user_backed_iter(&rqst[i].rq_iter))) return -EIO; /* We also don't want to have any extra refs or pins to clean * up in the sglist. */ - if (WARN_ON_ONCE(iov_iter_extract_will_pin(&rqst[i].rq_iter))) + if (data_size && + WARN_ON_ONCE(iov_iter_extract_will_pin(&rqst[i].rq_iter))) return -EIO; for (j = 0; j < rqst[i].rq_nvec; j++) { @@ -2150,7 +2202,8 @@ static inline int cifs_get_num_sgs(const struct smb_rqst *rqst, } skip = 0; } - nents += iov_iter_npages(&rqst[i].rq_iter, INT_MAX); + if (data_size) + nents += iov_iter_npages(&rqst[i].rq_iter, INT_MAX); } nents += DIV_ROUND_UP(offset_in_page(sig) + SMB2_SIGNATURE_SIZE, PAGE_SIZE); return nents; @@ -2184,4 +2237,17 @@ static inline void cifs_sg_set_buf(struct sg_table *sgtable, } } +struct smb2_compound_vars { + struct cifs_open_parms oparms; + struct kvec rsp_iov[3]; + struct smb_rqst rqst[3]; + struct kvec open_iov[SMB2_CREATE_IOV_SIZE]; + struct kvec qi_iov; + struct kvec io_iov[SMB2_IOCTL_IOV_SIZE]; + struct kvec si_iov[SMB2_SET_INFO_IOV_SIZE]; + struct kvec close_iov; + struct smb2_file_rename_info rename_info; + struct smb2_file_link_info link_info; +}; + #endif /* _CIFS_GLOB_H */ diff --git a/fs/smb/client/cifspdu.h b/fs/smb/client/cifspdu.h index e17222fec9d2..c0513fbb8a59 100644 --- a/fs/smb/client/cifspdu.h +++ b/fs/smb/client/cifspdu.h @@ -882,11 +882,13 @@ typedef struct smb_com_open_rsp { __u8 OplockLevel; __u16 Fid; __le32 CreateAction; - __le64 CreationTime; - __le64 LastAccessTime; - __le64 LastWriteTime; - __le64 ChangeTime; - __le32 FileAttributes; + struct_group(common_attributes, + __le64 CreationTime; + __le64 LastAccessTime; + __le64 LastWriteTime; + __le64 ChangeTime; + __le32 FileAttributes; + ); __le64 AllocationSize; __le64 EndOfFile; __le16 FileType; @@ -1356,7 +1358,7 @@ typedef struct smb_com_transaction_ioctl_rsp { __le32 DataDisplacement; __u8 SetupCount; /* 1 */ __le16 ReturnedDataLen; - __u16 ByteCount; + __le16 ByteCount; } __attribute__((packed)) TRANSACT_IOCTL_RSP; #define CIFS_ACL_OWNER 1 @@ -1509,7 +1511,7 @@ struct reparse_posix_data { __le16 ReparseDataLength; __u16 Reserved; __le64 InodeType; /* LNK, FIFO, CHR etc. */ - char PathBuffer[]; + __u8 DataBuffer[]; } __attribute__((packed)); struct cifs_quota_data { @@ -2264,11 +2266,13 @@ typedef struct { /* QueryFileInfo/QueryPathinfo (also for SetPath/SetFile) data buffer formats */ /******************************************************************************/ typedef struct { /* data block encoding of response to level 263 QPathInfo */ - __le64 CreationTime; - __le64 LastAccessTime; - __le64 LastWriteTime; - __le64 ChangeTime; - __le32 Attributes; + struct_group(common_attributes, + __le64 CreationTime; + __le64 LastAccessTime; + __le64 LastWriteTime; + __le64 ChangeTime; + __le32 Attributes; + ); __u32 Pad1; __le64 AllocationSize; __le64 EndOfFile; /* size ie offset to first free byte in file */ @@ -2570,7 +2574,7 @@ typedef struct { struct win_dev { - unsigned char type[8]; /* IntxCHR or IntxBLK */ + unsigned char type[8]; /* IntxCHR or IntxBLK or LnxFIFO*/ __le64 major; __le64 minor; } __attribute__((packed)); diff --git a/fs/smb/client/cifsproto.h b/fs/smb/client/cifsproto.h index 1d71d658e167..46feaa0880bd 100644 --- a/fs/smb/client/cifsproto.h +++ b/fs/smb/client/cifsproto.h @@ -81,7 +81,7 @@ extern char *cifs_build_path_to_root(struct smb3_fs_context *ctx, extern char *build_wildcard_path_from_dentry(struct dentry *direntry); char *cifs_build_devname(char *nodename, const char *prepath); extern void delete_mid(struct mid_q_entry *mid); -extern void release_mid(struct mid_q_entry *mid); +void __release_mid(struct kref *refcount); extern void cifs_wake_up_task(struct mid_q_entry *mid); extern int cifs_handle_standard(struct TCP_Server_Info *server, struct mid_q_entry *mid); @@ -132,6 +132,7 @@ extern int SendReceiveBlockingLock(const unsigned int xid, struct smb_hdr *in_buf, struct smb_hdr *out_buf, int *bytes_returned); + void cifs_signal_cifsd_for_reconnect(struct TCP_Server_Info *server, bool all_channels); @@ -207,6 +208,9 @@ extern struct inode *cifs_iget(struct super_block *sb, int cifs_get_inode_info(struct inode **inode, const char *full_path, struct cifs_open_info_data *data, struct super_block *sb, int xid, const struct cifs_fid *fid); +bool cifs_reparse_point_to_fattr(struct cifs_sb_info *cifs_sb, + struct cifs_fattr *fattr, + struct cifs_open_info_data *data); extern int smb311_posix_get_inode_info(struct inode **pinode, const char *search_path, struct super_block *sb, unsigned int xid); extern int cifs_get_inode_info_unix(struct inode **pinode, @@ -295,11 +299,7 @@ extern void cifs_put_tcp_session(struct TCP_Server_Info *server, int from_reconnect); extern void cifs_put_tcon(struct cifs_tcon *tcon); -#if IS_ENABLED(CONFIG_CIFS_DFS_UPCALL) -extern void cifs_dfs_release_automount_timer(void); -#else /* ! IS_ENABLED(CONFIG_CIFS_DFS_UPCALL) */ -#define cifs_dfs_release_automount_timer() do { } while (0) -#endif /* ! IS_ENABLED(CONFIG_CIFS_DFS_UPCALL) */ +extern void cifs_release_automount_timer(void); void cifs_proc_init(void); void cifs_proc_clean(void); @@ -458,6 +458,12 @@ extern int CIFSSMBUnixQuerySymLink(const unsigned int xid, struct cifs_tcon *tcon, const unsigned char *searchName, char **syminfo, const struct nls_table *nls_codepage, int remap); +extern int cifs_query_reparse_point(const unsigned int xid, + struct cifs_tcon *tcon, + struct cifs_sb_info *cifs_sb, + const char *full_path, + u32 *tag, struct kvec *rsp, + int *rsp_buftype); extern int CIFSSMBQuerySymLink(const unsigned int xid, struct cifs_tcon *tcon, __u16 fid, char **symlinkinfo, const struct nls_table *nls_codepage); @@ -513,7 +519,7 @@ extern int CIFSSMBLogoff(const unsigned int xid, struct cifs_ses *ses); extern struct cifs_ses *sesInfoAlloc(void); extern void sesInfoFree(struct cifs_ses *); -extern struct cifs_tcon *tconInfoAlloc(void); +extern struct cifs_tcon *tcon_info_alloc(bool dir_leases_enabled); extern void tconInfoFree(struct cifs_tcon *); extern int cifs_sign_rqst(struct smb_rqst *rqst, struct TCP_Server_Info *server, @@ -611,13 +617,13 @@ void cifs_free_hash(struct shash_desc **sdesc); struct cifs_chan * cifs_ses_find_chan(struct cifs_ses *ses, struct TCP_Server_Info *server); -int cifs_try_adding_channels(struct cifs_sb_info *cifs_sb, struct cifs_ses *ses); +int cifs_try_adding_channels(struct cifs_ses *ses); bool is_server_using_iface(struct TCP_Server_Info *server, struct cifs_server_iface *iface); bool is_ses_using_iface(struct cifs_ses *ses, struct cifs_server_iface *iface); void cifs_ses_mark_for_reconnect(struct cifs_ses *ses); -unsigned int +int cifs_ses_get_chan_index(struct cifs_ses *ses, struct TCP_Server_Info *server); void @@ -641,6 +647,8 @@ cifs_chan_needs_reconnect(struct cifs_ses *ses, bool cifs_chan_is_iface_active(struct cifs_ses *ses, struct TCP_Server_Info *server); +void +cifs_disable_secondary_channels(struct cifs_ses *ses); int cifs_chan_update_iface(struct cifs_ses *ses, struct TCP_Server_Info *server); int @@ -657,6 +665,12 @@ void cifs_put_tcp_super(struct super_block *sb); int cifs_update_super_prepath(struct cifs_sb_info *cifs_sb, char *prefix); char *extract_hostname(const char *unc); char *extract_sharename(const char *unc); +int parse_reparse_point(struct reparse_data_buffer *buf, + u32 plen, struct cifs_sb_info *cifs_sb, + bool unicode, struct cifs_open_info_data *data); +int cifs_sfu_make_node(unsigned int xid, struct inode *inode, + struct dentry *dentry, struct cifs_tcon *tcon, + const char *full_path, umode_t mode, dev_t dev); #ifdef CONFIG_CIFS_DFS_UPCALL static inline int get_dfs_path(const unsigned int xid, struct cifs_ses *ses, @@ -741,4 +755,9 @@ static inline bool dfs_src_pathname_equal(const char *s1, const char *s2) return true; } +static inline void release_mid(struct mid_q_entry *mid) +{ + kref_put(&mid->refcount, __release_mid); +} + #endif /* _CIFSPROTO_H */ diff --git a/fs/smb/client/cifssmb.c b/fs/smb/client/cifssmb.c index 25503f1a4fd2..9ee348e6d106 100644 --- a/fs/smb/client/cifssmb.c +++ b/fs/smb/client/cifssmb.c @@ -1244,8 +1244,10 @@ openRetry: *oplock |= CIFS_CREATE_ACTION; if (buf) { - /* copy from CreationTime to Attributes */ - memcpy((char *)buf, (char *)&rsp->CreationTime, 36); + /* copy commonly used attributes */ + memcpy(&buf->common_attributes, + &rsp->common_attributes, + sizeof(buf->common_attributes)); /* the file_info buf is endian converted by caller */ buf->AllocationSize = rsp->AllocationSize; buf->EndOfFile = rsp->EndOfFile; @@ -2690,136 +2692,97 @@ querySymLinkRetry: return rc; } -/* - * Recent Windows versions now create symlinks more frequently - * and they use the "reparse point" mechanism below. We can of course - * do symlinks nicely to Samba and other servers which support the - * CIFS Unix Extensions and we can also do SFU symlinks and "client only" - * "MF" symlinks optionally, but for recent Windows we really need to - * reenable the code below and fix the cifs_symlink callers to handle this. - * In the interim this code has been moved to its own config option so - * it is not compiled in by default until callers fixed up and more tested. - */ -int -CIFSSMBQuerySymLink(const unsigned int xid, struct cifs_tcon *tcon, - __u16 fid, char **symlinkinfo, - const struct nls_table *nls_codepage) +int cifs_query_reparse_point(const unsigned int xid, + struct cifs_tcon *tcon, + struct cifs_sb_info *cifs_sb, + const char *full_path, + u32 *tag, struct kvec *rsp, + int *rsp_buftype) { - int rc = 0; - int bytes_returned; - struct smb_com_transaction_ioctl_req *pSMB; - struct smb_com_transaction_ioctl_rsp *pSMBr; - bool is_unicode; - unsigned int sub_len; - char *sub_start; - struct reparse_symlink_data *reparse_buf; - struct reparse_posix_data *posix_buf; + struct cifs_open_parms oparms; + TRANSACT_IOCTL_REQ *io_req = NULL; + TRANSACT_IOCTL_RSP *io_rsp = NULL; + struct cifs_fid fid; __u32 data_offset, data_count; - char *end_of_smb; + __u8 *start, *end; + int io_rsp_len; + int oplock = 0; + int rc; - cifs_dbg(FYI, "In Windows reparse style QueryLink for fid %u\n", fid); - rc = smb_init(SMB_COM_NT_TRANSACT, 23, tcon, (void **) &pSMB, - (void **) &pSMBr); + cifs_tcon_dbg(FYI, "%s: path=%s\n", __func__, full_path); + + if (cap_unix(tcon->ses)) + return -EOPNOTSUPP; + + oparms = (struct cifs_open_parms) { + .tcon = tcon, + .cifs_sb = cifs_sb, + .desired_access = FILE_READ_ATTRIBUTES, + .create_options = cifs_create_options(cifs_sb, + OPEN_REPARSE_POINT), + .disposition = FILE_OPEN, + .path = full_path, + .fid = &fid, + }; + + rc = CIFS_open(xid, &oparms, &oplock, NULL); if (rc) return rc; - pSMB->TotalParameterCount = 0 ; - pSMB->TotalDataCount = 0; - pSMB->MaxParameterCount = cpu_to_le32(2); - /* BB find exact data count max from sess structure BB */ - pSMB->MaxDataCount = cpu_to_le32(CIFSMaxBufSize & 0xFFFFFF00); - pSMB->MaxSetupCount = 4; - pSMB->Reserved = 0; - pSMB->ParameterOffset = 0; - pSMB->DataCount = 0; - pSMB->DataOffset = 0; - pSMB->SetupCount = 4; - pSMB->SubCommand = cpu_to_le16(NT_TRANSACT_IOCTL); - pSMB->ParameterCount = pSMB->TotalParameterCount; - pSMB->FunctionCode = cpu_to_le32(FSCTL_GET_REPARSE_POINT); - pSMB->IsFsctl = 1; /* FSCTL */ - pSMB->IsRootFlag = 0; - pSMB->Fid = fid; /* file handle always le */ - pSMB->ByteCount = 0; + rc = smb_init(SMB_COM_NT_TRANSACT, 23, tcon, + (void **)&io_req, (void **)&io_rsp); + if (rc) + goto error; - rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, - (struct smb_hdr *) pSMBr, &bytes_returned, 0); - if (rc) { - cifs_dbg(FYI, "Send error in QueryReparseLinkInfo = %d\n", rc); - goto qreparse_out; - } + io_req->TotalParameterCount = 0; + io_req->TotalDataCount = 0; + io_req->MaxParameterCount = cpu_to_le32(2); + /* BB find exact data count max from sess structure BB */ + io_req->MaxDataCount = cpu_to_le32(CIFSMaxBufSize & 0xFFFFFF00); + io_req->MaxSetupCount = 4; + io_req->Reserved = 0; + io_req->ParameterOffset = 0; + io_req->DataCount = 0; + io_req->DataOffset = 0; + io_req->SetupCount = 4; + io_req->SubCommand = cpu_to_le16(NT_TRANSACT_IOCTL); + io_req->ParameterCount = io_req->TotalParameterCount; + io_req->FunctionCode = cpu_to_le32(FSCTL_GET_REPARSE_POINT); + io_req->IsFsctl = 1; + io_req->IsRootFlag = 0; + io_req->Fid = fid.netfid; + io_req->ByteCount = 0; + + rc = SendReceive(xid, tcon->ses, (struct smb_hdr *)io_req, + (struct smb_hdr *)io_rsp, &io_rsp_len, 0); + if (rc) + goto error; - data_offset = le32_to_cpu(pSMBr->DataOffset); - data_count = le32_to_cpu(pSMBr->DataCount); - if (get_bcc(&pSMBr->hdr) < 2 || data_offset > 512) { - /* BB also check enough total bytes returned */ - rc = -EIO; /* bad smb */ - goto qreparse_out; - } - if (!data_count || (data_count > 2048)) { + data_offset = le32_to_cpu(io_rsp->DataOffset); + data_count = le32_to_cpu(io_rsp->DataCount); + if (get_bcc(&io_rsp->hdr) < 2 || data_offset > 512 || + !data_count || data_count > 2048) { rc = -EIO; - cifs_dbg(FYI, "Invalid return data count on get reparse info ioctl\n"); - goto qreparse_out; - } - end_of_smb = 2 + get_bcc(&pSMBr->hdr) + (char *)&pSMBr->ByteCount; - reparse_buf = (struct reparse_symlink_data *) - ((char *)&pSMBr->hdr.Protocol + data_offset); - if ((char *)reparse_buf >= end_of_smb) { - rc = -EIO; - goto qreparse_out; - } - if (reparse_buf->ReparseTag == cpu_to_le32(IO_REPARSE_TAG_NFS)) { - cifs_dbg(FYI, "NFS style reparse tag\n"); - posix_buf = (struct reparse_posix_data *)reparse_buf; - - if (posix_buf->InodeType != cpu_to_le64(NFS_SPECFILE_LNK)) { - cifs_dbg(FYI, "unsupported file type 0x%llx\n", - le64_to_cpu(posix_buf->InodeType)); - rc = -EOPNOTSUPP; - goto qreparse_out; - } - is_unicode = true; - sub_len = le16_to_cpu(reparse_buf->ReparseDataLength); - if (posix_buf->PathBuffer + sub_len > end_of_smb) { - cifs_dbg(FYI, "reparse buf beyond SMB\n"); - rc = -EIO; - goto qreparse_out; - } - *symlinkinfo = cifs_strndup_from_utf16(posix_buf->PathBuffer, - sub_len, is_unicode, nls_codepage); - goto qreparse_out; - } else if (reparse_buf->ReparseTag != - cpu_to_le32(IO_REPARSE_TAG_SYMLINK)) { - rc = -EOPNOTSUPP; - goto qreparse_out; + goto error; } - /* Reparse tag is NTFS symlink */ - sub_start = le16_to_cpu(reparse_buf->SubstituteNameOffset) + - reparse_buf->PathBuffer; - sub_len = le16_to_cpu(reparse_buf->SubstituteNameLength); - if (sub_start + sub_len > end_of_smb) { - cifs_dbg(FYI, "reparse buf beyond SMB\n"); + end = 2 + get_bcc(&io_rsp->hdr) + (__u8 *)&io_rsp->ByteCount; + start = (__u8 *)&io_rsp->hdr.Protocol + data_offset; + if (start >= end) { rc = -EIO; - goto qreparse_out; + goto error; } - if (pSMBr->hdr.Flags2 & SMBFLG2_UNICODE) - is_unicode = true; - else - is_unicode = false; - - /* BB FIXME investigate remapping reserved chars here */ - *symlinkinfo = cifs_strndup_from_utf16(sub_start, sub_len, is_unicode, - nls_codepage); - if (!*symlinkinfo) - rc = -ENOMEM; -qreparse_out: - cifs_buf_release(pSMB); - /* - * Note: On -EAGAIN error only caller can retry on handle based calls - * since file handle passed in no longer valid. - */ + *tag = le32_to_cpu(((struct reparse_data_buffer *)start)->ReparseTag); + rsp->iov_base = io_rsp; + rsp->iov_len = io_rsp_len; + *rsp_buftype = CIFS_LARGE_BUFFER; + CIFSSMBClose(xid, tcon, fid.netfid); + return 0; + +error: + cifs_buf_release(io_req); + CIFSSMBClose(xid, tcon, fid.netfid); return rc; } diff --git a/fs/smb/client/connect.c b/fs/smb/client/connect.c index 238538dde4e3..dd2a1fb65e71 100644 --- a/fs/smb/client/connect.c +++ b/fs/smb/client/connect.c @@ -119,6 +119,7 @@ static int reconn_set_ipaddr_from_hostname(struct TCP_Server_Info *server) static void smb2_query_server_interfaces(struct work_struct *work) { int rc; + int xid; struct cifs_tcon *tcon = container_of(work, struct cifs_tcon, query_interfaces.work); @@ -126,8 +127,14 @@ static void smb2_query_server_interfaces(struct work_struct *work) /* * query server network interfaces, in case they change */ - rc = SMB3_request_interfaces(0, tcon, false); + xid = get_xid(); + rc = SMB3_request_interfaces(xid, tcon, false); + free_xid(xid); + if (rc) { + if (rc == -EOPNOTSUPP) + return; + cifs_dbg(FYI, "%s: failed to query server interfaces: %d\n", __func__, rc); } @@ -154,22 +161,27 @@ cifs_signal_cifsd_for_reconnect(struct TCP_Server_Info *server, int i; /* If server is a channel, select the primary channel */ - pserver = CIFS_SERVER_IS_CHAN(server) ? server->primary_server : server; + pserver = SERVER_IS_CHAN(server) ? server->primary_server : server; - spin_lock(&pserver->srv_lock); + /* if we need to signal just this channel */ if (!all_channels) { - pserver->tcpStatus = CifsNeedReconnect; - spin_unlock(&pserver->srv_lock); + spin_lock(&server->srv_lock); + if (server->tcpStatus != CifsExiting) + server->tcpStatus = CifsNeedReconnect; + spin_unlock(&server->srv_lock); return; } - spin_unlock(&pserver->srv_lock); spin_lock(&cifs_tcp_ses_lock); list_for_each_entry(ses, &pserver->smb_ses_list, smb_ses_list) { spin_lock(&ses->chan_lock); for (i = 0; i < ses->chan_count; i++) { + if (!ses->chans[i].server) + continue; + spin_lock(&ses->chans[i].server->srv_lock); - ses->chans[i].server->tcpStatus = CifsNeedReconnect; + if (ses->chans[i].server->tcpStatus != CifsExiting) + ses->chans[i].server->tcpStatus = CifsNeedReconnect; spin_unlock(&ses->chans[i].server->srv_lock); } spin_unlock(&ses->chan_lock); @@ -202,11 +214,19 @@ cifs_mark_tcp_ses_conns_for_reconnect(struct TCP_Server_Info *server, cifs_dbg(FYI, "%s: marking necessary sessions and tcons for reconnect\n", __func__); /* If server is a channel, select the primary channel */ - pserver = CIFS_SERVER_IS_CHAN(server) ? server->primary_server : server; + pserver = SERVER_IS_CHAN(server) ? server->primary_server : server; spin_lock(&cifs_tcp_ses_lock); list_for_each_entry_safe(ses, nses, &pserver->smb_ses_list, smb_ses_list) { + /* + * if channel has been marked for termination, nothing to do + * for the channel. in fact, we cannot find the channel for the + * server. So safe to exit here + */ + if (server->terminate) + break; + /* check if iface is still active */ if (!cifs_chan_is_iface_active(ses, server)) cifs_chan_update_iface(ses, server); @@ -241,6 +261,8 @@ cifs_mark_tcp_ses_conns_for_reconnect(struct TCP_Server_Info *server, spin_lock(&tcon->tc_lock); tcon->status = TID_NEED_RECON; spin_unlock(&tcon->tc_lock); + + cancel_delayed_work(&tcon->query_interfaces); } if (ses->tcon_ipc) { ses->tcon_ipc->need_reconnect = true; @@ -453,10 +475,10 @@ static int reconnect_target_unlocked(struct TCP_Server_Info *server, struct dfs_ static int reconnect_dfs_server(struct TCP_Server_Info *server) { - int rc = 0; - struct dfs_cache_tgt_list tl = DFS_CACHE_TGT_LIST_INIT(tl); struct dfs_cache_tgt_iterator *target_hint = NULL; + DFS_CACHE_TGT_LIST(tl); int num_targets = 0; + int rc = 0; /* * Determine the number of dfs targets the referral path in @cifs_sb resolves to. @@ -911,8 +933,8 @@ cifs_enable_signing(struct TCP_Server_Info *server, bool mnt_sign_required) return 0; } - -static void clean_demultiplex_info(struct TCP_Server_Info *server) +static noinline_for_stack void +clean_demultiplex_info(struct TCP_Server_Info *server) { int length; @@ -1179,7 +1201,12 @@ next_pdu: server->total_read += length; if (server->ops->next_header) { - next_offset = server->ops->next_header(buf); + if (server->ops->next_header(server, buf, &next_offset)) { + cifs_dbg(VFS, "%s: malformed response (next_offset=%u)\n", + __func__, next_offset); + cifs_reconnect(server, true); + continue; + } if (next_offset) server->pdu_size = next_offset; } @@ -1551,7 +1578,7 @@ cifs_find_tcp_session(struct smb3_fs_context *ctx) * Skip ses channels since they're only handled in lower layers * (e.g. cifs_send_recv). */ - if (CIFS_SERVER_IS_CHAN(server) || + if (SERVER_IS_CHAN(server) || !match_server(server, ctx, false)) { spin_unlock(&server->srv_lock); continue; @@ -1586,10 +1613,6 @@ cifs_put_tcp_session(struct TCP_Server_Info *server, int from_reconnect) list_del_init(&server->tcp_ses_list); spin_unlock(&cifs_tcp_ses_lock); - /* For secondary channels, we pick up ref-count on the primary server */ - if (CIFS_SERVER_IS_CHAN(server)) - cifs_put_tcp_session(server->primary_server, from_reconnect); - cancel_delayed_work_sync(&server->echo); if (from_reconnect) @@ -1603,6 +1626,10 @@ cifs_put_tcp_session(struct TCP_Server_Info *server, int from_reconnect) else cancel_delayed_work_sync(&server->reconnect); + /* For secondary channels, we pick up ref-count on the primary server */ + if (SERVER_IS_CHAN(server)) + cifs_put_tcp_session(server->primary_server, from_reconnect); + spin_lock(&server->srv_lock); server->tcpStatus = CifsExiting; spin_unlock(&server->srv_lock); @@ -1686,6 +1713,7 @@ cifs_get_tcp_session(struct smb3_fs_context *ctx, ctx->target_rfc1001_name, RFC1001_NAME_LEN_WITH_NULL); tcp_ses->session_estab = false; tcp_ses->sequence_number = 0; + tcp_ses->channel_sequence_num = 0; /* only tracked for primary channel */ tcp_ses->reconnect_instance = 1; tcp_ses->lstrp = jiffies; tcp_ses->compress_algorithm = cpu_to_le16(ctx->compression); @@ -1792,7 +1820,7 @@ out_err_crypto_release: out_err: if (tcp_ses) { - if (CIFS_SERVER_IS_CHAN(tcp_ses)) + if (SERVER_IS_CHAN(tcp_ses)) cifs_put_tcp_session(tcp_ses->primary_server, false); kfree(tcp_ses->hostname); kfree(tcp_ses->leaf_fullpath); @@ -1881,7 +1909,8 @@ cifs_setup_ipc(struct cifs_ses *ses, struct smb3_fs_context *ctx) } } - tcon = tconInfoAlloc(); + /* no need to setup directory caching on IPC share, so pass in false */ + tcon = tcon_info_alloc(false); if (tcon == NULL) return -ENOMEM; @@ -1967,9 +1996,10 @@ cifs_find_smb_ses(struct TCP_Server_Info *server, struct smb3_fs_context *ctx) void __cifs_put_smb_ses(struct cifs_ses *ses) { - unsigned int rc, xid; - unsigned int chan_count; struct TCP_Server_Info *server = ses->server; + unsigned int xid; + size_t i; + int rc; spin_lock(&ses->ses_lock); if (ses->ses_status == SES_EXITING) { @@ -2015,20 +2045,20 @@ void __cifs_put_smb_ses(struct cifs_ses *ses) list_del_init(&ses->smb_ses_list); spin_unlock(&cifs_tcp_ses_lock); - chan_count = ses->chan_count; - /* close any extra channels */ - if (chan_count > 1) { - int i; - - for (i = 1; i < chan_count; i++) { - if (ses->chans[i].iface) { - kref_put(&ses->chans[i].iface->refcount, release_iface); - ses->chans[i].iface = NULL; - } - cifs_put_tcp_session(ses->chans[i].server, 0); - ses->chans[i].server = NULL; + for (i = 1; i < ses->chan_count; i++) { + if (ses->chans[i].iface) { + kref_put(&ses->chans[i].iface->refcount, release_iface); + ses->chans[i].iface = NULL; } + cifs_put_tcp_session(ses->chans[i].server, 0); + ses->chans[i].server = NULL; + } + + /* we now account for primary channel in iface->refcount */ + if (ses->chans[0].iface) { + kref_put(&ses->chans[0].iface->refcount, release_iface); + ses->chans[0].server = NULL; } sesInfoFree(ses); @@ -2472,8 +2502,9 @@ cifs_put_tcon(struct cifs_tcon *tcon) static struct cifs_tcon * cifs_get_tcon(struct cifs_ses *ses, struct smb3_fs_context *ctx) { - int rc, xid; struct cifs_tcon *tcon; + bool nohandlecache; + int rc, xid; tcon = cifs_find_tcon(ses, ctx); if (tcon) { @@ -2491,11 +2522,17 @@ cifs_get_tcon(struct cifs_ses *ses, struct smb3_fs_context *ctx) goto out_fail; } - tcon = tconInfoAlloc(); + if (ses->server->dialect >= SMB20_PROT_ID && + (ses->server->capabilities & SMB2_GLOBAL_CAP_DIRECTORY_LEASING)) + nohandlecache = ctx->nohandlecache; + else + nohandlecache = true; + tcon = tcon_info_alloc(!nohandlecache); if (tcon == NULL) { rc = -ENOMEM; goto out_fail; } + tcon->nohandlecache = nohandlecache; if (ctx->snapshot_time) { if (ses->server->vals->protocol_id == 0) { @@ -2656,10 +2693,7 @@ cifs_get_tcon(struct cifs_ses *ses, struct smb3_fs_context *ctx) tcon->retry = ctx->retry; tcon->nocase = ctx->nocase; tcon->broken_sparse_sup = ctx->no_sparse; - if (ses->server->capabilities & SMB2_GLOBAL_CAP_DIRECTORY_LEASING) - tcon->nohandlecache = ctx->nohandlecache; - else - tcon->nohandlecache = true; + tcon->max_cached_dirs = ctx->max_cached_dirs; tcon->nodelete = ctx->nodelete; tcon->local_lease = ctx->local_lease; INIT_LIST_HEAD(&tcon->pending_opens); @@ -2889,9 +2923,9 @@ bind_socket(struct TCP_Server_Info *server) if (server->srcaddr.ss_family != AF_UNSPEC) { /* Bind to the specified local IP address */ struct socket *socket = server->ssocket; - rc = socket->ops->bind(socket, - (struct sockaddr *) &server->srcaddr, - sizeof(server->srcaddr)); + rc = kernel_bind(socket, + (struct sockaddr *) &server->srcaddr, + sizeof(server->srcaddr)); if (rc < 0) { struct sockaddr_in *saddr4; struct sockaddr_in6 *saddr6; @@ -3040,8 +3074,8 @@ generic_ip_connect(struct TCP_Server_Info *server) socket->sk->sk_sndbuf, socket->sk->sk_rcvbuf, socket->sk->sk_rcvtimeo); - rc = socket->ops->connect(socket, saddr, slen, - server->noblockcnt ? O_NONBLOCK : 0); + rc = kernel_connect(socket, saddr, slen, + server->noblockcnt ? O_NONBLOCK : 0); /* * When mounting SMB root file systems, we do not want to block in * connect. Otherwise bail out and then let cifs_reconnect() perform @@ -3554,7 +3588,7 @@ int cifs_mount(struct cifs_sb_info *cifs_sb, struct smb3_fs_context *ctx) ctx->prepath = NULL; out: - cifs_try_adding_channels(cifs_sb, mnt_ctx.ses); + cifs_try_adding_channels(mnt_ctx.ses); rc = mount_setup_tlink(cifs_sb, mnt_ctx.ses, mnt_ctx.tcon); if (rc) goto error; @@ -3813,7 +3847,7 @@ cifs_setup_session(const unsigned int xid, struct cifs_ses *ses, struct nls_table *nls_info) { int rc = -ENOSYS; - struct TCP_Server_Info *pserver = CIFS_SERVER_IS_CHAN(server) ? server->primary_server : server; + struct TCP_Server_Info *pserver = SERVER_IS_CHAN(server) ? server->primary_server : server; struct sockaddr_in6 *addr6 = (struct sockaddr_in6 *)&pserver->dstaddr; struct sockaddr_in *addr = (struct sockaddr_in *)&pserver->dstaddr; bool is_binding = false; @@ -3843,8 +3877,12 @@ cifs_setup_session(const unsigned int xid, struct cifs_ses *ses, is_binding = !CIFS_ALL_CHANS_NEED_RECONNECT(ses); spin_unlock(&ses->chan_lock); - if (!is_binding) + if (!is_binding) { ses->ses_status = SES_IN_SETUP; + + /* force iface_list refresh */ + ses->iface_last_update = 0; + } spin_unlock(&ses->ses_lock); /* update ses ip_addr only for primary chan */ diff --git a/fs/smb/client/dfs.c b/fs/smb/client/dfs.c index ee772c3d9f00..a8a1d386da65 100644 --- a/fs/smb/client/dfs.c +++ b/fs/smb/client/dfs.c @@ -3,7 +3,6 @@ * Copyright (c) 2022 Paulo Alcantara <palcantara@suse.de> */ -#include <linux/namei.h> #include "cifsproto.h" #include "cifs_debug.h" #include "dns_resolve.h" @@ -96,51 +95,134 @@ static int add_root_smb_session(struct cifs_mount_ctx *mnt_ctx) return 0; } -static int get_dfs_conn(struct cifs_mount_ctx *mnt_ctx, const char *ref_path, const char *full_path, - const struct dfs_cache_tgt_iterator *tit) +static inline int parse_dfs_target(struct smb3_fs_context *ctx, + struct dfs_ref_walk *rw, + struct dfs_info3_param *tgt) +{ + int rc; + const char *fpath = ref_walk_fpath(rw) + 1; + + rc = ref_walk_get_tgt(rw, tgt); + if (!rc) + rc = dfs_parse_target_referral(fpath, tgt, ctx); + return rc; +} + +static int set_ref_paths(struct cifs_mount_ctx *mnt_ctx, + struct dfs_info3_param *tgt, + struct dfs_ref_walk *rw) { struct smb3_fs_context *ctx = mnt_ctx->fs_ctx; - struct dfs_info3_param ref = {}; - bool is_refsrv; - int rc, rc2; + struct cifs_sb_info *cifs_sb = mnt_ctx->cifs_sb; + char *ref_path, *full_path; + int rc; - rc = dfs_cache_get_tgt_referral(ref_path + 1, tit, &ref); - if (rc) + full_path = smb3_fs_context_fullpath(ctx, CIFS_DIR_SEP(cifs_sb)); + if (IS_ERR(full_path)) + return PTR_ERR(full_path); + + if (!tgt || (tgt->server_type == DFS_TYPE_LINK && + DFS_INTERLINK(tgt->flags))) + ref_path = dfs_get_path(cifs_sb, ctx->UNC); + else + ref_path = dfs_get_path(cifs_sb, full_path); + if (IS_ERR(ref_path)) { + rc = PTR_ERR(ref_path); + kfree(full_path); return rc; + } + ref_walk_path(rw) = ref_path; + ref_walk_fpath(rw) = full_path; + return 0; +} - rc = dfs_parse_target_referral(full_path + 1, &ref, ctx); - if (rc) - goto out; +static int __dfs_referral_walk(struct cifs_mount_ctx *mnt_ctx, + struct dfs_ref_walk *rw) +{ + struct smb3_fs_context *ctx = mnt_ctx->fs_ctx; + struct dfs_info3_param tgt = {}; + bool is_refsrv; + int rc = -ENOENT; - cifs_mount_put_conns(mnt_ctx); - rc = get_session(mnt_ctx, ref_path); - if (rc) - goto out; +again: + do { + if (ref_walk_empty(rw)) { + rc = dfs_get_referral(mnt_ctx, ref_walk_path(rw) + 1, + NULL, ref_walk_tl(rw)); + if (rc) { + rc = cifs_mount_get_tcon(mnt_ctx); + if (!rc) + rc = cifs_is_path_remote(mnt_ctx); + continue; + } + if (!ref_walk_num_tgts(rw)) { + rc = -ENOENT; + continue; + } + } - is_refsrv = !!(ref.flags & DFSREF_REFERRAL_SERVER); + while (ref_walk_next_tgt(rw)) { + rc = parse_dfs_target(ctx, rw, &tgt); + if (rc) + continue; - rc = -EREMOTE; - if (ref.flags & DFSREF_STORAGE_SERVER) { - rc = cifs_mount_get_tcon(mnt_ctx); - if (rc) - goto out; + cifs_mount_put_conns(mnt_ctx); + rc = get_session(mnt_ctx, ref_walk_path(rw)); + if (rc) + continue; - /* some servers may not advertise referral capability under ref.flags */ - is_refsrv |= is_tcon_dfs(mnt_ctx->tcon); + is_refsrv = tgt.server_type == DFS_TYPE_ROOT || + DFS_INTERLINK(tgt.flags); + ref_walk_set_tgt_hint(rw); - rc = cifs_is_path_remote(mnt_ctx); - } + if (tgt.flags & DFSREF_STORAGE_SERVER) { + rc = cifs_mount_get_tcon(mnt_ctx); + if (!rc) + rc = cifs_is_path_remote(mnt_ctx); + if (!rc) + break; + if (rc != -EREMOTE) + continue; + } - dfs_cache_noreq_update_tgthint(ref_path + 1, tit); + if (is_refsrv) { + rc = add_root_smb_session(mnt_ctx); + if (rc) + goto out; + } - if (rc == -EREMOTE && is_refsrv) { - rc2 = add_root_smb_session(mnt_ctx); - if (rc2) - rc = rc2; - } + rc = ref_walk_advance(rw); + if (!rc) { + rc = set_ref_paths(mnt_ctx, &tgt, rw); + if (!rc) { + rc = -EREMOTE; + goto again; + } + } + if (rc != -ELOOP) + goto out; + } + } while (rc && ref_walk_descend(rw)); out: - free_dfs_info_param(&ref); + free_dfs_info_param(&tgt); + return rc; +} + +static int dfs_referral_walk(struct cifs_mount_ctx *mnt_ctx) +{ + struct dfs_ref_walk *rw; + int rc; + + rw = ref_walk_alloc(); + if (IS_ERR(rw)) + return PTR_ERR(rw); + + ref_walk_init(rw); + rc = set_ref_paths(mnt_ctx, NULL, rw); + if (!rc) + rc = __dfs_referral_walk(mnt_ctx, rw); + ref_walk_free(rw); return rc; } @@ -148,105 +230,56 @@ static int __dfs_mount_share(struct cifs_mount_ctx *mnt_ctx) { struct cifs_sb_info *cifs_sb = mnt_ctx->cifs_sb; struct smb3_fs_context *ctx = mnt_ctx->fs_ctx; - char *ref_path = NULL, *full_path = NULL; - struct dfs_cache_tgt_iterator *tit; struct cifs_tcon *tcon; - char *origin_fullpath = NULL; - char sep = CIFS_DIR_SEP(cifs_sb); - int num_links = 0; + char *origin_fullpath; int rc; - ref_path = dfs_get_path(cifs_sb, ctx->UNC); - if (IS_ERR(ref_path)) - return PTR_ERR(ref_path); + origin_fullpath = dfs_get_path(cifs_sb, ctx->source); + if (IS_ERR(origin_fullpath)) + return PTR_ERR(origin_fullpath); - full_path = smb3_fs_context_fullpath(ctx, sep); - if (IS_ERR(full_path)) { - rc = PTR_ERR(full_path); - full_path = NULL; + rc = dfs_referral_walk(mnt_ctx); + if (rc) goto out; - } - origin_fullpath = kstrdup(full_path, GFP_KERNEL); - if (!origin_fullpath) { - rc = -ENOMEM; - goto out; + tcon = mnt_ctx->tcon; + spin_lock(&tcon->tc_lock); + if (!tcon->origin_fullpath) { + tcon->origin_fullpath = origin_fullpath; + origin_fullpath = NULL; } + spin_unlock(&tcon->tc_lock); - do { - struct dfs_cache_tgt_list tl = DFS_CACHE_TGT_LIST_INIT(tl); - - rc = dfs_get_referral(mnt_ctx, ref_path + 1, NULL, &tl); - if (rc) { - rc = cifs_mount_get_tcon(mnt_ctx); - if (!rc) - rc = cifs_is_path_remote(mnt_ctx); - break; - } - - tit = dfs_cache_get_tgt_iterator(&tl); - if (!tit) { - cifs_dbg(VFS, "%s: dfs referral (%s) with no targets\n", __func__, - ref_path + 1); - rc = -ENOENT; - dfs_cache_free_tgts(&tl); - break; - } - - do { - rc = get_dfs_conn(mnt_ctx, ref_path, full_path, tit); - if (!rc) - break; - if (rc == -EREMOTE) { - if (++num_links > MAX_NESTED_LINKS) { - rc = -ELOOP; - break; - } - kfree(ref_path); - kfree(full_path); - ref_path = full_path = NULL; - - full_path = smb3_fs_context_fullpath(ctx, sep); - if (IS_ERR(full_path)) { - rc = PTR_ERR(full_path); - full_path = NULL; - } else { - ref_path = dfs_get_path(cifs_sb, full_path); - if (IS_ERR(ref_path)) { - rc = PTR_ERR(ref_path); - ref_path = NULL; - } - } - break; - } - } while ((tit = dfs_cache_get_next_tgt(&tl, tit))); - dfs_cache_free_tgts(&tl); - } while (rc == -EREMOTE); - - if (!rc) { - tcon = mnt_ctx->tcon; - - spin_lock(&tcon->tc_lock); - if (!tcon->origin_fullpath) { - tcon->origin_fullpath = origin_fullpath; - origin_fullpath = NULL; - } - spin_unlock(&tcon->tc_lock); - - if (list_empty(&tcon->dfs_ses_list)) { - list_replace_init(&mnt_ctx->dfs_ses_list, - &tcon->dfs_ses_list); - queue_delayed_work(dfscache_wq, &tcon->dfs_cache_work, - dfs_cache_get_ttl() * HZ); - } else { - dfs_put_root_smb_sessions(&mnt_ctx->dfs_ses_list); - } + if (list_empty(&tcon->dfs_ses_list)) { + list_replace_init(&mnt_ctx->dfs_ses_list, &tcon->dfs_ses_list); + queue_delayed_work(dfscache_wq, &tcon->dfs_cache_work, + dfs_cache_get_ttl() * HZ); + } else { + dfs_put_root_smb_sessions(&mnt_ctx->dfs_ses_list); } out: kfree(origin_fullpath); - kfree(ref_path); - kfree(full_path); + return rc; +} + +/* + * If @ctx->dfs_automount, then update @ctx->dstaddr earlier with the DFS root + * server from where we'll start following any referrals. Otherwise rely on the + * value provided by mount(2) as the user might not have dns_resolver key set up + * and therefore failing to upcall to resolve UNC hostname under @ctx->source. + */ +static int update_fs_context_dstaddr(struct smb3_fs_context *ctx) +{ + struct sockaddr *addr = (struct sockaddr *)&ctx->dstaddr; + int rc = 0; + + if (!ctx->nodfs && ctx->dfs_automount) { + rc = dns_resolve_server_name_to_ip(ctx->source, addr, NULL); + if (!rc) + cifs_set_port(addr, ctx->port); + ctx->dfs_automount = false; + } return rc; } @@ -256,6 +289,10 @@ int dfs_mount_share(struct cifs_mount_ctx *mnt_ctx, bool *isdfs) bool nodfs = ctx->nodfs; int rc; + rc = update_fs_context_dstaddr(ctx); + if (rc) + return rc; + *isdfs = false; rc = get_session(mnt_ctx, NULL); if (rc) @@ -426,7 +463,7 @@ static int __tree_connect_dfs_target(const unsigned int xid, struct cifs_tcon *t /* Try to tree connect to all dfs targets */ for (; tit; tit = dfs_cache_get_next_tgt(tl, tit)) { const char *target = dfs_cache_get_tgt_name(tit); - struct dfs_cache_tgt_list ntl = DFS_CACHE_TGT_LIST_INIT(ntl); + DFS_CACHE_TGT_LIST(ntl); kfree(share); kfree(prefix); @@ -520,7 +557,7 @@ int cifs_tree_connect(const unsigned int xid, struct cifs_tcon *tcon, const stru int rc; struct TCP_Server_Info *server = tcon->ses->server; const struct smb_version_operations *ops = server->ops; - struct dfs_cache_tgt_list tl = DFS_CACHE_TGT_LIST_INIT(tl); + DFS_CACHE_TGT_LIST(tl); struct cifs_sb_info *cifs_sb = NULL; struct super_block *sb = NULL; struct dfs_info3_param ref = {0}; diff --git a/fs/smb/client/dfs.h b/fs/smb/client/dfs.h index 98e9d2aca6a7..875ab7ae57fc 100644 --- a/fs/smb/client/dfs.h +++ b/fs/smb/client/dfs.h @@ -9,6 +9,110 @@ #include "cifsglob.h" #include "fs_context.h" #include "cifs_unicode.h" +#include <linux/namei.h> + +#define DFS_INTERLINK(v) \ + (((v) & DFSREF_REFERRAL_SERVER) && !((v) & DFSREF_STORAGE_SERVER)) + +struct dfs_ref { + char *path; + char *full_path; + struct dfs_cache_tgt_list tl; + struct dfs_cache_tgt_iterator *tit; +}; + +struct dfs_ref_walk { + struct dfs_ref *ref; + struct dfs_ref refs[MAX_NESTED_LINKS]; +}; + +#define ref_walk_start(w) ((w)->refs) +#define ref_walk_end(w) (&(w)->refs[ARRAY_SIZE((w)->refs) - 1]) +#define ref_walk_cur(w) ((w)->ref) +#define ref_walk_descend(w) (--ref_walk_cur(w) >= ref_walk_start(w)) + +#define ref_walk_tit(w) (ref_walk_cur(w)->tit) +#define ref_walk_empty(w) (!ref_walk_tit(w)) +#define ref_walk_path(w) (ref_walk_cur(w)->path) +#define ref_walk_fpath(w) (ref_walk_cur(w)->full_path) +#define ref_walk_tl(w) (&ref_walk_cur(w)->tl) + +static inline struct dfs_ref_walk *ref_walk_alloc(void) +{ + struct dfs_ref_walk *rw; + + rw = kmalloc(sizeof(*rw), GFP_KERNEL); + if (!rw) + return ERR_PTR(-ENOMEM); + return rw; +} + +static inline void ref_walk_init(struct dfs_ref_walk *rw) +{ + memset(rw, 0, sizeof(*rw)); + ref_walk_cur(rw) = ref_walk_start(rw); +} + +static inline void __ref_walk_free(struct dfs_ref *ref) +{ + kfree(ref->path); + kfree(ref->full_path); + dfs_cache_free_tgts(&ref->tl); + memset(ref, 0, sizeof(*ref)); +} + +static inline void ref_walk_free(struct dfs_ref_walk *rw) +{ + struct dfs_ref *ref = ref_walk_start(rw); + + for (; ref <= ref_walk_end(rw); ref++) + __ref_walk_free(ref); + kfree(rw); +} + +static inline int ref_walk_advance(struct dfs_ref_walk *rw) +{ + struct dfs_ref *ref = ref_walk_cur(rw) + 1; + + if (ref > ref_walk_end(rw)) + return -ELOOP; + __ref_walk_free(ref); + ref_walk_cur(rw) = ref; + return 0; +} + +static inline struct dfs_cache_tgt_iterator * +ref_walk_next_tgt(struct dfs_ref_walk *rw) +{ + struct dfs_cache_tgt_iterator *tit; + struct dfs_ref *ref = ref_walk_cur(rw); + + if (!ref->tit) + tit = dfs_cache_get_tgt_iterator(&ref->tl); + else + tit = dfs_cache_get_next_tgt(&ref->tl, ref->tit); + ref->tit = tit; + return tit; +} + +static inline int ref_walk_get_tgt(struct dfs_ref_walk *rw, + struct dfs_info3_param *tgt) +{ + zfree_dfs_info_param(tgt); + return dfs_cache_get_tgt_referral(ref_walk_path(rw) + 1, + ref_walk_tit(rw), tgt); +} + +static inline int ref_walk_num_tgts(struct dfs_ref_walk *rw) +{ + return dfs_cache_get_nr_tgts(ref_walk_tl(rw)); +} + +static inline void ref_walk_set_tgt_hint(struct dfs_ref_walk *rw) +{ + dfs_cache_noreq_update_tgthint(ref_walk_path(rw) + 1, + ref_walk_tit(rw)); +} struct dfs_root_ses { struct list_head list; @@ -34,43 +138,6 @@ static inline int dfs_get_referral(struct cifs_mount_ctx *mnt_ctx, const char *p cifs_remap(cifs_sb), path, ref, tl); } -/* Return DFS full path out of a dentry set for automount */ -static inline char *dfs_get_automount_devname(struct dentry *dentry, void *page) -{ - struct cifs_sb_info *cifs_sb = CIFS_SB(dentry->d_sb); - struct cifs_tcon *tcon = cifs_sb_master_tcon(cifs_sb); - size_t len; - char *s; - - spin_lock(&tcon->tc_lock); - if (unlikely(!tcon->origin_fullpath)) { - spin_unlock(&tcon->tc_lock); - return ERR_PTR(-EREMOTE); - } - spin_unlock(&tcon->tc_lock); - - s = dentry_path_raw(dentry, page, PATH_MAX); - if (IS_ERR(s)) - return s; - /* for root, we want "" */ - if (!s[1]) - s++; - - spin_lock(&tcon->tc_lock); - len = strlen(tcon->origin_fullpath); - if (s < (char *)page + len) { - spin_unlock(&tcon->tc_lock); - return ERR_PTR(-ENAMETOOLONG); - } - - s -= len; - memcpy(s, tcon->origin_fullpath, len); - spin_unlock(&tcon->tc_lock); - convert_delimiter(s, '/'); - - return s; -} - static inline void dfs_put_root_smb_sessions(struct list_head *head) { struct dfs_root_ses *root, *tmp; diff --git a/fs/smb/client/dfs_cache.c b/fs/smb/client/dfs_cache.c index 33adf43a01f1..508d831fabe3 100644 --- a/fs/smb/client/dfs_cache.c +++ b/fs/smb/client/dfs_cache.c @@ -29,8 +29,6 @@ #define CACHE_MIN_TTL 120 /* 2 minutes */ #define CACHE_DEFAULT_TTL 300 /* 5 minutes */ -#define IS_DFS_INTERLINK(v) (((v) & DFSREF_REFERRAL_SERVER) && !((v) & DFSREF_STORAGE_SERVER)) - struct cache_dfs_tgt { char *name; int path_consumed; @@ -174,7 +172,7 @@ static int dfscache_proc_show(struct seq_file *m, void *v) "cache entry: path=%s,type=%s,ttl=%d,etime=%ld,hdr_flags=0x%x,ref_flags=0x%x,interlink=%s,path_consumed=%d,expired=%s\n", ce->path, ce->srvtype == DFS_TYPE_ROOT ? "root" : "link", ce->ttl, ce->etime.tv_nsec, ce->hdr_flags, ce->ref_flags, - IS_DFS_INTERLINK(ce->hdr_flags) ? "yes" : "no", + DFS_INTERLINK(ce->hdr_flags) ? "yes" : "no", ce->path_consumed, cache_entry_expired(ce) ? "yes" : "no"); list_for_each_entry(t, &ce->tlist, list) { @@ -243,7 +241,7 @@ static inline void dump_ce(const struct cache_entry *ce) ce->srvtype == DFS_TYPE_ROOT ? "root" : "link", ce->ttl, ce->etime.tv_nsec, ce->hdr_flags, ce->ref_flags, - IS_DFS_INTERLINK(ce->hdr_flags) ? "yes" : "no", + DFS_INTERLINK(ce->hdr_flags) ? "yes" : "no", ce->path_consumed, cache_entry_expired(ce) ? "yes" : "no"); dump_tgts(ce); @@ -1177,9 +1175,9 @@ static bool is_ses_good(struct cifs_ses *ses) /* Refresh dfs referral of tcon and mark it for reconnect if needed */ static int __refresh_tcon(const char *path, struct cifs_ses *ses, bool force_refresh) { - struct dfs_cache_tgt_list old_tl = DFS_CACHE_TGT_LIST_INIT(old_tl); - struct dfs_cache_tgt_list new_tl = DFS_CACHE_TGT_LIST_INIT(new_tl); struct TCP_Server_Info *server = ses->server; + DFS_CACHE_TGT_LIST(old_tl); + DFS_CACHE_TGT_LIST(new_tl); bool needs_refresh = false; struct cache_entry *ce; unsigned int xid; diff --git a/fs/smb/client/dfs_cache.h b/fs/smb/client/dfs_cache.h index c6d89cd6d4fd..18a08a2ca93b 100644 --- a/fs/smb/client/dfs_cache.h +++ b/fs/smb/client/dfs_cache.h @@ -16,7 +16,11 @@ extern struct workqueue_struct *dfscache_wq; extern atomic_t dfs_cache_ttl; -#define DFS_CACHE_TGT_LIST_INIT(var) { .tl_numtgts = 0, .tl_list = LIST_HEAD_INIT((var).tl_list), } +#define DFS_CACHE_TGT_LIST_INIT(var) \ + { .tl_numtgts = 0, .tl_list = LIST_HEAD_INIT((var).tl_list), } + +#define DFS_CACHE_TGT_LIST(var) \ + struct dfs_cache_tgt_list var = DFS_CACHE_TGT_LIST_INIT(var) struct dfs_cache_tgt_list { int tl_numtgts; @@ -51,8 +55,8 @@ static inline struct dfs_cache_tgt_iterator * dfs_cache_get_next_tgt(struct dfs_cache_tgt_list *tl, struct dfs_cache_tgt_iterator *it) { - if (!tl || list_empty(&tl->tl_list) || !it || - list_is_last(&it->it_list, &tl->tl_list)) + if (!tl || !tl->tl_numtgts || list_empty(&tl->tl_list) || + !it || list_is_last(&it->it_list, &tl->tl_list)) return NULL; return list_next_entry(it, it_list); } @@ -71,7 +75,7 @@ static inline void dfs_cache_free_tgts(struct dfs_cache_tgt_list *tl) { struct dfs_cache_tgt_iterator *it, *nit; - if (!tl || list_empty(&tl->tl_list)) + if (!tl || !tl->tl_numtgts || list_empty(&tl->tl_list)) return; list_for_each_entry_safe(it, nit, &tl->tl_list, it_list) { list_del(&it->it_list); diff --git a/fs/smb/client/dir.c b/fs/smb/client/dir.c index 30b1e1bfd204..580a27a3a7e6 100644 --- a/fs/smb/client/dir.c +++ b/fs/smb/client/dir.c @@ -797,7 +797,7 @@ cifs_d_revalidate(struct dentry *direntry, unsigned int flags) const struct dentry_operations cifs_dentry_ops = { .d_revalidate = cifs_d_revalidate, - .d_automount = cifs_dfs_d_automount, + .d_automount = cifs_d_automount, /* d_delete: cifs_d_delete, */ /* not needed except for debugging */ }; @@ -872,5 +872,5 @@ const struct dentry_operations cifs_ci_dentry_ops = { .d_revalidate = cifs_d_revalidate, .d_hash = cifs_ci_hash, .d_compare = cifs_ci_compare, - .d_automount = cifs_dfs_d_automount, + .d_automount = cifs_d_automount, }; diff --git a/fs/smb/client/export.c b/fs/smb/client/export.c index 37c28415df1e..d606e8cbcb7d 100644 --- a/fs/smb/client/export.c +++ b/fs/smb/client/export.c @@ -41,13 +41,12 @@ static struct dentry *cifs_get_parent(struct dentry *dentry) } const struct export_operations cifs_export_ops = { + .encode_fh = generic_encode_ino32_fh, .get_parent = cifs_get_parent, -/* Following five export operations are unneeded so far and can default: - .get_dentry = - .get_name = - .find_exported_dentry = - .decode_fh = - .encode_fs = */ +/* + * Following export operations are mandatory for NFS export support: + * .fh_to_dentry = + */ }; #endif /* CONFIG_CIFS_NFSD_EXPORT */ diff --git a/fs/smb/client/file.c b/fs/smb/client/file.c index 6bc44f79d2e9..32a8525415d9 100644 --- a/fs/smb/client/file.c +++ b/fs/smb/client/file.c @@ -1085,7 +1085,8 @@ int cifs_close(struct inode *inode, struct file *file) !test_bit(CIFS_INO_CLOSE_ON_LOCK, &cinode->flags) && dclose) { if (test_and_clear_bit(CIFS_INO_MODIFIED_ATTR, &cinode->flags)) { - inode->i_ctime = inode->i_mtime = current_time(inode); + inode_set_mtime_to_ts(inode, + inode_set_ctime_current(inode)); } spin_lock(&cinode->deferred_lock); cifs_add_deferred_close(cfile, dclose); @@ -2596,7 +2597,7 @@ static int cifs_partialpagewrite(struct page *page, unsigned from, unsigned to) write_data, to - from, &offset); cifsFileInfo_put(open_file); /* Does mm or vfs already set times? */ - inode->i_atime = inode->i_mtime = current_time(inode); + simple_inode_init_ts(inode); if ((bytes_written > 0) && (offset)) rc = 0; else if (bytes_written < 0) @@ -4647,11 +4648,13 @@ static void cifs_readahead(struct readahead_control *ractl) static int cifs_readpage_worker(struct file *file, struct page *page, loff_t *poffset) { + struct inode *inode = file_inode(file); + struct timespec64 atime, mtime; char *read_data; int rc; /* Is the page cached? */ - rc = cifs_readpage_from_fscache(file_inode(file), page); + rc = cifs_readpage_from_fscache(inode, page); if (rc == 0) goto read_complete; @@ -4666,11 +4669,10 @@ static int cifs_readpage_worker(struct file *file, struct page *page, cifs_dbg(FYI, "Bytes read %d\n", rc); /* we do not want atime to be less than mtime, it broke some apps */ - file_inode(file)->i_atime = current_time(file_inode(file)); - if (timespec64_compare(&(file_inode(file)->i_atime), &(file_inode(file)->i_mtime))) - file_inode(file)->i_atime = file_inode(file)->i_mtime; - else - file_inode(file)->i_atime = current_time(file_inode(file)); + atime = inode_set_atime_to_ts(inode, current_time(inode)); + mtime = inode_get_mtime(inode); + if (timespec64_compare(&atime, &mtime) < 0) + inode_set_atime_to_ts(inode, inode_get_mtime(inode)); if (PAGE_SIZE > rc) memset(read_data + rc, 0, PAGE_SIZE - rc); diff --git a/fs/smb/client/fs_context.c b/fs/smb/client/fs_context.c index 67e16c2ac90e..a3493da12ad1 100644 --- a/fs/smb/client/fs_context.c +++ b/fs/smb/client/fs_context.c @@ -150,6 +150,7 @@ const struct fs_parameter_spec smb3_fs_parameters[] = { fsparam_u32("closetimeo", Opt_closetimeo), fsparam_u32("echo_interval", Opt_echo_interval), fsparam_u32("max_credits", Opt_max_credits), + fsparam_u32("max_cached_dirs", Opt_max_cached_dirs), fsparam_u32("handletimeout", Opt_handletimeout), fsparam_u64("snapshot", Opt_snapshot), fsparam_u32("max_channels", Opt_max_channels), @@ -1165,6 +1166,14 @@ static int smb3_fs_context_parse_param(struct fs_context *fc, if (result.uint_32 > 1) ctx->multichannel = true; break; + case Opt_max_cached_dirs: + if (result.uint_32 < 1) { + cifs_errorf(fc, "%s: Invalid max_cached_dirs, needs to be 1 or more\n", + __func__); + goto cifs_parse_mount_err; + } + ctx->max_cached_dirs = result.uint_32; + break; case Opt_handletimeout: ctx->handle_timeout = result.uint_32; if (ctx->handle_timeout > SMB3_MAX_HANDLE_TIMEOUT) { @@ -1532,6 +1541,7 @@ static int smb3_fs_context_parse_param(struct fs_context *fc, cifs_parse_mount_err: kfree_sensitive(ctx->password); + ctx->password = NULL; return -EINVAL; } @@ -1592,7 +1602,7 @@ int smb3_init_fs_context(struct fs_context *fc) ctx->acregmax = CIFS_DEF_ACTIMEO; ctx->acdirmax = CIFS_DEF_ACTIMEO; ctx->closetimeo = SMB3_DEF_DCLOSETIMEO; - + ctx->max_cached_dirs = MAX_CACHED_FIDS; /* Most clients set timeout to 0, allows server to use its default */ ctx->handle_timeout = 0; /* See MS-SMB2 spec section 2.2.14.2.12 */ diff --git a/fs/smb/client/fs_context.h b/fs/smb/client/fs_context.h index f4eaf8558902..cf46916286d0 100644 --- a/fs/smb/client/fs_context.h +++ b/fs/smb/client/fs_context.h @@ -128,6 +128,7 @@ enum cifs_param { Opt_closetimeo, Opt_echo_interval, Opt_max_credits, + Opt_max_cached_dirs, Opt_snapshot, Opt_max_channels, Opt_handletimeout, @@ -261,11 +262,13 @@ struct smb3_fs_context { __u32 handle_timeout; /* persistent and durable handle timeout in ms */ unsigned int max_credits; /* smb3 max_credits 10 < credits < 60000 */ unsigned int max_channels; + unsigned int max_cached_dirs; __u16 compression; /* compression algorithm 0xFFFF default 0=disabled */ bool rootfs:1; /* if it's a SMB root file system */ bool witness:1; /* use witness protocol */ char *leaf_fullpath; struct cifs_ses *dfs_root_ses; + bool dfs_automount:1; /* set for dfs automount only */ }; extern const struct fs_parameter_spec smb3_fs_parameters[]; @@ -287,7 +290,7 @@ extern void smb3_update_mnt_flags(struct cifs_sb_info *cifs_sb); */ #define SMB3_MAX_DCLOSETIMEO (1 << 30) #define SMB3_DEF_DCLOSETIMEO (1 * HZ) /* even 1 sec enough to help eg open/write/close/open/read */ - +#define MAX_CACHED_FIDS 16 extern char *cifs_sanitize_prepath(char *prepath, gfp_t gfp); #endif diff --git a/fs/smb/client/fscache.c b/fs/smb/client/fscache.c index 8f6909d633da..e5cad149f5a2 100644 --- a/fs/smb/client/fscache.c +++ b/fs/smb/client/fscache.c @@ -48,7 +48,7 @@ int cifs_fscache_get_super_cookie(struct cifs_tcon *tcon) sharename = extract_sharename(tcon->tree_name); if (IS_ERR(sharename)) { cifs_dbg(FYI, "%s: couldn't extract sharename\n", __func__); - return -EINVAL; + return PTR_ERR(sharename); } slen = strlen(sharename); @@ -108,6 +108,8 @@ void cifs_fscache_get_inode_cookie(struct inode *inode) &cifsi->uniqueid, sizeof(cifsi->uniqueid), &cd, sizeof(cd), i_size_read(&cifsi->netfs.inode)); + if (cifsi->netfs.cache) + mapping_set_release_always(inode->i_mapping); } void cifs_fscache_unuse_inode_cookie(struct inode *inode, bool update) diff --git a/fs/smb/client/fscache.h b/fs/smb/client/fscache.h index 173999610997..a3d73720914f 100644 --- a/fs/smb/client/fscache.h +++ b/fs/smb/client/fscache.h @@ -49,13 +49,14 @@ static inline void cifs_fscache_fill_coherency(struct inode *inode, struct cifs_fscache_inode_coherency_data *cd) { - struct cifsInodeInfo *cifsi = CIFS_I(inode); + struct timespec64 ctime = inode_get_ctime(inode); + struct timespec64 mtime = inode_get_mtime(inode); memset(cd, 0, sizeof(*cd)); - cd->last_write_time_sec = cpu_to_le64(cifsi->netfs.inode.i_mtime.tv_sec); - cd->last_write_time_nsec = cpu_to_le32(cifsi->netfs.inode.i_mtime.tv_nsec); - cd->last_change_time_sec = cpu_to_le64(cifsi->netfs.inode.i_ctime.tv_sec); - cd->last_change_time_nsec = cpu_to_le32(cifsi->netfs.inode.i_ctime.tv_nsec); + cd->last_write_time_sec = cpu_to_le64(mtime.tv_sec); + cd->last_write_time_nsec = cpu_to_le32(mtime.tv_nsec); + cd->last_change_time_sec = cpu_to_le64(ctime.tv_sec); + cd->last_change_time_nsec = cpu_to_le32(ctime.tv_nsec); } diff --git a/fs/smb/client/inode.c b/fs/smb/client/inode.c index c3eeae07e139..09c5c0f5c96e 100644 --- a/fs/smb/client/inode.c +++ b/fs/smb/client/inode.c @@ -58,13 +58,9 @@ static void cifs_set_ops(struct inode *inode) inode->i_data.a_ops = &cifs_addr_ops; break; case S_IFDIR: -#ifdef CONFIG_CIFS_DFS_UPCALL if (IS_AUTOMOUNT(inode)) { - inode->i_op = &cifs_dfs_referral_inode_operations; + inode->i_op = &cifs_namespace_inode_operations; } else { -#else /* NO DFS support, treat as a directory */ - { -#endif inode->i_op = &cifs_dir_inode_ops; inode->i_fop = &cifs_dir_ops; } @@ -86,6 +82,7 @@ cifs_revalidate_cache(struct inode *inode, struct cifs_fattr *fattr) { struct cifs_fscache_inode_coherency_data cd; struct cifsInodeInfo *cifs_i = CIFS_I(inode); + struct timespec64 mtime; cifs_dbg(FYI, "%s: revalidating inode %llu\n", __func__, cifs_i->uniqueid); @@ -105,7 +102,8 @@ cifs_revalidate_cache(struct inode *inode, struct cifs_fattr *fattr) /* revalidate if mtime or size have changed */ fattr->cf_mtime = timestamp_truncate(fattr->cf_mtime, inode); - if (timespec64_equal(&inode->i_mtime, &fattr->cf_mtime) && + mtime = inode_get_mtime(inode); + if (timespec64_equal(&mtime, &fattr->cf_mtime) && cifs_i->server_eof == fattr->cf_eof) { cifs_dbg(FYI, "%s: inode %llu is unchanged\n", __func__, cifs_i->uniqueid); @@ -168,11 +166,11 @@ cifs_fattr_to_inode(struct inode *inode, struct cifs_fattr *fattr) fattr->cf_ctime = timestamp_truncate(fattr->cf_ctime, inode); /* we do not want atime to be less than mtime, it broke some apps */ if (timespec64_compare(&fattr->cf_atime, &fattr->cf_mtime) < 0) - inode->i_atime = fattr->cf_mtime; + inode_set_atime_to_ts(inode, fattr->cf_mtime); else - inode->i_atime = fattr->cf_atime; - inode->i_mtime = fattr->cf_mtime; - inode->i_ctime = fattr->cf_ctime; + inode_set_atime_to_ts(inode, fattr->cf_atime); + inode_set_mtime_to_ts(inode, fattr->cf_mtime); + inode_set_ctime_to_ts(inode, fattr->cf_ctime); inode->i_rdev = fattr->cf_rdev; cifs_nlink_fattr_to_inode(inode, fattr); inode->i_uid = fattr->cf_uid; @@ -218,7 +216,7 @@ cifs_fattr_to_inode(struct inode *inode, struct cifs_fattr *fattr) } spin_unlock(&inode->i_lock); - if (fattr->cf_flags & CIFS_FATTR_DFS_REFERRAL) + if (fattr->cf_flags & CIFS_FATTR_JUNCTION) inode->i_flags |= S_AUTOMOUNT; if (inode->i_state & I_NEW) cifs_set_ops(inode); @@ -327,14 +325,14 @@ cifs_unix_basic_to_fattr(struct cifs_fattr *fattr, FILE_UNIX_BASIC_INFO *info, * * Needed to setup cifs_fattr data for the directory which is the * junction to the new submount (ie to setup the fake directory - * which represents a DFS referral). + * which represents a DFS referral or reparse mount point). */ -static void -cifs_create_dfs_fattr(struct cifs_fattr *fattr, struct super_block *sb) +static void cifs_create_junction_fattr(struct cifs_fattr *fattr, + struct super_block *sb) { struct cifs_sb_info *cifs_sb = CIFS_SB(sb); - cifs_dbg(FYI, "creating fake fattr for DFS referral\n"); + cifs_dbg(FYI, "%s: creating fake fattr\n", __func__); memset(fattr, 0, sizeof(*fattr)); fattr->cf_mode = S_IFDIR | S_IXUGO | S_IRWXU; @@ -343,7 +341,33 @@ cifs_create_dfs_fattr(struct cifs_fattr *fattr, struct super_block *sb) ktime_get_coarse_real_ts64(&fattr->cf_mtime); fattr->cf_atime = fattr->cf_ctime = fattr->cf_mtime; fattr->cf_nlink = 2; - fattr->cf_flags = CIFS_FATTR_DFS_REFERRAL; + fattr->cf_flags = CIFS_FATTR_JUNCTION; +} + +/* Update inode with final fattr data */ +static int update_inode_info(struct super_block *sb, + struct cifs_fattr *fattr, + struct inode **inode) +{ + struct cifs_sb_info *cifs_sb = CIFS_SB(sb); + int rc = 0; + + if (!*inode) { + *inode = cifs_iget(sb, fattr); + if (!*inode) + rc = -ENOMEM; + return rc; + } + /* We already have inode, update it. + * + * If file type or uniqueid is different, return error. + */ + if (unlikely((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM) && + CIFS_I(*inode)->uniqueid != fattr->cf_uniqueid)) { + CIFS_I(*inode)->time = 0; /* force reval */ + return -ESTALE; + } + return cifs_fattr_to_inode(*inode, fattr); } #ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY @@ -373,7 +397,7 @@ cifs_get_file_info_unix(struct file *filp) if (!rc) { cifs_unix_basic_to_fattr(&fattr, &find_data, cifs_sb); } else if (rc == -EREMOTE) { - cifs_create_dfs_fattr(&fattr, inode->i_sb); + cifs_create_junction_fattr(&fattr, inode->i_sb); rc = 0; } else goto cifs_gfiunix_out; @@ -385,17 +409,18 @@ cifs_gfiunix_out: return rc; } -int cifs_get_inode_info_unix(struct inode **pinode, - const unsigned char *full_path, - struct super_block *sb, unsigned int xid) +static int cifs_get_unix_fattr(const unsigned char *full_path, + struct super_block *sb, + struct cifs_fattr *fattr, + struct inode **pinode, + const unsigned int xid) { - int rc; + struct TCP_Server_Info *server; + struct cifs_sb_info *cifs_sb = CIFS_SB(sb); FILE_UNIX_BASIC_INFO find_data; - struct cifs_fattr fattr; struct cifs_tcon *tcon; - struct TCP_Server_Info *server; struct tcon_link *tlink; - struct cifs_sb_info *cifs_sb = CIFS_SB(sb); + int rc, tmprc; cifs_dbg(FYI, "Getting info on %s\n", full_path); @@ -412,59 +437,60 @@ int cifs_get_inode_info_unix(struct inode **pinode, cifs_put_tlink(tlink); if (!rc) { - cifs_unix_basic_to_fattr(&fattr, &find_data, cifs_sb); + cifs_unix_basic_to_fattr(fattr, &find_data, cifs_sb); } else if (rc == -EREMOTE) { - cifs_create_dfs_fattr(&fattr, sb); + cifs_create_junction_fattr(fattr, sb); rc = 0; } else { return rc; } + if (!*pinode) + cifs_fill_uniqueid(sb, fattr); + /* check for Minshall+French symlinks */ if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MF_SYMLINKS) { - int tmprc = check_mf_symlink(xid, tcon, cifs_sb, &fattr, - full_path); - if (tmprc) - cifs_dbg(FYI, "check_mf_symlink: %d\n", tmprc); + tmprc = check_mf_symlink(xid, tcon, cifs_sb, fattr, full_path); + cifs_dbg(FYI, "check_mf_symlink: %d\n", tmprc); } - if (S_ISLNK(fattr.cf_mode) && !fattr.cf_symlink_target) { + if (S_ISLNK(fattr->cf_mode) && !fattr->cf_symlink_target) { if (!server->ops->query_symlink) return -EOPNOTSUPP; - rc = server->ops->query_symlink(xid, tcon, cifs_sb, full_path, - &fattr.cf_symlink_target, false); - if (rc) { - cifs_dbg(FYI, "%s: query_symlink: %d\n", __func__, rc); - goto cgiiu_exit; - } + rc = server->ops->query_symlink(xid, tcon, + cifs_sb, full_path, + &fattr->cf_symlink_target); + cifs_dbg(FYI, "%s: query_symlink: %d\n", __func__, rc); } + return rc; +} - if (*pinode == NULL) { - /* get new inode */ - cifs_fill_uniqueid(sb, &fattr); - *pinode = cifs_iget(sb, &fattr); - if (!*pinode) - rc = -ENOMEM; - } else { - /* we already have inode, update it */ - - /* if uniqueid is different, return error */ - if (unlikely(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM && - CIFS_I(*pinode)->uniqueid != fattr.cf_uniqueid)) { - CIFS_I(*pinode)->time = 0; /* force reval */ - rc = -ESTALE; - goto cgiiu_exit; - } +int cifs_get_inode_info_unix(struct inode **pinode, + const unsigned char *full_path, + struct super_block *sb, unsigned int xid) +{ + struct cifs_fattr fattr = {}; + int rc; - /* if filetype is different, return error */ - rc = cifs_fattr_to_inode(*pinode, &fattr); - } + rc = cifs_get_unix_fattr(full_path, sb, &fattr, pinode, xid); + if (rc) + goto out; -cgiiu_exit: + rc = update_inode_info(sb, &fattr, pinode); +out: kfree(fattr.cf_symlink_target); return rc; } #else +static inline int cifs_get_unix_fattr(const unsigned char *full_path, + struct super_block *sb, + struct cifs_fattr *fattr, + struct inode **pinode, + const unsigned int xid) +{ + return -EOPNOTSUPP; +} + int cifs_get_inode_info_unix(struct inode **pinode, const unsigned char *full_path, struct super_block *sb, unsigned int xid) @@ -567,6 +593,10 @@ cifs_sfu_type(struct cifs_fattr *fattr, const char *path, cifs_dbg(FYI, "Symlink\n"); fattr->cf_mode |= S_IFLNK; fattr->cf_dtype = DT_LNK; + } else if (memcmp("LnxFIFO", pbuf, 8) == 0) { + cifs_dbg(FYI, "FIFO\n"); + fattr->cf_mode |= S_IFIFO; + fattr->cf_dtype = DT_FIFO; } else { fattr->cf_mode |= S_IFREG; /* file? */ fattr->cf_dtype = DT_REG; @@ -632,10 +662,11 @@ static int cifs_sfu_mode(struct cifs_fattr *fattr, const unsigned char *path, } /* Fill a cifs_fattr struct with info from POSIX info struct */ -static void smb311_posix_info_to_fattr(struct cifs_fattr *fattr, struct cifs_open_info_data *data, +static void smb311_posix_info_to_fattr(struct cifs_fattr *fattr, + struct cifs_open_info_data *data, struct cifs_sid *owner, struct cifs_sid *group, - struct super_block *sb, bool adjust_tz, bool symlink) + struct super_block *sb) { struct smb311_posix_qinfo *info = &data->posix_fi; struct cifs_sb_info *cifs_sb = CIFS_SB(sb); @@ -655,7 +686,7 @@ static void smb311_posix_info_to_fattr(struct cifs_fattr *fattr, struct cifs_ope fattr->cf_ctime = cifs_NTtimeToUnix(info->ChangeTime); fattr->cf_mtime = cifs_NTtimeToUnix(info->LastWriteTime); - if (adjust_tz) { + if (data->adjust_tz) { fattr->cf_ctime.tv_sec += tcon->ses->server->timeAdj; fattr->cf_mtime.tv_sec += tcon->ses->server->timeAdj; } @@ -669,7 +700,7 @@ static void smb311_posix_info_to_fattr(struct cifs_fattr *fattr, struct cifs_ope /* The srv fs device id is overridden on network mount so setting rdev isn't needed here */ /* fattr->cf_rdev = le32_to_cpu(info->DeviceId); */ - if (symlink) { + if (data->symlink) { fattr->cf_mode |= S_IFLNK; fattr->cf_dtype = DT_LNK; fattr->cf_symlink_target = data->symlink_target; @@ -690,9 +721,87 @@ static void smb311_posix_info_to_fattr(struct cifs_fattr *fattr, struct cifs_ope fattr->cf_mode, fattr->cf_uniqueid, fattr->cf_nlink); } -static void cifs_open_info_to_fattr(struct cifs_fattr *fattr, struct cifs_open_info_data *data, - struct super_block *sb, bool adjust_tz, bool symlink, - u32 reparse_tag) +static inline dev_t nfs_mkdev(struct reparse_posix_data *buf) +{ + u64 v = le64_to_cpu(*(__le64 *)buf->DataBuffer); + + return MKDEV(v >> 32, v & 0xffffffff); +} + +bool cifs_reparse_point_to_fattr(struct cifs_sb_info *cifs_sb, + struct cifs_fattr *fattr, + struct cifs_open_info_data *data) +{ + struct reparse_posix_data *buf = data->reparse.posix; + u32 tag = data->reparse.tag; + + if (tag == IO_REPARSE_TAG_NFS && buf) { + switch (le64_to_cpu(buf->InodeType)) { + case NFS_SPECFILE_CHR: + fattr->cf_mode |= S_IFCHR | cifs_sb->ctx->file_mode; + fattr->cf_dtype = DT_CHR; + fattr->cf_rdev = nfs_mkdev(buf); + break; + case NFS_SPECFILE_BLK: + fattr->cf_mode |= S_IFBLK | cifs_sb->ctx->file_mode; + fattr->cf_dtype = DT_BLK; + fattr->cf_rdev = nfs_mkdev(buf); + break; + case NFS_SPECFILE_FIFO: + fattr->cf_mode |= S_IFIFO | cifs_sb->ctx->file_mode; + fattr->cf_dtype = DT_FIFO; + break; + case NFS_SPECFILE_SOCK: + fattr->cf_mode |= S_IFSOCK | cifs_sb->ctx->file_mode; + fattr->cf_dtype = DT_SOCK; + break; + case NFS_SPECFILE_LNK: + fattr->cf_mode = S_IFLNK | cifs_sb->ctx->file_mode; + fattr->cf_dtype = DT_LNK; + break; + default: + WARN_ON_ONCE(1); + return false; + } + return true; + } + + switch (tag) { + case IO_REPARSE_TAG_LX_SYMLINK: + fattr->cf_mode |= S_IFLNK | cifs_sb->ctx->file_mode; + fattr->cf_dtype = DT_LNK; + break; + case IO_REPARSE_TAG_LX_FIFO: + fattr->cf_mode |= S_IFIFO | cifs_sb->ctx->file_mode; + fattr->cf_dtype = DT_FIFO; + break; + case IO_REPARSE_TAG_AF_UNIX: + fattr->cf_mode |= S_IFSOCK | cifs_sb->ctx->file_mode; + fattr->cf_dtype = DT_SOCK; + break; + case IO_REPARSE_TAG_LX_CHR: + fattr->cf_mode |= S_IFCHR | cifs_sb->ctx->file_mode; + fattr->cf_dtype = DT_CHR; + break; + case IO_REPARSE_TAG_LX_BLK: + fattr->cf_mode |= S_IFBLK | cifs_sb->ctx->file_mode; + fattr->cf_dtype = DT_BLK; + break; + case 0: /* SMB1 symlink */ + case IO_REPARSE_TAG_SYMLINK: + case IO_REPARSE_TAG_NFS: + fattr->cf_mode = S_IFLNK | cifs_sb->ctx->file_mode; + fattr->cf_dtype = DT_LNK; + break; + default: + return false; + } + return true; +} + +static void cifs_open_info_to_fattr(struct cifs_fattr *fattr, + struct cifs_open_info_data *data, + struct super_block *sb) { struct smb2_file_all_info *info = &data->fi; struct cifs_sb_info *cifs_sb = CIFS_SB(sb); @@ -711,7 +820,7 @@ static void cifs_open_info_to_fattr(struct cifs_fattr *fattr, struct cifs_open_i fattr->cf_ctime = cifs_NTtimeToUnix(info->ChangeTime); fattr->cf_mtime = cifs_NTtimeToUnix(info->LastWriteTime); - if (adjust_tz) { + if (data->adjust_tz) { fattr->cf_ctime.tv_sec += tcon->ses->server->timeAdj; fattr->cf_mtime.tv_sec += tcon->ses->server->timeAdj; } @@ -719,28 +828,13 @@ static void cifs_open_info_to_fattr(struct cifs_fattr *fattr, struct cifs_open_i fattr->cf_eof = le64_to_cpu(info->EndOfFile); fattr->cf_bytes = le64_to_cpu(info->AllocationSize); fattr->cf_createtime = le64_to_cpu(info->CreationTime); - fattr->cf_nlink = le32_to_cpu(info->NumberOfLinks); - if (reparse_tag == IO_REPARSE_TAG_LX_SYMLINK) { - fattr->cf_mode |= S_IFLNK | cifs_sb->ctx->file_mode; - fattr->cf_dtype = DT_LNK; - } else if (reparse_tag == IO_REPARSE_TAG_LX_FIFO) { - fattr->cf_mode |= S_IFIFO | cifs_sb->ctx->file_mode; - fattr->cf_dtype = DT_FIFO; - } else if (reparse_tag == IO_REPARSE_TAG_AF_UNIX) { - fattr->cf_mode |= S_IFSOCK | cifs_sb->ctx->file_mode; - fattr->cf_dtype = DT_SOCK; - } else if (reparse_tag == IO_REPARSE_TAG_LX_CHR) { - fattr->cf_mode |= S_IFCHR | cifs_sb->ctx->file_mode; - fattr->cf_dtype = DT_CHR; - } else if (reparse_tag == IO_REPARSE_TAG_LX_BLK) { - fattr->cf_mode |= S_IFBLK | cifs_sb->ctx->file_mode; - fattr->cf_dtype = DT_BLK; - } else if (symlink || reparse_tag == IO_REPARSE_TAG_SYMLINK || - reparse_tag == IO_REPARSE_TAG_NFS) { - fattr->cf_mode = S_IFLNK; - fattr->cf_dtype = DT_LNK; - } else if (fattr->cf_cifsattrs & ATTR_DIRECTORY) { + + if (cifs_open_data_reparse(data) && + cifs_reparse_point_to_fattr(cifs_sb, fattr, data)) + goto out_reparse; + + if (fattr->cf_cifsattrs & ATTR_DIRECTORY) { fattr->cf_mode = S_IFDIR | cifs_sb->ctx->dir_mode; fattr->cf_dtype = DT_DIR; /* @@ -769,7 +863,10 @@ static void cifs_open_info_to_fattr(struct cifs_fattr *fattr, struct cifs_open_i } } +out_reparse: if (S_ISLNK(fattr->cf_mode)) { + if (likely(data->symlink_target)) + fattr->cf_eof = strnlen(data->symlink_target, PATH_MAX); fattr->cf_symlink_target = data->symlink_target; data->symlink_target = NULL; } @@ -789,8 +886,6 @@ cifs_get_file_info(struct file *filp) struct cifsFileInfo *cfile = filp->private_data; struct cifs_tcon *tcon = tlink_tcon(cfile->tlink); struct TCP_Server_Info *server = tcon->ses->server; - bool symlink = false; - u32 tag = 0; if (!server->ops->query_file_info) return -ENOSYS; @@ -800,14 +895,15 @@ cifs_get_file_info(struct file *filp) switch (rc) { case 0: /* TODO: add support to query reparse tag */ + data.adjust_tz = false; if (data.symlink_target) { - symlink = true; - tag = IO_REPARSE_TAG_SYMLINK; + data.symlink = true; + data.reparse.tag = IO_REPARSE_TAG_SYMLINK; } - cifs_open_info_to_fattr(&fattr, &data, inode->i_sb, false, symlink, tag); + cifs_open_info_to_fattr(&fattr, &data, inode->i_sb); break; case -EREMOTE: - cifs_create_dfs_fattr(&fattr, inode->i_sb); + cifs_create_junction_fattr(&fattr, inode->i_sb); rc = 0; break; case -EOPNOTSUPP: @@ -960,22 +1056,72 @@ static inline bool is_inode_cache_good(struct inode *ino) return ino && CIFS_CACHE_READ(CIFS_I(ino)) && CIFS_I(ino)->time != 0; } -int cifs_get_inode_info(struct inode **inode, const char *full_path, - struct cifs_open_info_data *data, struct super_block *sb, int xid, - const struct cifs_fid *fid) +static int reparse_info_to_fattr(struct cifs_open_info_data *data, + struct super_block *sb, + const unsigned int xid, + struct cifs_tcon *tcon, + const char *full_path, + struct cifs_fattr *fattr) { + struct TCP_Server_Info *server = tcon->ses->server; + struct cifs_sb_info *cifs_sb = CIFS_SB(sb); + struct kvec rsp_iov, *iov = NULL; + int rsp_buftype = CIFS_NO_BUFFER; + u32 tag = data->reparse.tag; + int rc = 0; + + if (!tag && server->ops->query_reparse_point) { + rc = server->ops->query_reparse_point(xid, tcon, cifs_sb, + full_path, &tag, + &rsp_iov, &rsp_buftype); + if (!rc) + iov = &rsp_iov; + } + + rc = -EOPNOTSUPP; + switch ((data->reparse.tag = tag)) { + case 0: /* SMB1 symlink */ + if (server->ops->query_symlink) { + rc = server->ops->query_symlink(xid, tcon, + cifs_sb, full_path, + &data->symlink_target); + } + break; + case IO_REPARSE_TAG_MOUNT_POINT: + cifs_create_junction_fattr(fattr, sb); + rc = 0; + goto out; + default: + if (data->symlink_target) { + rc = 0; + } else if (server->ops->parse_reparse_point) { + rc = server->ops->parse_reparse_point(cifs_sb, + iov, data); + } + break; + } + + cifs_open_info_to_fattr(fattr, data, sb); +out: + free_rsp_buf(rsp_buftype, rsp_iov.iov_base); + return rc; +} + +static int cifs_get_fattr(struct cifs_open_info_data *data, + struct super_block *sb, int xid, + const struct cifs_fid *fid, + struct cifs_fattr *fattr, + struct inode **inode, + const char *full_path) +{ + struct cifs_open_info_data tmp_data = {}; struct cifs_tcon *tcon; struct TCP_Server_Info *server; struct tcon_link *tlink; struct cifs_sb_info *cifs_sb = CIFS_SB(sb); - bool adjust_tz = false; - struct cifs_fattr fattr = {0}; - bool is_reparse_point = false; - struct cifs_open_info_data tmp_data = {}; void *smb1_backup_rsp_buf = NULL; int rc = 0; int tmprc = 0; - __u32 reparse_tag = 0; tlink = cifs_sb_tlink(cifs_sb); if (IS_ERR(tlink)) @@ -988,12 +1134,8 @@ int cifs_get_inode_info(struct inode **inode, const char *full_path, */ if (!data) { - if (is_inode_cache_good(*inode)) { - cifs_dbg(FYI, "No need to revalidate cached inode sizes\n"); - goto out; - } - rc = server->ops->query_path_info(xid, tcon, cifs_sb, full_path, &tmp_data, - &adjust_tz, &is_reparse_point); + rc = server->ops->query_path_info(xid, tcon, cifs_sb, + full_path, &tmp_data); data = &tmp_data; } @@ -1008,28 +1150,16 @@ int cifs_get_inode_info(struct inode **inode, const char *full_path, * since we have to check if its reparse tag matches a known * special file type e.g. symlink or fifo or char etc. */ - if (is_reparse_point && data->symlink_target) { - reparse_tag = IO_REPARSE_TAG_SYMLINK; - } else if ((le32_to_cpu(data->fi.Attributes) & ATTR_REPARSE) && - server->ops->query_reparse_tag) { - tmprc = server->ops->query_reparse_tag(xid, tcon, cifs_sb, full_path, - &reparse_tag); - if (tmprc) - cifs_dbg(FYI, "%s: query_reparse_tag: rc = %d\n", __func__, tmprc); - if (server->ops->query_symlink) { - tmprc = server->ops->query_symlink(xid, tcon, cifs_sb, full_path, - &data->symlink_target, - is_reparse_point); - if (tmprc) - cifs_dbg(FYI, "%s: query_symlink: rc = %d\n", __func__, - tmprc); - } + if (cifs_open_data_reparse(data)) { + rc = reparse_info_to_fattr(data, sb, xid, tcon, + full_path, fattr); + } else { + cifs_open_info_to_fattr(fattr, data, sb); } - cifs_open_info_to_fattr(&fattr, data, sb, adjust_tz, is_reparse_point, reparse_tag); break; case -EREMOTE: /* DFS link, no metadata available on this server */ - cifs_create_dfs_fattr(&fattr, sb); + cifs_create_junction_fattr(fattr, sb); rc = 0; break; case -EACCES: @@ -1059,8 +1189,8 @@ int cifs_get_inode_info(struct inode **inode, const char *full_path, fdi = (FILE_DIRECTORY_INFO *)fi; si = (SEARCH_ID_FULL_DIR_INFO *)fi; - cifs_dir_info_to_fattr(&fattr, fdi, cifs_sb); - fattr.cf_uniqueid = le64_to_cpu(si->UniqueId); + cifs_dir_info_to_fattr(fattr, fdi, cifs_sb); + fattr->cf_uniqueid = le64_to_cpu(si->UniqueId); /* uniqueid set, skip get inum step */ goto handle_mnt_opt; } else { @@ -1077,10 +1207,10 @@ int cifs_get_inode_info(struct inode **inode, const char *full_path, } /* - * 3. Get or update inode number (fattr.cf_uniqueid) + * 3. Get or update inode number (fattr->cf_uniqueid) */ - cifs_set_fattr_ino(xid, tcon, sb, inode, full_path, data, &fattr); + cifs_set_fattr_ino(xid, tcon, sb, inode, full_path, data, fattr); /* * 4. Tweak fattr based on mount options @@ -1089,17 +1219,17 @@ int cifs_get_inode_info(struct inode **inode, const char *full_path, handle_mnt_opt: #endif /* CONFIG_CIFS_ALLOW_INSECURE_LEGACY */ /* query for SFU type info if supported and needed */ - if (fattr.cf_cifsattrs & ATTR_SYSTEM && - cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL) { - tmprc = cifs_sfu_type(&fattr, full_path, cifs_sb, xid); + if ((fattr->cf_cifsattrs & ATTR_SYSTEM) && + (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL)) { + tmprc = cifs_sfu_type(fattr, full_path, cifs_sb, xid); if (tmprc) cifs_dbg(FYI, "cifs_sfu_type failed: %d\n", tmprc); } /* fill in 0777 bits from ACL */ if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MODE_FROM_SID) { - rc = cifs_acl_to_fattr(cifs_sb, &fattr, *inode, true, - full_path, fid); + rc = cifs_acl_to_fattr(cifs_sb, fattr, *inode, + true, full_path, fid); if (rc == -EREMOTE) rc = 0; if (rc) { @@ -1108,8 +1238,8 @@ handle_mnt_opt: goto out; } } else if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL) { - rc = cifs_acl_to_fattr(cifs_sb, &fattr, *inode, false, - full_path, fid); + rc = cifs_acl_to_fattr(cifs_sb, fattr, *inode, + false, full_path, fid); if (rc == -EREMOTE) rc = 0; if (rc) { @@ -1121,60 +1251,57 @@ handle_mnt_opt: /* fill in remaining high mode bits e.g. SUID, VTX */ if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL) - cifs_sfu_mode(&fattr, full_path, cifs_sb, xid); + cifs_sfu_mode(fattr, full_path, cifs_sb, xid); /* check for Minshall+French symlinks */ if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MF_SYMLINKS) { - tmprc = check_mf_symlink(xid, tcon, cifs_sb, &fattr, - full_path); - if (tmprc) - cifs_dbg(FYI, "check_mf_symlink: %d\n", tmprc); + tmprc = check_mf_symlink(xid, tcon, cifs_sb, fattr, full_path); + cifs_dbg(FYI, "check_mf_symlink: %d\n", tmprc); } - /* - * 5. Update inode with final fattr data - */ - - if (!*inode) { - *inode = cifs_iget(sb, &fattr); - if (!*inode) - rc = -ENOMEM; - } else { - /* we already have inode, update it */ - - /* if uniqueid is different, return error */ - if (unlikely(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM && - CIFS_I(*inode)->uniqueid != fattr.cf_uniqueid)) { - CIFS_I(*inode)->time = 0; /* force reval */ - rc = -ESTALE; - goto out; - } - /* if filetype is different, return error */ - rc = cifs_fattr_to_inode(*inode, &fattr); - } out: cifs_buf_release(smb1_backup_rsp_buf); cifs_put_tlink(tlink); cifs_free_open_info(&tmp_data); + return rc; +} + +int cifs_get_inode_info(struct inode **inode, + const char *full_path, + struct cifs_open_info_data *data, + struct super_block *sb, int xid, + const struct cifs_fid *fid) +{ + struct cifs_fattr fattr = {}; + int rc; + + if (is_inode_cache_good(*inode)) { + cifs_dbg(FYI, "No need to revalidate cached inode sizes\n"); + return 0; + } + + rc = cifs_get_fattr(data, sb, xid, fid, &fattr, inode, full_path); + if (rc) + goto out; + + rc = update_inode_info(sb, &fattr, inode); +out: kfree(fattr.cf_symlink_target); return rc; } -int -smb311_posix_get_inode_info(struct inode **inode, - const char *full_path, - struct super_block *sb, unsigned int xid) +static int smb311_posix_get_fattr(struct cifs_fattr *fattr, + const char *full_path, + struct super_block *sb, + const unsigned int xid) { + struct cifs_open_info_data data = {}; + struct cifs_sb_info *cifs_sb = CIFS_SB(sb); struct cifs_tcon *tcon; struct tcon_link *tlink; - struct cifs_sb_info *cifs_sb = CIFS_SB(sb); - bool adjust_tz = false; - struct cifs_fattr fattr = {0}; - bool symlink = false; - struct cifs_open_info_data data = {}; struct cifs_sid owner, group; - int rc = 0; - int tmprc = 0; + int tmprc; + int rc; tlink = cifs_sb_tlink(cifs_sb); if (IS_ERR(tlink)) @@ -1185,14 +1312,9 @@ smb311_posix_get_inode_info(struct inode **inode, * 1. Fetch file metadata */ - if (is_inode_cache_good(*inode)) { - cifs_dbg(FYI, "No need to revalidate cached inode sizes\n"); - goto out; - } - - rc = smb311_posix_query_path_info(xid, tcon, cifs_sb, full_path, &data, - &owner, &group, &adjust_tz, - &symlink); + rc = smb311_posix_query_path_info(xid, tcon, cifs_sb, + full_path, &data, + &owner, &group); /* * 2. Convert it to internal cifs metadata (fattr) @@ -1200,12 +1322,11 @@ smb311_posix_get_inode_info(struct inode **inode, switch (rc) { case 0: - smb311_posix_info_to_fattr(&fattr, &data, &owner, &group, - sb, adjust_tz, symlink); + smb311_posix_info_to_fattr(fattr, &data, &owner, &group, sb); break; case -EREMOTE: /* DFS link, no metadata available on this server */ - cifs_create_dfs_fattr(&fattr, sb); + cifs_create_junction_fattr(fattr, sb); rc = 0; break; case -EACCES: @@ -1221,49 +1342,42 @@ smb311_posix_get_inode_info(struct inode **inode, goto out; } - /* * 3. Tweak fattr based on mount options */ - /* check for Minshall+French symlinks */ if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MF_SYMLINKS) { - tmprc = check_mf_symlink(xid, tcon, cifs_sb, &fattr, - full_path); - if (tmprc) - cifs_dbg(FYI, "check_mf_symlink: %d\n", tmprc); + tmprc = check_mf_symlink(xid, tcon, cifs_sb, fattr, full_path); + cifs_dbg(FYI, "check_mf_symlink: %d\n", tmprc); } - /* - * 4. Update inode with final fattr data - */ - - if (!*inode) { - *inode = cifs_iget(sb, &fattr); - if (!*inode) - rc = -ENOMEM; - } else { - /* we already have inode, update it */ +out: + cifs_put_tlink(tlink); + cifs_free_open_info(&data); + return rc; +} - /* if uniqueid is different, return error */ - if (unlikely(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM && - CIFS_I(*inode)->uniqueid != fattr.cf_uniqueid)) { - CIFS_I(*inode)->time = 0; /* force reval */ - rc = -ESTALE; - goto out; - } +int smb311_posix_get_inode_info(struct inode **inode, const char *full_path, + struct super_block *sb, const unsigned int xid) +{ + struct cifs_fattr fattr = {}; + int rc; - /* if filetype is different, return error */ - rc = cifs_fattr_to_inode(*inode, &fattr); + if (is_inode_cache_good(*inode)) { + cifs_dbg(FYI, "No need to revalidate cached inode sizes\n"); + return 0; } + + rc = smb311_posix_get_fattr(&fattr, full_path, sb, xid); + if (rc) + goto out; + + rc = update_inode_info(sb, &fattr, inode); out: - cifs_put_tlink(tlink); - cifs_free_open_info(&data); kfree(fattr.cf_symlink_target); return rc; } - static const struct inode_operations cifs_ipc_inode_ops = { .lookup = cifs_lookup, }; @@ -1367,13 +1481,14 @@ retry_iget5_locked: /* gets root inode */ struct inode *cifs_root_iget(struct super_block *sb) { - unsigned int xid; struct cifs_sb_info *cifs_sb = CIFS_SB(sb); - struct inode *inode = NULL; - long rc; + struct cifs_fattr fattr = {}; struct cifs_tcon *tcon = cifs_sb_master_tcon(cifs_sb); + struct inode *inode = NULL; + unsigned int xid; char *path = NULL; int len; + int rc; if ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_USE_PREFIX_PATH) && cifs_sb->prepath) { @@ -1391,21 +1506,29 @@ struct inode *cifs_root_iget(struct super_block *sb) xid = get_xid(); if (tcon->unix_ext) { - rc = cifs_get_inode_info_unix(&inode, path, sb, xid); + rc = cifs_get_unix_fattr(path, sb, &fattr, &inode, xid); /* some servers mistakenly claim POSIX support */ if (rc != -EOPNOTSUPP) - goto iget_no_retry; + goto iget_root; cifs_dbg(VFS, "server does not support POSIX extensions\n"); tcon->unix_ext = false; } convert_delimiter(path, CIFS_DIR_SEP(cifs_sb)); if (tcon->posix_extensions) - rc = smb311_posix_get_inode_info(&inode, path, sb, xid); + rc = smb311_posix_get_fattr(&fattr, path, sb, xid); else - rc = cifs_get_inode_info(&inode, path, NULL, sb, xid, NULL); + rc = cifs_get_fattr(NULL, sb, xid, NULL, &fattr, &inode, path); + +iget_root: + if (!rc) { + if (fattr.cf_flags & CIFS_FATTR_JUNCTION) { + fattr.cf_flags &= ~CIFS_FATTR_JUNCTION; + cifs_autodisable_serverino(cifs_sb); + } + inode = cifs_iget(sb, &fattr); + } -iget_no_retry: if (!inode) { inode = ERR_PTR(rc); goto out; @@ -1429,6 +1552,7 @@ iget_no_retry: out: kfree(path); free_xid(xid); + kfree(fattr.cf_symlink_target); return inode; } @@ -1744,9 +1868,9 @@ out_reval: cifs_inode = CIFS_I(inode); cifs_inode->time = 0; /* will force revalidate to get info when needed */ - inode->i_ctime = current_time(inode); + inode_set_ctime_current(inode); } - dir->i_ctime = dir->i_mtime = current_time(dir); + inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir)); cifs_inode = CIFS_I(dir); CIFS_I(dir)->time = 0; /* force revalidate of dir as well */ unlink_out: @@ -2060,8 +2184,8 @@ int cifs_rmdir(struct inode *inode, struct dentry *direntry) */ cifsInode->time = 0; - d_inode(direntry)->i_ctime = inode->i_ctime = inode->i_mtime = - current_time(inode); + inode_set_ctime_current(d_inode(direntry)); + inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); rmdir_exit: free_dentry_path(page); @@ -2267,9 +2391,6 @@ unlink_target: /* force revalidate to go get info when needed */ CIFS_I(source_dir)->time = CIFS_I(target_dir)->time = 0; - source_dir->i_ctime = source_dir->i_mtime = target_dir->i_ctime = - target_dir->i_mtime = current_time(source_dir); - cifs_rename_exit: kfree(info_buf_source); free_dentry_path(page2); @@ -2540,7 +2661,7 @@ int cifs_getattr(struct mnt_idmap *idmap, const struct path *path, return rc; } - generic_fillattr(&nop_mnt_idmap, inode, stat); + generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat); stat->blksize = cifs_sb->ctx->bsize; stat->ino = CIFS_I(inode)->uniqueid; @@ -2610,7 +2731,7 @@ int cifs_fiemap(struct inode *inode, struct fiemap_extent_info *fei, u64 start, } cifsFileInfo_put(cfile); - return -ENOTSUPP; + return -EOPNOTSUPP; } int cifs_truncate_page(struct address_space *mapping, loff_t from) diff --git a/fs/smb/client/ioctl.c b/fs/smb/client/ioctl.c index f7160003e0ed..e2f92c21fff5 100644 --- a/fs/smb/client/ioctl.c +++ b/fs/smb/client/ioctl.c @@ -117,6 +117,20 @@ out_drop_write: return rc; } +static long smb_mnt_get_tcon_info(struct cifs_tcon *tcon, void __user *arg) +{ + int rc = 0; + struct smb_mnt_tcon_info tcon_inf; + + tcon_inf.tid = tcon->tid; + tcon_inf.session_id = tcon->ses->Suid; + + if (copy_to_user(arg, &tcon_inf, sizeof(struct smb_mnt_tcon_info))) + rc = -EFAULT; + + return rc; +} + static long smb_mnt_get_fsinfo(unsigned int xid, struct cifs_tcon *tcon, void __user *arg) { @@ -129,6 +143,7 @@ static long smb_mnt_get_fsinfo(unsigned int xid, struct cifs_tcon *tcon, fsinf->version = 1; fsinf->protocol_id = tcon->ses->server->vals->protocol_id; + fsinf->tcon_flags = tcon->Flags; fsinf->device_characteristics = le32_to_cpu(tcon->fsDevInfo.DeviceCharacteristics); fsinf->device_type = le32_to_cpu(tcon->fsDevInfo.DeviceType); @@ -414,6 +429,17 @@ long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg) tcon = tlink_tcon(pSMBFile->tlink); rc = smb_mnt_get_fsinfo(xid, tcon, (void __user *)arg); break; + case CIFS_IOC_GET_TCON_INFO: + cifs_sb = CIFS_SB(inode->i_sb); + tlink = cifs_sb_tlink(cifs_sb); + if (IS_ERR(tlink)) { + rc = PTR_ERR(tlink); + break; + } + tcon = tlink_tcon(tlink); + rc = smb_mnt_get_tcon_info(tcon, (void __user *)arg); + cifs_put_tlink(tlink); + break; case CIFS_ENUMERATE_SNAPSHOTS: if (pSMBFile == NULL) break; diff --git a/fs/smb/client/link.c b/fs/smb/client/link.c index c66be4904e1f..a1da50e66fbb 100644 --- a/fs/smb/client/link.c +++ b/fs/smb/client/link.c @@ -42,23 +42,11 @@ symlink_hash(unsigned int link_len, const char *link_str, u8 *md5_hash) rc = cifs_alloc_hash("md5", &md5); if (rc) - goto symlink_hash_err; + return rc; - rc = crypto_shash_init(md5); - if (rc) { - cifs_dbg(VFS, "%s: Could not init md5 shash\n", __func__); - goto symlink_hash_err; - } - rc = crypto_shash_update(md5, link_str, link_len); - if (rc) { - cifs_dbg(VFS, "%s: Could not update with link_str\n", __func__); - goto symlink_hash_err; - } - rc = crypto_shash_final(md5, md5_hash); + rc = crypto_shash_digest(md5, link_str, link_len, md5_hash); if (rc) cifs_dbg(VFS, "%s: Could not generate md5 hash\n", __func__); - -symlink_hash_err: cifs_free_hash(&md5); return rc; } diff --git a/fs/smb/client/misc.c b/fs/smb/client/misc.c index d7e85d9a2655..c2137ea3c253 100644 --- a/fs/smb/client/misc.c +++ b/fs/smb/client/misc.c @@ -113,18 +113,22 @@ sesInfoFree(struct cifs_ses *buf_to_free) } struct cifs_tcon * -tconInfoAlloc(void) +tcon_info_alloc(bool dir_leases_enabled) { struct cifs_tcon *ret_buf; ret_buf = kzalloc(sizeof(*ret_buf), GFP_KERNEL); if (!ret_buf) return NULL; - ret_buf->cfids = init_cached_dirs(); - if (!ret_buf->cfids) { - kfree(ret_buf); - return NULL; + + if (dir_leases_enabled == true) { + ret_buf->cfids = init_cached_dirs(); + if (!ret_buf->cfids) { + kfree(ret_buf); + return NULL; + } } + /* else ret_buf->cfids is already set to NULL above */ atomic_inc(&tconInfoAllocCount); ret_buf->status = TID_NEW; @@ -359,6 +363,10 @@ checkSMB(char *buf, unsigned int total_read, struct TCP_Server_Info *server) cifs_dbg(VFS, "Length less than smb header size\n"); } return -EIO; + } else if (total_read < sizeof(*smb) + 2 * smb->WordCount) { + cifs_dbg(VFS, "%s: can't read BCC due to invalid WordCount(%u)\n", + __func__, smb->WordCount); + return -EIO; } /* otherwise, there is enough to get to the BCC */ @@ -476,7 +484,7 @@ is_valid_oplock_break(char *buffer, struct TCP_Server_Info *srv) return false; /* If server is a channel, select the primary channel */ - pserver = CIFS_SERVER_IS_CHAN(srv) ? srv->primary_server : srv; + pserver = SERVER_IS_CHAN(srv) ? srv->primary_server : srv; /* look up tcon based on tid & uid */ spin_lock(&cifs_tcp_ses_lock); diff --git a/fs/smb/client/cifs_dfs_ref.c b/fs/smb/client/namespace.c index b1c2499b1c3b..a6968573b775 100644 --- a/fs/smb/client/cifs_dfs_ref.c +++ b/fs/smb/client/namespace.c @@ -1,12 +1,12 @@ // SPDX-License-Identifier: GPL-2.0-or-later /* - * Contains the CIFS DFS referral mounting routines used for handling - * traversal via DFS junction point + * Contains mounting routines used for handling traversal via SMB junctions. * * Copyright (c) 2007 Igor Mammedov * Copyright (C) International Business Machines Corp., 2008 * Author(s): Igor Mammedov (niallain@gmail.com) * Steve French (sfrench@us.ibm.com) + * Copyright (c) 2023 Paulo Alcantara <palcantara@suse.de> */ #include <linux/dcache.h> @@ -19,32 +19,31 @@ #include "cifsglob.h" #include "cifsproto.h" #include "cifsfs.h" -#include "dns_resolve.h" #include "cifs_debug.h" -#include "dfs.h" #include "fs_context.h" -static LIST_HEAD(cifs_dfs_automount_list); +static LIST_HEAD(cifs_automount_list); -static void cifs_dfs_expire_automounts(struct work_struct *work); -static DECLARE_DELAYED_WORK(cifs_dfs_automount_task, - cifs_dfs_expire_automounts); -static int cifs_dfs_mountpoint_expiry_timeout = 500 * HZ; +static void cifs_expire_automounts(struct work_struct *work); +static DECLARE_DELAYED_WORK(cifs_automount_task, + cifs_expire_automounts); +static int cifs_mountpoint_expiry_timeout = 500 * HZ; -static void cifs_dfs_expire_automounts(struct work_struct *work) +static void cifs_expire_automounts(struct work_struct *work) { - struct list_head *list = &cifs_dfs_automount_list; + struct list_head *list = &cifs_automount_list; mark_mounts_for_expiry(list); if (!list_empty(list)) - schedule_delayed_work(&cifs_dfs_automount_task, - cifs_dfs_mountpoint_expiry_timeout); + schedule_delayed_work(&cifs_automount_task, + cifs_mountpoint_expiry_timeout); } -void cifs_dfs_release_automount_timer(void) +void cifs_release_automount_timer(void) { - BUG_ON(!list_empty(&cifs_dfs_automount_list)); - cancel_delayed_work_sync(&cifs_dfs_automount_task); + if (WARN_ON(!list_empty(&cifs_automount_list))) + return; + cancel_delayed_work_sync(&cifs_automount_task); } /** @@ -118,26 +117,65 @@ cifs_build_devname(char *nodename, const char *prepath) return dev; } -static int set_dest_addr(struct smb3_fs_context *ctx) +static bool is_dfs_mount(struct dentry *dentry) { - struct sockaddr *addr = (struct sockaddr *)&ctx->dstaddr; - int rc; + struct cifs_sb_info *cifs_sb = CIFS_SB(dentry->d_sb); + struct cifs_tcon *tcon = cifs_sb_master_tcon(cifs_sb); + bool ret; + + spin_lock(&tcon->tc_lock); + ret = !!tcon->origin_fullpath; + spin_unlock(&tcon->tc_lock); + return ret; +} + +/* Return full path out of a dentry set for automount */ +static char *automount_fullpath(struct dentry *dentry, void *page) +{ + struct cifs_sb_info *cifs_sb = CIFS_SB(dentry->d_sb); + struct cifs_tcon *tcon = cifs_sb_master_tcon(cifs_sb); + size_t len; + char *s; + + spin_lock(&tcon->tc_lock); + if (!tcon->origin_fullpath) { + spin_unlock(&tcon->tc_lock); + return build_path_from_dentry_optional_prefix(dentry, + page, + true); + } + spin_unlock(&tcon->tc_lock); + + s = dentry_path_raw(dentry, page, PATH_MAX); + if (IS_ERR(s)) + return s; + /* for root, we want "" */ + if (!s[1]) + s++; + + spin_lock(&tcon->tc_lock); + len = strlen(tcon->origin_fullpath); + if (s < (char *)page + len) { + spin_unlock(&tcon->tc_lock); + return ERR_PTR(-ENAMETOOLONG); + } - rc = dns_resolve_server_name_to_ip(ctx->source, addr, NULL); - if (!rc) - cifs_set_port(addr, ctx->port); - return rc; + s -= len; + memcpy(s, tcon->origin_fullpath, len); + spin_unlock(&tcon->tc_lock); + convert_delimiter(s, '/'); + + return s; } /* * Create a vfsmount that we can automount */ -static struct vfsmount *cifs_dfs_do_automount(struct path *path) +static struct vfsmount *cifs_do_automount(struct path *path) { int rc; struct dentry *mntpt = path->dentry; struct fs_context *fc; - struct cifs_sb_info *cifs_sb; void *page = NULL; struct smb3_fs_context *ctx, *cur_ctx; struct smb3_fs_context tmp; @@ -147,17 +185,7 @@ static struct vfsmount *cifs_dfs_do_automount(struct path *path) if (IS_ROOT(mntpt)) return ERR_PTR(-ESTALE); - /* - * The MSDFS spec states that paths in DFS referral requests and - * responses must be prefixed by a single '\' character instead of - * the double backslashes usually used in the UNC. This function - * gives us the latter, so we must adjust the result. - */ - cifs_sb = CIFS_SB(mntpt->d_sb); - if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_DFS) - return ERR_PTR(-EREMOTE); - - cur_ctx = cifs_sb->ctx; + cur_ctx = CIFS_SB(mntpt->d_sb)->ctx; fc = fs_context_for_submount(path->mnt->mnt_sb->s_type, mntpt); if (IS_ERR(fc)) @@ -166,7 +194,7 @@ static struct vfsmount *cifs_dfs_do_automount(struct path *path) ctx = smb3_fc2context(fc); page = alloc_dentry_path(); - full_path = dfs_get_automount_devname(mntpt, page); + full_path = automount_fullpath(mntpt, page); if (IS_ERR(full_path)) { mnt = ERR_CAST(full_path); goto out; @@ -196,15 +224,11 @@ static struct vfsmount *cifs_dfs_do_automount(struct path *path) ctx->source = NULL; goto out; } - cifs_dbg(FYI, "%s: ctx: source=%s UNC=%s prepath=%s dstaddr=%pISpc\n", - __func__, ctx->source, ctx->UNC, ctx->prepath, &ctx->dstaddr); - - rc = set_dest_addr(ctx); - if (!rc) - mnt = fc_mount(fc); - else - mnt = ERR_PTR(rc); + ctx->dfs_automount = is_dfs_mount(mntpt); + cifs_dbg(FYI, "%s: ctx: source=%s UNC=%s prepath=%s dfs_automount=%d\n", + __func__, ctx->source, ctx->UNC, ctx->prepath, ctx->dfs_automount); + mnt = fc_mount(fc); out: put_fs_context(fc); free_dentry_path(page); @@ -214,25 +238,25 @@ out: /* * Attempt to automount the referral */ -struct vfsmount *cifs_dfs_d_automount(struct path *path) +struct vfsmount *cifs_d_automount(struct path *path) { struct vfsmount *newmnt; cifs_dbg(FYI, "%s: %pd\n", __func__, path->dentry); - newmnt = cifs_dfs_do_automount(path); + newmnt = cifs_do_automount(path); if (IS_ERR(newmnt)) { cifs_dbg(FYI, "leaving %s [automount failed]\n" , __func__); return newmnt; } mntget(newmnt); /* prevent immediate expiration */ - mnt_set_expiry(newmnt, &cifs_dfs_automount_list); - schedule_delayed_work(&cifs_dfs_automount_task, - cifs_dfs_mountpoint_expiry_timeout); + mnt_set_expiry(newmnt, &cifs_automount_list); + schedule_delayed_work(&cifs_automount_task, + cifs_mountpoint_expiry_timeout); cifs_dbg(FYI, "leaving %s [ok]\n" , __func__); return newmnt; } -const struct inode_operations cifs_dfs_referral_inode_operations = { +const struct inode_operations cifs_namespace_inode_operations = { }; diff --git a/fs/smb/client/ntlmssp.h b/fs/smb/client/ntlmssp.h index 2c5dde2ece58..875de43b72de 100644 --- a/fs/smb/client/ntlmssp.h +++ b/fs/smb/client/ntlmssp.h @@ -133,8 +133,8 @@ typedef struct _AUTHENTICATE_MESSAGE { SECURITY_BUFFER WorkstationName; SECURITY_BUFFER SessionKey; __le32 NegotiateFlags; - /* SECURITY_BUFFER for version info not present since we - do not set the version is present flag */ + struct ntlmssp_version Version; + /* SECURITY_BUFFER */ char UserString[]; } __attribute__((packed)) AUTHENTICATE_MESSAGE, *PAUTHENTICATE_MESSAGE; diff --git a/fs/smb/client/readdir.c b/fs/smb/client/readdir.c index ef638086d734..d30ea2005eb3 100644 --- a/fs/smb/client/readdir.c +++ b/fs/smb/client/readdir.c @@ -143,6 +143,7 @@ static bool reparse_file_needs_reval(const struct cifs_fattr *fattr) case IO_REPARSE_TAG_DFSR: case IO_REPARSE_TAG_SYMLINK: case IO_REPARSE_TAG_NFS: + case IO_REPARSE_TAG_MOUNT_POINT: case 0: return true; } @@ -152,6 +153,10 @@ static bool reparse_file_needs_reval(const struct cifs_fattr *fattr) static void cifs_fill_common_info(struct cifs_fattr *fattr, struct cifs_sb_info *cifs_sb) { + struct cifs_open_info_data data = { + .reparse = { .tag = fattr->cf_cifstag, }, + }; + fattr->cf_uid = cifs_sb->ctx->linux_uid; fattr->cf_gid = cifs_sb->ctx->linux_gid; @@ -163,29 +168,19 @@ cifs_fill_common_info(struct cifs_fattr *fattr, struct cifs_sb_info *cifs_sb) * TODO: go through all documented reparse tags to see if we can * reasonably map some of them to directories vs. files vs. symlinks */ + if ((fattr->cf_cifsattrs & ATTR_REPARSE) && + cifs_reparse_point_to_fattr(cifs_sb, fattr, &data)) + goto out_reparse; + if (fattr->cf_cifsattrs & ATTR_DIRECTORY) { fattr->cf_mode = S_IFDIR | cifs_sb->ctx->dir_mode; fattr->cf_dtype = DT_DIR; - } else if (fattr->cf_cifstag == IO_REPARSE_TAG_LX_SYMLINK) { - fattr->cf_mode |= S_IFLNK | cifs_sb->ctx->file_mode; - fattr->cf_dtype = DT_LNK; - } else if (fattr->cf_cifstag == IO_REPARSE_TAG_LX_FIFO) { - fattr->cf_mode |= S_IFIFO | cifs_sb->ctx->file_mode; - fattr->cf_dtype = DT_FIFO; - } else if (fattr->cf_cifstag == IO_REPARSE_TAG_AF_UNIX) { - fattr->cf_mode |= S_IFSOCK | cifs_sb->ctx->file_mode; - fattr->cf_dtype = DT_SOCK; - } else if (fattr->cf_cifstag == IO_REPARSE_TAG_LX_CHR) { - fattr->cf_mode |= S_IFCHR | cifs_sb->ctx->file_mode; - fattr->cf_dtype = DT_CHR; - } else if (fattr->cf_cifstag == IO_REPARSE_TAG_LX_BLK) { - fattr->cf_mode |= S_IFBLK | cifs_sb->ctx->file_mode; - fattr->cf_dtype = DT_BLK; - } else { /* TODO: should we mark some other reparse points (like DFSR) as directories? */ + } else { fattr->cf_mode = S_IFREG | cifs_sb->ctx->file_mode; fattr->cf_dtype = DT_REG; } +out_reparse: /* * We need to revalidate it further to make a decision about whether it * is a symbolic link, DFS referral or a reparse point with a direct diff --git a/fs/smb/client/sess.c b/fs/smb/client/sess.c index c57ca2050b73..2d3b332a79a1 100644 --- a/fs/smb/client/sess.c +++ b/fs/smb/client/sess.c @@ -24,7 +24,7 @@ #include "fs_context.h" static int -cifs_ses_add_channel(struct cifs_sb_info *cifs_sb, struct cifs_ses *ses, +cifs_ses_add_channel(struct cifs_ses *ses, struct cifs_server_iface *iface); bool @@ -69,7 +69,7 @@ bool is_ses_using_iface(struct cifs_ses *ses, struct cifs_server_iface *iface) /* channel helper functions. assumed that chan_lock is held by caller. */ -unsigned int +int cifs_ses_get_chan_index(struct cifs_ses *ses, struct TCP_Server_Info *server) { @@ -85,14 +85,17 @@ cifs_ses_get_chan_index(struct cifs_ses *ses, cifs_dbg(VFS, "unable to get chan index for server: 0x%llx", server->conn_id); WARN_ON(1); - return 0; + return CIFS_INVAL_CHAN_INDEX; } void cifs_chan_set_in_reconnect(struct cifs_ses *ses, struct TCP_Server_Info *server) { - unsigned int chan_index = cifs_ses_get_chan_index(ses, server); + int chan_index = cifs_ses_get_chan_index(ses, server); + + if (chan_index == CIFS_INVAL_CHAN_INDEX) + return; ses->chans[chan_index].in_reconnect = true; } @@ -103,6 +106,9 @@ cifs_chan_clear_in_reconnect(struct cifs_ses *ses, { unsigned int chan_index = cifs_ses_get_chan_index(ses, server); + if (chan_index == CIFS_INVAL_CHAN_INDEX) + return; + ses->chans[chan_index].in_reconnect = false; } @@ -112,6 +118,9 @@ cifs_chan_in_reconnect(struct cifs_ses *ses, { unsigned int chan_index = cifs_ses_get_chan_index(ses, server); + if (chan_index == CIFS_INVAL_CHAN_INDEX) + return true; /* err on the safer side */ + return CIFS_CHAN_IN_RECONNECT(ses, chan_index); } @@ -121,6 +130,9 @@ cifs_chan_set_need_reconnect(struct cifs_ses *ses, { unsigned int chan_index = cifs_ses_get_chan_index(ses, server); + if (chan_index == CIFS_INVAL_CHAN_INDEX) + return; + set_bit(chan_index, &ses->chans_need_reconnect); cifs_dbg(FYI, "Set reconnect bitmask for chan %u; now 0x%lx\n", chan_index, ses->chans_need_reconnect); @@ -132,6 +144,9 @@ cifs_chan_clear_need_reconnect(struct cifs_ses *ses, { unsigned int chan_index = cifs_ses_get_chan_index(ses, server); + if (chan_index == CIFS_INVAL_CHAN_INDEX) + return; + clear_bit(chan_index, &ses->chans_need_reconnect); cifs_dbg(FYI, "Cleared reconnect bitmask for chan %u; now 0x%lx\n", chan_index, ses->chans_need_reconnect); @@ -143,6 +158,9 @@ cifs_chan_needs_reconnect(struct cifs_ses *ses, { unsigned int chan_index = cifs_ses_get_chan_index(ses, server); + if (chan_index == CIFS_INVAL_CHAN_INDEX) + return true; /* err on the safer side */ + return CIFS_CHAN_NEEDS_RECONNECT(ses, chan_index); } @@ -152,19 +170,24 @@ cifs_chan_is_iface_active(struct cifs_ses *ses, { unsigned int chan_index = cifs_ses_get_chan_index(ses, server); + if (chan_index == CIFS_INVAL_CHAN_INDEX) + return true; /* err on the safer side */ + return ses->chans[chan_index].iface && ses->chans[chan_index].iface->is_active; } /* returns number of channels added */ -int cifs_try_adding_channels(struct cifs_sb_info *cifs_sb, struct cifs_ses *ses) +int cifs_try_adding_channels(struct cifs_ses *ses) { struct TCP_Server_Info *server = ses->server; int old_chan_count, new_chan_count; int left; int rc = 0; int tries = 0; + size_t iface_weight = 0, iface_min_speed = 0; struct cifs_server_iface *iface = NULL, *niface = NULL; + struct cifs_server_iface *last_iface = NULL; spin_lock(&ses->chan_lock); @@ -186,28 +209,17 @@ int cifs_try_adding_channels(struct cifs_sb_info *cifs_sb, struct cifs_ses *ses) } if (!(server->capabilities & SMB2_GLOBAL_CAP_MULTI_CHANNEL)) { - ses->chan_max = 1; spin_unlock(&ses->chan_lock); cifs_server_dbg(VFS, "no multichannel support\n"); return 0; } spin_unlock(&ses->chan_lock); - /* - * Keep connecting to same, fastest, iface for all channels as - * long as its RSS. Try next fastest one if not RSS or channel - * creation fails. - */ - spin_lock(&ses->iface_lock); - iface = list_first_entry(&ses->iface_list, struct cifs_server_iface, - iface_head); - spin_unlock(&ses->iface_lock); - while (left > 0) { tries++; if (tries > 3*ses->chan_max) { - cifs_dbg(FYI, "too many channel open attempts (%d channels left to open)\n", + cifs_dbg(VFS, "too many channel open attempts (%d channels left to open)\n", left); break; } @@ -215,23 +227,41 @@ int cifs_try_adding_channels(struct cifs_sb_info *cifs_sb, struct cifs_ses *ses) spin_lock(&ses->iface_lock); if (!ses->iface_count) { spin_unlock(&ses->iface_lock); + cifs_dbg(VFS, "server %s does not advertise interfaces\n", + ses->server->hostname); break; } + if (!iface) + iface = list_first_entry(&ses->iface_list, struct cifs_server_iface, + iface_head); + last_iface = list_last_entry(&ses->iface_list, struct cifs_server_iface, + iface_head); + iface_min_speed = last_iface->speed; + list_for_each_entry_safe_from(iface, niface, &ses->iface_list, iface_head) { + /* do not mix rdma and non-rdma interfaces */ + if (iface->rdma_capable != ses->server->rdma) + continue; + /* skip ifaces that are unusable */ if (!iface->is_active || (is_ses_using_iface(ses, iface) && - !iface->rss_capable)) { + !iface->rss_capable)) + continue; + + /* check if we already allocated enough channels */ + iface_weight = iface->speed / iface_min_speed; + + if (iface->weight_fulfilled >= iface_weight) continue; - } /* take ref before unlock */ kref_get(&iface->refcount); spin_unlock(&ses->iface_lock); - rc = cifs_ses_add_channel(cifs_sb, ses, iface); + rc = cifs_ses_add_channel(ses, iface); spin_lock(&ses->iface_lock); if (rc) { @@ -242,10 +272,21 @@ int cifs_try_adding_channels(struct cifs_sb_info *cifs_sb, struct cifs_ses *ses) continue; } - cifs_dbg(FYI, "successfully opened new channel on iface:%pIS\n", + iface->num_channels++; + iface->weight_fulfilled++; + cifs_dbg(VFS, "successfully opened new channel on iface:%pIS\n", &iface->sockaddr); break; } + + /* reached end of list. reset weight_fulfilled and start over */ + if (list_entry_is_head(iface, &ses->iface_list, iface_head)) { + list_for_each_entry(iface, &ses->iface_list, iface_head) + iface->weight_fulfilled = 0; + spin_unlock(&ses->iface_lock); + iface = NULL; + continue; + } spin_unlock(&ses->iface_lock); left--; @@ -256,6 +297,64 @@ int cifs_try_adding_channels(struct cifs_sb_info *cifs_sb, struct cifs_ses *ses) } /* + * called when multichannel is disabled by the server. + * this always gets called from smb2_reconnect + * and cannot get called in parallel threads. + */ +void +cifs_disable_secondary_channels(struct cifs_ses *ses) +{ + int i, chan_count; + struct TCP_Server_Info *server; + struct cifs_server_iface *iface; + + spin_lock(&ses->chan_lock); + chan_count = ses->chan_count; + if (chan_count == 1) + goto done; + + ses->chan_count = 1; + + /* for all secondary channels reset the need reconnect bit */ + ses->chans_need_reconnect &= 1; + + for (i = 1; i < chan_count; i++) { + iface = ses->chans[i].iface; + server = ses->chans[i].server; + + /* + * remove these references first, since we need to unlock + * the chan_lock here, since iface_lock is a higher lock + */ + ses->chans[i].iface = NULL; + ses->chans[i].server = NULL; + spin_unlock(&ses->chan_lock); + + if (iface) { + spin_lock(&ses->iface_lock); + iface->num_channels--; + if (iface->weight_fulfilled) + iface->weight_fulfilled--; + kref_put(&iface->refcount, release_iface); + spin_unlock(&ses->iface_lock); + } + + if (server) { + if (!server->terminate) { + server->terminate = true; + cifs_signal_cifsd_for_reconnect(server, false); + } + cifs_put_tcp_session(server, false); + } + + spin_lock(&ses->chan_lock); + } + +done: + spin_unlock(&ses->chan_lock); +} + +/* * update the iface for the channel if necessary. * will return 0 when iface is updated, 1 if removed, 2 otherwise * Must be called with chan_lock held. @@ -264,13 +363,16 @@ int cifs_chan_update_iface(struct cifs_ses *ses, struct TCP_Server_Info *server) { unsigned int chan_index; + size_t iface_weight = 0, iface_min_speed = 0; struct cifs_server_iface *iface = NULL; struct cifs_server_iface *old_iface = NULL; + struct cifs_server_iface *last_iface = NULL; + struct sockaddr_storage ss; int rc = 0; spin_lock(&ses->chan_lock); chan_index = cifs_ses_get_chan_index(ses, server); - if (!chan_index) { + if (chan_index == CIFS_INVAL_CHAN_INDEX) { spin_unlock(&ses->chan_lock); return 0; } @@ -284,14 +386,49 @@ cifs_chan_update_iface(struct cifs_ses *ses, struct TCP_Server_Info *server) } spin_unlock(&ses->chan_lock); + spin_lock(&server->srv_lock); + ss = server->dstaddr; + spin_unlock(&server->srv_lock); + spin_lock(&ses->iface_lock); + if (!ses->iface_count) { + spin_unlock(&ses->iface_lock); + cifs_dbg(VFS, "server %s does not advertise interfaces\n", ses->server->hostname); + return 0; + } + + last_iface = list_last_entry(&ses->iface_list, struct cifs_server_iface, + iface_head); + iface_min_speed = last_iface->speed; + /* then look for a new one */ list_for_each_entry(iface, &ses->iface_list, iface_head) { + if (!chan_index) { + /* if we're trying to get the updated iface for primary channel */ + if (!cifs_match_ipaddr((struct sockaddr *) &ss, + (struct sockaddr *) &iface->sockaddr)) + continue; + + kref_get(&iface->refcount); + break; + } + + /* do not mix rdma and non-rdma interfaces */ + if (iface->rdma_capable != server->rdma) + continue; + if (!iface->is_active || (is_ses_using_iface(ses, iface) && !iface->rss_capable)) { continue; } + + /* check if we already allocated enough channels */ + iface_weight = iface->speed / iface_min_speed; + + if (iface->weight_fulfilled >= iface_weight) + continue; + kref_get(&iface->refcount); break; } @@ -302,34 +439,52 @@ cifs_chan_update_iface(struct cifs_ses *ses, struct TCP_Server_Info *server) cifs_dbg(FYI, "unable to find a suitable iface\n"); } + if (!iface) { + cifs_dbg(FYI, "unable to get the interface matching: %pIS\n", + &ss); + spin_unlock(&ses->iface_lock); + return 0; + } + /* now drop the ref to the current iface */ - if (old_iface && iface) { + if (old_iface) { cifs_dbg(FYI, "replacing iface: %pIS with %pIS\n", &old_iface->sockaddr, &iface->sockaddr); + + old_iface->num_channels--; + if (old_iface->weight_fulfilled) + old_iface->weight_fulfilled--; + iface->num_channels++; + iface->weight_fulfilled++; + kref_put(&old_iface->refcount, release_iface); } else if (old_iface) { - cifs_dbg(FYI, "releasing ref to iface: %pIS\n", + /* if a new candidate is not found, keep things as is */ + cifs_dbg(FYI, "could not replace iface: %pIS\n", &old_iface->sockaddr); - kref_put(&old_iface->refcount, release_iface); - } else { - WARN_ON(!iface); - cifs_dbg(FYI, "adding new iface: %pIS\n", &iface->sockaddr); + } else if (!chan_index) { + /* special case: update interface for primary channel */ + if (iface) { + cifs_dbg(FYI, "referencing primary channel iface: %pIS\n", + &iface->sockaddr); + iface->num_channels++; + iface->weight_fulfilled++; + } } spin_unlock(&ses->iface_lock); - spin_lock(&ses->chan_lock); - chan_index = cifs_ses_get_chan_index(ses, server); - ses->chans[chan_index].iface = iface; - - /* No iface is found. if secondary chan, drop connection */ - if (!iface && CIFS_SERVER_IS_CHAN(server)) - ses->chans[chan_index].server = NULL; - - spin_unlock(&ses->chan_lock); + if (iface) { + spin_lock(&ses->chan_lock); + chan_index = cifs_ses_get_chan_index(ses, server); + if (chan_index == CIFS_INVAL_CHAN_INDEX) { + spin_unlock(&ses->chan_lock); + return 0; + } - if (!iface && CIFS_SERVER_IS_CHAN(server)) - cifs_put_tcp_session(server, false); + ses->chans[chan_index].iface = iface; + spin_unlock(&ses->chan_lock); + } return rc; } @@ -355,16 +510,16 @@ cifs_ses_find_chan(struct cifs_ses *ses, struct TCP_Server_Info *server) } static int -cifs_ses_add_channel(struct cifs_sb_info *cifs_sb, struct cifs_ses *ses, +cifs_ses_add_channel(struct cifs_ses *ses, struct cifs_server_iface *iface) { struct TCP_Server_Info *chan_server; struct cifs_chan *chan; - struct smb3_fs_context ctx = {NULL}; + struct smb3_fs_context *ctx; static const char unc_fmt[] = "\\%s\\foo"; - char unc[sizeof(unc_fmt)+SERVER_NAME_LEN_WITH_NULL] = {0}; struct sockaddr_in *ipv4 = (struct sockaddr_in *)&iface->sockaddr; struct sockaddr_in6 *ipv6 = (struct sockaddr_in6 *)&iface->sockaddr; + size_t len; int rc; unsigned int xid = get_xid(); @@ -388,54 +543,64 @@ cifs_ses_add_channel(struct cifs_sb_info *cifs_sb, struct cifs_ses *ses, * the session and server without caring about memory * management. */ + ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); + if (!ctx) { + rc = -ENOMEM; + goto out_free_xid; + } /* Always make new connection for now (TODO?) */ - ctx.nosharesock = true; + ctx->nosharesock = true; /* Auth */ - ctx.domainauto = ses->domainAuto; - ctx.domainname = ses->domainName; + ctx->domainauto = ses->domainAuto; + ctx->domainname = ses->domainName; /* no hostname for extra channels */ - ctx.server_hostname = ""; + ctx->server_hostname = ""; - ctx.username = ses->user_name; - ctx.password = ses->password; - ctx.sectype = ses->sectype; - ctx.sign = ses->sign; + ctx->username = ses->user_name; + ctx->password = ses->password; + ctx->sectype = ses->sectype; + ctx->sign = ses->sign; /* UNC and paths */ /* XXX: Use ses->server->hostname? */ - sprintf(unc, unc_fmt, ses->ip_addr); - ctx.UNC = unc; - ctx.prepath = ""; + len = sizeof(unc_fmt) + SERVER_NAME_LEN_WITH_NULL; + ctx->UNC = kzalloc(len, GFP_KERNEL); + if (!ctx->UNC) { + rc = -ENOMEM; + goto out_free_ctx; + } + scnprintf(ctx->UNC, len, unc_fmt, ses->ip_addr); + ctx->prepath = ""; /* Reuse same version as master connection */ - ctx.vals = ses->server->vals; - ctx.ops = ses->server->ops; + ctx->vals = ses->server->vals; + ctx->ops = ses->server->ops; - ctx.noblocksnd = ses->server->noblocksnd; - ctx.noautotune = ses->server->noautotune; - ctx.sockopt_tcp_nodelay = ses->server->tcp_nodelay; - ctx.echo_interval = ses->server->echo_interval / HZ; - ctx.max_credits = ses->server->max_credits; + ctx->noblocksnd = ses->server->noblocksnd; + ctx->noautotune = ses->server->noautotune; + ctx->sockopt_tcp_nodelay = ses->server->tcp_nodelay; + ctx->echo_interval = ses->server->echo_interval / HZ; + ctx->max_credits = ses->server->max_credits; /* * This will be used for encoding/decoding user/domain/pw * during sess setup auth. */ - ctx.local_nls = cifs_sb->local_nls; + ctx->local_nls = ses->local_nls; /* Use RDMA if possible */ - ctx.rdma = iface->rdma_capable; - memcpy(&ctx.dstaddr, &iface->sockaddr, sizeof(struct sockaddr_storage)); + ctx->rdma = iface->rdma_capable; + memcpy(&ctx->dstaddr, &iface->sockaddr, sizeof(ctx->dstaddr)); /* reuse master con client guid */ - memcpy(&ctx.client_guid, ses->server->client_guid, - SMB2_CLIENT_GUID_SIZE); - ctx.use_client_guid = true; + memcpy(&ctx->client_guid, ses->server->client_guid, + sizeof(ctx->client_guid)); + ctx->use_client_guid = true; - chan_server = cifs_get_tcp_session(&ctx, ses->server); + chan_server = cifs_get_tcp_session(ctx, ses->server); spin_lock(&ses->chan_lock); chan = &ses->chans[ses->chan_count]; @@ -470,20 +635,16 @@ cifs_ses_add_channel(struct cifs_sb_info *cifs_sb, struct cifs_ses *ses, rc = cifs_negotiate_protocol(xid, ses, chan->server); if (!rc) - rc = cifs_setup_session(xid, ses, chan->server, cifs_sb->local_nls); + rc = cifs_setup_session(xid, ses, chan->server, ses->local_nls); mutex_unlock(&ses->session_mutex); out: if (rc && chan->server) { - /* - * we should avoid race with these delayed works before we - * remove this channel - */ - cancel_delayed_work_sync(&chan->server->echo); - cancel_delayed_work_sync(&chan->server->reconnect); + cifs_put_tcp_session(chan->server, 0); spin_lock(&ses->chan_lock); + /* we rely on all bits beyond chan_count to be clear */ cifs_chan_clear_need_reconnect(ses, chan->server); ses->chan_count--; @@ -493,10 +654,12 @@ out: */ WARN_ON(ses->chan_count < 1); spin_unlock(&ses->chan_lock); - - cifs_put_tcp_session(chan->server, 0); } + kfree(ctx->UNC); +out_free_ctx: + kfree(ctx); +out_free_xid: free_xid(xid); return rc; } @@ -522,8 +685,7 @@ static __u32 cifs_ssetup_hdr(struct cifs_ses *ses, /* Now no need to set SMBFLG_CASELESS or obsolete CANONICAL PATH */ - /* BB verify whether signing required on neg or just on auth frame - (and NTLM case) */ + /* BB verify whether signing required on neg or just auth frame (and NTLM case) */ capabilities = CAP_LARGE_FILES | CAP_NT_SMBS | CAP_LEVEL_II_OPLOCKS | CAP_LARGE_WRITE_X | CAP_LARGE_READ_X; @@ -580,8 +742,10 @@ static void unicode_domain_string(char **pbcc_area, struct cifs_ses *ses, /* copy domain */ if (ses->domainName == NULL) { - /* Sending null domain better than using a bogus domain name (as - we did briefly in 2.6.18) since server will use its default */ + /* + * Sending null domain better than using a bogus domain name (as + * we did briefly in 2.6.18) since server will use its default + */ *bcc_ptr = 0; *(bcc_ptr+1) = 0; bytes_ret = 0; @@ -600,8 +764,7 @@ static void unicode_ssetup_strings(char **pbcc_area, struct cifs_ses *ses, char *bcc_ptr = *pbcc_area; int bytes_ret = 0; - /* BB FIXME add check that strings total less - than 335 or will need to send them as arrays */ + /* BB FIXME add check that strings less than 335 or will need to send as arrays */ /* copy user */ if (ses->user_name == NULL) { @@ -646,8 +809,7 @@ static void ascii_ssetup_strings(char **pbcc_area, struct cifs_ses *ses, if (WARN_ON_ONCE(len < 0)) len = CIFS_MAX_DOMAINNAME_LEN - 1; bcc_ptr += len; - } /* else we will send a null domain name - so the server will default to its own domain */ + } /* else we send a null domain name so server will default to its own domain */ *bcc_ptr = 0; bcc_ptr++; @@ -743,11 +905,14 @@ static void decode_ascii_ssetup(char **pbcc_area, __u16 bleft, if (len > bleft) return; - /* No domain field in LANMAN case. Domain is - returned by old servers in the SMB negprot response */ - /* BB For newer servers which do not support Unicode, - but thus do return domain here we could add parsing - for it later, but it is not very important */ + /* + * No domain field in LANMAN case. Domain is + * returned by old servers in the SMB negprot response + * + * BB For newer servers which do not support Unicode, + * but thus do return domain here, we could add parsing + * for it later, but it is not very important + */ cifs_dbg(FYI, "ascii: bytes left %d\n", bleft); } #endif /* CONFIG_CIFS_ALLOW_INSECURE_LEGACY */ @@ -803,9 +968,12 @@ int decode_ntlmssp_challenge(char *bcc_ptr, int blob_len, ses->ntlmssp->server_flags = server_flags; memcpy(ses->ntlmssp->cryptkey, pblob->Challenge, CIFS_CRYPTO_KEY_SIZE); - /* In particular we can examine sign flags */ - /* BB spec says that if AvId field of MsvAvTimestamp is populated then - we must set the MIC field of the AUTHENTICATE_MESSAGE */ + /* + * In particular we can examine sign flags + * + * BB spec says that if AvId field of MsvAvTimestamp is populated then + * we must set the MIC field of the AUTHENTICATE_MESSAGE + */ tioffset = le32_to_cpu(pblob->TargetInfoArray.BufferOffset); tilen = le16_to_cpu(pblob->TargetInfoArray.Length); @@ -1046,10 +1214,16 @@ int build_ntlmssp_auth_blob(unsigned char **pbuffer, memcpy(sec_blob->Signature, NTLMSSP_SIGNATURE, 8); sec_blob->MessageType = NtLmAuthenticate; + /* send version information in ntlmssp authenticate also */ flags = ses->ntlmssp->server_flags | NTLMSSP_REQUEST_TARGET | - NTLMSSP_NEGOTIATE_TARGET_INFO | NTLMSSP_NEGOTIATE_WORKSTATION_SUPPLIED; - /* we only send version information in ntlmssp negotiate, so do not set this flag */ - flags = flags & ~NTLMSSP_NEGOTIATE_VERSION; + NTLMSSP_NEGOTIATE_TARGET_INFO | NTLMSSP_NEGOTIATE_VERSION | + NTLMSSP_NEGOTIATE_WORKSTATION_SUPPLIED; + + sec_blob->Version.ProductMajorVersion = LINUX_VERSION_MAJOR; + sec_blob->Version.ProductMinorVersion = LINUX_VERSION_PATCHLEVEL; + sec_blob->Version.ProductBuild = cpu_to_le16(SMB3_PRODUCT_BUILD); + sec_blob->Version.NTLMRevisionCurrent = NTLMSSP_REVISION_W2K3; + tmp = *pbuffer + sizeof(AUTHENTICATE_MESSAGE); sec_blob->NegotiateFlags = cpu_to_le32(flags); diff --git a/fs/smb/client/smb1ops.c b/fs/smb/client/smb1ops.c index 7d1b3fc014d9..a9eaba8083b0 100644 --- a/fs/smb/client/smb1ops.c +++ b/fs/smb/client/smb1ops.c @@ -542,14 +542,17 @@ cifs_is_path_accessible(const unsigned int xid, struct cifs_tcon *tcon, return rc; } -static int cifs_query_path_info(const unsigned int xid, struct cifs_tcon *tcon, - struct cifs_sb_info *cifs_sb, const char *full_path, - struct cifs_open_info_data *data, bool *adjustTZ, bool *symlink) +static int cifs_query_path_info(const unsigned int xid, + struct cifs_tcon *tcon, + struct cifs_sb_info *cifs_sb, + const char *full_path, + struct cifs_open_info_data *data) { int rc; FILE_ALL_INFO fi = {}; - *symlink = false; + data->symlink = false; + data->adjust_tz = false; /* could do find first instead but this returns more info */ rc = CIFSSMBQPathInfo(xid, tcon, full_path, &fi, 0 /* not legacy */, cifs_sb->local_nls, @@ -562,7 +565,7 @@ static int cifs_query_path_info(const unsigned int xid, struct cifs_tcon *tcon, if ((rc == -EOPNOTSUPP) || (rc == -EINVAL)) { rc = SMBQueryInformation(xid, tcon, full_path, &fi, cifs_sb->local_nls, cifs_remap(cifs_sb)); - *adjustTZ = true; + data->adjust_tz = true; } if (!rc) { @@ -589,7 +592,7 @@ static int cifs_query_path_info(const unsigned int xid, struct cifs_tcon *tcon, /* Need to check if this is a symbolic link or not */ tmprc = CIFS_open(xid, &oparms, &oplock, NULL); if (tmprc == -EOPNOTSUPP) - *symlink = true; + data->symlink = true; else if (tmprc == 0) CIFSSMBClose(xid, tcon, fid.netfid); } @@ -969,65 +972,41 @@ cifs_unix_dfs_readlink(const unsigned int xid, struct cifs_tcon *tcon, #endif } -static int -cifs_query_symlink(const unsigned int xid, struct cifs_tcon *tcon, - struct cifs_sb_info *cifs_sb, const char *full_path, - char **target_path, bool is_reparse_point) +static int cifs_query_symlink(const unsigned int xid, + struct cifs_tcon *tcon, + struct cifs_sb_info *cifs_sb, + const char *full_path, + char **target_path) { int rc; - int oplock = 0; - struct cifs_fid fid; - struct cifs_open_parms oparms; - cifs_dbg(FYI, "%s: path: %s\n", __func__, full_path); + cifs_tcon_dbg(FYI, "%s: path=%s\n", __func__, full_path); - if (is_reparse_point) { - cifs_dbg(VFS, "reparse points not handled for SMB1 symlinks\n"); + if (!cap_unix(tcon->ses)) return -EOPNOTSUPP; - } - - /* Check for unix extensions */ - if (cap_unix(tcon->ses)) { - rc = CIFSSMBUnixQuerySymLink(xid, tcon, full_path, target_path, - cifs_sb->local_nls, - cifs_remap(cifs_sb)); - if (rc == -EREMOTE) - rc = cifs_unix_dfs_readlink(xid, tcon, full_path, - target_path, - cifs_sb->local_nls); - goto out; - } - - oparms = (struct cifs_open_parms) { - .tcon = tcon, - .cifs_sb = cifs_sb, - .desired_access = FILE_READ_ATTRIBUTES, - .create_options = cifs_create_options(cifs_sb, - OPEN_REPARSE_POINT), - .disposition = FILE_OPEN, - .path = full_path, - .fid = &fid, - }; - - rc = CIFS_open(xid, &oparms, &oplock, NULL); - if (rc) - goto out; - - rc = CIFSSMBQuerySymLink(xid, tcon, fid.netfid, target_path, - cifs_sb->local_nls); - if (rc) - goto out_close; - - convert_delimiter(*target_path, '/'); -out_close: - CIFSSMBClose(xid, tcon, fid.netfid); -out: - if (!rc) - cifs_dbg(FYI, "%s: target path: %s\n", __func__, *target_path); + rc = CIFSSMBUnixQuerySymLink(xid, tcon, full_path, target_path, + cifs_sb->local_nls, cifs_remap(cifs_sb)); + if (rc == -EREMOTE) + rc = cifs_unix_dfs_readlink(xid, tcon, full_path, + target_path, cifs_sb->local_nls); return rc; } +static int cifs_parse_reparse_point(struct cifs_sb_info *cifs_sb, + struct kvec *rsp_iov, + struct cifs_open_info_data *data) +{ + struct reparse_data_buffer *buf; + TRANSACT_IOCTL_RSP *io = rsp_iov->iov_base; + bool unicode = !!(io->hdr.Flags2 & SMBFLG2_UNICODE); + u32 plen = le16_to_cpu(io->ByteCount); + + buf = (struct reparse_data_buffer *)((__u8 *)&io->hdr.Protocol + + le32_to_cpu(io->DataOffset)); + return parse_reparse_point(buf, plen, cifs_sb, unicode, data); +} + static bool cifs_is_read_op(__u32 oplock) { @@ -1062,15 +1041,7 @@ cifs_make_node(unsigned int xid, struct inode *inode, { struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); struct inode *newinode = NULL; - int rc = -EPERM; - struct cifs_open_info_data buf = {}; - struct cifs_io_parms io_parms; - __u32 oplock = 0; - struct cifs_fid fid; - struct cifs_open_parms oparms; - unsigned int bytes_written; - struct win_dev *pdev; - struct kvec iov[2]; + int rc; if (tcon->unix_ext) { /* @@ -1104,74 +1075,18 @@ cifs_make_node(unsigned int xid, struct inode *inode, d_instantiate(dentry, newinode); return rc; } - /* - * SMB1 SFU emulation: should work with all servers, but only - * support block and char device (no socket & fifo) + * Check if mounted with mount parm 'sfu' mount parm. + * SFU emulation should work with all servers, but only + * supports block and char device (no socket & fifo), + * and was used by default in earlier versions of Windows */ if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL)) - return rc; - - if (!S_ISCHR(mode) && !S_ISBLK(mode)) - return rc; - - cifs_dbg(FYI, "sfu compat create special file\n"); - - oparms = (struct cifs_open_parms) { - .tcon = tcon, - .cifs_sb = cifs_sb, - .desired_access = GENERIC_WRITE, - .create_options = cifs_create_options(cifs_sb, CREATE_NOT_DIR | - CREATE_OPTION_SPECIAL), - .disposition = FILE_CREATE, - .path = full_path, - .fid = &fid, - }; - - if (tcon->ses->server->oplocks) - oplock = REQ_OPLOCK; - else - oplock = 0; - rc = tcon->ses->server->ops->open(xid, &oparms, &oplock, &buf); - if (rc) - return rc; - - /* - * BB Do not bother to decode buf since no local inode yet to put - * timestamps in, but we can reuse it safely. - */ - - pdev = (struct win_dev *)&buf.fi; - io_parms.pid = current->tgid; - io_parms.tcon = tcon; - io_parms.offset = 0; - io_parms.length = sizeof(struct win_dev); - iov[1].iov_base = &buf.fi; - iov[1].iov_len = sizeof(struct win_dev); - if (S_ISCHR(mode)) { - memcpy(pdev->type, "IntxCHR", 8); - pdev->major = cpu_to_le64(MAJOR(dev)); - pdev->minor = cpu_to_le64(MINOR(dev)); - rc = tcon->ses->server->ops->sync_write(xid, &fid, &io_parms, - &bytes_written, iov, 1); - } else if (S_ISBLK(mode)) { - memcpy(pdev->type, "IntxBLK", 8); - pdev->major = cpu_to_le64(MAJOR(dev)); - pdev->minor = cpu_to_le64(MINOR(dev)); - rc = tcon->ses->server->ops->sync_write(xid, &fid, &io_parms, - &bytes_written, iov, 1); - } - tcon->ses->server->ops->close(xid, tcon, &fid); - d_drop(dentry); - - /* FIXME: add code here to set EAs */ - - cifs_free_open_info(&buf); - return rc; + return -EPERM; + return cifs_sfu_make_node(xid, inode, dentry, tcon, + full_path, mode, dev); } - - struct smb_version_operations smb1_operations = { .send_cancel = send_nt_cancel, .compare_fids = cifs_compare_fids, @@ -1208,6 +1123,7 @@ struct smb_version_operations smb1_operations = { .is_path_accessible = cifs_is_path_accessible, .can_echo = cifs_can_echo, .query_path_info = cifs_query_path_info, + .query_reparse_point = cifs_query_reparse_point, .query_file_info = cifs_query_file_info, .get_srv_inum = cifs_get_srv_inum, .set_path_size = CIFSSMBSetEOF, @@ -1223,6 +1139,7 @@ struct smb_version_operations smb1_operations = { .rename = CIFSSMBRename, .create_hardlink = CIFSCreateHardLink, .query_symlink = cifs_query_symlink, + .parse_reparse_point = cifs_parse_reparse_point, .open = cifs_open_file, .set_fid = cifs_set_fid, .close = cifs_close_file, diff --git a/fs/smb/client/smb2inode.c b/fs/smb/client/smb2inode.c index 8e696fbd72fa..c94940af5d4b 100644 --- a/fs/smb/client/smb2inode.c +++ b/fs/smb/client/smb2inode.c @@ -35,34 +35,22 @@ free_set_inf_compound(struct smb_rqst *rqst) SMB2_close_free(&rqst[2]); } - -struct cop_vars { - struct cifs_open_parms oparms; - struct kvec rsp_iov[3]; - struct smb_rqst rqst[3]; - struct kvec open_iov[SMB2_CREATE_IOV_SIZE]; - struct kvec qi_iov[1]; - struct kvec si_iov[SMB2_SET_INFO_IOV_SIZE]; - struct kvec close_iov[1]; - struct smb2_file_rename_info rename_info; - struct smb2_file_link_info link_info; -}; - /* * note: If cfile is passed, the reference to it is dropped here. * So make sure that you do not reuse cfile after return from this func. * - * If passing @err_iov and @err_buftype, ensure to make them both large enough (>= 3) to hold all - * error responses. Caller is also responsible for freeing them up. + * If passing @out_iov and @out_buftype, ensure to make them both large enough + * (>= 3) to hold all compounded responses. Caller is also responsible for + * freeing them up with free_rsp_buf(). */ static int smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon, struct cifs_sb_info *cifs_sb, const char *full_path, __u32 desired_access, __u32 create_disposition, __u32 create_options, umode_t mode, void *ptr, int command, struct cifsFileInfo *cfile, __u8 **extbuf, size_t *extbuflen, - struct kvec *err_iov, int *err_buftype) + struct kvec *out_iov, int *out_buftype) { - struct cop_vars *vars = NULL; + struct smb2_compound_vars *vars = NULL; struct kvec *rsp_iov; struct smb_rqst *rqst; int rc; @@ -133,7 +121,7 @@ static int smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon, /* Operation */ switch (command) { case SMB2_OP_QUERY_INFO: - rqst[num_rqst].rq_iov = &vars->qi_iov[0]; + rqst[num_rqst].rq_iov = &vars->qi_iov; rqst[num_rqst].rq_nvec = 1; if (cfile) @@ -167,7 +155,7 @@ static int smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon, full_path); break; case SMB2_OP_POSIX_QUERY_INFO: - rqst[num_rqst].rq_iov = &vars->qi_iov[0]; + rqst[num_rqst].rq_iov = &vars->qi_iov; rqst[num_rqst].rq_nvec = 1; if (cfile) @@ -375,7 +363,7 @@ static int smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon, goto after_close; /* Close */ flags |= CIFS_CP_CREATE_CLOSE_OP; - rqst[num_rqst].rq_iov = &vars->close_iov[0]; + rqst[num_rqst].rq_iov = &vars->close_iov; rqst[num_rqst].rq_nvec = 1; rc = SMB2_close_init(tcon, server, &rqst[num_rqst], COMPOUND_FID, @@ -529,9 +517,9 @@ static int smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon, if (cfile) cifsFileInfo_put(cfile); - if (rc && err_iov && err_buftype) { - memcpy(err_iov, rsp_iov, 3 * sizeof(*err_iov)); - memcpy(err_buftype, resp_buftype, 3 * sizeof(*err_buftype)); + if (out_iov && out_buftype) { + memcpy(out_iov, rsp_iov, 3 * sizeof(*out_iov)); + memcpy(out_buftype, resp_buftype, 3 * sizeof(*out_buftype)); } else { free_rsp_buf(resp_buftype[0], rsp_iov[0].iov_base); free_rsp_buf(resp_buftype[1], rsp_iov[1].iov_base); @@ -541,20 +529,53 @@ static int smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon, return rc; } -int smb2_query_path_info(const unsigned int xid, struct cifs_tcon *tcon, - struct cifs_sb_info *cifs_sb, const char *full_path, - struct cifs_open_info_data *data, bool *adjust_tz, bool *reparse) +static int parse_create_response(struct cifs_open_info_data *data, + struct cifs_sb_info *cifs_sb, + const struct kvec *iov) +{ + struct smb2_create_rsp *rsp = iov->iov_base; + bool reparse_point = false; + u32 tag = 0; + int rc = 0; + + switch (rsp->hdr.Status) { + case STATUS_IO_REPARSE_TAG_NOT_HANDLED: + reparse_point = true; + break; + case STATUS_STOPPED_ON_SYMLINK: + rc = smb2_parse_symlink_response(cifs_sb, iov, + &data->symlink_target); + if (rc) + return rc; + tag = IO_REPARSE_TAG_SYMLINK; + reparse_point = true; + break; + case STATUS_SUCCESS: + reparse_point = !!(rsp->Flags & SMB2_CREATE_FLAG_REPARSEPOINT); + break; + } + data->reparse_point = reparse_point; + data->reparse.tag = tag; + return rc; +} + +int smb2_query_path_info(const unsigned int xid, + struct cifs_tcon *tcon, + struct cifs_sb_info *cifs_sb, + const char *full_path, + struct cifs_open_info_data *data) { __u32 create_options = 0; struct cifsFileInfo *cfile; struct cached_fid *cfid = NULL; - struct kvec err_iov[3] = {}; - int err_buftype[3] = {}; + struct smb2_hdr *hdr; + struct kvec out_iov[3] = {}; + int out_buftype[3] = {}; bool islink; int rc, rc2; - *adjust_tz = false; - *reparse = false; + data->adjust_tz = false; + data->reparse_point = false; if (strcmp(full_path, "")) rc = -ENOENT; @@ -575,69 +596,73 @@ int smb2_query_path_info(const unsigned int xid, struct cifs_tcon *tcon, cifs_get_readable_path(tcon, full_path, &cfile); rc = smb2_compound_op(xid, tcon, cifs_sb, full_path, FILE_READ_ATTRIBUTES, FILE_OPEN, create_options, ACL_NO_MODE, data, SMB2_OP_QUERY_INFO, cfile, - NULL, NULL, err_iov, err_buftype); - if (rc) { - struct smb2_hdr *hdr = err_iov[0].iov_base; - - if (unlikely(!hdr || err_buftype[0] == CIFS_NO_BUFFER)) + NULL, NULL, out_iov, out_buftype); + hdr = out_iov[0].iov_base; + /* + * If first iov is unset, then SMB session was dropped or we've got a + * cached open file (@cfile). + */ + if (!hdr || out_buftype[0] == CIFS_NO_BUFFER) + goto out; + + switch (rc) { + case 0: + case -EOPNOTSUPP: + rc = parse_create_response(data, cifs_sb, &out_iov[0]); + if (rc || !data->reparse_point) goto out; - if (rc == -EOPNOTSUPP && hdr->Command == SMB2_CREATE && - hdr->Status == STATUS_STOPPED_ON_SYMLINK) { - rc = smb2_parse_symlink_response(cifs_sb, err_iov, - &data->symlink_target); - if (rc) - goto out; - - *reparse = true; - create_options |= OPEN_REPARSE_POINT; - - /* Failed on a symbolic link - query a reparse point info */ - cifs_get_readable_path(tcon, full_path, &cfile); - rc = smb2_compound_op(xid, tcon, cifs_sb, full_path, - FILE_READ_ATTRIBUTES, FILE_OPEN, - create_options, ACL_NO_MODE, data, - SMB2_OP_QUERY_INFO, cfile, NULL, NULL, - NULL, NULL); + + create_options |= OPEN_REPARSE_POINT; + /* Failed on a symbolic link - query a reparse point info */ + cifs_get_readable_path(tcon, full_path, &cfile); + rc = smb2_compound_op(xid, tcon, cifs_sb, full_path, + FILE_READ_ATTRIBUTES, FILE_OPEN, + create_options, ACL_NO_MODE, data, + SMB2_OP_QUERY_INFO, cfile, NULL, NULL, + NULL, NULL); + break; + case -EREMOTE: + break; + default: + if (hdr->Status != STATUS_OBJECT_NAME_INVALID) + break; + rc2 = cifs_inval_name_dfs_link_error(xid, tcon, cifs_sb, + full_path, &islink); + if (rc2) { + rc = rc2; goto out; - } else if (rc != -EREMOTE && hdr->Status == STATUS_OBJECT_NAME_INVALID) { - rc2 = cifs_inval_name_dfs_link_error(xid, tcon, cifs_sb, - full_path, &islink); - if (rc2) { - rc = rc2; - goto out; - } - if (islink) - rc = -EREMOTE; } + if (islink) + rc = -EREMOTE; } out: - free_rsp_buf(err_buftype[0], err_iov[0].iov_base); - free_rsp_buf(err_buftype[1], err_iov[1].iov_base); - free_rsp_buf(err_buftype[2], err_iov[2].iov_base); + free_rsp_buf(out_buftype[0], out_iov[0].iov_base); + free_rsp_buf(out_buftype[1], out_iov[1].iov_base); + free_rsp_buf(out_buftype[2], out_iov[2].iov_base); return rc; } - -int smb311_posix_query_path_info(const unsigned int xid, struct cifs_tcon *tcon, - struct cifs_sb_info *cifs_sb, const char *full_path, +int smb311_posix_query_path_info(const unsigned int xid, + struct cifs_tcon *tcon, + struct cifs_sb_info *cifs_sb, + const char *full_path, struct cifs_open_info_data *data, struct cifs_sid *owner, - struct cifs_sid *group, - bool *adjust_tz, bool *reparse) + struct cifs_sid *group) { int rc; __u32 create_options = 0; struct cifsFileInfo *cfile; - struct kvec err_iov[3] = {}; - int err_buftype[3] = {}; + struct kvec out_iov[3] = {}; + int out_buftype[3] = {}; __u8 *sidsbuf = NULL; __u8 *sidsbuf_end = NULL; size_t sidsbuflen = 0; size_t owner_len, group_len; - *adjust_tz = false; - *reparse = false; + data->adjust_tz = false; + data->reparse_point = false; /* * BB TODO: Add support for using the cached root handle. @@ -649,27 +674,33 @@ int smb311_posix_query_path_info(const unsigned int xid, struct cifs_tcon *tcon, cifs_get_readable_path(tcon, full_path, &cfile); rc = smb2_compound_op(xid, tcon, cifs_sb, full_path, FILE_READ_ATTRIBUTES, FILE_OPEN, create_options, ACL_NO_MODE, data, SMB2_OP_POSIX_QUERY_INFO, cfile, - &sidsbuf, &sidsbuflen, err_iov, err_buftype); - if (rc == -EOPNOTSUPP) { + &sidsbuf, &sidsbuflen, out_iov, out_buftype); + /* + * If first iov is unset, then SMB session was dropped or we've got a + * cached open file (@cfile). + */ + if (!out_iov[0].iov_base || out_buftype[0] == CIFS_NO_BUFFER) + goto out; + + switch (rc) { + case 0: + case -EOPNOTSUPP: /* BB TODO: When support for special files added to Samba re-verify this path */ - if (err_iov[0].iov_base && err_buftype[0] != CIFS_NO_BUFFER && - ((struct smb2_hdr *)err_iov[0].iov_base)->Command == SMB2_CREATE && - ((struct smb2_hdr *)err_iov[0].iov_base)->Status == STATUS_STOPPED_ON_SYMLINK) { - rc = smb2_parse_symlink_response(cifs_sb, err_iov, &data->symlink_target); - if (rc) - goto out; - } - *reparse = true; - create_options |= OPEN_REPARSE_POINT; + rc = parse_create_response(data, cifs_sb, &out_iov[0]); + if (rc || !data->reparse_point) + goto out; + create_options |= OPEN_REPARSE_POINT; /* Failed on a symbolic link - query a reparse point info */ cifs_get_readable_path(tcon, full_path, &cfile); rc = smb2_compound_op(xid, tcon, cifs_sb, full_path, FILE_READ_ATTRIBUTES, FILE_OPEN, create_options, ACL_NO_MODE, data, SMB2_OP_POSIX_QUERY_INFO, cfile, &sidsbuf, &sidsbuflen, NULL, NULL); + break; } +out: if (rc == 0) { sidsbuf_end = sidsbuf + sidsbuflen; @@ -689,11 +720,10 @@ int smb311_posix_query_path_info(const unsigned int xid, struct cifs_tcon *tcon, memcpy(group, sidsbuf + owner_len, group_len); } -out: kfree(sidsbuf); - free_rsp_buf(err_buftype[0], err_iov[0].iov_base); - free_rsp_buf(err_buftype[1], err_iov[1].iov_base); - free_rsp_buf(err_buftype[2], err_iov[2].iov_base); + free_rsp_buf(out_buftype[0], out_iov[0].iov_base); + free_rsp_buf(out_buftype[1], out_iov[1].iov_base); + free_rsp_buf(out_buftype[2], out_iov[2].iov_base); return rc; } diff --git a/fs/smb/client/smb2maperror.c b/fs/smb/client/smb2maperror.c index 194799ddd382..1a90dd78b238 100644 --- a/fs/smb/client/smb2maperror.c +++ b/fs/smb/client/smb2maperror.c @@ -877,8 +877,6 @@ static const struct status_to_posix_error smb2_error_map_table[] = { "STATUS_IO_REPARSE_TAG_MISMATCH"}, {STATUS_IO_REPARSE_DATA_INVALID, -EIO, "STATUS_IO_REPARSE_DATA_INVALID"}, - {STATUS_IO_REPARSE_TAG_NOT_HANDLED, -EIO, - "STATUS_IO_REPARSE_TAG_NOT_HANDLED"}, {STATUS_REPARSE_POINT_NOT_RESOLVED, -EIO, "STATUS_REPARSE_POINT_NOT_RESOLVED"}, {STATUS_DIRECTORY_IS_A_REPARSE_POINT, -EIO, diff --git a/fs/smb/client/smb2misc.c b/fs/smb/client/smb2misc.c index 3935a60db5c3..82b84a4941dd 100644 --- a/fs/smb/client/smb2misc.c +++ b/fs/smb/client/smb2misc.c @@ -145,7 +145,7 @@ smb2_check_message(char *buf, unsigned int len, struct TCP_Server_Info *server) __u64 mid; /* If server is a channel, select the primary channel */ - pserver = CIFS_SERVER_IS_CHAN(server) ? server->primary_server : server; + pserver = SERVER_IS_CHAN(server) ? server->primary_server : server; /* * Add function to do table lookup of StructureSize by command @@ -173,6 +173,21 @@ smb2_check_message(char *buf, unsigned int len, struct TCP_Server_Info *server) } mid = le64_to_cpu(shdr->MessageId); + if (check_smb2_hdr(shdr, mid)) + return 1; + + if (shdr->StructureSize != SMB2_HEADER_STRUCTURE_SIZE) { + cifs_dbg(VFS, "Invalid structure size %u\n", + le16_to_cpu(shdr->StructureSize)); + return 1; + } + + command = le16_to_cpu(shdr->Command); + if (command >= NUMBER_OF_SMB2_COMMANDS) { + cifs_dbg(VFS, "Invalid SMB2 command %d\n", command); + return 1; + } + if (len < pdu_size) { if ((len >= hdr_size) && (shdr->Status != 0)) { @@ -193,21 +208,6 @@ smb2_check_message(char *buf, unsigned int len, struct TCP_Server_Info *server) return 1; } - if (check_smb2_hdr(shdr, mid)) - return 1; - - if (shdr->StructureSize != SMB2_HEADER_STRUCTURE_SIZE) { - cifs_dbg(VFS, "Invalid structure size %u\n", - le16_to_cpu(shdr->StructureSize)); - return 1; - } - - command = le16_to_cpu(shdr->Command); - if (command >= NUMBER_OF_SMB2_COMMANDS) { - cifs_dbg(VFS, "Invalid SMB2 command %d\n", command); - return 1; - } - if (smb2_rsp_struct_sizes[command] != pdu->StructureSize2) { if (command != SMB2_OPLOCK_BREAK_HE && (shdr->Status == 0 || pdu->StructureSize2 != SMB2_ERROR_STRUCTURE_SIZE2_LE)) { @@ -313,6 +313,9 @@ static const bool has_smb2_data_area[NUMBER_OF_SMB2_COMMANDS] = { char * smb2_get_data_area_len(int *off, int *len, struct smb2_hdr *shdr) { + const int max_off = 4096; + const int max_len = 128 * 1024; + *off = 0; *len = 0; @@ -384,29 +387,20 @@ smb2_get_data_area_len(int *off, int *len, struct smb2_hdr *shdr) * Invalid length or offset probably means data area is invalid, but * we have little choice but to ignore the data area in this case. */ - if (*off > 4096) { - cifs_dbg(VFS, "offset %d too large, data area ignored\n", *off); - *len = 0; + if (unlikely(*off < 0 || *off > max_off || + *len < 0 || *len > max_len)) { + cifs_dbg(VFS, "%s: invalid data area (off=%d len=%d)\n", + __func__, *off, *len); *off = 0; - } else if (*off < 0) { - cifs_dbg(VFS, "negative offset %d to data invalid ignore data area\n", - *off); - *off = 0; - *len = 0; - } else if (*len < 0) { - cifs_dbg(VFS, "negative data length %d invalid, data area ignored\n", - *len); *len = 0; - } else if (*len > 128 * 1024) { - cifs_dbg(VFS, "data area larger than 128K: %d\n", *len); + } else if (*off == 0) { *len = 0; } /* return pointer to beginning of data area, ie offset from SMB start */ - if ((*off != 0) && (*len != 0)) + if (*off > 0 && *len > 0) return (char *)shdr + *off; - else - return NULL; + return NULL; } /* @@ -623,7 +617,7 @@ smb2_is_valid_lease_break(char *buffer, struct TCP_Server_Info *server) cifs_dbg(FYI, "Checking for lease break\n"); /* If server is a channel, select the primary channel */ - pserver = CIFS_SERVER_IS_CHAN(server) ? server->primary_server : server; + pserver = SERVER_IS_CHAN(server) ? server->primary_server : server; /* look up tcon based on tid & uid */ spin_lock(&cifs_tcp_ses_lock); @@ -698,7 +692,7 @@ smb2_is_valid_oplock_break(char *buffer, struct TCP_Server_Info *server) cifs_dbg(FYI, "oplock level 0x%x\n", rsp->OplockLevel); /* If server is a channel, select the primary channel */ - pserver = CIFS_SERVER_IS_CHAN(server) ? server->primary_server : server; + pserver = SERVER_IS_CHAN(server) ? server->primary_server : server; /* look up tcon based on tid & uid */ spin_lock(&cifs_tcp_ses_lock); @@ -787,7 +781,7 @@ __smb2_handle_cancelled_cmd(struct cifs_tcon *tcon, __u16 cmd, __u64 mid, { struct close_cancelled_open *cancelled; - cancelled = kzalloc(sizeof(*cancelled), GFP_ATOMIC); + cancelled = kzalloc(sizeof(*cancelled), GFP_KERNEL); if (!cancelled) return -ENOMEM; diff --git a/fs/smb/client/smb2ops.c b/fs/smb/client/smb2ops.c index 0f62bc373ad0..66b310208545 100644 --- a/fs/smb/client/smb2ops.c +++ b/fs/smb/client/smb2ops.c @@ -172,8 +172,17 @@ smb2_set_credits(struct TCP_Server_Info *server, const int val) spin_lock(&server->req_lock); server->credits = val; - if (val == 1) + if (val == 1) { server->reconnect_instance++; + /* + * ChannelSequence updated for all channels in primary channel so that consistent + * across SMB3 requests sent on any channel. See MS-SMB2 3.2.4.1 and 3.2.7.1 + */ + if (SERVER_IS_CHAN(server)) + server->primary_server->channel_sequence_num++; + else + server->channel_sequence_num++; + } scredits = server->credits; in_flight = server->in_flight; spin_unlock(&server->req_lock); @@ -288,7 +297,7 @@ smb2_adjust_credits(struct TCP_Server_Info *server, cifs_server_dbg(VFS, "request has less credits (%d) than required (%d)", credits->value, new_val); - return -ENOTSUPP; + return -EOPNOTSUPP; } spin_lock(&server->req_lock); @@ -394,8 +403,10 @@ smb2_dump_detail(void *buf, struct TCP_Server_Info *server) cifs_server_dbg(VFS, "Cmd: %d Err: 0x%x Flags: 0x%x Mid: %llu Pid: %d\n", shdr->Command, shdr->Status, shdr->Flags, shdr->MessageId, shdr->Id.SyncId.ProcessId); - cifs_server_dbg(VFS, "smb buf %p len %u\n", buf, - server->ops->calc_smb_size(buf)); + if (!server->ops->check_message(buf, server->total_read, server)) { + cifs_server_dbg(VFS, "smb buf %p len %u\n", buf, + server->ops->calc_smb_size(buf)); + } #endif } @@ -747,6 +758,7 @@ SMB3_request_interfaces(const unsigned int xid, struct cifs_tcon *tcon, bool in_ unsigned int ret_data_len = 0; struct network_interface_info_ioctl_rsp *out_buf = NULL; struct cifs_ses *ses = tcon->ses; + struct TCP_Server_Info *pserver; /* do not query too frequently */ if (ses->iface_last_update && @@ -771,6 +783,11 @@ SMB3_request_interfaces(const unsigned int xid, struct cifs_tcon *tcon, bool in_ if (rc) goto out; + /* check if iface is still active */ + pserver = ses->chans[0].server; + if (pserver && !cifs_chan_is_iface_active(ses, pserver)) + cifs_chan_update_iface(ses, pserver); + out: kfree(out_buf); return rc; @@ -1075,31 +1092,28 @@ smb2_query_eas(const unsigned int xid, struct cifs_tcon *tcon, return rc; } - static int smb2_set_ea(const unsigned int xid, struct cifs_tcon *tcon, const char *path, const char *ea_name, const void *ea_value, const __u16 ea_value_len, const struct nls_table *nls_codepage, struct cifs_sb_info *cifs_sb) { + struct smb2_compound_vars *vars; struct cifs_ses *ses = tcon->ses; struct TCP_Server_Info *server = cifs_pick_channel(ses); + struct smb_rqst *rqst; + struct kvec *rsp_iov; __le16 *utf16_path = NULL; int ea_name_len = strlen(ea_name); int flags = CIFS_CP_CREATE_CLOSE_OP; int len; - struct smb_rqst rqst[3]; int resp_buftype[3]; - struct kvec rsp_iov[3]; - struct kvec open_iov[SMB2_CREATE_IOV_SIZE]; struct cifs_open_parms oparms; __u8 oplock = SMB2_OPLOCK_LEVEL_NONE; struct cifs_fid fid; - struct kvec si_iov[SMB2_SET_INFO_IOV_SIZE]; unsigned int size[1]; void *data[1]; struct smb2_file_full_ea_info *ea = NULL; - struct kvec close_iov[1]; struct smb2_query_info_rsp *rsp; int rc, used_len = 0; @@ -1113,9 +1127,14 @@ smb2_set_ea(const unsigned int xid, struct cifs_tcon *tcon, if (!utf16_path) return -ENOMEM; - memset(rqst, 0, sizeof(rqst)); resp_buftype[0] = resp_buftype[1] = resp_buftype[2] = CIFS_NO_BUFFER; - memset(rsp_iov, 0, sizeof(rsp_iov)); + vars = kzalloc(sizeof(*vars), GFP_KERNEL); + if (!vars) { + rc = -ENOMEM; + goto out_free_path; + } + rqst = vars->rqst; + rsp_iov = vars->rsp_iov; if (ses->server->ops->query_all_EAs) { if (!ea_value) { @@ -1150,7 +1169,7 @@ smb2_set_ea(const unsigned int xid, struct cifs_tcon *tcon, /* Use a fudge factor of 256 bytes in case we collide * with a different set_EAs command. */ - if(CIFSMaxBufSize - MAX_SMB2_CREATE_RESPONSE_SIZE - + if (CIFSMaxBufSize - MAX_SMB2_CREATE_RESPONSE_SIZE - MAX_SMB2_CLOSE_RESPONSE_SIZE - 256 < used_len + ea_name_len + ea_value_len + 1) { rc = -ENOSPC; @@ -1160,8 +1179,7 @@ smb2_set_ea(const unsigned int xid, struct cifs_tcon *tcon, } /* Open */ - memset(&open_iov, 0, sizeof(open_iov)); - rqst[0].rq_iov = open_iov; + rqst[0].rq_iov = vars->open_iov; rqst[0].rq_nvec = SMB2_CREATE_IOV_SIZE; oparms = (struct cifs_open_parms) { @@ -1181,8 +1199,7 @@ smb2_set_ea(const unsigned int xid, struct cifs_tcon *tcon, /* Set Info */ - memset(&si_iov, 0, sizeof(si_iov)); - rqst[1].rq_iov = si_iov; + rqst[1].rq_iov = vars->si_iov; rqst[1].rq_nvec = 1; len = sizeof(*ea) + ea_name_len + ea_value_len + 1; @@ -1210,10 +1227,8 @@ smb2_set_ea(const unsigned int xid, struct cifs_tcon *tcon, smb2_set_next_command(tcon, &rqst[1]); smb2_set_related(&rqst[1]); - /* Close */ - memset(&close_iov, 0, sizeof(close_iov)); - rqst[2].rq_iov = close_iov; + rqst[2].rq_iov = &vars->close_iov; rqst[2].rq_nvec = 1; rc = SMB2_close_init(tcon, server, &rqst[2], COMPOUND_FID, COMPOUND_FID, false); @@ -1228,13 +1243,15 @@ smb2_set_ea(const unsigned int xid, struct cifs_tcon *tcon, sea_exit: kfree(ea); - kfree(utf16_path); SMB2_open_free(&rqst[0]); SMB2_set_info_free(&rqst[1]); SMB2_close_free(&rqst[2]); free_rsp_buf(resp_buftype[0], rsp_iov[0].iov_base); free_rsp_buf(resp_buftype[1], rsp_iov[1].iov_base); free_rsp_buf(resp_buftype[2], rsp_iov[2].iov_base); + kfree(vars); +out_free_path: + kfree(utf16_path); return rc; } #endif @@ -1394,11 +1411,14 @@ smb2_close_getattr(const unsigned int xid, struct cifs_tcon *tcon, /* Creation time should not need to be updated on close */ if (file_inf.LastWriteTime) - inode->i_mtime = cifs_NTtimeToUnix(file_inf.LastWriteTime); + inode_set_mtime_to_ts(inode, + cifs_NTtimeToUnix(file_inf.LastWriteTime)); if (file_inf.ChangeTime) - inode->i_ctime = cifs_NTtimeToUnix(file_inf.ChangeTime); + inode_set_ctime_to_ts(inode, + cifs_NTtimeToUnix(file_inf.ChangeTime)); if (file_inf.LastAccessTime) - inode->i_atime = cifs_NTtimeToUnix(file_inf.LastAccessTime); + inode_set_atime_to_ts(inode, + cifs_NTtimeToUnix(file_inf.LastAccessTime)); /* * i_blocks is not related to (i_size / i_blksize), @@ -1445,16 +1465,6 @@ req_res_key_exit: return rc; } -struct iqi_vars { - struct smb_rqst rqst[3]; - struct kvec rsp_iov[3]; - struct kvec open_iov[SMB2_CREATE_IOV_SIZE]; - struct kvec qi_iov[1]; - struct kvec io_iov[SMB2_IOCTL_IOV_SIZE]; - struct kvec si_iov[SMB2_SET_INFO_IOV_SIZE]; - struct kvec close_iov[1]; -}; - static int smb2_ioctl_query_info(const unsigned int xid, struct cifs_tcon *tcon, @@ -1462,7 +1472,7 @@ smb2_ioctl_query_info(const unsigned int xid, __le16 *path, int is_dir, unsigned long p) { - struct iqi_vars *vars; + struct smb2_compound_vars *vars; struct smb_rqst *rqst; struct kvec *rsp_iov; struct cifs_ses *ses = tcon->ses; @@ -1580,7 +1590,7 @@ smb2_ioctl_query_info(const unsigned int xid, rc = -EINVAL; goto free_open_req; } - rqst[1].rq_iov = &vars->si_iov[0]; + rqst[1].rq_iov = vars->si_iov; rqst[1].rq_nvec = 1; /* MS-FSCC 2.4.13 FileEndOfFileInformation */ @@ -1592,7 +1602,7 @@ smb2_ioctl_query_info(const unsigned int xid, SMB2_O_INFO_FILE, 0, data, size); free_req1_func = SMB2_set_info_free; } else if (qi.flags == PASSTHRU_QUERY_INFO) { - rqst[1].rq_iov = &vars->qi_iov[0]; + rqst[1].rq_iov = &vars->qi_iov; rqst[1].rq_nvec = 1; rc = SMB2_query_info_init(tcon, server, @@ -1614,7 +1624,7 @@ smb2_ioctl_query_info(const unsigned int xid, smb2_set_related(&rqst[1]); /* Close */ - rqst[2].rq_iov = &vars->close_iov[0]; + rqst[2].rq_iov = &vars->close_iov; rqst[2].rq_nvec = 1; rc = SMB2_close_init(tcon, server, @@ -2407,7 +2417,7 @@ smb2_is_network_name_deleted(char *buf, struct TCP_Server_Info *server) return false; /* If server is a channel, select the primary channel */ - pserver = CIFS_SERVER_IS_CHAN(server) ? server->primary_server : server; + pserver = SERVER_IS_CHAN(server) ? server->primary_server : server; spin_lock(&cifs_tcp_ses_lock); list_for_each_entry(ses, &pserver->smb_ses_list, smb_ses_list) { @@ -2523,15 +2533,13 @@ smb2_query_info_compound(const unsigned int xid, struct cifs_tcon *tcon, struct kvec *rsp, int *buftype, struct cifs_sb_info *cifs_sb) { + struct smb2_compound_vars *vars; struct cifs_ses *ses = tcon->ses; struct TCP_Server_Info *server = cifs_pick_channel(ses); int flags = CIFS_CP_CREATE_CLOSE_OP; - struct smb_rqst rqst[3]; + struct smb_rqst *rqst; int resp_buftype[3]; - struct kvec rsp_iov[3]; - struct kvec open_iov[SMB2_CREATE_IOV_SIZE]; - struct kvec qi_iov[1]; - struct kvec close_iov[1]; + struct kvec *rsp_iov; u8 oplock = SMB2_OPLOCK_LEVEL_NONE; struct cifs_open_parms oparms; struct cifs_fid fid; @@ -2548,9 +2556,14 @@ smb2_query_info_compound(const unsigned int xid, struct cifs_tcon *tcon, if (smb3_encryption_required(tcon)) flags |= CIFS_TRANSFORM_REQ; - memset(rqst, 0, sizeof(rqst)); resp_buftype[0] = resp_buftype[1] = resp_buftype[2] = CIFS_NO_BUFFER; - memset(rsp_iov, 0, sizeof(rsp_iov)); + vars = kzalloc(sizeof(*vars), GFP_KERNEL); + if (!vars) { + rc = -ENOMEM; + goto out_free_path; + } + rqst = vars->rqst; + rsp_iov = vars->rsp_iov; /* * We can only call this for things we know are directories. @@ -2559,8 +2572,7 @@ smb2_query_info_compound(const unsigned int xid, struct cifs_tcon *tcon, open_cached_dir(xid, tcon, path, cifs_sb, false, &cfid); /* cfid null if open dir failed */ - memset(&open_iov, 0, sizeof(open_iov)); - rqst[0].rq_iov = open_iov; + rqst[0].rq_iov = vars->open_iov; rqst[0].rq_nvec = SMB2_CREATE_IOV_SIZE; oparms = (struct cifs_open_parms) { @@ -2578,8 +2590,7 @@ smb2_query_info_compound(const unsigned int xid, struct cifs_tcon *tcon, goto qic_exit; smb2_set_next_command(tcon, &rqst[0]); - memset(&qi_iov, 0, sizeof(qi_iov)); - rqst[1].rq_iov = qi_iov; + rqst[1].rq_iov = &vars->qi_iov; rqst[1].rq_nvec = 1; if (cfid) { @@ -2606,8 +2617,7 @@ smb2_query_info_compound(const unsigned int xid, struct cifs_tcon *tcon, smb2_set_related(&rqst[1]); } - memset(&close_iov, 0, sizeof(close_iov)); - rqst[2].rq_iov = close_iov; + rqst[2].rq_iov = &vars->close_iov; rqst[2].rq_nvec = 1; rc = SMB2_close_init(tcon, server, @@ -2638,7 +2648,6 @@ smb2_query_info_compound(const unsigned int xid, struct cifs_tcon *tcon, *buftype = resp_buftype[1]; qic_exit: - kfree(utf16_path); SMB2_open_free(&rqst[0]); SMB2_query_info_free(&rqst[1]); SMB2_close_free(&rqst[2]); @@ -2646,6 +2655,9 @@ smb2_query_info_compound(const unsigned int xid, struct cifs_tcon *tcon, free_rsp_buf(resp_buftype[2], rsp_iov[2].iov_base); if (cfid) close_cached_dir(cfid); + kfree(vars); +out_free_path: + kfree(utf16_path); return rc; } @@ -2681,6 +2693,7 @@ smb2_queryfs(const unsigned int xid, struct cifs_tcon *tcon, smb2_copy_fs_info_to_kstatfs(info, buf); qfs_exit: + trace_smb3_qfs_done(xid, tcon->tid, tcon->ses->Suid, tcon->tree_name, rc); free_rsp_buf(buftype, rsp_iov.iov_base); return rc; } @@ -2825,6 +2838,8 @@ smb2_get_dfs_refer(const unsigned int xid, struct cifs_ses *ses, usleep_range(512, 2048); } while (++retry_count < 5); + if (!rc && !dfs_rsp) + rc = -EIO; if (rc) { if (!is_retryable_error(rc) && rc != -ENOENT && rc != -EOPNOTSUPP) cifs_tcon_dbg(VFS, "%s: ioctl error: rc=%d\n", __func__, rc); @@ -2855,247 +2870,129 @@ smb2_get_dfs_refer(const unsigned int xid, struct cifs_ses *ses, return rc; } -static int -parse_reparse_posix(struct reparse_posix_data *symlink_buf, - u32 plen, char **target_path, - struct cifs_sb_info *cifs_sb) +/* See MS-FSCC 2.1.2.6 for the 'NFS' style reparse tags */ +static int parse_reparse_posix(struct reparse_posix_data *buf, + struct cifs_sb_info *cifs_sb, + struct cifs_open_info_data *data) { unsigned int len; - - /* See MS-FSCC 2.1.2.6 for the 'NFS' style reparse tags */ - len = le16_to_cpu(symlink_buf->ReparseDataLength); - - if (le64_to_cpu(symlink_buf->InodeType) != NFS_SPECFILE_LNK) { - cifs_dbg(VFS, "%lld not a supported symlink type\n", - le64_to_cpu(symlink_buf->InodeType)); + u64 type; + + switch ((type = le64_to_cpu(buf->InodeType))) { + case NFS_SPECFILE_LNK: + len = le16_to_cpu(buf->ReparseDataLength); + data->symlink_target = cifs_strndup_from_utf16(buf->DataBuffer, + len, true, + cifs_sb->local_nls); + if (!data->symlink_target) + return -ENOMEM; + convert_delimiter(data->symlink_target, '/'); + cifs_dbg(FYI, "%s: target path: %s\n", + __func__, data->symlink_target); + break; + case NFS_SPECFILE_CHR: + case NFS_SPECFILE_BLK: + case NFS_SPECFILE_FIFO: + case NFS_SPECFILE_SOCK: + break; + default: + cifs_dbg(VFS, "%s: unhandled inode type: 0x%llx\n", + __func__, type); return -EOPNOTSUPP; } - - *target_path = cifs_strndup_from_utf16( - symlink_buf->PathBuffer, - len, true, cifs_sb->local_nls); - if (!(*target_path)) - return -ENOMEM; - - convert_delimiter(*target_path, '/'); - cifs_dbg(FYI, "%s: target path: %s\n", __func__, *target_path); - return 0; } -static int -parse_reparse_symlink(struct reparse_symlink_data_buffer *symlink_buf, - u32 plen, char **target_path, - struct cifs_sb_info *cifs_sb) +static int parse_reparse_symlink(struct reparse_symlink_data_buffer *sym, + u32 plen, bool unicode, + struct cifs_sb_info *cifs_sb, + struct cifs_open_info_data *data) { - unsigned int sub_len; - unsigned int sub_offset; + unsigned int len; + unsigned int offs; /* We handle Symbolic Link reparse tag here. See: MS-FSCC 2.1.2.4 */ - sub_offset = le16_to_cpu(symlink_buf->SubstituteNameOffset); - sub_len = le16_to_cpu(symlink_buf->SubstituteNameLength); - if (sub_offset + 20 > plen || - sub_offset + sub_len + 20 > plen) { + offs = le16_to_cpu(sym->SubstituteNameOffset); + len = le16_to_cpu(sym->SubstituteNameLength); + if (offs + 20 > plen || offs + len + 20 > plen) { cifs_dbg(VFS, "srv returned malformed symlink buffer\n"); return -EIO; } - *target_path = cifs_strndup_from_utf16( - symlink_buf->PathBuffer + sub_offset, - sub_len, true, cifs_sb->local_nls); - if (!(*target_path)) + data->symlink_target = cifs_strndup_from_utf16(sym->PathBuffer + offs, + len, unicode, + cifs_sb->local_nls); + if (!data->symlink_target) return -ENOMEM; - convert_delimiter(*target_path, '/'); - cifs_dbg(FYI, "%s: target path: %s\n", __func__, *target_path); + convert_delimiter(data->symlink_target, '/'); + cifs_dbg(FYI, "%s: target path: %s\n", __func__, data->symlink_target); return 0; } -static int -parse_reparse_point(struct reparse_data_buffer *buf, - u32 plen, char **target_path, - struct cifs_sb_info *cifs_sb) +int parse_reparse_point(struct reparse_data_buffer *buf, + u32 plen, struct cifs_sb_info *cifs_sb, + bool unicode, struct cifs_open_info_data *data) { - if (plen < sizeof(struct reparse_data_buffer)) { - cifs_dbg(VFS, "reparse buffer is too small. Must be at least 8 bytes but was %d\n", - plen); + if (plen < sizeof(*buf)) { + cifs_dbg(VFS, "%s: reparse buffer is too small. Must be at least 8 bytes but was %d\n", + __func__, plen); return -EIO; } - if (plen < le16_to_cpu(buf->ReparseDataLength) + - sizeof(struct reparse_data_buffer)) { - cifs_dbg(VFS, "srv returned invalid reparse buf length: %d\n", - plen); + if (plen < le16_to_cpu(buf->ReparseDataLength) + sizeof(*buf)) { + cifs_dbg(VFS, "%s: invalid reparse buf length: %d\n", + __func__, plen); return -EIO; } + data->reparse.buf = buf; + /* See MS-FSCC 2.1.2 */ switch (le32_to_cpu(buf->ReparseTag)) { case IO_REPARSE_TAG_NFS: - return parse_reparse_posix( - (struct reparse_posix_data *)buf, - plen, target_path, cifs_sb); + return parse_reparse_posix((struct reparse_posix_data *)buf, + cifs_sb, data); case IO_REPARSE_TAG_SYMLINK: return parse_reparse_symlink( (struct reparse_symlink_data_buffer *)buf, - plen, target_path, cifs_sb); + plen, unicode, cifs_sb, data); + case IO_REPARSE_TAG_LX_SYMLINK: + case IO_REPARSE_TAG_AF_UNIX: + case IO_REPARSE_TAG_LX_FIFO: + case IO_REPARSE_TAG_LX_CHR: + case IO_REPARSE_TAG_LX_BLK: + return 0; default: - cifs_dbg(VFS, "srv returned unknown symlink buffer tag:0x%08x\n", - le32_to_cpu(buf->ReparseTag)); + cifs_dbg(VFS, "%s: unhandled reparse tag: 0x%08x\n", + __func__, le32_to_cpu(buf->ReparseTag)); return -EOPNOTSUPP; } } -static int -smb2_query_symlink(const unsigned int xid, struct cifs_tcon *tcon, - struct cifs_sb_info *cifs_sb, const char *full_path, - char **target_path, bool is_reparse_point) +static int smb2_parse_reparse_point(struct cifs_sb_info *cifs_sb, + struct kvec *rsp_iov, + struct cifs_open_info_data *data) { - int rc; - __le16 *utf16_path = NULL; - __u8 oplock = SMB2_OPLOCK_LEVEL_NONE; - struct cifs_open_parms oparms; - struct cifs_fid fid; - struct kvec err_iov = {NULL, 0}; - struct TCP_Server_Info *server = cifs_pick_channel(tcon->ses); - int flags = CIFS_CP_CREATE_CLOSE_OP; - struct smb_rqst rqst[3]; - int resp_buftype[3]; - struct kvec rsp_iov[3]; - struct kvec open_iov[SMB2_CREATE_IOV_SIZE]; - struct kvec io_iov[SMB2_IOCTL_IOV_SIZE]; - struct kvec close_iov[1]; - struct smb2_create_rsp *create_rsp; - struct smb2_ioctl_rsp *ioctl_rsp; - struct reparse_data_buffer *reparse_buf; - int create_options = is_reparse_point ? OPEN_REPARSE_POINT : 0; - u32 plen; - - cifs_dbg(FYI, "%s: path: %s\n", __func__, full_path); - - *target_path = NULL; - - if (smb3_encryption_required(tcon)) - flags |= CIFS_TRANSFORM_REQ; - - memset(rqst, 0, sizeof(rqst)); - resp_buftype[0] = resp_buftype[1] = resp_buftype[2] = CIFS_NO_BUFFER; - memset(rsp_iov, 0, sizeof(rsp_iov)); - - utf16_path = cifs_convert_path_to_utf16(full_path, cifs_sb); - if (!utf16_path) - return -ENOMEM; - - /* Open */ - memset(&open_iov, 0, sizeof(open_iov)); - rqst[0].rq_iov = open_iov; - rqst[0].rq_nvec = SMB2_CREATE_IOV_SIZE; - - oparms = (struct cifs_open_parms) { - .tcon = tcon, - .path = full_path, - .desired_access = FILE_READ_ATTRIBUTES, - .disposition = FILE_OPEN, - .create_options = cifs_create_options(cifs_sb, create_options), - .fid = &fid, - }; + struct reparse_data_buffer *buf; + struct smb2_ioctl_rsp *io = rsp_iov->iov_base; + u32 plen = le32_to_cpu(io->OutputCount); - rc = SMB2_open_init(tcon, server, - &rqst[0], &oplock, &oparms, utf16_path); - if (rc) - goto querty_exit; - smb2_set_next_command(tcon, &rqst[0]); - - - /* IOCTL */ - memset(&io_iov, 0, sizeof(io_iov)); - rqst[1].rq_iov = io_iov; - rqst[1].rq_nvec = SMB2_IOCTL_IOV_SIZE; - - rc = SMB2_ioctl_init(tcon, server, - &rqst[1], fid.persistent_fid, - fid.volatile_fid, FSCTL_GET_REPARSE_POINT, NULL, 0, - CIFSMaxBufSize - - MAX_SMB2_CREATE_RESPONSE_SIZE - - MAX_SMB2_CLOSE_RESPONSE_SIZE); - if (rc) - goto querty_exit; - - smb2_set_next_command(tcon, &rqst[1]); - smb2_set_related(&rqst[1]); - - - /* Close */ - memset(&close_iov, 0, sizeof(close_iov)); - rqst[2].rq_iov = close_iov; - rqst[2].rq_nvec = 1; - - rc = SMB2_close_init(tcon, server, - &rqst[2], COMPOUND_FID, COMPOUND_FID, false); - if (rc) - goto querty_exit; - - smb2_set_related(&rqst[2]); - - rc = compound_send_recv(xid, tcon->ses, server, - flags, 3, rqst, - resp_buftype, rsp_iov); - - create_rsp = rsp_iov[0].iov_base; - if (create_rsp && create_rsp->hdr.Status) - err_iov = rsp_iov[0]; - ioctl_rsp = rsp_iov[1].iov_base; - - /* - * Open was successful and we got an ioctl response. - */ - if ((rc == 0) && (is_reparse_point)) { - /* See MS-FSCC 2.3.23 */ - - reparse_buf = (struct reparse_data_buffer *) - ((char *)ioctl_rsp + - le32_to_cpu(ioctl_rsp->OutputOffset)); - plen = le32_to_cpu(ioctl_rsp->OutputCount); - - if (plen + le32_to_cpu(ioctl_rsp->OutputOffset) > - rsp_iov[1].iov_len) { - cifs_tcon_dbg(VFS, "srv returned invalid ioctl len: %d\n", - plen); - rc = -EIO; - goto querty_exit; - } - - rc = parse_reparse_point(reparse_buf, plen, target_path, - cifs_sb); - goto querty_exit; - } - - if (!rc || !err_iov.iov_base) { - rc = -ENOENT; - goto querty_exit; - } - - rc = smb2_parse_symlink_response(cifs_sb, &err_iov, target_path); - - querty_exit: - cifs_dbg(FYI, "query symlink rc %d\n", rc); - kfree(utf16_path); - SMB2_open_free(&rqst[0]); - SMB2_ioctl_free(&rqst[1]); - SMB2_close_free(&rqst[2]); - free_rsp_buf(resp_buftype[0], rsp_iov[0].iov_base); - free_rsp_buf(resp_buftype[1], rsp_iov[1].iov_base); - free_rsp_buf(resp_buftype[2], rsp_iov[2].iov_base); - return rc; + buf = (struct reparse_data_buffer *)((u8 *)io + + le32_to_cpu(io->OutputOffset)); + return parse_reparse_point(buf, plen, cifs_sb, true, data); } -int -smb2_query_reparse_tag(const unsigned int xid, struct cifs_tcon *tcon, - struct cifs_sb_info *cifs_sb, const char *full_path, - __u32 *tag) +static int smb2_query_reparse_point(const unsigned int xid, + struct cifs_tcon *tcon, + struct cifs_sb_info *cifs_sb, + const char *full_path, + u32 *tag, struct kvec *rsp, + int *rsp_buftype) { + struct smb2_compound_vars *vars; int rc; __le16 *utf16_path = NULL; __u8 oplock = SMB2_OPLOCK_LEVEL_NONE; @@ -3103,35 +3000,36 @@ smb2_query_reparse_tag(const unsigned int xid, struct cifs_tcon *tcon, struct cifs_fid fid; struct TCP_Server_Info *server = cifs_pick_channel(tcon->ses); int flags = CIFS_CP_CREATE_CLOSE_OP; - struct smb_rqst rqst[3]; + struct smb_rqst *rqst; int resp_buftype[3]; - struct kvec rsp_iov[3]; - struct kvec open_iov[SMB2_CREATE_IOV_SIZE]; - struct kvec io_iov[SMB2_IOCTL_IOV_SIZE]; - struct kvec close_iov[1]; + struct kvec *rsp_iov; struct smb2_ioctl_rsp *ioctl_rsp; struct reparse_data_buffer *reparse_buf; - u32 plen; + u32 off, count, len; cifs_dbg(FYI, "%s: path: %s\n", __func__, full_path); if (smb3_encryption_required(tcon)) flags |= CIFS_TRANSFORM_REQ; - memset(rqst, 0, sizeof(rqst)); - resp_buftype[0] = resp_buftype[1] = resp_buftype[2] = CIFS_NO_BUFFER; - memset(rsp_iov, 0, sizeof(rsp_iov)); - utf16_path = cifs_convert_path_to_utf16(full_path, cifs_sb); if (!utf16_path) return -ENOMEM; + resp_buftype[0] = resp_buftype[1] = resp_buftype[2] = CIFS_NO_BUFFER; + vars = kzalloc(sizeof(*vars), GFP_KERNEL); + if (!vars) { + rc = -ENOMEM; + goto out_free_path; + } + rqst = vars->rqst; + rsp_iov = vars->rsp_iov; + /* * setup smb2open - TODO add optimization to call cifs_get_readable_path * to see if there is a handle already open that we can use */ - memset(&open_iov, 0, sizeof(open_iov)); - rqst[0].rq_iov = open_iov; + rqst[0].rq_iov = vars->open_iov; rqst[0].rq_nvec = SMB2_CREATE_IOV_SIZE; oparms = (struct cifs_open_parms) { @@ -3151,8 +3049,7 @@ smb2_query_reparse_tag(const unsigned int xid, struct cifs_tcon *tcon, /* IOCTL */ - memset(&io_iov, 0, sizeof(io_iov)); - rqst[1].rq_iov = io_iov; + rqst[1].rq_iov = vars->io_iov; rqst[1].rq_nvec = SMB2_IOCTL_IOV_SIZE; rc = SMB2_ioctl_init(tcon, server, @@ -3167,10 +3064,8 @@ smb2_query_reparse_tag(const unsigned int xid, struct cifs_tcon *tcon, smb2_set_next_command(tcon, &rqst[1]); smb2_set_related(&rqst[1]); - /* Close */ - memset(&close_iov, 0, sizeof(close_iov)); - rqst[2].rq_iov = close_iov; + rqst[2].rq_iov = &vars->close_iov; rqst[2].rq_nvec = 1; rc = SMB2_close_init(tcon, server, @@ -3191,30 +3086,41 @@ smb2_query_reparse_tag(const unsigned int xid, struct cifs_tcon *tcon, */ if (rc == 0) { /* See MS-FSCC 2.3.23 */ + off = le32_to_cpu(ioctl_rsp->OutputOffset); + count = le32_to_cpu(ioctl_rsp->OutputCount); + if (check_add_overflow(off, count, &len) || + len > rsp_iov[1].iov_len) { + cifs_tcon_dbg(VFS, "%s: invalid ioctl: off=%d count=%d\n", + __func__, off, count); + rc = -EIO; + goto query_rp_exit; + } - reparse_buf = (struct reparse_data_buffer *) - ((char *)ioctl_rsp + - le32_to_cpu(ioctl_rsp->OutputOffset)); - plen = le32_to_cpu(ioctl_rsp->OutputCount); - - if (plen + le32_to_cpu(ioctl_rsp->OutputOffset) > - rsp_iov[1].iov_len) { - cifs_tcon_dbg(FYI, "srv returned invalid ioctl len: %d\n", - plen); + reparse_buf = (void *)((u8 *)ioctl_rsp + off); + len = sizeof(*reparse_buf); + if (count < len || + count < le16_to_cpu(reparse_buf->ReparseDataLength) + len) { + cifs_tcon_dbg(VFS, "%s: invalid ioctl: off=%d count=%d\n", + __func__, off, count); rc = -EIO; goto query_rp_exit; } *tag = le32_to_cpu(reparse_buf->ReparseTag); + *rsp = rsp_iov[1]; + *rsp_buftype = resp_buftype[1]; + resp_buftype[1] = CIFS_NO_BUFFER; } query_rp_exit: - kfree(utf16_path); SMB2_open_free(&rqst[0]); SMB2_ioctl_free(&rqst[1]); SMB2_close_free(&rqst[2]); free_rsp_buf(resp_buftype[0], rsp_iov[0].iov_base); free_rsp_buf(resp_buftype[1], rsp_iov[1].iov_base); free_rsp_buf(resp_buftype[2], rsp_iov[2].iov_base); + kfree(vars); +out_free_path: + kfree(utf16_path); return rc; } @@ -3415,6 +3321,7 @@ static long smb3_zero_range(struct file *file, struct cifs_tcon *tcon, struct inode *inode = file_inode(file); struct cifsInodeInfo *cifsi = CIFS_I(inode); struct cifsFileInfo *cfile = file->private_data; + unsigned long long new_size; long rc; unsigned int xid; __le64 eof; @@ -3445,10 +3352,15 @@ static long smb3_zero_range(struct file *file, struct cifs_tcon *tcon, /* * do we also need to change the size of the file? */ - if (keep_size == false && i_size_read(inode) < offset + len) { - eof = cpu_to_le64(offset + len); + new_size = offset + len; + if (keep_size == false && (unsigned long long)i_size_read(inode) < new_size) { + eof = cpu_to_le64(new_size); rc = SMB2_set_eof(xid, tcon, cfile->fid.persistent_fid, cfile->fid.volatile_fid, cfile->pid, &eof); + if (rc >= 0) { + truncate_setsize(inode, new_size); + fscache_resize_cookie(cifs_inode_cookie(inode), new_size); + } } zero_range_exit: @@ -3843,6 +3755,9 @@ static long smb3_insert_range(struct file *file, struct cifs_tcon *tcon, if (rc < 0) goto out_2; + truncate_setsize(inode, old_eof + len); + fscache_resize_cookie(cifs_inode_cookie(inode), i_size_read(inode)); + rc = smb2_copychunk_range(xid, cfile, cfile, off, count, off + len); if (rc < 0) goto out_2; @@ -4400,7 +4315,7 @@ smb2_get_enc_key(struct TCP_Server_Info *server, __u64 ses_id, int enc, u8 *key) u8 *ses_enc_key; /* If server is a channel, select the primary channel */ - pserver = CIFS_SERVER_IS_CHAN(server) ? server->primary_server : server; + pserver = SERVER_IS_CHAN(server) ? server->primary_server : server; spin_lock(&cifs_tcp_ses_lock); list_for_each_entry(ses, &pserver->smb_ses_list, smb_ses_list) { @@ -4707,7 +4622,7 @@ handle_read_data(struct TCP_Server_Info *server, struct mid_q_entry *mid, if (shdr->Command != SMB2_READ) { cifs_server_dbg(VFS, "only big read responses are supported\n"); - return -ENOTSUPP; + return -EOPNOTSUPP; } if (server->ops->is_session_expired && @@ -5036,6 +4951,7 @@ receive_encrypted_standard(struct TCP_Server_Info *server, struct smb2_hdr *shdr; unsigned int pdu_length = server->pdu_size; unsigned int buf_size; + unsigned int next_cmd; struct mid_q_entry *mid_entry; int next_is_large; char *next_buffer = NULL; @@ -5064,14 +4980,15 @@ receive_encrypted_standard(struct TCP_Server_Info *server, next_is_large = server->large_buf; one_more: shdr = (struct smb2_hdr *)buf; - if (shdr->NextCommand) { + next_cmd = le32_to_cpu(shdr->NextCommand); + if (next_cmd) { + if (WARN_ON_ONCE(next_cmd > pdu_length)) + return -1; if (next_is_large) next_buffer = (char *)cifs_buf_get(); else next_buffer = (char *)cifs_small_buf_get(); - memcpy(next_buffer, - buf + le32_to_cpu(shdr->NextCommand), - pdu_length - le32_to_cpu(shdr->NextCommand)); + memcpy(next_buffer, buf + next_cmd, pdu_length - next_cmd); } mid_entry = smb2_find_mid(server, buf); @@ -5095,8 +5012,8 @@ one_more: else ret = cifs_handle_standard(server, mid_entry); - if (ret == 0 && shdr->NextCommand) { - pdu_length -= le32_to_cpu(shdr->NextCommand); + if (ret == 0 && next_cmd) { + pdu_length -= next_cmd; server->large_buf = next_is_large; if (next_is_large) server->bigbuf = buf = next_buffer; @@ -5159,54 +5076,42 @@ smb3_handle_read_data(struct TCP_Server_Info *server, struct mid_q_entry *mid) NULL, 0, false); } -static int -smb2_next_header(char *buf) +static int smb2_next_header(struct TCP_Server_Info *server, char *buf, + unsigned int *noff) { struct smb2_hdr *hdr = (struct smb2_hdr *)buf; struct smb2_transform_hdr *t_hdr = (struct smb2_transform_hdr *)buf; - if (hdr->ProtocolId == SMB2_TRANSFORM_PROTO_NUM) - return sizeof(struct smb2_transform_hdr) + - le32_to_cpu(t_hdr->OriginalMessageSize); - - return le32_to_cpu(hdr->NextCommand); + if (hdr->ProtocolId == SMB2_TRANSFORM_PROTO_NUM) { + *noff = le32_to_cpu(t_hdr->OriginalMessageSize); + if (unlikely(check_add_overflow(*noff, sizeof(*t_hdr), noff))) + return -EINVAL; + } else { + *noff = le32_to_cpu(hdr->NextCommand); + } + if (unlikely(*noff && *noff < MID_HEADER_SIZE(server))) + return -EINVAL; + return 0; } -static int -smb2_make_node(unsigned int xid, struct inode *inode, - struct dentry *dentry, struct cifs_tcon *tcon, - const char *full_path, umode_t mode, dev_t dev) +int cifs_sfu_make_node(unsigned int xid, struct inode *inode, + struct dentry *dentry, struct cifs_tcon *tcon, + const char *full_path, umode_t mode, dev_t dev) { - struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); - int rc = -EPERM; struct cifs_open_info_data buf = {}; - struct cifs_io_parms io_parms = {0}; - __u32 oplock = 0; - struct cifs_fid fid; + struct TCP_Server_Info *server = tcon->ses->server; struct cifs_open_parms oparms; + struct cifs_io_parms io_parms = {}; + struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); + struct cifs_fid fid; unsigned int bytes_written; struct win_dev *pdev; struct kvec iov[2]; + __u32 oplock = server->oplocks ? REQ_OPLOCK : 0; + int rc; - /* - * Check if mounted with mount parm 'sfu' mount parm. - * SFU emulation should work with all servers, but only - * supports block and char device (no socket & fifo), - * and was used by default in earlier versions of Windows - */ - if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL)) - return rc; - - /* - * TODO: Add ability to create instead via reparse point. Windows (e.g. - * their current NFS server) uses this approach to expose special files - * over SMB2/SMB3 and Samba will do this with SMB3.1.1 POSIX Extensions - */ - - if (!S_ISCHR(mode) && !S_ISBLK(mode)) - return rc; - - cifs_dbg(FYI, "sfu compat create special file\n"); + if (!S_ISCHR(mode) && !S_ISBLK(mode) && !S_ISFIFO(mode)) + return -EPERM; oparms = (struct cifs_open_parms) { .tcon = tcon, @@ -5219,11 +5124,7 @@ smb2_make_node(unsigned int xid, struct inode *inode, .fid = &fid, }; - if (tcon->ses->server->oplocks) - oplock = REQ_OPLOCK; - else - oplock = 0; - rc = tcon->ses->server->ops->open(xid, &oparms, &oplock, &buf); + rc = server->ops->open(xid, &oparms, &oplock, &buf); if (rc) return rc; @@ -5231,36 +5132,56 @@ smb2_make_node(unsigned int xid, struct inode *inode, * BB Do not bother to decode buf since no local inode yet to put * timestamps in, but we can reuse it safely. */ - pdev = (struct win_dev *)&buf.fi; io_parms.pid = current->tgid; io_parms.tcon = tcon; - io_parms.offset = 0; - io_parms.length = sizeof(struct win_dev); - iov[1].iov_base = &buf.fi; - iov[1].iov_len = sizeof(struct win_dev); + io_parms.length = sizeof(*pdev); + iov[1].iov_base = pdev; + iov[1].iov_len = sizeof(*pdev); if (S_ISCHR(mode)) { memcpy(pdev->type, "IntxCHR", 8); pdev->major = cpu_to_le64(MAJOR(dev)); pdev->minor = cpu_to_le64(MINOR(dev)); - rc = tcon->ses->server->ops->sync_write(xid, &fid, &io_parms, - &bytes_written, iov, 1); } else if (S_ISBLK(mode)) { memcpy(pdev->type, "IntxBLK", 8); pdev->major = cpu_to_le64(MAJOR(dev)); pdev->minor = cpu_to_le64(MINOR(dev)); - rc = tcon->ses->server->ops->sync_write(xid, &fid, &io_parms, - &bytes_written, iov, 1); + } else if (S_ISFIFO(mode)) { + memcpy(pdev->type, "LnxFIFO", 8); } - tcon->ses->server->ops->close(xid, tcon, &fid); - d_drop(dentry); + rc = server->ops->sync_write(xid, &fid, &io_parms, + &bytes_written, iov, 1); + server->ops->close(xid, tcon, &fid); + d_drop(dentry); /* FIXME: add code here to set EAs */ - cifs_free_open_info(&buf); return rc; } +static int smb2_make_node(unsigned int xid, struct inode *inode, + struct dentry *dentry, struct cifs_tcon *tcon, + const char *full_path, umode_t mode, dev_t dev) +{ + struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); + + /* + * Check if mounted with mount parm 'sfu' mount parm. + * SFU emulation should work with all servers, but only + * supports block and char device (no socket & fifo), + * and was used by default in earlier versions of Windows + */ + if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL)) + return -EPERM; + /* + * TODO: Add ability to create instead via reparse point. Windows (e.g. + * their current NFS server) uses this approach to expose special files + * over SMB2/SMB3 and Samba will do this with SMB3.1.1 POSIX Extensions + */ + return cifs_sfu_make_node(xid, inode, dentry, tcon, + full_path, mode, dev); +} + #ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY struct smb_version_operations smb20_operations = { .compare_fids = smb2_compare_fids, @@ -5298,6 +5219,7 @@ struct smb_version_operations smb20_operations = { .can_echo = smb2_can_echo, .echo = SMB2_echo, .query_path_info = smb2_query_path_info, + .query_reparse_point = smb2_query_reparse_point, .get_srv_inum = smb2_get_srv_inum, .query_file_info = smb2_query_file_info, .set_path_size = smb2_set_path_size, @@ -5310,7 +5232,7 @@ struct smb_version_operations smb20_operations = { .unlink = smb2_unlink, .rename = smb2_rename_path, .create_hardlink = smb2_create_hardlink, - .query_symlink = smb2_query_symlink, + .parse_reparse_point = smb2_parse_reparse_point, .query_mf_symlink = smb3_query_mf_symlink, .create_mf_symlink = smb3_create_mf_symlink, .open = smb2_open_file, @@ -5399,6 +5321,7 @@ struct smb_version_operations smb21_operations = { .can_echo = smb2_can_echo, .echo = SMB2_echo, .query_path_info = smb2_query_path_info, + .query_reparse_point = smb2_query_reparse_point, .get_srv_inum = smb2_get_srv_inum, .query_file_info = smb2_query_file_info, .set_path_size = smb2_set_path_size, @@ -5411,7 +5334,7 @@ struct smb_version_operations smb21_operations = { .unlink = smb2_unlink, .rename = smb2_rename_path, .create_hardlink = smb2_create_hardlink, - .query_symlink = smb2_query_symlink, + .parse_reparse_point = smb2_parse_reparse_point, .query_mf_symlink = smb3_query_mf_symlink, .create_mf_symlink = smb3_create_mf_symlink, .open = smb2_open_file, @@ -5503,7 +5426,7 @@ struct smb_version_operations smb30_operations = { .echo = SMB2_echo, .query_path_info = smb2_query_path_info, /* WSL tags introduced long after smb2.1, enable for SMB3, 3.11 only */ - .query_reparse_tag = smb2_query_reparse_tag, + .query_reparse_point = smb2_query_reparse_point, .get_srv_inum = smb2_get_srv_inum, .query_file_info = smb2_query_file_info, .set_path_size = smb2_set_path_size, @@ -5516,7 +5439,7 @@ struct smb_version_operations smb30_operations = { .unlink = smb2_unlink, .rename = smb2_rename_path, .create_hardlink = smb2_create_hardlink, - .query_symlink = smb2_query_symlink, + .parse_reparse_point = smb2_parse_reparse_point, .query_mf_symlink = smb3_query_mf_symlink, .create_mf_symlink = smb3_create_mf_symlink, .open = smb2_open_file, @@ -5616,7 +5539,7 @@ struct smb_version_operations smb311_operations = { .can_echo = smb2_can_echo, .echo = SMB2_echo, .query_path_info = smb2_query_path_info, - .query_reparse_tag = smb2_query_reparse_tag, + .query_reparse_point = smb2_query_reparse_point, .get_srv_inum = smb2_get_srv_inum, .query_file_info = smb2_query_file_info, .set_path_size = smb2_set_path_size, @@ -5630,7 +5553,7 @@ struct smb_version_operations smb311_operations = { .unlink = smb2_unlink, .rename = smb2_rename_path, .create_hardlink = smb2_create_hardlink, - .query_symlink = smb2_query_symlink, + .parse_reparse_point = smb2_parse_reparse_point, .query_mf_symlink = smb3_query_mf_symlink, .create_mf_symlink = smb3_create_mf_symlink, .open = smb2_open_file, diff --git a/fs/smb/client/smb2pdu.c b/fs/smb/client/smb2pdu.c index a457f07f820d..4f971c1061f0 100644 --- a/fs/smb/client/smb2pdu.c +++ b/fs/smb/client/smb2pdu.c @@ -88,10 +88,27 @@ smb2_hdr_assemble(struct smb2_hdr *shdr, __le16 smb2_cmd, const struct cifs_tcon *tcon, struct TCP_Server_Info *server) { + struct smb3_hdr_req *smb3_hdr; + shdr->ProtocolId = SMB2_PROTO_NUMBER; shdr->StructureSize = cpu_to_le16(64); shdr->Command = smb2_cmd; + if (server) { + /* After reconnect SMB3 must set ChannelSequence on subsequent reqs */ + if (server->dialect >= SMB30_PROT_ID) { + smb3_hdr = (struct smb3_hdr_req *)shdr; + /* + * if primary channel is not set yet, use default + * channel for chan sequence num + */ + if (SERVER_IS_CHAN(server)) + smb3_hdr->ChannelSequence = + cpu_to_le16(server->primary_server->channel_sequence_num); + else + smb3_hdr->ChannelSequence = + cpu_to_le16(server->channel_sequence_num); + } spin_lock(&server->req_lock); /* Request up to 10 credits but don't go over the limit. */ if (server->credits >= server->max_credits) @@ -141,11 +158,14 @@ out: static int smb2_reconnect(__le16 smb2_command, struct cifs_tcon *tcon, - struct TCP_Server_Info *server) + struct TCP_Server_Info *server, bool from_reconnect) { int rc = 0; struct nls_table *nls_codepage = NULL; struct cifs_ses *ses; + int xid; + struct TCP_Server_Info *pserver; + unsigned int chan_index; /* * SMB2s NegProt, SessSetup, Logoff do not have tcon yet so @@ -206,6 +226,12 @@ smb2_reconnect(__le16 smb2_command, struct cifs_tcon *tcon, return -EAGAIN; } } + + /* if server is marked for termination, cifsd will cleanup */ + if (server->terminate) { + spin_unlock(&server->srv_lock); + return -EHOSTDOWN; + } spin_unlock(&server->srv_lock); again: @@ -225,11 +251,23 @@ again: mutex_lock(&ses->session_mutex); /* + * if this is called by delayed work, and the channel has been disabled + * in parallel, the delayed work can continue to execute in parallel + * there's a chance that this channel may not exist anymore + */ + spin_lock(&server->srv_lock); + if (server->tcpStatus == CifsExiting) { + spin_unlock(&server->srv_lock); + mutex_unlock(&ses->session_mutex); + rc = -EHOSTDOWN; + goto out; + } + + /* * Recheck after acquire mutex. If another thread is negotiating * and the server never sends an answer the socket will be closed * and tcpStatus set to reconnect. */ - spin_lock(&server->srv_lock); if (server->tcpStatus == CifsNeedReconnect) { spin_unlock(&server->srv_lock); mutex_unlock(&ses->session_mutex); @@ -266,6 +304,53 @@ again: rc = cifs_negotiate_protocol(0, ses, server); if (!rc) { + /* + * if server stopped supporting multichannel + * and the first channel reconnected, disable all the others. + */ + if (ses->chan_count > 1 && + !(server->capabilities & SMB2_GLOBAL_CAP_MULTI_CHANNEL)) { + if (SERVER_IS_CHAN(server)) { + cifs_dbg(VFS, "server %s does not support " \ + "multichannel anymore. skipping secondary channel\n", + ses->server->hostname); + + spin_lock(&ses->chan_lock); + chan_index = cifs_ses_get_chan_index(ses, server); + if (chan_index == CIFS_INVAL_CHAN_INDEX) { + spin_unlock(&ses->chan_lock); + goto skip_terminate; + } + + ses->chans[chan_index].server = NULL; + spin_unlock(&ses->chan_lock); + + /* + * the above reference of server by channel + * needs to be dropped without holding chan_lock + * as cifs_put_tcp_session takes a higher lock + * i.e. cifs_tcp_ses_lock + */ + cifs_put_tcp_session(server, from_reconnect); + + server->terminate = true; + cifs_signal_cifsd_for_reconnect(server, false); + + /* mark primary server as needing reconnect */ + pserver = server->primary_server; + cifs_signal_cifsd_for_reconnect(pserver, false); + +skip_terminate: + mutex_unlock(&ses->session_mutex); + rc = -EHOSTDOWN; + goto out; + } else { + cifs_server_dbg(VFS, "does not support " \ + "multichannel anymore. disabling all other channels\n"); + cifs_disable_secondary_channels(ses); + } + } + rc = cifs_setup_session(0, ses, server, nls_codepage); if ((rc == -EACCES) && !tcon->retry) { mutex_unlock(&ses->session_mutex); @@ -290,15 +375,41 @@ skip_sess_setup: tcon->need_reopen_files = true; rc = cifs_tree_connect(0, tcon, nls_codepage); - mutex_unlock(&ses->session_mutex); cifs_dbg(FYI, "reconnect tcon rc = %d\n", rc); if (rc) { /* If sess reconnected but tcon didn't, something strange ... */ + mutex_unlock(&ses->session_mutex); cifs_dbg(VFS, "reconnect tcon failed rc = %d\n", rc); goto out; } + if (!rc && + (server->capabilities & SMB2_GLOBAL_CAP_MULTI_CHANNEL)) { + mutex_unlock(&ses->session_mutex); + + /* + * query server network interfaces, in case they change + */ + xid = get_xid(); + rc = SMB3_request_interfaces(xid, tcon, false); + free_xid(xid); + + if (rc) + cifs_dbg(FYI, "%s: failed to query server interfaces: %d\n", + __func__, rc); + + if (ses->chan_max > ses->chan_count && + !SERVER_IS_CHAN(server)) { + if (ses->chan_count == 1) + cifs_server_dbg(VFS, "supports multichannel now\n"); + + cifs_try_adding_channels(ses); + } + } else { + mutex_unlock(&ses->session_mutex); + } + if (smb2_command != SMB2_INTERNAL_CMD) mod_delayed_work(cifsiod_wq, &server->reconnect, 0); @@ -359,10 +470,15 @@ static int __smb2_plain_req_init(__le16 smb2_command, struct cifs_tcon *tcon, void **request_buf, unsigned int *total_len) { /* BB eventually switch this to SMB2 specific small buf size */ - if (smb2_command == SMB2_SET_INFO) + switch (smb2_command) { + case SMB2_SET_INFO: + case SMB2_QUERY_INFO: *request_buf = cifs_buf_get(); - else + break; + default: *request_buf = cifs_small_buf_get(); + break; + } if (*request_buf == NULL) { /* BB should we add a retry in here if not a writepage? */ return -ENOMEM; @@ -387,7 +503,7 @@ static int smb2_plain_req_init(__le16 smb2_command, struct cifs_tcon *tcon, { int rc; - rc = smb2_reconnect(smb2_command, tcon, server); + rc = smb2_reconnect(smb2_command, tcon, server, false); if (rc) return rc; @@ -553,7 +669,7 @@ assemble_neg_contexts(struct smb2_negotiate_req *req, * secondary channels don't have the hostname field populated * use the hostname field in the primary channel instead */ - pserver = CIFS_SERVER_IS_CHAN(server) ? server->primary_server : server; + pserver = SERVER_IS_CHAN(server) ? server->primary_server : server; cifs_server_lock(pserver); hostname = pserver->hostname; if (hostname && (hostname[0] != 0)) { @@ -831,7 +947,7 @@ add_posix_context(struct kvec *iov, unsigned int *num_iovec, umode_t mode) iov[num].iov_base = create_posix_buf(mode); if (mode == ACL_NO_MODE) - cifs_dbg(FYI, "Invalid mode\n"); + cifs_dbg(FYI, "%s: no mode\n", __func__); if (iov[num].iov_base == NULL) return -ENOMEM; iov[num].iov_len = sizeof(struct create_posix); @@ -2124,17 +2240,18 @@ parse_posix_ctxt(struct create_context *cc, struct smb2_file_all_info *info, posix->nlink, posix->mode, posix->reparse_tag); } -void -smb2_parse_contexts(struct TCP_Server_Info *server, - struct smb2_create_rsp *rsp, - unsigned int *epoch, char *lease_key, __u8 *oplock, - struct smb2_file_all_info *buf, - struct create_posix_rsp *posix) +int smb2_parse_contexts(struct TCP_Server_Info *server, + struct kvec *rsp_iov, + unsigned int *epoch, + char *lease_key, __u8 *oplock, + struct smb2_file_all_info *buf, + struct create_posix_rsp *posix) { - char *data_offset; + struct smb2_create_rsp *rsp = rsp_iov->iov_base; struct create_context *cc; - unsigned int next; - unsigned int remaining; + size_t rem, off, len; + size_t doff, dlen; + size_t noff, nlen; char *name; static const char smb3_create_tag_posix[] = { 0x93, 0xAD, 0x25, 0x50, 0x9C, @@ -2143,45 +2260,63 @@ smb2_parse_contexts(struct TCP_Server_Info *server, }; *oplock = 0; - data_offset = (char *)rsp + le32_to_cpu(rsp->CreateContextsOffset); - remaining = le32_to_cpu(rsp->CreateContextsLength); - cc = (struct create_context *)data_offset; + + off = le32_to_cpu(rsp->CreateContextsOffset); + rem = le32_to_cpu(rsp->CreateContextsLength); + if (check_add_overflow(off, rem, &len) || len > rsp_iov->iov_len) + return -EINVAL; + cc = (struct create_context *)((u8 *)rsp + off); /* Initialize inode number to 0 in case no valid data in qfid context */ if (buf) buf->IndexNumber = 0; - while (remaining >= sizeof(struct create_context)) { - name = le16_to_cpu(cc->NameOffset) + (char *)cc; - if (le16_to_cpu(cc->NameLength) == 4 && - strncmp(name, SMB2_CREATE_REQUEST_LEASE, 4) == 0) - *oplock = server->ops->parse_lease_buf(cc, epoch, - lease_key); - else if (buf && (le16_to_cpu(cc->NameLength) == 4) && - strncmp(name, SMB2_CREATE_QUERY_ON_DISK_ID, 4) == 0) - parse_query_id_ctxt(cc, buf); - else if ((le16_to_cpu(cc->NameLength) == 16)) { - if (posix && - memcmp(name, smb3_create_tag_posix, 16) == 0) + while (rem >= sizeof(*cc)) { + doff = le16_to_cpu(cc->DataOffset); + dlen = le32_to_cpu(cc->DataLength); + if (check_add_overflow(doff, dlen, &len) || len > rem) + return -EINVAL; + + noff = le16_to_cpu(cc->NameOffset); + nlen = le16_to_cpu(cc->NameLength); + if (noff + nlen >= doff) + return -EINVAL; + + name = (char *)cc + noff; + switch (nlen) { + case 4: + if (!strncmp(name, SMB2_CREATE_REQUEST_LEASE, 4)) { + *oplock = server->ops->parse_lease_buf(cc, epoch, + lease_key); + } else if (buf && + !strncmp(name, SMB2_CREATE_QUERY_ON_DISK_ID, 4)) { + parse_query_id_ctxt(cc, buf); + } + break; + case 16: + if (posix && !memcmp(name, smb3_create_tag_posix, 16)) parse_posix_ctxt(cc, buf, posix); + break; + default: + cifs_dbg(FYI, "%s: unhandled context (nlen=%zu dlen=%zu)\n", + __func__, nlen, dlen); + if (IS_ENABLED(CONFIG_CIFS_DEBUG2)) + cifs_dump_mem("context data: ", cc, dlen); + break; } - /* else { - cifs_dbg(FYI, "Context not matched with len %d\n", - le16_to_cpu(cc->NameLength)); - cifs_dump_mem("Cctxt name: ", name, 4); - } */ - - next = le32_to_cpu(cc->Next); - if (!next) + + off = le32_to_cpu(cc->Next); + if (!off) break; - remaining -= next; - cc = (struct create_context *)((char *)cc + next); + if (check_sub_overflow(rem, off, &rem)) + return -EINVAL; + cc = (struct create_context *)((u8 *)cc + off); } if (rsp->OplockLevel != SMB2_OPLOCK_LEVEL_LEASE) *oplock = rsp->OplockLevel; - return; + return 0; } static int @@ -2223,7 +2358,7 @@ create_durable_v2_buf(struct cifs_open_parms *oparms) * (most servers default to 120 seconds) and most clients default to 0. * This can be overridden at mount ("handletimeout=") if the user wants * a different persistent (or resilient) handle timeout for all opens - * opens on a particular SMB3 mount. + * on a particular SMB3 mount. */ buf->dcontext.Timeout = cpu_to_le32(oparms->tcon->handle_timeout); buf->dcontext.Flags = cpu_to_le32(SMB2_DHANDLE_FLAG_PERSISTENT); @@ -2368,7 +2503,7 @@ add_twarp_context(struct kvec *iov, unsigned int *num_iovec, __u64 timewarp) return 0; } -/* See See http://technet.microsoft.com/en-us/library/hh509017(v=ws.10).aspx */ +/* See http://technet.microsoft.com/en-us/library/hh509017(v=ws.10).aspx */ static void setup_owner_group_sids(char *buf) { struct owner_group_sids *sids = (struct owner_group_sids *)buf; @@ -2570,8 +2705,8 @@ alloc_path_with_tree_prefix(__le16 **out_path, int *out_size, int *out_len, /* Do not append the separator if the path is empty */ if (path[0] != cpu_to_le16(0x0000)) { - UniStrcat(*out_path, sep); - UniStrcat(*out_path, path); + UniStrcat((wchar_t *)*out_path, (wchar_t *)sep); + UniStrcat((wchar_t *)*out_path, (wchar_t *)path); } unload_nls(cp); @@ -3012,8 +3147,8 @@ SMB2_open(const unsigned int xid, struct cifs_open_parms *oparms, __le16 *path, } - smb2_parse_contexts(server, rsp, &oparms->fid->epoch, - oparms->fid->lease_key, oplock, buf, posix); + rc = smb2_parse_contexts(server, &rsp_iov, &oparms->fid->epoch, + oparms->fid->lease_key, oplock, buf, posix); creat_exit: SMB2_open_free(&rqst); free_rsp_buf(resp_buftype, rsp); @@ -3113,6 +3248,7 @@ void SMB2_ioctl_free(struct smb_rqst *rqst) { int i; + if (rqst && rqst->rq_iov) { cifs_small_buf_release(rqst->rq_iov[0].iov_base); /* request */ for (i = 1; i < rqst->rq_nvec; i++) @@ -3359,12 +3495,10 @@ __SMB2_close(const unsigned int xid, struct cifs_tcon *tcon, } else { trace_smb3_close_done(xid, persistent_fid, tcon->tid, ses->Suid); - /* - * Note that have to subtract 4 since struct network_open_info - * has a final 4 byte pad that close response does not have - */ if (pbuf) - memcpy(pbuf, (char *)&rsp->CreationTime, sizeof(*pbuf) - 4); + memcpy(&pbuf->network_open_info, + &rsp->network_open_info, + sizeof(pbuf->network_open_info)); } atomic_dec(&tcon->num_remote_opens); @@ -3457,8 +3591,13 @@ SMB2_query_info_init(struct cifs_tcon *tcon, struct TCP_Server_Info *server, struct smb2_query_info_req *req; struct kvec *iov = rqst->rq_iov; unsigned int total_len; + size_t len; int rc; + if (unlikely(check_add_overflow(input_len, sizeof(*req), &len) || + len > CIFSMaxBufSize)) + return -EINVAL; + rc = smb2_plain_req_init(SMB2_QUERY_INFO, tcon, server, (void **) &req, &total_len); if (rc) @@ -3480,7 +3619,7 @@ SMB2_query_info_init(struct cifs_tcon *tcon, struct TCP_Server_Info *server, iov[0].iov_base = (char *)req; /* 1 for Buffer */ - iov[0].iov_len = total_len - 1 + input_len; + iov[0].iov_len = len; return 0; } @@ -3488,7 +3627,7 @@ void SMB2_query_info_free(struct smb_rqst *rqst) { if (rqst && rqst->rq_iov) - cifs_small_buf_release(rqst->rq_iov[0].iov_base); /* request */ + cifs_buf_release(rqst->rq_iov[0].iov_base); /* request */ } static int @@ -3784,12 +3923,28 @@ void smb2_reconnect_server(struct work_struct *work) int rc; bool resched = false; + /* first check if ref count has reached 0, if not inc ref count */ + spin_lock(&cifs_tcp_ses_lock); + if (!server->srv_count) { + spin_unlock(&cifs_tcp_ses_lock); + return; + } + server->srv_count++; + spin_unlock(&cifs_tcp_ses_lock); + /* If server is a channel, select the primary channel */ - pserver = CIFS_SERVER_IS_CHAN(server) ? server->primary_server : server; + pserver = SERVER_IS_CHAN(server) ? server->primary_server : server; /* Prevent simultaneous reconnects that can corrupt tcon->rlist list */ mutex_lock(&pserver->reconnect_mutex); + /* if the server is marked for termination, drop the ref count here */ + if (server->terminate) { + cifs_put_tcp_session(server, true); + mutex_unlock(&pserver->reconnect_mutex); + return; + } + INIT_LIST_HEAD(&tmp_list); INIT_LIST_HEAD(&tmp_ses_list); cifs_dbg(FYI, "Reconnecting tcons and channels\n"); @@ -3834,17 +3989,10 @@ void smb2_reconnect_server(struct work_struct *work) } spin_unlock(&ses->chan_lock); } - /* - * Get the reference to server struct to be sure that the last call of - * cifs_put_tcon() in the loop below won't release the server pointer. - */ - if (tcon_exist || ses_exist) - server->srv_count++; - spin_unlock(&cifs_tcp_ses_lock); list_for_each_entry_safe(tcon, tcon2, &tmp_list, rlist) { - rc = smb2_reconnect(SMB2_INTERNAL_CMD, tcon, server); + rc = smb2_reconnect(SMB2_INTERNAL_CMD, tcon, server, true); if (!rc) cifs_reopen_persistent_handles(tcon); else @@ -3860,7 +4008,7 @@ void smb2_reconnect_server(struct work_struct *work) goto done; /* allocate a dummy tcon struct used for reconnect */ - tcon = tconInfoAlloc(); + tcon = tcon_info_alloc(false); if (!tcon) { resched = true; list_for_each_entry_safe(ses, ses2, &tmp_ses_list, rlist) { @@ -3877,7 +4025,7 @@ void smb2_reconnect_server(struct work_struct *work) /* now reconnect sessions for necessary channels */ list_for_each_entry_safe(ses, ses2, &tmp_ses_list, rlist) { tcon->ses = ses; - rc = smb2_reconnect(SMB2_INTERNAL_CMD, tcon, server); + rc = smb2_reconnect(SMB2_INTERNAL_CMD, tcon, server, true); if (rc) resched = true; list_del_init(&ses->rlist); @@ -3892,8 +4040,7 @@ done: mutex_unlock(&pserver->reconnect_mutex); /* now we can safely release srv struct */ - if (tcon_exist || ses_exist) - cifs_put_tcp_session(server, 1); + cifs_put_tcp_session(server, true); } int @@ -5355,6 +5502,11 @@ build_qfs_info_req(struct kvec *iov, struct cifs_tcon *tcon, return 0; } +static inline void free_qfs_info_req(struct kvec *iov) +{ + cifs_buf_release(iov->iov_base); +} + int SMB311_posix_qfs_info(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid, u64 volatile_fid, struct kstatfs *fsdata) @@ -5386,7 +5538,7 @@ SMB311_posix_qfs_info(const unsigned int xid, struct cifs_tcon *tcon, rc = cifs_send_recv(xid, ses, server, &rqst, &resp_buftype, flags, &rsp_iov); - cifs_small_buf_release(iov.iov_base); + free_qfs_info_req(&iov); if (rc) { cifs_stats_fail_inc(tcon, SMB2_QUERY_INFO_HE); goto posix_qfsinf_exit; @@ -5437,7 +5589,7 @@ SMB2_QFS_info(const unsigned int xid, struct cifs_tcon *tcon, rc = cifs_send_recv(xid, ses, server, &rqst, &resp_buftype, flags, &rsp_iov); - cifs_small_buf_release(iov.iov_base); + free_qfs_info_req(&iov); if (rc) { cifs_stats_fail_inc(tcon, SMB2_QUERY_INFO_HE); goto qfsinf_exit; @@ -5504,7 +5656,7 @@ SMB2_QFS_attr(const unsigned int xid, struct cifs_tcon *tcon, rc = cifs_send_recv(xid, ses, server, &rqst, &resp_buftype, flags, &rsp_iov); - cifs_small_buf_release(iov.iov_base); + free_qfs_info_req(&iov); if (rc) { cifs_stats_fail_inc(tcon, SMB2_QUERY_INFO_HE); goto qfsattr_exit; diff --git a/fs/smb/client/smb2pdu.h b/fs/smb/client/smb2pdu.h index 220994d0a0f7..db08194484e0 100644 --- a/fs/smb/client/smb2pdu.h +++ b/fs/smb/client/smb2pdu.h @@ -319,13 +319,15 @@ struct smb2_file_reparse_point_info { } __packed; struct smb2_file_network_open_info { - __le64 CreationTime; - __le64 LastAccessTime; - __le64 LastWriteTime; - __le64 ChangeTime; - __le64 AllocationSize; - __le64 EndOfFile; - __le32 Attributes; + struct_group(network_open_info, + __le64 CreationTime; + __le64 LastAccessTime; + __le64 LastWriteTime; + __le64 ChangeTime; + __le64 AllocationSize; + __le64 EndOfFile; + __le32 Attributes; + ); __le32 Reserved; } __packed; /* level 34 Query also similar returned in close rsp and open rsp */ diff --git a/fs/smb/client/smb2proto.h b/fs/smb/client/smb2proto.h index d5d7ffb7711c..0e371f7e2854 100644 --- a/fs/smb/client/smb2proto.h +++ b/fs/smb/client/smb2proto.h @@ -56,9 +56,11 @@ extern int smb3_handle_read_data(struct TCP_Server_Info *server, extern int smb2_query_reparse_tag(const unsigned int xid, struct cifs_tcon *tcon, struct cifs_sb_info *cifs_sb, const char *path, __u32 *reparse_tag); -int smb2_query_path_info(const unsigned int xid, struct cifs_tcon *tcon, - struct cifs_sb_info *cifs_sb, const char *full_path, - struct cifs_open_info_data *data, bool *adjust_tz, bool *reparse); +int smb2_query_path_info(const unsigned int xid, + struct cifs_tcon *tcon, + struct cifs_sb_info *cifs_sb, + const char *full_path, + struct cifs_open_info_data *data); extern int smb2_set_path_size(const unsigned int xid, struct cifs_tcon *tcon, const char *full_path, __u64 size, struct cifs_sb_info *cifs_sb, bool set_alloc); @@ -249,11 +251,13 @@ extern int smb3_validate_negotiate(const unsigned int, struct cifs_tcon *); extern enum securityEnum smb2_select_sectype(struct TCP_Server_Info *, enum securityEnum); -extern void smb2_parse_contexts(struct TCP_Server_Info *server, - struct smb2_create_rsp *rsp, - unsigned int *epoch, char *lease_key, - __u8 *oplock, struct smb2_file_all_info *buf, - struct create_posix_rsp *posix); +int smb2_parse_contexts(struct TCP_Server_Info *server, + struct kvec *rsp_iov, + unsigned int *epoch, + char *lease_key, __u8 *oplock, + struct smb2_file_all_info *buf, + struct create_posix_rsp *posix); + extern int smb3_encryption_required(const struct cifs_tcon *tcon); extern int smb2_validate_iov(unsigned int offset, unsigned int buffer_length, struct kvec *iov, unsigned int min_buf_size); @@ -275,12 +279,13 @@ extern int smb2_query_info_compound(const unsigned int xid, struct kvec *rsp, int *buftype, struct cifs_sb_info *cifs_sb); /* query path info from the server using SMB311 POSIX extensions*/ -int smb311_posix_query_path_info(const unsigned int xid, struct cifs_tcon *tcon, - struct cifs_sb_info *cifs_sb, const char *full_path, +int smb311_posix_query_path_info(const unsigned int xid, + struct cifs_tcon *tcon, + struct cifs_sb_info *cifs_sb, + const char *full_path, struct cifs_open_info_data *data, struct cifs_sid *owner, - struct cifs_sid *group, - bool *adjust_tz, bool *reparse); + struct cifs_sid *group); int posix_info_parse(const void *beg, const void *end, struct smb2_posix_info_parsed *out); int posix_info_sid_size(const void *beg, const void *end); diff --git a/fs/smb/client/smb2transport.c b/fs/smb/client/smb2transport.c index 7676091b3e77..5a3ca62d2f07 100644 --- a/fs/smb/client/smb2transport.c +++ b/fs/smb/client/smb2transport.c @@ -86,7 +86,7 @@ int smb2_get_sign_key(__u64 ses_id, struct TCP_Server_Info *server, u8 *key) spin_lock(&cifs_tcp_ses_lock); /* If server is a channel, select the primary channel */ - pserver = CIFS_SERVER_IS_CHAN(server) ? server->primary_server : server; + pserver = SERVER_IS_CHAN(server) ? server->primary_server : server; list_for_each_entry(ses, &pserver->smb_ses_list, smb_ses_list) { if (ses->Suid == ses_id) @@ -149,7 +149,7 @@ smb2_find_smb_ses_unlocked(struct TCP_Server_Info *server, __u64 ses_id) struct cifs_ses *ses; /* If server is a channel, select the primary channel */ - pserver = CIFS_SERVER_IS_CHAN(server) ? server->primary_server : server; + pserver = SERVER_IS_CHAN(server) ? server->primary_server : server; list_for_each_entry(ses, &pserver->smb_ses_list, smb_ses_list) { if (ses->Suid != ses_id) @@ -413,7 +413,13 @@ generate_smb3signingkey(struct cifs_ses *ses, ses->ses_status == SES_GOOD); chan_index = cifs_ses_get_chan_index(ses, server); - /* TODO: introduce ref counting for channels when the can be freed */ + if (chan_index == CIFS_INVAL_CHAN_INDEX) { + spin_unlock(&ses->chan_lock); + spin_unlock(&ses->ses_lock); + + return -EINVAL; + } + spin_unlock(&ses->chan_lock); spin_unlock(&ses->ses_lock); @@ -452,6 +458,8 @@ generate_smb3signingkey(struct cifs_ses *ses, ptriplet->encryption.context, ses->smb3encryptionkey, SMB3_ENC_DEC_KEY_SIZE); + if (rc) + return rc; rc = generate_key(ses, ptriplet->decryption.label, ptriplet->decryption.context, ses->smb3decryptionkey, @@ -460,9 +468,6 @@ generate_smb3signingkey(struct cifs_ses *ses, return rc; } - if (rc) - return rc; - #ifdef CONFIG_CIFS_DEBUG_DUMP_KEYS cifs_dbg(VFS, "%s: dumping generated AES session keys\n", __func__); /* diff --git a/fs/smb/client/smbdirect.c b/fs/smb/client/smbdirect.c index 2a2aec8c6112..94df9eec3d8d 100644 --- a/fs/smb/client/smbdirect.c +++ b/fs/smb/client/smbdirect.c @@ -1401,10 +1401,13 @@ create_conn: server->smbd_conn = smbd_get_connection( server, (struct sockaddr *) &server->dstaddr); - if (server->smbd_conn) + if (server->smbd_conn) { cifs_dbg(VFS, "RDMA transport re-established\n"); - - return server->smbd_conn ? 0 : -ENOENT; + trace_smb3_smbd_connect_done(server->hostname, server->conn_id, &server->dstaddr); + return 0; + } + trace_smb3_smbd_connect_err(server->hostname, server->conn_id, &server->dstaddr); + return -ENOENT; } static void destroy_caches_and_workqueue(struct smbd_connection *info) diff --git a/fs/smb/client/trace.h b/fs/smb/client/trace.h index e671bd16f00c..de199ec9f726 100644 --- a/fs/smb/client/trace.h +++ b/fs/smb/client/trace.h @@ -691,7 +691,7 @@ DEFINE_EVENT(smb3_tcon_class, smb3_##name, \ TP_ARGS(xid, tid, sesid, unc_name, rc)) DEFINE_SMB3_TCON_EVENT(tcon); - +DEFINE_SMB3_TCON_EVENT(qfs_done); /* * For smb2/smb3 open (including create and mkdir) calls @@ -935,6 +935,8 @@ DEFINE_EVENT(smb3_connect_class, smb3_##name, \ TP_ARGS(hostname, conn_id, addr)) DEFINE_SMB3_CONNECT_EVENT(connect_done); +DEFINE_SMB3_CONNECT_EVENT(smbd_connect_done); +DEFINE_SMB3_CONNECT_EVENT(smbd_connect_err); DECLARE_EVENT_CLASS(smb3_connect_err_class, TP_PROTO(char *hostname, __u64 conn_id, diff --git a/fs/smb/client/transport.c b/fs/smb/client/transport.c index f280502a2aee..4f717ad7c21b 100644 --- a/fs/smb/client/transport.c +++ b/fs/smb/client/transport.c @@ -18,7 +18,7 @@ #include <linux/bvec.h> #include <linux/highmem.h> #include <linux/uaccess.h> -#include <asm/processor.h> +#include <linux/processor.h> #include <linux/mempool.h> #include <linux/sched/signal.h> #include <linux/task_io_accounting_ops.h> @@ -35,6 +35,8 @@ void cifs_wake_up_task(struct mid_q_entry *mid) { + if (mid->mid_state == MID_RESPONSE_RECEIVED) + mid->mid_state = MID_RESPONSE_READY; wake_up_process(mid->callback_data); } @@ -74,7 +76,7 @@ alloc_mid(const struct smb_hdr *smb_buffer, struct TCP_Server_Info *server) return temp; } -static void __release_mid(struct kref *refcount) +void __release_mid(struct kref *refcount) { struct mid_q_entry *midEntry = container_of(refcount, struct mid_q_entry, refcount); @@ -87,7 +89,8 @@ static void __release_mid(struct kref *refcount) struct TCP_Server_Info *server = midEntry->server; if (midEntry->resp_buf && (midEntry->mid_flags & MID_WAIT_CANCELLED) && - midEntry->mid_state == MID_RESPONSE_RECEIVED && + (midEntry->mid_state == MID_RESPONSE_RECEIVED || + midEntry->mid_state == MID_RESPONSE_READY) && server->ops->handle_cancelled_mid) server->ops->handle_cancelled_mid(midEntry, server); @@ -153,15 +156,6 @@ static void __release_mid(struct kref *refcount) mempool_free(midEntry, cifs_mid_poolp); } -void release_mid(struct mid_q_entry *mid) -{ - struct TCP_Server_Info *server = mid->server; - - spin_lock(&server->mid_lock); - kref_put(&mid->refcount, __release_mid); - spin_unlock(&server->mid_lock); -} - void delete_mid(struct mid_q_entry *mid) { @@ -416,13 +410,19 @@ out: return rc; } +struct send_req_vars { + struct smb2_transform_hdr tr_hdr; + struct smb_rqst rqst[MAX_COMPOUND]; + struct kvec iov; +}; + static int smb_send_rqst(struct TCP_Server_Info *server, int num_rqst, struct smb_rqst *rqst, int flags) { - struct kvec iov; - struct smb2_transform_hdr *tr_hdr; - struct smb_rqst cur_rqst[MAX_COMPOUND]; + struct send_req_vars *vars; + struct smb_rqst *cur_rqst; + struct kvec *iov; int rc; if (!(flags & CIFS_TRANSFORM_REQ)) @@ -436,16 +436,15 @@ smb_send_rqst(struct TCP_Server_Info *server, int num_rqst, return -EIO; } - tr_hdr = kzalloc(sizeof(*tr_hdr), GFP_NOFS); - if (!tr_hdr) + vars = kzalloc(sizeof(*vars), GFP_NOFS); + if (!vars) return -ENOMEM; + cur_rqst = vars->rqst; + iov = &vars->iov; - memset(&cur_rqst[0], 0, sizeof(cur_rqst)); - memset(&iov, 0, sizeof(iov)); - - iov.iov_base = tr_hdr; - iov.iov_len = sizeof(*tr_hdr); - cur_rqst[0].rq_iov = &iov; + iov->iov_base = &vars->tr_hdr; + iov->iov_len = sizeof(vars->tr_hdr); + cur_rqst[0].rq_iov = iov; cur_rqst[0].rq_nvec = 1; rc = server->ops->init_transform_rq(server, num_rqst + 1, @@ -456,7 +455,7 @@ smb_send_rqst(struct TCP_Server_Info *server, int num_rqst, rc = __smb_send_rqst(server, num_rqst + 1, &cur_rqst[0]); smb3_free_compound_rqst(num_rqst, &cur_rqst[1]); out: - kfree(tr_hdr); + kfree(vars); return rc; } @@ -732,7 +731,8 @@ wait_for_response(struct TCP_Server_Info *server, struct mid_q_entry *midQ) int error; error = wait_event_state(server->response_q, - midQ->mid_state != MID_REQUEST_SUBMITTED, + midQ->mid_state != MID_REQUEST_SUBMITTED && + midQ->mid_state != MID_RESPONSE_RECEIVED, (TASK_KILLABLE|TASK_FREEZABLE_UNSAFE)); if (error < 0) return -ERESTARTSYS; @@ -885,7 +885,7 @@ cifs_sync_mid_result(struct mid_q_entry *mid, struct TCP_Server_Info *server) spin_lock(&server->mid_lock); switch (mid->mid_state) { - case MID_RESPONSE_RECEIVED: + case MID_RESPONSE_READY: spin_unlock(&server->mid_lock); return rc; case MID_RETRY_NEEDED: @@ -984,6 +984,9 @@ cifs_compound_callback(struct mid_q_entry *mid) credits.instance = server->reconnect_instance; add_credits(server, &credits, mid->optype); + + if (mid->mid_state == MID_RESPONSE_RECEIVED) + mid->mid_state = MID_RESPONSE_READY; } static void @@ -1020,7 +1023,7 @@ struct TCP_Server_Info *cifs_pick_channel(struct cifs_ses *ses) spin_lock(&ses->chan_lock); for (i = 0; i < ses->chan_count; i++) { server = ses->chans[i].server; - if (!server) + if (!server || server->terminate) continue; /* @@ -1204,7 +1207,8 @@ compound_send_recv(const unsigned int xid, struct cifs_ses *ses, send_cancel(server, &rqst[i], midQ[i]); spin_lock(&server->mid_lock); midQ[i]->mid_flags |= MID_WAIT_CANCELLED; - if (midQ[i]->mid_state == MID_REQUEST_SUBMITTED) { + if (midQ[i]->mid_state == MID_REQUEST_SUBMITTED || + midQ[i]->mid_state == MID_RESPONSE_RECEIVED) { midQ[i]->callback = cifs_cancelled_callback; cancelled_mid[i] = true; credits[i].value = 0; @@ -1225,7 +1229,7 @@ compound_send_recv(const unsigned int xid, struct cifs_ses *ses, } if (!midQ[i]->resp_buf || - midQ[i]->mid_state != MID_RESPONSE_RECEIVED) { + midQ[i]->mid_state != MID_RESPONSE_READY) { rc = -EIO; cifs_dbg(FYI, "Bad MID state?\n"); goto out; @@ -1412,7 +1416,8 @@ SendReceive(const unsigned int xid, struct cifs_ses *ses, if (rc != 0) { send_cancel(server, &rqst, midQ); spin_lock(&server->mid_lock); - if (midQ->mid_state == MID_REQUEST_SUBMITTED) { + if (midQ->mid_state == MID_REQUEST_SUBMITTED || + midQ->mid_state == MID_RESPONSE_RECEIVED) { /* no longer considered to be "in-flight" */ midQ->callback = release_mid; spin_unlock(&server->mid_lock); @@ -1429,7 +1434,7 @@ SendReceive(const unsigned int xid, struct cifs_ses *ses, } if (!midQ->resp_buf || !out_buf || - midQ->mid_state != MID_RESPONSE_RECEIVED) { + midQ->mid_state != MID_RESPONSE_READY) { rc = -EIO; cifs_server_dbg(VFS, "Bad MID state?\n"); goto out; @@ -1553,14 +1558,16 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifs_tcon *tcon, /* Wait for a reply - allow signals to interrupt. */ rc = wait_event_interruptible(server->response_q, - (!(midQ->mid_state == MID_REQUEST_SUBMITTED)) || + (!(midQ->mid_state == MID_REQUEST_SUBMITTED || + midQ->mid_state == MID_RESPONSE_RECEIVED)) || ((server->tcpStatus != CifsGood) && (server->tcpStatus != CifsNew))); /* Were we interrupted by a signal ? */ spin_lock(&server->srv_lock); if ((rc == -ERESTARTSYS) && - (midQ->mid_state == MID_REQUEST_SUBMITTED) && + (midQ->mid_state == MID_REQUEST_SUBMITTED || + midQ->mid_state == MID_RESPONSE_RECEIVED) && ((server->tcpStatus == CifsGood) || (server->tcpStatus == CifsNew))) { spin_unlock(&server->srv_lock); @@ -1591,7 +1598,8 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifs_tcon *tcon, if (rc) { send_cancel(server, &rqst, midQ); spin_lock(&server->mid_lock); - if (midQ->mid_state == MID_REQUEST_SUBMITTED) { + if (midQ->mid_state == MID_REQUEST_SUBMITTED || + midQ->mid_state == MID_RESPONSE_RECEIVED) { /* no longer considered to be "in-flight" */ midQ->callback = release_mid; spin_unlock(&server->mid_lock); @@ -1611,7 +1619,7 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifs_tcon *tcon, return rc; /* rcvd frame is ok */ - if (out_buf == NULL || midQ->mid_state != MID_RESPONSE_RECEIVED) { + if (out_buf == NULL || midQ->mid_state != MID_RESPONSE_READY) { rc = -EIO; cifs_tcon_dbg(VFS, "Bad MID state?\n"); goto out; diff --git a/fs/smb/client/xattr.c b/fs/smb/client/xattr.c index 4ad5531686d8..6780aa3e98a1 100644 --- a/fs/smb/client/xattr.c +++ b/fs/smb/client/xattr.c @@ -150,10 +150,13 @@ static int cifs_xattr_set(const struct xattr_handler *handler, if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_XATTR) goto out; - if (pTcon->ses->server->ops->set_EA) + if (pTcon->ses->server->ops->set_EA) { rc = pTcon->ses->server->ops->set_EA(xid, pTcon, full_path, name, value, (__u16)size, cifs_sb->local_nls, cifs_sb); + if (rc == 0) + inode_set_ctime_current(inode); + } break; case XATTR_CIFS_ACL: @@ -478,7 +481,7 @@ static const struct xattr_handler smb3_ntsd_full_xattr_handler = { .set = cifs_xattr_set, }; -const struct xattr_handler *cifs_xattr_handlers[] = { +const struct xattr_handler * const cifs_xattr_handlers[] = { &cifs_user_xattr_handler, &cifs_os2_xattr_handler, &cifs_cifs_acl_xattr_handler, diff --git a/fs/smb/common/smb2pdu.h b/fs/smb/common/smb2pdu.h index bae590eec871..57f2343164a3 100644 --- a/fs/smb/common/smb2pdu.h +++ b/fs/smb/common/smb2pdu.h @@ -34,6 +34,7 @@ #define SMB2_QUERY_INFO_HE 0x0010 #define SMB2_SET_INFO_HE 0x0011 #define SMB2_OPLOCK_BREAK_HE 0x0012 +#define SMB2_SERVER_TO_CLIENT_NOTIFICATION 0x0013 /* The same list in little endian */ #define SMB2_NEGOTIATE cpu_to_le16(SMB2_NEGOTIATE_HE) @@ -153,6 +154,28 @@ struct smb2_hdr { __u8 Signature[16]; } __packed; +struct smb3_hdr_req { + __le32 ProtocolId; /* 0xFE 'S' 'M' 'B' */ + __le16 StructureSize; /* 64 */ + __le16 CreditCharge; /* MBZ */ + __le16 ChannelSequence; /* See MS-SMB2 3.2.4.1 and 3.2.7.1 */ + __le16 Reserved; + __le16 Command; + __le16 CreditRequest; /* CreditResponse */ + __le32 Flags; + __le32 NextCommand; + __le64 MessageId; + union { + struct { + __le32 ProcessId; + __le32 TreeId; + } __packed SyncId; + __le64 AsyncId; + } __packed Id; + __le64 SessionId; + __u8 Signature[16]; +} __packed; + struct smb2_pdu { struct smb2_hdr hdr; __le16 StructureSize2; /* size of wct area (varies, request specific) */ @@ -384,11 +407,12 @@ struct smb2_tree_disconnect_rsp { /* Capabilities flags */ #define SMB2_GLOBAL_CAP_DFS 0x00000001 #define SMB2_GLOBAL_CAP_LEASING 0x00000002 /* Resp only New to SMB2.1 */ -#define SMB2_GLOBAL_CAP_LARGE_MTU 0X00000004 /* Resp only New to SMB2.1 */ +#define SMB2_GLOBAL_CAP_LARGE_MTU 0x00000004 /* Resp only New to SMB2.1 */ #define SMB2_GLOBAL_CAP_MULTI_CHANNEL 0x00000008 /* New to SMB3 */ #define SMB2_GLOBAL_CAP_PERSISTENT_HANDLES 0x00000010 /* New to SMB3 */ #define SMB2_GLOBAL_CAP_DIRECTORY_LEASING 0x00000020 /* New to SMB3 */ #define SMB2_GLOBAL_CAP_ENCRYPTION 0x00000040 /* New to SMB3 */ +#define SMB2_GLOBAL_CAP_NOTIFICATIONS 0x00000080 /* New to SMB3.1.1 */ /* Internal types */ #define SMB2_NT_FIND 0x00100000 #define SMB2_LARGE_FILES 0x00200000 @@ -678,13 +702,16 @@ struct smb2_close_rsp { __le16 StructureSize; /* 60 */ __le16 Flags; __le32 Reserved; - __le64 CreationTime; - __le64 LastAccessTime; - __le64 LastWriteTime; - __le64 ChangeTime; - __le64 AllocationSize; /* Beginning of FILE_STANDARD_INFO equivalent */ - __le64 EndOfFile; - __le32 Attributes; + struct_group(network_open_info, + __le64 CreationTime; + __le64 LastAccessTime; + __le64 LastWriteTime; + __le64 ChangeTime; + /* Beginning of FILE_STANDARD_INFO equivalent */ + __le64 AllocationSize; + __le64 EndOfFile; + __le32 Attributes; + ); } __packed; @@ -959,6 +986,19 @@ struct smb2_change_notify_rsp { __u8 Buffer[]; /* array of file notify structs */ } __packed; +/* + * SMB2_SERVER_TO_CLIENT_NOTIFICATION: See MS-SMB2 section 2.2.44 + */ + +#define SMB2_NOTIFY_SESSION_CLOSED 0x0000 + +struct smb2_server_client_notification { + struct smb2_hdr hdr; + __le16 StructureSize; + __u16 Reserved; /* MBZ */ + __le32 NotificationType; + __u8 NotificationBuffer[4]; /* MBZ */ +} __packed; /* * SMB2_CREATE See MS-SMB2 section 2.2.13 @@ -1075,16 +1115,23 @@ struct smb2_change_notify_rsp { #define FILE_WRITE_THROUGH_LE cpu_to_le32(0x00000002) #define FILE_SEQUENTIAL_ONLY_LE cpu_to_le32(0x00000004) #define FILE_NO_INTERMEDIATE_BUFFERING_LE cpu_to_le32(0x00000008) +/* FILE_SYNCHRONOUS_IO_ALERT_LE cpu_to_le32(0x00000010) should be zero, ignored */ +/* FILE_SYNCHRONOUS_IO_NONALERT cpu_to_le32(0x00000020) should be zero, ignored */ #define FILE_NON_DIRECTORY_FILE_LE cpu_to_le32(0x00000040) #define FILE_COMPLETE_IF_OPLOCKED_LE cpu_to_le32(0x00000100) #define FILE_NO_EA_KNOWLEDGE_LE cpu_to_le32(0x00000200) +/* FILE_OPEN_REMOTE_INSTANCE cpu_to_le32(0x00000400) should be zero, ignored */ #define FILE_RANDOM_ACCESS_LE cpu_to_le32(0x00000800) -#define FILE_DELETE_ON_CLOSE_LE cpu_to_le32(0x00001000) +#define FILE_DELETE_ON_CLOSE_LE cpu_to_le32(0x00001000) /* MBZ */ #define FILE_OPEN_BY_FILE_ID_LE cpu_to_le32(0x00002000) #define FILE_OPEN_FOR_BACKUP_INTENT_LE cpu_to_le32(0x00004000) #define FILE_NO_COMPRESSION_LE cpu_to_le32(0x00008000) +/* FILE_OPEN_REQUIRING_OPLOCK cpu_to_le32(0x00010000) should be zero, ignored */ +/* FILE_DISALLOW_EXCLUSIVE cpu_to_le32(0x00020000) should be zero, ignored */ +/* FILE_RESERVE_OPFILTER cpu_to_le32(0x00100000) MBZ */ #define FILE_OPEN_REPARSE_POINT_LE cpu_to_le32(0x00200000) #define FILE_OPEN_NO_RECALL_LE cpu_to_le32(0x00400000) +/* #define FILE_OPEN_FOR_FREE_SPACE_QUERY cpu_to_le32(0x00800000) should be zero, ignored */ #define CREATE_OPTIONS_MASK_LE cpu_to_le32(0x00FFFFFF) #define FILE_READ_RIGHTS_LE (FILE_READ_DATA_LE | FILE_READ_EA_LE \ @@ -1098,7 +1145,7 @@ struct smb2_change_notify_rsp { #define SMB2_CREATE_SD_BUFFER "SecD" /* security descriptor */ #define SMB2_CREATE_DURABLE_HANDLE_REQUEST "DHnQ" #define SMB2_CREATE_DURABLE_HANDLE_RECONNECT "DHnC" -#define SMB2_CREATE_ALLOCATION_SIZE "AISi" +#define SMB2_CREATE_ALLOCATION_SIZE "AlSi" #define SMB2_CREATE_QUERY_MAXIMAL_ACCESS_REQUEST "MxAc" #define SMB2_CREATE_TIMEWARP_REQUEST "TWrp" #define SMB2_CREATE_QUERY_ON_DISK_ID "QFid" @@ -1206,6 +1253,7 @@ struct create_mxac_rsp { #define SMB2_LEASE_WRITE_CACHING_LE cpu_to_le32(0x04) #define SMB2_LEASE_FLAG_BREAK_IN_PROGRESS_LE cpu_to_le32(0x02) +#define SMB2_LEASE_FLAG_PARENT_LEASE_KEY_SET_LE cpu_to_le32(0x04) #define SMB2_LEASE_KEY_SIZE 16 diff --git a/fs/smb/server/Kconfig b/fs/smb/server/Kconfig index 7055cb5d2880..cabe6a843c6a 100644 --- a/fs/smb/server/Kconfig +++ b/fs/smb/server/Kconfig @@ -1,10 +1,11 @@ config SMB_SERVER - tristate "SMB3 server support (EXPERIMENTAL)" + tristate "SMB3 server support" depends on INET depends on MULTIUSER depends on FILE_LOCKING select NLS select NLS_UTF8 + select NLS_UCS2_UTILS select CRYPTO select CRYPTO_MD5 select CRYPTO_HMAC diff --git a/fs/smb/server/asn1.c b/fs/smb/server/asn1.c index cc6384f79675..4a4b2b03ff33 100644 --- a/fs/smb/server/asn1.c +++ b/fs/smb/server/asn1.c @@ -214,12 +214,10 @@ static int ksmbd_neg_token_alloc(void *context, size_t hdrlen, { struct ksmbd_conn *conn = context; - conn->mechToken = kmalloc(vlen + 1, GFP_KERNEL); + conn->mechToken = kmemdup_nul(value, vlen, GFP_KERNEL); if (!conn->mechToken) return -ENOMEM; - memcpy(conn->mechToken, value, vlen); - conn->mechToken[vlen] = '\0'; return 0; } diff --git a/fs/smb/server/auth.c b/fs/smb/server/auth.c index 5e5e120edcc2..229a6527870d 100644 --- a/fs/smb/server/auth.c +++ b/fs/smb/server/auth.c @@ -355,6 +355,9 @@ int ksmbd_decode_ntlmssp_auth_blob(struct authenticate_message *authblob, if (blob_len < (u64)sess_key_off + sess_key_len) return -EINVAL; + if (sess_key_len > CIFS_KEY_SIZE) + return -EINVAL; + ctx_arc4 = kmalloc(sizeof(*ctx_arc4), GFP_KERNEL); if (!ctx_arc4) return -ENOMEM; @@ -1029,11 +1032,15 @@ static struct scatterlist *ksmbd_init_sg(struct kvec *iov, unsigned int nvec, { struct scatterlist *sg; unsigned int assoc_data_len = sizeof(struct smb2_transform_hdr) - 20; - int i, nr_entries[3] = {0}, total_entries = 0, sg_idx = 0; + int i, *nr_entries, total_entries = 0, sg_idx = 0; if (!nvec) return NULL; + nr_entries = kcalloc(nvec, sizeof(int), GFP_KERNEL); + if (!nr_entries) + return NULL; + for (i = 0; i < nvec - 1; i++) { unsigned long kaddr = (unsigned long)iov[i + 1].iov_base; @@ -1051,8 +1058,10 @@ static struct scatterlist *ksmbd_init_sg(struct kvec *iov, unsigned int nvec, total_entries += 2; sg = kmalloc_array(total_entries, sizeof(struct scatterlist), GFP_KERNEL); - if (!sg) + if (!sg) { + kfree(nr_entries); return NULL; + } sg_init_table(sg, total_entries); smb2_sg_set_buf(&sg[sg_idx++], iov[0].iov_base + 24, assoc_data_len); @@ -1086,6 +1095,7 @@ static struct scatterlist *ksmbd_init_sg(struct kvec *iov, unsigned int nvec, } } smb2_sg_set_buf(&sg[sg_idx], sign, SMB2_SIGNATURE_SIZE); + kfree(nr_entries); return sg; } diff --git a/fs/smb/server/connection.c b/fs/smb/server/connection.c index 2a717d158f02..b6fa1e285c40 100644 --- a/fs/smb/server/connection.c +++ b/fs/smb/server/connection.c @@ -84,6 +84,8 @@ struct ksmbd_conn *ksmbd_conn_alloc(void) spin_lock_init(&conn->llist_lock); INIT_LIST_HEAD(&conn->lock_list); + init_rwsem(&conn->session_lock); + down_write(&conn_list_lock); list_add(&conn->conns_list, &conn_list); up_write(&conn_list_lock); @@ -123,28 +125,22 @@ void ksmbd_conn_enqueue_request(struct ksmbd_work *work) } } -int ksmbd_conn_try_dequeue_request(struct ksmbd_work *work) +void ksmbd_conn_try_dequeue_request(struct ksmbd_work *work) { struct ksmbd_conn *conn = work->conn; - int ret = 1; if (list_empty(&work->request_entry) && list_empty(&work->async_request_entry)) - return 0; + return; - if (!work->multiRsp) - atomic_dec(&conn->req_running); - if (!work->multiRsp) { - spin_lock(&conn->request_lock); - list_del_init(&work->request_entry); - spin_unlock(&conn->request_lock); - if (work->asynchronous) - release_async_work(work); - ret = 0; - } + atomic_dec(&conn->req_running); + spin_lock(&conn->request_lock); + list_del_init(&work->request_entry); + spin_unlock(&conn->request_lock); + if (work->asynchronous) + release_async_work(work); wake_up_all(&conn->req_running_q); - return ret; } void ksmbd_conn_lock(struct ksmbd_conn *conn) @@ -171,63 +167,31 @@ void ksmbd_all_conn_set_status(u64 sess_id, u32 status) void ksmbd_conn_wait_idle(struct ksmbd_conn *conn, u64 sess_id) { - struct ksmbd_conn *bind_conn; - wait_event(conn->req_running_q, atomic_read(&conn->req_running) < 2); - - down_read(&conn_list_lock); - list_for_each_entry(bind_conn, &conn_list, conns_list) { - if (bind_conn == conn) - continue; - - if ((bind_conn->binding || xa_load(&bind_conn->sessions, sess_id)) && - !ksmbd_conn_releasing(bind_conn) && - atomic_read(&bind_conn->req_running)) { - wait_event(bind_conn->req_running_q, - atomic_read(&bind_conn->req_running) == 0); - } - } - up_read(&conn_list_lock); } int ksmbd_conn_write(struct ksmbd_work *work) { struct ksmbd_conn *conn = work->conn; - size_t len = 0; int sent; - struct kvec iov[3]; - int iov_idx = 0; if (!work->response_buf) { pr_err("NULL response header\n"); return -EINVAL; } - if (work->tr_buf) { - iov[iov_idx] = (struct kvec) { work->tr_buf, - sizeof(struct smb2_transform_hdr) + 4 }; - len += iov[iov_idx++].iov_len; - } + if (work->send_no_response) + return 0; - if (work->aux_payload_sz) { - iov[iov_idx] = (struct kvec) { work->response_buf, work->resp_hdr_sz }; - len += iov[iov_idx++].iov_len; - iov[iov_idx] = (struct kvec) { work->aux_payload_buf, work->aux_payload_sz }; - len += iov[iov_idx++].iov_len; - } else { - if (work->tr_buf) - iov[iov_idx].iov_len = work->resp_hdr_sz; - else - iov[iov_idx].iov_len = get_rfc1002_len(work->response_buf) + 4; - iov[iov_idx].iov_base = work->response_buf; - len += iov[iov_idx++].iov_len; - } + if (!work->iov_idx) + return -EINVAL; ksmbd_conn_lock(conn); - sent = conn->transport->ops->writev(conn->transport, &iov[0], - iov_idx, len, - work->need_invalidate_rkey, - work->remote_key); + sent = conn->transport->ops->writev(conn->transport, work->iov, + work->iov_cnt, + get_rfc1002_len(work->iov[0].iov_base) + 4, + work->need_invalidate_rkey, + work->remote_key); ksmbd_conn_unlock(conn); if (sent < 0) { diff --git a/fs/smb/server/connection.h b/fs/smb/server/connection.h index ad8dfaa48ffb..3c005246a32e 100644 --- a/fs/smb/server/connection.h +++ b/fs/smb/server/connection.h @@ -50,6 +50,7 @@ struct ksmbd_conn { struct nls_table *local_nls; struct unicode_map *um; struct list_head conns_list; + struct rw_semaphore session_lock; /* smb session 1 per user */ struct xarray sessions; unsigned long last_active; @@ -158,7 +159,7 @@ int ksmbd_conn_rdma_write(struct ksmbd_conn *conn, struct smb2_buffer_desc_v1 *desc, unsigned int desc_len); void ksmbd_conn_enqueue_request(struct ksmbd_work *work); -int ksmbd_conn_try_dequeue_request(struct ksmbd_work *work); +void ksmbd_conn_try_dequeue_request(struct ksmbd_work *work); void ksmbd_conn_init_server_callbacks(struct ksmbd_conn_ops *ops); int ksmbd_conn_handler_loop(void *p); int ksmbd_conn_transport_init(void); diff --git a/fs/smb/server/ksmbd_spnego_negtokeninit.asn1 b/fs/smb/server/ksmbd_spnego_negtokeninit.asn1 index 0065f191b54b..001513806fc0 100644 --- a/fs/smb/server/ksmbd_spnego_negtokeninit.asn1 +++ b/fs/smb/server/ksmbd_spnego_negtokeninit.asn1 @@ -1,3 +1,11 @@ +-- SPDX-License-Identifier: BSD-3-Clause +-- +-- Copyright (C) 1998, 2000 IETF Trust and the persons identified as authors +-- of the code +-- +-- https://www.rfc-editor.org/rfc/rfc2478#section-3.2.1 +-- https://www.rfc-editor.org/rfc/rfc2743#section-3.1 + GSSAPI ::= [APPLICATION 0] IMPLICIT SEQUENCE { thisMech diff --git a/fs/smb/server/ksmbd_spnego_negtokentarg.asn1 b/fs/smb/server/ksmbd_spnego_negtokentarg.asn1 index 1151933e7b9c..797e485d57f1 100644 --- a/fs/smb/server/ksmbd_spnego_negtokentarg.asn1 +++ b/fs/smb/server/ksmbd_spnego_negtokentarg.asn1 @@ -1,3 +1,10 @@ +-- SPDX-License-Identifier: BSD-3-Clause +-- +-- Copyright (C) 1998 IETF Trust and the persons identified as authors +-- of the code +-- +-- https://www.rfc-editor.org/rfc/rfc2478#section-3.2.1 + GSSAPI ::= CHOICE { negTokenInit diff --git a/fs/smb/server/ksmbd_work.c b/fs/smb/server/ksmbd_work.c index 14b9caebf7a4..d7c676c151e2 100644 --- a/fs/smb/server/ksmbd_work.c +++ b/fs/smb/server/ksmbd_work.c @@ -27,18 +27,38 @@ struct ksmbd_work *ksmbd_alloc_work_struct(void) INIT_LIST_HEAD(&work->async_request_entry); INIT_LIST_HEAD(&work->fp_entry); INIT_LIST_HEAD(&work->interim_entry); + INIT_LIST_HEAD(&work->aux_read_list); + work->iov_alloc_cnt = 4; + work->iov = kcalloc(work->iov_alloc_cnt, sizeof(struct kvec), + GFP_KERNEL); + if (!work->iov) { + kmem_cache_free(work_cache, work); + work = NULL; + } } return work; } void ksmbd_free_work_struct(struct ksmbd_work *work) { + struct aux_read *ar, *tmp; + WARN_ON(work->saved_cred != NULL); kvfree(work->response_buf); - kvfree(work->aux_payload_buf); + + list_for_each_entry_safe(ar, tmp, &work->aux_read_list, entry) { + kvfree(ar->buf); + list_del(&ar->entry); + kfree(ar); + } + kfree(work->tr_buf); kvfree(work->request_buf); + kfree(work->iov); + if (!list_empty(&work->interim_entry)) + list_del(&work->interim_entry); + if (work->async_id) ksmbd_release_id(&work->conn->async_ida, work->async_id); kmem_cache_free(work_cache, work); @@ -77,3 +97,81 @@ bool ksmbd_queue_work(struct ksmbd_work *work) { return queue_work(ksmbd_wq, &work->work); } + +static inline void __ksmbd_iov_pin(struct ksmbd_work *work, void *ib, + unsigned int ib_len) +{ + work->iov[++work->iov_idx].iov_base = ib; + work->iov[work->iov_idx].iov_len = ib_len; + work->iov_cnt++; +} + +static int __ksmbd_iov_pin_rsp(struct ksmbd_work *work, void *ib, int len, + void *aux_buf, unsigned int aux_size) +{ + struct aux_read *ar = NULL; + int need_iov_cnt = 1; + + if (aux_size) { + need_iov_cnt++; + ar = kmalloc(sizeof(struct aux_read), GFP_KERNEL); + if (!ar) + return -ENOMEM; + } + + if (work->iov_alloc_cnt < work->iov_cnt + need_iov_cnt) { + struct kvec *new; + + work->iov_alloc_cnt += 4; + new = krealloc(work->iov, + sizeof(struct kvec) * work->iov_alloc_cnt, + GFP_KERNEL | __GFP_ZERO); + if (!new) { + kfree(ar); + work->iov_alloc_cnt -= 4; + return -ENOMEM; + } + work->iov = new; + } + + /* Plus rfc_length size on first iov */ + if (!work->iov_idx) { + work->iov[work->iov_idx].iov_base = work->response_buf; + *(__be32 *)work->iov[0].iov_base = 0; + work->iov[work->iov_idx].iov_len = 4; + work->iov_cnt++; + } + + __ksmbd_iov_pin(work, ib, len); + inc_rfc1001_len(work->iov[0].iov_base, len); + + if (aux_size) { + __ksmbd_iov_pin(work, aux_buf, aux_size); + inc_rfc1001_len(work->iov[0].iov_base, aux_size); + + ar->buf = aux_buf; + list_add(&ar->entry, &work->aux_read_list); + } + + return 0; +} + +int ksmbd_iov_pin_rsp(struct ksmbd_work *work, void *ib, int len) +{ + return __ksmbd_iov_pin_rsp(work, ib, len, NULL, 0); +} + +int ksmbd_iov_pin_rsp_read(struct ksmbd_work *work, void *ib, int len, + void *aux_buf, unsigned int aux_size) +{ + return __ksmbd_iov_pin_rsp(work, ib, len, aux_buf, aux_size); +} + +int allocate_interim_rsp_buf(struct ksmbd_work *work) +{ + work->response_buf = kzalloc(MAX_CIFS_SMALL_BUFFER_SIZE, GFP_KERNEL); + if (!work->response_buf) + return -ENOMEM; + work->response_sz = MAX_CIFS_SMALL_BUFFER_SIZE; + return 0; +} diff --git a/fs/smb/server/ksmbd_work.h b/fs/smb/server/ksmbd_work.h index f8ae6144c0ae..8ca2c813246e 100644 --- a/fs/smb/server/ksmbd_work.h +++ b/fs/smb/server/ksmbd_work.h @@ -19,6 +19,11 @@ enum { KSMBD_WORK_CLOSED, }; +struct aux_read { + void *buf; + struct list_head entry; +}; + /* one of these for every pending CIFS request at the connection */ struct ksmbd_work { /* Server corresponding to this mid */ @@ -31,13 +36,19 @@ struct ksmbd_work { /* Response buffer */ void *response_buf; - /* Read data buffer */ - void *aux_payload_buf; + struct list_head aux_read_list; + + struct kvec *iov; + int iov_alloc_cnt; + int iov_cnt; + int iov_idx; /* Next cmd hdr in compound req buf*/ int next_smb2_rcv_hdr_off; /* Next cmd hdr in compound rsp buf*/ int next_smb2_rsp_hdr_off; + /* Current cmd hdr in compound rsp buf*/ + int curr_smb2_rsp_hdr_off; /* * Current Local FID assigned compound response if SMB2 CREATE @@ -53,16 +64,11 @@ struct ksmbd_work { unsigned int credits_granted; /* response smb header size */ - unsigned int resp_hdr_sz; unsigned int response_sz; - /* Read data count */ - unsigned int aux_payload_sz; void *tr_buf; unsigned char state; - /* Multiple responses for one request e.g. SMB ECHO */ - bool multiRsp:1; /* No response for cancelled request */ bool send_no_response:1; /* Request is encrypted */ @@ -96,6 +102,15 @@ static inline void *ksmbd_resp_buf_next(struct ksmbd_work *work) } /** + * ksmbd_resp_buf_curr - Get current buffer on compound response. + * @work: smb work containing response buffer + */ +static inline void *ksmbd_resp_buf_curr(struct ksmbd_work *work) +{ + return work->response_buf + work->curr_smb2_rsp_hdr_off + 4; +} + +/** * ksmbd_req_buf_next - Get next buffer on compound request. * @work: smb work containing response buffer */ @@ -113,5 +128,8 @@ int ksmbd_work_pool_init(void); int ksmbd_workqueue_init(void); void ksmbd_workqueue_destroy(void); bool ksmbd_queue_work(struct ksmbd_work *work); - +int ksmbd_iov_pin_rsp_read(struct ksmbd_work *work, void *ib, int len, + void *aux_buf, unsigned int aux_size); +int ksmbd_iov_pin_rsp(struct ksmbd_work *work, void *ib, int len); +int allocate_interim_rsp_buf(struct ksmbd_work *work); #endif /* __KSMBD_WORK_H__ */ diff --git a/fs/smb/server/mgmt/share_config.h b/fs/smb/server/mgmt/share_config.h index 3fd338293942..5f591751b923 100644 --- a/fs/smb/server/mgmt/share_config.h +++ b/fs/smb/server/mgmt/share_config.h @@ -34,29 +34,22 @@ struct ksmbd_share_config { #define KSMBD_SHARE_INVALID_UID ((__u16)-1) #define KSMBD_SHARE_INVALID_GID ((__u16)-1) -static inline int share_config_create_mode(struct ksmbd_share_config *share, - umode_t posix_mode) +static inline umode_t +share_config_create_mode(struct ksmbd_share_config *share, + umode_t posix_mode) { - if (!share->force_create_mode) { - if (!posix_mode) - return share->create_mask; - else - return posix_mode & share->create_mask; - } - return share->force_create_mode & share->create_mask; + umode_t mode = (posix_mode ?: (umode_t)-1) & share->create_mask; + + return mode | share->force_create_mode; } -static inline int share_config_directory_mode(struct ksmbd_share_config *share, - umode_t posix_mode) +static inline umode_t +share_config_directory_mode(struct ksmbd_share_config *share, + umode_t posix_mode) { - if (!share->force_directory_mode) { - if (!posix_mode) - return share->directory_mask; - else - return posix_mode & share->directory_mask; - } + umode_t mode = (posix_mode ?: (umode_t)-1) & share->directory_mask; - return share->force_directory_mode & share->directory_mask; + return mode | share->force_directory_mode; } static inline int test_share_config_flag(struct ksmbd_share_config *share, diff --git a/fs/smb/server/mgmt/tree_connect.c b/fs/smb/server/mgmt/tree_connect.c index 408cddf2f094..d2c81a8a11dd 100644 --- a/fs/smb/server/mgmt/tree_connect.c +++ b/fs/smb/server/mgmt/tree_connect.c @@ -73,7 +73,10 @@ ksmbd_tree_conn_connect(struct ksmbd_conn *conn, struct ksmbd_session *sess, tree_conn->user = sess->user; tree_conn->share_conf = sc; + tree_conn->t_state = TREE_NEW; status.tree_conn = tree_conn; + atomic_set(&tree_conn->refcount, 1); + init_waitqueue_head(&tree_conn->refcount_q); ret = xa_err(xa_store(&sess->tree_conns, tree_conn->id, tree_conn, GFP_KERNEL)); @@ -93,14 +96,33 @@ out_error: return status; } +void ksmbd_tree_connect_put(struct ksmbd_tree_connect *tcon) +{ + /* + * Checking waitqueue to releasing tree connect on + * tree disconnect. waitqueue_active is safe because it + * uses atomic operation for condition. + */ + if (!atomic_dec_return(&tcon->refcount) && + waitqueue_active(&tcon->refcount_q)) + wake_up(&tcon->refcount_q); +} + int ksmbd_tree_conn_disconnect(struct ksmbd_session *sess, struct ksmbd_tree_connect *tree_conn) { int ret; + write_lock(&sess->tree_conns_lock); + xa_erase(&sess->tree_conns, tree_conn->id); + write_unlock(&sess->tree_conns_lock); + + if (!atomic_dec_and_test(&tree_conn->refcount)) + wait_event(tree_conn->refcount_q, + atomic_read(&tree_conn->refcount) == 0); + ret = ksmbd_ipc_tree_disconnect_request(sess->id, tree_conn->id); ksmbd_release_tree_conn_id(sess, tree_conn->id); - xa_erase(&sess->tree_conns, tree_conn->id); ksmbd_share_config_put(tree_conn->share_conf); kfree(tree_conn); return ret; @@ -111,11 +133,15 @@ struct ksmbd_tree_connect *ksmbd_tree_conn_lookup(struct ksmbd_session *sess, { struct ksmbd_tree_connect *tcon; + read_lock(&sess->tree_conns_lock); tcon = xa_load(&sess->tree_conns, id); if (tcon) { - if (test_bit(TREE_CONN_EXPIRE, &tcon->status)) + if (tcon->t_state != TREE_CONNECTED) + tcon = NULL; + else if (!atomic_inc_not_zero(&tcon->refcount)) tcon = NULL; } + read_unlock(&sess->tree_conns_lock); return tcon; } @@ -129,8 +155,18 @@ int ksmbd_tree_conn_session_logoff(struct ksmbd_session *sess) if (!sess) return -EINVAL; - xa_for_each(&sess->tree_conns, id, tc) + xa_for_each(&sess->tree_conns, id, tc) { + write_lock(&sess->tree_conns_lock); + if (tc->t_state == TREE_DISCONNECTED) { + write_unlock(&sess->tree_conns_lock); + ret = -ENOENT; + continue; + } + tc->t_state = TREE_DISCONNECTED; + write_unlock(&sess->tree_conns_lock); + ret |= ksmbd_tree_conn_disconnect(sess, tc); + } xa_destroy(&sess->tree_conns); return ret; } diff --git a/fs/smb/server/mgmt/tree_connect.h b/fs/smb/server/mgmt/tree_connect.h index 562d647ad9fa..6377a70b811c 100644 --- a/fs/smb/server/mgmt/tree_connect.h +++ b/fs/smb/server/mgmt/tree_connect.h @@ -14,7 +14,11 @@ struct ksmbd_share_config; struct ksmbd_user; struct ksmbd_conn; -#define TREE_CONN_EXPIRE 1 +enum { + TREE_NEW = 0, + TREE_CONNECTED, + TREE_DISCONNECTED +}; struct ksmbd_tree_connect { int id; @@ -27,7 +31,9 @@ struct ksmbd_tree_connect { int maximal_access; bool posix_extensions; - unsigned long status; + atomic_t refcount; + wait_queue_head_t refcount_q; + unsigned int t_state; }; struct ksmbd_tree_conn_status { @@ -46,6 +52,7 @@ struct ksmbd_session; struct ksmbd_tree_conn_status ksmbd_tree_conn_connect(struct ksmbd_conn *conn, struct ksmbd_session *sess, const char *share_name); +void ksmbd_tree_connect_put(struct ksmbd_tree_connect *tcon); int ksmbd_tree_conn_disconnect(struct ksmbd_session *sess, struct ksmbd_tree_connect *tree_conn); diff --git a/fs/smb/server/mgmt/user_config.h b/fs/smb/server/mgmt/user_config.h index 6a44109617f1..e068a19fd904 100644 --- a/fs/smb/server/mgmt/user_config.h +++ b/fs/smb/server/mgmt/user_config.h @@ -18,7 +18,6 @@ struct ksmbd_user { size_t passkey_sz; char *passkey; - unsigned int failed_login_count; }; static inline bool user_guest(struct ksmbd_user *user) diff --git a/fs/smb/server/mgmt/user_session.c b/fs/smb/server/mgmt/user_session.c index 8a5dcab05614..15f68ee05089 100644 --- a/fs/smb/server/mgmt/user_session.c +++ b/fs/smb/server/mgmt/user_session.c @@ -174,7 +174,7 @@ static void ksmbd_expire_session(struct ksmbd_conn *conn) unsigned long id; struct ksmbd_session *sess; - down_write(&sessions_table_lock); + down_write(&conn->session_lock); xa_for_each(&conn->sessions, id, sess) { if (sess->state != SMB2_SESSION_VALID || time_after(jiffies, @@ -185,7 +185,7 @@ static void ksmbd_expire_session(struct ksmbd_conn *conn) continue; } } - up_write(&sessions_table_lock); + up_write(&conn->session_lock); } int ksmbd_session_register(struct ksmbd_conn *conn, @@ -227,7 +227,9 @@ void ksmbd_sessions_deregister(struct ksmbd_conn *conn) } } } + up_write(&sessions_table_lock); + down_write(&conn->session_lock); xa_for_each(&conn->sessions, id, sess) { unsigned long chann_id; struct channel *chann; @@ -244,7 +246,7 @@ void ksmbd_sessions_deregister(struct ksmbd_conn *conn) ksmbd_session_destroy(sess); } } - up_write(&sessions_table_lock); + up_write(&conn->session_lock); } struct ksmbd_session *ksmbd_session_lookup(struct ksmbd_conn *conn, @@ -252,9 +254,11 @@ struct ksmbd_session *ksmbd_session_lookup(struct ksmbd_conn *conn, { struct ksmbd_session *sess; + down_read(&conn->session_lock); sess = xa_load(&conn->sessions, id); if (sess) sess->last_active = jiffies; + up_read(&conn->session_lock); return sess; } @@ -351,6 +355,7 @@ static struct ksmbd_session *__session_create(int protocol) xa_init(&sess->ksmbd_chann_list); xa_init(&sess->rpc_handle_list); sess->sequence_number = 1; + rwlock_init(&sess->tree_conns_lock); ret = __init_smb2_session(sess); if (ret) diff --git a/fs/smb/server/mgmt/user_session.h b/fs/smb/server/mgmt/user_session.h index f99d475b28db..63cb08fffde8 100644 --- a/fs/smb/server/mgmt/user_session.h +++ b/fs/smb/server/mgmt/user_session.h @@ -60,6 +60,7 @@ struct ksmbd_session { struct ksmbd_file_table file_table; unsigned long last_active; + rwlock_t tree_conns_lock; }; static inline int test_session_flag(struct ksmbd_session *sess, int bit) diff --git a/fs/smb/server/oplock.c b/fs/smb/server/oplock.c index 844b303baf29..562b180459a1 100644 --- a/fs/smb/server/oplock.c +++ b/fs/smb/server/oplock.c @@ -102,9 +102,10 @@ static int alloc_lease(struct oplock_info *opinfo, struct lease_ctx_info *lctx) lease->new_state = 0; lease->flags = lctx->flags; lease->duration = lctx->duration; + lease->is_dir = lctx->is_dir; memcpy(lease->parent_lease_key, lctx->parent_lease_key, SMB2_LEASE_KEY_SIZE); lease->version = lctx->version; - lease->epoch = 0; + lease->epoch = le16_to_cpu(lctx->epoch); INIT_LIST_HEAD(&opinfo->lease_entry); opinfo->o_lease = lease; @@ -395,8 +396,8 @@ void close_id_del_oplock(struct ksmbd_file *fp) { struct oplock_info *opinfo; - if (S_ISDIR(file_inode(fp->filp)->i_mode)) - return; + if (fp->reserve_lease_break) + smb_lazy_parent_lease_break_close(fp); opinfo = opinfo_get(fp); if (!opinfo) @@ -543,12 +544,13 @@ static struct oplock_info *same_client_has_lease(struct ksmbd_inode *ci, /* upgrading lease */ if ((atomic_read(&ci->op_count) + atomic_read(&ci->sop_count)) == 1) { - if (lease->state == - (lctx->req_state & lease->state)) { + if (lease->state != SMB2_LEASE_NONE_LE && + lease->state == (lctx->req_state & lease->state)) { lease->state |= lctx->req_state; if (lctx->req_state & SMB2_LEASE_WRITE_CACHING_LE) lease_read_to_write(opinfo); + } } else if ((atomic_read(&ci->op_count) + atomic_read(&ci->sop_count)) > 1) { @@ -616,15 +618,6 @@ static int oplock_break_pending(struct oplock_info *opinfo, int req_op_level) return 0; } -static inline int allocate_oplock_break_buf(struct ksmbd_work *work) -{ - work->response_buf = kzalloc(MAX_CIFS_SMALL_BUFFER_SIZE, GFP_KERNEL); - if (!work->response_buf) - return -ENOMEM; - work->response_sz = MAX_CIFS_SMALL_BUFFER_SIZE; - return 0; -} - /** * __smb2_oplock_break_noti() - send smb2 oplock break cmd from conn * to client @@ -639,7 +632,6 @@ static void __smb2_oplock_break_noti(struct work_struct *wk) { struct smb2_oplock_break *rsp = NULL; struct ksmbd_work *work = container_of(wk, struct ksmbd_work, work); - struct ksmbd_conn *conn = work->conn; struct oplock_break_info *br_info = work->request_buf; struct smb2_hdr *rsp_hdr; struct ksmbd_file *fp; @@ -648,7 +640,7 @@ static void __smb2_oplock_break_noti(struct work_struct *wk) if (!fp) goto out; - if (allocate_oplock_break_buf(work)) { + if (allocate_interim_rsp_buf(work)) { pr_err("smb2_allocate_rsp_buf failed! "); ksmbd_fd_put(work, fp); goto out; @@ -656,8 +648,6 @@ static void __smb2_oplock_break_noti(struct work_struct *wk) rsp_hdr = smb2_get_msg(work->response_buf); memset(rsp_hdr, 0, sizeof(struct smb2_hdr) + 2); - *(__be32 *)work->response_buf = - cpu_to_be32(conn->vals->header_size); rsp_hdr->ProtocolId = SMB2_PROTO_NUMBER; rsp_hdr->StructureSize = SMB2_HEADER_STRUCTURE_SIZE; rsp_hdr->CreditRequest = cpu_to_le16(0); @@ -684,13 +674,15 @@ static void __smb2_oplock_break_noti(struct work_struct *wk) rsp->PersistentFid = fp->persistent_id; rsp->VolatileFid = fp->volatile_id; - inc_rfc1001_len(work->response_buf, 24); + ksmbd_fd_put(work, fp); + if (ksmbd_iov_pin_rsp(work, (void *)rsp, + sizeof(struct smb2_oplock_break))) + goto out; ksmbd_debug(OPLOCK, "sending oplock break v_id %llu p_id = %llu lock level = %d\n", rsp->VolatileFid, rsp->PersistentFid, rsp->OplockLevel); - ksmbd_fd_put(work, fp); ksmbd_conn_write(work); out: @@ -751,18 +743,15 @@ static void __smb2_lease_break_noti(struct work_struct *wk) struct smb2_lease_break *rsp = NULL; struct ksmbd_work *work = container_of(wk, struct ksmbd_work, work); struct lease_break_info *br_info = work->request_buf; - struct ksmbd_conn *conn = work->conn; struct smb2_hdr *rsp_hdr; - if (allocate_oplock_break_buf(work)) { + if (allocate_interim_rsp_buf(work)) { ksmbd_debug(OPLOCK, "smb2_allocate_rsp_buf failed! "); goto out; } rsp_hdr = smb2_get_msg(work->response_buf); memset(rsp_hdr, 0, sizeof(struct smb2_hdr) + 2); - *(__be32 *)work->response_buf = - cpu_to_be32(conn->vals->header_size); rsp_hdr->ProtocolId = SMB2_PROTO_NUMBER; rsp_hdr->StructureSize = SMB2_HEADER_STRUCTURE_SIZE; rsp_hdr->CreditRequest = cpu_to_le16(0); @@ -791,7 +780,9 @@ static void __smb2_lease_break_noti(struct work_struct *wk) rsp->AccessMaskHint = 0; rsp->ShareMaskHint = 0; - inc_rfc1001_len(work->response_buf, 44); + if (ksmbd_iov_pin_rsp(work, (void *)rsp, + sizeof(struct smb2_lease_break))) + goto out; ksmbd_conn_write(work); @@ -844,7 +835,8 @@ static int smb2_lease_break_noti(struct oplock_info *opinfo) interim_entry); setup_async_work(in_work, NULL, NULL); smb2_send_interim_resp(in_work, STATUS_PENDING); - list_del(&in_work->interim_entry); + list_del_init(&in_work->interim_entry); + release_async_work(in_work); } INIT_WORK(&work->work, __smb2_lease_break_noti); ksmbd_queue_work(work); @@ -910,7 +902,8 @@ static int oplock_break(struct oplock_info *brk_opinfo, int req_op_level) lease->new_state = SMB2_LEASE_READ_CACHING_LE; } else { - if (lease->state & SMB2_LEASE_HANDLE_CACHING_LE) + if (lease->state & SMB2_LEASE_HANDLE_CACHING_LE && + !lease->is_dir) lease->new_state = SMB2_LEASE_READ_CACHING_LE; else @@ -1042,6 +1035,7 @@ static void copy_lease(struct oplock_info *op1, struct oplock_info *op2) SMB2_LEASE_KEY_SIZE); lease2->duration = lease1->duration; lease2->flags = lease1->flags; + lease2->epoch = lease1->epoch++; } static int add_lease_global_list(struct oplock_info *opinfo) @@ -1091,6 +1085,89 @@ static void set_oplock_level(struct oplock_info *opinfo, int level, } } +void smb_send_parent_lease_break_noti(struct ksmbd_file *fp, + struct lease_ctx_info *lctx) +{ + struct oplock_info *opinfo; + struct ksmbd_inode *p_ci = NULL; + + if (lctx->version != 2) + return; + + p_ci = ksmbd_inode_lookup_lock(fp->filp->f_path.dentry->d_parent); + if (!p_ci) + return; + + read_lock(&p_ci->m_lock); + list_for_each_entry(opinfo, &p_ci->m_op_list, op_entry) { + if (!opinfo->is_lease) + continue; + + if (opinfo->o_lease->state != SMB2_OPLOCK_LEVEL_NONE && + (!(lctx->flags & SMB2_LEASE_FLAG_PARENT_LEASE_KEY_SET_LE) || + !compare_guid_key(opinfo, fp->conn->ClientGUID, + lctx->parent_lease_key))) { + if (!atomic_inc_not_zero(&opinfo->refcount)) + continue; + + atomic_inc(&opinfo->conn->r_count); + if (ksmbd_conn_releasing(opinfo->conn)) { + atomic_dec(&opinfo->conn->r_count); + continue; + } + + read_unlock(&p_ci->m_lock); + oplock_break(opinfo, SMB2_OPLOCK_LEVEL_NONE); + opinfo_conn_put(opinfo); + read_lock(&p_ci->m_lock); + } + } + read_unlock(&p_ci->m_lock); + + ksmbd_inode_put(p_ci); +} + +void smb_lazy_parent_lease_break_close(struct ksmbd_file *fp) +{ + struct oplock_info *opinfo; + struct ksmbd_inode *p_ci = NULL; + + rcu_read_lock(); + opinfo = rcu_dereference(fp->f_opinfo); + rcu_read_unlock(); + + if (!opinfo->is_lease || opinfo->o_lease->version != 2) + return; + + p_ci = ksmbd_inode_lookup_lock(fp->filp->f_path.dentry->d_parent); + if (!p_ci) + return; + + read_lock(&p_ci->m_lock); + list_for_each_entry(opinfo, &p_ci->m_op_list, op_entry) { + if (!opinfo->is_lease) + continue; + + if (opinfo->o_lease->state != SMB2_OPLOCK_LEVEL_NONE) { + if (!atomic_inc_not_zero(&opinfo->refcount)) + continue; + + atomic_inc(&opinfo->conn->r_count); + if (ksmbd_conn_releasing(opinfo->conn)) { + atomic_dec(&opinfo->conn->r_count); + continue; + } + read_unlock(&p_ci->m_lock); + oplock_break(opinfo, SMB2_OPLOCK_LEVEL_NONE); + opinfo_conn_put(opinfo); + read_lock(&p_ci->m_lock); + } + } + read_unlock(&p_ci->m_lock); + + ksmbd_inode_put(p_ci); +} + /** * smb_grant_oplock() - handle oplock/lease request on file open * @work: smb work @@ -1114,10 +1191,6 @@ int smb_grant_oplock(struct ksmbd_work *work, int req_op_level, u64 pid, bool prev_op_has_lease; __le32 prev_op_state = 0; - /* not support directory lease */ - if (S_ISDIR(file_inode(fp->filp)->i_mode)) - return 0; - opinfo = alloc_opinfo(work, pid, tid); if (!opinfo) return -ENOMEM; @@ -1374,6 +1447,7 @@ void create_lease_buf(u8 *rbuf, struct lease *lease) memcpy(buf->lcontext.LeaseKey, lease->lease_key, SMB2_LEASE_KEY_SIZE); buf->lcontext.LeaseFlags = lease->flags; + buf->lcontext.Epoch = cpu_to_le16(++lease->epoch); buf->lcontext.LeaseState = lease->state; memcpy(buf->lcontext.ParentLeaseKey, lease->parent_lease_key, SMB2_LEASE_KEY_SIZE); @@ -1410,10 +1484,11 @@ void create_lease_buf(u8 *rbuf, struct lease *lease) /** * parse_lease_state() - parse lease context containted in file open request * @open_req: buffer containing smb2 file open(create) request + * @is_dir: whether leasing file is directory * * Return: oplock state, -ENOENT if create lease context not found */ -struct lease_ctx_info *parse_lease_state(void *open_req) +struct lease_ctx_info *parse_lease_state(void *open_req, bool is_dir) { struct create_context *cc; struct smb2_create_req *req = (struct smb2_create_req *)open_req; @@ -1431,8 +1506,14 @@ struct lease_ctx_info *parse_lease_state(void *open_req) struct create_lease_v2 *lc = (struct create_lease_v2 *)cc; memcpy(lreq->lease_key, lc->lcontext.LeaseKey, SMB2_LEASE_KEY_SIZE); - lreq->req_state = lc->lcontext.LeaseState; + if (is_dir) { + lreq->req_state = lc->lcontext.LeaseState & + ~SMB2_LEASE_WRITE_CACHING_LE; + lreq->is_dir = true; + } else + lreq->req_state = lc->lcontext.LeaseState; lreq->flags = lc->lcontext.LeaseFlags; + lreq->epoch = lc->lcontext.Epoch; lreq->duration = lc->lcontext.LeaseDuration; memcpy(lreq->parent_lease_key, lc->lcontext.ParentLeaseKey, SMB2_LEASE_KEY_SIZE); @@ -1492,7 +1573,7 @@ struct create_context *smb2_find_context_vals(void *open_req, const char *tag, i name_len < 4 || name_off + name_len > cc_len || (value_off & 0x7) != 0 || - (value_off && (value_off < name_off + name_len)) || + (value_len && value_off < name_off + (name_len < 8 ? 8 : name_len)) || ((u64)value_off + value_len > cc_len)) return ERR_PTR(-EINVAL); diff --git a/fs/smb/server/oplock.h b/fs/smb/server/oplock.h index 4b0fe6da7694..5b93ea9196c0 100644 --- a/fs/smb/server/oplock.h +++ b/fs/smb/server/oplock.h @@ -34,7 +34,9 @@ struct lease_ctx_info { __le32 flags; __le64 duration; __u8 parent_lease_key[SMB2_LEASE_KEY_SIZE]; + __le16 epoch; int version; + bool is_dir; }; struct lease_table { @@ -53,6 +55,7 @@ struct lease { __u8 parent_lease_key[SMB2_LEASE_KEY_SIZE]; int version; unsigned short epoch; + bool is_dir; struct lease_table *l_lb; }; @@ -108,7 +111,7 @@ void opinfo_put(struct oplock_info *opinfo); /* Lease related functions */ void create_lease_buf(u8 *rbuf, struct lease *lease); -struct lease_ctx_info *parse_lease_state(void *open_req); +struct lease_ctx_info *parse_lease_state(void *open_req, bool is_dir); __u8 smb2_map_lease_to_oplock(__le32 lease_state); int lease_read_to_write(struct oplock_info *opinfo); @@ -124,4 +127,7 @@ struct oplock_info *lookup_lease_in_table(struct ksmbd_conn *conn, int find_same_lease_key(struct ksmbd_session *sess, struct ksmbd_inode *ci, struct lease_ctx_info *lctx); void destroy_lease_table(struct ksmbd_conn *conn); +void smb_send_parent_lease_break_noti(struct ksmbd_file *fp, + struct lease_ctx_info *lctx); +void smb_lazy_parent_lease_break_close(struct ksmbd_file *fp); #endif /* __KSMBD_OPLOCK_H */ diff --git a/fs/smb/server/server.c b/fs/smb/server/server.c index 9df121bdf349..3079e607c5fe 100644 --- a/fs/smb/server/server.c +++ b/fs/smb/server/server.c @@ -115,8 +115,10 @@ static int __process_request(struct ksmbd_work *work, struct ksmbd_conn *conn, if (check_conn_state(work)) return SERVER_HANDLER_CONTINUE; - if (ksmbd_verify_smb_message(work)) + if (ksmbd_verify_smb_message(work)) { + conn->ops->set_rsp_status(work, STATUS_INVALID_PARAMETER); return SERVER_HANDLER_ABORT; + } command = conn->ops->get_cmd_val(work); *cmd = command; @@ -163,6 +165,7 @@ static void __handle_ksmbd_work(struct ksmbd_work *work, { u16 command = 0; int rc; + bool is_chained = false; if (conn->ops->allocate_rsp_buf(work)) return; @@ -229,16 +232,17 @@ static void __handle_ksmbd_work(struct ksmbd_work *work, } } + is_chained = is_chained_smb2_message(work); + if (work->sess && (work->sess->sign || smb3_11_final_sess_setup_resp(work) || conn->ops->is_sign_req(work, command))) conn->ops->set_sign_rsp(work); - } while (is_chained_smb2_message(work)); - - if (work->send_no_response) - return; + } while (is_chained == true); send: + if (work->tcon) + ksmbd_tree_connect_put(work->tcon); smb3_preauth_hash_rsp(work); if (work->sess && work->sess->enc && work->encrypted && conn->ops->encrypt_resp) { @@ -590,8 +594,6 @@ static int __init ksmbd_server_init(void) if (ret) goto err_crypto_destroy; - pr_warn_once("The ksmbd server is experimental\n"); - return 0; err_crypto_destroy: diff --git a/fs/smb/server/smb2misc.c b/fs/smb/server/smb2misc.c index e881df1d10cb..23bd3d1209df 100644 --- a/fs/smb/server/smb2misc.c +++ b/fs/smb/server/smb2misc.c @@ -440,10 +440,8 @@ int ksmbd_smb2_check_message(struct ksmbd_work *work) validate_credit: if ((work->conn->vals->capabilities & SMB2_GLOBAL_CAP_LARGE_MTU) && - smb2_validate_credit_charge(work->conn, hdr)) { - work->conn->ops->set_rsp_status(work, STATUS_INVALID_PARAMETER); + smb2_validate_credit_charge(work->conn, hdr)) return 1; - } return 0; } diff --git a/fs/smb/server/smb2ops.c b/fs/smb/server/smb2ops.c index aed7704a0672..27a9dce3e03a 100644 --- a/fs/smb/server/smb2ops.c +++ b/fs/smb/server/smb2ops.c @@ -221,7 +221,8 @@ void init_smb3_0_server(struct ksmbd_conn *conn) conn->signing_algorithm = SIGNING_ALG_AES_CMAC_LE; if (server_conf.flags & KSMBD_GLOBAL_FLAG_SMB2_LEASES) - conn->vals->capabilities |= SMB2_GLOBAL_CAP_LEASING; + conn->vals->capabilities |= SMB2_GLOBAL_CAP_LEASING | + SMB2_GLOBAL_CAP_DIRECTORY_LEASING; if (server_conf.flags & KSMBD_GLOBAL_FLAG_SMB2_ENCRYPTION && conn->cli_cap & SMB2_GLOBAL_CAP_ENCRYPTION) @@ -245,7 +246,8 @@ void init_smb3_02_server(struct ksmbd_conn *conn) conn->signing_algorithm = SIGNING_ALG_AES_CMAC_LE; if (server_conf.flags & KSMBD_GLOBAL_FLAG_SMB2_LEASES) - conn->vals->capabilities |= SMB2_GLOBAL_CAP_LEASING; + conn->vals->capabilities |= SMB2_GLOBAL_CAP_LEASING | + SMB2_GLOBAL_CAP_DIRECTORY_LEASING; if (server_conf.flags & KSMBD_GLOBAL_FLAG_SMB2_ENCRYPTION || (!(server_conf.flags & KSMBD_GLOBAL_FLAG_SMB2_ENCRYPTION_OFF) && @@ -270,7 +272,8 @@ int init_smb3_11_server(struct ksmbd_conn *conn) conn->signing_algorithm = SIGNING_ALG_AES_CMAC_LE; if (server_conf.flags & KSMBD_GLOBAL_FLAG_SMB2_LEASES) - conn->vals->capabilities |= SMB2_GLOBAL_CAP_LEASING; + conn->vals->capabilities |= SMB2_GLOBAL_CAP_LEASING | + SMB2_GLOBAL_CAP_DIRECTORY_LEASING; if (server_conf.flags & KSMBD_GLOBAL_FLAG_SMB2_ENCRYPTION || (!(server_conf.flags & KSMBD_GLOBAL_FLAG_SMB2_ENCRYPTION_OFF) && diff --git a/fs/smb/server/smb2pdu.c b/fs/smb/server/smb2pdu.c index 7cc1b0c47d0a..652ab429bf2e 100644 --- a/fs/smb/server/smb2pdu.c +++ b/fs/smb/server/smb2pdu.c @@ -145,12 +145,18 @@ void smb2_set_err_rsp(struct ksmbd_work *work) err_rsp = smb2_get_msg(work->response_buf); if (err_rsp->hdr.Status != STATUS_STOPPED_ON_SYMLINK) { + int err; + err_rsp->StructureSize = SMB2_ERROR_STRUCTURE_SIZE2_LE; err_rsp->ErrorContextCount = 0; err_rsp->Reserved = 0; err_rsp->ByteCount = 0; err_rsp->ErrorData[0] = 0; - inc_rfc1001_len(work->response_buf, SMB2_ERROR_STRUCTURE_SIZE2); + err = ksmbd_iov_pin_rsp(work, (void *)err_rsp, + __SMB2_HEADER_STRUCTURE_SIZE + + SMB2_ERROR_STRUCTURE_SIZE2); + if (err) + work->send_no_response = 1; } } @@ -225,11 +231,12 @@ void set_smb2_rsp_status(struct ksmbd_work *work, __le32 err) { struct smb2_hdr *rsp_hdr; - if (work->next_smb2_rcv_hdr_off) - rsp_hdr = ksmbd_resp_buf_next(work); - else - rsp_hdr = smb2_get_msg(work->response_buf); + rsp_hdr = smb2_get_msg(work->response_buf); rsp_hdr->Status = err; + + work->iov_idx = 0; + work->iov_cnt = 0; + work->next_smb2_rcv_hdr_off = 0; smb2_set_err_rsp(work); } @@ -245,9 +252,7 @@ int init_smb2_neg_rsp(struct ksmbd_work *work) struct smb2_hdr *rsp_hdr; struct smb2_negotiate_rsp *rsp; struct ksmbd_conn *conn = work->conn; - - *(__be32 *)work->response_buf = - cpu_to_be32(conn->vals->header_size); + int err; rsp_hdr = smb2_get_msg(work->response_buf); memset(rsp_hdr, 0, sizeof(struct smb2_hdr) + 2); @@ -286,12 +291,13 @@ int init_smb2_neg_rsp(struct ksmbd_work *work) rsp->SecurityBufferLength = cpu_to_le16(AUTH_GSS_LENGTH); ksmbd_copy_gss_neg_header((char *)(&rsp->hdr) + le16_to_cpu(rsp->SecurityBufferOffset)); - inc_rfc1001_len(work->response_buf, - sizeof(struct smb2_negotiate_rsp) - - sizeof(struct smb2_hdr) + AUTH_GSS_LENGTH); rsp->SecurityMode = SMB2_NEGOTIATE_SIGNING_ENABLED_LE; if (server_conf.signing == KSMBD_CONFIG_OPT_MANDATORY) rsp->SecurityMode |= SMB2_NEGOTIATE_SIGNING_REQUIRED_LE; + err = ksmbd_iov_pin_rsp(work, rsp, + sizeof(struct smb2_negotiate_rsp) + AUTH_GSS_LENGTH); + if (err) + return err; conn->use_spnego = true; ksmbd_conn_set_need_negotiate(conn); @@ -390,11 +396,12 @@ static void init_chained_smb2_rsp(struct ksmbd_work *work) next_hdr_offset = le32_to_cpu(req->NextCommand); new_len = ALIGN(len, 8); - inc_rfc1001_len(work->response_buf, - sizeof(struct smb2_hdr) + new_len - len); + work->iov[work->iov_idx].iov_len += (new_len - len); + inc_rfc1001_len(work->response_buf, new_len - len); rsp->NextCommand = cpu_to_le32(new_len); work->next_smb2_rcv_hdr_off += next_hdr_offset; + work->curr_smb2_rsp_hdr_off = work->next_smb2_rsp_hdr_off; work->next_smb2_rsp_hdr_off += new_len; ksmbd_debug(SMB, "Compound req new_len = %d rcv off = %d rsp off = %d\n", @@ -470,10 +477,10 @@ bool is_chained_smb2_message(struct ksmbd_work *work) len = len - get_rfc1002_len(work->response_buf); if (len) { ksmbd_debug(SMB, "padding len %u\n", len); + work->iov[work->iov_idx].iov_len += len; inc_rfc1001_len(work->response_buf, len); - if (work->aux_payload_sz) - work->aux_payload_sz += len; } + work->curr_smb2_rsp_hdr_off = work->next_smb2_rsp_hdr_off; } return false; } @@ -488,11 +495,8 @@ int init_smb2_rsp_hdr(struct ksmbd_work *work) { struct smb2_hdr *rsp_hdr = smb2_get_msg(work->response_buf); struct smb2_hdr *rcv_hdr = smb2_get_msg(work->request_buf); - struct ksmbd_conn *conn = work->conn; memset(rsp_hdr, 0, sizeof(struct smb2_hdr) + 2); - *(__be32 *)work->response_buf = - cpu_to_be32(conn->vals->header_size); rsp_hdr->ProtocolId = rcv_hdr->ProtocolId; rsp_hdr->StructureSize = SMB2_HEADER_STRUCTURE_SIZE; rsp_hdr->Command = rcv_hdr->Command; @@ -653,13 +657,9 @@ smb2_get_name(const char *src, const int maxlen, struct nls_table *local_nls) int setup_async_work(struct ksmbd_work *work, void (*fn)(void **), void **arg) { - struct smb2_hdr *rsp_hdr; struct ksmbd_conn *conn = work->conn; int id; - rsp_hdr = smb2_get_msg(work->response_buf); - rsp_hdr->Flags |= SMB2_FLAGS_ASYNC_COMMAND; - id = ksmbd_acquire_async_msg_id(&conn->async_ida); if (id < 0) { pr_err("Failed to alloc async message id\n"); @@ -667,7 +667,6 @@ int setup_async_work(struct ksmbd_work *work, void (*fn)(void **), void **arg) } work->asynchronous = true; work->async_id = id; - rsp_hdr->Id.AsyncId = cpu_to_le64(id); ksmbd_debug(SMB, "Send interim Response to inform async request id : %d\n", @@ -706,15 +705,26 @@ void release_async_work(struct ksmbd_work *work) void smb2_send_interim_resp(struct ksmbd_work *work, __le32 status) { struct smb2_hdr *rsp_hdr; + struct ksmbd_work *in_work = ksmbd_alloc_work_struct(); - rsp_hdr = smb2_get_msg(work->response_buf); - smb2_set_err_rsp(work); + if (allocate_interim_rsp_buf(in_work)) { + pr_err("smb_allocate_rsp_buf failed!\n"); + ksmbd_free_work_struct(in_work); + return; + } + + in_work->conn = work->conn; + memcpy(smb2_get_msg(in_work->response_buf), ksmbd_resp_buf_next(work), + __SMB2_HEADER_STRUCTURE_SIZE); + + rsp_hdr = smb2_get_msg(in_work->response_buf); + rsp_hdr->Flags |= SMB2_FLAGS_ASYNC_COMMAND; + rsp_hdr->Id.AsyncId = cpu_to_le64(work->async_id); + smb2_set_err_rsp(in_work); rsp_hdr->Status = status; - work->multiRsp = 1; - ksmbd_conn_write(work); - rsp_hdr->Status = 0; - work->multiRsp = 0; + ksmbd_conn_write(in_work); + ksmbd_free_work_struct(in_work); } static __le32 smb2_get_reparse_tag_special_file(umode_t mode) @@ -821,9 +831,8 @@ static void build_posix_ctxt(struct smb2_posix_neg_context *pneg_ctxt) pneg_ctxt->Name[15] = 0x7C; } -static void assemble_neg_contexts(struct ksmbd_conn *conn, - struct smb2_negotiate_rsp *rsp, - void *smb2_buf_len) +static unsigned int assemble_neg_contexts(struct ksmbd_conn *conn, + struct smb2_negotiate_rsp *rsp) { char * const pneg_ctxt = (char *)rsp + le32_to_cpu(rsp->NegotiateContextOffset); @@ -834,7 +843,6 @@ static void assemble_neg_contexts(struct ksmbd_conn *conn, "assemble SMB2_PREAUTH_INTEGRITY_CAPABILITIES context\n"); build_preauth_ctxt((struct smb2_preauth_neg_context *)pneg_ctxt, conn->preauth_info->Preauth_HashId); - inc_rfc1001_len(smb2_buf_len, AUTH_GSS_PADDING); ctxt_size = sizeof(struct smb2_preauth_neg_context); if (conn->cipher_type) { @@ -874,7 +882,7 @@ static void assemble_neg_contexts(struct ksmbd_conn *conn, } rsp->NegotiateContextCount = cpu_to_le16(neg_ctxt_cnt); - inc_rfc1001_len(smb2_buf_len, ctxt_size); + return ctxt_size + AUTH_GSS_PADDING; } static __le32 decode_preauth_ctxt(struct ksmbd_conn *conn, @@ -1090,7 +1098,7 @@ int smb2_handle_negotiate(struct ksmbd_work *work) struct smb2_negotiate_req *req = smb2_get_msg(work->request_buf); struct smb2_negotiate_rsp *rsp = smb2_get_msg(work->response_buf); int rc = 0; - unsigned int smb2_buf_len, smb2_neg_size; + unsigned int smb2_buf_len, smb2_neg_size, neg_ctxt_len = 0; __le32 status; ksmbd_debug(SMB, "Received negotiate request\n"); @@ -1183,7 +1191,7 @@ int smb2_handle_negotiate(struct ksmbd_work *work) conn->preauth_info->Preauth_HashValue); rsp->NegotiateContextOffset = cpu_to_le32(OFFSET_OF_NEG_CONTEXT); - assemble_neg_contexts(conn, rsp, work->response_buf); + neg_ctxt_len = assemble_neg_contexts(conn, rsp); break; case SMB302_PROT_ID: init_smb3_02_server(conn); @@ -1233,8 +1241,7 @@ int smb2_handle_negotiate(struct ksmbd_work *work) rsp->SecurityBufferLength = cpu_to_le16(AUTH_GSS_LENGTH); ksmbd_copy_gss_neg_header((char *)(&rsp->hdr) + le16_to_cpu(rsp->SecurityBufferOffset)); - inc_rfc1001_len(work->response_buf, sizeof(struct smb2_negotiate_rsp) - - sizeof(struct smb2_hdr) + AUTH_GSS_LENGTH); + rsp->SecurityMode = SMB2_NEGOTIATE_SIGNING_ENABLED_LE; conn->use_spnego = true; @@ -1252,9 +1259,15 @@ int smb2_handle_negotiate(struct ksmbd_work *work) ksmbd_conn_set_need_negotiate(conn); err_out: + if (rc) + rsp->hdr.Status = STATUS_INSUFFICIENT_RESOURCES; + + if (!rc) + rc = ksmbd_iov_pin_rsp(work, rsp, + sizeof(struct smb2_negotiate_rsp) + + AUTH_GSS_LENGTH + neg_ctxt_len); if (rc < 0) smb2_set_err_rsp(work); - return rc; } @@ -1454,7 +1467,6 @@ static int ntlm_authenticate(struct ksmbd_work *work, memcpy((char *)&rsp->hdr.ProtocolId + sz, spnego_blob, spnego_blob_len); rsp->SecurityBufferLength = cpu_to_le16(spnego_blob_len); kfree(spnego_blob); - inc_rfc1001_len(work->response_buf, spnego_blob_len - 1); } user = session_user(conn, req); @@ -1600,7 +1612,6 @@ static int krb5_authenticate(struct ksmbd_work *work, return -EINVAL; } rsp->SecurityBufferLength = cpu_to_le16(out_len); - inc_rfc1001_len(work->response_buf, out_len - 1); if ((conn->sign || server_conf.enforced_signing) || (req->SecurityMode & SMB2_NEGOTIATE_SIGNING_REQUIRED)) @@ -1672,7 +1683,6 @@ int smb2_sess_setup(struct ksmbd_work *work) rsp->SessionFlags = 0; rsp->SecurityBufferOffset = cpu_to_le16(72); rsp->SecurityBufferLength = 0; - inc_rfc1001_len(work->response_buf, 9); ksmbd_conn_lock(conn); if (!req->hdr.SessionId) { @@ -1808,13 +1818,6 @@ int smb2_sess_setup(struct ksmbd_work *work) goto out_err; rsp->hdr.Status = STATUS_MORE_PROCESSING_REQUIRED; - /* - * Note: here total size -1 is done as an - * adjustment for 0 size blob - */ - inc_rfc1001_len(work->response_buf, - le16_to_cpu(rsp->SecurityBufferLength) - 1); - } else if (negblob->MessageType == NtLmAuthenticate) { rc = ntlm_authenticate(work, req, rsp); if (rc) @@ -1899,6 +1902,18 @@ out_err: ksmbd_conn_set_need_negotiate(conn); } } + smb2_set_err_rsp(work); + } else { + unsigned int iov_len; + + if (rsp->SecurityBufferLength) + iov_len = offsetof(struct smb2_sess_setup_rsp, Buffer) + + le16_to_cpu(rsp->SecurityBufferLength); + else + iov_len = sizeof(struct smb2_sess_setup_rsp); + rc = ksmbd_iov_pin_rsp(work, rsp, iov_len); + if (rc) + rsp->hdr.Status = STATUS_INSUFFICIENT_RESOURCES; } ksmbd_conn_unlock(conn); @@ -1976,14 +1991,20 @@ int smb2_tree_connect(struct ksmbd_work *work) if (conn->posix_ext_supported) status.tree_conn->posix_extensions = true; + write_lock(&sess->tree_conns_lock); + status.tree_conn->t_state = TREE_CONNECTED; + write_unlock(&sess->tree_conns_lock); rsp->StructureSize = cpu_to_le16(16); - inc_rfc1001_len(work->response_buf, 16); out_err1: rsp->Capabilities = 0; rsp->Reserved = 0; /* default manual caching */ rsp->ShareFlags = SMB2_SHAREFLAG_MANUAL_CACHING; + rc = ksmbd_iov_pin_rsp(work, rsp, sizeof(struct smb2_tree_connect_rsp)); + if (rc) + status.ret = KSMBD_TREE_CONN_STATUS_NOMEM; + if (!IS_ERR(treename)) kfree(treename); if (!IS_ERR(name)) @@ -2096,26 +2117,56 @@ int smb2_tree_disconnect(struct ksmbd_work *work) struct smb2_tree_disconnect_req *req; struct ksmbd_session *sess = work->sess; struct ksmbd_tree_connect *tcon = work->tcon; + int err; WORK_BUFFERS(work, req, rsp); - rsp->StructureSize = cpu_to_le16(4); - inc_rfc1001_len(work->response_buf, 4); - ksmbd_debug(SMB, "request\n"); - if (!tcon || test_and_set_bit(TREE_CONN_EXPIRE, &tcon->status)) { + if (!tcon) { ksmbd_debug(SMB, "Invalid tid %d\n", req->hdr.Id.SyncId.TreeId); rsp->hdr.Status = STATUS_NETWORK_NAME_DELETED; - smb2_set_err_rsp(work); - return 0; + err = -ENOENT; + goto err_out; } ksmbd_close_tree_conn_fds(work); - ksmbd_tree_conn_disconnect(sess, tcon); + + write_lock(&sess->tree_conns_lock); + if (tcon->t_state == TREE_DISCONNECTED) { + write_unlock(&sess->tree_conns_lock); + rsp->hdr.Status = STATUS_NETWORK_NAME_DELETED; + err = -ENOENT; + goto err_out; + } + + WARN_ON_ONCE(atomic_dec_and_test(&tcon->refcount)); + tcon->t_state = TREE_DISCONNECTED; + write_unlock(&sess->tree_conns_lock); + + err = ksmbd_tree_conn_disconnect(sess, tcon); + if (err) { + rsp->hdr.Status = STATUS_NETWORK_NAME_DELETED; + goto err_out; + } + work->tcon = NULL; + + rsp->StructureSize = cpu_to_le16(4); + err = ksmbd_iov_pin_rsp(work, rsp, + sizeof(struct smb2_tree_disconnect_rsp)); + if (err) { + rsp->hdr.Status = STATUS_INSUFFICIENT_RESOURCES; + goto err_out; + } + return 0; + +err_out: + smb2_set_err_rsp(work); + return err; + } /** @@ -2131,17 +2182,23 @@ int smb2_session_logoff(struct ksmbd_work *work) struct smb2_logoff_rsp *rsp; struct ksmbd_session *sess; u64 sess_id; + int err; WORK_BUFFERS(work, req, rsp); - sess_id = le64_to_cpu(req->hdr.SessionId); - - rsp->StructureSize = cpu_to_le16(4); - inc_rfc1001_len(work->response_buf, 4); - ksmbd_debug(SMB, "request\n"); + ksmbd_conn_lock(conn); + if (!ksmbd_conn_good(conn)) { + ksmbd_conn_unlock(conn); + rsp->hdr.Status = STATUS_NETWORK_NAME_DELETED; + smb2_set_err_rsp(work); + return -ENOENT; + } + sess_id = le64_to_cpu(req->hdr.SessionId); ksmbd_all_conn_set_status(sess_id, KSMBD_SESS_NEED_RECONNECT); + ksmbd_conn_unlock(conn); + ksmbd_close_session_fds(work); ksmbd_conn_wait_idle(conn, sess_id); @@ -2154,7 +2211,7 @@ int smb2_session_logoff(struct ksmbd_work *work) ksmbd_debug(SMB, "Invalid tid %d\n", req->hdr.Id.SyncId.TreeId); rsp->hdr.Status = STATUS_NETWORK_NAME_DELETED; smb2_set_err_rsp(work); - return 0; + return -ENOENT; } ksmbd_destroy_file_table(&sess->file_table); @@ -2163,6 +2220,14 @@ int smb2_session_logoff(struct ksmbd_work *work) ksmbd_free_user(sess->user); sess->user = NULL; ksmbd_all_conn_set_status(sess_id, KSMBD_SESS_NEED_NEGOTIATE); + + rsp->StructureSize = cpu_to_le16(4); + err = ksmbd_iov_pin_rsp(work, rsp, sizeof(struct smb2_logoff_rsp)); + if (err) { + rsp->hdr.Status = STATUS_INSUFFICIENT_RESOURCES; + smb2_set_err_rsp(work); + return err; + } return 0; } @@ -2215,7 +2280,10 @@ static noinline int create_smb2_pipe(struct ksmbd_work *work) rsp->CreateContextsOffset = 0; rsp->CreateContextsLength = 0; - inc_rfc1001_len(work->response_buf, 88); /* StructureSize - 1*/ + err = ksmbd_iov_pin_rsp(work, rsp, offsetof(struct smb2_create_rsp, Buffer)); + if (err) + goto out; + kfree(name); return 0; @@ -2309,7 +2377,8 @@ static int smb2_set_ea(struct smb2_ea_info *eabuf, unsigned int buf_len, rc = 0; } else { rc = ksmbd_vfs_setxattr(idmap, path, attr_name, value, - le16_to_cpu(eabuf->EaValueLength), 0); + le16_to_cpu(eabuf->EaValueLength), + 0, true); if (rc < 0) { ksmbd_debug(SMB, "ksmbd_vfs_setxattr is failed(%d)\n", @@ -2372,7 +2441,7 @@ static noinline int smb2_set_stream_name_xattr(const struct path *path, return -EBADF; } - rc = ksmbd_vfs_setxattr(idmap, path, xattr_stream_name, NULL, 0, 0); + rc = ksmbd_vfs_setxattr(idmap, path, xattr_stream_name, NULL, 0, 0, false); if (rc < 0) pr_err("Failed to store XATTR stream name :%d\n", rc); return 0; @@ -2447,7 +2516,7 @@ static void smb2_new_xattrs(struct ksmbd_tree_connect *tcon, const struct path * da.flags = XATTR_DOSINFO_ATTRIB | XATTR_DOSINFO_CREATE_TIME | XATTR_DOSINFO_ITIME; - rc = ksmbd_vfs_set_dos_attrib_xattr(mnt_idmap(path->mnt), path, &da); + rc = ksmbd_vfs_set_dos_attrib_xattr(mnt_idmap(path->mnt), path, &da, true); if (rc) ksmbd_debug(SMB, "failed to store file attribute into xattr\n"); } @@ -2537,7 +2606,7 @@ static int smb2_create_sd_buffer(struct ksmbd_work *work, sizeof(struct create_sd_buf_req)) return -EINVAL; return set_info_sec(work->conn, work->tcon, path, &sd_buf->ntsd, - le32_to_cpu(sd_buf->ccontext.DataLength), true); + le32_to_cpu(sd_buf->ccontext.DataLength), true, false); } static void ksmbd_acls_fattr(struct smb_fattr *fattr, @@ -2597,6 +2666,7 @@ int smb2_open(struct ksmbd_work *work) u64 time; umode_t posix_mode = 0; __le32 daccess, maximal_access = 0; + int iov_len = 0; WORK_BUFFERS(work, req, rsp); @@ -2618,7 +2688,7 @@ int smb2_open(struct ksmbd_work *work) *(char *)req->Buffer == '\\') { pr_err("not allow directory name included leading slash\n"); rc = -EINVAL; - goto err_out1; + goto err_out2; } name = smb2_get_name(req->Buffer, @@ -2629,7 +2699,7 @@ int smb2_open(struct ksmbd_work *work) if (rc != -ENOMEM) rc = -ENOENT; name = NULL; - goto err_out1; + goto err_out2; } ksmbd_debug(SMB, "converted name = %s\n", name); @@ -2637,48 +2707,44 @@ int smb2_open(struct ksmbd_work *work) if (!test_share_config_flag(work->tcon->share_conf, KSMBD_SHARE_FLAG_STREAMS)) { rc = -EBADF; - goto err_out1; + goto err_out2; } rc = parse_stream_name(name, &stream_name, &s_type); if (rc < 0) - goto err_out1; + goto err_out2; } rc = ksmbd_validate_filename(name); if (rc < 0) - goto err_out1; + goto err_out2; if (ksmbd_share_veto_filename(share, name)) { rc = -ENOENT; ksmbd_debug(SMB, "Reject open(), vetoed file: %s\n", name); - goto err_out1; + goto err_out2; } } else { name = kstrdup("", GFP_KERNEL); if (!name) { rc = -ENOMEM; - goto err_out1; + goto err_out2; } } - req_op_level = req->RequestedOplockLevel; - if (req_op_level == SMB2_OPLOCK_LEVEL_LEASE) - lc = parse_lease_state(req); - if (le32_to_cpu(req->ImpersonationLevel) > le32_to_cpu(IL_DELEGATE)) { pr_err("Invalid impersonationlevel : 0x%x\n", le32_to_cpu(req->ImpersonationLevel)); rc = -EIO; rsp->hdr.Status = STATUS_BAD_IMPERSONATION_LEVEL; - goto err_out1; + goto err_out2; } if (req->CreateOptions && !(req->CreateOptions & CREATE_OPTIONS_MASK_LE)) { pr_err("Invalid create options : 0x%x\n", le32_to_cpu(req->CreateOptions)); rc = -EINVAL; - goto err_out1; + goto err_out2; } else { if (req->CreateOptions & FILE_SEQUENTIAL_ONLY_LE && req->CreateOptions & FILE_RANDOM_ACCESS_LE) @@ -2688,13 +2754,13 @@ int smb2_open(struct ksmbd_work *work) (FILE_OPEN_BY_FILE_ID_LE | CREATE_TREE_CONNECTION | FILE_RESERVE_OPFILTER_LE)) { rc = -EOPNOTSUPP; - goto err_out1; + goto err_out2; } if (req->CreateOptions & FILE_DIRECTORY_FILE_LE) { if (req->CreateOptions & FILE_NON_DIRECTORY_FILE_LE) { rc = -EINVAL; - goto err_out1; + goto err_out2; } else if (req->CreateOptions & FILE_NO_COMPRESSION_LE) { req->CreateOptions = ~(FILE_NO_COMPRESSION_LE); } @@ -2706,21 +2772,21 @@ int smb2_open(struct ksmbd_work *work) pr_err("Invalid create disposition : 0x%x\n", le32_to_cpu(req->CreateDisposition)); rc = -EINVAL; - goto err_out1; + goto err_out2; } if (!(req->DesiredAccess & DESIRED_ACCESS_MASK)) { pr_err("Invalid desired access : 0x%x\n", le32_to_cpu(req->DesiredAccess)); rc = -EACCES; - goto err_out1; + goto err_out2; } if (req->FileAttributes && !(req->FileAttributes & FILE_ATTRIBUTE_MASK_LE)) { pr_err("Invalid file attribute : 0x%x\n", le32_to_cpu(req->FileAttributes)); rc = -EINVAL; - goto err_out1; + goto err_out2; } if (req->CreateContextsOffset) { @@ -2728,19 +2794,19 @@ int smb2_open(struct ksmbd_work *work) context = smb2_find_context_vals(req, SMB2_CREATE_EA_BUFFER, 4); if (IS_ERR(context)) { rc = PTR_ERR(context); - goto err_out1; + goto err_out2; } else if (context) { ea_buf = (struct create_ea_buf_req *)context; if (le16_to_cpu(context->DataOffset) + le32_to_cpu(context->DataLength) < sizeof(struct create_ea_buf_req)) { rc = -EINVAL; - goto err_out1; + goto err_out2; } if (req->CreateOptions & FILE_NO_EA_KNOWLEDGE_LE) { rsp->hdr.Status = STATUS_ACCESS_DENIED; rc = -EACCES; - goto err_out1; + goto err_out2; } } @@ -2748,7 +2814,7 @@ int smb2_open(struct ksmbd_work *work) SMB2_CREATE_QUERY_MAXIMAL_ACCESS_REQUEST, 4); if (IS_ERR(context)) { rc = PTR_ERR(context); - goto err_out1; + goto err_out2; } else if (context) { ksmbd_debug(SMB, "get query maximal access context\n"); @@ -2759,11 +2825,11 @@ int smb2_open(struct ksmbd_work *work) SMB2_CREATE_TIMEWARP_REQUEST, 4); if (IS_ERR(context)) { rc = PTR_ERR(context); - goto err_out1; + goto err_out2; } else if (context) { ksmbd_debug(SMB, "get timewarp context\n"); rc = -EBADF; - goto err_out1; + goto err_out2; } if (tcon->posix_extensions) { @@ -2771,7 +2837,7 @@ int smb2_open(struct ksmbd_work *work) SMB2_CREATE_TAG_POSIX, 16); if (IS_ERR(context)) { rc = PTR_ERR(context); - goto err_out1; + goto err_out2; } else if (context) { struct create_posix *posix = (struct create_posix *)context; @@ -2779,7 +2845,7 @@ int smb2_open(struct ksmbd_work *work) le32_to_cpu(context->DataLength) < sizeof(struct create_posix) - 4) { rc = -EINVAL; - goto err_out1; + goto err_out2; } ksmbd_debug(SMB, "get posix context\n"); @@ -2791,7 +2857,7 @@ int smb2_open(struct ksmbd_work *work) if (ksmbd_override_fsids(work)) { rc = -ENOMEM; - goto err_out1; + goto err_out2; } rc = ksmbd_vfs_kern_path_locked(work, name, LOOKUP_NO_SYMLINKS, @@ -2966,7 +3032,7 @@ int smb2_open(struct ksmbd_work *work) } } - rc = ksmbd_query_inode_status(d_inode(path.dentry->d_parent)); + rc = ksmbd_query_inode_status(path.dentry->d_parent); if (rc == KSMBD_INODE_STATUS_PENDING_DELETE) { rc = -EBUSY; goto err_out; @@ -3080,7 +3146,8 @@ int smb2_open(struct ksmbd_work *work) idmap, &path, pntsd, - pntsd_size); + pntsd_size, + false); kfree(pntsd); if (rc) pr_err("failed to store ntacl in xattr : %d\n", @@ -3103,11 +3170,6 @@ int smb2_open(struct ksmbd_work *work) fp->attrib_only = !(req->DesiredAccess & ~(FILE_READ_ATTRIBUTES_LE | FILE_WRITE_ATTRIBUTES_LE | FILE_SYNCHRONIZE_LE)); - if (!S_ISDIR(file_inode(filp)->i_mode) && open_flags & O_TRUNC && - !fp->attrib_only && !stream_name) { - smb_break_all_oplock(work, fp); - need_truncate = 1; - } /* fp should be searchable through ksmbd_inode.m_fp_list * after daccess, saccess, attrib_only, and stream are @@ -3123,23 +3185,43 @@ int smb2_open(struct ksmbd_work *work) goto err_out; } + if (file_present || created) + ksmbd_vfs_kern_path_unlock(&parent_path, &path); + + if (!S_ISDIR(file_inode(filp)->i_mode) && open_flags & O_TRUNC && + !fp->attrib_only && !stream_name) { + smb_break_all_oplock(work, fp); + need_truncate = 1; + } + + req_op_level = req->RequestedOplockLevel; + if (req_op_level == SMB2_OPLOCK_LEVEL_LEASE) + lc = parse_lease_state(req, S_ISDIR(file_inode(filp)->i_mode)); + share_ret = ksmbd_smb_check_shared_mode(fp->filp, fp); if (!test_share_config_flag(work->tcon->share_conf, KSMBD_SHARE_FLAG_OPLOCKS) || (req_op_level == SMB2_OPLOCK_LEVEL_LEASE && !(conn->vals->capabilities & SMB2_GLOBAL_CAP_LEASING))) { if (share_ret < 0 && !S_ISDIR(file_inode(fp->filp)->i_mode)) { rc = share_ret; - goto err_out; + goto err_out1; } } else { if (req_op_level == SMB2_OPLOCK_LEVEL_LEASE) { + /* + * Compare parent lease using parent key. If there is no + * a lease that has same parent key, Send lease break + * notification. + */ + smb_send_parent_lease_break_noti(fp, lc); + req_op_level = smb2_map_lease_to_oplock(lc->req_state); ksmbd_debug(SMB, "lease req for(%s) req oplock state 0x%x, lease state 0x%x\n", name, req_op_level, lc->req_state); rc = find_same_lease_key(sess, fp->f_ci, lc); if (rc) - goto err_out; + goto err_out1; } else if (open_flags == O_RDONLY && (req_op_level == SMB2_OPLOCK_LEVEL_BATCH || req_op_level == SMB2_OPLOCK_LEVEL_EXCLUSIVE)) @@ -3150,16 +3232,16 @@ int smb2_open(struct ksmbd_work *work) le32_to_cpu(req->hdr.Id.SyncId.TreeId), lc, share_ret); if (rc < 0) - goto err_out; + goto err_out1; } if (req->CreateOptions & FILE_DELETE_ON_CLOSE_LE) ksmbd_fd_set_delete_on_close(fp, file_info); if (need_truncate) { - rc = smb2_create_truncate(&path); + rc = smb2_create_truncate(&fp->filp->f_path); if (rc) - goto err_out; + goto err_out1; } if (req->CreateContextsOffset) { @@ -3169,7 +3251,7 @@ int smb2_open(struct ksmbd_work *work) SMB2_CREATE_ALLOCATION_SIZE, 4); if (IS_ERR(az_req)) { rc = PTR_ERR(az_req); - goto err_out; + goto err_out1; } else if (az_req) { loff_t alloc_size; int err; @@ -3178,7 +3260,7 @@ int smb2_open(struct ksmbd_work *work) le32_to_cpu(az_req->ccontext.DataLength) < sizeof(struct create_alloc_size_req)) { rc = -EINVAL; - goto err_out; + goto err_out1; } alloc_size = le64_to_cpu(az_req->AllocationSize); ksmbd_debug(SMB, @@ -3196,7 +3278,7 @@ int smb2_open(struct ksmbd_work *work) context = smb2_find_context_vals(req, SMB2_CREATE_QUERY_ON_DISK_ID, 4); if (IS_ERR(context)) { rc = PTR_ERR(context); - goto err_out; + goto err_out1; } else if (context) { ksmbd_debug(SMB, "get query on disk id context\n"); query_disk_id = 1; @@ -3205,7 +3287,7 @@ int smb2_open(struct ksmbd_work *work) rc = ksmbd_vfs_getattr(&path, &stat); if (rc) - goto err_out; + goto err_out1; if (stat.result_mask & STATX_BTIME) fp->create_time = ksmbd_UnixTimeToNT(stat.btime); @@ -3248,7 +3330,7 @@ int smb2_open(struct ksmbd_work *work) rsp->CreateContextsOffset = 0; rsp->CreateContextsLength = 0; - inc_rfc1001_len(work->response_buf, 88); /* StructureSize - 1*/ + iov_len = offsetof(struct smb2_create_rsp, Buffer); /* If lease is request send lease context response */ if (opinfo && opinfo->is_lease) { @@ -3263,8 +3345,7 @@ int smb2_open(struct ksmbd_work *work) create_lease_buf(rsp->Buffer, opinfo->o_lease); le32_add_cpu(&rsp->CreateContextsLength, conn->vals->create_lease_size); - inc_rfc1001_len(work->response_buf, - conn->vals->create_lease_size); + iov_len += conn->vals->create_lease_size; next_ptr = &lease_ccontext->Next; next_off = conn->vals->create_lease_size; } @@ -3284,8 +3365,7 @@ int smb2_open(struct ksmbd_work *work) le32_to_cpu(maximal_access)); le32_add_cpu(&rsp->CreateContextsLength, conn->vals->create_mxac_size); - inc_rfc1001_len(work->response_buf, - conn->vals->create_mxac_size); + iov_len += conn->vals->create_mxac_size; if (next_ptr) *next_ptr = cpu_to_le32(next_off); next_ptr = &mxac_ccontext->Next; @@ -3303,8 +3383,7 @@ int smb2_open(struct ksmbd_work *work) stat.ino, tcon->id); le32_add_cpu(&rsp->CreateContextsLength, conn->vals->create_disk_id_size); - inc_rfc1001_len(work->response_buf, - conn->vals->create_disk_id_size); + iov_len += conn->vals->create_disk_id_size; if (next_ptr) *next_ptr = cpu_to_le32(next_off); next_ptr = &disk_id_ccontext->Next; @@ -3318,8 +3397,7 @@ int smb2_open(struct ksmbd_work *work) fp); le32_add_cpu(&rsp->CreateContextsLength, conn->vals->create_posix_size); - inc_rfc1001_len(work->response_buf, - conn->vals->create_posix_size); + iov_len += conn->vals->create_posix_size; if (next_ptr) *next_ptr = cpu_to_le32(next_off); } @@ -3330,14 +3408,17 @@ int smb2_open(struct ksmbd_work *work) } err_out: - if (file_present || created) { - inode_unlock(d_inode(parent_path.dentry)); - path_put(&path); - path_put(&parent_path); - } - ksmbd_revert_fsids(work); + if (rc && (file_present || created)) + ksmbd_vfs_kern_path_unlock(&parent_path, &path); + err_out1: + ksmbd_revert_fsids(work); +err_out2: + if (!rc) { + ksmbd_update_fstate(&work->sess->file_table, fp, FP_INITED); + rc = ksmbd_iov_pin_rsp(work, (void *)rsp, iov_len); + } if (rc) { if (rc == -EINVAL) rsp->hdr.Status = STATUS_INVALID_PARAMETER; @@ -4063,7 +4144,10 @@ int smb2_query_dir(struct ksmbd_work *work) rsp->OutputBufferOffset = cpu_to_le16(0); rsp->OutputBufferLength = cpu_to_le32(0); rsp->Buffer[0] = 0; - inc_rfc1001_len(work->response_buf, 9); + rc = ksmbd_iov_pin_rsp(work, (void *)rsp, + sizeof(struct smb2_query_directory_rsp)); + if (rc) + goto err_out; } else { no_buf_len: ((struct file_directory_info *) @@ -4075,7 +4159,11 @@ no_buf_len: rsp->StructureSize = cpu_to_le16(9); rsp->OutputBufferOffset = cpu_to_le16(72); rsp->OutputBufferLength = cpu_to_le32(d_info.data_count); - inc_rfc1001_len(work->response_buf, 8 + d_info.data_count); + rc = ksmbd_iov_pin_rsp(work, (void *)rsp, + offsetof(struct smb2_query_directory_rsp, Buffer) + + d_info.data_count); + if (rc) + goto err_out; } kfree(srch_ptr); @@ -4116,27 +4204,18 @@ err_out2: * @reqOutputBufferLength: max buffer length expected in command response * @rsp: query info response buffer contains output buffer length * @rsp_org: base response buffer pointer in case of chained response - * @infoclass_size: query info class response buffer size * * Return: 0 on success, otherwise error */ static int buffer_check_err(int reqOutputBufferLength, struct smb2_query_info_rsp *rsp, - void *rsp_org, int infoclass_size) + void *rsp_org) { if (reqOutputBufferLength < le32_to_cpu(rsp->OutputBufferLength)) { - if (reqOutputBufferLength < infoclass_size) { - pr_err("Invalid Buffer Size Requested\n"); - rsp->hdr.Status = STATUS_INFO_LENGTH_MISMATCH; - *(__be32 *)rsp_org = cpu_to_be32(sizeof(struct smb2_hdr)); - return -EINVAL; - } - - ksmbd_debug(SMB, "Buffer Overflow\n"); - rsp->hdr.Status = STATUS_BUFFER_OVERFLOW; - *(__be32 *)rsp_org = cpu_to_be32(sizeof(struct smb2_hdr) + - reqOutputBufferLength); - rsp->OutputBufferLength = cpu_to_le32(reqOutputBufferLength); + pr_err("Invalid Buffer Size Requested\n"); + rsp->hdr.Status = STATUS_INFO_LENGTH_MISMATCH; + *(__be32 *)rsp_org = cpu_to_be32(sizeof(struct smb2_hdr)); + return -EINVAL; } return 0; } @@ -4155,7 +4234,6 @@ static void get_standard_info_pipe(struct smb2_query_info_rsp *rsp, sinfo->Directory = 0; rsp->OutputBufferLength = cpu_to_le32(sizeof(struct smb2_file_standard_info)); - inc_rfc1001_len(rsp_org, sizeof(struct smb2_file_standard_info)); } static void get_internal_info_pipe(struct smb2_query_info_rsp *rsp, u64 num, @@ -4169,7 +4247,6 @@ static void get_internal_info_pipe(struct smb2_query_info_rsp *rsp, u64 num, file_info->IndexNumber = cpu_to_le64(num | (1ULL << 63)); rsp->OutputBufferLength = cpu_to_le32(sizeof(struct smb2_file_internal_info)); - inc_rfc1001_len(rsp_org, sizeof(struct smb2_file_internal_info)); } static int smb2_get_info_file_pipe(struct ksmbd_session *sess, @@ -4195,14 +4272,12 @@ static int smb2_get_info_file_pipe(struct ksmbd_session *sess, case FILE_STANDARD_INFORMATION: get_standard_info_pipe(rsp, rsp_org); rc = buffer_check_err(le32_to_cpu(req->OutputBufferLength), - rsp, rsp_org, - FILE_STANDARD_INFORMATION_SIZE); + rsp, rsp_org); break; case FILE_INTERNAL_INFORMATION: get_internal_info_pipe(rsp, id, rsp_org); rc = buffer_check_err(le32_to_cpu(req->OutputBufferLength), - rsp, rsp_org, - FILE_INTERNAL_INFORMATION_SIZE); + rsp, rsp_org); break; default: ksmbd_debug(SMB, "smb2_info_file_pipe for %u not supported\n", @@ -4308,7 +4383,7 @@ static int smb2_get_ea(struct ksmbd_work *work, struct ksmbd_file *fp, if (!strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN)) name_len -= XATTR_USER_PREFIX_LEN; - ptr = (char *)(&eainfo->name + name_len + 1); + ptr = eainfo->name + name_len + 1; buf_free_len -= (offsetof(struct smb2_ea_info, name) + name_len + 1); /* bailout if xattr can't fit in buf_free_len */ @@ -4370,7 +4445,6 @@ done: if (rsp_data_cnt == 0) rsp->hdr.Status = STATUS_NO_EAS_ON_FILE; rsp->OutputBufferLength = cpu_to_le32(rsp_data_cnt); - inc_rfc1001_len(rsp_org, rsp_data_cnt); out: kvfree(xattr_list); return rc; @@ -4385,7 +4459,6 @@ static void get_file_access_info(struct smb2_query_info_rsp *rsp, file_info->AccessFlags = fp->daccess; rsp->OutputBufferLength = cpu_to_le32(sizeof(struct smb2_file_access_info)); - inc_rfc1001_len(rsp_org, sizeof(struct smb2_file_access_info)); } static int get_file_basic_info(struct smb2_query_info_rsp *rsp, @@ -4402,8 +4475,8 @@ static int get_file_basic_info(struct smb2_query_info_rsp *rsp, } basic_info = (struct smb2_file_basic_info *)rsp->Buffer; - generic_fillattr(file_mnt_idmap(fp->filp), file_inode(fp->filp), - &stat); + generic_fillattr(file_mnt_idmap(fp->filp), STATX_BASIC_STATS, + file_inode(fp->filp), &stat); basic_info->CreationTime = cpu_to_le64(fp->create_time); time = ksmbd_UnixTimeToNT(stat.atime); basic_info->LastAccessTime = cpu_to_le64(time); @@ -4415,7 +4488,6 @@ static int get_file_basic_info(struct smb2_query_info_rsp *rsp, basic_info->Pad1 = 0; rsp->OutputBufferLength = cpu_to_le32(sizeof(struct smb2_file_basic_info)); - inc_rfc1001_len(rsp_org, sizeof(struct smb2_file_basic_info)); return 0; } @@ -4428,7 +4500,7 @@ static void get_file_standard_info(struct smb2_query_info_rsp *rsp, struct kstat stat; inode = file_inode(fp->filp); - generic_fillattr(file_mnt_idmap(fp->filp), inode, &stat); + generic_fillattr(file_mnt_idmap(fp->filp), STATX_BASIC_STATS, inode, &stat); sinfo = (struct smb2_file_standard_info *)rsp->Buffer; delete_pending = ksmbd_inode_pending_delete(fp); @@ -4440,8 +4512,6 @@ static void get_file_standard_info(struct smb2_query_info_rsp *rsp, sinfo->Directory = S_ISDIR(stat.mode) ? 1 : 0; rsp->OutputBufferLength = cpu_to_le32(sizeof(struct smb2_file_standard_info)); - inc_rfc1001_len(rsp_org, - sizeof(struct smb2_file_standard_info)); } static void get_file_alignment_info(struct smb2_query_info_rsp *rsp, @@ -4453,8 +4523,6 @@ static void get_file_alignment_info(struct smb2_query_info_rsp *rsp, file_info->AlignmentRequirement = 0; rsp->OutputBufferLength = cpu_to_le32(sizeof(struct smb2_file_alignment_info)); - inc_rfc1001_len(rsp_org, - sizeof(struct smb2_file_alignment_info)); } static int get_file_all_info(struct ksmbd_work *work, @@ -4482,7 +4550,7 @@ static int get_file_all_info(struct ksmbd_work *work, return PTR_ERR(filename); inode = file_inode(fp->filp); - generic_fillattr(file_mnt_idmap(fp->filp), inode, &stat); + generic_fillattr(file_mnt_idmap(fp->filp), STATX_BASIC_STATS, inode, &stat); ksmbd_debug(SMB, "filename = %s\n", filename); delete_pending = ksmbd_inode_pending_delete(fp); @@ -4518,7 +4586,6 @@ static int get_file_all_info(struct ksmbd_work *work, rsp->OutputBufferLength = cpu_to_le32(sizeof(struct smb2_file_all_info) + conv_len - 1); kfree(filename); - inc_rfc1001_len(rsp_org, le32_to_cpu(rsp->OutputBufferLength)); return 0; } @@ -4541,7 +4608,6 @@ static void get_file_alternate_info(struct ksmbd_work *work, file_info->FileNameLength = cpu_to_le32(conv_len); rsp->OutputBufferLength = cpu_to_le32(sizeof(struct smb2_file_alt_name_info) + conv_len); - inc_rfc1001_len(rsp_org, le32_to_cpu(rsp->OutputBufferLength)); } static void get_file_stream_info(struct ksmbd_work *work, @@ -4559,8 +4625,8 @@ static void get_file_stream_info(struct ksmbd_work *work, int buf_free_len; struct smb2_query_info_req *req = ksmbd_req_buf_next(work); - generic_fillattr(file_mnt_idmap(fp->filp), file_inode(fp->filp), - &stat); + generic_fillattr(file_mnt_idmap(fp->filp), STATX_BASIC_STATS, + file_inode(fp->filp), &stat); file_info = (struct smb2_file_stream_info *)rsp->Buffer; buf_free_len = @@ -4641,7 +4707,6 @@ out: kvfree(xattr_list); rsp->OutputBufferLength = cpu_to_le32(nbytes); - inc_rfc1001_len(rsp_org, nbytes); } static void get_file_internal_info(struct smb2_query_info_rsp *rsp, @@ -4650,13 +4715,12 @@ static void get_file_internal_info(struct smb2_query_info_rsp *rsp, struct smb2_file_internal_info *file_info; struct kstat stat; - generic_fillattr(file_mnt_idmap(fp->filp), file_inode(fp->filp), - &stat); + generic_fillattr(file_mnt_idmap(fp->filp), STATX_BASIC_STATS, + file_inode(fp->filp), &stat); file_info = (struct smb2_file_internal_info *)rsp->Buffer; file_info->IndexNumber = cpu_to_le64(stat.ino); rsp->OutputBufferLength = cpu_to_le32(sizeof(struct smb2_file_internal_info)); - inc_rfc1001_len(rsp_org, sizeof(struct smb2_file_internal_info)); } static int get_file_network_open_info(struct smb2_query_info_rsp *rsp, @@ -4676,7 +4740,7 @@ static int get_file_network_open_info(struct smb2_query_info_rsp *rsp, file_info = (struct smb2_file_ntwrk_info *)rsp->Buffer; inode = file_inode(fp->filp); - generic_fillattr(file_mnt_idmap(fp->filp), inode, &stat); + generic_fillattr(file_mnt_idmap(fp->filp), STATX_BASIC_STATS, inode, &stat); file_info->CreationTime = cpu_to_le64(fp->create_time); time = ksmbd_UnixTimeToNT(stat.atime); @@ -4692,7 +4756,6 @@ static int get_file_network_open_info(struct smb2_query_info_rsp *rsp, file_info->Reserved = cpu_to_le32(0); rsp->OutputBufferLength = cpu_to_le32(sizeof(struct smb2_file_ntwrk_info)); - inc_rfc1001_len(rsp_org, sizeof(struct smb2_file_ntwrk_info)); return 0; } @@ -4704,7 +4767,6 @@ static void get_file_ea_info(struct smb2_query_info_rsp *rsp, void *rsp_org) file_info->EASize = 0; rsp->OutputBufferLength = cpu_to_le32(sizeof(struct smb2_file_ea_info)); - inc_rfc1001_len(rsp_org, sizeof(struct smb2_file_ea_info)); } static void get_file_position_info(struct smb2_query_info_rsp *rsp, @@ -4716,7 +4778,6 @@ static void get_file_position_info(struct smb2_query_info_rsp *rsp, file_info->CurrentByteOffset = cpu_to_le64(fp->filp->f_pos); rsp->OutputBufferLength = cpu_to_le32(sizeof(struct smb2_file_pos_info)); - inc_rfc1001_len(rsp_org, sizeof(struct smb2_file_pos_info)); } static void get_file_mode_info(struct smb2_query_info_rsp *rsp, @@ -4728,7 +4789,6 @@ static void get_file_mode_info(struct smb2_query_info_rsp *rsp, file_info->Mode = fp->coption & FILE_MODE_INFO_MASK; rsp->OutputBufferLength = cpu_to_le32(sizeof(struct smb2_file_mode_info)); - inc_rfc1001_len(rsp_org, sizeof(struct smb2_file_mode_info)); } static void get_file_compression_info(struct smb2_query_info_rsp *rsp, @@ -4737,8 +4797,8 @@ static void get_file_compression_info(struct smb2_query_info_rsp *rsp, struct smb2_file_comp_info *file_info; struct kstat stat; - generic_fillattr(file_mnt_idmap(fp->filp), file_inode(fp->filp), - &stat); + generic_fillattr(file_mnt_idmap(fp->filp), STATX_BASIC_STATS, + file_inode(fp->filp), &stat); file_info = (struct smb2_file_comp_info *)rsp->Buffer; file_info->CompressedFileSize = cpu_to_le64(stat.blocks << 9); @@ -4750,7 +4810,6 @@ static void get_file_compression_info(struct smb2_query_info_rsp *rsp, rsp->OutputBufferLength = cpu_to_le32(sizeof(struct smb2_file_comp_info)); - inc_rfc1001_len(rsp_org, sizeof(struct smb2_file_comp_info)); } static int get_file_attribute_tag_info(struct smb2_query_info_rsp *rsp, @@ -4769,11 +4828,10 @@ static int get_file_attribute_tag_info(struct smb2_query_info_rsp *rsp, file_info->ReparseTag = 0; rsp->OutputBufferLength = cpu_to_le32(sizeof(struct smb2_file_attr_tag_info)); - inc_rfc1001_len(rsp_org, sizeof(struct smb2_file_attr_tag_info)); return 0; } -static int find_file_posix_info(struct smb2_query_info_rsp *rsp, +static void find_file_posix_info(struct smb2_query_info_rsp *rsp, struct ksmbd_file *fp, void *rsp_org) { struct smb311_posix_qinfo *file_info; @@ -4786,11 +4844,11 @@ static int find_file_posix_info(struct smb2_query_info_rsp *rsp, file_info = (struct smb311_posix_qinfo *)rsp->Buffer; file_info->CreationTime = cpu_to_le64(fp->create_time); - time = ksmbd_UnixTimeToNT(inode->i_atime); + time = ksmbd_UnixTimeToNT(inode_get_atime(inode)); file_info->LastAccessTime = cpu_to_le64(time); - time = ksmbd_UnixTimeToNT(inode->i_mtime); + time = ksmbd_UnixTimeToNT(inode_get_mtime(inode)); file_info->LastWriteTime = cpu_to_le64(time); - time = ksmbd_UnixTimeToNT(inode->i_ctime); + time = ksmbd_UnixTimeToNT(inode_get_ctime(inode)); file_info->ChangeTime = cpu_to_le64(time); file_info->DosAttributes = fp->f_ci->m_fattr; file_info->Inode = cpu_to_le64(inode->i_ino); @@ -4811,8 +4869,6 @@ static int find_file_posix_info(struct smb2_query_info_rsp *rsp, SIDUNIX_GROUP, (struct smb_sid *)&file_info->Sids[16]); rsp->OutputBufferLength = cpu_to_le32(out_buf_len); - inc_rfc1001_len(rsp_org, out_buf_len); - return out_buf_len; } static int smb2_get_info_file(struct ksmbd_work *work, @@ -4822,7 +4878,6 @@ static int smb2_get_info_file(struct ksmbd_work *work, struct ksmbd_file *fp; int fileinfoclass = 0; int rc = 0; - int file_infoclass_size; unsigned int id = KSMBD_NO_FID, pid = KSMBD_NO_FID; if (test_share_config_flag(work->tcon->share_conf, @@ -4855,85 +4910,69 @@ static int smb2_get_info_file(struct ksmbd_work *work, switch (fileinfoclass) { case FILE_ACCESS_INFORMATION: get_file_access_info(rsp, fp, work->response_buf); - file_infoclass_size = FILE_ACCESS_INFORMATION_SIZE; break; case FILE_BASIC_INFORMATION: rc = get_file_basic_info(rsp, fp, work->response_buf); - file_infoclass_size = FILE_BASIC_INFORMATION_SIZE; break; case FILE_STANDARD_INFORMATION: get_file_standard_info(rsp, fp, work->response_buf); - file_infoclass_size = FILE_STANDARD_INFORMATION_SIZE; break; case FILE_ALIGNMENT_INFORMATION: get_file_alignment_info(rsp, work->response_buf); - file_infoclass_size = FILE_ALIGNMENT_INFORMATION_SIZE; break; case FILE_ALL_INFORMATION: rc = get_file_all_info(work, rsp, fp, work->response_buf); - file_infoclass_size = FILE_ALL_INFORMATION_SIZE; break; case FILE_ALTERNATE_NAME_INFORMATION: get_file_alternate_info(work, rsp, fp, work->response_buf); - file_infoclass_size = FILE_ALTERNATE_NAME_INFORMATION_SIZE; break; case FILE_STREAM_INFORMATION: get_file_stream_info(work, rsp, fp, work->response_buf); - file_infoclass_size = FILE_STREAM_INFORMATION_SIZE; break; case FILE_INTERNAL_INFORMATION: get_file_internal_info(rsp, fp, work->response_buf); - file_infoclass_size = FILE_INTERNAL_INFORMATION_SIZE; break; case FILE_NETWORK_OPEN_INFORMATION: rc = get_file_network_open_info(rsp, fp, work->response_buf); - file_infoclass_size = FILE_NETWORK_OPEN_INFORMATION_SIZE; break; case FILE_EA_INFORMATION: get_file_ea_info(rsp, work->response_buf); - file_infoclass_size = FILE_EA_INFORMATION_SIZE; break; case FILE_FULL_EA_INFORMATION: rc = smb2_get_ea(work, fp, req, rsp, work->response_buf); - file_infoclass_size = FILE_FULL_EA_INFORMATION_SIZE; break; case FILE_POSITION_INFORMATION: get_file_position_info(rsp, fp, work->response_buf); - file_infoclass_size = FILE_POSITION_INFORMATION_SIZE; break; case FILE_MODE_INFORMATION: get_file_mode_info(rsp, fp, work->response_buf); - file_infoclass_size = FILE_MODE_INFORMATION_SIZE; break; case FILE_COMPRESSION_INFORMATION: get_file_compression_info(rsp, fp, work->response_buf); - file_infoclass_size = FILE_COMPRESSION_INFORMATION_SIZE; break; case FILE_ATTRIBUTE_TAG_INFORMATION: rc = get_file_attribute_tag_info(rsp, fp, work->response_buf); - file_infoclass_size = FILE_ATTRIBUTE_TAG_INFORMATION_SIZE; break; case SMB_FIND_FILE_POSIX_INFO: if (!work->tcon->posix_extensions) { pr_err("client doesn't negotiate with SMB3.1.1 POSIX Extensions\n"); rc = -EOPNOTSUPP; } else { - file_infoclass_size = find_file_posix_info(rsp, fp, - work->response_buf); + find_file_posix_info(rsp, fp, work->response_buf); } break; default: @@ -4943,8 +4982,7 @@ static int smb2_get_info_file(struct ksmbd_work *work, } if (!rc) rc = buffer_check_err(le32_to_cpu(req->OutputBufferLength), - rsp, work->response_buf, - file_infoclass_size); + rsp, work->response_buf); ksmbd_fd_put(work, fp); return rc; } @@ -4960,7 +4998,6 @@ static int smb2_get_info_filesystem(struct ksmbd_work *work, struct kstatfs stfs; struct path path; int rc = 0, len; - int fs_infoclass_size = 0; if (!share->path) return -EIO; @@ -4990,8 +5027,6 @@ static int smb2_get_info_filesystem(struct ksmbd_work *work, info->DeviceType = cpu_to_le32(stfs.f_type); info->DeviceCharacteristics = cpu_to_le32(0x00000020); rsp->OutputBufferLength = cpu_to_le32(8); - inc_rfc1001_len(work->response_buf, 8); - fs_infoclass_size = FS_DEVICE_INFORMATION_SIZE; break; } case FS_ATTRIBUTE_INFORMATION: @@ -5020,8 +5055,6 @@ static int smb2_get_info_filesystem(struct ksmbd_work *work, info->FileSystemNameLen = cpu_to_le32(len); sz = sizeof(struct filesystem_attribute_info) - 2 + len; rsp->OutputBufferLength = cpu_to_le32(sz); - inc_rfc1001_len(work->response_buf, sz); - fs_infoclass_size = FS_ATTRIBUTE_INFORMATION_SIZE; break; } case FS_VOLUME_INFORMATION: @@ -5048,8 +5081,6 @@ static int smb2_get_info_filesystem(struct ksmbd_work *work, info->Reserved = 0; sz = sizeof(struct filesystem_vol_info) - 2 + len; rsp->OutputBufferLength = cpu_to_le32(sz); - inc_rfc1001_len(work->response_buf, sz); - fs_infoclass_size = FS_VOLUME_INFORMATION_SIZE; break; } case FS_SIZE_INFORMATION: @@ -5062,8 +5093,6 @@ static int smb2_get_info_filesystem(struct ksmbd_work *work, info->SectorsPerAllocationUnit = cpu_to_le32(1); info->BytesPerSector = cpu_to_le32(stfs.f_bsize); rsp->OutputBufferLength = cpu_to_le32(24); - inc_rfc1001_len(work->response_buf, 24); - fs_infoclass_size = FS_SIZE_INFORMATION_SIZE; break; } case FS_FULL_SIZE_INFORMATION: @@ -5079,8 +5108,6 @@ static int smb2_get_info_filesystem(struct ksmbd_work *work, info->SectorsPerAllocationUnit = cpu_to_le32(1); info->BytesPerSector = cpu_to_le32(stfs.f_bsize); rsp->OutputBufferLength = cpu_to_le32(32); - inc_rfc1001_len(work->response_buf, 32); - fs_infoclass_size = FS_FULL_SIZE_INFORMATION_SIZE; break; } case FS_OBJECT_ID_INFORMATION: @@ -5100,8 +5127,6 @@ static int smb2_get_info_filesystem(struct ksmbd_work *work, info->extended_info.rel_date = 0; memcpy(info->extended_info.version_string, "1.1.0", strlen("1.1.0")); rsp->OutputBufferLength = cpu_to_le32(64); - inc_rfc1001_len(work->response_buf, 64); - fs_infoclass_size = FS_OBJECT_ID_INFORMATION_SIZE; break; } case FS_SECTOR_SIZE_INFORMATION: @@ -5123,8 +5148,6 @@ static int smb2_get_info_filesystem(struct ksmbd_work *work, info->ByteOffsetForSectorAlignment = 0; info->ByteOffsetForPartitionAlignment = 0; rsp->OutputBufferLength = cpu_to_le32(28); - inc_rfc1001_len(work->response_buf, 28); - fs_infoclass_size = FS_SECTOR_SIZE_INFORMATION_SIZE; break; } case FS_CONTROL_INFORMATION: @@ -5145,8 +5168,6 @@ static int smb2_get_info_filesystem(struct ksmbd_work *work, info->DefaultQuotaLimit = cpu_to_le64(SMB2_NO_FID); info->Padding = 0; rsp->OutputBufferLength = cpu_to_le32(48); - inc_rfc1001_len(work->response_buf, 48); - fs_infoclass_size = FS_CONTROL_INFORMATION_SIZE; break; } case FS_POSIX_INFORMATION: @@ -5166,8 +5187,6 @@ static int smb2_get_info_filesystem(struct ksmbd_work *work, info->TotalFileNodes = cpu_to_le64(stfs.f_files); info->FreeFileNodes = cpu_to_le64(stfs.f_ffree); rsp->OutputBufferLength = cpu_to_le32(56); - inc_rfc1001_len(work->response_buf, 56); - fs_infoclass_size = FS_POSIX_INFORMATION_SIZE; } break; } @@ -5176,8 +5195,7 @@ static int smb2_get_info_filesystem(struct ksmbd_work *work, return -EOPNOTSUPP; } rc = buffer_check_err(le32_to_cpu(req->OutputBufferLength), - rsp, work->response_buf, - fs_infoclass_size); + rsp, work->response_buf); path_put(&path); return rc; } @@ -5211,7 +5229,6 @@ static int smb2_get_info_sec(struct ksmbd_work *work, secdesclen = sizeof(struct smb_ntsd); rsp->OutputBufferLength = cpu_to_le32(secdesclen); - inc_rfc1001_len(work->response_buf, secdesclen); return 0; } @@ -5256,7 +5273,6 @@ static int smb2_get_info_sec(struct ksmbd_work *work, return rc; rsp->OutputBufferLength = cpu_to_le32(secdesclen); - inc_rfc1001_len(work->response_buf, secdesclen); return 0; } @@ -5295,6 +5311,14 @@ int smb2_query_info(struct ksmbd_work *work) rc = -EOPNOTSUPP; } + if (!rc) { + rsp->StructureSize = cpu_to_le16(9); + rsp->OutputBufferOffset = cpu_to_le16(72); + rc = ksmbd_iov_pin_rsp(work, (void *)rsp, + offsetof(struct smb2_query_info_rsp, Buffer) + + le32_to_cpu(rsp->OutputBufferLength)); + } + if (rc < 0) { if (rc == -EACCES) rsp->hdr.Status = STATUS_ACCESS_DENIED; @@ -5302,6 +5326,8 @@ int smb2_query_info(struct ksmbd_work *work) rsp->hdr.Status = STATUS_FILE_CLOSED; else if (rc == -EIO) rsp->hdr.Status = STATUS_UNEXPECTED_IO_ERROR; + else if (rc == -ENOMEM) + rsp->hdr.Status = STATUS_INSUFFICIENT_RESOURCES; else if (rc == -EOPNOTSUPP || rsp->hdr.Status == 0) rsp->hdr.Status = STATUS_INVALID_INFO_CLASS; smb2_set_err_rsp(work); @@ -5310,9 +5336,6 @@ int smb2_query_info(struct ksmbd_work *work) rc); return rc; } - rsp->StructureSize = cpu_to_le16(9); - rsp->OutputBufferOffset = cpu_to_le16(72); - inc_rfc1001_len(work->response_buf, 8); return 0; } @@ -5343,8 +5366,9 @@ static noinline int smb2_close_pipe(struct ksmbd_work *work) rsp->AllocationSize = 0; rsp->EndOfFile = 0; rsp->Attributes = 0; - inc_rfc1001_len(work->response_buf, 60); - return 0; + + return ksmbd_iov_pin_rsp(work, (void *)rsp, + sizeof(struct smb2_close_rsp)); } /** @@ -5429,11 +5453,11 @@ int smb2_close(struct ksmbd_work *work) rsp->EndOfFile = cpu_to_le64(inode->i_size); rsp->Attributes = fp->f_ci->m_fattr; rsp->CreationTime = cpu_to_le64(fp->create_time); - time = ksmbd_UnixTimeToNT(inode->i_atime); + time = ksmbd_UnixTimeToNT(inode_get_atime(inode)); rsp->LastAccessTime = cpu_to_le64(time); - time = ksmbd_UnixTimeToNT(inode->i_mtime); + time = ksmbd_UnixTimeToNT(inode_get_mtime(inode)); rsp->LastWriteTime = cpu_to_le64(time); - time = ksmbd_UnixTimeToNT(inode->i_ctime); + time = ksmbd_UnixTimeToNT(inode_get_ctime(inode)); rsp->ChangeTime = cpu_to_le64(time); ksmbd_fd_put(work, fp); } else { @@ -5449,15 +5473,17 @@ int smb2_close(struct ksmbd_work *work) err = ksmbd_close_fd(work, volatile_id); out: + if (!err) + err = ksmbd_iov_pin_rsp(work, (void *)rsp, + sizeof(struct smb2_close_rsp)); + if (err) { if (rsp->hdr.Status == 0) rsp->hdr.Status = STATUS_FILE_CLOSED; smb2_set_err_rsp(work); - } else { - inc_rfc1001_len(work->response_buf, 60); } - return 0; + return err; } /** @@ -5475,8 +5501,7 @@ int smb2_echo(struct ksmbd_work *work) rsp->StructureSize = cpu_to_le16(4); rsp->Reserved = 0; - inc_rfc1001_len(work->response_buf, 4); - return 0; + return ksmbd_iov_pin_rsp(work, rsp, sizeof(struct smb2_echo_rsp)); } static int smb2_rename(struct ksmbd_work *work, @@ -5522,7 +5547,7 @@ static int smb2_rename(struct ksmbd_work *work, rc = ksmbd_vfs_setxattr(file_mnt_idmap(fp->filp), &fp->filp->f_path, xattr_stream_name, - NULL, 0, 0); + NULL, 0, 0, true); if (rc < 0) { pr_err("failed to store stream name in xattr: %d\n", rc); @@ -5615,11 +5640,9 @@ static int smb2_create_link(struct ksmbd_work *work, if (rc) rc = -EINVAL; out: - if (file_present) { - inode_unlock(d_inode(parent_path.dentry)); - path_put(&path); - path_put(&parent_path); - } + if (file_present) + ksmbd_vfs_kern_path_unlock(&parent_path, &path); + if (!IS_ERR(link_name)) kfree(link_name); kfree(pathname); @@ -5656,7 +5679,7 @@ static int set_file_basic_info(struct ksmbd_file *fp, if (file_info->ChangeTime) attrs.ia_ctime = ksmbd_NTtimeToUnix(file_info->ChangeTime); else - attrs.ia_ctime = inode->i_ctime; + attrs.ia_ctime = inode_get_ctime(inode); if (file_info->LastWriteTime) { attrs.ia_mtime = ksmbd_NTtimeToUnix(file_info->LastWriteTime); @@ -5686,7 +5709,8 @@ static int set_file_basic_info(struct ksmbd_file *fp, da.flags = XATTR_DOSINFO_ATTRIB | XATTR_DOSINFO_CREATE_TIME | XATTR_DOSINFO_ITIME; - rc = ksmbd_vfs_set_dos_attrib_xattr(idmap, &filp->f_path, &da); + rc = ksmbd_vfs_set_dos_attrib_xattr(idmap, &filp->f_path, &da, + true); if (rc) ksmbd_debug(SMB, "failed to restore file attribute in EA\n"); @@ -5701,7 +5725,7 @@ static int set_file_basic_info(struct ksmbd_file *fp, return -EACCES; inode_lock(inode); - inode->i_ctime = attrs.ia_ctime; + inode_set_ctime_to_ts(inode, attrs.ia_ctime); attrs.ia_valid &= ~ATTR_CTIME; rc = notify_change(idmap, dentry, &attrs, NULL); inode_unlock(inode); @@ -5998,7 +6022,7 @@ static int smb2_set_info_sec(struct ksmbd_file *fp, int addition_info, fp->saccess |= FILE_SHARE_DELETE_LE; return set_info_sec(fp->conn, fp->tcon, &fp->filp->f_path, pntsd, - buf_len, false); + buf_len, false, true); } /** @@ -6068,7 +6092,10 @@ int smb2_set_info(struct ksmbd_work *work) goto err_out; rsp->StructureSize = cpu_to_le16(2); - inc_rfc1001_len(work->response_buf, 2); + rc = ksmbd_iov_pin_rsp(work, (void *)rsp, + sizeof(struct smb2_set_info_rsp)); + if (rc) + goto err_out; ksmbd_fd_put(work, fp); return 0; @@ -6115,28 +6142,36 @@ static noinline int smb2_read_pipe(struct ksmbd_work *work) id = req->VolatileFileId; - inc_rfc1001_len(work->response_buf, 16); rpc_resp = ksmbd_rpc_read(work->sess, id); if (rpc_resp) { + void *aux_payload_buf; + if (rpc_resp->flags != KSMBD_RPC_OK) { err = -EINVAL; goto out; } - work->aux_payload_buf = + aux_payload_buf = kvmalloc(rpc_resp->payload_sz, GFP_KERNEL); - if (!work->aux_payload_buf) { + if (!aux_payload_buf) { err = -ENOMEM; goto out; } - memcpy(work->aux_payload_buf, rpc_resp->payload, - rpc_resp->payload_sz); + memcpy(aux_payload_buf, rpc_resp->payload, rpc_resp->payload_sz); nbytes = rpc_resp->payload_sz; - work->resp_hdr_sz = get_rfc1002_len(work->response_buf) + 4; - work->aux_payload_sz = nbytes; + err = ksmbd_iov_pin_rsp_read(work, (void *)rsp, + offsetof(struct smb2_read_rsp, Buffer), + aux_payload_buf, nbytes); + if (err) + goto out; kvfree(rpc_resp); + } else { + err = ksmbd_iov_pin_rsp(work, (void *)rsp, + offsetof(struct smb2_read_rsp, Buffer)); + if (err) + goto out; } rsp->StructureSize = cpu_to_le16(17); @@ -6145,7 +6180,6 @@ static noinline int smb2_read_pipe(struct ksmbd_work *work) rsp->DataLength = cpu_to_le32(nbytes); rsp->DataRemaining = 0; rsp->Flags = 0; - inc_rfc1001_len(work->response_buf, nbytes); return 0; out: @@ -6219,13 +6253,8 @@ int smb2_read(struct ksmbd_work *work) int err = 0; bool is_rdma_channel = false; unsigned int max_read_size = conn->vals->max_read_size; - - WORK_BUFFERS(work, req, rsp); - if (work->next_smb2_rcv_hdr_off) { - work->send_no_response = 1; - err = -EOPNOTSUPP; - goto out; - } + unsigned int id = KSMBD_NO_FID, pid = KSMBD_NO_FID; + void *aux_payload_buf; if (test_share_config_flag(work->tcon->share_conf, KSMBD_SHARE_FLAG_PIPE)) { @@ -6233,6 +6262,25 @@ int smb2_read(struct ksmbd_work *work) return smb2_read_pipe(work); } + if (work->next_smb2_rcv_hdr_off) { + req = ksmbd_req_buf_next(work); + rsp = ksmbd_resp_buf_next(work); + if (!has_file_id(req->VolatileFileId)) { + ksmbd_debug(SMB, "Compound request set FID = %llu\n", + work->compound_fid); + id = work->compound_fid; + pid = work->compound_pfid; + } + } else { + req = smb2_get_msg(work->request_buf); + rsp = smb2_get_msg(work->response_buf); + } + + if (!has_file_id(id)) { + id = req->VolatileFileId; + pid = req->PersistentFileId; + } + if (req->Channel == SMB2_CHANNEL_RDMA_V1_INVALIDATE || req->Channel == SMB2_CHANNEL_RDMA_V1) { is_rdma_channel = true; @@ -6255,7 +6303,7 @@ int smb2_read(struct ksmbd_work *work) goto out; } - fp = ksmbd_lookup_fd_slow(work, req->VolatileFileId, req->PersistentFileId); + fp = ksmbd_lookup_fd_slow(work, id, pid); if (!fp) { err = -ENOENT; goto out; @@ -6281,21 +6329,20 @@ int smb2_read(struct ksmbd_work *work) ksmbd_debug(SMB, "filename %pD, offset %lld, len %zu\n", fp->filp, offset, length); - work->aux_payload_buf = kvzalloc(length, GFP_KERNEL); - if (!work->aux_payload_buf) { + aux_payload_buf = kvzalloc(length, GFP_KERNEL); + if (!aux_payload_buf) { err = -ENOMEM; goto out; } - nbytes = ksmbd_vfs_read(work, fp, length, &offset); + nbytes = ksmbd_vfs_read(work, fp, length, &offset, aux_payload_buf); if (nbytes < 0) { err = nbytes; goto out; } if ((nbytes == 0 && length != 0) || nbytes < mincount) { - kvfree(work->aux_payload_buf); - work->aux_payload_buf = NULL; + kvfree(aux_payload_buf); rsp->hdr.Status = STATUS_END_OF_FILE; smb2_set_err_rsp(work); ksmbd_fd_put(work, fp); @@ -6308,11 +6355,10 @@ int smb2_read(struct ksmbd_work *work) if (is_rdma_channel == true) { /* write data to the client using rdma channel */ remain_bytes = smb2_read_rdma_channel(work, req, - work->aux_payload_buf, + aux_payload_buf, nbytes); - kvfree(work->aux_payload_buf); - work->aux_payload_buf = NULL; - + kvfree(aux_payload_buf); + aux_payload_buf = NULL; nbytes = 0; if (remain_bytes < 0) { err = (int)remain_bytes; @@ -6326,10 +6372,11 @@ int smb2_read(struct ksmbd_work *work) rsp->DataLength = cpu_to_le32(nbytes); rsp->DataRemaining = cpu_to_le32(remain_bytes); rsp->Flags = 0; - inc_rfc1001_len(work->response_buf, 16); - work->resp_hdr_sz = get_rfc1002_len(work->response_buf) + 4; - work->aux_payload_sz = nbytes; - inc_rfc1001_len(work->response_buf, nbytes); + err = ksmbd_iov_pin_rsp_read(work, (void *)rsp, + offsetof(struct smb2_read_rsp, Buffer), + aux_payload_buf, nbytes); + if (err) + goto out; ksmbd_fd_put(work, fp); return 0; @@ -6412,8 +6459,8 @@ static noinline int smb2_write_pipe(struct ksmbd_work *work) rsp->DataLength = cpu_to_le32(length); rsp->DataRemaining = 0; rsp->Reserved2 = 0; - inc_rfc1001_len(work->response_buf, 16); - return 0; + err = ksmbd_iov_pin_rsp(work, (void *)rsp, + offsetof(struct smb2_write_rsp, Buffer)); out: if (err) { rsp->hdr.Status = STATUS_INVALID_HANDLE; @@ -6569,7 +6616,9 @@ int smb2_write(struct ksmbd_work *work) rsp->DataLength = cpu_to_le32(nbytes); rsp->DataRemaining = 0; rsp->Reserved2 = 0; - inc_rfc1001_len(work->response_buf, 16); + err = ksmbd_iov_pin_rsp(work, rsp, offsetof(struct smb2_write_rsp, Buffer)); + if (err) + goto out; ksmbd_fd_put(work, fp); return 0; @@ -6616,15 +6665,11 @@ int smb2_flush(struct ksmbd_work *work) rsp->StructureSize = cpu_to_le16(4); rsp->Reserved = 0; - inc_rfc1001_len(work->response_buf, 4); - return 0; + return ksmbd_iov_pin_rsp(work, rsp, sizeof(struct smb2_flush_rsp)); out: - if (err) { - rsp->hdr.Status = STATUS_INVALID_HANDLE; - smb2_set_err_rsp(work); - } - + rsp->hdr.Status = STATUS_INVALID_HANDLE; + smb2_set_err_rsp(work); return err; } @@ -7029,10 +7074,6 @@ skip: ksmbd_debug(SMB, "would have to wait for getting lock\n"); - spin_lock(&work->conn->llist_lock); - list_add_tail(&smb_lock->clist, - &work->conn->lock_list); - spin_unlock(&work->conn->llist_lock); list_add(&smb_lock->llist, &rollback_list); argv = kmalloc(sizeof(void *), GFP_KERNEL); @@ -7046,6 +7087,7 @@ skip: smb2_remove_blocked_lock, argv); if (rc) { + kfree(argv); err = -ENOMEM; goto out; } @@ -7063,9 +7105,6 @@ skip: if (work->state != KSMBD_WORK_ACTIVE) { list_del(&smb_lock->llist); - spin_lock(&work->conn->llist_lock); - list_del(&smb_lock->clist); - spin_unlock(&work->conn->llist_lock); locks_free_lock(flock); if (work->state == KSMBD_WORK_CANCELLED) { @@ -7078,8 +7117,6 @@ skip: goto out; } - init_smb2_rsp_hdr(work); - smb2_set_err_rsp(work); rsp->hdr.Status = STATUS_RANGE_NOT_LOCKED; kfree(smb_lock); @@ -7087,19 +7124,16 @@ skip: } list_del(&smb_lock->llist); - spin_lock(&work->conn->llist_lock); - list_del(&smb_lock->clist); - spin_unlock(&work->conn->llist_lock); release_async_work(work); goto retry; } else if (!rc) { + list_add(&smb_lock->llist, &rollback_list); spin_lock(&work->conn->llist_lock); list_add_tail(&smb_lock->clist, &work->conn->lock_list); list_add_tail(&smb_lock->flist, &fp->lock_list); spin_unlock(&work->conn->llist_lock); - list_add(&smb_lock->llist, &rollback_list); ksmbd_debug(SMB, "successful in taking lock\n"); } else { goto out; @@ -7114,7 +7148,10 @@ skip: ksmbd_debug(SMB, "successful in taking lock\n"); rsp->hdr.Status = STATUS_SUCCESS; rsp->Reserved = 0; - inc_rfc1001_len(work->response_buf, 4); + err = ksmbd_iov_pin_rsp(work, rsp, sizeof(struct smb2_lock_rsp)); + if (err) + goto out; + ksmbd_fd_put(work, fp); return 0; @@ -7555,7 +7592,8 @@ static inline int fsctl_set_sparse(struct ksmbd_work *work, u64 id, da.attr = le32_to_cpu(fp->f_ci->m_fattr); ret = ksmbd_vfs_set_dos_attrib_xattr(idmap, - &fp->filp->f_path, &da); + &fp->filp->f_path, + &da, true); if (ret) fp->f_ci->m_fattr = old_fattr; } @@ -7910,9 +7948,9 @@ dup_ext_out: rsp->Reserved = cpu_to_le16(0); rsp->Flags = cpu_to_le32(0); rsp->Reserved2 = cpu_to_le32(0); - inc_rfc1001_len(work->response_buf, 48 + nbytes); - - return 0; + ret = ksmbd_iov_pin_rsp(work, rsp, sizeof(struct smb2_ioctl_rsp) + nbytes); + if (!ret) + return ret; out: if (ret == -EACCES) @@ -8036,10 +8074,10 @@ static void smb20_oplock_break_ack(struct ksmbd_work *work) goto err_out; } - opinfo_put(opinfo); - ksmbd_fd_put(work, fp); opinfo->op_state = OPLOCK_STATE_NONE; wake_up_interruptible_all(&opinfo->oplock_q); + opinfo_put(opinfo); + ksmbd_fd_put(work, fp); rsp->StructureSize = cpu_to_le16(24); rsp->OplockLevel = rsp_oplevel; @@ -8047,8 +8085,9 @@ static void smb20_oplock_break_ack(struct ksmbd_work *work) rsp->Reserved2 = 0; rsp->VolatileFid = volatile_id; rsp->PersistentFid = persistent_id; - inc_rfc1001_len(work->response_buf, 24); - return; + ret = ksmbd_iov_pin_rsp(work, rsp, sizeof(struct smb2_oplock_break)); + if (!ret) + return; err_out: opinfo->op_state = OPLOCK_STATE_NONE; @@ -8180,6 +8219,11 @@ static void smb21_lease_break_ack(struct ksmbd_work *work) le32_to_cpu(req->LeaseState)); } + if (ret < 0) { + rsp->hdr.Status = err; + goto err_out; + } + lease_state = lease->state; opinfo->op_state = OPLOCK_STATE_NONE; wake_up_interruptible_all(&opinfo->oplock_q); @@ -8187,22 +8231,17 @@ static void smb21_lease_break_ack(struct ksmbd_work *work) wake_up_interruptible_all(&opinfo->oplock_brk); opinfo_put(opinfo); - if (ret < 0) { - rsp->hdr.Status = err; - goto err_out; - } - rsp->StructureSize = cpu_to_le16(36); rsp->Reserved = 0; rsp->Flags = 0; memcpy(rsp->LeaseKey, req->LeaseKey, 16); rsp->LeaseState = lease_state; rsp->LeaseDuration = 0; - inc_rfc1001_len(work->response_buf, 36); - return; + ret = ksmbd_iov_pin_rsp(work, rsp, sizeof(struct smb2_lease_ack)); + if (!ret) + return; err_out: - opinfo->op_state = OPLOCK_STATE_NONE; wake_up_interruptible_all(&opinfo->oplock_q); atomic_dec(&opinfo->breaking_cnt); wake_up_interruptible_all(&opinfo->oplock_brk); @@ -8337,43 +8376,19 @@ int smb2_check_sign_req(struct ksmbd_work *work) void smb2_set_sign_rsp(struct ksmbd_work *work) { struct smb2_hdr *hdr; - struct smb2_hdr *req_hdr; char signature[SMB2_HMACSHA256_SIZE]; - struct kvec iov[2]; - size_t len; + struct kvec *iov; int n_vec = 1; - hdr = smb2_get_msg(work->response_buf); - if (work->next_smb2_rsp_hdr_off) - hdr = ksmbd_resp_buf_next(work); - - req_hdr = ksmbd_req_buf_next(work); - - if (!work->next_smb2_rsp_hdr_off) { - len = get_rfc1002_len(work->response_buf); - if (req_hdr->NextCommand) - len = ALIGN(len, 8); - } else { - len = get_rfc1002_len(work->response_buf) - - work->next_smb2_rsp_hdr_off; - len = ALIGN(len, 8); - } - - if (req_hdr->NextCommand) - hdr->NextCommand = cpu_to_le32(len); - + hdr = ksmbd_resp_buf_curr(work); hdr->Flags |= SMB2_FLAGS_SIGNED; memset(hdr->Signature, 0, SMB2_SIGNATURE_SIZE); - iov[0].iov_base = (char *)&hdr->ProtocolId; - iov[0].iov_len = len; - - if (work->aux_payload_sz) { - iov[0].iov_len -= work->aux_payload_sz; - - iov[1].iov_base = work->aux_payload_buf; - iov[1].iov_len = work->aux_payload_sz; + if (hdr->Command == SMB2_READ) { + iov = &work->iov[work->iov_idx - 1]; n_vec++; + } else { + iov = &work->iov[work->iov_idx]; } if (!ksmbd_sign_smb2_pdu(work->conn, work->sess->sess_key, iov, n_vec, @@ -8449,29 +8464,14 @@ int smb3_check_sign_req(struct ksmbd_work *work) void smb3_set_sign_rsp(struct ksmbd_work *work) { struct ksmbd_conn *conn = work->conn; - struct smb2_hdr *req_hdr, *hdr; + struct smb2_hdr *hdr; struct channel *chann; char signature[SMB2_CMACAES_SIZE]; - struct kvec iov[2]; + struct kvec *iov; int n_vec = 1; - size_t len; char *signing_key; - hdr = smb2_get_msg(work->response_buf); - if (work->next_smb2_rsp_hdr_off) - hdr = ksmbd_resp_buf_next(work); - - req_hdr = ksmbd_req_buf_next(work); - - if (!work->next_smb2_rsp_hdr_off) { - len = get_rfc1002_len(work->response_buf); - if (req_hdr->NextCommand) - len = ALIGN(len, 8); - } else { - len = get_rfc1002_len(work->response_buf) - - work->next_smb2_rsp_hdr_off; - len = ALIGN(len, 8); - } + hdr = ksmbd_resp_buf_curr(work); if (conn->binding == false && le16_to_cpu(hdr->Command) == SMB2_SESSION_SETUP_HE) { @@ -8487,21 +8487,18 @@ void smb3_set_sign_rsp(struct ksmbd_work *work) if (!signing_key) return; - if (req_hdr->NextCommand) - hdr->NextCommand = cpu_to_le32(len); - hdr->Flags |= SMB2_FLAGS_SIGNED; memset(hdr->Signature, 0, SMB2_SIGNATURE_SIZE); - iov[0].iov_base = (char *)&hdr->ProtocolId; - iov[0].iov_len = len; - if (work->aux_payload_sz) { - iov[0].iov_len -= work->aux_payload_sz; - iov[1].iov_base = work->aux_payload_buf; - iov[1].iov_len = work->aux_payload_sz; + + if (hdr->Command == SMB2_READ) { + iov = &work->iov[work->iov_idx - 1]; n_vec++; + } else { + iov = &work->iov[work->iov_idx]; } - if (!ksmbd_sign_smb3_pdu(conn, signing_key, iov, n_vec, signature)) + if (!ksmbd_sign_smb3_pdu(conn, signing_key, iov, n_vec, + signature)) memcpy(hdr->Signature, signature, SMB2_SIGNATURE_SIZE); } @@ -8568,45 +8565,22 @@ static void fill_transform_hdr(void *tr_buf, char *old_buf, __le16 cipher_type) int smb3_encrypt_resp(struct ksmbd_work *work) { - char *buf = work->response_buf; - struct kvec iov[3]; + struct kvec *iov = work->iov; int rc = -ENOMEM; - int buf_size = 0, rq_nvec = 2 + (work->aux_payload_sz ? 1 : 0); + void *tr_buf; - if (ARRAY_SIZE(iov) < rq_nvec) - return -ENOMEM; - - work->tr_buf = kzalloc(sizeof(struct smb2_transform_hdr) + 4, GFP_KERNEL); - if (!work->tr_buf) + tr_buf = kzalloc(sizeof(struct smb2_transform_hdr) + 4, GFP_KERNEL); + if (!tr_buf) return rc; /* fill transform header */ - fill_transform_hdr(work->tr_buf, buf, work->conn->cipher_type); + fill_transform_hdr(tr_buf, work->response_buf, work->conn->cipher_type); - iov[0].iov_base = work->tr_buf; + iov[0].iov_base = tr_buf; iov[0].iov_len = sizeof(struct smb2_transform_hdr) + 4; - buf_size += iov[0].iov_len - 4; - - iov[1].iov_base = buf + 4; - iov[1].iov_len = get_rfc1002_len(buf); - if (work->aux_payload_sz) { - iov[1].iov_len = work->resp_hdr_sz - 4; + work->tr_buf = tr_buf; - iov[2].iov_base = work->aux_payload_buf; - iov[2].iov_len = work->aux_payload_sz; - buf_size += iov[2].iov_len; - } - buf_size += iov[1].iov_len; - work->resp_hdr_sz = iov[1].iov_len; - - rc = ksmbd_crypt_message(work, iov, rq_nvec, 1); - if (rc) - return rc; - - memmove(buf, iov[1].iov_base, iov[1].iov_len); - *(__be32 *)work->tr_buf = cpu_to_be32(buf_size); - - return rc; + return ksmbd_crypt_message(work, iov, work->iov_idx + 1, 1); } bool smb3_is_transform_hdr(void *buf) diff --git a/fs/smb/server/smb2pdu.h b/fs/smb/server/smb2pdu.h index 2767c08a534a..d12cfd3b0927 100644 --- a/fs/smb/server/smb2pdu.h +++ b/fs/smb/server/smb2pdu.h @@ -361,7 +361,7 @@ struct smb2_ea_info { __u8 Flags; __u8 EaNameLength; __le16 EaValueLength; - char name[1]; + char name[]; /* optionally followed by value */ } __packed; /* level 15 Query */ diff --git a/fs/smb/server/smb_common.c b/fs/smb/server/smb_common.c index c2b75d898852..6691ae68af0c 100644 --- a/fs/smb/server/smb_common.c +++ b/fs/smb/server/smb_common.c @@ -319,12 +319,6 @@ static int init_smb1_rsp_hdr(struct ksmbd_work *work) struct smb_hdr *rsp_hdr = (struct smb_hdr *)work->response_buf; struct smb_hdr *rcv_hdr = (struct smb_hdr *)work->request_buf; - /* - * Remove 4 byte direct TCP header. - */ - *(__be32 *)work->response_buf = - cpu_to_be32(sizeof(struct smb_hdr) - 4); - rsp_hdr->Command = SMB_COM_NEGOTIATE; *(__le32 *)rsp_hdr->Protocol = SMB1_PROTO_NUMBER; rsp_hdr->Flags = SMBFLG_RESPONSE; @@ -372,11 +366,22 @@ static int smb1_allocate_rsp_buf(struct ksmbd_work *work) return 0; } +/** + * set_smb1_rsp_status() - set error type in smb response header + * @work: smb work containing smb response header + * @err: error code to set in response + */ +static void set_smb1_rsp_status(struct ksmbd_work *work, __le32 err) +{ + work->send_no_response = 1; +} + static struct smb_version_ops smb1_server_ops = { .get_cmd_val = get_smb1_cmd_val, .init_rsp_hdr = init_smb1_rsp_hdr, .allocate_rsp_buf = smb1_allocate_rsp_buf, .check_user_session = smb1_check_user_session, + .set_rsp_status = set_smb1_rsp_status, }; static int smb1_negotiate(struct ksmbd_work *work) @@ -560,10 +565,11 @@ static int smb_handle_negotiate(struct ksmbd_work *work) ksmbd_debug(SMB, "Unsupported SMB1 protocol\n"); - /* Add 2 byte bcc and 2 byte DialectIndex. */ - inc_rfc1001_len(work->response_buf, 4); - neg_rsp->hdr.Status.CifsError = STATUS_SUCCESS; + if (ksmbd_iov_pin_rsp(work, (void *)neg_rsp, + sizeof(struct smb_negotiate_rsp) - 4)) + return -ENOMEM; + neg_rsp->hdr.Status.CifsError = STATUS_SUCCESS; neg_rsp->hdr.WordCount = 1; neg_rsp->DialectIndex = cpu_to_le16(work->conn->dialect); neg_rsp->ByteCount = 0; diff --git a/fs/smb/server/smbacl.c b/fs/smb/server/smbacl.c index e5e438bf5499..1164365533f0 100644 --- a/fs/smb/server/smbacl.c +++ b/fs/smb/server/smbacl.c @@ -1107,6 +1107,7 @@ pass: struct smb_acl *pdacl; struct smb_sid *powner_sid = NULL, *pgroup_sid = NULL; int powner_sid_size = 0, pgroup_sid_size = 0, pntsd_size; + int pntsd_alloc_size; if (parent_pntsd->osidoffset) { powner_sid = (struct smb_sid *)((char *)parent_pntsd + @@ -1119,9 +1120,10 @@ pass: pgroup_sid_size = 1 + 1 + 6 + (pgroup_sid->num_subauth * 4); } - pntsd = kzalloc(sizeof(struct smb_ntsd) + powner_sid_size + - pgroup_sid_size + sizeof(struct smb_acl) + - nt_size, GFP_KERNEL); + pntsd_alloc_size = sizeof(struct smb_ntsd) + powner_sid_size + + pgroup_sid_size + sizeof(struct smb_acl) + nt_size; + + pntsd = kzalloc(pntsd_alloc_size, GFP_KERNEL); if (!pntsd) { rc = -ENOMEM; goto free_aces_base; @@ -1136,6 +1138,27 @@ pass: pntsd->gsidoffset = parent_pntsd->gsidoffset; pntsd->dacloffset = parent_pntsd->dacloffset; + if ((u64)le32_to_cpu(pntsd->osidoffset) + powner_sid_size > + pntsd_alloc_size) { + rc = -EINVAL; + kfree(pntsd); + goto free_aces_base; + } + + if ((u64)le32_to_cpu(pntsd->gsidoffset) + pgroup_sid_size > + pntsd_alloc_size) { + rc = -EINVAL; + kfree(pntsd); + goto free_aces_base; + } + + if ((u64)le32_to_cpu(pntsd->dacloffset) + sizeof(struct smb_acl) + nt_size > + pntsd_alloc_size) { + rc = -EINVAL; + kfree(pntsd); + goto free_aces_base; + } + if (pntsd->osidoffset) { struct smb_sid *owner_sid = (struct smb_sid *)((char *)pntsd + le32_to_cpu(pntsd->osidoffset)); @@ -1162,7 +1185,7 @@ pass: pntsd_size += sizeof(struct smb_acl) + nt_size; } - ksmbd_vfs_set_sd_xattr(conn, idmap, path, pntsd, pntsd_size); + ksmbd_vfs_set_sd_xattr(conn, idmap, path, pntsd, pntsd_size, false); kfree(pntsd); } @@ -1354,7 +1377,7 @@ err_out: int set_info_sec(struct ksmbd_conn *conn, struct ksmbd_tree_connect *tcon, const struct path *path, struct smb_ntsd *pntsd, int ntsd_len, - bool type_check) + bool type_check, bool get_write) { int rc; struct smb_fattr fattr = {{0}}; @@ -1414,13 +1437,13 @@ int set_info_sec(struct ksmbd_conn *conn, struct ksmbd_tree_connect *tcon, if (test_share_config_flag(tcon->share_conf, KSMBD_SHARE_FLAG_ACL_XATTR)) { /* Update WinACL in xattr */ ksmbd_vfs_remove_sd_xattrs(idmap, path); - ksmbd_vfs_set_sd_xattr(conn, idmap, path, pntsd, ntsd_len); + ksmbd_vfs_set_sd_xattr(conn, idmap, path, pntsd, ntsd_len, + get_write); } out: posix_acl_release(fattr.cf_acls); posix_acl_release(fattr.cf_dacls); - mark_inode_dirty(inode); return rc; } diff --git a/fs/smb/server/smbacl.h b/fs/smb/server/smbacl.h index 49a8c292bd2e..2b52861707d8 100644 --- a/fs/smb/server/smbacl.h +++ b/fs/smb/server/smbacl.h @@ -207,7 +207,7 @@ int smb_check_perm_dacl(struct ksmbd_conn *conn, const struct path *path, __le32 *pdaccess, int uid); int set_info_sec(struct ksmbd_conn *conn, struct ksmbd_tree_connect *tcon, const struct path *path, struct smb_ntsd *pntsd, int ntsd_len, - bool type_check); + bool type_check, bool get_write); void id_to_sid(unsigned int cid, uint sidtype, struct smb_sid *ssid); void ksmbd_init_domain(u32 *sub_auth); diff --git a/fs/smb/server/transport_rdma.c b/fs/smb/server/transport_rdma.c index c06efc020bd9..c5629a68c8b7 100644 --- a/fs/smb/server/transport_rdma.c +++ b/fs/smb/server/transport_rdma.c @@ -1241,14 +1241,12 @@ static int smb_direct_writev(struct ksmbd_transport *t, //FIXME: skip RFC1002 header.. buflen -= 4; - iov[0].iov_base += 4; - iov[0].iov_len -= 4; remaining_data_length = buflen; ksmbd_debug(RDMA, "Sending smb (RDMA): smb_len=%u\n", buflen); smb_direct_send_ctx_init(st, &send_ctx, need_invalidate, remote_key); - start = i = 0; + start = i = 1; buflen = 0; while (true) { buflen += iov[i].iov_len; @@ -1366,24 +1364,35 @@ static int smb_direct_rdma_xmit(struct smb_direct_transport *t, LIST_HEAD(msg_list); char *desc_buf; int credits_needed; - unsigned int desc_buf_len; - size_t total_length = 0; + unsigned int desc_buf_len, desc_num = 0; if (t->status != SMB_DIRECT_CS_CONNECTED) return -ENOTCONN; + if (buf_len > t->max_rdma_rw_size) + return -EINVAL; + /* calculate needed credits */ credits_needed = 0; desc_buf = buf; for (i = 0; i < desc_len / sizeof(*desc); i++) { + if (!buf_len) + break; + desc_buf_len = le32_to_cpu(desc[i].length); + if (!desc_buf_len) + return -EINVAL; + + if (desc_buf_len > buf_len) { + desc_buf_len = buf_len; + desc[i].length = cpu_to_le32(desc_buf_len); + buf_len = 0; + } credits_needed += calc_rw_credits(t, desc_buf, desc_buf_len); desc_buf += desc_buf_len; - total_length += desc_buf_len; - if (desc_buf_len == 0 || total_length > buf_len || - total_length > t->max_rdma_rw_size) - return -EINVAL; + buf_len -= desc_buf_len; + desc_num++; } ksmbd_debug(RDMA, "RDMA %s, len %#x, needed credits %#x\n", @@ -1395,7 +1404,7 @@ static int smb_direct_rdma_xmit(struct smb_direct_transport *t, /* build rdma_rw_ctx for each descriptor */ desc_buf = buf; - for (i = 0; i < desc_len / sizeof(*desc); i++) { + for (i = 0; i < desc_num; i++) { msg = kzalloc(offsetof(struct smb_direct_rdma_rw_msg, sg_list) + sizeof(struct scatterlist) * SG_CHUNK_SIZE, GFP_KERNEL); if (!msg) { @@ -2131,8 +2140,7 @@ static int smb_direct_ib_client_add(struct ib_device *ib_dev) if (ib_dev->node_type != RDMA_NODE_IB_CA) smb_direct_port = SMB_DIRECT_PORT_IWARP; - if (!ib_dev->ops.get_netdev || - !rdma_frwr_is_supported(&ib_dev->attrs)) + if (!rdma_frwr_is_supported(&ib_dev->attrs)) return 0; smb_dev = kzalloc(sizeof(*smb_dev), GFP_KERNEL); @@ -2232,17 +2240,38 @@ bool ksmbd_rdma_capable_netdev(struct net_device *netdev) for (i = 0; i < smb_dev->ib_dev->phys_port_cnt; i++) { struct net_device *ndev; - ndev = smb_dev->ib_dev->ops.get_netdev(smb_dev->ib_dev, - i + 1); - if (!ndev) - continue; + if (smb_dev->ib_dev->ops.get_netdev) { + ndev = smb_dev->ib_dev->ops.get_netdev( + smb_dev->ib_dev, i + 1); + if (!ndev) + continue; - if (ndev == netdev) { + if (ndev == netdev) { + dev_put(ndev); + rdma_capable = true; + goto out; + } dev_put(ndev); - rdma_capable = true; - goto out; + /* if ib_dev does not implement ops.get_netdev + * check for matching infiniband GUID in hw_addr + */ + } else if (netdev->type == ARPHRD_INFINIBAND) { + struct netdev_hw_addr *ha; + union ib_gid gid; + u32 port_num; + int ret; + + netdev_hw_addr_list_for_each( + ha, &netdev->dev_addrs) { + memcpy(&gid, ha->addr + 4, sizeof(gid)); + ret = ib_find_gid(smb_dev->ib_dev, &gid, + &port_num, NULL); + if (!ret) { + rdma_capable = true; + goto out; + } + } } - dev_put(ndev); } } out: diff --git a/fs/smb/server/unicode.c b/fs/smb/server/unicode.c index 9ae676906ed3..43ed29ee44ea 100644 --- a/fs/smb/server/unicode.c +++ b/fs/smb/server/unicode.c @@ -11,49 +11,12 @@ #include <asm/unaligned.h> #include "glob.h" #include "unicode.h" -#include "uniupr.h" #include "smb_common.h" /* - * smb_utf16_bytes() - how long will a string be after conversion? - * @from: pointer to input string - * @maxbytes: don't go past this many bytes of input string - * @codepage: destination codepage - * - * Walk a utf16le string and return the number of bytes that the string will - * be after being converted to the given charset, not including any null - * termination required. Don't walk past maxbytes in the source buffer. - * - * Return: string length after conversion - */ -static int smb_utf16_bytes(const __le16 *from, int maxbytes, - const struct nls_table *codepage) -{ - int i; - int charlen, outlen = 0; - int maxwords = maxbytes / 2; - char tmp[NLS_MAX_CHARSET_SIZE]; - __u16 ftmp; - - for (i = 0; i < maxwords; i++) { - ftmp = get_unaligned_le16(&from[i]); - if (ftmp == 0) - break; - - charlen = codepage->uni2char(ftmp, tmp, NLS_MAX_CHARSET_SIZE); - if (charlen > 0) - outlen += charlen; - else - outlen++; - } - - return outlen; -} - -/* * cifs_mapchar() - convert a host-endian char to proper char in codepage * @target: where converted character should be copied - * @src_char: 2 byte host-endian source character + * @from: host-endian source string * @cp: codepage to which character should be converted * @mapchar: should character be mapped according to mapchars mount option? * @@ -64,10 +27,13 @@ static int smb_utf16_bytes(const __le16 *from, int maxbytes, * Return: string length after conversion */ static int -cifs_mapchar(char *target, const __u16 src_char, const struct nls_table *cp, +cifs_mapchar(char *target, const __u16 *from, const struct nls_table *cp, bool mapchar) { int len = 1; + __u16 src_char; + + src_char = *from; if (!mapchar) goto cp_convert; @@ -105,12 +71,66 @@ out: cp_convert: len = cp->uni2char(src_char, target, NLS_MAX_CHARSET_SIZE); - if (len <= 0) { - *target = '?'; - len = 1; - } + if (len <= 0) + goto surrogate_pair; goto out; + +surrogate_pair: + /* convert SURROGATE_PAIR and IVS */ + if (strcmp(cp->charset, "utf8")) + goto unknown; + len = utf16s_to_utf8s(from, 3, UTF16_LITTLE_ENDIAN, target, 6); + if (len <= 0) + goto unknown; + return len; + +unknown: + *target = '?'; + len = 1; + goto out; +} + +/* + * smb_utf16_bytes() - compute converted string length + * @from: pointer to input string + * @maxbytes: input string length + * @codepage: destination codepage + * + * Walk a utf16le string and return the number of bytes that the string will + * be after being converted to the given charset, not including any null + * termination required. Don't walk past maxbytes in the source buffer. + * + * Return: string length after conversion + */ +static int smb_utf16_bytes(const __le16 *from, int maxbytes, + const struct nls_table *codepage) +{ + int i, j; + int charlen, outlen = 0; + int maxwords = maxbytes / 2; + char tmp[NLS_MAX_CHARSET_SIZE]; + __u16 ftmp[3]; + + for (i = 0; i < maxwords; i++) { + ftmp[0] = get_unaligned_le16(&from[i]); + if (ftmp[0] == 0) + break; + for (j = 1; j <= 2; j++) { + if (i + j < maxwords) + ftmp[j] = get_unaligned_le16(&from[i + j]); + else + ftmp[j] = 0; + } + + charlen = cifs_mapchar(tmp, ftmp, codepage, 0); + if (charlen > 0) + outlen += charlen; + else + outlen++; + } + + return outlen; } /* @@ -140,12 +160,12 @@ cp_convert: static int smb_from_utf16(char *to, const __le16 *from, int tolen, int fromlen, const struct nls_table *codepage, bool mapchar) { - int i, charlen, safelen; + int i, j, charlen, safelen; int outlen = 0; int nullsize = nls_nullsize(codepage); int fromwords = fromlen / 2; char tmp[NLS_MAX_CHARSET_SIZE]; - __u16 ftmp; + __u16 ftmp[3]; /* ftmp[3] = 3array x 2bytes = 6bytes UTF-16 */ /* * because the chars can be of varying widths, we need to take care @@ -156,9 +176,15 @@ static int smb_from_utf16(char *to, const __le16 *from, int tolen, int fromlen, safelen = tolen - (NLS_MAX_CHARSET_SIZE + nullsize); for (i = 0; i < fromwords; i++) { - ftmp = get_unaligned_le16(&from[i]); - if (ftmp == 0) + ftmp[0] = get_unaligned_le16(&from[i]); + if (ftmp[0] == 0) break; + for (j = 1; j <= 2; j++) { + if (i + j < fromwords) + ftmp[j] = get_unaligned_le16(&from[i + j]); + else + ftmp[j] = 0; + } /* * check to see if converting this character might make the @@ -173,6 +199,19 @@ static int smb_from_utf16(char *to, const __le16 *from, int tolen, int fromlen, /* put converted char into 'to' buffer */ charlen = cifs_mapchar(&to[outlen], ftmp, codepage, mapchar); outlen += charlen; + + /* + * charlen (=bytes of UTF-8 for 1 character) + * 4bytes UTF-8(surrogate pair) is charlen=4 + * (4bytes UTF-16 code) + * 7-8bytes UTF-8(IVS) is charlen=3+4 or 4+4 + * (2 UTF-8 pairs divided to 2 UTF-16 pairs) + */ + if (charlen == 4) + i++; + else if (charlen >= 5) + /* 5-6bytes UTF-8 */ + i += 2; } /* properly null-terminate string */ @@ -307,6 +346,9 @@ int smbConvertToUTF16(__le16 *target, const char *source, int srclen, char src_char; __le16 dst_char; wchar_t tmp; + wchar_t wchar_to[6]; /* UTF-16 */ + int ret; + unicode_t u; if (!mapchars) return smb_strtoUTF16(target, source, srclen, cp); @@ -349,11 +391,57 @@ int smbConvertToUTF16(__le16 *target, const char *source, int srclen, * if no match, use question mark, which at least in * some cases serves as wild card */ - if (charlen < 1) { - dst_char = cpu_to_le16(0x003f); - charlen = 1; + if (charlen > 0) + goto ctoUTF16; + + /* convert SURROGATE_PAIR */ + if (strcmp(cp->charset, "utf8")) + goto unknown; + if (*(source + i) & 0x80) { + charlen = utf8_to_utf32(source + i, 6, &u); + if (charlen < 0) + goto unknown; + } else + goto unknown; + ret = utf8s_to_utf16s(source + i, charlen, + UTF16_LITTLE_ENDIAN, + wchar_to, 6); + if (ret < 0) + goto unknown; + + i += charlen; + dst_char = cpu_to_le16(*wchar_to); + if (charlen <= 3) + /* 1-3bytes UTF-8 to 2bytes UTF-16 */ + put_unaligned(dst_char, &target[j]); + else if (charlen == 4) { + /* + * 4bytes UTF-8(surrogate pair) to 4bytes UTF-16 + * 7-8bytes UTF-8(IVS) divided to 2 UTF-16 + * (charlen=3+4 or 4+4) + */ + put_unaligned(dst_char, &target[j]); + dst_char = cpu_to_le16(*(wchar_to + 1)); + j++; + put_unaligned(dst_char, &target[j]); + } else if (charlen >= 5) { + /* 5-6bytes UTF-8 to 6bytes UTF-16 */ + put_unaligned(dst_char, &target[j]); + dst_char = cpu_to_le16(*(wchar_to + 1)); + j++; + put_unaligned(dst_char, &target[j]); + dst_char = cpu_to_le16(*(wchar_to + 2)); + j++; + put_unaligned(dst_char, &target[j]); } + continue; + +unknown: + dst_char = cpu_to_le16(0x003f); + charlen = 1; } + +ctoUTF16: /* * character may take more than one byte in the source string, * but will take exactly two bytes in the target string diff --git a/fs/smb/server/unicode.h b/fs/smb/server/unicode.h index 076f6034a789..28c7c736f7bb 100644 --- a/fs/smb/server/unicode.h +++ b/fs/smb/server/unicode.h @@ -18,49 +18,14 @@ * This is a compressed table of upper and lower case conversion. * */ -#ifndef _CIFS_UNICODE_H -#define _CIFS_UNICODE_H +#ifndef _SMB_UNICODE_H +#define _SMB_UNICODE_H #include <asm/byteorder.h> #include <linux/types.h> #include <linux/nls.h> #include <linux/unicode.h> - -#define UNIUPR_NOLOWER /* Example to not expand lower case tables */ - -/* - * Windows maps these to the user defined 16 bit Unicode range since they are - * reserved symbols (along with \ and /), otherwise illegal to store - * in filenames in NTFS - */ -#define UNI_ASTERISK ((__u16)('*' + 0xF000)) -#define UNI_QUESTION ((__u16)('?' + 0xF000)) -#define UNI_COLON ((__u16)(':' + 0xF000)) -#define UNI_GRTRTHAN ((__u16)('>' + 0xF000)) -#define UNI_LESSTHAN ((__u16)('<' + 0xF000)) -#define UNI_PIPE ((__u16)('|' + 0xF000)) -#define UNI_SLASH ((__u16)('\\' + 0xF000)) - -/* Just define what we want from uniupr.h. We don't want to define the tables - * in each source file. - */ -#ifndef UNICASERANGE_DEFINED -struct UniCaseRange { - wchar_t start; - wchar_t end; - signed char *table; -}; -#endif /* UNICASERANGE_DEFINED */ - -#ifndef UNIUPR_NOUPPER -extern signed char SmbUniUpperTable[512]; -extern const struct UniCaseRange SmbUniUpperRange[]; -#endif /* UNIUPR_NOUPPER */ - -#ifndef UNIUPR_NOLOWER -extern signed char CifsUniLowerTable[512]; -extern const struct UniCaseRange CifsUniLowerRange[]; -#endif /* UNIUPR_NOLOWER */ +#include "../../nls/nls_ucs2_utils.h" #ifdef __KERNEL__ int smb_strtoUTF16(__le16 *to, const char *from, int len, @@ -73,286 +38,4 @@ int smbConvertToUTF16(__le16 *target, const char *source, int srclen, char *ksmbd_extract_sharename(struct unicode_map *um, const char *treename); #endif -/* - * UniStrcat: Concatenate the second string to the first - * - * Returns: - * Address of the first string - */ -static inline wchar_t *UniStrcat(wchar_t *ucs1, const wchar_t *ucs2) -{ - wchar_t *anchor = ucs1; /* save a pointer to start of ucs1 */ - - while (*ucs1++) - /*NULL*/; /* To end of first string */ - ucs1--; /* Return to the null */ - while ((*ucs1++ = *ucs2++)) - /*NULL*/; /* copy string 2 over */ - return anchor; -} - -/* - * UniStrchr: Find a character in a string - * - * Returns: - * Address of first occurrence of character in string - * or NULL if the character is not in the string - */ -static inline wchar_t *UniStrchr(const wchar_t *ucs, wchar_t uc) -{ - while ((*ucs != uc) && *ucs) - ucs++; - - if (*ucs == uc) - return (wchar_t *)ucs; - return NULL; -} - -/* - * UniStrcmp: Compare two strings - * - * Returns: - * < 0: First string is less than second - * = 0: Strings are equal - * > 0: First string is greater than second - */ -static inline int UniStrcmp(const wchar_t *ucs1, const wchar_t *ucs2) -{ - while ((*ucs1 == *ucs2) && *ucs1) { - ucs1++; - ucs2++; - } - return (int)*ucs1 - (int)*ucs2; -} - -/* - * UniStrcpy: Copy a string - */ -static inline wchar_t *UniStrcpy(wchar_t *ucs1, const wchar_t *ucs2) -{ - wchar_t *anchor = ucs1; /* save the start of result string */ - - while ((*ucs1++ = *ucs2++)) - /*NULL*/; - return anchor; -} - -/* - * UniStrlen: Return the length of a string (in 16 bit Unicode chars not bytes) - */ -static inline size_t UniStrlen(const wchar_t *ucs1) -{ - int i = 0; - - while (*ucs1++) - i++; - return i; -} - -/* - * UniStrnlen: Return the length (in 16 bit Unicode chars not bytes) of a - * string (length limited) - */ -static inline size_t UniStrnlen(const wchar_t *ucs1, int maxlen) -{ - int i = 0; - - while (*ucs1++) { - i++; - if (i >= maxlen) - break; - } - return i; -} - -/* - * UniStrncat: Concatenate length limited string - */ -static inline wchar_t *UniStrncat(wchar_t *ucs1, const wchar_t *ucs2, size_t n) -{ - wchar_t *anchor = ucs1; /* save pointer to string 1 */ - - while (*ucs1++) - /*NULL*/; - ucs1--; /* point to null terminator of s1 */ - while (n-- && (*ucs1 = *ucs2)) { /* copy s2 after s1 */ - ucs1++; - ucs2++; - } - *ucs1 = 0; /* Null terminate the result */ - return anchor; -} - -/* - * UniStrncmp: Compare length limited string - */ -static inline int UniStrncmp(const wchar_t *ucs1, const wchar_t *ucs2, size_t n) -{ - if (!n) - return 0; /* Null strings are equal */ - while ((*ucs1 == *ucs2) && *ucs1 && --n) { - ucs1++; - ucs2++; - } - return (int)*ucs1 - (int)*ucs2; -} - -/* - * UniStrncmp_le: Compare length limited string - native to little-endian - */ -static inline int -UniStrncmp_le(const wchar_t *ucs1, const wchar_t *ucs2, size_t n) -{ - if (!n) - return 0; /* Null strings are equal */ - while ((*ucs1 == __le16_to_cpu(*ucs2)) && *ucs1 && --n) { - ucs1++; - ucs2++; - } - return (int)*ucs1 - (int)__le16_to_cpu(*ucs2); -} - -/* - * UniStrncpy: Copy length limited string with pad - */ -static inline wchar_t *UniStrncpy(wchar_t *ucs1, const wchar_t *ucs2, size_t n) -{ - wchar_t *anchor = ucs1; - - while (n-- && *ucs2) /* Copy the strings */ - *ucs1++ = *ucs2++; - - n++; - while (n--) /* Pad with nulls */ - *ucs1++ = 0; - return anchor; -} - -/* - * UniStrncpy_le: Copy length limited string with pad to little-endian - */ -static inline wchar_t *UniStrncpy_le(wchar_t *ucs1, const wchar_t *ucs2, size_t n) -{ - wchar_t *anchor = ucs1; - - while (n-- && *ucs2) /* Copy the strings */ - *ucs1++ = __le16_to_cpu(*ucs2++); - - n++; - while (n--) /* Pad with nulls */ - *ucs1++ = 0; - return anchor; -} - -/* - * UniStrstr: Find a string in a string - * - * Returns: - * Address of first match found - * NULL if no matching string is found - */ -static inline wchar_t *UniStrstr(const wchar_t *ucs1, const wchar_t *ucs2) -{ - const wchar_t *anchor1 = ucs1; - const wchar_t *anchor2 = ucs2; - - while (*ucs1) { - if (*ucs1 == *ucs2) { - /* Partial match found */ - ucs1++; - ucs2++; - } else { - if (!*ucs2) /* Match found */ - return (wchar_t *)anchor1; - ucs1 = ++anchor1; /* No match */ - ucs2 = anchor2; - } - } - - if (!*ucs2) /* Both end together */ - return (wchar_t *)anchor1; /* Match found */ - return NULL; /* No match */ -} - -#ifndef UNIUPR_NOUPPER -/* - * UniToupper: Convert a unicode character to upper case - */ -static inline wchar_t UniToupper(register wchar_t uc) -{ - register const struct UniCaseRange *rp; - - if (uc < sizeof(SmbUniUpperTable)) { - /* Latin characters */ - return uc + SmbUniUpperTable[uc]; /* Use base tables */ - } - - rp = SmbUniUpperRange; /* Use range tables */ - while (rp->start) { - if (uc < rp->start) /* Before start of range */ - return uc; /* Uppercase = input */ - if (uc <= rp->end) /* In range */ - return uc + rp->table[uc - rp->start]; - rp++; /* Try next range */ - } - return uc; /* Past last range */ -} - -/* - * UniStrupr: Upper case a unicode string - */ -static inline __le16 *UniStrupr(register __le16 *upin) -{ - register __le16 *up; - - up = upin; - while (*up) { /* For all characters */ - *up = cpu_to_le16(UniToupper(le16_to_cpu(*up))); - up++; - } - return upin; /* Return input pointer */ -} -#endif /* UNIUPR_NOUPPER */ - -#ifndef UNIUPR_NOLOWER -/* - * UniTolower: Convert a unicode character to lower case - */ -static inline wchar_t UniTolower(register wchar_t uc) -{ - register const struct UniCaseRange *rp; - - if (uc < sizeof(CifsUniLowerTable)) { - /* Latin characters */ - return uc + CifsUniLowerTable[uc]; /* Use base tables */ - } - - rp = CifsUniLowerRange; /* Use range tables */ - while (rp->start) { - if (uc < rp->start) /* Before start of range */ - return uc; /* Uppercase = input */ - if (uc <= rp->end) /* In range */ - return uc + rp->table[uc - rp->start]; - rp++; /* Try next range */ - } - return uc; /* Past last range */ -} - -/* - * UniStrlwr: Lower case a unicode string - */ -static inline wchar_t *UniStrlwr(register wchar_t *upin) -{ - register wchar_t *up; - - up = upin; - while (*up) { /* For all characters */ - *up = UniTolower(*up); - up++; - } - return upin; /* Return input pointer */ -} - -#endif - -#endif /* _CIFS_UNICODE_H */ +#endif /* _SMB_UNICODE_H */ diff --git a/fs/smb/server/vfs.c b/fs/smb/server/vfs.c index 3d5d652153a5..4277750a6da1 100644 --- a/fs/smb/server/vfs.c +++ b/fs/smb/server/vfs.c @@ -97,6 +97,13 @@ static int ksmbd_vfs_path_lookup_locked(struct ksmbd_share_config *share_conf, return -ENOENT; } + err = mnt_want_write(parent_path->mnt); + if (err) { + path_put(parent_path); + putname(filename); + return -ENOENT; + } + inode_lock_nested(parent_path->dentry->d_inode, I_MUTEX_PARENT); d = lookup_one_qstr_excl(&last, parent_path->dentry, 0); if (IS_ERR(d)) @@ -123,6 +130,7 @@ static int ksmbd_vfs_path_lookup_locked(struct ksmbd_share_config *share_conf, err_out: inode_unlock(d_inode(parent_path->dentry)); + mnt_drop_write(parent_path->mnt); path_put(parent_path); putname(filename); return -ENOENT; @@ -173,10 +181,6 @@ int ksmbd_vfs_create(struct ksmbd_work *work, const char *name, umode_t mode) return err; } - err = mnt_want_write(path.mnt); - if (err) - goto out_err; - mode |= S_IFREG; err = vfs_create(mnt_idmap(path.mnt), d_inode(path.dentry), dentry, mode, true); @@ -186,9 +190,7 @@ int ksmbd_vfs_create(struct ksmbd_work *work, const char *name, umode_t mode) } else { pr_err("File(%s): creation failed (err:%d)\n", name, err); } - mnt_drop_write(path.mnt); -out_err: done_path_create(&path, dentry); return err; } @@ -219,10 +221,6 @@ int ksmbd_vfs_mkdir(struct ksmbd_work *work, const char *name, umode_t mode) return err; } - err = mnt_want_write(path.mnt); - if (err) - goto out_err2; - idmap = mnt_idmap(path.mnt); mode |= S_IFDIR; err = vfs_mkdir(idmap, d_inode(path.dentry), dentry, mode); @@ -233,21 +231,19 @@ int ksmbd_vfs_mkdir(struct ksmbd_work *work, const char *name, umode_t mode) dentry->d_name.len); if (IS_ERR(d)) { err = PTR_ERR(d); - goto out_err1; + goto out_err; } if (unlikely(d_is_negative(d))) { dput(d); err = -ENOENT; - goto out_err1; + goto out_err; } ksmbd_vfs_inherit_owner(work, d_inode(path.dentry), d_inode(d)); dput(d); } -out_err1: - mnt_drop_write(path.mnt); -out_err2: +out_err: done_path_create(&path, dentry); if (err) pr_err("mkdir(%s): creation failed (err:%d)\n", name, err); @@ -367,15 +363,15 @@ out: * @fid: file id of open file * @count: read byte count * @pos: file pos + * @rbuf: read data buffer * * Return: number of read bytes on success, otherwise error */ int ksmbd_vfs_read(struct ksmbd_work *work, struct ksmbd_file *fp, size_t count, - loff_t *pos) + loff_t *pos, char *rbuf) { struct file *filp = fp->filp; ssize_t nbytes = 0; - char *rbuf = work->aux_payload_buf; struct inode *inode = file_inode(filp); if (S_ISDIR(inode->i_mode)) @@ -463,7 +459,8 @@ static int ksmbd_vfs_stream_write(struct ksmbd_file *fp, char *buf, loff_t *pos, fp->stream.name, (void *)stream_buf, size, - 0); + 0, + true); if (err < 0) goto out; @@ -520,6 +517,9 @@ int ksmbd_vfs_write(struct ksmbd_work *work, struct ksmbd_file *fp, } } + /* Reserve lease break for parent dir at closing time */ + fp->reserve_lease_break = true; + /* Do we need to break any of a levelII oplock? */ smb_break_all_levII_oplock(work, fp, 1); @@ -605,10 +605,6 @@ int ksmbd_vfs_remove_file(struct ksmbd_work *work, const struct path *path) goto out_err; } - err = mnt_want_write(path->mnt); - if (err) - goto out_err; - idmap = mnt_idmap(path->mnt); if (S_ISDIR(d_inode(path->dentry)->i_mode)) { err = vfs_rmdir(idmap, d_inode(parent), path->dentry); @@ -619,7 +615,6 @@ int ksmbd_vfs_remove_file(struct ksmbd_work *work, const struct path *path) if (err) ksmbd_debug(VFS, "unlink failed, err %d\n", err); } - mnt_drop_write(path->mnt); out_err: ksmbd_revert_fsids(work); @@ -665,16 +660,11 @@ int ksmbd_vfs_link(struct ksmbd_work *work, const char *oldname, goto out3; } - err = mnt_want_write(newpath.mnt); - if (err) - goto out3; - err = vfs_link(oldpath.dentry, mnt_idmap(newpath.mnt), d_inode(newpath.dentry), dentry, NULL); if (err) ksmbd_debug(VFS, "vfs_link failed err %d\n", err); - mnt_drop_write(newpath.mnt); out3: done_path_create(&newpath, dentry); @@ -732,7 +722,7 @@ retry: goto out3; } - parent_fp = ksmbd_lookup_fd_inode(d_inode(old_child->d_parent)); + parent_fp = ksmbd_lookup_fd_inode(old_child->d_parent); if (parent_fp) { if (parent_fp->daccess & FILE_DELETE_LE) { pr_err("parent dir is opened with delete access\n"); @@ -919,23 +909,27 @@ ssize_t ksmbd_vfs_getxattr(struct mnt_idmap *idmap, /** * ksmbd_vfs_setxattr() - vfs helper for smb set extended attributes value * @idmap: idmap of the relevant mount - * @dentry: dentry to set XATTR at + * @path: path of dentry to set XATTR at * @attr_name: xattr name for setxattr * @attr_value: xattr value to set * @attr_size: size of xattr value * @flags: destination buffer length + * @get_write: get write access to a mount * * Return: 0 on success, otherwise error */ int ksmbd_vfs_setxattr(struct mnt_idmap *idmap, const struct path *path, const char *attr_name, - void *attr_value, size_t attr_size, int flags) + void *attr_value, size_t attr_size, int flags, + bool get_write) { int err; - err = mnt_want_write(path->mnt); - if (err) - return err; + if (get_write == true) { + err = mnt_want_write(path->mnt); + if (err) + return err; + } err = vfs_setxattr(idmap, path->dentry, @@ -945,7 +939,8 @@ int ksmbd_vfs_setxattr(struct mnt_idmap *idmap, flags); if (err) ksmbd_debug(VFS, "setxattr failed, err %d\n", err); - mnt_drop_write(path->mnt); + if (get_write == true) + mnt_drop_write(path->mnt); return err; } @@ -1194,9 +1189,10 @@ static int ksmbd_vfs_lookup_in_dir(const struct path *dir, char *name, /** * ksmbd_vfs_kern_path_locked() - lookup a file and get path info - * @name: file path that is relative to share - * @flags: lookup flags - * @path: if lookup succeed, return path info + * @name: file path that is relative to share + * @flags: lookup flags + * @parent_path: if lookup succeed, return parent_path info + * @path: if lookup succeed, return path info * @caseless: caseless filename lookup * * Return: 0 on success, otherwise error @@ -1268,6 +1264,13 @@ out1: } if (!err) { + err = mnt_want_write(parent_path->mnt); + if (err) { + path_put(path); + path_put(parent_path); + return err; + } + err = ksmbd_vfs_lock_parent(parent_path->dentry, path->dentry); if (err) { path_put(path); @@ -1277,6 +1280,14 @@ out1: return err; } +void ksmbd_vfs_kern_path_unlock(struct path *parent_path, struct path *path) +{ + inode_unlock(d_inode(parent_path->dentry)); + mnt_drop_write(parent_path->mnt); + path_put(path); + path_put(parent_path); +} + struct dentry *ksmbd_vfs_kern_path_create(struct ksmbd_work *work, const char *name, unsigned int flags, @@ -1431,7 +1442,8 @@ out: int ksmbd_vfs_set_sd_xattr(struct ksmbd_conn *conn, struct mnt_idmap *idmap, const struct path *path, - struct smb_ntsd *pntsd, int len) + struct smb_ntsd *pntsd, int len, + bool get_write) { int rc; struct ndr sd_ndr = {0}, acl_ndr = {0}; @@ -1491,7 +1503,7 @@ int ksmbd_vfs_set_sd_xattr(struct ksmbd_conn *conn, rc = ksmbd_vfs_setxattr(idmap, path, XATTR_NAME_SD, sd_ndr.data, - sd_ndr.offset, 0); + sd_ndr.offset, 0, get_write); if (rc < 0) pr_err("Failed to store XATTR ntacl :%d\n", rc); @@ -1580,7 +1592,8 @@ free_n_data: int ksmbd_vfs_set_dos_attrib_xattr(struct mnt_idmap *idmap, const struct path *path, - struct xattr_dos_attrib *da) + struct xattr_dos_attrib *da, + bool get_write) { struct ndr n; int err; @@ -1590,7 +1603,7 @@ int ksmbd_vfs_set_dos_attrib_xattr(struct mnt_idmap *idmap, return err; err = ksmbd_vfs_setxattr(idmap, path, XATTR_NAME_DOS_ATTRIBUTE, - (void *)n.data, n.offset, 0); + (void *)n.data, n.offset, 0, get_write); if (err) ksmbd_debug(SMB, "failed to store dos attribute in xattr\n"); kfree(n.data); @@ -1659,7 +1672,8 @@ int ksmbd_vfs_fill_dentry_attrs(struct ksmbd_work *work, u64 time; int rc; - generic_fillattr(idmap, d_inode(dentry), ksmbd_kstat->kstat); + generic_fillattr(idmap, STATX_BASIC_STATS, d_inode(dentry), + ksmbd_kstat->kstat); time = ksmbd_UnixTimeToNT(ksmbd_kstat->kstat->ctime); ksmbd_kstat->create_time = time; @@ -1861,10 +1875,6 @@ int ksmbd_vfs_set_init_posix_acl(struct mnt_idmap *idmap, } posix_state_to_acl(&acl_state, acls->a_entries); - rc = mnt_want_write(path->mnt); - if (rc) - goto out_err; - rc = set_posix_acl(idmap, dentry, ACL_TYPE_ACCESS, acls); if (rc < 0) ksmbd_debug(SMB, "Set posix acl(ACL_TYPE_ACCESS) failed, rc : %d\n", @@ -1876,9 +1886,7 @@ int ksmbd_vfs_set_init_posix_acl(struct mnt_idmap *idmap, ksmbd_debug(SMB, "Set posix acl(ACL_TYPE_DEFAULT) failed, rc : %d\n", rc); } - mnt_drop_write(path->mnt); -out_err: free_acl_state(&acl_state); posix_acl_release(acls); return rc; @@ -1908,10 +1916,6 @@ int ksmbd_vfs_inherit_posix_acl(struct mnt_idmap *idmap, } } - rc = mnt_want_write(path->mnt); - if (rc) - goto out_err; - rc = set_posix_acl(idmap, dentry, ACL_TYPE_ACCESS, acls); if (rc < 0) ksmbd_debug(SMB, "Set posix acl(ACL_TYPE_ACCESS) failed, rc : %d\n", @@ -1923,9 +1927,7 @@ int ksmbd_vfs_inherit_posix_acl(struct mnt_idmap *idmap, ksmbd_debug(SMB, "Set posix acl(ACL_TYPE_DEFAULT) failed, rc : %d\n", rc); } - mnt_drop_write(path->mnt); -out_err: posix_acl_release(acls); return rc; } diff --git a/fs/smb/server/vfs.h b/fs/smb/server/vfs.h index 72f9fb4b48d1..cfe1c8092f23 100644 --- a/fs/smb/server/vfs.h +++ b/fs/smb/server/vfs.h @@ -76,8 +76,8 @@ void ksmbd_vfs_query_maximal_access(struct mnt_idmap *idmap, struct dentry *dentry, __le32 *daccess); int ksmbd_vfs_create(struct ksmbd_work *work, const char *name, umode_t mode); int ksmbd_vfs_mkdir(struct ksmbd_work *work, const char *name, umode_t mode); -int ksmbd_vfs_read(struct ksmbd_work *work, struct ksmbd_file *fp, - size_t count, loff_t *pos); +int ksmbd_vfs_read(struct ksmbd_work *work, struct ksmbd_file *fp, size_t count, + loff_t *pos, char *rbuf); int ksmbd_vfs_write(struct ksmbd_work *work, struct ksmbd_file *fp, char *buf, size_t count, loff_t *pos, bool sync, ssize_t *written); @@ -109,7 +109,8 @@ ssize_t ksmbd_vfs_casexattr_len(struct mnt_idmap *idmap, int attr_name_len); int ksmbd_vfs_setxattr(struct mnt_idmap *idmap, const struct path *path, const char *attr_name, - void *attr_value, size_t attr_size, int flags); + void *attr_value, size_t attr_size, int flags, + bool get_write); int ksmbd_vfs_xattr_stream_name(char *stream_name, char **xattr_stream_name, size_t *xattr_stream_name_size, int s_type); int ksmbd_vfs_remove_xattr(struct mnt_idmap *idmap, @@ -117,6 +118,7 @@ int ksmbd_vfs_remove_xattr(struct mnt_idmap *idmap, int ksmbd_vfs_kern_path_locked(struct ksmbd_work *work, char *name, unsigned int flags, struct path *parent_path, struct path *path, bool caseless); +void ksmbd_vfs_kern_path_unlock(struct path *parent_path, struct path *path); struct dentry *ksmbd_vfs_kern_path_create(struct ksmbd_work *work, const char *name, unsigned int flags, @@ -144,14 +146,16 @@ int ksmbd_vfs_remove_sd_xattrs(struct mnt_idmap *idmap, const struct path *path) int ksmbd_vfs_set_sd_xattr(struct ksmbd_conn *conn, struct mnt_idmap *idmap, const struct path *path, - struct smb_ntsd *pntsd, int len); + struct smb_ntsd *pntsd, int len, + bool get_write); int ksmbd_vfs_get_sd_xattr(struct ksmbd_conn *conn, struct mnt_idmap *idmap, struct dentry *dentry, struct smb_ntsd **pntsd); int ksmbd_vfs_set_dos_attrib_xattr(struct mnt_idmap *idmap, const struct path *path, - struct xattr_dos_attrib *da); + struct xattr_dos_attrib *da, + bool get_write); int ksmbd_vfs_get_dos_attrib_xattr(struct mnt_idmap *idmap, struct dentry *dentry, struct xattr_dos_attrib *da); diff --git a/fs/smb/server/vfs_cache.c b/fs/smb/server/vfs_cache.c index f41f8d6108ce..4e82ff627d12 100644 --- a/fs/smb/server/vfs_cache.c +++ b/fs/smb/server/vfs_cache.c @@ -66,14 +66,14 @@ static unsigned long inode_hash(struct super_block *sb, unsigned long hashval) return tmp & inode_hash_mask; } -static struct ksmbd_inode *__ksmbd_inode_lookup(struct inode *inode) +static struct ksmbd_inode *__ksmbd_inode_lookup(struct dentry *de) { struct hlist_head *head = inode_hashtable + - inode_hash(inode->i_sb, inode->i_ino); + inode_hash(d_inode(de)->i_sb, (unsigned long)de); struct ksmbd_inode *ci = NULL, *ret_ci = NULL; hlist_for_each_entry(ci, head, m_hash) { - if (ci->m_inode == inode) { + if (ci->m_de == de) { if (atomic_inc_not_zero(&ci->m_count)) ret_ci = ci; break; @@ -84,29 +84,30 @@ static struct ksmbd_inode *__ksmbd_inode_lookup(struct inode *inode) static struct ksmbd_inode *ksmbd_inode_lookup(struct ksmbd_file *fp) { - return __ksmbd_inode_lookup(file_inode(fp->filp)); + return __ksmbd_inode_lookup(fp->filp->f_path.dentry); } -static struct ksmbd_inode *ksmbd_inode_lookup_by_vfsinode(struct inode *inode) +struct ksmbd_inode *ksmbd_inode_lookup_lock(struct dentry *d) { struct ksmbd_inode *ci; read_lock(&inode_hash_lock); - ci = __ksmbd_inode_lookup(inode); + ci = __ksmbd_inode_lookup(d); read_unlock(&inode_hash_lock); + return ci; } -int ksmbd_query_inode_status(struct inode *inode) +int ksmbd_query_inode_status(struct dentry *dentry) { struct ksmbd_inode *ci; int ret = KSMBD_INODE_STATUS_UNKNOWN; read_lock(&inode_hash_lock); - ci = __ksmbd_inode_lookup(inode); + ci = __ksmbd_inode_lookup(dentry); if (ci) { ret = KSMBD_INODE_STATUS_OK; - if (ci->m_flags & S_DEL_PENDING) + if (ci->m_flags & (S_DEL_PENDING | S_DEL_ON_CLS)) ret = KSMBD_INODE_STATUS_PENDING_DELETE; atomic_dec(&ci->m_count); } @@ -116,7 +117,7 @@ int ksmbd_query_inode_status(struct inode *inode) bool ksmbd_inode_pending_delete(struct ksmbd_file *fp) { - return (fp->f_ci->m_flags & S_DEL_PENDING); + return (fp->f_ci->m_flags & (S_DEL_PENDING | S_DEL_ON_CLS)); } void ksmbd_set_inode_pending_delete(struct ksmbd_file *fp) @@ -143,7 +144,7 @@ void ksmbd_fd_set_delete_on_close(struct ksmbd_file *fp, static void ksmbd_inode_hash(struct ksmbd_inode *ci) { struct hlist_head *b = inode_hashtable + - inode_hash(ci->m_inode->i_sb, ci->m_inode->i_ino); + inode_hash(d_inode(ci->m_de)->i_sb, (unsigned long)ci->m_de); hlist_add_head(&ci->m_hash, b); } @@ -157,7 +158,6 @@ static void ksmbd_inode_unhash(struct ksmbd_inode *ci) static int ksmbd_inode_init(struct ksmbd_inode *ci, struct ksmbd_file *fp) { - ci->m_inode = file_inode(fp->filp); atomic_set(&ci->m_count, 1); atomic_set(&ci->op_count, 0); atomic_set(&ci->sop_count, 0); @@ -166,6 +166,7 @@ static int ksmbd_inode_init(struct ksmbd_inode *ci, struct ksmbd_file *fp) INIT_LIST_HEAD(&ci->m_fp_list); INIT_LIST_HEAD(&ci->m_op_list); rwlock_init(&ci->m_lock); + ci->m_de = fp->filp->f_path.dentry; return 0; } @@ -209,7 +210,7 @@ static void ksmbd_inode_free(struct ksmbd_inode *ci) kfree(ci); } -static void ksmbd_inode_put(struct ksmbd_inode *ci) +void ksmbd_inode_put(struct ksmbd_inode *ci) { if (atomic_dec_and_test(&ci->m_count)) ksmbd_inode_free(ci); @@ -333,6 +334,9 @@ static void __ksmbd_close_fd(struct ksmbd_file_table *ft, struct ksmbd_file *fp) static struct ksmbd_file *ksmbd_fp_get(struct ksmbd_file *fp) { + if (fp->f_state != FP_INITED) + return NULL; + if (!atomic_inc_not_zero(&fp->refcount)) return NULL; return fp; @@ -382,15 +386,20 @@ int ksmbd_close_fd(struct ksmbd_work *work, u64 id) return 0; ft = &work->sess->file_table; - read_lock(&ft->lock); + write_lock(&ft->lock); fp = idr_find(ft->idr, id); if (fp) { set_close_state_blocked_works(fp); - if (!atomic_dec_and_test(&fp->refcount)) + if (fp->f_state != FP_INITED) fp = NULL; + else { + fp->f_state = FP_CLOSED; + if (!atomic_dec_and_test(&fp->refcount)) + fp = NULL; + } } - read_unlock(&ft->lock); + write_unlock(&ft->lock); if (!fp) return -EINVAL; @@ -480,12 +489,15 @@ struct ksmbd_file *ksmbd_lookup_fd_cguid(char *cguid) return fp; } -struct ksmbd_file *ksmbd_lookup_fd_inode(struct inode *inode) +struct ksmbd_file *ksmbd_lookup_fd_inode(struct dentry *dentry) { struct ksmbd_file *lfp; struct ksmbd_inode *ci; + struct inode *inode = d_inode(dentry); - ci = ksmbd_inode_lookup_by_vfsinode(inode); + read_lock(&inode_hash_lock); + ci = __ksmbd_inode_lookup(dentry); + read_unlock(&inode_hash_lock); if (!ci) return NULL; @@ -570,6 +582,7 @@ struct ksmbd_file *ksmbd_open_fd(struct ksmbd_work *work, struct file *filp) fp->tcon = work->tcon; fp->volatile_id = KSMBD_NO_FID; fp->persistent_id = KSMBD_NO_FID; + fp->f_state = FP_NEW; fp->f_ci = ksmbd_inode_get(fp); if (!fp->f_ci) { @@ -591,6 +604,17 @@ err_out: return ERR_PTR(ret); } +void ksmbd_update_fstate(struct ksmbd_file_table *ft, struct ksmbd_file *fp, + unsigned int state) +{ + if (!fp) + return; + + write_lock(&ft->lock); + fp->f_state = state; + write_unlock(&ft->lock); +} + static int __close_file_table_ids(struct ksmbd_file_table *ft, struct ksmbd_tree_connect *tcon, diff --git a/fs/smb/server/vfs_cache.h b/fs/smb/server/vfs_cache.h index fcb13413fa8d..a528f0cc775a 100644 --- a/fs/smb/server/vfs_cache.h +++ b/fs/smb/server/vfs_cache.h @@ -51,7 +51,7 @@ struct ksmbd_inode { atomic_t op_count; /* opinfo count for streams */ atomic_t sop_count; - struct inode *m_inode; + struct dentry *m_de; unsigned int m_flags; struct hlist_node m_hash; struct list_head m_fp_list; @@ -60,6 +60,12 @@ struct ksmbd_inode { __le32 m_fattr; }; +enum { + FP_NEW = 0, + FP_INITED, + FP_CLOSED +}; + struct ksmbd_file { struct file *filp; u64 persistent_id; @@ -98,6 +104,8 @@ struct ksmbd_file { /* if ls is happening on directory, below is valid*/ struct ksmbd_readdir_data readdir_data; int dot_dotdot[2]; + unsigned int f_state; + bool reserve_lease_break; }; static inline void set_ctx_actor(struct dir_context *ctx, @@ -131,9 +139,11 @@ struct ksmbd_file *ksmbd_lookup_foreign_fd(struct ksmbd_work *work, u64 id); struct ksmbd_file *ksmbd_lookup_fd_slow(struct ksmbd_work *work, u64 id, u64 pid); void ksmbd_fd_put(struct ksmbd_work *work, struct ksmbd_file *fp); +struct ksmbd_inode *ksmbd_inode_lookup_lock(struct dentry *d); +void ksmbd_inode_put(struct ksmbd_inode *ci); struct ksmbd_file *ksmbd_lookup_durable_fd(unsigned long long id); struct ksmbd_file *ksmbd_lookup_fd_cguid(char *cguid); -struct ksmbd_file *ksmbd_lookup_fd_inode(struct inode *inode); +struct ksmbd_file *ksmbd_lookup_fd_inode(struct dentry *dentry); unsigned int ksmbd_open_durable_fd(struct ksmbd_file *fp); struct ksmbd_file *ksmbd_open_fd(struct ksmbd_work *work, struct file *filp); void ksmbd_close_tree_conn_fds(struct ksmbd_work *work); @@ -142,6 +152,8 @@ int ksmbd_close_inode_fds(struct ksmbd_work *work, struct inode *inode); int ksmbd_init_global_file_table(void); void ksmbd_free_global_file_table(void); void ksmbd_set_fd_limit(unsigned long limit); +void ksmbd_update_fstate(struct ksmbd_file_table *ft, struct ksmbd_file *fp, + unsigned int state); /* * INODE hash @@ -155,7 +167,7 @@ enum KSMBD_INODE_STATUS { KSMBD_INODE_STATUS_PENDING_DELETE, }; -int ksmbd_query_inode_status(struct inode *inode); +int ksmbd_query_inode_status(struct dentry *dentry); bool ksmbd_inode_pending_delete(struct ksmbd_file *fp); void ksmbd_set_inode_pending_delete(struct ksmbd_file *fp); void ksmbd_clear_inode_pending_delete(struct ksmbd_file *fp); diff --git a/fs/splice.c b/fs/splice.c index 3e2a31e1ce6a..d983d375ff11 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -83,8 +83,7 @@ static bool page_cache_pipe_buf_try_steal(struct pipe_inode_info *pipe, */ folio_wait_writeback(folio); - if (folio_has_private(folio) && - !filemap_release_folio(folio, GFP_KERNEL)) + if (!filemap_release_folio(folio, GFP_KERNEL)) goto out_unlock; /* @@ -120,17 +119,17 @@ static void page_cache_pipe_buf_release(struct pipe_inode_info *pipe, static int page_cache_pipe_buf_confirm(struct pipe_inode_info *pipe, struct pipe_buffer *buf) { - struct page *page = buf->page; + struct folio *folio = page_folio(buf->page); int err; - if (!PageUptodate(page)) { - lock_page(page); + if (!folio_test_uptodate(folio)) { + folio_lock(folio); /* - * Page got truncated/unhashed. This will cause a 0-byte + * Folio got truncated/unhashed. This will cause a 0-byte * splice, if this is the first page. */ - if (!page->mapping) { + if (!folio->mapping) { err = -ENODATA; goto error; } @@ -138,20 +137,18 @@ static int page_cache_pipe_buf_confirm(struct pipe_inode_info *pipe, /* * Uh oh, read-error from disk. */ - if (!PageUptodate(page)) { + if (!folio_test_uptodate(folio)) { err = -EIO; goto error; } - /* - * Page is ok afterall, we are done. - */ - unlock_page(page); + /* Folio is ok after all, we are done */ + folio_unlock(folio); } return 0; error: - unlock_page(page); + folio_unlock(folio); return err; } @@ -1269,10 +1266,8 @@ long do_splice(struct file *in, loff_t *off_in, struct file *out, if ((in->f_flags | out->f_flags) & O_NONBLOCK) flags |= SPLICE_F_NONBLOCK; - return splice_pipe_to_pipe(ipipe, opipe, len, flags); - } - - if (ipipe) { + ret = splice_pipe_to_pipe(ipipe, opipe, len, flags); + } else if (ipipe) { if (off_in) return -ESPIPE; if (off_out) { @@ -1297,18 +1292,11 @@ long do_splice(struct file *in, loff_t *off_in, struct file *out, ret = do_splice_from(ipipe, out, &offset, len, flags); file_end_write(out); - if (ret > 0) - fsnotify_modify(out); - if (!off_out) out->f_pos = offset; else *off_out = offset; - - return ret; - } - - if (opipe) { + } else if (opipe) { if (off_out) return -ESPIPE; if (off_in) { @@ -1324,18 +1312,25 @@ long do_splice(struct file *in, loff_t *off_in, struct file *out, ret = splice_file_to_pipe(in, opipe, &offset, len, flags); - if (ret > 0) - fsnotify_access(in); - if (!off_in) in->f_pos = offset; else *off_in = offset; + } else { + ret = -EINVAL; + } - return ret; + if (ret > 0) { + /* + * Generate modify out before access in: + * do_splice_from() may've already sent modify out, + * and this ensures the events get merged. + */ + fsnotify_modify(out); + fsnotify_access(in); } - return -EINVAL; + return ret; } static long __do_splice(struct file *in, loff_t __user *off_in, @@ -1464,6 +1459,9 @@ static long vmsplice_to_user(struct file *file, struct iov_iter *iter, pipe_unlock(pipe); } + if (ret > 0) + fsnotify_access(file); + return ret; } @@ -1493,8 +1491,10 @@ static long vmsplice_to_pipe(struct file *file, struct iov_iter *iter, if (!ret) ret = iter_to_pipe(iter, pipe, buf_flag); pipe_unlock(pipe); - if (ret > 0) + if (ret > 0) { wakeup_pipe_readers(pipe); + fsnotify_modify(file); + } return ret; } @@ -1928,6 +1928,11 @@ long do_tee(struct file *in, struct file *out, size_t len, unsigned int flags) } } + if (ret > 0) { + fsnotify_access(in); + fsnotify_modify(out); + } + return ret; } diff --git a/fs/squashfs/block.c b/fs/squashfs/block.c index 581ce9519339..2dc730800f44 100644 --- a/fs/squashfs/block.c +++ b/fs/squashfs/block.c @@ -321,7 +321,7 @@ int squashfs_read_data(struct super_block *sb, u64 index, int length, TRACE("Block @ 0x%llx, %scompressed size %d\n", index - 2, compressed ? "" : "un", length); } - if (length < 0 || length > output->length || + if (length <= 0 || length > output->length || (index + length) > msblk->bytes_used) { res = -EIO; goto out; diff --git a/fs/squashfs/export.c b/fs/squashfs/export.c index 723763746238..62972f0ff868 100644 --- a/fs/squashfs/export.c +++ b/fs/squashfs/export.c @@ -173,6 +173,7 @@ __le64 *squashfs_read_inode_lookup_table(struct super_block *sb, const struct export_operations squashfs_export_ops = { + .encode_fh = generic_encode_ino32_fh, .fh_to_dentry = squashfs_fh_to_dentry, .fh_to_parent = squashfs_fh_to_parent, .get_parent = squashfs_get_parent diff --git a/fs/squashfs/inode.c b/fs/squashfs/inode.c index 24463145b351..aa3411354e66 100644 --- a/fs/squashfs/inode.c +++ b/fs/squashfs/inode.c @@ -59,9 +59,9 @@ static int squashfs_new_inode(struct super_block *sb, struct inode *inode, i_uid_write(inode, i_uid); i_gid_write(inode, i_gid); inode->i_ino = le32_to_cpu(sqsh_ino->inode_number); - inode->i_mtime.tv_sec = le32_to_cpu(sqsh_ino->mtime); - inode->i_atime.tv_sec = inode->i_mtime.tv_sec; - inode->i_ctime.tv_sec = inode->i_mtime.tv_sec; + inode_set_mtime(inode, le32_to_cpu(sqsh_ino->mtime), 0); + inode_set_atime(inode, inode_get_mtime_sec(inode), 0); + inode_set_ctime(inode, inode_get_mtime_sec(inode), 0); inode->i_mode = le16_to_cpu(sqsh_ino->mode); inode->i_size = 0; diff --git a/fs/squashfs/squashfs.h b/fs/squashfs/squashfs.h index a6164fdf9435..5a756e6790b5 100644 --- a/fs/squashfs/squashfs.h +++ b/fs/squashfs/squashfs.h @@ -111,4 +111,4 @@ extern const struct address_space_operations squashfs_symlink_aops; extern const struct inode_operations squashfs_symlink_inode_ops; /* xattr.c */ -extern const struct xattr_handler *squashfs_xattr_handlers[]; +extern const struct xattr_handler * const squashfs_xattr_handlers[]; diff --git a/fs/squashfs/xattr.c b/fs/squashfs/xattr.c index e1e3f3dd5a06..ce6608cabd49 100644 --- a/fs/squashfs/xattr.c +++ b/fs/squashfs/xattr.c @@ -262,7 +262,7 @@ static const struct xattr_handler *squashfs_xattr_handler(int type) } } -const struct xattr_handler *squashfs_xattr_handlers[] = { +const struct xattr_handler * const squashfs_xattr_handlers[] = { &squashfs_xattr_user_handler, &squashfs_xattr_trusted_handler, &squashfs_xattr_security_handler, diff --git a/fs/stack.c b/fs/stack.c index c9830924eb12..f18920119944 100644 --- a/fs/stack.c +++ b/fs/stack.c @@ -66,9 +66,9 @@ void fsstack_copy_attr_all(struct inode *dest, const struct inode *src) dest->i_uid = src->i_uid; dest->i_gid = src->i_gid; dest->i_rdev = src->i_rdev; - dest->i_atime = src->i_atime; - dest->i_mtime = src->i_mtime; - dest->i_ctime = src->i_ctime; + inode_set_atime_to_ts(dest, inode_get_atime(src)); + inode_set_mtime_to_ts(dest, inode_get_mtime(src)); + inode_set_ctime_to_ts(dest, inode_get_ctime(src)); dest->i_blkbits = src->i_blkbits; dest->i_flags = src->i_flags; set_nlink(dest, src->i_nlink); diff --git a/fs/stat.c b/fs/stat.c index 7c238da22ef0..f721d26ec3f7 100644 --- a/fs/stat.c +++ b/fs/stat.c @@ -28,9 +28,10 @@ /** * generic_fillattr - Fill in the basic attributes from the inode struct - * @idmap: idmap of the mount the inode was found from - * @inode: Inode to use as the source - * @stat: Where to fill in the attributes + * @idmap: idmap of the mount the inode was found from + * @request_mask: statx request_mask + * @inode: Inode to use as the source + * @stat: Where to fill in the attributes * * Fill in the basic attributes in the kstat structure from data that's to be * found on the VFS inode structure. This is the default if no getattr inode @@ -42,8 +43,8 @@ * uid and gid filds. On non-idmapped mounts or if permission checking is to be * performed on the raw inode simply passs @nop_mnt_idmap. */ -void generic_fillattr(struct mnt_idmap *idmap, struct inode *inode, - struct kstat *stat) +void generic_fillattr(struct mnt_idmap *idmap, u32 request_mask, + struct inode *inode, struct kstat *stat) { vfsuid_t vfsuid = i_uid_into_vfsuid(idmap, inode); vfsgid_t vfsgid = i_gid_into_vfsgid(idmap, inode); @@ -56,11 +57,17 @@ void generic_fillattr(struct mnt_idmap *idmap, struct inode *inode, stat->gid = vfsgid_into_kgid(vfsgid); stat->rdev = inode->i_rdev; stat->size = i_size_read(inode); - stat->atime = inode->i_atime; - stat->mtime = inode->i_mtime; - stat->ctime = inode->i_ctime; + stat->atime = inode_get_atime(inode); + stat->mtime = inode_get_mtime(inode); + stat->ctime = inode_get_ctime(inode); stat->blksize = i_blocksize(inode); stat->blocks = inode->i_blocks; + + if ((request_mask & STATX_CHANGE_COOKIE) && IS_I_VERSION(inode)) { + stat->result_mask |= STATX_CHANGE_COOKIE; + stat->change_cookie = inode_query_iversion(inode); + } + } EXPORT_SYMBOL(generic_fillattr); @@ -123,17 +130,13 @@ int vfs_getattr_nosec(const struct path *path, struct kstat *stat, stat->attributes_mask |= (STATX_ATTR_AUTOMOUNT | STATX_ATTR_DAX); - if ((request_mask & STATX_CHANGE_COOKIE) && IS_I_VERSION(inode)) { - stat->result_mask |= STATX_CHANGE_COOKIE; - stat->change_cookie = inode_query_iversion(inode); - } - idmap = mnt_idmap(path->mnt); if (inode->i_op->getattr) return inode->i_op->getattr(idmap, path, stat, - request_mask, query_flags); + request_mask, + query_flags | AT_GETATTR_NOSEC); - generic_fillattr(idmap, inode, stat); + generic_fillattr(idmap, request_mask, inode, stat); return 0; } EXPORT_SYMBOL(vfs_getattr_nosec); @@ -164,6 +167,9 @@ int vfs_getattr(const struct path *path, struct kstat *stat, { int retval; + if (WARN_ON_ONCE(query_flags & AT_GETATTR_NOSEC)) + return -EPERM; + retval = security_inode_getattr(path); if (retval) return retval; @@ -272,6 +278,23 @@ int vfs_fstatat(int dfd, const char __user *filename, int statx_flags = flags | AT_NO_AUTOMOUNT; struct filename *name; + /* + * Work around glibc turning fstat() into fstatat(AT_EMPTY_PATH) + * + * If AT_EMPTY_PATH is set, we expect the common case to be that + * empty path, and avoid doing all the extra pathname work. + */ + if (dfd >= 0 && flags == AT_EMPTY_PATH) { + char c; + + ret = get_user(c, filename); + if (unlikely(ret)) + return ret; + + if (likely(!c)) + return vfs_fstat(dfd, stat); + } + name = getname_flags(filename, getname_statx_lookup_flags(statx_flags), NULL); ret = vfs_statx(dfd, name, statx_flags, stat, STATX_BASIC_STATS); putname(name); @@ -363,12 +386,6 @@ SYSCALL_DEFINE2(fstat, unsigned int, fd, struct __old_kernel_stat __user *, stat #ifdef __ARCH_WANT_NEW_STAT -#if BITS_PER_LONG == 32 -# define choose_32_64(a,b) a -#else -# define choose_32_64(a,b) b -#endif - #ifndef INIT_STRUCT_STAT_PADDING # define INIT_STRUCT_STAT_PADDING(st) memset(&st, 0, sizeof(st)) #endif diff --git a/fs/super.c b/fs/super.c index e781226e2880..076392396e72 100644 --- a/fs/super.c +++ b/fs/super.c @@ -39,7 +39,7 @@ #include <uapi/linux/mount.h> #include "internal.h" -static int thaw_super_locked(struct super_block *sb); +static int thaw_super_locked(struct super_block *sb, enum freeze_holder who); static LIST_HEAD(super_blocks); static DEFINE_SPINLOCK(sb_lock); @@ -50,11 +50,135 @@ static char *sb_writers_name[SB_FREEZE_LEVELS] = { "sb_internal", }; +static inline void __super_lock(struct super_block *sb, bool excl) +{ + if (excl) + down_write(&sb->s_umount); + else + down_read(&sb->s_umount); +} + +static inline void super_unlock(struct super_block *sb, bool excl) +{ + if (excl) + up_write(&sb->s_umount); + else + up_read(&sb->s_umount); +} + +static inline void __super_lock_excl(struct super_block *sb) +{ + __super_lock(sb, true); +} + +static inline void super_unlock_excl(struct super_block *sb) +{ + super_unlock(sb, true); +} + +static inline void super_unlock_shared(struct super_block *sb) +{ + super_unlock(sb, false); +} + +static inline bool wait_born(struct super_block *sb) +{ + unsigned int flags; + + /* + * Pairs with smp_store_release() in super_wake() and ensures + * that we see SB_BORN or SB_DYING after we're woken. + */ + flags = smp_load_acquire(&sb->s_flags); + return flags & (SB_BORN | SB_DYING); +} + +/** + * super_lock - wait for superblock to become ready and lock it + * @sb: superblock to wait for + * @excl: whether exclusive access is required + * + * If the superblock has neither passed through vfs_get_tree() or + * generic_shutdown_super() yet wait for it to happen. Either superblock + * creation will succeed and SB_BORN is set by vfs_get_tree() or we're + * woken and we'll see SB_DYING. + * + * The caller must have acquired a temporary reference on @sb->s_count. + * + * Return: This returns true if SB_BORN was set, false if SB_DYING was + * set. The function acquires s_umount and returns with it held. + */ +static __must_check bool super_lock(struct super_block *sb, bool excl) +{ + + lockdep_assert_not_held(&sb->s_umount); + +relock: + __super_lock(sb, excl); + + /* + * Has gone through generic_shutdown_super() in the meantime. + * @sb->s_root is NULL and @sb->s_active is 0. No one needs to + * grab a reference to this. Tell them so. + */ + if (sb->s_flags & SB_DYING) + return false; + + /* Has called ->get_tree() successfully. */ + if (sb->s_flags & SB_BORN) + return true; + + super_unlock(sb, excl); + + /* wait until the superblock is ready or dying */ + wait_var_event(&sb->s_flags, wait_born(sb)); + + /* + * Neither SB_BORN nor SB_DYING are ever unset so we never loop. + * Just reacquire @sb->s_umount for the caller. + */ + goto relock; +} + +/* wait and acquire read-side of @sb->s_umount */ +static inline bool super_lock_shared(struct super_block *sb) +{ + return super_lock(sb, false); +} + +/* wait and acquire write-side of @sb->s_umount */ +static inline bool super_lock_excl(struct super_block *sb) +{ + return super_lock(sb, true); +} + +/* wake waiters */ +#define SUPER_WAKE_FLAGS (SB_BORN | SB_DYING | SB_DEAD) +static void super_wake(struct super_block *sb, unsigned int flag) +{ + WARN_ON_ONCE((flag & ~SUPER_WAKE_FLAGS)); + WARN_ON_ONCE(hweight32(flag & SUPER_WAKE_FLAGS) > 1); + + /* + * Pairs with smp_load_acquire() in super_lock() to make sure + * all initializations in the superblock are seen by the user + * seeing SB_BORN sent. + */ + smp_store_release(&sb->s_flags, sb->s_flags | flag); + /* + * Pairs with the barrier in prepare_to_wait_event() to make sure + * ___wait_var_event() either sees SB_BORN set or + * waitqueue_active() check in wake_up_var() sees the waiter. + */ + smp_mb(); + wake_up_var(&sb->s_flags); +} + /* * One thing we have to be careful of with a per-sb shrinker is that we don't * drop the last active reference to the superblock from within the shrinker. * If that happens we could trigger unregistering the shrinker from within the - * shrinker path and that leads to deadlock on the shrinker_rwsem. Hence we + * shrinker path and that leads to deadlock on the shrinker_mutex. Hence we * take a passive reference to the superblock to avoid this from occurring. */ static unsigned long super_cache_scan(struct shrinker *shrink, @@ -67,7 +191,7 @@ static unsigned long super_cache_scan(struct shrinker *shrink, long dentries; long inodes; - sb = container_of(shrink, struct super_block, s_shrink); + sb = shrink->private_data; /* * Deadlock avoidance. We may hold various FS locks, and we don't want @@ -76,7 +200,7 @@ static unsigned long super_cache_scan(struct shrinker *shrink, if (!(sc->gfp_mask & __GFP_FS)) return SHRINK_STOP; - if (!trylock_super(sb)) + if (!super_trylock_shared(sb)) return SHRINK_STOP; if (sb->s_op->nr_cached_objects) @@ -110,7 +234,7 @@ static unsigned long super_cache_scan(struct shrinker *shrink, freed += sb->s_op->free_cached_objects(sb, sc); } - up_read(&sb->s_umount); + super_unlock_shared(sb); return freed; } @@ -120,20 +244,20 @@ static unsigned long super_cache_count(struct shrinker *shrink, struct super_block *sb; long total_objects = 0; - sb = container_of(shrink, struct super_block, s_shrink); + sb = shrink->private_data; /* - * We don't call trylock_super() here as it is a scalability bottleneck, - * so we're exposed to partial setup state. The shrinker rwsem does not - * protect filesystem operations backing list_lru_shrink_count() or - * s_op->nr_cached_objects(). Counts can change between - * super_cache_count and super_cache_scan, so we really don't need locks - * here. + * We don't call super_trylock_shared() here as it is a scalability + * bottleneck, so we're exposed to partial setup state. The shrinker + * rwsem does not protect filesystem operations backing + * list_lru_shrink_count() or s_op->nr_cached_objects(). Counts can + * change between super_cache_count and super_cache_scan, so we really + * don't need locks here. * * However, if we are currently mounting the superblock, the underlying * filesystem might be in a state of partial construction and hence it - * is dangerous to access it. trylock_super() uses a SB_BORN check to - * avoid this situation, so do the same here. The memory barrier is + * is dangerous to access it. super_trylock_shared() uses a SB_BORN check + * to avoid this situation, so do the same here. The memory barrier is * matched with the one in mount_fs() as we don't hold locks here. */ if (!(sb->s_flags & SB_BORN)) @@ -176,13 +300,13 @@ static void destroy_unused_super(struct super_block *s) { if (!s) return; - up_write(&s->s_umount); + super_unlock_excl(s); list_lru_destroy(&s->s_dentry_lru); list_lru_destroy(&s->s_inode_lru); security_sb_free(s); put_user_ns(s->s_user_ns); kfree(s->s_subtype); - free_prealloced_shrinker(&s->s_shrink); + shrinker_free(s->s_shrink); /* no delays needed */ destroy_super_work(&s->destroy_work); } @@ -259,16 +383,19 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags, s->s_time_min = TIME64_MIN; s->s_time_max = TIME64_MAX; - s->s_shrink.seeks = DEFAULT_SEEKS; - s->s_shrink.scan_objects = super_cache_scan; - s->s_shrink.count_objects = super_cache_count; - s->s_shrink.batch = 1024; - s->s_shrink.flags = SHRINKER_NUMA_AWARE | SHRINKER_MEMCG_AWARE; - if (prealloc_shrinker(&s->s_shrink, "sb-%s", type->name)) + s->s_shrink = shrinker_alloc(SHRINKER_NUMA_AWARE | SHRINKER_MEMCG_AWARE, + "sb-%s", type->name); + if (!s->s_shrink) goto fail; - if (list_lru_init_memcg(&s->s_dentry_lru, &s->s_shrink)) + + s->s_shrink->scan_objects = super_cache_scan; + s->s_shrink->count_objects = super_cache_count; + s->s_shrink->batch = 1024; + s->s_shrink->private_data = s; + + if (list_lru_init_memcg(&s->s_dentry_lru, s->s_shrink)) goto fail; - if (list_lru_init_memcg(&s->s_inode_lru, &s->s_shrink)) + if (list_lru_init_memcg(&s->s_inode_lru, s->s_shrink)) goto fail; return s; @@ -310,6 +437,33 @@ void put_super(struct super_block *sb) spin_unlock(&sb_lock); } +static void kill_super_notify(struct super_block *sb) +{ + lockdep_assert_not_held(&sb->s_umount); + + /* already notified earlier */ + if (sb->s_flags & SB_DEAD) + return; + + /* + * Remove it from @fs_supers so it isn't found by new + * sget{_fc}() walkers anymore. Any concurrent mounter still + * managing to grab a temporary reference is guaranteed to + * already see SB_DYING and will wait until we notify them about + * SB_DEAD. + */ + spin_lock(&sb_lock); + hlist_del_init(&sb->s_instances); + spin_unlock(&sb_lock); + + /* + * Let concurrent mounts know that this thing is really dead. + * We don't need @sb->s_umount here as every concurrent caller + * will see SB_DYING and either discard the superblock or wait + * for SB_DEAD. + */ + super_wake(sb, SB_DEAD); +} /** * deactivate_locked_super - drop an active reference to superblock @@ -326,9 +480,11 @@ void deactivate_locked_super(struct super_block *s) { struct file_system_type *fs = s->s_type; if (atomic_dec_and_test(&s->s_active)) { - unregister_shrinker(&s->s_shrink); + shrinker_free(s->s_shrink); fs->kill_sb(s); + kill_super_notify(s); + /* * Since list_lru_destroy() may sleep, we cannot call it from * put_super(), where we hold the sb_lock. Therefore we destroy @@ -340,7 +496,7 @@ void deactivate_locked_super(struct super_block *s) put_filesystem(fs); put_super(s); } else { - up_write(&s->s_umount); + super_unlock_excl(s); } } @@ -357,7 +513,7 @@ EXPORT_SYMBOL(deactivate_locked_super); void deactivate_super(struct super_block *s) { if (!atomic_add_unless(&s->s_active, -1, 1)) { - down_write(&s->s_umount); + __super_lock_excl(s); deactivate_locked_super(s); } } @@ -379,20 +535,61 @@ EXPORT_SYMBOL(deactivate_super); */ static int grab_super(struct super_block *s) __releases(sb_lock) { + bool born; + s->s_count++; spin_unlock(&sb_lock); - down_write(&s->s_umount); - if ((s->s_flags & SB_BORN) && atomic_inc_not_zero(&s->s_active)) { + born = super_lock_excl(s); + if (born && atomic_inc_not_zero(&s->s_active)) { put_super(s); return 1; } - up_write(&s->s_umount); + super_unlock_excl(s); put_super(s); return 0; } +static inline bool wait_dead(struct super_block *sb) +{ + unsigned int flags; + + /* + * Pairs with memory barrier in super_wake() and ensures + * that we see SB_DEAD after we're woken. + */ + flags = smp_load_acquire(&sb->s_flags); + return flags & SB_DEAD; +} + +/** + * grab_super_dead - acquire an active reference to a superblock + * @sb: superblock to acquire + * + * Acquire a temporary reference on a superblock and try to trade it for + * an active reference. This is used in sget{_fc}() to wait for a + * superblock to either become SB_BORN or for it to pass through + * sb->kill() and be marked as SB_DEAD. + * + * Return: This returns true if an active reference could be acquired, + * false if not. + */ +static bool grab_super_dead(struct super_block *sb) +{ + + sb->s_count++; + if (grab_super(sb)) { + put_super(sb); + lockdep_assert_held(&sb->s_umount); + return true; + } + wait_var_event(&sb->s_flags, wait_dead(sb)); + lockdep_assert_not_held(&sb->s_umount); + put_super(sb); + return false; +} + /* - * trylock_super - try to grab ->s_umount shared + * super_trylock_shared - try to grab ->s_umount shared * @sb: reference we are trying to grab * * Try to prevent fs shutdown. This is used in places where we @@ -408,13 +605,13 @@ static int grab_super(struct super_block *s) __releases(sb_lock) * of down_read(). There's a couple of places that are OK with that, but * it's very much not a general-purpose interface. */ -bool trylock_super(struct super_block *sb) +bool super_trylock_shared(struct super_block *sb) { if (down_read_trylock(&sb->s_umount)) { - if (!hlist_unhashed(&sb->s_instances) && - sb->s_root && (sb->s_flags & SB_BORN)) + if (!(sb->s_flags & SB_DYING) && sb->s_root && + (sb->s_flags & SB_BORN)) return true; - up_read(&sb->s_umount); + super_unlock_shared(sb); } return false; @@ -439,13 +636,13 @@ bool trylock_super(struct super_block *sb) void retire_super(struct super_block *sb) { WARN_ON(!sb->s_bdev); - down_write(&sb->s_umount); + __super_lock_excl(sb); if (sb->s_iflags & SB_I_PERSB_BDI) { bdi_unregister(sb->s_bdi); sb->s_iflags &= ~SB_I_PERSB_BDI; } sb->s_iflags |= SB_I_RETIRED; - up_write(&sb->s_umount); + super_unlock_excl(sb); } EXPORT_SYMBOL(retire_super); @@ -517,11 +714,17 @@ void generic_shutdown_super(struct super_block *sb) spin_unlock(&sb->s_inode_list_lock); } } - spin_lock(&sb_lock); - /* should be initialized for __put_super_and_need_restart() */ - hlist_del_init(&sb->s_instances); - spin_unlock(&sb_lock); - up_write(&sb->s_umount); + /* + * Broadcast to everyone that grabbed a temporary reference to this + * superblock before we removed it from @fs_supers that the superblock + * is dying. Every walker of @fs_supers outside of sget{_fc}() will now + * discard this superblock and treat it as dead. + * + * We leave the superblock on @fs_supers so it can be found by + * sget{_fc}() until we passed sb->kill_sb(). + */ + super_wake(sb, SB_DYING); + super_unlock_excl(sb); if (sb->s_bdi != &noop_backing_dev_info) { if (sb->s_iflags & SB_I_PERSB_BDI) bdi_unregister(sb->s_bdi); @@ -546,17 +749,31 @@ bool mount_capable(struct fs_context *fc) * @test: Comparison callback * @set: Setup callback * - * Find or create a superblock using the parameters stored in the filesystem - * context and the two callback functions. + * Create a new superblock or find an existing one. + * + * The @test callback is used to find a matching existing superblock. + * Whether or not the requested parameters in @fc are taken into account + * is specific to the @test callback that is used. They may even be + * completely ignored. + * + * If an extant superblock is matched, it will be returned unless: + * + * (1) the namespace the filesystem context @fc and the extant + * superblock's namespace differ * - * If an extant superblock is matched, then that will be returned with an - * elevated reference count that the caller must transfer or discard. + * (2) the filesystem context @fc has requested that reusing an extant + * superblock is not allowed + * + * In both cases EBUSY will be returned. * * If no match is made, a new superblock will be allocated and basic - * initialisation will be performed (s_type, s_fs_info and s_id will be set and - * the set() callback will be invoked), the superblock will be published and it - * will be returned in a partially constructed state with SB_BORN and SB_ACTIVE - * as yet unset. + * initialisation will be performed (s_type, s_fs_info and s_id will be + * set and the @set callback will be invoked), the superblock will be + * published and it will be returned in a partially constructed state + * with SB_BORN and SB_ACTIVE as yet unset. + * + * Return: On success, an extant or newly created superblock is + * returned. On failure an error pointer is returned. */ struct super_block *sget_fc(struct fs_context *fc, int (*test)(struct super_block *, struct fs_context *), @@ -595,20 +812,29 @@ retry: s->s_type = fc->fs_type; s->s_iflags |= fc->s_iflags; strscpy(s->s_id, s->s_type->name, sizeof(s->s_id)); + /* + * Make the superblock visible on @super_blocks and @fs_supers. + * It's in a nascent state and users should wait on SB_BORN or + * SB_DYING to be set. + */ list_add_tail(&s->s_list, &super_blocks); hlist_add_head(&s->s_instances, &s->s_type->fs_supers); spin_unlock(&sb_lock); get_filesystem(s->s_type); - register_shrinker_prepared(&s->s_shrink); + shrinker_register(s->s_shrink); return s; share_extant_sb: - if (user_ns != old->s_user_ns) { + if (user_ns != old->s_user_ns || fc->exclusive) { spin_unlock(&sb_lock); destroy_unused_super(s); + if (fc->exclusive) + warnfc(fc, "reusing existing filesystem not allowed"); + else + warnfc(fc, "reusing existing filesystem in another namespace not allowed"); return ERR_PTR(-EBUSY); } - if (!grab_super(old)) + if (!grab_super_dead(old)) goto retry; destroy_unused_super(s); return old; @@ -652,7 +878,7 @@ retry: destroy_unused_super(s); return ERR_PTR(-EBUSY); } - if (!grab_super(old)) + if (!grab_super_dead(old)) goto retry; destroy_unused_super(s); return old; @@ -678,14 +904,14 @@ retry: hlist_add_head(&s->s_instances, &type->fs_supers); spin_unlock(&sb_lock); get_filesystem(type); - register_shrinker_prepared(&s->s_shrink); + shrinker_register(s->s_shrink); return s; } EXPORT_SYMBOL(sget); void drop_super(struct super_block *sb) { - up_read(&sb->s_umount); + super_unlock_shared(sb); put_super(sb); } @@ -693,7 +919,7 @@ EXPORT_SYMBOL(drop_super); void drop_super_exclusive(struct super_block *sb) { - up_write(&sb->s_umount); + super_unlock_excl(sb); put_super(sb); } EXPORT_SYMBOL(drop_super_exclusive); @@ -704,7 +930,8 @@ static void __iterate_supers(void (*f)(struct super_block *)) spin_lock(&sb_lock); list_for_each_entry(sb, &super_blocks, s_list) { - if (hlist_unhashed(&sb->s_instances)) + /* Pairs with memory marrier in super_wake(). */ + if (smp_load_acquire(&sb->s_flags) & SB_DYING) continue; sb->s_count++; spin_unlock(&sb_lock); @@ -734,15 +961,15 @@ void iterate_supers(void (*f)(struct super_block *, void *), void *arg) spin_lock(&sb_lock); list_for_each_entry(sb, &super_blocks, s_list) { - if (hlist_unhashed(&sb->s_instances)) - continue; + bool born; + sb->s_count++; spin_unlock(&sb_lock); - down_read(&sb->s_umount); - if (sb->s_root && (sb->s_flags & SB_BORN)) + born = super_lock_shared(sb); + if (born && sb->s_root) f(sb, arg); - up_read(&sb->s_umount); + super_unlock_shared(sb); spin_lock(&sb_lock); if (p) @@ -770,13 +997,15 @@ void iterate_supers_type(struct file_system_type *type, spin_lock(&sb_lock); hlist_for_each_entry(sb, &type->fs_supers, s_instances) { + bool born; + sb->s_count++; spin_unlock(&sb_lock); - down_read(&sb->s_umount); - if (sb->s_root && (sb->s_flags & SB_BORN)) + born = super_lock_shared(sb); + if (born && sb->s_root) f(sb, arg); - up_read(&sb->s_umount); + super_unlock_shared(sb); spin_lock(&sb_lock); if (p) @@ -791,43 +1020,6 @@ void iterate_supers_type(struct file_system_type *type, EXPORT_SYMBOL(iterate_supers_type); /** - * get_super - get the superblock of a device - * @bdev: device to get the superblock for - * - * Scans the superblock list and finds the superblock of the file system - * mounted on the device given. %NULL is returned if no match is found. - */ -struct super_block *get_super(struct block_device *bdev) -{ - struct super_block *sb; - - if (!bdev) - return NULL; - - spin_lock(&sb_lock); -rescan: - list_for_each_entry(sb, &super_blocks, s_list) { - if (hlist_unhashed(&sb->s_instances)) - continue; - if (sb->s_bdev == bdev) { - sb->s_count++; - spin_unlock(&sb_lock); - down_read(&sb->s_umount); - /* still alive? */ - if (sb->s_root && (sb->s_flags & SB_BORN)) - return sb; - up_read(&sb->s_umount); - /* nope, got unmounted */ - spin_lock(&sb_lock); - __put_super(sb); - goto rescan; - } - } - spin_unlock(&sb_lock); - return NULL; -} - -/** * get_active_super - get an active reference to the superblock of a device * @bdev: device to get the superblock for * @@ -842,15 +1034,12 @@ struct super_block *get_active_super(struct block_device *bdev) if (!bdev) return NULL; -restart: spin_lock(&sb_lock); list_for_each_entry(sb, &super_blocks, s_list) { - if (hlist_unhashed(&sb->s_instances)) - continue; if (sb->s_bdev == bdev) { if (!grab_super(sb)) - goto restart; - up_write(&sb->s_umount); + return NULL; + super_unlock_excl(sb); return sb; } } @@ -863,28 +1052,21 @@ struct super_block *user_get_super(dev_t dev, bool excl) struct super_block *sb; spin_lock(&sb_lock); -rescan: list_for_each_entry(sb, &super_blocks, s_list) { - if (hlist_unhashed(&sb->s_instances)) - continue; if (sb->s_dev == dev) { + bool born; + sb->s_count++; spin_unlock(&sb_lock); - if (excl) - down_write(&sb->s_umount); - else - down_read(&sb->s_umount); /* still alive? */ - if (sb->s_root && (sb->s_flags & SB_BORN)) + born = super_lock(sb, excl); + if (born && sb->s_root) return sb; - if (excl) - up_write(&sb->s_umount); - else - up_read(&sb->s_umount); + super_unlock(sb, excl); /* nope, got unmounted */ spin_lock(&sb_lock); __put_super(sb); - goto rescan; + break; } } spin_unlock(&sb_lock); @@ -926,9 +1108,9 @@ int reconfigure_super(struct fs_context *fc) if (remount_ro) { if (!hlist_empty(&sb->s_pins)) { - up_write(&sb->s_umount); + super_unlock_excl(sb); group_pin_kill(&sb->s_pins); - down_write(&sb->s_umount); + __super_lock_excl(sb); if (!sb->s_root) return 0; if (sb->s_writers.frozen != SB_UNFROZEN) @@ -991,9 +1173,9 @@ cancel_readonly: static void do_emergency_remount_callback(struct super_block *sb) { - down_write(&sb->s_umount); - if (sb->s_root && sb->s_bdev && (sb->s_flags & SB_BORN) && - !sb_rdonly(sb)) { + bool born = super_lock_excl(sb); + + if (born && sb->s_root && sb->s_bdev && !sb_rdonly(sb)) { struct fs_context *fc; fc = fs_context_for_reconfigure(sb->s_root, @@ -1004,7 +1186,7 @@ static void do_emergency_remount_callback(struct super_block *sb) put_fs_context(fc); } } - up_write(&sb->s_umount); + super_unlock_excl(sb); } static void do_emergency_remount(struct work_struct *work) @@ -1027,12 +1209,15 @@ void emergency_remount(void) static void do_thaw_all_callback(struct super_block *sb) { - down_write(&sb->s_umount); - if (sb->s_root && sb->s_flags & SB_BORN) { - emergency_thaw_bdev(sb); - thaw_super_locked(sb); + bool born = super_lock_excl(sb); + + if (born && sb->s_root) { + if (IS_ENABLED(CONFIG_BLOCK)) + while (sb->s_bdev && !thaw_bdev(sb->s_bdev)) + pr_warn("Emergency Thaw on %pg\n", sb->s_bdev); + thaw_super_locked(sb, FREEZE_HOLDER_USERSPACE); } else { - up_write(&sb->s_umount); + super_unlock_excl(sb); } } @@ -1108,6 +1293,7 @@ void kill_anon_super(struct super_block *sb) { dev_t dev = sb->s_dev; generic_shutdown_super(sb); + kill_super_notify(sb); free_anon_bdev(dev); } EXPORT_SYMBOL(kill_anon_super); @@ -1136,7 +1322,7 @@ static int test_single_super(struct super_block *s, struct fs_context *fc) return 1; } -static int vfs_get_super(struct fs_context *fc, bool reconf, +static int vfs_get_super(struct fs_context *fc, int (*test)(struct super_block *, struct fs_context *), int (*fill_super)(struct super_block *sb, struct fs_context *fc)) @@ -1154,19 +1340,9 @@ static int vfs_get_super(struct fs_context *fc, bool reconf, goto error; sb->s_flags |= SB_ACTIVE; - fc->root = dget(sb->s_root); - } else { - fc->root = dget(sb->s_root); - if (reconf) { - err = reconfigure_super(fc); - if (err < 0) { - dput(fc->root); - fc->root = NULL; - goto error; - } - } } + fc->root = dget(sb->s_root); return 0; error: @@ -1178,7 +1354,7 @@ int get_tree_nodev(struct fs_context *fc, int (*fill_super)(struct super_block *sb, struct fs_context *fc)) { - return vfs_get_super(fc, false, NULL, fill_super); + return vfs_get_super(fc, NULL, fill_super); } EXPORT_SYMBOL(get_tree_nodev); @@ -1186,66 +1362,193 @@ int get_tree_single(struct fs_context *fc, int (*fill_super)(struct super_block *sb, struct fs_context *fc)) { - return vfs_get_super(fc, false, test_single_super, fill_super); + return vfs_get_super(fc, test_single_super, fill_super); } EXPORT_SYMBOL(get_tree_single); -int get_tree_single_reconf(struct fs_context *fc, - int (*fill_super)(struct super_block *sb, - struct fs_context *fc)) -{ - return vfs_get_super(fc, true, test_single_super, fill_super); -} -EXPORT_SYMBOL(get_tree_single_reconf); - int get_tree_keyed(struct fs_context *fc, int (*fill_super)(struct super_block *sb, struct fs_context *fc), void *key) { fc->s_fs_info = key; - return vfs_get_super(fc, false, test_keyed_super, fill_super); + return vfs_get_super(fc, test_keyed_super, fill_super); } EXPORT_SYMBOL(get_tree_keyed); +static int set_bdev_super(struct super_block *s, void *data) +{ + s->s_dev = *(dev_t *)data; + return 0; +} + +static int super_s_dev_set(struct super_block *s, struct fs_context *fc) +{ + return set_bdev_super(s, fc->sget_key); +} + +static int super_s_dev_test(struct super_block *s, struct fs_context *fc) +{ + return !(s->s_iflags & SB_I_RETIRED) && + s->s_dev == *(dev_t *)fc->sget_key; +} + +/** + * sget_dev - Find or create a superblock by device number + * @fc: Filesystem context. + * @dev: device number + * + * Find or create a superblock using the provided device number that + * will be stored in fc->sget_key. + * + * If an extant superblock is matched, then that will be returned with + * an elevated reference count that the caller must transfer or discard. + * + * If no match is made, a new superblock will be allocated and basic + * initialisation will be performed (s_type, s_fs_info, s_id, s_dev will + * be set). The superblock will be published and it will be returned in + * a partially constructed state with SB_BORN and SB_ACTIVE as yet + * unset. + * + * Return: an existing or newly created superblock on success, an error + * pointer on failure. + */ +struct super_block *sget_dev(struct fs_context *fc, dev_t dev) +{ + fc->sget_key = &dev; + return sget_fc(fc, super_s_dev_test, super_s_dev_set); +} +EXPORT_SYMBOL(sget_dev); + #ifdef CONFIG_BLOCK -static void fs_mark_dead(struct block_device *bdev) +/* + * Lock the superblock that is holder of the bdev. Returns the superblock + * pointer if we successfully locked the superblock and it is alive. Otherwise + * we return NULL and just unlock bdev->bd_holder_lock. + * + * The function must be called with bdev->bd_holder_lock and releases it. + */ +static struct super_block *bdev_super_lock_shared(struct block_device *bdev) + __releases(&bdev->bd_holder_lock) +{ + struct super_block *sb = bdev->bd_holder; + bool born; + + lockdep_assert_held(&bdev->bd_holder_lock); + lockdep_assert_not_held(&sb->s_umount); + lockdep_assert_not_held(&bdev->bd_disk->open_mutex); + + /* Make sure sb doesn't go away from under us */ + spin_lock(&sb_lock); + sb->s_count++; + spin_unlock(&sb_lock); + mutex_unlock(&bdev->bd_holder_lock); + + born = super_lock_shared(sb); + if (!born || !sb->s_root || !(sb->s_flags & SB_ACTIVE)) { + super_unlock_shared(sb); + put_super(sb); + return NULL; + } + /* + * The superblock is active and we hold s_umount, we can drop our + * temporary reference now. + */ + put_super(sb); + return sb; +} + +static void fs_bdev_mark_dead(struct block_device *bdev, bool surprise) { struct super_block *sb; - sb = get_super(bdev); + sb = bdev_super_lock_shared(bdev); if (!sb) return; + if (!surprise) + sync_filesystem(sb); + shrink_dcache_sb(sb); + invalidate_inodes(sb); if (sb->s_op->shutdown) sb->s_op->shutdown(sb); - drop_super(sb); -} -static const struct blk_holder_ops fs_holder_ops = { - .mark_dead = fs_mark_dead, -}; + super_unlock_shared(sb); +} -static int set_bdev_super(struct super_block *s, void *data) +static void fs_bdev_sync(struct block_device *bdev) { - s->s_bdev = data; - s->s_dev = s->s_bdev->bd_dev; - s->s_bdi = bdi_get(s->s_bdev->bd_disk->bdi); + struct super_block *sb; - if (bdev_stable_writes(s->s_bdev)) - s->s_iflags |= SB_I_STABLE_WRITES; - return 0; + sb = bdev_super_lock_shared(bdev); + if (!sb) + return; + sync_filesystem(sb); + super_unlock_shared(sb); } -static int set_bdev_super_fc(struct super_block *s, struct fs_context *fc) -{ - return set_bdev_super(s, fc->sget_key); -} +const struct blk_holder_ops fs_holder_ops = { + .mark_dead = fs_bdev_mark_dead, + .sync = fs_bdev_sync, +}; +EXPORT_SYMBOL_GPL(fs_holder_ops); -static int test_bdev_super_fc(struct super_block *s, struct fs_context *fc) +int setup_bdev_super(struct super_block *sb, int sb_flags, + struct fs_context *fc) { - return !(s->s_iflags & SB_I_RETIRED) && s->s_bdev == fc->sget_key; + blk_mode_t mode = sb_open_mode(sb_flags); + struct bdev_handle *bdev_handle; + struct block_device *bdev; + + bdev_handle = bdev_open_by_dev(sb->s_dev, mode, sb, &fs_holder_ops); + if (IS_ERR(bdev_handle)) { + if (fc) + errorf(fc, "%s: Can't open blockdev", fc->source); + return PTR_ERR(bdev_handle); + } + bdev = bdev_handle->bdev; + + /* + * This really should be in blkdev_get_by_dev, but right now can't due + * to legacy issues that require us to allow opening a block device node + * writable from userspace even for a read-only block device. + */ + if ((mode & BLK_OPEN_WRITE) && bdev_read_only(bdev)) { + bdev_release(bdev_handle); + return -EACCES; + } + + /* + * Until SB_BORN flag is set, there can be no active superblock + * references and thus no filesystem freezing. get_active_super() will + * just loop waiting for SB_BORN so even freeze_bdev() cannot proceed. + * + * It is enough to check bdev was not frozen before we set s_bdev. + */ + mutex_lock(&bdev->bd_fsfreeze_mutex); + if (bdev->bd_fsfreeze_count > 0) { + mutex_unlock(&bdev->bd_fsfreeze_mutex); + if (fc) + warnf(fc, "%pg: Can't mount, blockdev is frozen", bdev); + bdev_release(bdev_handle); + return -EBUSY; + } + spin_lock(&sb_lock); + sb->s_bdev_handle = bdev_handle; + sb->s_bdev = bdev; + sb->s_bdi = bdi_get(bdev->bd_disk->bdi); + if (bdev_stable_writes(bdev)) + sb->s_iflags |= SB_I_STABLE_WRITES; + spin_unlock(&sb_lock); + mutex_unlock(&bdev->bd_fsfreeze_mutex); + + snprintf(sb->s_id, sizeof(sb->s_id), "%pg", bdev); + shrinker_debugfs_rename(sb->s_shrink, "sb-%s:%s", sb->s_type->name, + sb->s_id); + sb_set_blocksize(sb, block_size(bdev)); + return 0; } +EXPORT_SYMBOL_GPL(setup_bdev_super); /** * get_tree_bdev - Get a superblock based on a single block device @@ -1256,73 +1559,48 @@ int get_tree_bdev(struct fs_context *fc, int (*fill_super)(struct super_block *, struct fs_context *)) { - struct block_device *bdev; struct super_block *s; int error = 0; + dev_t dev; if (!fc->source) return invalf(fc, "No source specified"); - bdev = blkdev_get_by_path(fc->source, sb_open_mode(fc->sb_flags), - fc->fs_type, &fs_holder_ops); - if (IS_ERR(bdev)) { - errorf(fc, "%s: Can't open blockdev", fc->source); - return PTR_ERR(bdev); - } - - /* Once the superblock is inserted into the list by sget_fc(), s_umount - * will protect the lockfs code from trying to start a snapshot while - * we are mounting - */ - mutex_lock(&bdev->bd_fsfreeze_mutex); - if (bdev->bd_fsfreeze_count > 0) { - mutex_unlock(&bdev->bd_fsfreeze_mutex); - warnf(fc, "%pg: Can't mount, blockdev is frozen", bdev); - blkdev_put(bdev, fc->fs_type); - return -EBUSY; + error = lookup_bdev(fc->source, &dev); + if (error) { + errorf(fc, "%s: Can't lookup blockdev", fc->source); + return error; } fc->sb_flags |= SB_NOSEC; - fc->sget_key = bdev; - s = sget_fc(fc, test_bdev_super_fc, set_bdev_super_fc); - mutex_unlock(&bdev->bd_fsfreeze_mutex); - if (IS_ERR(s)) { - blkdev_put(bdev, fc->fs_type); + s = sget_dev(fc, dev); + if (IS_ERR(s)) return PTR_ERR(s); - } if (s->s_root) { /* Don't summarily change the RO/RW state. */ if ((fc->sb_flags ^ s->s_flags) & SB_RDONLY) { - warnf(fc, "%pg: Can't mount, would change RO state", bdev); + warnf(fc, "%pg: Can't mount, would change RO state", s->s_bdev); deactivate_locked_super(s); - blkdev_put(bdev, fc->fs_type); return -EBUSY; } - + } else { /* - * s_umount nests inside open_mutex during - * __invalidate_device(). blkdev_put() acquires - * open_mutex and can't be called under s_umount. Drop - * s_umount temporarily. This is safe as we're - * holding an active reference. + * We drop s_umount here because we need to open the bdev and + * bdev->open_mutex ranks above s_umount (blkdev_put() -> + * bdev_mark_dead()). It is safe because we have active sb + * reference and SB_BORN is not set yet. */ - up_write(&s->s_umount); - blkdev_put(bdev, fc->fs_type); - down_write(&s->s_umount); - } else { - snprintf(s->s_id, sizeof(s->s_id), "%pg", bdev); - shrinker_debugfs_rename(&s->s_shrink, "sb-%s:%s", - fc->fs_type->name, s->s_id); - sb_set_blocksize(s, block_size(bdev)); - error = fill_super(s, fc); + super_unlock_excl(s); + error = setup_bdev_super(s, fc->sb_flags, fc); + __super_lock_excl(s); + if (!error) + error = fill_super(s, fc); if (error) { deactivate_locked_super(s); return error; } - s->s_flags |= SB_ACTIVE; - bdev->bd_super = s; } BUG_ON(fc->root); @@ -1333,79 +1611,52 @@ EXPORT_SYMBOL(get_tree_bdev); static int test_bdev_super(struct super_block *s, void *data) { - return !(s->s_iflags & SB_I_RETIRED) && (void *)s->s_bdev == data; + return !(s->s_iflags & SB_I_RETIRED) && s->s_dev == *(dev_t *)data; } struct dentry *mount_bdev(struct file_system_type *fs_type, int flags, const char *dev_name, void *data, int (*fill_super)(struct super_block *, void *, int)) { - struct block_device *bdev; struct super_block *s; - int error = 0; + int error; + dev_t dev; - bdev = blkdev_get_by_path(dev_name, sb_open_mode(flags), fs_type, - &fs_holder_ops); - if (IS_ERR(bdev)) - return ERR_CAST(bdev); + error = lookup_bdev(dev_name, &dev); + if (error) + return ERR_PTR(error); - /* - * once the super is inserted into the list by sget, s_umount - * will protect the lockfs code from trying to start a snapshot - * while we are mounting - */ - mutex_lock(&bdev->bd_fsfreeze_mutex); - if (bdev->bd_fsfreeze_count > 0) { - mutex_unlock(&bdev->bd_fsfreeze_mutex); - error = -EBUSY; - goto error_bdev; - } - s = sget(fs_type, test_bdev_super, set_bdev_super, flags | SB_NOSEC, - bdev); - mutex_unlock(&bdev->bd_fsfreeze_mutex); + flags |= SB_NOSEC; + s = sget(fs_type, test_bdev_super, set_bdev_super, flags, &dev); if (IS_ERR(s)) - goto error_s; + return ERR_CAST(s); if (s->s_root) { if ((flags ^ s->s_flags) & SB_RDONLY) { deactivate_locked_super(s); - error = -EBUSY; - goto error_bdev; + return ERR_PTR(-EBUSY); } - + } else { /* - * s_umount nests inside open_mutex during - * __invalidate_device(). blkdev_put() acquires - * open_mutex and can't be called under s_umount. Drop - * s_umount temporarily. This is safe as we're - * holding an active reference. + * We drop s_umount here because we need to open the bdev and + * bdev->open_mutex ranks above s_umount (blkdev_put() -> + * bdev_mark_dead()). It is safe because we have active sb + * reference and SB_BORN is not set yet. */ - up_write(&s->s_umount); - blkdev_put(bdev, fs_type); - down_write(&s->s_umount); - } else { - snprintf(s->s_id, sizeof(s->s_id), "%pg", bdev); - shrinker_debugfs_rename(&s->s_shrink, "sb-%s:%s", - fs_type->name, s->s_id); - sb_set_blocksize(s, block_size(bdev)); - error = fill_super(s, data, flags & SB_SILENT ? 1 : 0); + super_unlock_excl(s); + error = setup_bdev_super(s, flags, NULL); + __super_lock_excl(s); + if (!error) + error = fill_super(s, data, flags & SB_SILENT ? 1 : 0); if (error) { deactivate_locked_super(s); - goto error; + return ERR_PTR(error); } s->s_flags |= SB_ACTIVE; - bdev->bd_super = s; } return dget(s->s_root); - -error_s: - error = PTR_ERR(s); -error_bdev: - blkdev_put(bdev, fs_type); -error: - return ERR_PTR(error); } EXPORT_SYMBOL(mount_bdev); @@ -1413,10 +1664,11 @@ void kill_block_super(struct super_block *sb) { struct block_device *bdev = sb->s_bdev; - bdev->bd_super = NULL; generic_shutdown_super(sb); - sync_blockdev(bdev); - blkdev_put(bdev, sb->s_type); + if (bdev) { + sync_blockdev(bdev); + bdev_release(sb->s_bdev_handle); + } } EXPORT_SYMBOL(kill_block_super); @@ -1533,13 +1785,13 @@ int vfs_get_tree(struct fs_context *fc) WARN_ON(!sb->s_bdi); /* - * Write barrier is for super_cache_count(). We place it before setting - * SB_BORN as the data dependency between the two functions is the - * superblock structure contents that we just set up, not the SB_BORN - * flag. + * super_wake() contains a memory barrier which also care of + * ordering for super_cache_count(). We place it before setting + * SB_BORN as the data dependency between the two functions is + * the superblock structure contents that we just set up, not + * the SB_BORN flag. */ - smp_wmb(); - sb->s_flags |= SB_BORN; + super_wake(sb, SB_BORN); error = security_sb_set_mnt_opts(sb, fc->security, 0, NULL); if (unlikely(error)) { @@ -1644,14 +1896,43 @@ static void sb_freeze_unlock(struct super_block *sb, int level) percpu_up_write(sb->s_writers.rw_sem + level); } +static int wait_for_partially_frozen(struct super_block *sb) +{ + int ret = 0; + + do { + unsigned short old = sb->s_writers.frozen; + + up_write(&sb->s_umount); + ret = wait_var_event_killable(&sb->s_writers.frozen, + sb->s_writers.frozen != old); + down_write(&sb->s_umount); + } while (ret == 0 && + sb->s_writers.frozen != SB_UNFROZEN && + sb->s_writers.frozen != SB_FREEZE_COMPLETE); + + return ret; +} + /** * freeze_super - lock the filesystem and force it into a consistent state * @sb: the super to lock + * @who: context that wants to freeze * * Syncs the super to make sure the filesystem is consistent and calls the fs's - * freeze_fs. Subsequent calls to this without first thawing the fs will return + * freeze_fs. Subsequent calls to this without first thawing the fs may return * -EBUSY. * + * @who should be: + * * %FREEZE_HOLDER_USERSPACE if userspace wants to freeze the fs; + * * %FREEZE_HOLDER_KERNEL if the kernel wants to freeze the fs. + * + * The @who argument distinguishes between the kernel and userspace trying to + * freeze the filesystem. Although there cannot be multiple kernel freezes or + * multiple userspace freezes in effect at any given time, the kernel and + * userspace can both hold a filesystem frozen. The filesystem remains frozen + * until there are no kernel or userspace freezes in effect. + * * During this function, sb->s_writers.frozen goes through these values: * * SB_UNFROZEN: File system is normal, all writes progress as usual. @@ -1677,34 +1958,62 @@ static void sb_freeze_unlock(struct super_block *sb, int level) * * sb->s_writers.frozen is protected by sb->s_umount. */ -int freeze_super(struct super_block *sb) +int freeze_super(struct super_block *sb, enum freeze_holder who) { int ret; atomic_inc(&sb->s_active); - down_write(&sb->s_umount); + if (!super_lock_excl(sb)) + WARN(1, "Dying superblock while freezing!"); + +retry: + if (sb->s_writers.frozen == SB_FREEZE_COMPLETE) { + if (sb->s_writers.freeze_holders & who) { + deactivate_locked_super(sb); + return -EBUSY; + } + + WARN_ON(sb->s_writers.freeze_holders == 0); + + /* + * Someone else already holds this type of freeze; share the + * freeze and assign the active ref to the freeze. + */ + sb->s_writers.freeze_holders |= who; + super_unlock_excl(sb); + return 0; + } + if (sb->s_writers.frozen != SB_UNFROZEN) { - deactivate_locked_super(sb); - return -EBUSY; + ret = wait_for_partially_frozen(sb); + if (ret) { + deactivate_locked_super(sb); + return ret; + } + + goto retry; } if (!(sb->s_flags & SB_BORN)) { - up_write(&sb->s_umount); + super_unlock_excl(sb); return 0; /* sic - it's "nothing to do" */ } if (sb_rdonly(sb)) { /* Nothing to do really... */ + sb->s_writers.freeze_holders |= who; sb->s_writers.frozen = SB_FREEZE_COMPLETE; - up_write(&sb->s_umount); + wake_up_var(&sb->s_writers.frozen); + super_unlock_excl(sb); return 0; } sb->s_writers.frozen = SB_FREEZE_WRITE; /* Release s_umount to preserve sb_start_write -> s_umount ordering */ - up_write(&sb->s_umount); + super_unlock_excl(sb); sb_wait_write(sb, SB_FREEZE_WRITE); - down_write(&sb->s_umount); + if (!super_lock_excl(sb)) + WARN(1, "Dying superblock while freezing!"); /* Now we go and block page faults... */ sb->s_writers.frozen = SB_FREEZE_PAGEFAULT; @@ -1715,6 +2024,7 @@ int freeze_super(struct super_block *sb) if (ret) { sb->s_writers.frozen = SB_UNFROZEN; sb_freeze_unlock(sb, SB_FREEZE_PAGEFAULT); + wake_up_var(&sb->s_writers.frozen); deactivate_locked_super(sb); return ret; } @@ -1730,6 +2040,7 @@ int freeze_super(struct super_block *sb) "VFS:Filesystem freeze failed\n"); sb->s_writers.frozen = SB_UNFROZEN; sb_freeze_unlock(sb, SB_FREEZE_FS); + wake_up_var(&sb->s_writers.frozen); deactivate_locked_super(sb); return ret; } @@ -1738,24 +2049,50 @@ int freeze_super(struct super_block *sb) * For debugging purposes so that fs can warn if it sees write activity * when frozen is set to SB_FREEZE_COMPLETE, and for thaw_super(). */ + sb->s_writers.freeze_holders |= who; sb->s_writers.frozen = SB_FREEZE_COMPLETE; + wake_up_var(&sb->s_writers.frozen); lockdep_sb_freeze_release(sb); - up_write(&sb->s_umount); + super_unlock_excl(sb); return 0; } EXPORT_SYMBOL(freeze_super); -static int thaw_super_locked(struct super_block *sb) +/* + * Undoes the effect of a freeze_super_locked call. If the filesystem is + * frozen both by userspace and the kernel, a thaw call from either source + * removes that state without releasing the other state or unlocking the + * filesystem. + */ +static int thaw_super_locked(struct super_block *sb, enum freeze_holder who) { int error; - if (sb->s_writers.frozen != SB_FREEZE_COMPLETE) { - up_write(&sb->s_umount); + if (sb->s_writers.frozen == SB_FREEZE_COMPLETE) { + if (!(sb->s_writers.freeze_holders & who)) { + super_unlock_excl(sb); + return -EINVAL; + } + + /* + * Freeze is shared with someone else. Release our hold and + * drop the active ref that freeze_super assigned to the + * freezer. + */ + if (sb->s_writers.freeze_holders & ~who) { + sb->s_writers.freeze_holders &= ~who; + deactivate_locked_super(sb); + return 0; + } + } else { + super_unlock_excl(sb); return -EINVAL; } if (sb_rdonly(sb)) { + sb->s_writers.freeze_holders &= ~who; sb->s_writers.frozen = SB_UNFROZEN; + wake_up_var(&sb->s_writers.frozen); goto out; } @@ -1764,15 +2101,16 @@ static int thaw_super_locked(struct super_block *sb) if (sb->s_op->unfreeze_fs) { error = sb->s_op->unfreeze_fs(sb); if (error) { - printk(KERN_ERR - "VFS:Filesystem thaw failed\n"); + printk(KERN_ERR "VFS:Filesystem thaw failed\n"); lockdep_sb_freeze_release(sb); - up_write(&sb->s_umount); + super_unlock_excl(sb); return error; } } + sb->s_writers.freeze_holders &= ~who; sb->s_writers.frozen = SB_UNFROZEN; + wake_up_var(&sb->s_writers.frozen); sb_freeze_unlock(sb, SB_FREEZE_FS); out: deactivate_locked_super(sb); @@ -1782,13 +2120,20 @@ out: /** * thaw_super -- unlock filesystem * @sb: the super to thaw + * @who: context that wants to freeze + * + * Unlocks the filesystem and marks it writeable again after freeze_super() + * if there are no remaining freezes on the filesystem. * - * Unlocks the filesystem and marks it writeable again after freeze_super(). + * @who should be: + * * %FREEZE_HOLDER_USERSPACE if userspace wants to thaw the fs; + * * %FREEZE_HOLDER_KERNEL if the kernel wants to thaw the fs. */ -int thaw_super(struct super_block *sb) +int thaw_super(struct super_block *sb, enum freeze_holder who) { - down_write(&sb->s_umount); - return thaw_super_locked(sb); + if (!super_lock_excl(sb)) + WARN(1, "Dying superblock while thawing!"); + return thaw_super_locked(sb, who); } EXPORT_SYMBOL(thaw_super); @@ -1815,3 +2160,4 @@ int sb_init_dio_done_wq(struct super_block *sb) destroy_workqueue(wq); return 0; } +EXPORT_SYMBOL_GPL(sb_init_dio_done_wq); diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c index a12ac0356c69..6b7652fb8050 100644 --- a/fs/sysfs/file.c +++ b/fs/sysfs/file.c @@ -167,6 +167,18 @@ static int sysfs_kf_bin_mmap(struct kernfs_open_file *of, return battr->mmap(of->file, kobj, battr, vma); } +static loff_t sysfs_kf_bin_llseek(struct kernfs_open_file *of, loff_t offset, + int whence) +{ + struct bin_attribute *battr = of->kn->priv; + struct kobject *kobj = of->kn->parent->priv; + + if (battr->llseek) + return battr->llseek(of->file, kobj, battr, offset, whence); + else + return generic_file_llseek(of->file, offset, whence); +} + static int sysfs_kf_bin_open(struct kernfs_open_file *of) { struct bin_attribute *battr = of->kn->priv; @@ -249,6 +261,7 @@ static const struct kernfs_ops sysfs_bin_kfops_mmap = { .write = sysfs_kf_bin_write, .mmap = sysfs_kf_bin_mmap, .open = sysfs_kf_bin_open, + .llseek = sysfs_kf_bin_llseek, }; int sysfs_add_file_mode_ns(struct kernfs_node *parent, diff --git a/fs/sysv/Kconfig b/fs/sysv/Kconfig index b4e23e03fbeb..67b3f90afbfd 100644 --- a/fs/sysv/Kconfig +++ b/fs/sysv/Kconfig @@ -2,6 +2,7 @@ config SYSV_FS tristate "System V/Xenix/V7/Coherent file system support" depends on BLOCK + select BUFFER_HEAD help SCO, Xenix and Coherent are commercial Unix systems for Intel machines, and Version 7 was used on the DEC PDP-11. Saying Y diff --git a/fs/sysv/dir.c b/fs/sysv/dir.c index 0140010aa0c3..2e126d72d619 100644 --- a/fs/sysv/dir.c +++ b/fs/sysv/dir.c @@ -224,7 +224,7 @@ got_it: memset (de->name + namelen, 0, SYSV_DIRSIZE - namelen - 2); de->inode = cpu_to_fs16(SYSV_SB(inode->i_sb), inode->i_ino); dir_commit_chunk(page, pos, SYSV_DIRSIZE); - dir->i_mtime = dir->i_ctime = current_time(dir); + inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir)); mark_inode_dirty(dir); err = sysv_handle_dirsync(dir); out_page: @@ -249,7 +249,7 @@ int sysv_delete_entry(struct sysv_dir_entry *de, struct page *page) } de->inode = 0; dir_commit_chunk(page, pos, SYSV_DIRSIZE); - inode->i_ctime = inode->i_mtime = current_time(inode); + inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); mark_inode_dirty(inode); return sysv_handle_dirsync(inode); } @@ -346,7 +346,7 @@ int sysv_set_link(struct sysv_dir_entry *de, struct page *page, } de->inode = cpu_to_fs16(SYSV_SB(inode->i_sb), inode->i_ino); dir_commit_chunk(page, pos, SYSV_DIRSIZE); - dir->i_mtime = dir->i_ctime = current_time(dir); + inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir)); mark_inode_dirty(dir); return sysv_handle_dirsync(inode); } diff --git a/fs/sysv/ialloc.c b/fs/sysv/ialloc.c index e732879036ab..269df6d49815 100644 --- a/fs/sysv/ialloc.c +++ b/fs/sysv/ialloc.c @@ -165,7 +165,7 @@ struct inode * sysv_new_inode(const struct inode * dir, umode_t mode) dirty_sb(sb); inode_init_owner(&nop_mnt_idmap, inode, dir, mode); inode->i_ino = fs16_to_cpu(sbi, ino); - inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode); + simple_inode_init_ts(inode); inode->i_blocks = 0; memset(SYSV_I(inode)->i_data, 0, sizeof(SYSV_I(inode)->i_data)); SYSV_I(inode)->i_dir_start_lookup = 0; diff --git a/fs/sysv/inode.c b/fs/sysv/inode.c index 9e8d4a6fb2f3..5a915b2e68f5 100644 --- a/fs/sysv/inode.c +++ b/fs/sysv/inode.c @@ -200,12 +200,9 @@ struct inode *sysv_iget(struct super_block *sb, unsigned int ino) i_gid_write(inode, (gid_t)fs16_to_cpu(sbi, raw_inode->i_gid)); set_nlink(inode, fs16_to_cpu(sbi, raw_inode->i_nlink)); inode->i_size = fs32_to_cpu(sbi, raw_inode->i_size); - inode->i_atime.tv_sec = fs32_to_cpu(sbi, raw_inode->i_atime); - inode->i_mtime.tv_sec = fs32_to_cpu(sbi, raw_inode->i_mtime); - inode->i_ctime.tv_sec = fs32_to_cpu(sbi, raw_inode->i_ctime); - inode->i_ctime.tv_nsec = 0; - inode->i_atime.tv_nsec = 0; - inode->i_mtime.tv_nsec = 0; + inode_set_atime(inode, fs32_to_cpu(sbi, raw_inode->i_atime), 0); + inode_set_mtime(inode, fs32_to_cpu(sbi, raw_inode->i_mtime), 0); + inode_set_ctime(inode, fs32_to_cpu(sbi, raw_inode->i_ctime), 0); inode->i_blocks = 0; si = SYSV_I(inode); @@ -254,9 +251,9 @@ static int __sysv_write_inode(struct inode *inode, int wait) raw_inode->i_gid = cpu_to_fs16(sbi, fs_high2lowgid(i_gid_read(inode))); raw_inode->i_nlink = cpu_to_fs16(sbi, inode->i_nlink); raw_inode->i_size = cpu_to_fs32(sbi, inode->i_size); - raw_inode->i_atime = cpu_to_fs32(sbi, inode->i_atime.tv_sec); - raw_inode->i_mtime = cpu_to_fs32(sbi, inode->i_mtime.tv_sec); - raw_inode->i_ctime = cpu_to_fs32(sbi, inode->i_ctime.tv_sec); + raw_inode->i_atime = cpu_to_fs32(sbi, inode_get_atime_sec(inode)); + raw_inode->i_mtime = cpu_to_fs32(sbi, inode_get_mtime_sec(inode)); + raw_inode->i_ctime = cpu_to_fs32(sbi, inode_get_ctime_sec(inode)); si = SYSV_I(inode); if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) diff --git a/fs/sysv/itree.c b/fs/sysv/itree.c index 58d7f43a1371..725981474e5f 100644 --- a/fs/sysv/itree.c +++ b/fs/sysv/itree.c @@ -183,7 +183,7 @@ static inline int splice_branch(struct inode *inode, *where->p = where->key; write_unlock(&pointers_lock); - inode->i_ctime = current_time(inode); + inode_set_ctime_current(inode); /* had we spliced it onto indirect block? */ if (where->bh) @@ -423,7 +423,7 @@ do_indirects: } n++; } - inode->i_mtime = inode->i_ctime = current_time(inode); + inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); if (IS_SYNC(inode)) sysv_sync_inode (inode); else @@ -449,7 +449,8 @@ int sysv_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int flags) { struct super_block *s = path->dentry->d_sb; - generic_fillattr(&nop_mnt_idmap, d_inode(path->dentry), stat); + generic_fillattr(&nop_mnt_idmap, request_mask, d_inode(path->dentry), + stat); stat->blocks = (s->s_blocksize / 512) * sysv_nblocks(s, stat->size); stat->blksize = s->s_blocksize; return 0; diff --git a/fs/sysv/namei.c b/fs/sysv/namei.c index fcf163fea3ad..d6b73798071b 100644 --- a/fs/sysv/namei.c +++ b/fs/sysv/namei.c @@ -103,7 +103,7 @@ static int sysv_link(struct dentry * old_dentry, struct inode * dir, { struct inode *inode = d_inode(old_dentry); - inode->i_ctime = current_time(inode); + inode_set_ctime_current(inode); inode_inc_link_count(inode); ihold(inode); @@ -161,7 +161,7 @@ static int sysv_unlink(struct inode * dir, struct dentry * dentry) err = sysv_delete_entry(de, page); if (!err) { - inode->i_ctime = dir->i_ctime; + inode_set_ctime_to_ts(inode, inode_get_ctime(dir)); inode_dec_link_count(inode); } unmap_and_put_page(page, de); @@ -230,7 +230,7 @@ static int sysv_rename(struct mnt_idmap *idmap, struct inode *old_dir, unmap_and_put_page(new_page, new_de); if (err) goto out_dir; - new_inode->i_ctime = current_time(new_inode); + inode_set_ctime_current(new_inode); if (dir_de) drop_nlink(new_inode); inode_dec_link_count(new_inode); diff --git a/fs/tracefs/Makefile b/fs/tracefs/Makefile index 7c35a282b484..73c56da8e284 100644 --- a/fs/tracefs/Makefile +++ b/fs/tracefs/Makefile @@ -1,5 +1,6 @@ # SPDX-License-Identifier: GPL-2.0-only tracefs-objs := inode.o +tracefs-objs += event_inode.o obj-$(CONFIG_TRACING) += tracefs.o diff --git a/fs/tracefs/event_inode.c b/fs/tracefs/event_inode.c new file mode 100644 index 000000000000..2ccc849a5bda --- /dev/null +++ b/fs/tracefs/event_inode.c @@ -0,0 +1,1077 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * event_inode.c - part of tracefs, a pseudo file system for activating tracing + * + * Copyright (C) 2020-23 VMware Inc, author: Steven Rostedt <rostedt@goodmis.org> + * Copyright (C) 2020-23 VMware Inc, author: Ajay Kaher <akaher@vmware.com> + * Copyright (C) 2023 Google, author: Steven Rostedt <rostedt@goodmis.org> + * + * eventfs is used to dynamically create inodes and dentries based on the + * meta data provided by the tracing system. + * + * eventfs stores the meta-data of files/dirs and holds off on creating + * inodes/dentries of the files. When accessed, the eventfs will create the + * inodes/dentries in a just-in-time (JIT) manner. The eventfs will clean up + * and delete the inodes/dentries when they are no longer referenced. + */ +#include <linux/fsnotify.h> +#include <linux/fs.h> +#include <linux/namei.h> +#include <linux/workqueue.h> +#include <linux/security.h> +#include <linux/tracefs.h> +#include <linux/kref.h> +#include <linux/delay.h> +#include "internal.h" + +/* + * eventfs_mutex protects the eventfs_inode (ei) dentry. Any access + * to the ei->dentry must be done under this mutex and after checking + * if ei->is_freed is not set. When ei->is_freed is set, the dentry + * is on its way to being freed after the last dput() is made on it. + */ +static DEFINE_MUTEX(eventfs_mutex); + +/* + * The eventfs_inode (ei) itself is protected by SRCU. It is released from + * its parent's list and will have is_freed set (under eventfs_mutex). + * After the SRCU grace period is over and the last dput() is called + * the ei is freed. + */ +DEFINE_STATIC_SRCU(eventfs_srcu); + +/* Mode is unsigned short, use the upper bits for flags */ +enum { + EVENTFS_SAVE_MODE = BIT(16), + EVENTFS_SAVE_UID = BIT(17), + EVENTFS_SAVE_GID = BIT(18), +}; + +#define EVENTFS_MODE_MASK (EVENTFS_SAVE_MODE - 1) + +static struct dentry *eventfs_root_lookup(struct inode *dir, + struct dentry *dentry, + unsigned int flags); +static int dcache_dir_open_wrapper(struct inode *inode, struct file *file); +static int dcache_readdir_wrapper(struct file *file, struct dir_context *ctx); +static int eventfs_release(struct inode *inode, struct file *file); + +static void update_attr(struct eventfs_attr *attr, struct iattr *iattr) +{ + unsigned int ia_valid = iattr->ia_valid; + + if (ia_valid & ATTR_MODE) { + attr->mode = (attr->mode & ~EVENTFS_MODE_MASK) | + (iattr->ia_mode & EVENTFS_MODE_MASK) | + EVENTFS_SAVE_MODE; + } + if (ia_valid & ATTR_UID) { + attr->mode |= EVENTFS_SAVE_UID; + attr->uid = iattr->ia_uid; + } + if (ia_valid & ATTR_GID) { + attr->mode |= EVENTFS_SAVE_GID; + attr->gid = iattr->ia_gid; + } +} + +static int eventfs_set_attr(struct mnt_idmap *idmap, struct dentry *dentry, + struct iattr *iattr) +{ + const struct eventfs_entry *entry; + struct eventfs_inode *ei; + const char *name; + int ret; + + mutex_lock(&eventfs_mutex); + ei = dentry->d_fsdata; + if (ei->is_freed) { + /* Do not allow changes if the event is about to be removed. */ + mutex_unlock(&eventfs_mutex); + return -ENODEV; + } + + /* Preallocate the children mode array if necessary */ + if (!(dentry->d_inode->i_mode & S_IFDIR)) { + if (!ei->entry_attrs) { + ei->entry_attrs = kzalloc(sizeof(*ei->entry_attrs) * ei->nr_entries, + GFP_NOFS); + if (!ei->entry_attrs) { + ret = -ENOMEM; + goto out; + } + } + } + + ret = simple_setattr(idmap, dentry, iattr); + if (ret < 0) + goto out; + + /* + * If this is a dir, then update the ei cache, only the file + * mode is saved in the ei->m_children, and the ownership is + * determined by the parent directory. + */ + if (dentry->d_inode->i_mode & S_IFDIR) { + update_attr(&ei->attr, iattr); + + } else { + name = dentry->d_name.name; + + for (int i = 0; i < ei->nr_entries; i++) { + entry = &ei->entries[i]; + if (strcmp(name, entry->name) == 0) { + update_attr(&ei->entry_attrs[i], iattr); + break; + } + } + } + out: + mutex_unlock(&eventfs_mutex); + return ret; +} + +static const struct inode_operations eventfs_root_dir_inode_operations = { + .lookup = eventfs_root_lookup, + .setattr = eventfs_set_attr, +}; + +static const struct inode_operations eventfs_file_inode_operations = { + .setattr = eventfs_set_attr, +}; + +static const struct file_operations eventfs_file_operations = { + .open = dcache_dir_open_wrapper, + .read = generic_read_dir, + .iterate_shared = dcache_readdir_wrapper, + .llseek = generic_file_llseek, + .release = eventfs_release, +}; + +static void update_inode_attr(struct dentry *dentry, struct inode *inode, + struct eventfs_attr *attr, umode_t mode) +{ + if (!attr) { + inode->i_mode = mode; + return; + } + + if (attr->mode & EVENTFS_SAVE_MODE) + inode->i_mode = attr->mode & EVENTFS_MODE_MASK; + else + inode->i_mode = mode; + + if (attr->mode & EVENTFS_SAVE_UID) + inode->i_uid = attr->uid; + else + inode->i_uid = d_inode(dentry->d_parent)->i_uid; + + if (attr->mode & EVENTFS_SAVE_GID) + inode->i_gid = attr->gid; + else + inode->i_gid = d_inode(dentry->d_parent)->i_gid; +} + +/** + * create_file - create a file in the tracefs filesystem + * @name: the name of the file to create. + * @mode: the permission that the file should have. + * @attr: saved attributes changed by user + * @parent: parent dentry for this file. + * @data: something that the caller will want to get to later on. + * @fop: struct file_operations that should be used for this file. + * + * This function creates a dentry that represents a file in the eventsfs_inode + * directory. The inode.i_private pointer will point to @data in the open() + * call. + */ +static struct dentry *create_file(const char *name, umode_t mode, + struct eventfs_attr *attr, + struct dentry *parent, void *data, + const struct file_operations *fop) +{ + struct tracefs_inode *ti; + struct dentry *dentry; + struct inode *inode; + + if (!(mode & S_IFMT)) + mode |= S_IFREG; + + if (WARN_ON_ONCE(!S_ISREG(mode))) + return NULL; + + WARN_ON_ONCE(!parent); + dentry = eventfs_start_creating(name, parent); + + if (IS_ERR(dentry)) + return dentry; + + inode = tracefs_get_inode(dentry->d_sb); + if (unlikely(!inode)) + return eventfs_failed_creating(dentry); + + /* If the user updated the directory's attributes, use them */ + update_inode_attr(dentry, inode, attr, mode); + + inode->i_op = &eventfs_file_inode_operations; + inode->i_fop = fop; + inode->i_private = data; + + ti = get_tracefs(inode); + ti->flags |= TRACEFS_EVENT_INODE; + d_instantiate(dentry, inode); + fsnotify_create(dentry->d_parent->d_inode, dentry); + return eventfs_end_creating(dentry); +}; + +/** + * create_dir - create a dir in the tracefs filesystem + * @ei: the eventfs_inode that represents the directory to create + * @parent: parent dentry for this file. + * + * This function will create a dentry for a directory represented by + * a eventfs_inode. + */ +static struct dentry *create_dir(struct eventfs_inode *ei, struct dentry *parent) +{ + struct tracefs_inode *ti; + struct dentry *dentry; + struct inode *inode; + + dentry = eventfs_start_creating(ei->name, parent); + if (IS_ERR(dentry)) + return dentry; + + inode = tracefs_get_inode(dentry->d_sb); + if (unlikely(!inode)) + return eventfs_failed_creating(dentry); + + /* If the user updated the directory's attributes, use them */ + update_inode_attr(dentry, inode, &ei->attr, + S_IFDIR | S_IRWXU | S_IRUGO | S_IXUGO); + + inode->i_op = &eventfs_root_dir_inode_operations; + inode->i_fop = &eventfs_file_operations; + + ti = get_tracefs(inode); + ti->flags |= TRACEFS_EVENT_INODE; + + inc_nlink(inode); + d_instantiate(dentry, inode); + inc_nlink(dentry->d_parent->d_inode); + fsnotify_mkdir(dentry->d_parent->d_inode, dentry); + return eventfs_end_creating(dentry); +} + +static void free_ei(struct eventfs_inode *ei) +{ + kfree_const(ei->name); + kfree(ei->d_children); + kfree(ei->entry_attrs); + kfree(ei); +} + +/** + * eventfs_set_ei_status_free - remove the dentry reference from an eventfs_inode + * @ti: the tracefs_inode of the dentry + * @dentry: dentry which has the reference to remove. + * + * Remove the association between a dentry from an eventfs_inode. + */ +void eventfs_set_ei_status_free(struct tracefs_inode *ti, struct dentry *dentry) +{ + struct eventfs_inode *ei; + int i; + + mutex_lock(&eventfs_mutex); + + ei = dentry->d_fsdata; + if (!ei) + goto out; + + /* This could belong to one of the files of the ei */ + if (ei->dentry != dentry) { + for (i = 0; i < ei->nr_entries; i++) { + if (ei->d_children[i] == dentry) + break; + } + if (WARN_ON_ONCE(i == ei->nr_entries)) + goto out; + ei->d_children[i] = NULL; + } else if (ei->is_freed) { + free_ei(ei); + } else { + ei->dentry = NULL; + } + + dentry->d_fsdata = NULL; + out: + mutex_unlock(&eventfs_mutex); +} + +/** + * create_file_dentry - create a dentry for a file of an eventfs_inode + * @ei: the eventfs_inode that the file will be created under + * @idx: the index into the d_children[] of the @ei + * @parent: The parent dentry of the created file. + * @name: The name of the file to create + * @mode: The mode of the file. + * @data: The data to use to set the inode of the file with on open() + * @fops: The fops of the file to be created. + * @lookup: If called by the lookup routine, in which case, dput() the created dentry. + * + * Create a dentry for a file of an eventfs_inode @ei and place it into the + * address located at @e_dentry. If the @e_dentry already has a dentry, then + * just do a dget() on it and return. Otherwise create the dentry and attach it. + */ +static struct dentry * +create_file_dentry(struct eventfs_inode *ei, int idx, + struct dentry *parent, const char *name, umode_t mode, void *data, + const struct file_operations *fops, bool lookup) +{ + struct eventfs_attr *attr = NULL; + struct dentry **e_dentry = &ei->d_children[idx]; + struct dentry *dentry; + + WARN_ON_ONCE(!inode_is_locked(parent->d_inode)); + + mutex_lock(&eventfs_mutex); + if (ei->is_freed) { + mutex_unlock(&eventfs_mutex); + return NULL; + } + /* If the e_dentry already has a dentry, use it */ + if (*e_dentry) { + /* lookup does not need to up the ref count */ + if (!lookup) + dget(*e_dentry); + mutex_unlock(&eventfs_mutex); + return *e_dentry; + } + + /* ei->entry_attrs are protected by SRCU */ + if (ei->entry_attrs) + attr = &ei->entry_attrs[idx]; + + mutex_unlock(&eventfs_mutex); + + dentry = create_file(name, mode, attr, parent, data, fops); + + mutex_lock(&eventfs_mutex); + + if (IS_ERR_OR_NULL(dentry)) { + /* + * When the mutex was released, something else could have + * created the dentry for this e_dentry. In which case + * use that one. + * + * If ei->is_freed is set, the e_dentry is currently on its + * way to being freed, don't return it. If e_dentry is NULL + * it means it was already freed. + */ + if (ei->is_freed) + dentry = NULL; + else + dentry = *e_dentry; + /* The lookup does not need to up the dentry refcount */ + if (dentry && !lookup) + dget(dentry); + mutex_unlock(&eventfs_mutex); + return dentry; + } + + if (!*e_dentry && !ei->is_freed) { + *e_dentry = dentry; + dentry->d_fsdata = ei; + } else { + /* + * Should never happen unless we get here due to being freed. + * Otherwise it means two dentries exist with the same name. + */ + WARN_ON_ONCE(!ei->is_freed); + dentry = NULL; + } + mutex_unlock(&eventfs_mutex); + + if (lookup) + dput(dentry); + + return dentry; +} + +/** + * eventfs_post_create_dir - post create dir routine + * @ei: eventfs_inode of recently created dir + * + * Map the meta-data of files within an eventfs dir to their parent dentry + */ +static void eventfs_post_create_dir(struct eventfs_inode *ei) +{ + struct eventfs_inode *ei_child; + struct tracefs_inode *ti; + + lockdep_assert_held(&eventfs_mutex); + + /* srcu lock already held */ + /* fill parent-child relation */ + list_for_each_entry_srcu(ei_child, &ei->children, list, + srcu_read_lock_held(&eventfs_srcu)) { + ei_child->d_parent = ei->dentry; + } + + ti = get_tracefs(ei->dentry->d_inode); + ti->private = ei; +} + +/** + * create_dir_dentry - Create a directory dentry for the eventfs_inode + * @pei: The eventfs_inode parent of ei. + * @ei: The eventfs_inode to create the directory for + * @parent: The dentry of the parent of this directory + * @lookup: True if this is called by the lookup code + * + * This creates and attaches a directory dentry to the eventfs_inode @ei. + */ +static struct dentry * +create_dir_dentry(struct eventfs_inode *pei, struct eventfs_inode *ei, + struct dentry *parent, bool lookup) +{ + struct dentry *dentry = NULL; + + WARN_ON_ONCE(!inode_is_locked(parent->d_inode)); + + mutex_lock(&eventfs_mutex); + if (pei->is_freed || ei->is_freed) { + mutex_unlock(&eventfs_mutex); + return NULL; + } + if (ei->dentry) { + /* If the dentry already has a dentry, use it */ + dentry = ei->dentry; + /* lookup does not need to up the ref count */ + if (!lookup) + dget(dentry); + mutex_unlock(&eventfs_mutex); + return dentry; + } + mutex_unlock(&eventfs_mutex); + + dentry = create_dir(ei, parent); + + mutex_lock(&eventfs_mutex); + + if (IS_ERR_OR_NULL(dentry) && !ei->is_freed) { + /* + * When the mutex was released, something else could have + * created the dentry for this e_dentry. In which case + * use that one. + * + * If ei->is_freed is set, the e_dentry is currently on its + * way to being freed. + */ + dentry = ei->dentry; + if (dentry && !lookup) + dget(dentry); + mutex_unlock(&eventfs_mutex); + return dentry; + } + + if (!ei->dentry && !ei->is_freed) { + ei->dentry = dentry; + eventfs_post_create_dir(ei); + dentry->d_fsdata = ei; + } else { + /* + * Should never happen unless we get here due to being freed. + * Otherwise it means two dentries exist with the same name. + */ + WARN_ON_ONCE(!ei->is_freed); + dentry = NULL; + } + mutex_unlock(&eventfs_mutex); + + if (lookup) + dput(dentry); + + return dentry; +} + +/** + * eventfs_root_lookup - lookup routine to create file/dir + * @dir: in which a lookup is being done + * @dentry: file/dir dentry + * @flags: Just passed to simple_lookup() + * + * Used to create dynamic file/dir with-in @dir, search with-in @ei + * list, if @dentry found go ahead and create the file/dir + */ + +static struct dentry *eventfs_root_lookup(struct inode *dir, + struct dentry *dentry, + unsigned int flags) +{ + const struct file_operations *fops; + const struct eventfs_entry *entry; + struct eventfs_inode *ei_child; + struct tracefs_inode *ti; + struct eventfs_inode *ei; + struct dentry *ei_dentry = NULL; + struct dentry *ret = NULL; + const char *name = dentry->d_name.name; + bool created = false; + umode_t mode; + void *data; + int idx; + int i; + int r; + + ti = get_tracefs(dir); + if (!(ti->flags & TRACEFS_EVENT_INODE)) + return NULL; + + /* Grab srcu to prevent the ei from going away */ + idx = srcu_read_lock(&eventfs_srcu); + + /* + * Grab the eventfs_mutex to consistent value from ti->private. + * This s + */ + mutex_lock(&eventfs_mutex); + ei = READ_ONCE(ti->private); + if (ei && !ei->is_freed) + ei_dentry = READ_ONCE(ei->dentry); + mutex_unlock(&eventfs_mutex); + + if (!ei || !ei_dentry) + goto out; + + data = ei->data; + + list_for_each_entry_srcu(ei_child, &ei->children, list, + srcu_read_lock_held(&eventfs_srcu)) { + if (strcmp(ei_child->name, name) != 0) + continue; + ret = simple_lookup(dir, dentry, flags); + if (IS_ERR(ret)) + goto out; + create_dir_dentry(ei, ei_child, ei_dentry, true); + created = true; + break; + } + + if (created) + goto out; + + for (i = 0; i < ei->nr_entries; i++) { + entry = &ei->entries[i]; + if (strcmp(name, entry->name) == 0) { + void *cdata = data; + mutex_lock(&eventfs_mutex); + /* If ei->is_freed, then the event itself may be too */ + if (!ei->is_freed) + r = entry->callback(name, &mode, &cdata, &fops); + else + r = -1; + mutex_unlock(&eventfs_mutex); + if (r <= 0) + continue; + ret = simple_lookup(dir, dentry, flags); + if (IS_ERR(ret)) + goto out; + create_file_dentry(ei, i, ei_dentry, name, mode, cdata, + fops, true); + break; + } + } + out: + srcu_read_unlock(&eventfs_srcu, idx); + return ret; +} + +struct dentry_list { + void *cursor; + struct dentry **dentries; +}; + +/** + * eventfs_release - called to release eventfs file/dir + * @inode: inode to be released + * @file: file to be released (not used) + */ +static int eventfs_release(struct inode *inode, struct file *file) +{ + struct tracefs_inode *ti; + struct dentry_list *dlist = file->private_data; + void *cursor; + int i; + + ti = get_tracefs(inode); + if (!(ti->flags & TRACEFS_EVENT_INODE)) + return -EINVAL; + + if (WARN_ON_ONCE(!dlist)) + return -EINVAL; + + for (i = 0; dlist->dentries && dlist->dentries[i]; i++) { + dput(dlist->dentries[i]); + } + + cursor = dlist->cursor; + kfree(dlist->dentries); + kfree(dlist); + file->private_data = cursor; + return dcache_dir_close(inode, file); +} + +static int add_dentries(struct dentry ***dentries, struct dentry *d, int cnt) +{ + struct dentry **tmp; + + tmp = krealloc(*dentries, sizeof(d) * (cnt + 2), GFP_NOFS); + if (!tmp) + return -1; + tmp[cnt] = d; + tmp[cnt + 1] = NULL; + *dentries = tmp; + return 0; +} + +/** + * dcache_dir_open_wrapper - eventfs open wrapper + * @inode: not used + * @file: dir to be opened (to create it's children) + * + * Used to dynamic create file/dir with-in @file, all the + * file/dir will be created. If already created then references + * will be increased + */ +static int dcache_dir_open_wrapper(struct inode *inode, struct file *file) +{ + const struct file_operations *fops; + const struct eventfs_entry *entry; + struct eventfs_inode *ei_child; + struct tracefs_inode *ti; + struct eventfs_inode *ei; + struct dentry_list *dlist; + struct dentry **dentries = NULL; + struct dentry *parent = file_dentry(file); + struct dentry *d; + struct inode *f_inode = file_inode(file); + const char *name = parent->d_name.name; + umode_t mode; + void *data; + int cnt = 0; + int idx; + int ret; + int i; + int r; + + ti = get_tracefs(f_inode); + if (!(ti->flags & TRACEFS_EVENT_INODE)) + return -EINVAL; + + if (WARN_ON_ONCE(file->private_data)) + return -EINVAL; + + idx = srcu_read_lock(&eventfs_srcu); + + mutex_lock(&eventfs_mutex); + ei = READ_ONCE(ti->private); + mutex_unlock(&eventfs_mutex); + + if (!ei) { + srcu_read_unlock(&eventfs_srcu, idx); + return -EINVAL; + } + + + data = ei->data; + + dlist = kmalloc(sizeof(*dlist), GFP_KERNEL); + if (!dlist) { + srcu_read_unlock(&eventfs_srcu, idx); + return -ENOMEM; + } + + inode_lock(parent->d_inode); + list_for_each_entry_srcu(ei_child, &ei->children, list, + srcu_read_lock_held(&eventfs_srcu)) { + d = create_dir_dentry(ei, ei_child, parent, false); + if (d) { + ret = add_dentries(&dentries, d, cnt); + if (ret < 0) + break; + cnt++; + } + } + + for (i = 0; i < ei->nr_entries; i++) { + void *cdata = data; + entry = &ei->entries[i]; + name = entry->name; + mutex_lock(&eventfs_mutex); + /* If ei->is_freed, then the event itself may be too */ + if (!ei->is_freed) + r = entry->callback(name, &mode, &cdata, &fops); + else + r = -1; + mutex_unlock(&eventfs_mutex); + if (r <= 0) + continue; + d = create_file_dentry(ei, i, parent, name, mode, cdata, fops, false); + if (d) { + ret = add_dentries(&dentries, d, cnt); + if (ret < 0) + break; + cnt++; + } + } + inode_unlock(parent->d_inode); + srcu_read_unlock(&eventfs_srcu, idx); + ret = dcache_dir_open(inode, file); + + /* + * dcache_dir_open() sets file->private_data to a dentry cursor. + * Need to save that but also save all the dentries that were + * opened by this function. + */ + dlist->cursor = file->private_data; + dlist->dentries = dentries; + file->private_data = dlist; + return ret; +} + +/* + * This just sets the file->private_data back to the cursor and back. + */ +static int dcache_readdir_wrapper(struct file *file, struct dir_context *ctx) +{ + struct dentry_list *dlist = file->private_data; + int ret; + + file->private_data = dlist->cursor; + ret = dcache_readdir(file, ctx); + dlist->cursor = file->private_data; + file->private_data = dlist; + return ret; +} + +/** + * eventfs_create_dir - Create the eventfs_inode for this directory + * @name: The name of the directory to create. + * @parent: The eventfs_inode of the parent directory. + * @entries: A list of entries that represent the files under this directory + * @size: The number of @entries + * @data: The default data to pass to the files (an entry may override it). + * + * This function creates the descriptor to represent a directory in the + * eventfs. This descriptor is an eventfs_inode, and it is returned to be + * used to create other children underneath. + * + * The @entries is an array of eventfs_entry structures which has: + * const char *name + * eventfs_callback callback; + * + * The name is the name of the file, and the callback is a pointer to a function + * that will be called when the file is reference (either by lookup or by + * reading a directory). The callback is of the prototype: + * + * int callback(const char *name, umode_t *mode, void **data, + * const struct file_operations **fops); + * + * When a file needs to be created, this callback will be called with + * name = the name of the file being created (so that the same callback + * may be used for multiple files). + * mode = a place to set the file's mode + * data = A pointer to @data, and the callback may replace it, which will + * cause the file created to pass the new data to the open() call. + * fops = the fops to use for the created file. + * + * NB. @callback is called while holding internal locks of the eventfs + * system. The callback must not call any code that might also call into + * the tracefs or eventfs system or it will risk creating a deadlock. + */ +struct eventfs_inode *eventfs_create_dir(const char *name, struct eventfs_inode *parent, + const struct eventfs_entry *entries, + int size, void *data) +{ + struct eventfs_inode *ei; + + if (!parent) + return ERR_PTR(-EINVAL); + + ei = kzalloc(sizeof(*ei), GFP_KERNEL); + if (!ei) + return ERR_PTR(-ENOMEM); + + ei->name = kstrdup_const(name, GFP_KERNEL); + if (!ei->name) { + kfree(ei); + return ERR_PTR(-ENOMEM); + } + + if (size) { + ei->d_children = kzalloc(sizeof(*ei->d_children) * size, GFP_KERNEL); + if (!ei->d_children) { + kfree_const(ei->name); + kfree(ei); + return ERR_PTR(-ENOMEM); + } + } + + ei->entries = entries; + ei->nr_entries = size; + ei->data = data; + INIT_LIST_HEAD(&ei->children); + INIT_LIST_HEAD(&ei->list); + + mutex_lock(&eventfs_mutex); + if (!parent->is_freed) { + list_add_tail(&ei->list, &parent->children); + ei->d_parent = parent->dentry; + } + mutex_unlock(&eventfs_mutex); + + /* Was the parent freed? */ + if (list_empty(&ei->list)) { + free_ei(ei); + ei = NULL; + } + return ei; +} + +/** + * eventfs_create_events_dir - create the top level events directory + * @name: The name of the top level directory to create. + * @parent: Parent dentry for this file in the tracefs directory. + * @entries: A list of entries that represent the files under this directory + * @size: The number of @entries + * @data: The default data to pass to the files (an entry may override it). + * + * This function creates the top of the trace event directory. + * + * See eventfs_create_dir() for use of @entries. + */ +struct eventfs_inode *eventfs_create_events_dir(const char *name, struct dentry *parent, + const struct eventfs_entry *entries, + int size, void *data) +{ + struct dentry *dentry = tracefs_start_creating(name, parent); + struct eventfs_inode *ei; + struct tracefs_inode *ti; + struct inode *inode; + + if (security_locked_down(LOCKDOWN_TRACEFS)) + return NULL; + + if (IS_ERR(dentry)) + return ERR_CAST(dentry); + + ei = kzalloc(sizeof(*ei), GFP_KERNEL); + if (!ei) + goto fail_ei; + + inode = tracefs_get_inode(dentry->d_sb); + if (unlikely(!inode)) + goto fail; + + if (size) { + ei->d_children = kzalloc(sizeof(*ei->d_children) * size, GFP_KERNEL); + if (!ei->d_children) + goto fail; + } + + ei->dentry = dentry; + ei->entries = entries; + ei->nr_entries = size; + ei->data = data; + ei->name = kstrdup_const(name, GFP_KERNEL); + if (!ei->name) + goto fail; + + INIT_LIST_HEAD(&ei->children); + INIT_LIST_HEAD(&ei->list); + + ti = get_tracefs(inode); + ti->flags |= TRACEFS_EVENT_INODE | TRACEFS_EVENT_TOP_INODE; + ti->private = ei; + + inode->i_mode = S_IFDIR | S_IRWXU | S_IRUGO | S_IXUGO; + inode->i_op = &eventfs_root_dir_inode_operations; + inode->i_fop = &eventfs_file_operations; + + dentry->d_fsdata = ei; + + /* directory inodes start off with i_nlink == 2 (for "." entry) */ + inc_nlink(inode); + d_instantiate(dentry, inode); + inc_nlink(dentry->d_parent->d_inode); + fsnotify_mkdir(dentry->d_parent->d_inode, dentry); + tracefs_end_creating(dentry); + + return ei; + + fail: + kfree(ei->d_children); + kfree(ei); + fail_ei: + tracefs_failed_creating(dentry); + return ERR_PTR(-ENOMEM); +} + +static LLIST_HEAD(free_list); + +static void eventfs_workfn(struct work_struct *work) +{ + struct eventfs_inode *ei, *tmp; + struct llist_node *llnode; + + llnode = llist_del_all(&free_list); + llist_for_each_entry_safe(ei, tmp, llnode, llist) { + /* This dput() matches the dget() from unhook_dentry() */ + for (int i = 0; i < ei->nr_entries; i++) { + if (ei->d_children[i]) + dput(ei->d_children[i]); + } + /* This should only get here if it had a dentry */ + if (!WARN_ON_ONCE(!ei->dentry)) + dput(ei->dentry); + } +} + +static DECLARE_WORK(eventfs_work, eventfs_workfn); + +static void free_rcu_ei(struct rcu_head *head) +{ + struct eventfs_inode *ei = container_of(head, struct eventfs_inode, rcu); + + if (ei->dentry) { + /* Do not free the ei until all references of dentry are gone */ + if (llist_add(&ei->llist, &free_list)) + queue_work(system_unbound_wq, &eventfs_work); + return; + } + + /* If the ei doesn't have a dentry, neither should its children */ + for (int i = 0; i < ei->nr_entries; i++) { + WARN_ON_ONCE(ei->d_children[i]); + } + + free_ei(ei); +} + +static void unhook_dentry(struct dentry *dentry) +{ + if (!dentry) + return; + /* + * Need to add a reference to the dentry that is expected by + * simple_recursive_removal(), which will include a dput(). + */ + dget(dentry); + + /* + * Also add a reference for the dput() in eventfs_workfn(). + * That is required as that dput() will free the ei after + * the SRCU grace period is over. + */ + dget(dentry); +} + +/** + * eventfs_remove_rec - remove eventfs dir or file from list + * @ei: eventfs_inode to be removed. + * @level: prevent recursion from going more than 3 levels deep. + * + * This function recursively removes eventfs_inodes which + * contains info of files and/or directories. + */ +static void eventfs_remove_rec(struct eventfs_inode *ei, int level) +{ + struct eventfs_inode *ei_child; + + if (!ei) + return; + /* + * Check recursion depth. It should never be greater than 3: + * 0 - events/ + * 1 - events/group/ + * 2 - events/group/event/ + * 3 - events/group/event/file + */ + if (WARN_ON_ONCE(level > 3)) + return; + + /* search for nested folders or files */ + list_for_each_entry_srcu(ei_child, &ei->children, list, + lockdep_is_held(&eventfs_mutex)) { + /* Children only have dentry if parent does */ + WARN_ON_ONCE(ei_child->dentry && !ei->dentry); + eventfs_remove_rec(ei_child, level + 1); + } + + + ei->is_freed = 1; + + for (int i = 0; i < ei->nr_entries; i++) { + if (ei->d_children[i]) { + /* Children only have dentry if parent does */ + WARN_ON_ONCE(!ei->dentry); + unhook_dentry(ei->d_children[i]); + } + } + + unhook_dentry(ei->dentry); + + list_del_rcu(&ei->list); + call_srcu(&eventfs_srcu, &ei->rcu, free_rcu_ei); +} + +/** + * eventfs_remove_dir - remove eventfs dir or file from list + * @ei: eventfs_inode to be removed. + * + * This function acquire the eventfs_mutex lock and call eventfs_remove_rec() + */ +void eventfs_remove_dir(struct eventfs_inode *ei) +{ + struct dentry *dentry; + + if (!ei) + return; + + mutex_lock(&eventfs_mutex); + dentry = ei->dentry; + eventfs_remove_rec(ei, 0); + mutex_unlock(&eventfs_mutex); + + /* + * If any of the ei children has a dentry, then the ei itself + * must have a dentry. + */ + if (dentry) + simple_recursive_removal(dentry, NULL); +} + +/** + * eventfs_remove_events_dir - remove the top level eventfs directory + * @ei: the event_inode returned by eventfs_create_events_dir(). + * + * This function removes the events main directory + */ +void eventfs_remove_events_dir(struct eventfs_inode *ei) +{ + struct dentry *dentry; + + dentry = ei->dentry; + eventfs_remove_dir(ei); + + /* + * Matches the dget() done by tracefs_start_creating() + * in eventfs_create_events_dir() when it the dentry was + * created. In other words, it's a normal dentry that + * sticks around while the other ei->dentry are created + * and destroyed dynamically. + */ + dput(dentry); +} diff --git a/fs/tracefs/inode.c b/fs/tracefs/inode.c index 57ac8aa4a724..ae648deed019 100644 --- a/fs/tracefs/inode.c +++ b/fs/tracefs/inode.c @@ -21,13 +21,33 @@ #include <linux/parser.h> #include <linux/magic.h> #include <linux/slab.h> +#include "internal.h" #define TRACEFS_DEFAULT_MODE 0700 +static struct kmem_cache *tracefs_inode_cachep __ro_after_init; static struct vfsmount *tracefs_mount; static int tracefs_mount_count; static bool tracefs_registered; +static struct inode *tracefs_alloc_inode(struct super_block *sb) +{ + struct tracefs_inode *ti; + + ti = kmem_cache_alloc(tracefs_inode_cachep, GFP_KERNEL); + if (!ti) + return NULL; + + ti->flags = 0; + + return &ti->vfs_inode; +} + +static void tracefs_free_inode(struct inode *inode) +{ + kmem_cache_free(tracefs_inode_cachep, get_tracefs(inode)); +} + static ssize_t default_read_file(struct file *file, char __user *buf, size_t count, loff_t *ppos) { @@ -127,12 +147,12 @@ static const struct inode_operations tracefs_dir_inode_operations = { .rmdir = tracefs_syscall_rmdir, }; -static struct inode *tracefs_get_inode(struct super_block *sb) +struct inode *tracefs_get_inode(struct super_block *sb) { struct inode *inode = new_inode(sb); if (inode) { inode->i_ino = get_next_ino(); - inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode); + simple_inode_init_ts(inode); } return inode; } @@ -290,6 +310,7 @@ static int tracefs_apply_options(struct super_block *sb, bool remount) struct tracefs_fs_info *fsi = sb->s_fs_info; struct inode *inode = d_inode(sb->s_root); struct tracefs_mount_opts *opts = &fsi->mount_opts; + umode_t tmp_mode; /* * On remount, only reset mode/uid/gid if they were provided as mount @@ -297,8 +318,9 @@ static int tracefs_apply_options(struct super_block *sb, bool remount) */ if (!remount || opts->opts & BIT(Opt_mode)) { - inode->i_mode &= ~S_IALLUGO; - inode->i_mode |= opts->mode; + tmp_mode = READ_ONCE(inode->i_mode) & ~S_IALLUGO; + tmp_mode |= opts->mode; + WRITE_ONCE(inode->i_mode, tmp_mode); } if (!remount || opts->opts & BIT(Opt_uid)) @@ -346,11 +368,31 @@ static int tracefs_show_options(struct seq_file *m, struct dentry *root) } static const struct super_operations tracefs_super_operations = { + .alloc_inode = tracefs_alloc_inode, + .free_inode = tracefs_free_inode, + .drop_inode = generic_delete_inode, .statfs = simple_statfs, .remount_fs = tracefs_remount, .show_options = tracefs_show_options, }; +static void tracefs_dentry_iput(struct dentry *dentry, struct inode *inode) +{ + struct tracefs_inode *ti; + + if (!dentry || !inode) + return; + + ti = get_tracefs(inode); + if (ti && ti->flags & TRACEFS_EVENT_INODE) + eventfs_set_ei_status_free(ti, dentry); + iput(inode); +} + +static const struct dentry_operations tracefs_dentry_operations = { + .d_iput = tracefs_dentry_iput, +}; + static int trace_fill_super(struct super_block *sb, void *data, int silent) { static const struct tree_descr trace_files[] = {{""}}; @@ -373,6 +415,7 @@ static int trace_fill_super(struct super_block *sb, void *data, int silent) goto fail; sb->s_op = &tracefs_super_operations; + sb->s_d_op = &tracefs_dentry_operations; tracefs_apply_options(sb, false); @@ -399,7 +442,7 @@ static struct file_system_type trace_fs_type = { }; MODULE_ALIAS_FS("tracefs"); -static struct dentry *start_creating(const char *name, struct dentry *parent) +struct dentry *tracefs_start_creating(const char *name, struct dentry *parent) { struct dentry *dentry; int error; @@ -437,7 +480,7 @@ static struct dentry *start_creating(const char *name, struct dentry *parent) return dentry; } -static struct dentry *failed_creating(struct dentry *dentry) +struct dentry *tracefs_failed_creating(struct dentry *dentry) { inode_unlock(d_inode(dentry->d_parent)); dput(dentry); @@ -445,13 +488,82 @@ static struct dentry *failed_creating(struct dentry *dentry) return NULL; } -static struct dentry *end_creating(struct dentry *dentry) +struct dentry *tracefs_end_creating(struct dentry *dentry) { inode_unlock(d_inode(dentry->d_parent)); return dentry; } /** + * eventfs_start_creating - start the process of creating a dentry + * @name: Name of the file created for the dentry + * @parent: The parent dentry where this dentry will be created + * + * This is a simple helper function for the dynamically created eventfs + * files. When the directory of the eventfs files are accessed, their + * dentries are created on the fly. This function is used to start that + * process. + */ +struct dentry *eventfs_start_creating(const char *name, struct dentry *parent) +{ + struct dentry *dentry; + int error; + + /* Must always have a parent. */ + if (WARN_ON_ONCE(!parent)) + return ERR_PTR(-EINVAL); + + error = simple_pin_fs(&trace_fs_type, &tracefs_mount, + &tracefs_mount_count); + if (error) + return ERR_PTR(error); + + if (unlikely(IS_DEADDIR(parent->d_inode))) + dentry = ERR_PTR(-ENOENT); + else + dentry = lookup_one_len(name, parent, strlen(name)); + + if (!IS_ERR(dentry) && dentry->d_inode) { + dput(dentry); + dentry = ERR_PTR(-EEXIST); + } + + if (IS_ERR(dentry)) + simple_release_fs(&tracefs_mount, &tracefs_mount_count); + + return dentry; +} + +/** + * eventfs_failed_creating - clean up a failed eventfs dentry creation + * @dentry: The dentry to clean up + * + * If after calling eventfs_start_creating(), a failure is detected, the + * resources created by eventfs_start_creating() needs to be cleaned up. In + * that case, this function should be called to perform that clean up. + */ +struct dentry *eventfs_failed_creating(struct dentry *dentry) +{ + dput(dentry); + simple_release_fs(&tracefs_mount, &tracefs_mount_count); + return NULL; +} + +/** + * eventfs_end_creating - Finish the process of creating a eventfs dentry + * @dentry: The dentry that has successfully been created. + * + * This function is currently just a place holder to match + * eventfs_start_creating(). In case any synchronization needs to be added, + * this function will be used to implement that without having to modify + * the callers of eventfs_start_creating(). + */ +struct dentry *eventfs_end_creating(struct dentry *dentry) +{ + return dentry; +} + +/** * tracefs_create_file - create a file in the tracefs filesystem * @name: a pointer to a string containing the name of the file to create. * @mode: the permission that the file should have. @@ -490,14 +602,14 @@ struct dentry *tracefs_create_file(const char *name, umode_t mode, if (!(mode & S_IFMT)) mode |= S_IFREG; BUG_ON(!S_ISREG(mode)); - dentry = start_creating(name, parent); + dentry = tracefs_start_creating(name, parent); if (IS_ERR(dentry)) return NULL; inode = tracefs_get_inode(dentry->d_sb); if (unlikely(!inode)) - return failed_creating(dentry); + return tracefs_failed_creating(dentry); inode->i_mode = mode; inode->i_fop = fops ? fops : &tracefs_file_operations; @@ -506,13 +618,13 @@ struct dentry *tracefs_create_file(const char *name, umode_t mode, inode->i_gid = d_inode(dentry->d_parent)->i_gid; d_instantiate(dentry, inode); fsnotify_create(d_inode(dentry->d_parent), dentry); - return end_creating(dentry); + return tracefs_end_creating(dentry); } static struct dentry *__create_dir(const char *name, struct dentry *parent, const struct inode_operations *ops) { - struct dentry *dentry = start_creating(name, parent); + struct dentry *dentry = tracefs_start_creating(name, parent); struct inode *inode; if (IS_ERR(dentry)) @@ -520,7 +632,7 @@ static struct dentry *__create_dir(const char *name, struct dentry *parent, inode = tracefs_get_inode(dentry->d_sb); if (unlikely(!inode)) - return failed_creating(dentry); + return tracefs_failed_creating(dentry); /* Do not set bits for OTH */ inode->i_mode = S_IFDIR | S_IRWXU | S_IRUSR| S_IRGRP | S_IXUSR | S_IXGRP; @@ -534,7 +646,7 @@ static struct dentry *__create_dir(const char *name, struct dentry *parent, d_instantiate(dentry, inode); inc_nlink(d_inode(dentry->d_parent)); fsnotify_mkdir(d_inode(dentry->d_parent), dentry); - return end_creating(dentry); + return tracefs_end_creating(dentry); } /** @@ -556,6 +668,9 @@ static struct dentry *__create_dir(const char *name, struct dentry *parent, */ struct dentry *tracefs_create_dir(const char *name, struct dentry *parent) { + if (security_locked_down(LOCKDOWN_TRACEFS)) + return NULL; + return __create_dir(name, parent, &simple_dir_inode_operations); } @@ -628,10 +743,26 @@ bool tracefs_initialized(void) return tracefs_registered; } +static void init_once(void *foo) +{ + struct tracefs_inode *ti = (struct tracefs_inode *) foo; + + inode_init_once(&ti->vfs_inode); +} + static int __init tracefs_init(void) { int retval; + tracefs_inode_cachep = kmem_cache_create("tracefs_inode_cache", + sizeof(struct tracefs_inode), + 0, (SLAB_RECLAIM_ACCOUNT| + SLAB_MEM_SPREAD| + SLAB_ACCOUNT), + init_once); + if (!tracefs_inode_cachep) + return -ENOMEM; + retval = sysfs_create_mount_point(kernel_kobj, "tracing"); if (retval) return -EINVAL; diff --git a/fs/tracefs/internal.h b/fs/tracefs/internal.h new file mode 100644 index 000000000000..ccee18ca66c7 --- /dev/null +++ b/fs/tracefs/internal.h @@ -0,0 +1,82 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _TRACEFS_INTERNAL_H +#define _TRACEFS_INTERNAL_H + +enum { + TRACEFS_EVENT_INODE = BIT(1), + TRACEFS_EVENT_TOP_INODE = BIT(2), +}; + +struct tracefs_inode { + unsigned long flags; + void *private; + struct inode vfs_inode; +}; + +/* + * struct eventfs_attr - cache the mode and ownership of a eventfs entry + * @mode: saved mode plus flags of what is saved + * @uid: saved uid if changed + * @gid: saved gid if changed + */ +struct eventfs_attr { + int mode; + kuid_t uid; + kgid_t gid; +}; + +/* + * struct eventfs_inode - hold the properties of the eventfs directories. + * @list: link list into the parent directory + * @entries: the array of entries representing the files in the directory + * @name: the name of the directory to create + * @children: link list into the child eventfs_inode + * @dentry: the dentry of the directory + * @d_parent: pointer to the parent's dentry + * @d_children: The array of dentries to represent the files when created + * @entry_attrs: Saved mode and ownership of the @d_children + * @attr: Saved mode and ownership of eventfs_inode itself + * @data: The private data to pass to the callbacks + * @is_freed: Flag set if the eventfs is on its way to be freed + * Note if is_freed is set, then dentry is corrupted. + * @nr_entries: The number of items in @entries + */ +struct eventfs_inode { + struct list_head list; + const struct eventfs_entry *entries; + const char *name; + struct list_head children; + struct dentry *dentry; /* Check is_freed to access */ + struct dentry *d_parent; + struct dentry **d_children; + struct eventfs_attr *entry_attrs; + struct eventfs_attr attr; + void *data; + /* + * Union - used for deletion + * @llist: for calling dput() if needed after RCU + * @rcu: eventfs_inode to delete in RCU + */ + union { + struct llist_node llist; + struct rcu_head rcu; + }; + unsigned int is_freed:1; + unsigned int nr_entries:31; +}; + +static inline struct tracefs_inode *get_tracefs(const struct inode *inode) +{ + return container_of(inode, struct tracefs_inode, vfs_inode); +} + +struct dentry *tracefs_start_creating(const char *name, struct dentry *parent); +struct dentry *tracefs_end_creating(struct dentry *dentry); +struct dentry *tracefs_failed_creating(struct dentry *dentry); +struct inode *tracefs_get_inode(struct super_block *sb); +struct dentry *eventfs_start_creating(const char *name, struct dentry *parent); +struct dentry *eventfs_failed_creating(struct dentry *dentry); +struct dentry *eventfs_end_creating(struct dentry *dentry); +void eventfs_set_ei_status_free(struct tracefs_inode *ti, struct dentry *dentry); + +#endif /* _TRACEFS_INTERNAL_H */ diff --git a/fs/ubifs/auth.c b/fs/ubifs/auth.c index e564d5ff8781..0d561ecb6869 100644 --- a/fs/ubifs/auth.c +++ b/fs/ubifs/auth.c @@ -9,10 +9,9 @@ * This file implements various helper functions for UBIFS authentication support */ -#include <linux/crypto.h> #include <linux/verification.h> #include <crypto/hash.h> -#include <crypto/algapi.h> +#include <crypto/utils.h> #include <keys/user-type.h> #include <keys/asymmetric-type.h> diff --git a/fs/ubifs/crypto.c b/fs/ubifs/crypto.c index 3125e76376ee..921f9033d0d2 100644 --- a/fs/ubifs/crypto.c +++ b/fs/ubifs/crypto.c @@ -88,8 +88,7 @@ int ubifs_decrypt(const struct inode *inode, struct ubifs_data_node *dn, } const struct fscrypt_operations ubifs_crypt_operations = { - .flags = FS_CFLG_OWN_PAGES, - .key_prefix = "ubifs:", + .legacy_key_prefix = "ubifs:", .get_context = ubifs_crypt_get_context, .set_context = ubifs_crypt_set_context, .empty_dir = ubifs_crypt_empty_dir, diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c index 9c9d3f0e36a4..d013c5b3f1ed 100644 --- a/fs/ubifs/debug.c +++ b/fs/ubifs/debug.c @@ -237,14 +237,14 @@ void ubifs_dump_inode(struct ubifs_info *c, const struct inode *inode) pr_err("\tuid %u\n", (unsigned int)i_uid_read(inode)); pr_err("\tgid %u\n", (unsigned int)i_gid_read(inode)); pr_err("\tatime %u.%u\n", - (unsigned int)inode->i_atime.tv_sec, - (unsigned int)inode->i_atime.tv_nsec); + (unsigned int) inode_get_atime_sec(inode), + (unsigned int) inode_get_atime_nsec(inode)); pr_err("\tmtime %u.%u\n", - (unsigned int)inode->i_mtime.tv_sec, - (unsigned int)inode->i_mtime.tv_nsec); + (unsigned int) inode_get_mtime_sec(inode), + (unsigned int) inode_get_mtime_nsec(inode)); pr_err("\tctime %u.%u\n", - (unsigned int)inode->i_ctime.tv_sec, - (unsigned int)inode->i_ctime.tv_nsec); + (unsigned int) inode_get_ctime_sec(inode), + (unsigned int) inode_get_ctime_nsec(inode)); pr_err("\tcreat_sqnum %llu\n", ui->creat_sqnum); pr_err("\txattr_size %u\n", ui->xattr_size); pr_err("\txattr_cnt %u\n", ui->xattr_cnt); diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c index ef0499edc248..3b13c648d490 100644 --- a/fs/ubifs/dir.c +++ b/fs/ubifs/dir.c @@ -96,8 +96,7 @@ struct inode *ubifs_new_inode(struct ubifs_info *c, struct inode *dir, inode->i_flags |= S_NOCMTIME; inode_init_owner(&nop_mnt_idmap, inode, dir, mode); - inode->i_mtime = inode->i_atime = inode->i_ctime = - current_time(inode); + simple_inode_init_ts(inode); inode->i_mapping->nrpages = 0; if (!is_xattr) { @@ -325,7 +324,8 @@ static int ubifs_create(struct mnt_idmap *idmap, struct inode *dir, mutex_lock(&dir_ui->ui_mutex); dir->i_size += sz_change; dir_ui->ui_size = dir->i_size; - dir->i_mtime = dir->i_ctime = inode->i_ctime; + inode_set_mtime_to_ts(dir, + inode_set_ctime_to_ts(dir, inode_get_ctime(inode))); err = ubifs_jnl_update(c, dir, &nm, inode, 0, 0); if (err) goto out_cancel; @@ -725,7 +725,7 @@ static int ubifs_link(struct dentry *old_dentry, struct inode *dir, struct inode *inode = d_inode(old_dentry); struct ubifs_inode *ui = ubifs_inode(inode); struct ubifs_inode *dir_ui = ubifs_inode(dir); - int err, sz_change = CALC_DENT_SIZE(dentry->d_name.len); + int err, sz_change; struct ubifs_budget_req req = { .new_dent = 1, .dirtied_ino = 2, .dirtied_ino_d = ALIGN(ui->data_len, 8) }; struct fscrypt_name nm; @@ -749,6 +749,8 @@ static int ubifs_link(struct dentry *old_dentry, struct inode *dir, if (err) return err; + sz_change = CALC_DENT_SIZE(fname_len(&nm)); + err = dbg_check_synced_i_size(c, inode); if (err) goto out_fname; @@ -765,10 +767,11 @@ static int ubifs_link(struct dentry *old_dentry, struct inode *dir, inc_nlink(inode); ihold(inode); - inode->i_ctime = current_time(inode); + inode_set_ctime_current(inode); dir->i_size += sz_change; dir_ui->ui_size = dir->i_size; - dir->i_mtime = dir->i_ctime = inode->i_ctime; + inode_set_mtime_to_ts(dir, + inode_set_ctime_to_ts(dir, inode_get_ctime(inode))); err = ubifs_jnl_update(c, dir, &nm, inode, 0, 0); if (err) goto out_cancel; @@ -838,11 +841,12 @@ static int ubifs_unlink(struct inode *dir, struct dentry *dentry) } lock_2_inodes(dir, inode); - inode->i_ctime = current_time(dir); + inode_set_ctime_current(inode); drop_nlink(inode); dir->i_size -= sz_change; dir_ui->ui_size = dir->i_size; - dir->i_mtime = dir->i_ctime = inode->i_ctime; + inode_set_mtime_to_ts(dir, + inode_set_ctime_to_ts(dir, inode_get_ctime(inode))); err = ubifs_jnl_update(c, dir, &nm, inode, 1, 0); if (err) goto out_cancel; @@ -940,12 +944,13 @@ static int ubifs_rmdir(struct inode *dir, struct dentry *dentry) } lock_2_inodes(dir, inode); - inode->i_ctime = current_time(dir); + inode_set_ctime_current(inode); clear_nlink(inode); drop_nlink(dir); dir->i_size -= sz_change; dir_ui->ui_size = dir->i_size; - dir->i_mtime = dir->i_ctime = inode->i_ctime; + inode_set_mtime_to_ts(dir, + inode_set_ctime_to_ts(dir, inode_get_ctime(inode))); err = ubifs_jnl_update(c, dir, &nm, inode, 1, 0); if (err) goto out_cancel; @@ -1019,7 +1024,8 @@ static int ubifs_mkdir(struct mnt_idmap *idmap, struct inode *dir, inc_nlink(dir); dir->i_size += sz_change; dir_ui->ui_size = dir->i_size; - dir->i_mtime = dir->i_ctime = inode->i_ctime; + inode_set_mtime_to_ts(dir, + inode_set_ctime_to_ts(dir, inode_get_ctime(inode))); err = ubifs_jnl_update(c, dir, &nm, inode, 0, 0); if (err) { ubifs_err(c, "cannot create directory, error %d", err); @@ -1110,7 +1116,8 @@ static int ubifs_mknod(struct mnt_idmap *idmap, struct inode *dir, mutex_lock(&dir_ui->ui_mutex); dir->i_size += sz_change; dir_ui->ui_size = dir->i_size; - dir->i_mtime = dir->i_ctime = inode->i_ctime; + inode_set_mtime_to_ts(dir, + inode_set_ctime_to_ts(dir, inode_get_ctime(inode))); err = ubifs_jnl_update(c, dir, &nm, inode, 0, 0); if (err) goto out_cancel; @@ -1210,7 +1217,8 @@ static int ubifs_symlink(struct mnt_idmap *idmap, struct inode *dir, mutex_lock(&dir_ui->ui_mutex); dir->i_size += sz_change; dir_ui->ui_size = dir->i_size; - dir->i_mtime = dir->i_ctime = inode->i_ctime; + inode_set_mtime_to_ts(dir, + inode_set_ctime_to_ts(dir, inode_get_ctime(inode))); err = ubifs_jnl_update(c, dir, &nm, inode, 0, 0); if (err) goto out_cancel; @@ -1298,7 +1306,6 @@ static int do_rename(struct inode *old_dir, struct dentry *old_dentry, struct ubifs_budget_req ino_req = { .dirtied_ino = 1, .dirtied_ino_d = ALIGN(old_inode_ui->data_len, 8) }; struct ubifs_budget_req wht_req; - struct timespec64 time; unsigned int saved_nlink; struct fscrypt_name old_nm, new_nm; @@ -1414,8 +1421,7 @@ static int do_rename(struct inode *old_dir, struct dentry *old_dentry, * Like most other Unix systems, set the @i_ctime for inodes on a * rename. */ - time = current_time(old_dir); - old_inode->i_ctime = time; + simple_rename_timestamp(old_dir, old_dentry, new_dir, new_dentry); /* We must adjust parent link count when renaming directories */ if (is_dir) { @@ -1444,13 +1450,11 @@ static int do_rename(struct inode *old_dir, struct dentry *old_dentry, old_dir->i_size -= old_sz; ubifs_inode(old_dir)->ui_size = old_dir->i_size; - old_dir->i_mtime = old_dir->i_ctime = time; - new_dir->i_mtime = new_dir->i_ctime = time; /* * And finally, if we unlinked a direntry which happened to have the * same name as the moved direntry, we have to decrement @i_nlink of - * the unlinked inode and change its ctime. + * the unlinked inode. */ if (unlink) { /* @@ -1462,7 +1466,6 @@ static int do_rename(struct inode *old_dir, struct dentry *old_dentry, clear_nlink(new_inode); else drop_nlink(new_inode); - new_inode->i_ctime = time; } else { new_dir->i_size += new_sz; ubifs_inode(new_dir)->ui_size = new_dir->i_size; @@ -1557,7 +1560,6 @@ static int ubifs_xrename(struct inode *old_dir, struct dentry *old_dentry, int sync = IS_DIRSYNC(old_dir) || IS_DIRSYNC(new_dir); struct inode *fst_inode = d_inode(old_dentry); struct inode *snd_inode = d_inode(new_dentry); - struct timespec64 time; int err; struct fscrypt_name fst_nm, snd_nm; @@ -1588,11 +1590,7 @@ static int ubifs_xrename(struct inode *old_dir, struct dentry *old_dentry, lock_4_inodes(old_dir, new_dir, NULL, NULL); - time = current_time(old_dir); - fst_inode->i_ctime = time; - snd_inode->i_ctime = time; - old_dir->i_mtime = old_dir->i_ctime = time; - new_dir->i_mtime = new_dir->i_ctime = time; + simple_rename_timestamp(old_dir, old_dentry, new_dir, new_dentry); if (old_dir != new_dir) { if (S_ISDIR(fst_inode->i_mode) && !S_ISDIR(snd_inode->i_mode)) { @@ -1665,7 +1663,7 @@ int ubifs_getattr(struct mnt_idmap *idmap, const struct path *path, STATX_ATTR_ENCRYPTED | STATX_ATTR_IMMUTABLE); - generic_fillattr(&nop_mnt_idmap, inode, stat); + generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat); stat->blksize = UBIFS_BLOCK_SIZE; stat->size = ui->ui_size; diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c index 6738fe43040b..2d2b39f843ce 100644 --- a/fs/ubifs/file.c +++ b/fs/ubifs/file.c @@ -1088,11 +1088,11 @@ static void do_attr_changes(struct inode *inode, const struct iattr *attr) if (attr->ia_valid & ATTR_GID) inode->i_gid = attr->ia_gid; if (attr->ia_valid & ATTR_ATIME) - inode->i_atime = attr->ia_atime; + inode_set_atime_to_ts(inode, attr->ia_atime); if (attr->ia_valid & ATTR_MTIME) - inode->i_mtime = attr->ia_mtime; + inode_set_mtime_to_ts(inode, attr->ia_mtime); if (attr->ia_valid & ATTR_CTIME) - inode->i_ctime = attr->ia_ctime; + inode_set_ctime_to_ts(inode, attr->ia_ctime); if (attr->ia_valid & ATTR_MODE) { umode_t mode = attr->ia_mode; @@ -1192,7 +1192,7 @@ static int do_truncation(struct ubifs_info *c, struct inode *inode, mutex_lock(&ui->ui_mutex); ui->ui_size = inode->i_size; /* Truncation changes inode [mc]time */ - inode->i_mtime = inode->i_ctime = current_time(inode); + inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); /* Other attributes may be changed at the same time as well */ do_attr_changes(inode, attr); err = ubifs_jnl_truncate(c, inode, old_size, new_size); @@ -1239,7 +1239,7 @@ static int do_setattr(struct ubifs_info *c, struct inode *inode, mutex_lock(&ui->ui_mutex); if (attr->ia_valid & ATTR_SIZE) { /* Truncation changes inode [mc]time */ - inode->i_mtime = inode->i_ctime = current_time(inode); + inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); /* 'truncate_setsize()' changed @i_size, update @ui_size */ ui->ui_size = inode->i_size; } @@ -1364,8 +1364,10 @@ out: static inline int mctime_update_needed(const struct inode *inode, const struct timespec64 *now) { - if (!timespec64_equal(&inode->i_mtime, now) || - !timespec64_equal(&inode->i_ctime, now)) + struct timespec64 ctime = inode_get_ctime(inode); + struct timespec64 mtime = inode_get_mtime(inode); + + if (!timespec64_equal(&mtime, now) || !timespec64_equal(&ctime, now)) return 1; return 0; } @@ -1373,11 +1375,13 @@ static inline int mctime_update_needed(const struct inode *inode, /** * ubifs_update_time - update time of inode. * @inode: inode to update + * @time: timespec structure to hold the current time value + * @flags: time updating control flag determines updating + * which time fields of @inode * * This function updates time of the inode. */ -int ubifs_update_time(struct inode *inode, struct timespec64 *time, - int flags) +int ubifs_update_time(struct inode *inode, int flags) { struct ubifs_inode *ui = ubifs_inode(inode); struct ubifs_info *c = inode->i_sb->s_fs_info; @@ -1385,21 +1389,17 @@ int ubifs_update_time(struct inode *inode, struct timespec64 *time, .dirtied_ino_d = ALIGN(ui->data_len, 8) }; int err, release; - if (!IS_ENABLED(CONFIG_UBIFS_ATIME_SUPPORT)) - return generic_update_time(inode, time, flags); + if (!IS_ENABLED(CONFIG_UBIFS_ATIME_SUPPORT)) { + generic_update_time(inode, flags); + return 0; + } err = ubifs_budget_space(c, &req); if (err) return err; mutex_lock(&ui->ui_mutex); - if (flags & S_ATIME) - inode->i_atime = *time; - if (flags & S_CTIME) - inode->i_ctime = *time; - if (flags & S_MTIME) - inode->i_mtime = *time; - + inode_update_timestamps(inode, flags); release = ui->dirty; __mark_inode_dirty(inode, I_DIRTY_SYNC); mutex_unlock(&ui->ui_mutex); @@ -1432,7 +1432,7 @@ static int update_mctime(struct inode *inode) return err; mutex_lock(&ui->ui_mutex); - inode->i_mtime = inode->i_ctime = current_time(inode); + inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); release = ui->dirty; mark_inode_dirty_sync(inode); mutex_unlock(&ui->ui_mutex); @@ -1570,7 +1570,7 @@ static vm_fault_t ubifs_vm_page_mkwrite(struct vm_fault *vmf) struct ubifs_inode *ui = ubifs_inode(inode); mutex_lock(&ui->ui_mutex); - inode->i_mtime = inode->i_ctime = current_time(inode); + inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); release = ui->dirty; mark_inode_dirty_sync(inode); mutex_unlock(&ui->ui_mutex); diff --git a/fs/ubifs/ioctl.c b/fs/ubifs/ioctl.c index 67c5108abd89..d79cabe193c3 100644 --- a/fs/ubifs/ioctl.c +++ b/fs/ubifs/ioctl.c @@ -118,7 +118,7 @@ static int setflags(struct inode *inode, int flags) ui->flags &= ~ioctl2ubifs(UBIFS_SETTABLE_IOCTL_FLAGS); ui->flags |= ioctl2ubifs(flags); ubifs_set_inode_flags(inode); - inode->i_ctime = current_time(inode); + inode_set_ctime_current(inode); release = ui->dirty; mark_inode_dirty_sync(inode); mutex_unlock(&ui->ui_mutex); diff --git a/fs/ubifs/journal.c b/fs/ubifs/journal.c index dc52ac0f4a34..f0a5538c84b0 100644 --- a/fs/ubifs/journal.c +++ b/fs/ubifs/journal.c @@ -452,12 +452,12 @@ static void pack_inode(struct ubifs_info *c, struct ubifs_ino_node *ino, ino->ch.node_type = UBIFS_INO_NODE; ino_key_init_flash(c, &ino->key, inode->i_ino); ino->creat_sqnum = cpu_to_le64(ui->creat_sqnum); - ino->atime_sec = cpu_to_le64(inode->i_atime.tv_sec); - ino->atime_nsec = cpu_to_le32(inode->i_atime.tv_nsec); - ino->ctime_sec = cpu_to_le64(inode->i_ctime.tv_sec); - ino->ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec); - ino->mtime_sec = cpu_to_le64(inode->i_mtime.tv_sec); - ino->mtime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec); + ino->atime_sec = cpu_to_le64(inode_get_atime_sec(inode)); + ino->atime_nsec = cpu_to_le32(inode_get_atime_nsec(inode)); + ino->ctime_sec = cpu_to_le64(inode_get_ctime_sec(inode)); + ino->ctime_nsec = cpu_to_le32(inode_get_ctime_nsec(inode)); + ino->mtime_sec = cpu_to_le64(inode_get_mtime_sec(inode)); + ino->mtime_nsec = cpu_to_le32(inode_get_mtime_nsec(inode)); ino->uid = cpu_to_le32(i_uid_read(inode)); ino->gid = cpu_to_le32(i_gid_read(inode)); ino->mode = cpu_to_le32(inode->i_mode); @@ -1607,6 +1607,7 @@ int ubifs_jnl_truncate(struct ubifs_info *c, const struct inode *inode, ubifs_err(c, "bad data node (block %u, inode %lu)", blk, inode->i_ino); ubifs_dump_node(c, dn, dn_size); + err = -EUCLEAN; goto out_free; } diff --git a/fs/ubifs/replay.c b/fs/ubifs/replay.c index 4211e4456b1e..c59d47fe7939 100644 --- a/fs/ubifs/replay.c +++ b/fs/ubifs/replay.c @@ -23,7 +23,6 @@ #include "ubifs.h" #include <linux/list_sort.h> #include <crypto/hash.h> -#include <crypto/algapi.h> /** * struct replay_entry - replay list entry. diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c index 32cb14759796..09e270d6ed02 100644 --- a/fs/ubifs/super.c +++ b/fs/ubifs/super.c @@ -54,11 +54,7 @@ module_param_cb(default_version, &ubifs_default_version_ops, &ubifs_default_vers static struct kmem_cache *ubifs_inode_slab; /* UBIFS TNC shrinker description */ -static struct shrinker ubifs_shrinker_info = { - .scan_objects = ubifs_shrink_scan, - .count_objects = ubifs_shrink_count, - .seeks = DEFAULT_SEEKS, -}; +static struct shrinker *ubifs_shrinker_info; /** * validate_inode - validate inode. @@ -142,12 +138,12 @@ struct inode *ubifs_iget(struct super_block *sb, unsigned long inum) set_nlink(inode, le32_to_cpu(ino->nlink)); i_uid_write(inode, le32_to_cpu(ino->uid)); i_gid_write(inode, le32_to_cpu(ino->gid)); - inode->i_atime.tv_sec = (int64_t)le64_to_cpu(ino->atime_sec); - inode->i_atime.tv_nsec = le32_to_cpu(ino->atime_nsec); - inode->i_mtime.tv_sec = (int64_t)le64_to_cpu(ino->mtime_sec); - inode->i_mtime.tv_nsec = le32_to_cpu(ino->mtime_nsec); - inode->i_ctime.tv_sec = (int64_t)le64_to_cpu(ino->ctime_sec); - inode->i_ctime.tv_nsec = le32_to_cpu(ino->ctime_nsec); + inode_set_atime(inode, (int64_t)le64_to_cpu(ino->atime_sec), + le32_to_cpu(ino->atime_nsec)); + inode_set_mtime(inode, (int64_t)le64_to_cpu(ino->mtime_sec), + le32_to_cpu(ino->mtime_nsec)); + inode_set_ctime(inode, (int64_t)le64_to_cpu(ino->ctime_sec), + le32_to_cpu(ino->ctime_nsec)); inode->i_mode = le32_to_cpu(ino->mode); inode->i_size = le64_to_cpu(ino->size); @@ -923,8 +919,10 @@ static void free_buds(struct ubifs_info *c) { struct ubifs_bud *bud, *n; - rbtree_postorder_for_each_entry_safe(bud, n, &c->buds, rb) + rbtree_postorder_for_each_entry_safe(bud, n, &c->buds, rb) { + kfree(bud->log_hash); kfree(bud); + } } /** @@ -1193,6 +1191,7 @@ static void destroy_journal(struct ubifs_info *c) bud = list_entry(c->old_buds.next, struct ubifs_bud, list); list_del(&bud->list); + kfree(bud->log_hash); kfree(bud); } ubifs_destroy_idx_gc(c); @@ -2373,7 +2372,7 @@ static void inode_slab_ctor(void *obj) static int __init ubifs_init(void) { - int err; + int err = -ENOMEM; BUILD_BUG_ON(sizeof(struct ubifs_ch) != 24); @@ -2439,10 +2438,15 @@ static int __init ubifs_init(void) if (!ubifs_inode_slab) return -ENOMEM; - err = register_shrinker(&ubifs_shrinker_info, "ubifs-slab"); - if (err) + ubifs_shrinker_info = shrinker_alloc(0, "ubifs-slab"); + if (!ubifs_shrinker_info) goto out_slab; + ubifs_shrinker_info->count_objects = ubifs_shrink_count; + ubifs_shrinker_info->scan_objects = ubifs_shrink_scan; + + shrinker_register(ubifs_shrinker_info); + err = ubifs_compressors_init(); if (err) goto out_shrinker; @@ -2467,7 +2471,7 @@ out_dbg: dbg_debugfs_exit(); ubifs_compressors_exit(); out_shrinker: - unregister_shrinker(&ubifs_shrinker_info); + shrinker_free(ubifs_shrinker_info); out_slab: kmem_cache_destroy(ubifs_inode_slab); return err; @@ -2483,7 +2487,7 @@ static void __exit ubifs_exit(void) dbg_debugfs_exit(); ubifs_sysfs_exit(); ubifs_compressors_exit(); - unregister_shrinker(&ubifs_shrinker_info); + shrinker_free(ubifs_shrinker_info); /* * Make sure all delayed rcu free inodes are flushed before we diff --git a/fs/ubifs/tnc.c b/fs/ubifs/tnc.c index 6b7d95b65f4b..f4728e65d1bd 100644 --- a/fs/ubifs/tnc.c +++ b/fs/ubifs/tnc.c @@ -65,6 +65,7 @@ static void do_insert_old_idx(struct ubifs_info *c, else { ubifs_err(c, "old idx added twice!"); kfree(old_idx); + return; } } rb_link_node(&old_idx->rb, parent, p); diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h index 4c36044140e7..3916dc4f30ca 100644 --- a/fs/ubifs/ubifs.h +++ b/fs/ubifs/ubifs.h @@ -31,7 +31,7 @@ #include <linux/completion.h> #include <crypto/hash_info.h> #include <crypto/hash.h> -#include <crypto/algapi.h> +#include <crypto/utils.h> #include <linux/fscrypt.h> @@ -2027,7 +2027,7 @@ int ubifs_calc_dark(const struct ubifs_info *c, int spc); int ubifs_fsync(struct file *file, loff_t start, loff_t end, int datasync); int ubifs_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *attr); -int ubifs_update_time(struct inode *inode, struct timespec64 *time, int flags); +int ubifs_update_time(struct inode *inode, int flags); /* dir.c */ struct inode *ubifs_new_inode(struct ubifs_info *c, struct inode *dir, @@ -2043,7 +2043,7 @@ ssize_t ubifs_xattr_get(struct inode *host, const char *name, void *buf, size_t size); #ifdef CONFIG_UBIFS_FS_XATTR -extern const struct xattr_handler *ubifs_xattr_handlers[]; +extern const struct xattr_handler * const ubifs_xattr_handlers[]; ssize_t ubifs_listxattr(struct dentry *dentry, char *buffer, size_t size); void ubifs_evict_xattr_inode(struct ubifs_info *c, ino_t xattr_inum); int ubifs_purge_xattrs(struct inode *host); diff --git a/fs/ubifs/xattr.c b/fs/ubifs/xattr.c index 349228dd1191..0847db521984 100644 --- a/fs/ubifs/xattr.c +++ b/fs/ubifs/xattr.c @@ -134,7 +134,7 @@ static int create_xattr(struct ubifs_info *c, struct inode *host, ui->data_len = size; mutex_lock(&host_ui->ui_mutex); - host->i_ctime = current_time(host); + inode_set_ctime_current(host); host_ui->xattr_cnt += 1; host_ui->xattr_size += CALC_DENT_SIZE(fname_len(nm)); host_ui->xattr_size += CALC_XATTR_BYTES(size); @@ -215,7 +215,7 @@ static int change_xattr(struct ubifs_info *c, struct inode *host, ui->data_len = size; mutex_lock(&host_ui->ui_mutex); - host->i_ctime = current_time(host); + inode_set_ctime_current(host); host_ui->xattr_size -= CALC_XATTR_BYTES(old_size); host_ui->xattr_size += CALC_XATTR_BYTES(size); @@ -474,7 +474,7 @@ static int remove_xattr(struct ubifs_info *c, struct inode *host, return err; mutex_lock(&host_ui->ui_mutex); - host->i_ctime = current_time(host); + inode_set_ctime_current(host); host_ui->xattr_cnt -= 1; host_ui->xattr_size -= CALC_DENT_SIZE(fname_len(nm)); host_ui->xattr_size -= CALC_XATTR_BYTES(ui->data_len); @@ -735,7 +735,7 @@ static const struct xattr_handler ubifs_security_xattr_handler = { }; #endif -const struct xattr_handler *ubifs_xattr_handlers[] = { +const struct xattr_handler * const ubifs_xattr_handlers[] = { &ubifs_user_xattr_handler, &ubifs_trusted_xattr_handler, #ifdef CONFIG_UBIFS_FS_SECURITY diff --git a/fs/udf/Kconfig b/fs/udf/Kconfig index 82e8bfa2dfd9..8f7ce30d47fd 100644 --- a/fs/udf/Kconfig +++ b/fs/udf/Kconfig @@ -1,6 +1,7 @@ # SPDX-License-Identifier: GPL-2.0-only config UDF_FS tristate "UDF file system support" + select BUFFER_HEAD select CRC_ITU_T select NLS select LEGACY_DIRECT_IO diff --git a/fs/udf/directory.c b/fs/udf/directory.c index 1c775e072b2f..93153665eb37 100644 --- a/fs/udf/directory.c +++ b/fs/udf/directory.c @@ -95,7 +95,7 @@ static int udf_copy_fi(struct udf_fileident_iter *iter) } off = iter->pos & (blksize - 1); - len = min_t(int, sizeof(struct fileIdentDesc), blksize - off); + len = min_t(u32, sizeof(struct fileIdentDesc), blksize - off); memcpy(&iter->fi, iter->bh[0]->b_data + off, len); if (len < sizeof(struct fileIdentDesc)) memcpy((char *)(&iter->fi) + len, iter->bh[1]->b_data, diff --git a/fs/udf/ecma_167.h b/fs/udf/ecma_167.h index de17a97e8667..415b050b977d 100644 --- a/fs/udf/ecma_167.h +++ b/fs/udf/ecma_167.h @@ -471,7 +471,7 @@ struct fileIdentDesc { uint8_t lengthFileIdent; struct long_ad icb; __le16 lengthOfImpUse; - uint8_t impUse[]; + /* uint8_t impUse[]; */ /* uint8_t fileIdent[]; */ /* uint8_t padding[]; */ } __packed; diff --git a/fs/udf/file.c b/fs/udf/file.c index 243840dc83ad..0ceac4b5937c 100644 --- a/fs/udf/file.c +++ b/fs/udf/file.c @@ -63,13 +63,13 @@ static vm_fault_t udf_page_mkwrite(struct vm_fault *vmf) else end = PAGE_SIZE; err = __block_write_begin(page, 0, end, udf_get_block); - if (!err) - err = block_commit_write(page, 0, end); - if (err < 0) { + if (err) { unlock_page(page); - ret = block_page_mkwrite_return(err); + ret = vmf_fs_error(err); goto out_unlock; } + + block_commit_write(page, 0, end); out_dirty: set_page_dirty(page); wait_for_stable_page(page); diff --git a/fs/udf/ialloc.c b/fs/udf/ialloc.c index 5f7ac8c84798..5f1f969f4134 100644 --- a/fs/udf/ialloc.c +++ b/fs/udf/ialloc.c @@ -100,8 +100,8 @@ struct inode *udf_new_inode(struct inode *dir, umode_t mode) iinfo->i_alloc_type = ICBTAG_FLAG_AD_SHORT; else iinfo->i_alloc_type = ICBTAG_FLAG_AD_LONG; - inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode); - iinfo->i_crtime = inode->i_mtime; + simple_inode_init_ts(inode); + iinfo->i_crtime = inode_get_mtime(inode); if (unlikely(insert_inode_locked(inode) < 0)) { make_bad_inode(inode); iput(inode); diff --git a/fs/udf/inode.c b/fs/udf/inode.c index 28cdfc57d946..d8493449d4c5 100644 --- a/fs/udf/inode.c +++ b/fs/udf/inode.c @@ -352,8 +352,6 @@ int udf_expand_file_adinicb(struct inode *inode) iinfo->i_alloc_type = ICBTAG_FLAG_AD_SHORT; else iinfo->i_alloc_type = ICBTAG_FLAG_AD_LONG; - /* from now on we have normal address_space methods */ - inode->i_data.a_ops = &udf_aops; up_write(&iinfo->i_data_sem); mark_inode_dirty(inode); return 0; @@ -910,7 +908,7 @@ static int inode_getblk(struct inode *inode, struct udf_map_rq *map) map->oflags = UDF_BLK_NEW | UDF_BLK_MAPPED; iinfo->i_next_alloc_block = map->lblk + 1; iinfo->i_next_alloc_goal = newblocknum + 1; - inode->i_ctime = current_time(inode); + inode_set_ctime_current(inode); if (IS_SYNC(inode)) udf_sync_inode(inode); @@ -1298,7 +1296,7 @@ set_size: goto out_unlock; } update_time: - inode->i_mtime = inode->i_ctime = current_time(inode); + inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); if (IS_SYNC(inode)) udf_sync_inode(inode); else @@ -1329,6 +1327,7 @@ static int udf_read_inode(struct inode *inode, bool hidden_inode) int bs = inode->i_sb->s_blocksize; int ret = -EIO; uint32_t uid, gid; + struct timespec64 ts; reread: if (iloc->partitionReferenceNum >= sbi->s_partitions) { @@ -1505,9 +1504,12 @@ reread: inode->i_blocks = le64_to_cpu(fe->logicalBlocksRecorded) << (inode->i_sb->s_blocksize_bits - 9); - udf_disk_stamp_to_time(&inode->i_atime, fe->accessTime); - udf_disk_stamp_to_time(&inode->i_mtime, fe->modificationTime); - udf_disk_stamp_to_time(&inode->i_ctime, fe->attrTime); + udf_disk_stamp_to_time(&ts, fe->accessTime); + inode_set_atime_to_ts(inode, ts); + udf_disk_stamp_to_time(&ts, fe->modificationTime); + inode_set_mtime_to_ts(inode, ts); + udf_disk_stamp_to_time(&ts, fe->attrTime); + inode_set_ctime_to_ts(inode, ts); iinfo->i_unique = le64_to_cpu(fe->uniqueID); iinfo->i_lenEAttr = le32_to_cpu(fe->lengthExtendedAttr); @@ -1519,10 +1521,13 @@ reread: inode->i_blocks = le64_to_cpu(efe->logicalBlocksRecorded) << (inode->i_sb->s_blocksize_bits - 9); - udf_disk_stamp_to_time(&inode->i_atime, efe->accessTime); - udf_disk_stamp_to_time(&inode->i_mtime, efe->modificationTime); + udf_disk_stamp_to_time(&ts, efe->accessTime); + inode_set_atime_to_ts(inode, ts); + udf_disk_stamp_to_time(&ts, efe->modificationTime); + inode_set_mtime_to_ts(inode, ts); + udf_disk_stamp_to_time(&ts, efe->attrTime); + inode_set_ctime_to_ts(inode, ts); udf_disk_stamp_to_time(&iinfo->i_crtime, efe->createTime); - udf_disk_stamp_to_time(&inode->i_ctime, efe->attrTime); iinfo->i_unique = le64_to_cpu(efe->uniqueID); iinfo->i_lenEAttr = le32_to_cpu(efe->lengthExtendedAttr); @@ -1797,9 +1802,9 @@ static int udf_update_inode(struct inode *inode, int do_sync) inode->i_sb->s_blocksize - sizeof(struct fileEntry)); fe->logicalBlocksRecorded = cpu_to_le64(lb_recorded); - udf_time_to_disk_stamp(&fe->accessTime, inode->i_atime); - udf_time_to_disk_stamp(&fe->modificationTime, inode->i_mtime); - udf_time_to_disk_stamp(&fe->attrTime, inode->i_ctime); + udf_time_to_disk_stamp(&fe->accessTime, inode_get_atime(inode)); + udf_time_to_disk_stamp(&fe->modificationTime, inode_get_mtime(inode)); + udf_time_to_disk_stamp(&fe->attrTime, inode_get_ctime(inode)); memset(&(fe->impIdent), 0, sizeof(struct regid)); strcpy(fe->impIdent.ident, UDF_ID_DEVELOPER); fe->impIdent.identSuffix[0] = UDF_OS_CLASS_UNIX; @@ -1828,14 +1833,16 @@ static int udf_update_inode(struct inode *inode, int do_sync) cpu_to_le32(inode->i_sb->s_blocksize); } - udf_adjust_time(iinfo, inode->i_atime); - udf_adjust_time(iinfo, inode->i_mtime); - udf_adjust_time(iinfo, inode->i_ctime); + udf_adjust_time(iinfo, inode_get_atime(inode)); + udf_adjust_time(iinfo, inode_get_mtime(inode)); + udf_adjust_time(iinfo, inode_get_ctime(inode)); - udf_time_to_disk_stamp(&efe->accessTime, inode->i_atime); - udf_time_to_disk_stamp(&efe->modificationTime, inode->i_mtime); + udf_time_to_disk_stamp(&efe->accessTime, + inode_get_atime(inode)); + udf_time_to_disk_stamp(&efe->modificationTime, + inode_get_mtime(inode)); udf_time_to_disk_stamp(&efe->createTime, iinfo->i_crtime); - udf_time_to_disk_stamp(&efe->attrTime, inode->i_ctime); + udf_time_to_disk_stamp(&efe->attrTime, inode_get_ctime(inode)); memset(&(efe->impIdent), 0, sizeof(efe->impIdent)); strcpy(efe->impIdent.ident, UDF_ID_DEVELOPER); diff --git a/fs/udf/namei.c b/fs/udf/namei.c index a95579b043ab..3508ac484da3 100644 --- a/fs/udf/namei.c +++ b/fs/udf/namei.c @@ -365,7 +365,7 @@ static int udf_add_nondir(struct dentry *dentry, struct inode *inode) *(__le32 *)((struct allocDescImpUse *)iter.fi.icb.impUse)->impUse = cpu_to_le32(iinfo->i_unique & 0x00000000FFFFFFFFUL); udf_fiiter_write_fi(&iter, NULL); - dir->i_ctime = dir->i_mtime = current_time(dir); + inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir)); mark_inode_dirty(dir); udf_fiiter_release(&iter); udf_add_fid_counter(dir->i_sb, false, 1); @@ -471,7 +471,7 @@ static int udf_mkdir(struct mnt_idmap *idmap, struct inode *dir, udf_fiiter_release(&iter); udf_add_fid_counter(dir->i_sb, true, 1); inc_nlink(dir); - dir->i_ctime = dir->i_mtime = current_time(dir); + inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir)); mark_inode_dirty(dir); d_instantiate_new(dentry, inode); @@ -523,8 +523,8 @@ static int udf_rmdir(struct inode *dir, struct dentry *dentry) inode->i_size = 0; inode_dec_link_count(dir); udf_add_fid_counter(dir->i_sb, true, -1); - inode->i_ctime = dir->i_ctime = dir->i_mtime = - current_time(inode); + inode_set_mtime_to_ts(dir, + inode_set_ctime_to_ts(dir, inode_set_ctime_current(inode))); mark_inode_dirty(dir); ret = 0; end_rmdir: @@ -555,11 +555,11 @@ static int udf_unlink(struct inode *dir, struct dentry *dentry) set_nlink(inode, 1); } udf_fiiter_delete_entry(&iter); - dir->i_ctime = dir->i_mtime = current_time(dir); + inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir)); mark_inode_dirty(dir); inode_dec_link_count(inode); udf_add_fid_counter(dir->i_sb, false, -1); - inode->i_ctime = dir->i_ctime; + inode_set_ctime_to_ts(inode, inode_get_ctime(dir)); ret = 0; end_unlink: udf_fiiter_release(&iter); @@ -746,9 +746,9 @@ static int udf_link(struct dentry *old_dentry, struct inode *dir, inc_nlink(inode); udf_add_fid_counter(dir->i_sb, false, 1); - inode->i_ctime = current_time(inode); + inode_set_ctime_current(inode); mark_inode_dirty(inode); - dir->i_ctime = dir->i_mtime = current_time(dir); + inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir)); mark_inode_dirty(dir); ihold(inode); d_instantiate(dentry, inode); @@ -833,7 +833,7 @@ static int udf_rename(struct mnt_idmap *idmap, struct inode *old_dir, * Like most other Unix systems, set the ctime for inodes on a * rename. */ - old_inode->i_ctime = current_time(old_inode); + inode_set_ctime_current(old_inode); mark_inode_dirty(old_inode); /* @@ -861,13 +861,13 @@ static int udf_rename(struct mnt_idmap *idmap, struct inode *old_dir, } if (new_inode) { - new_inode->i_ctime = current_time(new_inode); + inode_set_ctime_current(new_inode); inode_dec_link_count(new_inode); udf_add_fid_counter(old_dir->i_sb, S_ISDIR(new_inode->i_mode), -1); } - old_dir->i_ctime = old_dir->i_mtime = current_time(old_dir); - new_dir->i_ctime = new_dir->i_mtime = current_time(new_dir); + inode_set_mtime_to_ts(old_dir, inode_set_ctime_current(old_dir)); + inode_set_mtime_to_ts(new_dir, inode_set_ctime_current(new_dir)); mark_inode_dirty(old_dir); mark_inode_dirty(new_dir); diff --git a/fs/udf/symlink.c b/fs/udf/symlink.c index 779b5c2c75f6..f7eaf7b14594 100644 --- a/fs/udf/symlink.c +++ b/fs/udf/symlink.c @@ -149,7 +149,7 @@ static int udf_symlink_getattr(struct mnt_idmap *idmap, struct inode *inode = d_backing_inode(dentry); struct page *page; - generic_fillattr(&nop_mnt_idmap, inode, stat); + generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat); page = read_mapping_page(inode->i_mapping, 0, NULL); if (IS_ERR(page)) return PTR_ERR(page); diff --git a/fs/udf/udf_sb.h b/fs/udf/udf_sb.h index 9af6ff7f9747..f9a60bc1abcf 100644 --- a/fs/udf/udf_sb.h +++ b/fs/udf/udf_sb.h @@ -86,7 +86,7 @@ struct udf_virtual_data { struct udf_bitmap { __u32 s_extPosition; int s_nr_groups; - struct buffer_head *s_block_bitmap[]; + struct buffer_head *s_block_bitmap[] __counted_by(s_nr_groups); }; struct udf_part_map { diff --git a/fs/ufs/Kconfig b/fs/ufs/Kconfig index 6d30adb6b890..9301e7ecd092 100644 --- a/fs/ufs/Kconfig +++ b/fs/ufs/Kconfig @@ -2,6 +2,7 @@ config UFS_FS tristate "UFS file system support (read only)" depends on BLOCK + select BUFFER_HEAD help BSD and derivate versions of Unix (such as SunOS, FreeBSD, NetBSD, OpenBSD and NeXTstep) use a file system called UFS. Some System V diff --git a/fs/ufs/balloc.c b/fs/ufs/balloc.c index 2436e3f82147..53c11be2b2c1 100644 --- a/fs/ufs/balloc.c +++ b/fs/ufs/balloc.c @@ -240,6 +240,7 @@ static void ufs_change_blocknr(struct inode *inode, sector_t beg, unsigned int count, sector_t oldb, sector_t newb, struct page *locked_page) { + struct folio *folio, *locked_folio = page_folio(locked_page); const unsigned blks_per_page = 1 << (PAGE_SHIFT - inode->i_blkbits); const unsigned mask = blks_per_page - 1; @@ -247,42 +248,39 @@ static void ufs_change_blocknr(struct inode *inode, sector_t beg, pgoff_t index, cur_index, last_index; unsigned pos, j, lblock; sector_t end, i; - struct page *page; struct buffer_head *head, *bh; UFSD("ENTER, ino %lu, count %u, oldb %llu, newb %llu\n", inode->i_ino, count, (unsigned long long)oldb, (unsigned long long)newb); - BUG_ON(!locked_page); - BUG_ON(!PageLocked(locked_page)); + BUG_ON(!folio_test_locked(locked_folio)); - cur_index = locked_page->index; + cur_index = locked_folio->index; end = count + beg; last_index = end >> (PAGE_SHIFT - inode->i_blkbits); for (i = beg; i < end; i = (i | mask) + 1) { index = i >> (PAGE_SHIFT - inode->i_blkbits); if (likely(cur_index != index)) { - page = ufs_get_locked_page(mapping, index); - if (!page)/* it was truncated */ + folio = ufs_get_locked_folio(mapping, index); + if (!folio) /* it was truncated */ continue; - if (IS_ERR(page)) {/* or EIO */ + if (IS_ERR(folio)) {/* or EIO */ ufs_error(inode->i_sb, __func__, "read of page %llu failed\n", (unsigned long long)index); continue; } } else - page = locked_page; + folio = locked_folio; - head = page_buffers(page); + head = folio_buffers(folio); bh = head; pos = i & mask; for (j = 0; j < pos; ++j) bh = bh->b_this_page; - if (unlikely(index == last_index)) lblock = end & mask; else @@ -313,7 +311,7 @@ static void ufs_change_blocknr(struct inode *inode, sector_t beg, } while (bh != head); if (likely(cur_index != index)) - ufs_put_locked_page(page); + ufs_put_locked_folio(folio); } UFSD("EXIT\n"); } diff --git a/fs/ufs/dir.c b/fs/ufs/dir.c index 379d75796a5c..27c85d92d1dc 100644 --- a/fs/ufs/dir.c +++ b/fs/ufs/dir.c @@ -107,7 +107,7 @@ void ufs_set_link(struct inode *dir, struct ufs_dir_entry *de, ufs_commit_chunk(page, pos, len); ufs_put_page(page); if (update_times) - dir->i_mtime = dir->i_ctime = current_time(dir); + inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir)); mark_inode_dirty(dir); ufs_handle_dirsync(dir); } @@ -397,7 +397,7 @@ got_it: ufs_set_de_type(sb, de, inode->i_mode); ufs_commit_chunk(page, pos, rec_len); - dir->i_mtime = dir->i_ctime = current_time(dir); + inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir)); mark_inode_dirty(dir); err = ufs_handle_dirsync(dir); @@ -539,7 +539,7 @@ int ufs_delete_entry(struct inode *inode, struct ufs_dir_entry *dir, pde->d_reclen = cpu_to_fs16(sb, to - from); dir->d_ino = 0; ufs_commit_chunk(page, pos, to - from); - inode->i_ctime = inode->i_mtime = current_time(inode); + inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); mark_inode_dirty(inode); err = ufs_handle_dirsync(inode); out: diff --git a/fs/ufs/ialloc.c b/fs/ufs/ialloc.c index 06bd84d555bd..73531827ecee 100644 --- a/fs/ufs/ialloc.c +++ b/fs/ufs/ialloc.c @@ -292,7 +292,7 @@ cg_found: inode_init_owner(&nop_mnt_idmap, inode, dir, mode); inode->i_blocks = 0; inode->i_generation = 0; - inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode); + simple_inode_init_ts(inode); ufsi->i_flags = UFS_I(dir)->i_flags; ufsi->i_lastfrag = 0; ufsi->i_shadow = 0; diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c index a4246c83a8cd..ebce93b08281 100644 --- a/fs/ufs/inode.c +++ b/fs/ufs/inode.c @@ -296,7 +296,7 @@ ufs_inode_getfrag(struct inode *inode, unsigned index, if (new) *new = 1; - inode->i_ctime = current_time(inode); + inode_set_ctime_current(inode); if (IS_SYNC(inode)) ufs_sync_inode (inode); mark_inode_dirty(inode); @@ -378,7 +378,7 @@ ufs_inode_getblock(struct inode *inode, u64 ind_block, mark_buffer_dirty(bh); if (IS_SYNC(inode)) sync_dirty_buffer(bh); - inode->i_ctime = current_time(inode); + inode_set_ctime_current(inode); mark_inode_dirty(inode); out: brelse (bh); @@ -579,12 +579,15 @@ static int ufs1_read_inode(struct inode *inode, struct ufs_inode *ufs_inode) i_gid_write(inode, ufs_get_inode_gid(sb, ufs_inode)); inode->i_size = fs64_to_cpu(sb, ufs_inode->ui_size); - inode->i_atime.tv_sec = (signed)fs32_to_cpu(sb, ufs_inode->ui_atime.tv_sec); - inode->i_ctime.tv_sec = (signed)fs32_to_cpu(sb, ufs_inode->ui_ctime.tv_sec); - inode->i_mtime.tv_sec = (signed)fs32_to_cpu(sb, ufs_inode->ui_mtime.tv_sec); - inode->i_mtime.tv_nsec = 0; - inode->i_atime.tv_nsec = 0; - inode->i_ctime.tv_nsec = 0; + inode_set_atime(inode, + (signed)fs32_to_cpu(sb, ufs_inode->ui_atime.tv_sec), + 0); + inode_set_ctime(inode, + (signed)fs32_to_cpu(sb, ufs_inode->ui_ctime.tv_sec), + 0); + inode_set_mtime(inode, + (signed)fs32_to_cpu(sb, ufs_inode->ui_mtime.tv_sec), + 0); inode->i_blocks = fs32_to_cpu(sb, ufs_inode->ui_blocks); inode->i_generation = fs32_to_cpu(sb, ufs_inode->ui_gen); ufsi->i_flags = fs32_to_cpu(sb, ufs_inode->ui_flags); @@ -625,12 +628,12 @@ static int ufs2_read_inode(struct inode *inode, struct ufs2_inode *ufs2_inode) i_gid_write(inode, fs32_to_cpu(sb, ufs2_inode->ui_gid)); inode->i_size = fs64_to_cpu(sb, ufs2_inode->ui_size); - inode->i_atime.tv_sec = fs64_to_cpu(sb, ufs2_inode->ui_atime); - inode->i_ctime.tv_sec = fs64_to_cpu(sb, ufs2_inode->ui_ctime); - inode->i_mtime.tv_sec = fs64_to_cpu(sb, ufs2_inode->ui_mtime); - inode->i_atime.tv_nsec = fs32_to_cpu(sb, ufs2_inode->ui_atimensec); - inode->i_ctime.tv_nsec = fs32_to_cpu(sb, ufs2_inode->ui_ctimensec); - inode->i_mtime.tv_nsec = fs32_to_cpu(sb, ufs2_inode->ui_mtimensec); + inode_set_atime(inode, fs64_to_cpu(sb, ufs2_inode->ui_atime), + fs32_to_cpu(sb, ufs2_inode->ui_atimensec)); + inode_set_ctime(inode, fs64_to_cpu(sb, ufs2_inode->ui_ctime), + fs32_to_cpu(sb, ufs2_inode->ui_ctimensec)); + inode_set_mtime(inode, fs64_to_cpu(sb, ufs2_inode->ui_mtime), + fs32_to_cpu(sb, ufs2_inode->ui_mtimensec)); inode->i_blocks = fs64_to_cpu(sb, ufs2_inode->ui_blocks); inode->i_generation = fs32_to_cpu(sb, ufs2_inode->ui_gen); ufsi->i_flags = fs32_to_cpu(sb, ufs2_inode->ui_flags); @@ -724,11 +727,14 @@ static void ufs1_update_inode(struct inode *inode, struct ufs_inode *ufs_inode) ufs_set_inode_gid(sb, ufs_inode, i_gid_read(inode)); ufs_inode->ui_size = cpu_to_fs64(sb, inode->i_size); - ufs_inode->ui_atime.tv_sec = cpu_to_fs32(sb, inode->i_atime.tv_sec); + ufs_inode->ui_atime.tv_sec = cpu_to_fs32(sb, + inode_get_atime_sec(inode)); ufs_inode->ui_atime.tv_usec = 0; - ufs_inode->ui_ctime.tv_sec = cpu_to_fs32(sb, inode->i_ctime.tv_sec); + ufs_inode->ui_ctime.tv_sec = cpu_to_fs32(sb, + inode_get_ctime_sec(inode)); ufs_inode->ui_ctime.tv_usec = 0; - ufs_inode->ui_mtime.tv_sec = cpu_to_fs32(sb, inode->i_mtime.tv_sec); + ufs_inode->ui_mtime.tv_sec = cpu_to_fs32(sb, + inode_get_mtime_sec(inode)); ufs_inode->ui_mtime.tv_usec = 0; ufs_inode->ui_blocks = cpu_to_fs32(sb, inode->i_blocks); ufs_inode->ui_flags = cpu_to_fs32(sb, ufsi->i_flags); @@ -768,12 +774,15 @@ static void ufs2_update_inode(struct inode *inode, struct ufs2_inode *ufs_inode) ufs_inode->ui_gid = cpu_to_fs32(sb, i_gid_read(inode)); ufs_inode->ui_size = cpu_to_fs64(sb, inode->i_size); - ufs_inode->ui_atime = cpu_to_fs64(sb, inode->i_atime.tv_sec); - ufs_inode->ui_atimensec = cpu_to_fs32(sb, inode->i_atime.tv_nsec); - ufs_inode->ui_ctime = cpu_to_fs64(sb, inode->i_ctime.tv_sec); - ufs_inode->ui_ctimensec = cpu_to_fs32(sb, inode->i_ctime.tv_nsec); - ufs_inode->ui_mtime = cpu_to_fs64(sb, inode->i_mtime.tv_sec); - ufs_inode->ui_mtimensec = cpu_to_fs32(sb, inode->i_mtime.tv_nsec); + ufs_inode->ui_atime = cpu_to_fs64(sb, inode_get_atime_sec(inode)); + ufs_inode->ui_atimensec = cpu_to_fs32(sb, + inode_get_atime_nsec(inode)); + ufs_inode->ui_ctime = cpu_to_fs64(sb, inode_get_ctime_sec(inode)); + ufs_inode->ui_ctimensec = cpu_to_fs32(sb, + inode_get_ctime_nsec(inode)); + ufs_inode->ui_mtime = cpu_to_fs64(sb, inode_get_mtime_sec(inode)); + ufs_inode->ui_mtimensec = cpu_to_fs32(sb, + inode_get_mtime_nsec(inode)); ufs_inode->ui_blocks = cpu_to_fs64(sb, inode->i_blocks); ufs_inode->ui_flags = cpu_to_fs32(sb, ufsi->i_flags); @@ -1054,7 +1063,7 @@ static int ufs_alloc_lastblock(struct inode *inode, loff_t size) struct ufs_sb_private_info *uspi = UFS_SB(sb)->s_uspi; unsigned i, end; sector_t lastfrag; - struct page *lastpage; + struct folio *folio; struct buffer_head *bh; u64 phys64; @@ -1065,18 +1074,17 @@ static int ufs_alloc_lastblock(struct inode *inode, loff_t size) lastfrag--; - lastpage = ufs_get_locked_page(mapping, lastfrag >> + folio = ufs_get_locked_folio(mapping, lastfrag >> (PAGE_SHIFT - inode->i_blkbits)); - if (IS_ERR(lastpage)) { - err = -EIO; - goto out; - } - - end = lastfrag & ((1 << (PAGE_SHIFT - inode->i_blkbits)) - 1); - bh = page_buffers(lastpage); - for (i = 0; i < end; ++i) - bh = bh->b_this_page; + if (IS_ERR(folio)) { + err = -EIO; + goto out; + } + end = lastfrag & ((1 << (PAGE_SHIFT - inode->i_blkbits)) - 1); + bh = folio_buffers(folio); + for (i = 0; i < end; ++i) + bh = bh->b_this_page; err = ufs_getfrag_block(inode, lastfrag, bh, 1); @@ -1092,7 +1100,7 @@ static int ufs_alloc_lastblock(struct inode *inode, loff_t size) */ set_buffer_uptodate(bh); mark_buffer_dirty(bh); - set_page_dirty(lastpage); + folio_mark_dirty(folio); } if (lastfrag >= UFS_IND_FRAGMENT) { @@ -1110,7 +1118,7 @@ static int ufs_alloc_lastblock(struct inode *inode, loff_t size) } } out_unlock: - ufs_put_locked_page(lastpage); + ufs_put_locked_folio(folio); out: return err; } @@ -1205,7 +1213,7 @@ static int ufs_truncate(struct inode *inode, loff_t size) truncate_setsize(inode, size); ufs_truncate_blocks(inode); - inode->i_mtime = inode->i_ctime = current_time(inode); + inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); mark_inode_dirty(inode); out: UFSD("EXIT: err %d\n", err); diff --git a/fs/ufs/namei.c b/fs/ufs/namei.c index 36154b5aca6d..9cad29463791 100644 --- a/fs/ufs/namei.c +++ b/fs/ufs/namei.c @@ -153,7 +153,7 @@ static int ufs_link (struct dentry * old_dentry, struct inode * dir, struct inode *inode = d_inode(old_dentry); int error; - inode->i_ctime = current_time(inode); + inode_set_ctime_current(inode); inode_inc_link_count(inode); ihold(inode); @@ -220,7 +220,7 @@ static int ufs_unlink(struct inode *dir, struct dentry *dentry) if (err) goto out; - inode->i_ctime = dir->i_ctime; + inode_set_ctime_to_ts(inode, inode_get_ctime(dir)); inode_dec_link_count(inode); err = 0; out: @@ -282,7 +282,7 @@ static int ufs_rename(struct mnt_idmap *idmap, struct inode *old_dir, if (!new_de) goto out_dir; ufs_set_link(new_dir, new_de, new_page, old_inode, 1); - new_inode->i_ctime = current_time(new_inode); + inode_set_ctime_current(new_inode); if (dir_de) drop_nlink(new_inode); inode_dec_link_count(new_inode); @@ -298,7 +298,7 @@ static int ufs_rename(struct mnt_idmap *idmap, struct inode *old_dir, * Like most other Unix systems, set the ctime for inodes on a * rename. */ - old_inode->i_ctime = current_time(old_inode); + inode_set_ctime_current(old_inode); ufs_delete_entry(old_dir, old_de, old_page); mark_inode_dirty(old_inode); diff --git a/fs/ufs/super.c b/fs/ufs/super.c index 23377c1baed9..a480810cd4e3 100644 --- a/fs/ufs/super.c +++ b/fs/ufs/super.c @@ -137,6 +137,7 @@ static struct dentry *ufs_get_parent(struct dentry *child) } static const struct export_operations ufs_export_ops = { + .encode_fh = generic_encode_ino32_fh, .fh_to_dentry = ufs_fh_to_dentry, .fh_to_parent = ufs_fh_to_parent, .get_parent = ufs_get_parent, diff --git a/fs/ufs/util.c b/fs/ufs/util.c index 08ddf41eaaad..2acf191eb89e 100644 --- a/fs/ufs/util.c +++ b/fs/ufs/util.c @@ -230,42 +230,40 @@ ufs_set_inode_dev(struct super_block *sb, struct ufs_inode_info *ufsi, dev_t dev } /** - * ufs_get_locked_page() - locate, pin and lock a pagecache page, if not exist + * ufs_get_locked_folio() - locate, pin and lock a pagecache folio, if not exist * read it from disk. * @mapping: the address_space to search * @index: the page index * - * Locates the desired pagecache page, if not exist we'll read it, + * Locates the desired pagecache folio, if not exist we'll read it, * locks it, increments its reference * count and returns its address. * */ - -struct page *ufs_get_locked_page(struct address_space *mapping, +struct folio *ufs_get_locked_folio(struct address_space *mapping, pgoff_t index) { struct inode *inode = mapping->host; - struct page *page = find_lock_page(mapping, index); - if (!page) { - page = read_mapping_page(mapping, index, NULL); + struct folio *folio = filemap_lock_folio(mapping, index); + if (IS_ERR(folio)) { + folio = read_mapping_folio(mapping, index, NULL); - if (IS_ERR(page)) { - printk(KERN_ERR "ufs_change_blocknr: " - "read_mapping_page error: ino %lu, index: %lu\n", + if (IS_ERR(folio)) { + printk(KERN_ERR "ufs_change_blocknr: read_mapping_folio error: ino %lu, index: %lu\n", mapping->host->i_ino, index); - return page; + return folio; } - lock_page(page); + folio_lock(folio); - if (unlikely(page->mapping == NULL)) { + if (unlikely(folio->mapping == NULL)) { /* Truncate got there first */ - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); return NULL; } } - if (!page_has_buffers(page)) - create_empty_buffers(page, 1 << inode->i_blkbits, 0); - return page; + if (!folio_buffers(folio)) + create_empty_buffers(folio, 1 << inode->i_blkbits, 0); + return folio; } diff --git a/fs/ufs/util.h b/fs/ufs/util.h index 4931bec1a01c..0ecd2ed792f5 100644 --- a/fs/ufs/util.h +++ b/fs/ufs/util.h @@ -11,12 +11,6 @@ #include <linux/fs.h> #include "swab.h" - -/* - * some useful macros - */ -#define in_range(b,first,len) ((b)>=(first)&&(b)<(first)+(len)) - /* * functions used for retyping */ @@ -279,15 +273,13 @@ extern void _ubh_ubhcpymem_(struct ufs_sb_private_info *, unsigned char *, struc extern void _ubh_memcpyubh_(struct ufs_sb_private_info *, struct ufs_buffer_head *, unsigned char *, unsigned); /* This functions works with cache pages*/ -extern struct page *ufs_get_locked_page(struct address_space *mapping, - pgoff_t index); -static inline void ufs_put_locked_page(struct page *page) +struct folio *ufs_get_locked_folio(struct address_space *mapping, pgoff_t index); +static inline void ufs_put_locked_folio(struct folio *folio) { - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); } - /* * macros and inline function to get important structures from ufs_sb_private_info */ diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 7cecd49e078b..e8af40b05549 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -49,7 +49,7 @@ static struct ctl_table vm_userfaultfd_table[] = { }; #endif -static struct kmem_cache *userfaultfd_ctx_cachep __read_mostly; +static struct kmem_cache *userfaultfd_ctx_cachep __ro_after_init; /* * Start with fault_pending_wqh and fault_wqh so they're more likely @@ -123,6 +123,11 @@ static bool userfaultfd_is_initialized(struct userfaultfd_ctx *ctx) return ctx->features & UFFD_FEATURE_INITIALIZED; } +static bool userfaultfd_wp_async_ctx(struct userfaultfd_ctx *ctx) +{ + return ctx && (ctx->features & UFFD_FEATURE_WP_ASYNC); +} + /* * Whether WP_UNPOPULATED is enabled on the uffd context. It is only * meaningful when userfaultfd_wp()==true on the vma and when it's @@ -277,17 +282,16 @@ static inline struct uffd_msg userfault_msg(unsigned long address, * hugepmd ranges. */ static inline bool userfaultfd_huge_must_wait(struct userfaultfd_ctx *ctx, - struct vm_area_struct *vma, - unsigned long address, - unsigned long flags, - unsigned long reason) + struct vm_fault *vmf, + unsigned long reason) { + struct vm_area_struct *vma = vmf->vma; pte_t *ptep, pte; bool ret = true; - mmap_assert_locked(ctx->mm); + assert_fault_locked(vmf); - ptep = hugetlb_walk(vma, address, vma_mmu_pagesize(vma)); + ptep = hugetlb_walk(vma, vmf->address, vma_mmu_pagesize(vma)); if (!ptep) goto out; @@ -308,10 +312,8 @@ out: } #else static inline bool userfaultfd_huge_must_wait(struct userfaultfd_ctx *ctx, - struct vm_area_struct *vma, - unsigned long address, - unsigned long flags, - unsigned long reason) + struct vm_fault *vmf, + unsigned long reason) { return false; /* should never get here */ } @@ -325,11 +327,11 @@ static inline bool userfaultfd_huge_must_wait(struct userfaultfd_ctx *ctx, * threads. */ static inline bool userfaultfd_must_wait(struct userfaultfd_ctx *ctx, - unsigned long address, - unsigned long flags, + struct vm_fault *vmf, unsigned long reason) { struct mm_struct *mm = ctx->mm; + unsigned long address = vmf->address; pgd_t *pgd; p4d_t *p4d; pud_t *pud; @@ -338,7 +340,7 @@ static inline bool userfaultfd_must_wait(struct userfaultfd_ctx *ctx, pte_t ptent; bool ret = true; - mmap_assert_locked(mm); + assert_fault_locked(vmf); pgd = pgd_offset(mm, address); if (!pgd_present(*pgd)) @@ -427,20 +429,16 @@ vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason) * * We also don't do userfault handling during * coredumping. hugetlbfs has the special - * follow_hugetlb_page() to skip missing pages in the + * hugetlb_follow_page_mask() to skip missing pages in the * FOLL_DUMP case, anon memory also checks for FOLL_DUMP with * the no_page_table() helper in follow_page_mask(), but the * shmem_vm_ops->fault method is invoked even during - * coredumping without mmap_lock and it ends up here. + * coredumping and it ends up here. */ if (current->flags & (PF_EXITING|PF_DUMPCORE)) goto out; - /* - * Coredumping runs without mmap_lock so we can only check that - * the mmap_lock is held, if PF_DUMPCORE was not set. - */ - mmap_assert_locked(mm); + assert_fault_locked(vmf); ctx = vma->vm_userfaultfd_ctx.ctx; if (!ctx) @@ -556,15 +554,12 @@ vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason) spin_unlock_irq(&ctx->fault_pending_wqh.lock); if (!is_vm_hugetlb_page(vma)) - must_wait = userfaultfd_must_wait(ctx, vmf->address, vmf->flags, - reason); + must_wait = userfaultfd_must_wait(ctx, vmf, reason); else - must_wait = userfaultfd_huge_must_wait(ctx, vma, - vmf->address, - vmf->flags, reason); + must_wait = userfaultfd_huge_must_wait(ctx, vmf, reason); if (is_vm_hugetlb_page(vma)) hugetlb_vma_unlock_read(vma); - mmap_read_unlock(mm); + release_fault_lock(vmf); if (likely(must_wait && !READ_ONCE(ctx->released))) { wake_up_poll(&ctx->fd_wqh, EPOLLIN); @@ -667,6 +662,7 @@ static void userfaultfd_event_wait_completion(struct userfaultfd_ctx *ctx, mmap_write_lock(mm); for_each_vma(vmi, vma) { if (vma->vm_userfaultfd_ctx.ctx == release_new_ctx) { + vma_start_write(vma); vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX; userfaultfd_set_vm_flags(vma, vma->vm_flags & ~__VM_UFFD_FLAGS); @@ -702,6 +698,7 @@ int dup_userfaultfd(struct vm_area_struct *vma, struct list_head *fcs) octx = vma->vm_userfaultfd_ctx.ctx; if (!octx || !(octx->features & UFFD_FEATURE_EVENT_FORK)) { + vma_start_write(vma); vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX; userfaultfd_set_vm_flags(vma, vma->vm_flags & ~__VM_UFFD_FLAGS); return 0; @@ -783,6 +780,7 @@ void mremap_userfaultfd_prep(struct vm_area_struct *vma, atomic_inc(&ctx->mmap_changing); } else { /* Drop uffd context if remap feature not enabled */ + vma_start_write(vma); vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX; userfaultfd_set_vm_flags(vma, vma->vm_flags & ~__VM_UFFD_FLAGS); } @@ -929,19 +927,15 @@ static int userfaultfd_release(struct inode *inode, struct file *file) continue; } new_flags = vma->vm_flags & ~__VM_UFFD_FLAGS; - prev = vma_merge(&vmi, mm, prev, vma->vm_start, vma->vm_end, - new_flags, vma->anon_vma, - vma->vm_file, vma->vm_pgoff, - vma_policy(vma), - NULL_VM_UFFD_CTX, anon_vma_name(vma)); - if (prev) { - vma = prev; - } else { - prev = vma; - } + vma = vma_modify_flags_uffd(&vmi, prev, vma, vma->vm_start, + vma->vm_end, new_flags, + NULL_VM_UFFD_CTX); + vma_start_write(vma); userfaultfd_set_vm_flags(vma, new_flags); vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX; + + prev = vma; } mmap_write_unlock(mm); mmput(mm); @@ -1289,13 +1283,11 @@ static __always_inline void wake_userfault(struct userfaultfd_ctx *ctx, __wake_userfault(ctx, range); } -static __always_inline int validate_range(struct mm_struct *mm, - __u64 start, __u64 len) +static __always_inline int validate_unaligned_range( + struct mm_struct *mm, __u64 start, __u64 len) { __u64 task_size = mm->task_size; - if (start & ~PAGE_MASK) - return -EINVAL; if (len & ~PAGE_MASK) return -EINVAL; if (!len) @@ -1306,9 +1298,20 @@ static __always_inline int validate_range(struct mm_struct *mm, return -EINVAL; if (len > task_size - start) return -EINVAL; + if (start + len <= start) + return -EINVAL; return 0; } +static __always_inline int validate_range(struct mm_struct *mm, + __u64 start, __u64 len) +{ + if (start & ~PAGE_MASK) + return -EINVAL; + + return validate_unaligned_range(mm, start, len); +} + static int userfaultfd_register(struct userfaultfd_ctx *ctx, unsigned long arg) { @@ -1322,7 +1325,7 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx, bool basic_ioctls; unsigned long start, end, vma_end; struct vma_iterator vmi; - pgoff_t pgoff; + bool wp_async = userfaultfd_wp_async_ctx(ctx); user_uffdio_register = (struct uffdio_register __user *) arg; @@ -1396,7 +1399,7 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx, /* check not compatible vmas */ ret = -EINVAL; - if (!vma_can_userfault(cur, vm_flags)) + if (!vma_can_userfault(cur, vm_flags, wp_async)) goto out_unlock; /* @@ -1457,7 +1460,7 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx, for_each_vma_range(vmi, vma, end) { cond_resched(); - BUG_ON(!vma_can_userfault(vma, vm_flags)); + BUG_ON(!vma_can_userfault(vma, vm_flags, wp_async)); BUG_ON(vma->vm_userfaultfd_ctx.ctx && vma->vm_userfaultfd_ctx.ctx != ctx); WARN_ON(!(vma->vm_flags & VM_MAYWRITE)); @@ -1475,33 +1478,20 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx, vma_end = min(end, vma->vm_end); new_flags = (vma->vm_flags & ~__VM_UFFD_FLAGS) | vm_flags; - pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT); - prev = vma_merge(&vmi, mm, prev, start, vma_end, new_flags, - vma->anon_vma, vma->vm_file, pgoff, - vma_policy(vma), - ((struct vm_userfaultfd_ctx){ ctx }), - anon_vma_name(vma)); - if (prev) { - /* vma_merge() invalidated the mas */ - vma = prev; - goto next; - } - if (vma->vm_start < start) { - ret = split_vma(&vmi, vma, start, 1); - if (ret) - break; - } - if (vma->vm_end > end) { - ret = split_vma(&vmi, vma, end, 0); - if (ret) - break; + vma = vma_modify_flags_uffd(&vmi, prev, vma, start, vma_end, + new_flags, + (struct vm_userfaultfd_ctx){ctx}); + if (IS_ERR(vma)) { + ret = PTR_ERR(vma); + break; } - next: + /* * In the vma_merge() successful mprotect-like case 8: * the next vma was merged into the current one and * the current one has not been updated yet. */ + vma_start_write(vma); userfaultfd_set_vm_flags(vma, new_flags); vma->vm_userfaultfd_ctx.ctx = ctx; @@ -1557,7 +1547,7 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx, unsigned long start, end, vma_end; const void __user *buf = (void __user *)arg; struct vma_iterator vmi; - pgoff_t pgoff; + bool wp_async = userfaultfd_wp_async_ctx(ctx); ret = -EFAULT; if (copy_from_user(&uffdio_unregister, buf, sizeof(uffdio_unregister))) @@ -1611,7 +1601,7 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx, * provides for more strict behavior to notice * unregistration errors. */ - if (!vma_can_userfault(cur, cur->vm_flags)) + if (!vma_can_userfault(cur, cur->vm_flags, wp_async)) goto out_unlock; found = true; @@ -1627,7 +1617,7 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx, for_each_vma_range(vmi, vma, end) { cond_resched(); - BUG_ON(!vma_can_userfault(vma, vma->vm_flags)); + BUG_ON(!vma_can_userfault(vma, vma->vm_flags, wp_async)); /* * Nothing to do: this vma is already registered into this @@ -1660,31 +1650,19 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx, uffd_wp_range(vma, start, vma_end - start, false); new_flags = vma->vm_flags & ~__VM_UFFD_FLAGS; - pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT); - prev = vma_merge(&vmi, mm, prev, start, vma_end, new_flags, - vma->anon_vma, vma->vm_file, pgoff, - vma_policy(vma), - NULL_VM_UFFD_CTX, anon_vma_name(vma)); - if (prev) { - vma = prev; - goto next; - } - if (vma->vm_start < start) { - ret = split_vma(&vmi, vma, start, 1); - if (ret) - break; - } - if (vma->vm_end > end) { - ret = split_vma(&vmi, vma, end, 0); - if (ret) - break; + vma = vma_modify_flags_uffd(&vmi, prev, vma, start, vma_end, + new_flags, NULL_VM_UFFD_CTX); + if (IS_ERR(vma)) { + ret = PTR_ERR(vma); + break; } - next: + /* * In the vma_merge() successful mprotect-like case 8: * the next vma was merged into the current one and * the current one has not been updated yet. */ + vma_start_write(vma); userfaultfd_set_vm_flags(vma, new_flags); vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX; @@ -1757,17 +1735,15 @@ static int userfaultfd_copy(struct userfaultfd_ctx *ctx, sizeof(uffdio_copy)-sizeof(__s64))) goto out; + ret = validate_unaligned_range(ctx->mm, uffdio_copy.src, + uffdio_copy.len); + if (ret) + goto out; ret = validate_range(ctx->mm, uffdio_copy.dst, uffdio_copy.len); if (ret) goto out; - /* - * double check for wraparound just in case. copy_from_user() - * will later check uffdio_copy.src + uffdio_copy.len to fit - * in the userland range. - */ + ret = -EINVAL; - if (uffdio_copy.src + uffdio_copy.len <= uffdio_copy.src) - goto out; if (uffdio_copy.mode & ~(UFFDIO_COPY_MODE_DONTWAKE|UFFDIO_COPY_MODE_WP)) goto out; if (uffdio_copy.mode & UFFDIO_COPY_MODE_WP) @@ -1927,11 +1903,6 @@ static int userfaultfd_continue(struct userfaultfd_ctx *ctx, unsigned long arg) goto out; ret = -EINVAL; - /* double check for wraparound just in case. */ - if (uffdio_continue.range.start + uffdio_continue.range.len <= - uffdio_continue.range.start) { - goto out; - } if (uffdio_continue.mode & ~(UFFDIO_CONTINUE_MODE_DONTWAKE | UFFDIO_CONTINUE_MODE_WP)) goto out; @@ -1965,6 +1936,66 @@ out: return ret; } +static inline int userfaultfd_poison(struct userfaultfd_ctx *ctx, unsigned long arg) +{ + __s64 ret; + struct uffdio_poison uffdio_poison; + struct uffdio_poison __user *user_uffdio_poison; + struct userfaultfd_wake_range range; + + user_uffdio_poison = (struct uffdio_poison __user *)arg; + + ret = -EAGAIN; + if (atomic_read(&ctx->mmap_changing)) + goto out; + + ret = -EFAULT; + if (copy_from_user(&uffdio_poison, user_uffdio_poison, + /* don't copy the output fields */ + sizeof(uffdio_poison) - (sizeof(__s64)))) + goto out; + + ret = validate_range(ctx->mm, uffdio_poison.range.start, + uffdio_poison.range.len); + if (ret) + goto out; + + ret = -EINVAL; + if (uffdio_poison.mode & ~UFFDIO_POISON_MODE_DONTWAKE) + goto out; + + if (mmget_not_zero(ctx->mm)) { + ret = mfill_atomic_poison(ctx->mm, uffdio_poison.range.start, + uffdio_poison.range.len, + &ctx->mmap_changing, 0); + mmput(ctx->mm); + } else { + return -ESRCH; + } + + if (unlikely(put_user(ret, &user_uffdio_poison->updated))) + return -EFAULT; + if (ret < 0) + goto out; + + /* len == 0 would wake all */ + BUG_ON(!ret); + range.len = ret; + if (!(uffdio_poison.mode & UFFDIO_POISON_MODE_DONTWAKE)) { + range.start = uffdio_poison.range.start; + wake_userfault(ctx, &range); + } + ret = range.len == uffdio_poison.range.len ? 0 : -EAGAIN; + +out: + return ret; +} + +bool userfaultfd_wp_async(struct vm_area_struct *vma) +{ + return userfaultfd_wp_async_ctx(vma->vm_userfaultfd_ctx.ctx); +} + static inline unsigned int uffd_ctx_features(__u64 user_features) { /* @@ -1998,6 +2029,11 @@ static int userfaultfd_api(struct userfaultfd_ctx *ctx, ret = -EPERM; if ((features & UFFD_FEATURE_EVENT_FORK) && !capable(CAP_SYS_PTRACE)) goto err_out; + + /* WP_ASYNC relies on WP_UNPOPULATED, choose it unconditionally */ + if (features & UFFD_FEATURE_WP_ASYNC) + features |= UFFD_FEATURE_WP_UNPOPULATED; + /* report all available features and ioctls to userland */ uffdio_api.features = UFFD_API_FEATURES; #ifndef CONFIG_HAVE_ARCH_USERFAULTFD_MINOR @@ -2010,6 +2046,7 @@ static int userfaultfd_api(struct userfaultfd_ctx *ctx, #ifndef CONFIG_PTE_MARKER_UFFD_WP uffdio_api.features &= ~UFFD_FEATURE_WP_HUGETLBFS_SHMEM; uffdio_api.features &= ~UFFD_FEATURE_WP_UNPOPULATED; + uffdio_api.features &= ~UFFD_FEATURE_WP_ASYNC; #endif uffdio_api.ioctls = UFFD_API_IOCTLS; ret = -EFAULT; @@ -2066,6 +2103,9 @@ static long userfaultfd_ioctl(struct file *file, unsigned cmd, case UFFDIO_CONTINUE: ret = userfaultfd_continue(ctx, arg); break; + case UFFDIO_POISON: + ret = userfaultfd_poison(ctx, arg); + break; } return ret; } diff --git a/fs/vboxsf/utils.c b/fs/vboxsf/utils.c index dd0ae1188e87..72ac9320e6a3 100644 --- a/fs/vboxsf/utils.c +++ b/fs/vboxsf/utils.c @@ -126,12 +126,12 @@ int vboxsf_init_inode(struct vboxsf_sbi *sbi, struct inode *inode, do_div(allocated, 512); inode->i_blocks = allocated; - inode->i_atime = ns_to_timespec64( - info->access_time.ns_relative_to_unix_epoch); - inode->i_ctime = ns_to_timespec64( - info->change_time.ns_relative_to_unix_epoch); - inode->i_mtime = ns_to_timespec64( - info->modification_time.ns_relative_to_unix_epoch); + inode_set_atime_to_ts(inode, + ns_to_timespec64(info->access_time.ns_relative_to_unix_epoch)); + inode_set_ctime_to_ts(inode, + ns_to_timespec64(info->change_time.ns_relative_to_unix_epoch)); + inode_set_mtime_to_ts(inode, + ns_to_timespec64(info->modification_time.ns_relative_to_unix_epoch)); return 0; } @@ -194,7 +194,7 @@ int vboxsf_inode_revalidate(struct dentry *dentry) struct vboxsf_sbi *sbi; struct vboxsf_inode *sf_i; struct shfl_fsobjinfo info; - struct timespec64 prev_mtime; + struct timespec64 mtime, prev_mtime; struct inode *inode; int err; @@ -202,7 +202,7 @@ int vboxsf_inode_revalidate(struct dentry *dentry) return -EINVAL; inode = d_inode(dentry); - prev_mtime = inode->i_mtime; + prev_mtime = inode_get_mtime(inode); sf_i = VBOXSF_I(inode); sbi = VBOXSF_SBI(dentry->d_sb); if (!sf_i->force_restat) { @@ -225,7 +225,8 @@ int vboxsf_inode_revalidate(struct dentry *dentry) * page-cache for it. Note this also gets triggered by our own writes, * this is unavoidable. */ - if (timespec64_compare(&inode->i_mtime, &prev_mtime) > 0) + mtime = inode_get_mtime(inode); + if (timespec64_compare(&mtime, &prev_mtime) > 0) invalidate_inode_pages2(inode->i_mapping); return 0; @@ -252,7 +253,7 @@ int vboxsf_getattr(struct mnt_idmap *idmap, const struct path *path, if (err) return err; - generic_fillattr(&nop_mnt_idmap, d_inode(dentry), kstat); + generic_fillattr(&nop_mnt_idmap, request_mask, d_inode(dentry), kstat); return 0; } diff --git a/fs/verity/fsverity_private.h b/fs/verity/fsverity_private.h index 49bf3a1eb2a0..d071a6e32581 100644 --- a/fs/verity/fsverity_private.h +++ b/fs/verity/fsverity_private.h @@ -118,16 +118,16 @@ void fsverity_free_info(struct fsverity_info *vi); int fsverity_get_descriptor(struct inode *inode, struct fsverity_descriptor **desc_ret); -int __init fsverity_init_info_cache(void); -void __init fsverity_exit_info_cache(void); +void __init fsverity_init_info_cache(void); /* signature.c */ #ifdef CONFIG_FS_VERITY_BUILTIN_SIGNATURES +extern int fsverity_require_signatures; int fsverity_verify_signature(const struct fsverity_info *vi, const u8 *signature, size_t sig_size); -int __init fsverity_init_signature(void); +void __init fsverity_init_signature(void); #else /* !CONFIG_FS_VERITY_BUILTIN_SIGNATURES */ static inline int fsverity_verify_signature(const struct fsverity_info *vi, @@ -136,15 +136,13 @@ fsverity_verify_signature(const struct fsverity_info *vi, return 0; } -static inline int fsverity_init_signature(void) +static inline void fsverity_init_signature(void) { - return 0; } #endif /* !CONFIG_FS_VERITY_BUILTIN_SIGNATURES */ /* verify.c */ -int __init fsverity_init_workqueue(void); -void __init fsverity_exit_workqueue(void); +void __init fsverity_init_workqueue(void); #endif /* _FSVERITY_PRIVATE_H */ diff --git a/fs/verity/hash_algs.c b/fs/verity/hash_algs.c index c598d2035476..6b08b1d9a7d7 100644 --- a/fs/verity/hash_algs.c +++ b/fs/verity/hash_algs.c @@ -226,6 +226,14 @@ void __init fsverity_check_hash_algs(void) if (!alg->name) continue; + /* + * 0 must never be allocated as an FS_VERITY_HASH_ALG_* value, + * as it is reserved for users that use 0 to mean unspecified or + * a default value. fs/verity/ itself doesn't care and doesn't + * have a default algorithm, but some users make use of this. + */ + BUG_ON(i == 0); + BUG_ON(alg->digest_size > FS_VERITY_MAX_DIGEST_SIZE); /* diff --git a/fs/verity/init.c b/fs/verity/init.c index 023905151035..a29f062f6047 100644 --- a/fs/verity/init.c +++ b/fs/verity/init.c @@ -9,6 +9,37 @@ #include <linux/ratelimit.h> +#ifdef CONFIG_SYSCTL +static struct ctl_table_header *fsverity_sysctl_header; + +static struct ctl_table fsverity_sysctl_table[] = { +#ifdef CONFIG_FS_VERITY_BUILTIN_SIGNATURES + { + .procname = "require_signatures", + .data = &fsverity_require_signatures, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, +#endif + { } +}; + +static void __init fsverity_init_sysctl(void) +{ + fsverity_sysctl_header = register_sysctl("fs/verity", + fsverity_sysctl_table); + if (!fsverity_sysctl_header) + panic("fsverity sysctl registration failed"); +} +#else /* CONFIG_SYSCTL */ +static inline void fsverity_init_sysctl(void) +{ +} +#endif /* !CONFIG_SYSCTL */ + void fsverity_msg(const struct inode *inode, const char *level, const char *fmt, ...) { @@ -33,28 +64,11 @@ void fsverity_msg(const struct inode *inode, const char *level, static int __init fsverity_init(void) { - int err; - fsverity_check_hash_algs(); - - err = fsverity_init_info_cache(); - if (err) - return err; - - err = fsverity_init_workqueue(); - if (err) - goto err_exit_info_cache; - - err = fsverity_init_signature(); - if (err) - goto err_exit_workqueue; - + fsverity_init_info_cache(); + fsverity_init_workqueue(); + fsverity_init_sysctl(); + fsverity_init_signature(); return 0; - -err_exit_workqueue: - fsverity_exit_workqueue(); -err_exit_info_cache: - fsverity_exit_info_cache(); - return err; } late_initcall(fsverity_init) diff --git a/fs/verity/open.c b/fs/verity/open.c index 1db5106a9c38..6c31a871b84b 100644 --- a/fs/verity/open.c +++ b/fs/verity/open.c @@ -408,18 +408,10 @@ void __fsverity_cleanup_inode(struct inode *inode) } EXPORT_SYMBOL_GPL(__fsverity_cleanup_inode); -int __init fsverity_init_info_cache(void) +void __init fsverity_init_info_cache(void) { - fsverity_info_cachep = KMEM_CACHE_USERCOPY(fsverity_info, - SLAB_RECLAIM_ACCOUNT, - file_digest); - if (!fsverity_info_cachep) - return -ENOMEM; - return 0; -} - -void __init fsverity_exit_info_cache(void) -{ - kmem_cache_destroy(fsverity_info_cachep); - fsverity_info_cachep = NULL; + fsverity_info_cachep = KMEM_CACHE_USERCOPY( + fsverity_info, + SLAB_RECLAIM_ACCOUNT | SLAB_PANIC, + file_digest); } diff --git a/fs/verity/signature.c b/fs/verity/signature.c index 72034bc71c9d..90c07573dd77 100644 --- a/fs/verity/signature.c +++ b/fs/verity/signature.c @@ -24,7 +24,7 @@ * /proc/sys/fs/verity/require_signatures * If 1, all verity files must have a valid builtin signature. */ -static int fsverity_require_signatures; +int fsverity_require_signatures; /* * Keyring that contains the trusted X.509 certificates. @@ -62,6 +62,22 @@ int fsverity_verify_signature(const struct fsverity_info *vi, return 0; } + if (fsverity_keyring->keys.nr_leaves_on_tree == 0) { + /* + * The ".fs-verity" keyring is empty, due to builtin signatures + * being supported by the kernel but not actually being used. + * In this case, verify_pkcs7_signature() would always return an + * error, usually ENOKEY. It could also be EBADMSG if the + * PKCS#7 is malformed, but that isn't very important to + * distinguish. So, just skip to ENOKEY to avoid the attack + * surface of the PKCS#7 parser, which would otherwise be + * reachable by any task able to execute FS_IOC_ENABLE_VERITY. + */ + fsverity_err(inode, + "fs-verity keyring is empty, rejecting signed file!"); + return -ENOKEY; + } + d = kzalloc(sizeof(*d) + hash_alg->digest_size, GFP_KERNEL); if (!d) return -ENOMEM; @@ -93,59 +109,14 @@ int fsverity_verify_signature(const struct fsverity_info *vi, return 0; } -#ifdef CONFIG_SYSCTL -static struct ctl_table_header *fsverity_sysctl_header; - -static struct ctl_table fsverity_sysctl_table[] = { - { - .procname = "require_signatures", - .data = &fsverity_require_signatures, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, - }, - { } -}; - -static int __init fsverity_sysctl_init(void) -{ - fsverity_sysctl_header = register_sysctl("fs/verity", fsverity_sysctl_table); - if (!fsverity_sysctl_header) { - pr_err("sysctl registration failed!\n"); - return -ENOMEM; - } - return 0; -} -#else /* !CONFIG_SYSCTL */ -static inline int __init fsverity_sysctl_init(void) +void __init fsverity_init_signature(void) { - return 0; -} -#endif /* !CONFIG_SYSCTL */ - -int __init fsverity_init_signature(void) -{ - struct key *ring; - int err; - - ring = keyring_alloc(".fs-verity", KUIDT_INIT(0), KGIDT_INIT(0), - current_cred(), KEY_POS_SEARCH | + fsverity_keyring = + keyring_alloc(".fs-verity", KUIDT_INIT(0), KGIDT_INIT(0), + current_cred(), KEY_POS_SEARCH | KEY_USR_VIEW | KEY_USR_READ | KEY_USR_WRITE | KEY_USR_SEARCH | KEY_USR_SETATTR, - KEY_ALLOC_NOT_IN_QUOTA, NULL, NULL); - if (IS_ERR(ring)) - return PTR_ERR(ring); - - err = fsverity_sysctl_init(); - if (err) - goto err_put_ring; - - fsverity_keyring = ring; - return 0; - -err_put_ring: - key_put(ring); - return err; + KEY_ALLOC_NOT_IN_QUOTA, NULL, NULL); + if (IS_ERR(fsverity_keyring)) + panic("failed to allocate \".fs-verity\" keyring"); } diff --git a/fs/verity/verify.c b/fs/verity/verify.c index 433cef51f5f6..904ccd7e8e16 100644 --- a/fs/verity/verify.c +++ b/fs/verity/verify.c @@ -346,7 +346,7 @@ void fsverity_enqueue_verify_work(struct work_struct *work) } EXPORT_SYMBOL_GPL(fsverity_enqueue_verify_work); -int __init fsverity_init_workqueue(void) +void __init fsverity_init_workqueue(void) { /* * Use a high-priority workqueue to prioritize verification work, which @@ -360,12 +360,5 @@ int __init fsverity_init_workqueue(void) WQ_HIGHPRI, num_online_cpus()); if (!fsverity_read_workqueue) - return -ENOMEM; - return 0; -} - -void __init fsverity_exit_workqueue(void) -{ - destroy_workqueue(fsverity_read_workqueue); - fsverity_read_workqueue = NULL; + panic("failed to allocate fsverity_read_queue"); } diff --git a/fs/xattr.c b/fs/xattr.c index e7bbb7f57557..09d927603433 100644 --- a/fs/xattr.c +++ b/fs/xattr.c @@ -56,7 +56,7 @@ strcmp_prefix(const char *a, const char *a_prefix) static const struct xattr_handler * xattr_resolve_name(struct inode *inode, const char **name) { - const struct xattr_handler **handlers = inode->i_sb->s_xattr; + const struct xattr_handler * const *handlers = inode->i_sb->s_xattr; const struct xattr_handler *handler; if (!(inode->i_opflags & IOP_XATTR)) { @@ -162,7 +162,7 @@ xattr_permission(struct mnt_idmap *idmap, struct inode *inode, int xattr_supports_user_prefix(struct inode *inode) { - const struct xattr_handler **handlers = inode->i_sb->s_xattr; + const struct xattr_handler * const *handlers = inode->i_sb->s_xattr; const struct xattr_handler *handler; if (!(inode->i_opflags & IOP_XATTR)) { @@ -999,7 +999,7 @@ int xattr_list_one(char **buffer, ssize_t *remaining_size, const char *name) ssize_t generic_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size) { - const struct xattr_handler *handler, **handlers = dentry->d_sb->s_xattr; + const struct xattr_handler *handler, * const *handlers = dentry->d_sb->s_xattr; ssize_t remaining_size = buffer_size; int err = 0; @@ -1040,12 +1040,32 @@ const char *xattr_full_name(const struct xattr_handler *handler, EXPORT_SYMBOL(xattr_full_name); /** - * free_simple_xattr - free an xattr object + * simple_xattr_space - estimate the memory used by a simple xattr + * @name: the full name of the xattr + * @size: the size of its value + * + * This takes no account of how much larger the two slab objects actually are: + * that would depend on the slab implementation, when what is required is a + * deterministic number, which grows with name length and size and quantity. + * + * Return: The approximate number of bytes of memory used by such an xattr. + */ +size_t simple_xattr_space(const char *name, size_t size) +{ + /* + * Use "40" instead of sizeof(struct simple_xattr), to return the + * same result on 32-bit and 64-bit, and even if simple_xattr grows. + */ + return 40 + size + strlen(name); +} + +/** + * simple_xattr_free - free an xattr object * @xattr: the xattr object * * Free the xattr object. Can handle @xattr being NULL. */ -static inline void free_simple_xattr(struct simple_xattr *xattr) +void simple_xattr_free(struct simple_xattr *xattr) { if (xattr) kfree(xattr->name); @@ -1073,7 +1093,7 @@ struct simple_xattr *simple_xattr_alloc(const void *value, size_t size) if (len < sizeof(*new_xattr)) return NULL; - new_xattr = kvmalloc(len, GFP_KERNEL); + new_xattr = kvmalloc(len, GFP_KERNEL_ACCOUNT); if (!new_xattr) return NULL; @@ -1164,7 +1184,6 @@ int simple_xattr_get(struct simple_xattrs *xattrs, const char *name, * @value: the value to store along the xattr * @size: the size of @value * @flags: the flags determining how to set the xattr - * @removed_size: the size of the removed xattr * * Set a new xattr object. * If @value is passed a new xattr object will be allocated. If XATTR_REPLACE @@ -1181,29 +1200,27 @@ int simple_xattr_get(struct simple_xattrs *xattrs, const char *name, * nothing if XATTR_CREATE is specified in @flags or @flags is zero. For * XATTR_REPLACE we fail as mentioned above. * - * Return: On success zero and on error a negative error code is returned. + * Return: On success, the removed or replaced xattr is returned, to be freed + * by the caller; or NULL if none. On failure a negative error code is returned. */ -int simple_xattr_set(struct simple_xattrs *xattrs, const char *name, - const void *value, size_t size, int flags, - ssize_t *removed_size) +struct simple_xattr *simple_xattr_set(struct simple_xattrs *xattrs, + const char *name, const void *value, + size_t size, int flags) { - struct simple_xattr *xattr = NULL, *new_xattr = NULL; + struct simple_xattr *old_xattr = NULL, *new_xattr = NULL; struct rb_node *parent = NULL, **rbp; int err = 0, ret; - if (removed_size) - *removed_size = -1; - /* value == NULL means remove */ if (value) { new_xattr = simple_xattr_alloc(value, size); if (!new_xattr) - return -ENOMEM; + return ERR_PTR(-ENOMEM); - new_xattr->name = kstrdup(name, GFP_KERNEL); + new_xattr->name = kstrdup(name, GFP_KERNEL_ACCOUNT); if (!new_xattr->name) { - free_simple_xattr(new_xattr); - return -ENOMEM; + simple_xattr_free(new_xattr); + return ERR_PTR(-ENOMEM); } } @@ -1217,12 +1234,12 @@ int simple_xattr_set(struct simple_xattrs *xattrs, const char *name, else if (ret > 0) rbp = &(*rbp)->rb_right; else - xattr = rb_entry(*rbp, struct simple_xattr, rb_node); - if (xattr) + old_xattr = rb_entry(*rbp, struct simple_xattr, rb_node); + if (old_xattr) break; } - if (xattr) { + if (old_xattr) { /* Fail if XATTR_CREATE is requested and the xattr exists. */ if (flags & XATTR_CREATE) { err = -EEXIST; @@ -1230,12 +1247,10 @@ int simple_xattr_set(struct simple_xattrs *xattrs, const char *name, } if (new_xattr) - rb_replace_node(&xattr->rb_node, &new_xattr->rb_node, - &xattrs->rb_root); + rb_replace_node(&old_xattr->rb_node, + &new_xattr->rb_node, &xattrs->rb_root); else - rb_erase(&xattr->rb_node, &xattrs->rb_root); - if (!err && removed_size) - *removed_size = xattr->size; + rb_erase(&old_xattr->rb_node, &xattrs->rb_root); } else { /* Fail if XATTR_REPLACE is requested but no xattr is found. */ if (flags & XATTR_REPLACE) { @@ -1260,12 +1275,10 @@ int simple_xattr_set(struct simple_xattrs *xattrs, const char *name, out_unlock: write_unlock(&xattrs->lock); - if (err) - free_simple_xattr(new_xattr); - else - free_simple_xattr(xattr); - return err; - + if (!err) + return old_xattr; + simple_xattr_free(new_xattr); + return ERR_PTR(err); } static bool xattr_is_trusted(const char *name) @@ -1370,14 +1383,17 @@ void simple_xattrs_init(struct simple_xattrs *xattrs) /** * simple_xattrs_free - free xattrs * @xattrs: xattr header whose xattrs to destroy + * @freed_space: approximate number of bytes of memory freed from @xattrs * * Destroy all xattrs in @xattr. When this is called no one can hold a * reference to any of the xattrs anymore. */ -void simple_xattrs_free(struct simple_xattrs *xattrs) +void simple_xattrs_free(struct simple_xattrs *xattrs, size_t *freed_space) { struct rb_node *rbp; + if (freed_space) + *freed_space = 0; rbp = rb_first(&xattrs->rb_root); while (rbp) { struct simple_xattr *xattr; @@ -1386,7 +1402,10 @@ void simple_xattrs_free(struct simple_xattrs *xattrs) rbp_next = rb_next(rbp); xattr = rb_entry(rbp, struct simple_xattr, rb_node); rb_erase(&xattr->rb_node, &xattrs->rb_root); - free_simple_xattr(xattr); + if (freed_space) + *freed_space += simple_xattr_space(xattr->name, + xattr->size); + simple_xattr_free(xattr); rbp = rbp_next; } } diff --git a/fs/xfs/Kconfig b/fs/xfs/Kconfig index 52e1823241fb..567fb37274d3 100644 --- a/fs/xfs/Kconfig +++ b/fs/xfs/Kconfig @@ -128,6 +128,7 @@ config XFS_ONLINE_SCRUB bool "XFS online metadata check support" default n depends on XFS_FS + depends on TMPFS && SHMEM select XFS_DRAIN_INTENTS help If you say Y here you will be able to check metadata on a @@ -142,6 +143,23 @@ config XFS_ONLINE_SCRUB If unsure, say N. +config XFS_ONLINE_SCRUB_STATS + bool "XFS online metadata check usage data collection" + default y + depends on XFS_ONLINE_SCRUB + select DEBUG_FS + help + If you say Y here, the kernel will gather usage data about + the online metadata check subsystem. This includes the number + of invocations, the outcomes, and the results of repairs, if any. + This may slow down scrub slightly due to the use of high precision + timers and the need to merge per-invocation information into the + filesystem counters. + + Usage data are collected in /sys/kernel/debug/xfs/scrub. + + If unsure, say N. + config XFS_ONLINE_REPAIR bool "XFS online metadata repair support" default n diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile index 16e4eb431230..7762c01a85cf 100644 --- a/fs/xfs/Makefile +++ b/fs/xfs/Makefile @@ -164,15 +164,24 @@ xfs-y += $(addprefix scrub/, \ rmap.o \ scrub.o \ symlink.o \ + xfarray.o \ + xfile.o \ + ) + +xfs-$(CONFIG_XFS_ONLINE_SCRUB_STATS) += scrub/stats.o + +xfs-$(CONFIG_XFS_RT) += $(addprefix scrub/, \ + rtbitmap.o \ + rtsummary.o \ ) -xfs-$(CONFIG_XFS_RT) += scrub/rtbitmap.o xfs-$(CONFIG_XFS_QUOTA) += scrub/quota.o # online repair ifeq ($(CONFIG_XFS_ONLINE_REPAIR),y) xfs-y += $(addprefix scrub/, \ agheader_repair.o \ + reap.o \ repair.o \ ) endif diff --git a/fs/xfs/libxfs/xfs_ag.c b/fs/xfs/libxfs/xfs_ag.c index e9cc481b4ddf..f9f4d694640d 100644 --- a/fs/xfs/libxfs/xfs_ag.c +++ b/fs/xfs/libxfs/xfs_ag.c @@ -1001,6 +1001,12 @@ xfs_ag_shrink_space( error = -ENOSPC; goto resv_init_out; } + + /* Update perag geometry */ + pag->block_count -= delta; + __xfs_agino_range(pag->pag_mount, pag->block_count, &pag->agino_min, + &pag->agino_max); + xfs_ialloc_log_agi(*tpp, agibp, XFS_AGI_LENGTH); xfs_alloc_log_agf(*tpp, agfbp, XFS_AGF_LENGTH); return 0; diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c index 3069194527dd..100ab5931b31 100644 --- a/fs/xfs/libxfs/xfs_alloc.c +++ b/fs/xfs/libxfs/xfs_alloc.c @@ -2275,16 +2275,37 @@ xfs_alloc_min_freelist( ASSERT(mp->m_alloc_maxlevels > 0); + /* + * For a btree shorter than the maximum height, the worst case is that + * every level gets split and a new level is added, then while inserting + * another entry to refill the AGFL, every level under the old root gets + * split again. This is: + * + * (full height split reservation) + (AGFL refill split height) + * = (current height + 1) + (current height - 1) + * = (new height) + (new height - 2) + * = 2 * new height - 2 + * + * For a btree of maximum height, the worst case is that every level + * under the root gets split, then while inserting another entry to + * refill the AGFL, every level under the root gets split again. This is + * also: + * + * 2 * (current height - 1) + * = 2 * (new height - 1) + * = 2 * new height - 2 + */ + /* space needed by-bno freespace btree */ min_free = min_t(unsigned int, levels[XFS_BTNUM_BNOi] + 1, - mp->m_alloc_maxlevels); + mp->m_alloc_maxlevels) * 2 - 2; /* space needed by-size freespace btree */ min_free += min_t(unsigned int, levels[XFS_BTNUM_CNTi] + 1, - mp->m_alloc_maxlevels); + mp->m_alloc_maxlevels) * 2 - 2; /* space needed reverse mapping used space btree */ if (xfs_has_rmapbt(mp)) min_free += min_t(unsigned int, levels[XFS_BTNUM_RMAPi] + 1, - mp->m_rmap_maxlevels); + mp->m_rmap_maxlevels) * 2 - 2; return min_free; } diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index 30c931b38853..be62acffad6c 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -21,7 +21,7 @@ #include "xfs_bmap.h" #include "xfs_bmap_util.h" #include "xfs_bmap_btree.h" -#include "xfs_rtalloc.h" +#include "xfs_rtbitmap.h" #include "xfs_errortag.h" #include "xfs_error.h" #include "xfs_quota.h" @@ -2989,7 +2989,7 @@ xfs_bmap_extsize_align( * If realtime, and the result isn't a multiple of the realtime * extent size we need to remove blocks until it is. */ - if (rt && (temp = (align_alen % mp->m_sb.sb_rextsize))) { + if (rt && (temp = xfs_extlen_to_rtxmod(mp, align_alen))) { /* * We're not covering the original request, or * we won't be able to once we fix the length. @@ -3016,7 +3016,7 @@ xfs_bmap_extsize_align( else { align_alen -= orig_off - align_off; align_off = orig_off; - align_alen -= align_alen % mp->m_sb.sb_rextsize; + align_alen -= xfs_extlen_to_rtxmod(mp, align_alen); } /* * Result doesn't cover the request, fail it. @@ -4826,12 +4826,8 @@ xfs_bmap_del_extent_delay( ASSERT(got->br_startoff <= del->br_startoff); ASSERT(got_endoff >= del_endoff); - if (isrt) { - uint64_t rtexts = XFS_FSB_TO_B(mp, del->br_blockcount); - - do_div(rtexts, mp->m_sb.sb_rextsize); - xfs_mod_frextents(mp, rtexts); - } + if (isrt) + xfs_mod_frextents(mp, xfs_rtb_to_rtx(mp, del->br_blockcount)); /* * Update the inode delalloc counter now and wait to update the @@ -5057,33 +5053,20 @@ xfs_bmap_del_extent_real( flags = XFS_ILOG_CORE; if (whichfork == XFS_DATA_FORK && XFS_IS_REALTIME_INODE(ip)) { - xfs_filblks_t len; - xfs_extlen_t mod; - - len = div_u64_rem(del->br_blockcount, mp->m_sb.sb_rextsize, - &mod); - ASSERT(mod == 0); - if (!(bflags & XFS_BMAPI_REMAP)) { - xfs_fsblock_t bno; - - bno = div_u64_rem(del->br_startblock, - mp->m_sb.sb_rextsize, &mod); - ASSERT(mod == 0); - - error = xfs_rtfree_extent(tp, bno, (xfs_extlen_t)len); + error = xfs_rtfree_blocks(tp, del->br_startblock, + del->br_blockcount); if (error) goto done; } do_fx = 0; - nblks = len * mp->m_sb.sb_rextsize; qfield = XFS_TRANS_DQ_RTBCOUNT; } else { do_fx = 1; - nblks = del->br_blockcount; qfield = XFS_TRANS_DQ_BCOUNT; } + nblks = del->br_blockcount; del_endblock = del->br_startblock + del->br_blockcount; if (cur) { @@ -5289,7 +5272,6 @@ __xfs_bunmapi( int tmp_logflags; /* partial logging flags */ int wasdel; /* was a delayed alloc extent */ int whichfork; /* data or attribute fork */ - xfs_fsblock_t sum; xfs_filblks_t len = *rlen; /* length to unmap in file */ xfs_fileoff_t end; struct xfs_iext_cursor icur; @@ -5384,8 +5366,8 @@ __xfs_bunmapi( if (!isrt) goto delete; - sum = del.br_startblock + del.br_blockcount; - div_u64_rem(sum, mp->m_sb.sb_rextsize, &mod); + mod = xfs_rtb_to_rtxoff(mp, + del.br_startblock + del.br_blockcount); if (mod) { /* * Realtime extent not lined up at the end. @@ -5432,7 +5414,8 @@ __xfs_bunmapi( goto error0; goto nodelete; } - div_u64_rem(del.br_startblock, mp->m_sb.sb_rextsize, &mod); + + mod = xfs_rtb_to_rtxoff(mp, del.br_startblock); if (mod) { xfs_extlen_t off = mp->m_sb.sb_rextsize - mod; @@ -6209,8 +6192,8 @@ xfs_bmap_validate_extent( return __this_address; if (XFS_IS_REALTIME_INODE(ip) && whichfork == XFS_DATA_FORK) { - if (!xfs_verify_rtext(mp, irec->br_startblock, - irec->br_blockcount)) + if (!xfs_verify_rtbext(mp, irec->br_startblock, + irec->br_blockcount)) return __this_address; } else { if (!xfs_verify_fsbext(mp, irec->br_startblock, diff --git a/fs/xfs/libxfs/xfs_defer.c b/fs/xfs/libxfs/xfs_defer.c index bcfb6a4203cd..f71679ce23b9 100644 --- a/fs/xfs/libxfs/xfs_defer.c +++ b/fs/xfs/libxfs/xfs_defer.c @@ -245,21 +245,18 @@ xfs_defer_create_intents( return ret; } -/* Abort all the intents that were committed. */ STATIC void -xfs_defer_trans_abort( - struct xfs_trans *tp, - struct list_head *dop_pending) +xfs_defer_pending_abort( + struct xfs_mount *mp, + struct list_head *dop_list) { struct xfs_defer_pending *dfp; const struct xfs_defer_op_type *ops; - trace_xfs_defer_trans_abort(tp, _RET_IP_); - /* Abort intent items that don't have a done item. */ - list_for_each_entry(dfp, dop_pending, dfp_list) { + list_for_each_entry(dfp, dop_list, dfp_list) { ops = defer_op_types[dfp->dfp_type]; - trace_xfs_defer_pending_abort(tp->t_mountp, dfp); + trace_xfs_defer_pending_abort(mp, dfp); if (dfp->dfp_intent && !dfp->dfp_done) { ops->abort_intent(dfp->dfp_intent); dfp->dfp_intent = NULL; @@ -267,6 +264,16 @@ xfs_defer_trans_abort( } } +/* Abort all the intents that were committed. */ +STATIC void +xfs_defer_trans_abort( + struct xfs_trans *tp, + struct list_head *dop_pending) +{ + trace_xfs_defer_trans_abort(tp, _RET_IP_); + xfs_defer_pending_abort(tp->t_mountp, dop_pending); +} + /* * Capture resources that the caller said not to release ("held") when the * transaction commits. Caller is responsible for zero-initializing @dres. @@ -756,12 +763,13 @@ xfs_defer_ops_capture( /* Release all resources that we used to capture deferred ops. */ void -xfs_defer_ops_capture_free( +xfs_defer_ops_capture_abort( struct xfs_mount *mp, struct xfs_defer_capture *dfc) { unsigned short i; + xfs_defer_pending_abort(mp, &dfc->dfc_dfops); xfs_defer_cancel_list(mp, &dfc->dfc_dfops); for (i = 0; i < dfc->dfc_held.dr_bufs; i++) @@ -802,7 +810,7 @@ xfs_defer_ops_capture_and_commit( /* Commit the transaction and add the capture structure to the list. */ error = xfs_trans_commit(tp); if (error) { - xfs_defer_ops_capture_free(mp, dfc); + xfs_defer_ops_capture_abort(mp, dfc); return error; } diff --git a/fs/xfs/libxfs/xfs_defer.h b/fs/xfs/libxfs/xfs_defer.h index 114a3a4930a3..8788ad5f6a73 100644 --- a/fs/xfs/libxfs/xfs_defer.h +++ b/fs/xfs/libxfs/xfs_defer.h @@ -121,7 +121,7 @@ int xfs_defer_ops_capture_and_commit(struct xfs_trans *tp, struct list_head *capture_list); void xfs_defer_ops_continue(struct xfs_defer_capture *d, struct xfs_trans *tp, struct xfs_defer_resources *dres); -void xfs_defer_ops_capture_free(struct xfs_mount *mp, +void xfs_defer_ops_capture_abort(struct xfs_mount *mp, struct xfs_defer_capture *d); void xfs_defer_resources_rele(struct xfs_defer_resources *dres); diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h index 371dc07233e0..9a88aba1589f 100644 --- a/fs/xfs/libxfs/xfs_format.h +++ b/fs/xfs/libxfs/xfs_format.h @@ -98,7 +98,7 @@ typedef struct xfs_sb { uint32_t sb_blocksize; /* logical block size, bytes */ xfs_rfsblock_t sb_dblocks; /* number of data blocks */ xfs_rfsblock_t sb_rblocks; /* number of realtime blocks */ - xfs_rtblock_t sb_rextents; /* number of realtime extents */ + xfs_rtbxlen_t sb_rextents; /* number of realtime extents */ uuid_t sb_uuid; /* user-visible file system unique id */ xfs_fsblock_t sb_logstart; /* starting block of log if internal */ xfs_ino_t sb_rootino; /* root inode number */ @@ -691,6 +691,22 @@ struct xfs_agfl { xfs_daddr_to_agno(mp, (d) + (len) - 1))) /* + * Realtime bitmap information is accessed by the word, which is currently + * stored in host-endian format. + */ +union xfs_rtword_raw { + __u32 old; +}; + +/* + * Realtime summary counts are accessed by the word, which is currently + * stored in host-endian format. + */ +union xfs_suminfo_raw { + __u32 old; +}; + +/* * XFS Timestamps * ============== * @@ -1142,24 +1158,10 @@ static inline bool xfs_dinode_has_large_extent_counts( #define XFS_BLOCKSIZE(mp) ((mp)->m_sb.sb_blocksize) #define XFS_BLOCKMASK(mp) ((mp)->m_blockmask) -#define XFS_BLOCKWSIZE(mp) ((mp)->m_blockwsize) -#define XFS_BLOCKWMASK(mp) ((mp)->m_blockwmask) /* - * RT Summary and bit manipulation macros. + * RT bit manipulation macros. */ -#define XFS_SUMOFFS(mp,ls,bb) ((int)((ls) * (mp)->m_sb.sb_rbmblocks + (bb))) -#define XFS_SUMOFFSTOBLOCK(mp,s) \ - (((s) * (uint)sizeof(xfs_suminfo_t)) >> (mp)->m_sb.sb_blocklog) -#define XFS_SUMPTR(mp,bp,so) \ - ((xfs_suminfo_t *)((bp)->b_addr + \ - (((so) * (uint)sizeof(xfs_suminfo_t)) & XFS_BLOCKMASK(mp)))) - -#define XFS_BITTOBLOCK(mp,bi) ((bi) >> (mp)->m_blkbit_log) -#define XFS_BLOCKTOBIT(mp,bb) ((bb) << (mp)->m_blkbit_log) -#define XFS_BITTOWORD(mp,bi) \ - ((int)(((bi) >> XFS_NBWORDLOG) & XFS_BLOCKWMASK(mp))) - #define XFS_RTMIN(a,b) ((a) < (b) ? (a) : (b)) #define XFS_RTMAX(a,b) ((a) > (b) ? (a) : (b)) diff --git a/fs/xfs/libxfs/xfs_fs.h b/fs/xfs/libxfs/xfs_fs.h index 2cbf9ea39b8c..6360073865db 100644 --- a/fs/xfs/libxfs/xfs_fs.h +++ b/fs/xfs/libxfs/xfs_fs.h @@ -743,7 +743,11 @@ struct xfs_scrub_metadata { */ #define XFS_SCRUB_OFLAG_NO_REPAIR_NEEDED (1u << 7) -#define XFS_SCRUB_FLAGS_IN (XFS_SCRUB_IFLAG_REPAIR) +/* i: Rebuild the data structure. */ +#define XFS_SCRUB_IFLAG_FORCE_REBUILD (1u << 8) + +#define XFS_SCRUB_FLAGS_IN (XFS_SCRUB_IFLAG_REPAIR | \ + XFS_SCRUB_IFLAG_FORCE_REBUILD) #define XFS_SCRUB_FLAGS_OUT (XFS_SCRUB_OFLAG_CORRUPT | \ XFS_SCRUB_OFLAG_PREEN | \ XFS_SCRUB_OFLAG_XFAIL | \ diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c index 758aacd8166b..137a65bda95d 100644 --- a/fs/xfs/libxfs/xfs_inode_buf.c +++ b/fs/xfs/libxfs/xfs_inode_buf.c @@ -220,9 +220,12 @@ xfs_inode_from_disk( * a time before epoch is converted to a time long after epoch * on 64 bit systems. */ - inode->i_atime = xfs_inode_from_disk_ts(from, from->di_atime); - inode->i_mtime = xfs_inode_from_disk_ts(from, from->di_mtime); - inode->i_ctime = xfs_inode_from_disk_ts(from, from->di_ctime); + inode_set_atime_to_ts(inode, + xfs_inode_from_disk_ts(from, from->di_atime)); + inode_set_mtime_to_ts(inode, + xfs_inode_from_disk_ts(from, from->di_mtime)); + inode_set_ctime_to_ts(inode, + xfs_inode_from_disk_ts(from, from->di_ctime)); ip->i_disk_size = be64_to_cpu(from->di_size); ip->i_nblocks = be64_to_cpu(from->di_nblocks); @@ -314,9 +317,9 @@ xfs_inode_to_disk( to->di_projid_lo = cpu_to_be16(ip->i_projid & 0xffff); to->di_projid_hi = cpu_to_be16(ip->i_projid >> 16); - to->di_atime = xfs_inode_to_disk_ts(ip, inode->i_atime); - to->di_mtime = xfs_inode_to_disk_ts(ip, inode->i_mtime); - to->di_ctime = xfs_inode_to_disk_ts(ip, inode->i_ctime); + to->di_atime = xfs_inode_to_disk_ts(ip, inode_get_atime(inode)); + to->di_mtime = xfs_inode_to_disk_ts(ip, inode_get_mtime(inode)); + to->di_ctime = xfs_inode_to_disk_ts(ip, inode_get_ctime(inode)); to->di_nlink = cpu_to_be32(inode->i_nlink); to->di_gen = cpu_to_be32(inode->i_generation); to->di_mode = cpu_to_be16(inode->i_mode); @@ -507,6 +510,9 @@ xfs_dinode_verify( if (mode && nextents + naextents > nblocks) return __this_address; + if (nextents + naextents == 0 && nblocks != 0) + return __this_address; + if (S_ISDIR(mode) && nextents > mp->m_dir_geo->max_extents) return __this_address; diff --git a/fs/xfs/libxfs/xfs_log_recover.h b/fs/xfs/libxfs/xfs_log_recover.h index 2420865f3007..a5100a11faf9 100644 --- a/fs/xfs/libxfs/xfs_log_recover.h +++ b/fs/xfs/libxfs/xfs_log_recover.h @@ -131,4 +131,26 @@ void xlog_check_buf_cancel_table(struct xlog *log); #define xlog_check_buf_cancel_table(log) do { } while (0) #endif +/* + * Transform a regular reservation into one suitable for recovery of a log + * intent item. + * + * Intent recovery only runs a single step of the transaction chain and defers + * the rest to a separate transaction. Therefore, we reduce logcount to 1 here + * to avoid livelocks if the log grant space is nearly exhausted due to the + * recovered intent pinning the tail. Keep the same logflags to avoid tripping + * asserts elsewhere. Struct copies abound below. + */ +static inline struct xfs_trans_res +xlog_recover_resv(const struct xfs_trans_res *r) +{ + struct xfs_trans_res ret = { + .tr_logres = r->tr_logres, + .tr_logcount = 1, + .tr_logflags = r->tr_logflags, + }; + + return ret; +} + #endif /* __XFS_LOG_RECOVER_H__ */ diff --git a/fs/xfs/libxfs/xfs_rtbitmap.c b/fs/xfs/libxfs/xfs_rtbitmap.c index fa180ab66b73..c269d704314d 100644 --- a/fs/xfs/libxfs/xfs_rtbitmap.c +++ b/fs/xfs/libxfs/xfs_rtbitmap.c @@ -16,6 +16,7 @@ #include "xfs_trans.h" #include "xfs_rtalloc.h" #include "xfs_error.h" +#include "xfs_rtbitmap.h" /* * Realtime allocator bitmap functions shared with userspace. @@ -46,25 +47,69 @@ const struct xfs_buf_ops xfs_rtbuf_ops = { .verify_write = xfs_rtbuf_verify_write, }; +/* Release cached rt bitmap and summary buffers. */ +void +xfs_rtbuf_cache_relse( + struct xfs_rtalloc_args *args) +{ + if (args->rbmbp) { + xfs_trans_brelse(args->tp, args->rbmbp); + args->rbmbp = NULL; + args->rbmoff = NULLFILEOFF; + } + if (args->sumbp) { + xfs_trans_brelse(args->tp, args->sumbp); + args->sumbp = NULL; + args->sumoff = NULLFILEOFF; + } +} + /* * Get a buffer for the bitmap or summary file block specified. * The buffer is returned read and locked. */ int xfs_rtbuf_get( - xfs_mount_t *mp, /* file system mount structure */ - xfs_trans_t *tp, /* transaction pointer */ - xfs_rtblock_t block, /* block number in bitmap or summary */ - int issum, /* is summary not bitmap */ - struct xfs_buf **bpp) /* output: buffer for the block */ + struct xfs_rtalloc_args *args, + xfs_fileoff_t block, /* block number in bitmap or summary */ + int issum) /* is summary not bitmap */ { - struct xfs_buf *bp; /* block buffer, result */ - xfs_inode_t *ip; /* bitmap or summary inode */ - xfs_bmbt_irec_t map; - int nmap = 1; - int error; /* error value */ + struct xfs_mount *mp = args->mp; + struct xfs_buf **cbpp; /* cached block buffer */ + xfs_fileoff_t *coffp; /* cached block number */ + struct xfs_buf *bp; /* block buffer, result */ + struct xfs_inode *ip; /* bitmap or summary inode */ + struct xfs_bmbt_irec map; + enum xfs_blft type; + int nmap = 1; + int error; - ip = issum ? mp->m_rsumip : mp->m_rbmip; + if (issum) { + cbpp = &args->sumbp; + coffp = &args->sumoff; + ip = mp->m_rsumip; + type = XFS_BLFT_RTSUMMARY_BUF; + } else { + cbpp = &args->rbmbp; + coffp = &args->rbmoff; + ip = mp->m_rbmip; + type = XFS_BLFT_RTBITMAP_BUF; + } + + /* + * If we have a cached buffer, and the block number matches, use that. + */ + if (*cbpp && *coffp == block) + return 0; + + /* + * Otherwise we have to have to get the buffer. If there was an old + * one, get rid of it first. + */ + if (*cbpp) { + xfs_trans_brelse(args->tp, *cbpp); + *cbpp = NULL; + } error = xfs_bmapi_read(ip, block, 1, &map, &nmap, 0); if (error) @@ -74,15 +119,15 @@ xfs_rtbuf_get( return -EFSCORRUPTED; ASSERT(map.br_startblock != NULLFSBLOCK); - error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, + error = xfs_trans_read_buf(mp, args->tp, mp->m_ddev_targp, XFS_FSB_TO_DADDR(mp, map.br_startblock), mp->m_bsize, 0, &bp, &xfs_rtbuf_ops); if (error) return error; - xfs_trans_buf_set_type(tp, bp, issum ? XFS_BLFT_RTSUMMARY_BUF - : XFS_BLFT_RTBITMAP_BUF); - *bpp = bp; + xfs_trans_buf_set_type(args->tp, bp, type); + *cbpp = bp; + *coffp = block; return 0; } @@ -92,47 +137,44 @@ xfs_rtbuf_get( */ int xfs_rtfind_back( - xfs_mount_t *mp, /* file system mount point */ - xfs_trans_t *tp, /* transaction pointer */ - xfs_rtblock_t start, /* starting block to look at */ - xfs_rtblock_t limit, /* last block to look at */ - xfs_rtblock_t *rtblock) /* out: start block found */ + struct xfs_rtalloc_args *args, + xfs_rtxnum_t start, /* starting rtext to look at */ + xfs_rtxnum_t limit, /* last rtext to look at */ + xfs_rtxnum_t *rtx) /* out: start rtext found */ { - xfs_rtword_t *b; /* current word in buffer */ - int bit; /* bit number in the word */ - xfs_rtblock_t block; /* bitmap block number */ - struct xfs_buf *bp; /* buf for the block */ - xfs_rtword_t *bufp; /* starting word in buffer */ - int error; /* error value */ - xfs_rtblock_t firstbit; /* first useful bit in the word */ - xfs_rtblock_t i; /* current bit number rel. to start */ - xfs_rtblock_t len; /* length of inspected area */ - xfs_rtword_t mask; /* mask of relevant bits for value */ - xfs_rtword_t want; /* mask for "good" values */ - xfs_rtword_t wdiff; /* difference from wanted value */ - int word; /* word number in the buffer */ + struct xfs_mount *mp = args->mp; + int bit; /* bit number in the word */ + xfs_fileoff_t block; /* bitmap block number */ + int error; /* error value */ + xfs_rtxnum_t firstbit; /* first useful bit in the word */ + xfs_rtxnum_t i; /* current bit number rel. to start */ + xfs_rtxnum_t len; /* length of inspected area */ + xfs_rtword_t mask; /* mask of relevant bits for value */ + xfs_rtword_t want; /* mask for "good" values */ + xfs_rtword_t wdiff; /* difference from wanted value */ + xfs_rtword_t incore; + unsigned int word; /* word number in the buffer */ /* * Compute and read in starting bitmap block for starting block. */ - block = XFS_BITTOBLOCK(mp, start); - error = xfs_rtbuf_get(mp, tp, block, 0, &bp); - if (error) { + block = xfs_rtx_to_rbmblock(mp, start); + error = xfs_rtbitmap_read_buf(args, block); + if (error) return error; - } - bufp = bp->b_addr; + /* * Get the first word's index & point to it. */ - word = XFS_BITTOWORD(mp, start); - b = &bufp[word]; + word = xfs_rtx_to_rbmword(mp, start); bit = (int)(start & (XFS_NBWORD - 1)); len = start - limit + 1; /* * Compute match value, based on the bit at start: if 1 (free) * then all-ones, else all-zeroes. */ - want = (*b & ((xfs_rtword_t)1 << bit)) ? -1 : 0; + incore = xfs_rtbitmap_getword(args, word); + want = (incore & ((xfs_rtword_t)1 << bit)) ? -1 : 0; /* * If the starting position is not word-aligned, deal with the * partial word. @@ -149,13 +191,12 @@ xfs_rtfind_back( * Calculate the difference between the value there * and what we're looking for. */ - if ((wdiff = (*b ^ want) & mask)) { + if ((wdiff = (incore ^ want) & mask)) { /* * Different. Mark where we are and return. */ - xfs_trans_brelse(tp, bp); i = bit - XFS_RTHIBIT(wdiff); - *rtblock = start - i + 1; + *rtx = start - i + 1; return 0; } i = bit - firstbit + 1; @@ -167,19 +208,11 @@ xfs_rtfind_back( /* * If done with this block, get the previous one. */ - xfs_trans_brelse(tp, bp); - error = xfs_rtbuf_get(mp, tp, --block, 0, &bp); - if (error) { + error = xfs_rtbitmap_read_buf(args, --block); + if (error) return error; - } - bufp = bp->b_addr; - word = XFS_BLOCKWMASK(mp); - b = &bufp[word]; - } else { - /* - * Go on to the previous word in the buffer. - */ - b--; + + word = mp->m_blockwsize - 1; } } else { /* @@ -195,13 +228,13 @@ xfs_rtfind_back( /* * Compute difference between actual and desired value. */ - if ((wdiff = *b ^ want)) { + incore = xfs_rtbitmap_getword(args, word); + if ((wdiff = incore ^ want)) { /* * Different, mark where we are and return. */ - xfs_trans_brelse(tp, bp); i += XFS_NBWORD - 1 - XFS_RTHIBIT(wdiff); - *rtblock = start - i + 1; + *rtx = start - i + 1; return 0; } i += XFS_NBWORD; @@ -213,19 +246,11 @@ xfs_rtfind_back( /* * If done with this block, get the previous one. */ - xfs_trans_brelse(tp, bp); - error = xfs_rtbuf_get(mp, tp, --block, 0, &bp); - if (error) { + error = xfs_rtbitmap_read_buf(args, --block); + if (error) return error; - } - bufp = bp->b_addr; - word = XFS_BLOCKWMASK(mp); - b = &bufp[word]; - } else { - /* - * Go on to the previous word in the buffer. - */ - b--; + + word = mp->m_blockwsize - 1; } } /* @@ -242,13 +267,13 @@ xfs_rtfind_back( /* * Compute difference between actual and desired value. */ - if ((wdiff = (*b ^ want) & mask)) { + incore = xfs_rtbitmap_getword(args, word); + if ((wdiff = (incore ^ want) & mask)) { /* * Different, mark where we are and return. */ - xfs_trans_brelse(tp, bp); i += XFS_NBWORD - 1 - XFS_RTHIBIT(wdiff); - *rtblock = start - i + 1; + *rtx = start - i + 1; return 0; } else i = len; @@ -256,8 +281,7 @@ xfs_rtfind_back( /* * No match, return that we scanned the whole area. */ - xfs_trans_brelse(tp, bp); - *rtblock = start - i + 1; + *rtx = start - i + 1; return 0; } @@ -267,47 +291,44 @@ xfs_rtfind_back( */ int xfs_rtfind_forw( - xfs_mount_t *mp, /* file system mount point */ - xfs_trans_t *tp, /* transaction pointer */ - xfs_rtblock_t start, /* starting block to look at */ - xfs_rtblock_t limit, /* last block to look at */ - xfs_rtblock_t *rtblock) /* out: start block found */ + struct xfs_rtalloc_args *args, + xfs_rtxnum_t start, /* starting rtext to look at */ + xfs_rtxnum_t limit, /* last rtext to look at */ + xfs_rtxnum_t *rtx) /* out: start rtext found */ { - xfs_rtword_t *b; /* current word in buffer */ - int bit; /* bit number in the word */ - xfs_rtblock_t block; /* bitmap block number */ - struct xfs_buf *bp; /* buf for the block */ - xfs_rtword_t *bufp; /* starting word in buffer */ - int error; /* error value */ - xfs_rtblock_t i; /* current bit number rel. to start */ - xfs_rtblock_t lastbit; /* last useful bit in the word */ - xfs_rtblock_t len; /* length of inspected area */ - xfs_rtword_t mask; /* mask of relevant bits for value */ - xfs_rtword_t want; /* mask for "good" values */ - xfs_rtword_t wdiff; /* difference from wanted value */ - int word; /* word number in the buffer */ + struct xfs_mount *mp = args->mp; + int bit; /* bit number in the word */ + xfs_fileoff_t block; /* bitmap block number */ + int error; + xfs_rtxnum_t i; /* current bit number rel. to start */ + xfs_rtxnum_t lastbit;/* last useful bit in the word */ + xfs_rtxnum_t len; /* length of inspected area */ + xfs_rtword_t mask; /* mask of relevant bits for value */ + xfs_rtword_t want; /* mask for "good" values */ + xfs_rtword_t wdiff; /* difference from wanted value */ + xfs_rtword_t incore; + unsigned int word; /* word number in the buffer */ /* * Compute and read in starting bitmap block for starting block. */ - block = XFS_BITTOBLOCK(mp, start); - error = xfs_rtbuf_get(mp, tp, block, 0, &bp); - if (error) { + block = xfs_rtx_to_rbmblock(mp, start); + error = xfs_rtbitmap_read_buf(args, block); + if (error) return error; - } - bufp = bp->b_addr; + /* * Get the first word's index & point to it. */ - word = XFS_BITTOWORD(mp, start); - b = &bufp[word]; + word = xfs_rtx_to_rbmword(mp, start); bit = (int)(start & (XFS_NBWORD - 1)); len = limit - start + 1; /* * Compute match value, based on the bit at start: if 1 (free) * then all-ones, else all-zeroes. */ - want = (*b & ((xfs_rtword_t)1 << bit)) ? -1 : 0; + incore = xfs_rtbitmap_getword(args, word); + want = (incore & ((xfs_rtword_t)1 << bit)) ? -1 : 0; /* * If the starting position is not word-aligned, deal with the * partial word. @@ -323,13 +344,12 @@ xfs_rtfind_forw( * Calculate the difference between the value there * and what we're looking for. */ - if ((wdiff = (*b ^ want) & mask)) { + if ((wdiff = (incore ^ want) & mask)) { /* * Different. Mark where we are and return. */ - xfs_trans_brelse(tp, bp); i = XFS_RTLOBIT(wdiff) - bit; - *rtblock = start + i - 1; + *rtx = start + i - 1; return 0; } i = lastbit - bit; @@ -337,22 +357,15 @@ xfs_rtfind_forw( * Go on to next block if that's where the next word is * and we need the next word. */ - if (++word == XFS_BLOCKWSIZE(mp) && i < len) { + if (++word == mp->m_blockwsize && i < len) { /* * If done with this block, get the previous one. */ - xfs_trans_brelse(tp, bp); - error = xfs_rtbuf_get(mp, tp, ++block, 0, &bp); - if (error) { + error = xfs_rtbitmap_read_buf(args, ++block); + if (error) return error; - } - b = bufp = bp->b_addr; + word = 0; - } else { - /* - * Go on to the previous word in the buffer. - */ - b++; } } else { /* @@ -368,13 +381,13 @@ xfs_rtfind_forw( /* * Compute difference between actual and desired value. */ - if ((wdiff = *b ^ want)) { + incore = xfs_rtbitmap_getword(args, word); + if ((wdiff = incore ^ want)) { /* * Different, mark where we are and return. */ - xfs_trans_brelse(tp, bp); i += XFS_RTLOBIT(wdiff); - *rtblock = start + i - 1; + *rtx = start + i - 1; return 0; } i += XFS_NBWORD; @@ -382,22 +395,15 @@ xfs_rtfind_forw( * Go on to next block if that's where the next word is * and we need the next word. */ - if (++word == XFS_BLOCKWSIZE(mp) && i < len) { + if (++word == mp->m_blockwsize && i < len) { /* * If done with this block, get the next one. */ - xfs_trans_brelse(tp, bp); - error = xfs_rtbuf_get(mp, tp, ++block, 0, &bp); - if (error) { + error = xfs_rtbitmap_read_buf(args, ++block); + if (error) return error; - } - b = bufp = bp->b_addr; + word = 0; - } else { - /* - * Go on to the next word in the buffer. - */ - b++; } } /* @@ -412,13 +418,13 @@ xfs_rtfind_forw( /* * Compute difference between actual and desired value. */ - if ((wdiff = (*b ^ want) & mask)) { + incore = xfs_rtbitmap_getword(args, word); + if ((wdiff = (incore ^ want) & mask)) { /* * Different, mark where we are and return. */ - xfs_trans_brelse(tp, bp); i += XFS_RTLOBIT(wdiff); - *rtblock = start + i - 1; + *rtx = start + i - 1; return 0; } else i = len; @@ -426,11 +432,25 @@ xfs_rtfind_forw( /* * No match, return that we scanned the whole area. */ - xfs_trans_brelse(tp, bp); - *rtblock = start + i - 1; + *rtx = start + i - 1; return 0; } +/* Log rtsummary counter at @infoword. */ +static inline void +xfs_trans_log_rtsummary( + struct xfs_rtalloc_args *args, + unsigned int infoword) +{ + struct xfs_buf *bp = args->sumbp; + size_t first, last; + + first = (void *)xfs_rsumblock_infoptr(args, infoword) - bp->b_addr; + last = first + sizeof(xfs_suminfo_t) - 1; + + xfs_trans_log_buf(args->tp, bp, first, last); +} + /* * Read and/or modify the summary information for a given extent size, * bitmap block combination. @@ -442,86 +462,77 @@ xfs_rtfind_forw( */ int xfs_rtmodify_summary_int( - xfs_mount_t *mp, /* file system mount structure */ - xfs_trans_t *tp, /* transaction pointer */ - int log, /* log2 of extent size */ - xfs_rtblock_t bbno, /* bitmap block number */ - int delta, /* change to make to summary info */ - struct xfs_buf **rbpp, /* in/out: summary block buffer */ - xfs_fsblock_t *rsb, /* in/out: summary block number */ - xfs_suminfo_t *sum) /* out: summary info for this block */ + struct xfs_rtalloc_args *args, + int log, /* log2 of extent size */ + xfs_fileoff_t bbno, /* bitmap block number */ + int delta, /* change to make to summary info */ + xfs_suminfo_t *sum) /* out: summary info for this block */ { - struct xfs_buf *bp; /* buffer for the summary block */ - int error; /* error value */ - xfs_fsblock_t sb; /* summary fsblock */ - int so; /* index into the summary file */ - xfs_suminfo_t *sp; /* pointer to returned data */ + struct xfs_mount *mp = args->mp; + int error; + xfs_fileoff_t sb; /* summary fsblock */ + xfs_rtsumoff_t so; /* index into the summary file */ + unsigned int infoword; /* * Compute entry number in the summary file. */ - so = XFS_SUMOFFS(mp, log, bbno); + so = xfs_rtsumoffs(mp, log, bbno); /* * Compute the block number in the summary file. */ - sb = XFS_SUMOFFSTOBLOCK(mp, so); - /* - * If we have an old buffer, and the block number matches, use that. - */ - if (*rbpp && *rsb == sb) - bp = *rbpp; - /* - * Otherwise we have to get the buffer. - */ - else { - /* - * If there was an old one, get rid of it first. - */ - if (*rbpp) - xfs_trans_brelse(tp, *rbpp); - error = xfs_rtbuf_get(mp, tp, sb, 1, &bp); - if (error) { - return error; - } - /* - * Remember this buffer and block for the next call. - */ - *rbpp = bp; - *rsb = sb; - } + sb = xfs_rtsumoffs_to_block(mp, so); + + error = xfs_rtsummary_read_buf(args, sb); + if (error) + return error; + /* * Point to the summary information, modify/log it, and/or copy it out. */ - sp = XFS_SUMPTR(mp, bp, so); + infoword = xfs_rtsumoffs_to_infoword(mp, so); if (delta) { - uint first = (uint)((char *)sp - (char *)bp->b_addr); + xfs_suminfo_t val = xfs_suminfo_add(args, infoword, delta); - *sp += delta; if (mp->m_rsum_cache) { - if (*sp == 0 && log == mp->m_rsum_cache[bbno]) - mp->m_rsum_cache[bbno]++; - if (*sp != 0 && log < mp->m_rsum_cache[bbno]) + if (val == 0 && log + 1 == mp->m_rsum_cache[bbno]) mp->m_rsum_cache[bbno] = log; + if (val != 0 && log >= mp->m_rsum_cache[bbno]) + mp->m_rsum_cache[bbno] = log + 1; } - xfs_trans_log_buf(tp, bp, first, first + sizeof(*sp) - 1); + xfs_trans_log_rtsummary(args, infoword); + if (sum) + *sum = val; + } else if (sum) { + *sum = xfs_suminfo_get(args, infoword); } - if (sum) - *sum = *sp; return 0; } int xfs_rtmodify_summary( - xfs_mount_t *mp, /* file system mount structure */ - xfs_trans_t *tp, /* transaction pointer */ - int log, /* log2 of extent size */ - xfs_rtblock_t bbno, /* bitmap block number */ - int delta, /* change to make to summary info */ - struct xfs_buf **rbpp, /* in/out: summary block buffer */ - xfs_fsblock_t *rsb) /* in/out: summary block number */ + struct xfs_rtalloc_args *args, + int log, /* log2 of extent size */ + xfs_fileoff_t bbno, /* bitmap block number */ + int delta) /* in/out: summary block number */ { - return xfs_rtmodify_summary_int(mp, tp, log, bbno, - delta, rbpp, rsb, NULL); + return xfs_rtmodify_summary_int(args, log, bbno, delta, NULL); +} + +/* Log rtbitmap block from the word @from to the byte before @next. */ +static inline void +xfs_trans_log_rtbitmap( + struct xfs_rtalloc_args *args, + unsigned int from, + unsigned int next) +{ + struct xfs_buf *bp = args->rbmbp; + size_t first, last; + + first = (void *)xfs_rbmblock_wordptr(args, from) - bp->b_addr; + last = ((void *)xfs_rbmblock_wordptr(args, next) - 1) - bp->b_addr; + + xfs_trans_log_buf(args->tp, bp, first, last); } /* @@ -530,41 +541,37 @@ xfs_rtmodify_summary( */ int xfs_rtmodify_range( - xfs_mount_t *mp, /* file system mount point */ - xfs_trans_t *tp, /* transaction pointer */ - xfs_rtblock_t start, /* starting block to modify */ - xfs_extlen_t len, /* length of extent to modify */ - int val) /* 1 for free, 0 for allocated */ + struct xfs_rtalloc_args *args, + xfs_rtxnum_t start, /* starting rtext to modify */ + xfs_rtxlen_t len, /* length of extent to modify */ + int val) /* 1 for free, 0 for allocated */ { - xfs_rtword_t *b; /* current word in buffer */ - int bit; /* bit number in the word */ - xfs_rtblock_t block; /* bitmap block number */ - struct xfs_buf *bp; /* buf for the block */ - xfs_rtword_t *bufp; /* starting word in buffer */ - int error; /* error value */ - xfs_rtword_t *first; /* first used word in the buffer */ - int i; /* current bit number rel. to start */ - int lastbit; /* last useful bit in word */ - xfs_rtword_t mask; /* mask o frelevant bits for value */ - int word; /* word number in the buffer */ + struct xfs_mount *mp = args->mp; + int bit; /* bit number in the word */ + xfs_fileoff_t block; /* bitmap block number */ + int error; + int i; /* current bit number rel. to start */ + int lastbit; /* last useful bit in word */ + xfs_rtword_t mask; /* mask of relevant bits for value */ + xfs_rtword_t incore; + unsigned int firstword; /* first word used in the buffer */ + unsigned int word; /* word number in the buffer */ /* * Compute starting bitmap block number. */ - block = XFS_BITTOBLOCK(mp, start); + block = xfs_rtx_to_rbmblock(mp, start); /* * Read the bitmap block, and point to its data. */ - error = xfs_rtbuf_get(mp, tp, block, 0, &bp); - if (error) { + error = xfs_rtbitmap_read_buf(args, block); + if (error) return error; - } - bufp = bp->b_addr; + /* * Compute the starting word's address, and starting bit. */ - word = XFS_BITTOWORD(mp, start); - first = b = &bufp[word]; + firstword = word = xfs_rtx_to_rbmword(mp, start); bit = (int)(start & (XFS_NBWORD - 1)); /* * 0 (allocated) => all zeroes; 1 (free) => all ones. @@ -583,34 +590,28 @@ xfs_rtmodify_range( /* * Set/clear the active bits. */ + incore = xfs_rtbitmap_getword(args, word); if (val) - *b |= mask; + incore |= mask; else - *b &= ~mask; + incore &= ~mask; + xfs_rtbitmap_setword(args, word, incore); i = lastbit - bit; /* * Go on to the next block if that's where the next word is * and we need the next word. */ - if (++word == XFS_BLOCKWSIZE(mp) && i < len) { + if (++word == mp->m_blockwsize && i < len) { /* * Log the changed part of this block. * Get the next one. */ - xfs_trans_log_buf(tp, bp, - (uint)((char *)first - (char *)bufp), - (uint)((char *)b - (char *)bufp)); - error = xfs_rtbuf_get(mp, tp, ++block, 0, &bp); - if (error) { + xfs_trans_log_rtbitmap(args, firstword, word); + error = xfs_rtbitmap_read_buf(args, ++block); + if (error) return error; - } - first = b = bufp = bp->b_addr; - word = 0; - } else { - /* - * Go on to the next word in the buffer - */ - b++; + + firstword = word = 0; } } else { /* @@ -626,31 +627,23 @@ xfs_rtmodify_range( /* * Set the word value correctly. */ - *b = val; + xfs_rtbitmap_setword(args, word, val); i += XFS_NBWORD; /* * Go on to the next block if that's where the next word is * and we need the next word. */ - if (++word == XFS_BLOCKWSIZE(mp) && i < len) { + if (++word == mp->m_blockwsize && i < len) { /* * Log the changed part of this block. * Get the next one. */ - xfs_trans_log_buf(tp, bp, - (uint)((char *)first - (char *)bufp), - (uint)((char *)b - (char *)bufp)); - error = xfs_rtbuf_get(mp, tp, ++block, 0, &bp); - if (error) { + xfs_trans_log_rtbitmap(args, firstword, word); + error = xfs_rtbitmap_read_buf(args, ++block); + if (error) return error; - } - first = b = bufp = bp->b_addr; - word = 0; - } else { - /* - * Go on to the next word in the buffer - */ - b++; + + firstword = word = 0; } } /* @@ -665,18 +658,19 @@ xfs_rtmodify_range( /* * Set/clear the active bits. */ + incore = xfs_rtbitmap_getword(args, word); if (val) - *b |= mask; + incore |= mask; else - *b &= ~mask; - b++; + incore &= ~mask; + xfs_rtbitmap_setword(args, word, incore); + word++; } /* * Log any remaining changed bytes. */ - if (b > first) - xfs_trans_log_buf(tp, bp, (uint)((char *)first - (char *)bufp), - (uint)((char *)b - (char *)bufp - 1)); + if (word > firstword) + xfs_trans_log_rtbitmap(args, firstword, word); return 0; } @@ -686,23 +680,21 @@ xfs_rtmodify_range( */ int xfs_rtfree_range( - xfs_mount_t *mp, /* file system mount point */ - xfs_trans_t *tp, /* transaction pointer */ - xfs_rtblock_t start, /* starting block to free */ - xfs_extlen_t len, /* length to free */ - struct xfs_buf **rbpp, /* in/out: summary block buffer */ - xfs_fsblock_t *rsb) /* in/out: summary block number */ + struct xfs_rtalloc_args *args, + xfs_rtxnum_t start, /* starting rtext to free */ + xfs_rtxlen_t len) /* in/out: summary block number */ { - xfs_rtblock_t end; /* end of the freed extent */ - int error; /* error value */ - xfs_rtblock_t postblock; /* first block freed > end */ - xfs_rtblock_t preblock; /* first block freed < start */ + struct xfs_mount *mp = args->mp; + xfs_rtxnum_t end; /* end of the freed extent */ + int error; /* error value */ + xfs_rtxnum_t postblock; /* first rtext freed > end */ + xfs_rtxnum_t preblock; /* first rtext freed < start */ end = start + len - 1; /* * Modify the bitmap to mark this extent freed. */ - error = xfs_rtmodify_range(mp, tp, start, len, 1); + error = xfs_rtmodify_range(args, start, len, 1); if (error) { return error; } @@ -711,15 +703,15 @@ xfs_rtfree_range( * We need to find the beginning and end of the extent so we can * properly update the summary. */ - error = xfs_rtfind_back(mp, tp, start, 0, &preblock); + error = xfs_rtfind_back(args, start, 0, &preblock); if (error) { return error; } /* * Find the next allocated block (end of allocated extent). */ - error = xfs_rtfind_forw(mp, tp, end, mp->m_sb.sb_rextents - 1, - &postblock); + error = xfs_rtfind_forw(args, end, mp->m_sb.sb_rextents - 1, + &postblock); if (error) return error; /* @@ -727,9 +719,9 @@ xfs_rtfree_range( * old extent, add summary data for them to be allocated. */ if (preblock < start) { - error = xfs_rtmodify_summary(mp, tp, - XFS_RTBLOCKLOG(start - preblock), - XFS_BITTOBLOCK(mp, preblock), -1, rbpp, rsb); + error = xfs_rtmodify_summary(args, + XFS_RTBLOCKLOG(start - preblock), + xfs_rtx_to_rbmblock(mp, preblock), -1); if (error) { return error; } @@ -739,9 +731,9 @@ xfs_rtfree_range( * old extent, add summary data for them to be allocated. */ if (postblock > end) { - error = xfs_rtmodify_summary(mp, tp, - XFS_RTBLOCKLOG(postblock - end), - XFS_BITTOBLOCK(mp, end + 1), -1, rbpp, rsb); + error = xfs_rtmodify_summary(args, + XFS_RTBLOCKLOG(postblock - end), + xfs_rtx_to_rbmblock(mp, end + 1), -1); if (error) { return error; } @@ -750,10 +742,9 @@ xfs_rtfree_range( * Increment the summary information corresponding to the entire * (new) free extent. */ - error = xfs_rtmodify_summary(mp, tp, - XFS_RTBLOCKLOG(postblock + 1 - preblock), - XFS_BITTOBLOCK(mp, preblock), 1, rbpp, rsb); - return error; + return xfs_rtmodify_summary(args, + XFS_RTBLOCKLOG(postblock + 1 - preblock), + xfs_rtx_to_rbmblock(mp, preblock), 1); } /* @@ -762,43 +753,39 @@ xfs_rtfree_range( */ int xfs_rtcheck_range( - xfs_mount_t *mp, /* file system mount point */ - xfs_trans_t *tp, /* transaction pointer */ - xfs_rtblock_t start, /* starting block number of extent */ - xfs_extlen_t len, /* length of extent */ - int val, /* 1 for free, 0 for allocated */ - xfs_rtblock_t *new, /* out: first block not matching */ - int *stat) /* out: 1 for matches, 0 for not */ + struct xfs_rtalloc_args *args, + xfs_rtxnum_t start, /* starting rtext number of extent */ + xfs_rtxlen_t len, /* length of extent */ + int val, /* 1 for free, 0 for allocated */ + xfs_rtxnum_t *new, /* out: first rtext not matching */ + int *stat) /* out: 1 for matches, 0 for not */ { - xfs_rtword_t *b; /* current word in buffer */ - int bit; /* bit number in the word */ - xfs_rtblock_t block; /* bitmap block number */ - struct xfs_buf *bp; /* buf for the block */ - xfs_rtword_t *bufp; /* starting word in buffer */ - int error; /* error value */ - xfs_rtblock_t i; /* current bit number rel. to start */ - xfs_rtblock_t lastbit; /* last useful bit in word */ - xfs_rtword_t mask; /* mask of relevant bits for value */ - xfs_rtword_t wdiff; /* difference from wanted value */ - int word; /* word number in the buffer */ + struct xfs_mount *mp = args->mp; + int bit; /* bit number in the word */ + xfs_fileoff_t block; /* bitmap block number */ + int error; + xfs_rtxnum_t i; /* current bit number rel. to start */ + xfs_rtxnum_t lastbit; /* last useful bit in word */ + xfs_rtword_t mask; /* mask of relevant bits for value */ + xfs_rtword_t wdiff; /* difference from wanted value */ + xfs_rtword_t incore; + unsigned int word; /* word number in the buffer */ /* * Compute starting bitmap block number */ - block = XFS_BITTOBLOCK(mp, start); + block = xfs_rtx_to_rbmblock(mp, start); /* * Read the bitmap block. */ - error = xfs_rtbuf_get(mp, tp, block, 0, &bp); - if (error) { + error = xfs_rtbitmap_read_buf(args, block); + if (error) return error; - } - bufp = bp->b_addr; + /* * Compute the starting word's address, and starting bit. */ - word = XFS_BITTOWORD(mp, start); - b = &bufp[word]; + word = xfs_rtx_to_rbmword(mp, start); bit = (int)(start & (XFS_NBWORD - 1)); /* * 0 (allocated) => all zero's; 1 (free) => all one's. @@ -820,11 +807,11 @@ xfs_rtcheck_range( /* * Compute difference between actual and desired value. */ - if ((wdiff = (*b ^ val) & mask)) { + incore = xfs_rtbitmap_getword(args, word); + if ((wdiff = (incore ^ val) & mask)) { /* * Different, compute first wrong bit and return. */ - xfs_trans_brelse(tp, bp); i = XFS_RTLOBIT(wdiff) - bit; *new = start + i; *stat = 0; @@ -835,22 +822,15 @@ xfs_rtcheck_range( * Go on to next block if that's where the next word is * and we need the next word. */ - if (++word == XFS_BLOCKWSIZE(mp) && i < len) { + if (++word == mp->m_blockwsize && i < len) { /* * If done with this block, get the next one. */ - xfs_trans_brelse(tp, bp); - error = xfs_rtbuf_get(mp, tp, ++block, 0, &bp); - if (error) { + error = xfs_rtbitmap_read_buf(args, ++block); + if (error) return error; - } - b = bufp = bp->b_addr; + word = 0; - } else { - /* - * Go on to the next word in the buffer. - */ - b++; } } else { /* @@ -866,11 +846,11 @@ xfs_rtcheck_range( /* * Compute difference between actual and desired value. */ - if ((wdiff = *b ^ val)) { + incore = xfs_rtbitmap_getword(args, word); + if ((wdiff = incore ^ val)) { /* * Different, compute first wrong bit and return. */ - xfs_trans_brelse(tp, bp); i += XFS_RTLOBIT(wdiff); *new = start + i; *stat = 0; @@ -881,22 +861,15 @@ xfs_rtcheck_range( * Go on to next block if that's where the next word is * and we need the next word. */ - if (++word == XFS_BLOCKWSIZE(mp) && i < len) { + if (++word == mp->m_blockwsize && i < len) { /* * If done with this block, get the next one. */ - xfs_trans_brelse(tp, bp); - error = xfs_rtbuf_get(mp, tp, ++block, 0, &bp); - if (error) { + error = xfs_rtbitmap_read_buf(args, ++block); + if (error) return error; - } - b = bufp = bp->b_addr; + word = 0; - } else { - /* - * Go on to the next word in the buffer. - */ - b++; } } /* @@ -911,11 +884,11 @@ xfs_rtcheck_range( /* * Compute difference between actual and desired value. */ - if ((wdiff = (*b ^ val) & mask)) { + incore = xfs_rtbitmap_getword(args, word); + if ((wdiff = (incore ^ val) & mask)) { /* * Different, compute first wrong bit and return. */ - xfs_trans_brelse(tp, bp); i += XFS_RTLOBIT(wdiff); *new = start + i; *stat = 0; @@ -926,7 +899,6 @@ xfs_rtcheck_range( /* * Successful, return. */ - xfs_trans_brelse(tp, bp); *new = start + i; *stat = 1; return 0; @@ -936,57 +908,57 @@ xfs_rtcheck_range( /* * Check that the given extent (block range) is allocated already. */ -STATIC int /* error */ +STATIC int xfs_rtcheck_alloc_range( - xfs_mount_t *mp, /* file system mount point */ - xfs_trans_t *tp, /* transaction pointer */ - xfs_rtblock_t bno, /* starting block number of extent */ - xfs_extlen_t len) /* length of extent */ + struct xfs_rtalloc_args *args, + xfs_rtxnum_t start, /* starting rtext number of extent */ + xfs_rtxlen_t len) /* length of extent */ { - xfs_rtblock_t new; /* dummy for xfs_rtcheck_range */ - int stat; - int error; + xfs_rtxnum_t new; /* dummy for xfs_rtcheck_range */ + int stat; + int error; - error = xfs_rtcheck_range(mp, tp, bno, len, 0, &new, &stat); + error = xfs_rtcheck_range(args, start, len, 0, &new, &stat); if (error) return error; ASSERT(stat); return 0; } #else -#define xfs_rtcheck_alloc_range(m,t,b,l) (0) +#define xfs_rtcheck_alloc_range(a,b,l) (0) #endif /* * Free an extent in the realtime subvolume. Length is expressed in * realtime extents, as is the block number. */ -int /* error */ +int xfs_rtfree_extent( - xfs_trans_t *tp, /* transaction pointer */ - xfs_rtblock_t bno, /* starting block number to free */ - xfs_extlen_t len) /* length of extent freed */ + struct xfs_trans *tp, /* transaction pointer */ + xfs_rtxnum_t start, /* starting rtext number to free */ + xfs_rtxlen_t len) /* length of extent freed */ { - int error; /* error value */ - xfs_mount_t *mp; /* file system mount structure */ - xfs_fsblock_t sb; /* summary file block number */ - struct xfs_buf *sumbp = NULL; /* summary file block buffer */ - - mp = tp->t_mountp; + struct xfs_mount *mp = tp->t_mountp; + struct xfs_rtalloc_args args = { + .mp = mp, + .tp = tp, + }; + int error; + struct timespec64 atime; ASSERT(mp->m_rbmip->i_itemp != NULL); ASSERT(xfs_isilocked(mp->m_rbmip, XFS_ILOCK_EXCL)); - error = xfs_rtcheck_alloc_range(mp, tp, bno, len); + error = xfs_rtcheck_alloc_range(&args, start, len); if (error) return error; /* * Free the range of realtime blocks. */ - error = xfs_rtfree_range(mp, tp, bno, len, &sumbp, &sb); - if (error) { - return error; - } + error = xfs_rtfree_range(&args, start, len); + if (error) + goto out; + /* * Mark more blocks free in the superblock. */ @@ -999,10 +971,49 @@ xfs_rtfree_extent( mp->m_sb.sb_rextents) { if (!(mp->m_rbmip->i_diflags & XFS_DIFLAG_NEWRTBM)) mp->m_rbmip->i_diflags |= XFS_DIFLAG_NEWRTBM; - *(uint64_t *)&VFS_I(mp->m_rbmip)->i_atime = 0; + + atime = inode_get_atime(VFS_I(mp->m_rbmip)); + atime.tv_sec = 0; + inode_set_atime_to_ts(VFS_I(mp->m_rbmip), atime); xfs_trans_log_inode(tp, mp->m_rbmip, XFS_ILOG_CORE); } - return 0; + error = 0; +out: + xfs_rtbuf_cache_relse(&args); + return error; +} + +/* + * Free some blocks in the realtime subvolume. rtbno and rtlen are in units of + * rt blocks, not rt extents; must be aligned to the rt extent size; and rtlen + * cannot exceed XFS_MAX_BMBT_EXTLEN. + */ +int +xfs_rtfree_blocks( + struct xfs_trans *tp, + xfs_fsblock_t rtbno, + xfs_filblks_t rtlen) +{ + struct xfs_mount *mp = tp->t_mountp; + xfs_rtxnum_t start; + xfs_filblks_t len; + xfs_extlen_t mod; + + ASSERT(rtlen <= XFS_MAX_BMBT_EXTLEN); + + len = xfs_rtb_to_rtxrem(mp, rtlen, &mod); + if (mod) { + ASSERT(mod == 0); + return -EIO; + } + + start = xfs_rtb_to_rtxrem(mp, rtbno, &mod); + if (mod) { + ASSERT(mod == 0); + return -EIO; + } + + return xfs_rtfree_extent(tp, start, len); } /* Find all the free records within a given range. */ @@ -1015,10 +1026,14 @@ xfs_rtalloc_query_range( xfs_rtalloc_query_range_fn fn, void *priv) { + struct xfs_rtalloc_args args = { + .mp = mp, + .tp = tp, + }; struct xfs_rtalloc_rec rec; - xfs_rtblock_t rtstart; - xfs_rtblock_t rtend; - xfs_rtblock_t high_key; + xfs_rtxnum_t rtstart; + xfs_rtxnum_t rtend; + xfs_rtxnum_t high_key; int is_free; int error = 0; @@ -1034,13 +1049,13 @@ xfs_rtalloc_query_range( rtstart = low_rec->ar_startext; while (rtstart <= high_key) { /* Is the first block free? */ - error = xfs_rtcheck_range(mp, tp, rtstart, 1, 1, &rtend, + error = xfs_rtcheck_range(&args, rtstart, 1, 1, &rtend, &is_free); if (error) break; /* How long does the extent go for? */ - error = xfs_rtfind_forw(mp, tp, rtstart, high_key, &rtend); + error = xfs_rtfind_forw(&args, rtstart, high_key, &rtend); if (error) break; @@ -1056,6 +1071,7 @@ xfs_rtalloc_query_range( rtstart = rtend + 1; } + xfs_rtbuf_cache_relse(&args); return error; } @@ -1081,18 +1097,79 @@ int xfs_rtalloc_extent_is_free( struct xfs_mount *mp, struct xfs_trans *tp, - xfs_rtblock_t start, - xfs_extlen_t len, + xfs_rtxnum_t start, + xfs_rtxlen_t len, bool *is_free) { - xfs_rtblock_t end; + struct xfs_rtalloc_args args = { + .mp = mp, + .tp = tp, + }; + xfs_rtxnum_t end; int matches; int error; - error = xfs_rtcheck_range(mp, tp, start, len, 1, &end, &matches); + error = xfs_rtcheck_range(&args, start, len, 1, &end, &matches); + xfs_rtbuf_cache_relse(&args); if (error) return error; *is_free = matches; return 0; } + +/* + * Compute the number of rtbitmap blocks needed to track the given number of rt + * extents. + */ +xfs_filblks_t +xfs_rtbitmap_blockcount( + struct xfs_mount *mp, + xfs_rtbxlen_t rtextents) +{ + return howmany_64(rtextents, NBBY * mp->m_sb.sb_blocksize); +} + +/* + * Compute the number of rtbitmap words needed to populate every block of a + * bitmap that is large enough to track the given number of rt extents. + */ +unsigned long long +xfs_rtbitmap_wordcount( + struct xfs_mount *mp, + xfs_rtbxlen_t rtextents) +{ + xfs_filblks_t blocks; + + blocks = xfs_rtbitmap_blockcount(mp, rtextents); + return XFS_FSB_TO_B(mp, blocks) >> XFS_WORDLOG; +} + +/* Compute the number of rtsummary blocks needed to track the given rt space. */ +xfs_filblks_t +xfs_rtsummary_blockcount( + struct xfs_mount *mp, + unsigned int rsumlevels, + xfs_extlen_t rbmblocks) +{ + unsigned long long rsumwords; + + rsumwords = (unsigned long long)rsumlevels * rbmblocks; + return XFS_B_TO_FSB(mp, rsumwords << XFS_WORDLOG); +} + +/* + * Compute the number of rtsummary info words needed to populate every block of + * a summary file that is large enough to track the given rt space. + */ +unsigned long long +xfs_rtsummary_wordcount( + struct xfs_mount *mp, + unsigned int rsumlevels, + xfs_extlen_t rbmblocks) +{ + xfs_filblks_t blocks; + + blocks = xfs_rtsummary_blockcount(mp, rsumlevels, rbmblocks); + return XFS_FSB_TO_B(mp, blocks) >> XFS_WORDLOG; +} diff --git a/fs/xfs/libxfs/xfs_rtbitmap.h b/fs/xfs/libxfs/xfs_rtbitmap.h new file mode 100644 index 000000000000..c0637057d69c --- /dev/null +++ b/fs/xfs/libxfs/xfs_rtbitmap.h @@ -0,0 +1,383 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc. + * All Rights Reserved. + */ +#ifndef __XFS_RTBITMAP_H__ +#define __XFS_RTBITMAP_H__ + +struct xfs_rtalloc_args { + struct xfs_mount *mp; + struct xfs_trans *tp; + + struct xfs_buf *rbmbp; /* bitmap block buffer */ + struct xfs_buf *sumbp; /* summary block buffer */ + + xfs_fileoff_t rbmoff; /* bitmap block number */ + xfs_fileoff_t sumoff; /* summary block number */ +}; + +static inline xfs_rtblock_t +xfs_rtx_to_rtb( + struct xfs_mount *mp, + xfs_rtxnum_t rtx) +{ + if (mp->m_rtxblklog >= 0) + return rtx << mp->m_rtxblklog; + + return rtx * mp->m_sb.sb_rextsize; +} + +static inline xfs_extlen_t +xfs_rtxlen_to_extlen( + struct xfs_mount *mp, + xfs_rtxlen_t rtxlen) +{ + if (mp->m_rtxblklog >= 0) + return rtxlen << mp->m_rtxblklog; + + return rtxlen * mp->m_sb.sb_rextsize; +} + +/* Compute the misalignment between an extent length and a realtime extent .*/ +static inline unsigned int +xfs_extlen_to_rtxmod( + struct xfs_mount *mp, + xfs_extlen_t len) +{ + if (mp->m_rtxblklog >= 0) + return len & mp->m_rtxblkmask; + + return len % mp->m_sb.sb_rextsize; +} + +static inline xfs_rtxlen_t +xfs_extlen_to_rtxlen( + struct xfs_mount *mp, + xfs_extlen_t len) +{ + if (mp->m_rtxblklog >= 0) + return len >> mp->m_rtxblklog; + + return len / mp->m_sb.sb_rextsize; +} + +/* Convert an rt block number into an rt extent number. */ +static inline xfs_rtxnum_t +xfs_rtb_to_rtx( + struct xfs_mount *mp, + xfs_rtblock_t rtbno) +{ + if (likely(mp->m_rtxblklog >= 0)) + return rtbno >> mp->m_rtxblklog; + + return div_u64(rtbno, mp->m_sb.sb_rextsize); +} + +/* Return the offset of an rt block number within an rt extent. */ +static inline xfs_extlen_t +xfs_rtb_to_rtxoff( + struct xfs_mount *mp, + xfs_rtblock_t rtbno) +{ + if (likely(mp->m_rtxblklog >= 0)) + return rtbno & mp->m_rtxblkmask; + + return do_div(rtbno, mp->m_sb.sb_rextsize); +} + +/* + * Crack an rt block number into an rt extent number and an offset within that + * rt extent. Returns the rt extent number directly and the offset in @off. + */ +static inline xfs_rtxnum_t +xfs_rtb_to_rtxrem( + struct xfs_mount *mp, + xfs_rtblock_t rtbno, + xfs_extlen_t *off) +{ + if (likely(mp->m_rtxblklog >= 0)) { + *off = rtbno & mp->m_rtxblkmask; + return rtbno >> mp->m_rtxblklog; + } + + return div_u64_rem(rtbno, mp->m_sb.sb_rextsize, off); +} + +/* + * Convert an rt block number into an rt extent number, rounding up to the next + * rt extent if the rt block is not aligned to an rt extent boundary. + */ +static inline xfs_rtxnum_t +xfs_rtb_to_rtxup( + struct xfs_mount *mp, + xfs_rtblock_t rtbno) +{ + if (likely(mp->m_rtxblklog >= 0)) { + if (rtbno & mp->m_rtxblkmask) + return (rtbno >> mp->m_rtxblklog) + 1; + return rtbno >> mp->m_rtxblklog; + } + + if (do_div(rtbno, mp->m_sb.sb_rextsize)) + rtbno++; + return rtbno; +} + +/* Round this rtblock up to the nearest rt extent size. */ +static inline xfs_rtblock_t +xfs_rtb_roundup_rtx( + struct xfs_mount *mp, + xfs_rtblock_t rtbno) +{ + return roundup_64(rtbno, mp->m_sb.sb_rextsize); +} + +/* Round this rtblock down to the nearest rt extent size. */ +static inline xfs_rtblock_t +xfs_rtb_rounddown_rtx( + struct xfs_mount *mp, + xfs_rtblock_t rtbno) +{ + return rounddown_64(rtbno, mp->m_sb.sb_rextsize); +} + +/* Convert an rt extent number to a file block offset in the rt bitmap file. */ +static inline xfs_fileoff_t +xfs_rtx_to_rbmblock( + struct xfs_mount *mp, + xfs_rtxnum_t rtx) +{ + return rtx >> mp->m_blkbit_log; +} + +/* Convert an rt extent number to a word offset within an rt bitmap block. */ +static inline unsigned int +xfs_rtx_to_rbmword( + struct xfs_mount *mp, + xfs_rtxnum_t rtx) +{ + return (rtx >> XFS_NBWORDLOG) & (mp->m_blockwsize - 1); +} + +/* Convert a file block offset in the rt bitmap file to an rt extent number. */ +static inline xfs_rtxnum_t +xfs_rbmblock_to_rtx( + struct xfs_mount *mp, + xfs_fileoff_t rbmoff) +{ + return rbmoff << mp->m_blkbit_log; +} + +/* Return a pointer to a bitmap word within a rt bitmap block. */ +static inline union xfs_rtword_raw * +xfs_rbmblock_wordptr( + struct xfs_rtalloc_args *args, + unsigned int index) +{ + union xfs_rtword_raw *words = args->rbmbp->b_addr; + + return words + index; +} + +/* Convert an ondisk bitmap word to its incore representation. */ +static inline xfs_rtword_t +xfs_rtbitmap_getword( + struct xfs_rtalloc_args *args, + unsigned int index) +{ + union xfs_rtword_raw *word = xfs_rbmblock_wordptr(args, index); + + return word->old; +} + +/* Set an ondisk bitmap word from an incore representation. */ +static inline void +xfs_rtbitmap_setword( + struct xfs_rtalloc_args *args, + unsigned int index, + xfs_rtword_t value) +{ + union xfs_rtword_raw *word = xfs_rbmblock_wordptr(args, index); + + word->old = value; +} + +/* + * Convert a rt extent length and rt bitmap block number to a xfs_suminfo_t + * offset within the rt summary file. + */ +static inline xfs_rtsumoff_t +xfs_rtsumoffs( + struct xfs_mount *mp, + int log2_len, + xfs_fileoff_t rbmoff) +{ + return log2_len * mp->m_sb.sb_rbmblocks + rbmoff; +} + +/* + * Convert an xfs_suminfo_t offset to a file block offset within the rt summary + * file. + */ +static inline xfs_fileoff_t +xfs_rtsumoffs_to_block( + struct xfs_mount *mp, + xfs_rtsumoff_t rsumoff) +{ + return XFS_B_TO_FSBT(mp, rsumoff * sizeof(xfs_suminfo_t)); +} + +/* + * Convert an xfs_suminfo_t offset to an info word offset within an rt summary + * block. + */ +static inline unsigned int +xfs_rtsumoffs_to_infoword( + struct xfs_mount *mp, + xfs_rtsumoff_t rsumoff) +{ + unsigned int mask = mp->m_blockmask >> XFS_SUMINFOLOG; + + return rsumoff & mask; +} + +/* Return a pointer to a summary info word within a rt summary block. */ +static inline union xfs_suminfo_raw * +xfs_rsumblock_infoptr( + struct xfs_rtalloc_args *args, + unsigned int index) +{ + union xfs_suminfo_raw *info = args->sumbp->b_addr; + + return info + index; +} + +/* Get the current value of a summary counter. */ +static inline xfs_suminfo_t +xfs_suminfo_get( + struct xfs_rtalloc_args *args, + unsigned int index) +{ + union xfs_suminfo_raw *info = xfs_rsumblock_infoptr(args, index); + + return info->old; +} + +/* Add to the current value of a summary counter and return the new value. */ +static inline xfs_suminfo_t +xfs_suminfo_add( + struct xfs_rtalloc_args *args, + unsigned int index, + int delta) +{ + union xfs_suminfo_raw *info = xfs_rsumblock_infoptr(args, index); + + info->old += delta; + return info->old; +} + +/* + * Functions for walking free space rtextents in the realtime bitmap. + */ +struct xfs_rtalloc_rec { + xfs_rtxnum_t ar_startext; + xfs_rtbxlen_t ar_extcount; +}; + +typedef int (*xfs_rtalloc_query_range_fn)( + struct xfs_mount *mp, + struct xfs_trans *tp, + const struct xfs_rtalloc_rec *rec, + void *priv); + +#ifdef CONFIG_XFS_RT +void xfs_rtbuf_cache_relse(struct xfs_rtalloc_args *args); + +int xfs_rtbuf_get(struct xfs_rtalloc_args *args, xfs_fileoff_t block, + int issum); + +static inline int +xfs_rtbitmap_read_buf( + struct xfs_rtalloc_args *args, + xfs_fileoff_t block) +{ + return xfs_rtbuf_get(args, block, 0); +} + +static inline int +xfs_rtsummary_read_buf( + struct xfs_rtalloc_args *args, + xfs_fileoff_t block) +{ + return xfs_rtbuf_get(args, block, 1); +} + +int xfs_rtcheck_range(struct xfs_rtalloc_args *args, xfs_rtxnum_t start, + xfs_rtxlen_t len, int val, xfs_rtxnum_t *new, int *stat); +int xfs_rtfind_back(struct xfs_rtalloc_args *args, xfs_rtxnum_t start, + xfs_rtxnum_t limit, xfs_rtxnum_t *rtblock); +int xfs_rtfind_forw(struct xfs_rtalloc_args *args, xfs_rtxnum_t start, + xfs_rtxnum_t limit, xfs_rtxnum_t *rtblock); +int xfs_rtmodify_range(struct xfs_rtalloc_args *args, xfs_rtxnum_t start, + xfs_rtxlen_t len, int val); +int xfs_rtmodify_summary_int(struct xfs_rtalloc_args *args, int log, + xfs_fileoff_t bbno, int delta, xfs_suminfo_t *sum); +int xfs_rtmodify_summary(struct xfs_rtalloc_args *args, int log, + xfs_fileoff_t bbno, int delta); +int xfs_rtfree_range(struct xfs_rtalloc_args *args, xfs_rtxnum_t start, + xfs_rtxlen_t len); +int xfs_rtalloc_query_range(struct xfs_mount *mp, struct xfs_trans *tp, + const struct xfs_rtalloc_rec *low_rec, + const struct xfs_rtalloc_rec *high_rec, + xfs_rtalloc_query_range_fn fn, void *priv); +int xfs_rtalloc_query_all(struct xfs_mount *mp, struct xfs_trans *tp, + xfs_rtalloc_query_range_fn fn, + void *priv); +int xfs_rtalloc_extent_is_free(struct xfs_mount *mp, struct xfs_trans *tp, + xfs_rtxnum_t start, xfs_rtxlen_t len, + bool *is_free); +/* + * Free an extent in the realtime subvolume. Length is expressed in + * realtime extents, as is the block number. + */ +int /* error */ +xfs_rtfree_extent( + struct xfs_trans *tp, /* transaction pointer */ + xfs_rtxnum_t start, /* starting rtext number to free */ + xfs_rtxlen_t len); /* length of extent freed */ + +/* Same as above, but in units of rt blocks. */ +int xfs_rtfree_blocks(struct xfs_trans *tp, xfs_fsblock_t rtbno, + xfs_filblks_t rtlen); + +xfs_filblks_t xfs_rtbitmap_blockcount(struct xfs_mount *mp, xfs_rtbxlen_t + rtextents); +unsigned long long xfs_rtbitmap_wordcount(struct xfs_mount *mp, + xfs_rtbxlen_t rtextents); + +xfs_filblks_t xfs_rtsummary_blockcount(struct xfs_mount *mp, + unsigned int rsumlevels, xfs_extlen_t rbmblocks); +unsigned long long xfs_rtsummary_wordcount(struct xfs_mount *mp, + unsigned int rsumlevels, xfs_extlen_t rbmblocks); +#else /* CONFIG_XFS_RT */ +# define xfs_rtfree_extent(t,b,l) (-ENOSYS) +# define xfs_rtfree_blocks(t,rb,rl) (-ENOSYS) +# define xfs_rtalloc_query_range(m,t,l,h,f,p) (-ENOSYS) +# define xfs_rtalloc_query_all(m,t,f,p) (-ENOSYS) +# define xfs_rtbitmap_read_buf(a,b) (-ENOSYS) +# define xfs_rtsummary_read_buf(a,b) (-ENOSYS) +# define xfs_rtbuf_cache_relse(a) (0) +# define xfs_rtalloc_extent_is_free(m,t,s,l,i) (-ENOSYS) +static inline xfs_filblks_t +xfs_rtbitmap_blockcount(struct xfs_mount *mp, xfs_rtbxlen_t rtextents) +{ + /* shut up gcc */ + return 0; +} +# define xfs_rtbitmap_wordcount(mp, r) (0) +# define xfs_rtsummary_blockcount(mp, l, b) (0) +# define xfs_rtsummary_wordcount(mp, l, b) (0) +#endif /* CONFIG_XFS_RT */ + +#endif /* __XFS_RTBITMAP_H__ */ diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c index 5e174685a77c..1f74d0cd1618 100644 --- a/fs/xfs/libxfs/xfs_sb.c +++ b/fs/xfs/libxfs/xfs_sb.c @@ -266,7 +266,8 @@ xfs_validate_sb_write( return -EFSCORRUPTED; } - if (xfs_sb_has_ro_compat_feature(sbp, XFS_SB_FEAT_RO_COMPAT_UNKNOWN)) { + if (!xfs_is_readonly(mp) && + xfs_sb_has_ro_compat_feature(sbp, XFS_SB_FEAT_RO_COMPAT_UNKNOWN)) { xfs_alert(mp, "Corruption detected in superblock read-only compatible features (0x%x)!", (sbp->sb_features_ro_compat & @@ -974,6 +975,8 @@ xfs_sb_mount_common( mp->m_blockmask = sbp->sb_blocksize - 1; mp->m_blockwsize = sbp->sb_blocksize >> XFS_WORDLOG; mp->m_blockwmask = mp->m_blockwsize - 1; + mp->m_rtxblklog = log2_if_power2(sbp->sb_rextsize); + mp->m_rtxblkmask = mask64_if_power2(sbp->sb_rextsize); mp->m_alloc_mxr[0] = xfs_allocbt_maxrecs(mp, sbp->sb_blocksize, 1); mp->m_alloc_mxr[1] = xfs_allocbt_maxrecs(mp, sbp->sb_blocksize, 0); diff --git a/fs/xfs/libxfs/xfs_sb.h b/fs/xfs/libxfs/xfs_sb.h index a5e14740ec9a..19134b23c10b 100644 --- a/fs/xfs/libxfs/xfs_sb.h +++ b/fs/xfs/libxfs/xfs_sb.h @@ -25,7 +25,7 @@ extern uint64_t xfs_sb_version_to_features(struct xfs_sb *sbp); extern int xfs_update_secondary_sbs(struct xfs_mount *mp); -#define XFS_FS_GEOM_MAX_STRUCT_VER (4) +#define XFS_FS_GEOM_MAX_STRUCT_VER (5) extern void xfs_fs_geometry(struct xfs_mount *mp, struct xfs_fsop_geom *geo, int struct_version); extern int xfs_sb_read_secondary(struct xfs_mount *mp, diff --git a/fs/xfs/libxfs/xfs_trans_inode.c b/fs/xfs/libxfs/xfs_trans_inode.c index cb4796b6e693..70e97ea6eee7 100644 --- a/fs/xfs/libxfs/xfs_trans_inode.c +++ b/fs/xfs/libxfs/xfs_trans_inode.c @@ -65,9 +65,9 @@ xfs_trans_ichgtime( tv = current_time(inode); if (flags & XFS_ICHGTIME_MOD) - inode->i_mtime = tv; + inode_set_mtime_to_ts(inode, tv); if (flags & XFS_ICHGTIME_CHG) - inode->i_ctime = tv; + inode_set_ctime_to_ts(inode, tv); if (flags & XFS_ICHGTIME_CREATE) ip->i_crtime = tv; } diff --git a/fs/xfs/libxfs/xfs_trans_resv.c b/fs/xfs/libxfs/xfs_trans_resv.c index 5b2f27cbdb80..6cd45e8c118d 100644 --- a/fs/xfs/libxfs/xfs_trans_resv.c +++ b/fs/xfs/libxfs/xfs_trans_resv.c @@ -19,6 +19,7 @@ #include "xfs_trans.h" #include "xfs_qm.h" #include "xfs_trans_space.h" +#include "xfs_rtbitmap.h" #define _ALLOC true #define _FREE false @@ -217,11 +218,12 @@ xfs_rtalloc_block_count( struct xfs_mount *mp, unsigned int num_ops) { - unsigned int blksz = XFS_FSB_TO_B(mp, 1); - unsigned int rtbmp_bytes; + unsigned int rtbmp_blocks; + xfs_rtxlen_t rtxlen; - rtbmp_bytes = (XFS_MAX_BMBT_EXTLEN / mp->m_sb.sb_rextsize) / NBBY; - return (howmany(rtbmp_bytes, blksz) + 1) * num_ops; + rtxlen = xfs_extlen_to_rtxlen(mp, XFS_MAX_BMBT_EXTLEN); + rtbmp_blocks = xfs_rtbitmap_blockcount(mp, rtxlen); + return (rtbmp_blocks + 1) * num_ops; } /* diff --git a/fs/xfs/libxfs/xfs_types.c b/fs/xfs/libxfs/xfs_types.c index 5c2765934732..c299b16c9365 100644 --- a/fs/xfs/libxfs/xfs_types.c +++ b/fs/xfs/libxfs/xfs_types.c @@ -148,10 +148,10 @@ xfs_verify_rtbno( /* Verify that a realtime device extent is fully contained inside the volume. */ bool -xfs_verify_rtext( +xfs_verify_rtbext( struct xfs_mount *mp, xfs_rtblock_t rtbno, - xfs_rtblock_t len) + xfs_filblks_t len) { if (rtbno + len <= rtbno) return false; diff --git a/fs/xfs/libxfs/xfs_types.h b/fs/xfs/libxfs/xfs_types.h index 851220021484..533200c4ccc2 100644 --- a/fs/xfs/libxfs/xfs_types.h +++ b/fs/xfs/libxfs/xfs_types.h @@ -11,6 +11,7 @@ typedef uint32_t prid_t; /* project ID */ typedef uint32_t xfs_agblock_t; /* blockno in alloc. group */ typedef uint32_t xfs_agino_t; /* inode # within allocation grp */ typedef uint32_t xfs_extlen_t; /* extent length in blocks */ +typedef uint32_t xfs_rtxlen_t; /* file extent length in rtextents */ typedef uint32_t xfs_agnumber_t; /* allocation group number */ typedef uint64_t xfs_extnum_t; /* # of extents in a file */ typedef uint32_t xfs_aextnum_t; /* # extents in an attribute fork */ @@ -18,6 +19,7 @@ typedef int64_t xfs_fsize_t; /* bytes in a file */ typedef uint64_t xfs_ufsize_t; /* unsigned bytes in a file */ typedef int32_t xfs_suminfo_t; /* type of bitmap summary info */ +typedef uint32_t xfs_rtsumoff_t; /* offset of an rtsummary info word */ typedef uint32_t xfs_rtword_t; /* word type for bitmap manipulations */ typedef int64_t xfs_lsn_t; /* log sequence number */ @@ -31,6 +33,8 @@ typedef uint64_t xfs_rfsblock_t; /* blockno in filesystem (raw) */ typedef uint64_t xfs_rtblock_t; /* extent (block) in realtime area */ typedef uint64_t xfs_fileoff_t; /* block number in a file */ typedef uint64_t xfs_filblks_t; /* number of blocks in a file */ +typedef uint64_t xfs_rtxnum_t; /* rtextent number */ +typedef uint64_t xfs_rtbxlen_t; /* rtbitmap extent length in rtextents */ typedef int64_t xfs_srtblock_t; /* signed version of xfs_rtblock_t */ @@ -47,6 +51,7 @@ typedef void * xfs_failaddr_t; #define NULLRFSBLOCK ((xfs_rfsblock_t)-1) #define NULLRTBLOCK ((xfs_rtblock_t)-1) #define NULLFILEOFF ((xfs_fileoff_t)-1) +#define NULLRTEXTNO ((xfs_rtxnum_t)-1) #define NULLAGBLOCK ((xfs_agblock_t)-1) #define NULLAGNUMBER ((xfs_agnumber_t)-1) @@ -145,6 +150,7 @@ typedef uint32_t xfs_dqid_t; */ #define XFS_NBBYLOG 3 /* log2(NBBY) */ #define XFS_WORDLOG 2 /* log2(sizeof(xfs_rtword_t)) */ +#define XFS_SUMINFOLOG 2 /* log2(sizeof(xfs_suminfo_t)) */ #define XFS_NBWORDLOG (XFS_NBBYLOG + XFS_WORDLOG) #define XFS_NBWORD (1 << XFS_NBWORDLOG) #define XFS_WORDMASK ((1 << XFS_WORDLOG) - 1) @@ -229,8 +235,8 @@ bool xfs_verify_ino(struct xfs_mount *mp, xfs_ino_t ino); bool xfs_internal_inum(struct xfs_mount *mp, xfs_ino_t ino); bool xfs_verify_dir_ino(struct xfs_mount *mp, xfs_ino_t ino); bool xfs_verify_rtbno(struct xfs_mount *mp, xfs_rtblock_t rtbno); -bool xfs_verify_rtext(struct xfs_mount *mp, xfs_rtblock_t rtbno, - xfs_rtblock_t len); +bool xfs_verify_rtbext(struct xfs_mount *mp, xfs_rtblock_t rtbno, + xfs_filblks_t len); bool xfs_verify_icount(struct xfs_mount *mp, unsigned long long icount); bool xfs_verify_dablk(struct xfs_mount *mp, xfs_fileoff_t off); void xfs_icount_range(struct xfs_mount *mp, unsigned long long *min, diff --git a/fs/xfs/scrub/agheader_repair.c b/fs/xfs/scrub/agheader_repair.c index bbaa65422c4f..876a2f41b063 100644 --- a/fs/xfs/scrub/agheader_repair.c +++ b/fs/xfs/scrub/agheader_repair.c @@ -26,6 +26,7 @@ #include "scrub/trace.h" #include "scrub/repair.h" #include "scrub/bitmap.h" +#include "scrub/reap.h" /* Superblock */ @@ -48,6 +49,10 @@ xrep_superblock( if (error) return error; + /* Last chance to abort before we start committing fixes. */ + if (xchk_should_terminate(sc, &error)) + return error; + /* Copy AG 0's superblock to this one. */ xfs_buf_zero(bp, 0, BBTOB(bp->b_length)); xfs_sb_to_disk(bp->b_addr, &mp->m_sb); @@ -423,6 +428,10 @@ xrep_agf( if (error) return error; + /* Last chance to abort before we start committing fixes. */ + if (xchk_should_terminate(sc, &error)) + return error; + /* Start rewriting the header and implant the btrees we found. */ xrep_agf_init_header(sc, agf_bp, &old_agf); xrep_agf_set_roots(sc, agf, fab); @@ -444,13 +453,13 @@ out_revert: struct xrep_agfl { /* Bitmap of alleged AGFL blocks that we're not going to add. */ - struct xbitmap crossed; + struct xagb_bitmap crossed; /* Bitmap of other OWN_AG metadata blocks. */ - struct xbitmap agmetablocks; + struct xagb_bitmap agmetablocks; /* Bitmap of free space. */ - struct xbitmap *freesp; + struct xagb_bitmap *freesp; /* rmapbt cursor for finding crosslinked blocks */ struct xfs_btree_cur *rmap_cur; @@ -466,7 +475,6 @@ xrep_agfl_walk_rmap( void *priv) { struct xrep_agfl *ra = priv; - xfs_fsblock_t fsb; int error = 0; if (xchk_should_terminate(ra->sc, &error)) @@ -474,14 +482,13 @@ xrep_agfl_walk_rmap( /* Record all the OWN_AG blocks. */ if (rec->rm_owner == XFS_RMAP_OWN_AG) { - fsb = XFS_AGB_TO_FSB(cur->bc_mp, cur->bc_ag.pag->pag_agno, - rec->rm_startblock); - error = xbitmap_set(ra->freesp, fsb, rec->rm_blockcount); + error = xagb_bitmap_set(ra->freesp, rec->rm_startblock, + rec->rm_blockcount); if (error) return error; } - return xbitmap_set_btcur_path(&ra->agmetablocks, cur); + return xagb_bitmap_set_btcur_path(&ra->agmetablocks, cur); } /* Strike out the blocks that are cross-linked according to the rmapbt. */ @@ -492,12 +499,10 @@ xrep_agfl_check_extent( void *priv) { struct xrep_agfl *ra = priv; - xfs_agblock_t agbno = XFS_FSB_TO_AGBNO(ra->sc->mp, start); + xfs_agblock_t agbno = start; xfs_agblock_t last_agbno = agbno + len - 1; int error; - ASSERT(XFS_FSB_TO_AGNO(ra->sc->mp, start) == ra->sc->sa.pag->pag_agno); - while (agbno <= last_agbno) { bool other_owners; @@ -507,7 +512,7 @@ xrep_agfl_check_extent( return error; if (other_owners) { - error = xbitmap_set(&ra->crossed, agbno, 1); + error = xagb_bitmap_set(&ra->crossed, agbno, 1); if (error) return error; } @@ -533,7 +538,7 @@ STATIC int xrep_agfl_collect_blocks( struct xfs_scrub *sc, struct xfs_buf *agf_bp, - struct xbitmap *agfl_extents, + struct xagb_bitmap *agfl_extents, xfs_agblock_t *flcount) { struct xrep_agfl ra; @@ -543,8 +548,8 @@ xrep_agfl_collect_blocks( ra.sc = sc; ra.freesp = agfl_extents; - xbitmap_init(&ra.agmetablocks); - xbitmap_init(&ra.crossed); + xagb_bitmap_init(&ra.agmetablocks); + xagb_bitmap_init(&ra.crossed); /* Find all space used by the free space btrees & rmapbt. */ cur = xfs_rmapbt_init_cursor(mp, sc->tp, agf_bp, sc->sa.pag); @@ -556,7 +561,7 @@ xrep_agfl_collect_blocks( /* Find all blocks currently being used by the bnobt. */ cur = xfs_allocbt_init_cursor(mp, sc->tp, agf_bp, sc->sa.pag, XFS_BTNUM_BNO); - error = xbitmap_set_btblocks(&ra.agmetablocks, cur); + error = xagb_bitmap_set_btblocks(&ra.agmetablocks, cur); xfs_btree_del_cursor(cur, error); if (error) goto out_bmp; @@ -564,7 +569,7 @@ xrep_agfl_collect_blocks( /* Find all blocks currently being used by the cntbt. */ cur = xfs_allocbt_init_cursor(mp, sc->tp, agf_bp, sc->sa.pag, XFS_BTNUM_CNT); - error = xbitmap_set_btblocks(&ra.agmetablocks, cur); + error = xagb_bitmap_set_btblocks(&ra.agmetablocks, cur); xfs_btree_del_cursor(cur, error); if (error) goto out_bmp; @@ -573,17 +578,17 @@ xrep_agfl_collect_blocks( * Drop the freesp meta blocks that are in use by btrees. * The remaining blocks /should/ be AGFL blocks. */ - error = xbitmap_disunion(agfl_extents, &ra.agmetablocks); + error = xagb_bitmap_disunion(agfl_extents, &ra.agmetablocks); if (error) goto out_bmp; /* Strike out the blocks that are cross-linked. */ ra.rmap_cur = xfs_rmapbt_init_cursor(mp, sc->tp, agf_bp, sc->sa.pag); - error = xbitmap_walk(agfl_extents, xrep_agfl_check_extent, &ra); + error = xagb_bitmap_walk(agfl_extents, xrep_agfl_check_extent, &ra); xfs_btree_del_cursor(ra.rmap_cur, error); if (error) goto out_bmp; - error = xbitmap_disunion(agfl_extents, &ra.crossed); + error = xagb_bitmap_disunion(agfl_extents, &ra.crossed); if (error) goto out_bmp; @@ -591,12 +596,12 @@ xrep_agfl_collect_blocks( * Calculate the new AGFL size. If we found more blocks than fit in * the AGFL we'll free them later. */ - *flcount = min_t(uint64_t, xbitmap_hweight(agfl_extents), + *flcount = min_t(uint64_t, xagb_bitmap_hweight(agfl_extents), xfs_agfl_size(mp)); out_bmp: - xbitmap_destroy(&ra.crossed); - xbitmap_destroy(&ra.agmetablocks); + xagb_bitmap_destroy(&ra.crossed); + xagb_bitmap_destroy(&ra.agmetablocks); return error; } @@ -615,18 +620,24 @@ xrep_agfl_update_agf( xfs_force_summary_recalc(sc->mp); /* Update the AGF counters. */ - if (xfs_perag_initialised_agf(sc->sa.pag)) + if (xfs_perag_initialised_agf(sc->sa.pag)) { sc->sa.pag->pagf_flcount = flcount; + clear_bit(XFS_AGSTATE_AGFL_NEEDS_RESET, + &sc->sa.pag->pag_opstate); + } agf->agf_flfirst = cpu_to_be32(0); agf->agf_flcount = cpu_to_be32(flcount); - agf->agf_fllast = cpu_to_be32(flcount - 1); + if (flcount) + agf->agf_fllast = cpu_to_be32(flcount - 1); + else + agf->agf_fllast = cpu_to_be32(xfs_agfl_size(sc->mp) - 1); xfs_alloc_log_agf(sc->tp, agf_bp, XFS_AGF_FLFIRST | XFS_AGF_FLLAST | XFS_AGF_FLCOUNT); } struct xrep_agfl_fill { - struct xbitmap used_extents; + struct xagb_bitmap used_extents; struct xfs_scrub *sc; __be32 *agfl_bno; xfs_agblock_t flcount; @@ -642,17 +653,15 @@ xrep_agfl_fill( { struct xrep_agfl_fill *af = priv; struct xfs_scrub *sc = af->sc; - xfs_fsblock_t fsbno = start; + xfs_agblock_t agbno = start; int error; - while (fsbno < start + len && af->fl_off < af->flcount) - af->agfl_bno[af->fl_off++] = - cpu_to_be32(XFS_FSB_TO_AGBNO(sc->mp, fsbno++)); + trace_xrep_agfl_insert(sc->sa.pag, agbno, len); - trace_xrep_agfl_insert(sc->mp, sc->sa.pag->pag_agno, - XFS_FSB_TO_AGBNO(sc->mp, start), len); + while (agbno < start + len && af->fl_off < af->flcount) + af->agfl_bno[af->fl_off++] = cpu_to_be32(agbno++); - error = xbitmap_set(&af->used_extents, start, fsbno - 1); + error = xagb_bitmap_set(&af->used_extents, start, agbno - 1); if (error) return error; @@ -667,7 +676,7 @@ STATIC int xrep_agfl_init_header( struct xfs_scrub *sc, struct xfs_buf *agfl_bp, - struct xbitmap *agfl_extents, + struct xagb_bitmap *agfl_extents, xfs_agblock_t flcount) { struct xrep_agfl_fill af = { @@ -695,17 +704,17 @@ xrep_agfl_init_header( * blocks than fit in the AGFL, they will be freed in a subsequent * step. */ - xbitmap_init(&af.used_extents); + xagb_bitmap_init(&af.used_extents); af.agfl_bno = xfs_buf_to_agfl_bno(agfl_bp), - xbitmap_walk(agfl_extents, xrep_agfl_fill, &af); - error = xbitmap_disunion(agfl_extents, &af.used_extents); + xagb_bitmap_walk(agfl_extents, xrep_agfl_fill, &af); + error = xagb_bitmap_disunion(agfl_extents, &af.used_extents); if (error) return error; /* Write new AGFL to disk. */ xfs_trans_buf_set_type(sc->tp, agfl_bp, XFS_BLFT_AGFL_BUF); xfs_trans_log_buf(sc->tp, agfl_bp, 0, BBTOB(agfl_bp->b_length) - 1); - xbitmap_destroy(&af.used_extents); + xagb_bitmap_destroy(&af.used_extents); return 0; } @@ -714,7 +723,7 @@ int xrep_agfl( struct xfs_scrub *sc) { - struct xbitmap agfl_extents; + struct xagb_bitmap agfl_extents; struct xfs_mount *mp = sc->mp; struct xfs_buf *agf_bp; struct xfs_buf *agfl_bp; @@ -725,7 +734,7 @@ xrep_agfl( if (!xfs_has_rmapbt(mp)) return -EOPNOTSUPP; - xbitmap_init(&agfl_extents); + xagb_bitmap_init(&agfl_extents); /* * Read the AGF so that we can query the rmapbt. We hope that there's @@ -753,6 +762,10 @@ xrep_agfl( if (error) goto err; + /* Last chance to abort before we start committing fixes. */ + if (xchk_should_terminate(sc, &error)) + goto err; + /* * Update AGF and AGFL. We reset the global free block counter when * we adjust the AGF flcount (which can fail) so avoid updating any @@ -774,10 +787,10 @@ xrep_agfl( goto err; /* Dump any AGFL overflow. */ - error = xrep_reap_extents(sc, &agfl_extents, &XFS_RMAP_OINFO_AG, + error = xrep_reap_agblocks(sc, &agfl_extents, &XFS_RMAP_OINFO_AG, XFS_AG_RESV_AGFL); err: - xbitmap_destroy(&agfl_extents); + xagb_bitmap_destroy(&agfl_extents); return error; } @@ -1000,6 +1013,10 @@ xrep_agi( if (error) return error; + /* Last chance to abort before we start committing fixes. */ + if (xchk_should_terminate(sc, &error)) + return error; + /* Start rewriting the header and implant the btrees we found. */ xrep_agi_init_header(sc, agi_bp, &old_agi); xrep_agi_set_roots(sc, agi, fab); diff --git a/fs/xfs/scrub/bitmap.c b/fs/xfs/scrub/bitmap.c index 0c959be396ea..e0c89a9a0ca0 100644 --- a/fs/xfs/scrub/bitmap.c +++ b/fs/xfs/scrub/bitmap.c @@ -301,21 +301,15 @@ xagb_bitmap_set_btblocks( * blocks going from the leaf towards the root. */ int -xbitmap_set_btcur_path( - struct xbitmap *bitmap, +xagb_bitmap_set_btcur_path( + struct xagb_bitmap *bitmap, struct xfs_btree_cur *cur) { - struct xfs_buf *bp; - xfs_fsblock_t fsb; int i; int error; for (i = 0; i < cur->bc_nlevels && cur->bc_levels[i].ptr == 1; i++) { - xfs_btree_get_block(cur, i, &bp); - if (!bp) - continue; - fsb = XFS_DADDR_TO_FSB(cur->bc_mp, xfs_buf_daddr(bp)); - error = xbitmap_set(bitmap, fsb, 1); + error = xagb_bitmap_visit_btblock(cur, i, bitmap); if (error) return error; } @@ -323,35 +317,6 @@ xbitmap_set_btcur_path( return 0; } -/* Collect a btree's block in the bitmap. */ -STATIC int -xbitmap_collect_btblock( - struct xfs_btree_cur *cur, - int level, - void *priv) -{ - struct xbitmap *bitmap = priv; - struct xfs_buf *bp; - xfs_fsblock_t fsbno; - - xfs_btree_get_block(cur, level, &bp); - if (!bp) - return 0; - - fsbno = XFS_DADDR_TO_FSB(cur->bc_mp, xfs_buf_daddr(bp)); - return xbitmap_set(bitmap, fsbno, 1); -} - -/* Walk the btree and mark the bitmap wherever a btree block is found. */ -int -xbitmap_set_btblocks( - struct xbitmap *bitmap, - struct xfs_btree_cur *cur) -{ - return xfs_btree_visit_blocks(cur, xbitmap_collect_btblock, - XFS_BTREE_VISIT_ALL, bitmap); -} - /* How many bits are set in this bitmap? */ uint64_t xbitmap_hweight( @@ -385,43 +350,6 @@ xbitmap_walk( return error; } -struct xbitmap_walk_bits { - xbitmap_walk_bits_fn fn; - void *priv; -}; - -/* Walk all the bits in a run. */ -static int -xbitmap_walk_bits_in_run( - uint64_t start, - uint64_t len, - void *priv) -{ - struct xbitmap_walk_bits *wb = priv; - uint64_t i; - int error = 0; - - for (i = start; i < start + len; i++) { - error = wb->fn(i, wb->priv); - if (error) - break; - } - - return error; -} - -/* Call a function for every set bit in this bitmap. */ -int -xbitmap_walk_bits( - struct xbitmap *bitmap, - xbitmap_walk_bits_fn fn, - void *priv) -{ - struct xbitmap_walk_bits wb = {.fn = fn, .priv = priv}; - - return xbitmap_walk(bitmap, xbitmap_walk_bits_in_run, &wb); -} - /* Does this bitmap have no bits set at all? */ bool xbitmap_empty( diff --git a/fs/xfs/scrub/bitmap.h b/fs/xfs/scrub/bitmap.h index 84981724ecaf..4fe58bad6734 100644 --- a/fs/xfs/scrub/bitmap.h +++ b/fs/xfs/scrub/bitmap.h @@ -16,10 +16,6 @@ void xbitmap_destroy(struct xbitmap *bitmap); int xbitmap_clear(struct xbitmap *bitmap, uint64_t start, uint64_t len); int xbitmap_set(struct xbitmap *bitmap, uint64_t start, uint64_t len); int xbitmap_disunion(struct xbitmap *bitmap, struct xbitmap *sub); -int xbitmap_set_btcur_path(struct xbitmap *bitmap, - struct xfs_btree_cur *cur); -int xbitmap_set_btblocks(struct xbitmap *bitmap, - struct xfs_btree_cur *cur); uint64_t xbitmap_hweight(struct xbitmap *bitmap); /* @@ -33,10 +29,6 @@ typedef int (*xbitmap_walk_fn)(uint64_t start, uint64_t len, void *priv); int xbitmap_walk(struct xbitmap *bitmap, xbitmap_walk_fn fn, void *priv); -typedef int (*xbitmap_walk_bits_fn)(uint64_t bit, void *priv); -int xbitmap_walk_bits(struct xbitmap *bitmap, xbitmap_walk_bits_fn fn, - void *priv); - bool xbitmap_empty(struct xbitmap *bitmap); bool xbitmap_test(struct xbitmap *bitmap, uint64_t start, uint64_t *len); @@ -110,5 +102,7 @@ static inline int xagb_bitmap_walk(struct xagb_bitmap *bitmap, int xagb_bitmap_set_btblocks(struct xagb_bitmap *bitmap, struct xfs_btree_cur *cur); +int xagb_bitmap_set_btcur_path(struct xagb_bitmap *bitmap, + struct xfs_btree_cur *cur); #endif /* __XFS_SCRUB_BITMAP_H__ */ diff --git a/fs/xfs/scrub/bmap.c b/fs/xfs/scrub/bmap.c index 5bf4326e9783..06d8c1996a33 100644 --- a/fs/xfs/scrub/bmap.c +++ b/fs/xfs/scrub/bmap.c @@ -38,8 +38,7 @@ xchk_setup_inode_bmap( if (error) goto out; - sc->ilock_flags = XFS_IOLOCK_EXCL; - xfs_ilock(sc->ip, XFS_IOLOCK_EXCL); + xchk_ilock(sc, XFS_IOLOCK_EXCL); /* * We don't want any ephemeral data/cow fork updates sitting around @@ -50,8 +49,7 @@ xchk_setup_inode_bmap( sc->sm->sm_type != XFS_SCRUB_TYPE_BMBTA) { struct address_space *mapping = VFS_I(sc->ip)->i_mapping; - sc->ilock_flags |= XFS_MMAPLOCK_EXCL; - xfs_ilock(sc->ip, XFS_MMAPLOCK_EXCL); + xchk_ilock(sc, XFS_MMAPLOCK_EXCL); inode_dio_wait(VFS_I(sc->ip)); @@ -79,9 +77,8 @@ xchk_setup_inode_bmap( error = xchk_trans_alloc(sc, 0); if (error) goto out; - sc->ilock_flags |= XFS_ILOCK_EXCL; - xfs_ilock(sc->ip, XFS_ILOCK_EXCL); + xchk_ilock(sc, XFS_ILOCK_EXCL); out: /* scrub teardown will unlock and release the inode */ return error; @@ -413,7 +410,7 @@ xchk_bmap_iextent( /* Make sure the extent points to a valid place. */ if (info->is_rt && - !xfs_verify_rtext(mp, irec->br_startblock, irec->br_blockcount)) + !xfs_verify_rtbext(mp, irec->br_startblock, irec->br_blockcount)) xchk_fblock_set_corrupt(info->sc, info->whichfork, irec->br_startoff); if (!info->is_rt && @@ -844,7 +841,7 @@ xchk_bmap( /* Non-existent forks can be ignored. */ if (!ifp) - goto out; + return -ENOENT; info.is_rt = whichfork == XFS_DATA_FORK && XFS_IS_REALTIME_INODE(ip); info.whichfork = whichfork; @@ -853,10 +850,10 @@ xchk_bmap( switch (whichfork) { case XFS_COW_FORK: - /* No CoW forks on non-reflink inodes/filesystems. */ - if (!xfs_is_reflink_inode(ip)) { + /* No CoW forks on non-reflink filesystems. */ + if (!xfs_has_reflink(mp)) { xchk_ino_set_corrupt(sc, sc->ip->i_ino); - goto out; + return 0; } break; case XFS_ATTR_FORK: @@ -876,31 +873,31 @@ xchk_bmap( /* No mappings to check. */ if (whichfork == XFS_COW_FORK) xchk_fblock_set_corrupt(sc, whichfork, 0); - goto out; + return 0; case XFS_DINODE_FMT_EXTENTS: break; case XFS_DINODE_FMT_BTREE: if (whichfork == XFS_COW_FORK) { xchk_fblock_set_corrupt(sc, whichfork, 0); - goto out; + return 0; } error = xchk_bmap_btree(sc, whichfork, &info); if (error) - goto out; + return error; break; default: xchk_fblock_set_corrupt(sc, whichfork, 0); - goto out; + return 0; } if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) - goto out; + return 0; /* Find the offset of the last extent in the mapping. */ error = xfs_bmap_last_offset(ip, &endoff, whichfork); if (!xchk_fblock_process_error(sc, whichfork, 0, &error)) - goto out; + return error; /* * Scrub extent records. We use a special iterator function here that @@ -913,12 +910,12 @@ xchk_bmap( while (xchk_bmap_iext_iter(&info, &irec)) { if (xchk_should_terminate(sc, &error) || (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)) - goto out; + return 0; if (irec.br_startoff >= endoff) { xchk_fblock_set_corrupt(sc, whichfork, irec.br_startoff); - goto out; + return 0; } if (isnullstartblock(irec.br_startblock)) @@ -931,10 +928,10 @@ xchk_bmap( if (xchk_bmap_want_check_rmaps(&info)) { error = xchk_bmap_check_rmaps(sc, whichfork); if (!xchk_fblock_xref_process_error(sc, whichfork, 0, &error)) - goto out; + return error; } -out: - return error; + + return 0; } /* Scrub an inode's data fork. */ @@ -958,8 +955,5 @@ int xchk_bmap_cow( struct xfs_scrub *sc) { - if (!xfs_is_reflink_inode(sc->ip)) - return -ENOENT; - return xchk_bmap(sc, XFS_COW_FORK); } diff --git a/fs/xfs/scrub/common.c b/fs/xfs/scrub/common.c index 7a20256be969..de24532fe083 100644 --- a/fs/xfs/scrub/common.c +++ b/fs/xfs/scrub/common.c @@ -832,6 +832,25 @@ xchk_install_handle_inode( } /* + * Install an already-referenced inode for scrubbing. Get our own reference to + * the inode to make disposal simpler. The inode must not be in I_FREEING or + * I_WILL_FREE state! + */ +int +xchk_install_live_inode( + struct xfs_scrub *sc, + struct xfs_inode *ip) +{ + if (!igrab(VFS_I(ip))) { + xchk_ino_set_corrupt(sc, ip->i_ino); + return -EFSCORRUPTED; + } + + sc->ip = ip; + return 0; +} + +/* * In preparation to scrub metadata structures that hang off of an inode, * grab either the inode referenced in the scrub control structure or the * inode passed in. If the inumber does not reference an allocated inode @@ -854,10 +873,8 @@ xchk_iget_for_scrubbing( ASSERT(sc->tp == NULL); /* We want to scan the inode we already had opened. */ - if (sc->sm->sm_ino == 0 || sc->sm->sm_ino == ip_in->i_ino) { - sc->ip = ip_in; - return 0; - } + if (sc->sm->sm_ino == 0 || sc->sm->sm_ino == ip_in->i_ino) + return xchk_install_live_inode(sc, ip_in); /* Reject internal metadata files and obviously bad inode numbers. */ if (xfs_internal_inum(mp, sc->sm->sm_ino)) @@ -1005,20 +1022,48 @@ xchk_setup_inode_contents( return error; /* Lock the inode so the VFS cannot touch this file. */ - sc->ilock_flags = XFS_IOLOCK_EXCL; - xfs_ilock(sc->ip, sc->ilock_flags); + xchk_ilock(sc, XFS_IOLOCK_EXCL); error = xchk_trans_alloc(sc, resblks); if (error) goto out; - sc->ilock_flags |= XFS_ILOCK_EXCL; - xfs_ilock(sc->ip, XFS_ILOCK_EXCL); - + xchk_ilock(sc, XFS_ILOCK_EXCL); out: /* scrub teardown will unlock and release the inode for us */ return error; } +void +xchk_ilock( + struct xfs_scrub *sc, + unsigned int ilock_flags) +{ + xfs_ilock(sc->ip, ilock_flags); + sc->ilock_flags |= ilock_flags; +} + +bool +xchk_ilock_nowait( + struct xfs_scrub *sc, + unsigned int ilock_flags) +{ + if (xfs_ilock_nowait(sc->ip, ilock_flags)) { + sc->ilock_flags |= ilock_flags; + return true; + } + + return false; +} + +void +xchk_iunlock( + struct xfs_scrub *sc, + unsigned int ilock_flags) +{ + sc->ilock_flags &= ~ilock_flags; + xfs_iunlock(sc->ip, ilock_flags); +} + /* * Predicate that decides if we need to evaluate the cross-reference check. * If there was an error accessing the cross-reference btree, just delete @@ -1185,3 +1230,155 @@ xchk_fsgates_enable( sc->flags |= scrub_fsgates; } + +/* + * Decide if this is this a cached inode that's also allocated. The caller + * must hold a reference to an AG and the AGI buffer lock to prevent inodes + * from being allocated or freed. + * + * Look up an inode by number in the given file system. If the inode number + * is invalid, return -EINVAL. If the inode is not in cache, return -ENODATA. + * If the inode is being reclaimed, return -ENODATA because we know the inode + * cache cannot be updating the ondisk metadata. + * + * Otherwise, the incore inode is the one we want, and it is either live, + * somewhere in the inactivation machinery, or reclaimable. The inode is + * allocated if i_mode is nonzero. In all three cases, the cached inode will + * be more up to date than the ondisk inode buffer, so we must use the incore + * i_mode. + */ +int +xchk_inode_is_allocated( + struct xfs_scrub *sc, + xfs_agino_t agino, + bool *inuse) +{ + struct xfs_mount *mp = sc->mp; + struct xfs_perag *pag = sc->sa.pag; + xfs_ino_t ino; + struct xfs_inode *ip; + int error; + + /* caller must hold perag reference */ + if (pag == NULL) { + ASSERT(pag != NULL); + return -EINVAL; + } + + /* caller must have AGI buffer */ + if (sc->sa.agi_bp == NULL) { + ASSERT(sc->sa.agi_bp != NULL); + return -EINVAL; + } + + /* reject inode numbers outside existing AGs */ + ino = XFS_AGINO_TO_INO(sc->mp, pag->pag_agno, agino); + if (!xfs_verify_ino(mp, ino)) + return -EINVAL; + + error = -ENODATA; + rcu_read_lock(); + ip = radix_tree_lookup(&pag->pag_ici_root, agino); + if (!ip) { + /* cache miss */ + goto out_rcu; + } + + /* + * If the inode number doesn't match, the incore inode got reused + * during an RCU grace period and the radix tree hasn't been updated. + * This isn't the inode we want. + */ + spin_lock(&ip->i_flags_lock); + if (ip->i_ino != ino) + goto out_skip; + + trace_xchk_inode_is_allocated(ip); + + /* + * We have an incore inode that matches the inode we want, and the + * caller holds the perag structure and the AGI buffer. Let's check + * our assumptions below: + */ + +#ifdef DEBUG + /* + * (1) If the incore inode is live (i.e. referenced from the dcache), + * it will not be INEW, nor will it be in the inactivation or reclaim + * machinery. The ondisk inode had better be allocated. This is the + * most trivial case. + */ + if (!(ip->i_flags & (XFS_NEED_INACTIVE | XFS_INEW | XFS_IRECLAIMABLE | + XFS_INACTIVATING))) { + /* live inode */ + ASSERT(VFS_I(ip)->i_mode != 0); + } + + /* + * If the incore inode is INEW, there are several possibilities: + * + * (2) For a file that is being created, note that we allocate the + * ondisk inode before allocating, initializing, and adding the incore + * inode to the radix tree. + * + * (3) If the incore inode is being recycled, the inode has to be + * allocated because we don't allow freed inodes to be recycled. + * Recycling doesn't touch i_mode. + */ + if (ip->i_flags & XFS_INEW) { + /* created on disk already or recycling */ + ASSERT(VFS_I(ip)->i_mode != 0); + } + + /* + * (4) If the inode is queued for inactivation (NEED_INACTIVE) but + * inactivation has not started (!INACTIVATING), it is still allocated. + */ + if ((ip->i_flags & XFS_NEED_INACTIVE) && + !(ip->i_flags & XFS_INACTIVATING)) { + /* definitely before difree */ + ASSERT(VFS_I(ip)->i_mode != 0); + } +#endif + + /* + * If the incore inode is undergoing inactivation (INACTIVATING), there + * are two possibilities: + * + * (5) It is before the point where it would get freed ondisk, in which + * case i_mode is still nonzero. + * + * (6) It has already been freed, in which case i_mode is zero. + * + * We don't take the ILOCK here, but difree and dialloc update the AGI, + * and we've taken the AGI buffer lock, which prevents that from + * happening. + */ + + /* + * (7) Inodes undergoing inactivation (INACTIVATING) or queued for + * reclaim (IRECLAIMABLE) could be allocated or free. i_mode still + * reflects the ondisk state. + */ + + /* + * (8) If the inode is in IFLUSHING, it's safe to query i_mode because + * the flush code uses i_mode to format the ondisk inode. + */ + + /* + * (9) If the inode is in IRECLAIM and was reachable via the radix + * tree, it still has the same i_mode as it did before it entered + * reclaim. The inode object is still alive because we hold the RCU + * read lock. + */ + + *inuse = VFS_I(ip)->i_mode != 0; + error = 0; + +out_skip: + spin_unlock(&ip->i_flags_lock); +out_rcu: + rcu_read_unlock(); + return error; +} diff --git a/fs/xfs/scrub/common.h b/fs/xfs/scrub/common.h index 791235cd9b00..cabdc0e16838 100644 --- a/fs/xfs/scrub/common.h +++ b/fs/xfs/scrub/common.h @@ -88,10 +88,16 @@ int xchk_setup_xattr(struct xfs_scrub *sc); int xchk_setup_symlink(struct xfs_scrub *sc); int xchk_setup_parent(struct xfs_scrub *sc); #ifdef CONFIG_XFS_RT -int xchk_setup_rt(struct xfs_scrub *sc); +int xchk_setup_rtbitmap(struct xfs_scrub *sc); +int xchk_setup_rtsummary(struct xfs_scrub *sc); #else static inline int -xchk_setup_rt(struct xfs_scrub *sc) +xchk_setup_rtbitmap(struct xfs_scrub *sc) +{ + return -ENOENT; +} +static inline int +xchk_setup_rtsummary(struct xfs_scrub *sc) { return -ENOENT; } @@ -137,6 +143,12 @@ int xchk_count_rmap_ownedby_ag(struct xfs_scrub *sc, struct xfs_btree_cur *cur, int xchk_setup_ag_btree(struct xfs_scrub *sc, bool force_log); int xchk_iget_for_scrubbing(struct xfs_scrub *sc); int xchk_setup_inode_contents(struct xfs_scrub *sc, unsigned int resblks); +int xchk_install_live_inode(struct xfs_scrub *sc, struct xfs_inode *ip); + +void xchk_ilock(struct xfs_scrub *sc, unsigned int ilock_flags); +bool xchk_ilock_nowait(struct xfs_scrub *sc, unsigned int ilock_flags); +void xchk_iunlock(struct xfs_scrub *sc, unsigned int ilock_flags); + void xchk_buffer_recheck(struct xfs_scrub *sc, struct xfs_buf *bp); int xchk_iget(struct xfs_scrub *sc, xfs_ino_t inum, struct xfs_inode **ipp); @@ -155,9 +167,29 @@ static inline bool xchk_skip_xref(struct xfs_scrub_metadata *sm) XFS_SCRUB_OFLAG_XCORRUPT); } +#ifdef CONFIG_XFS_ONLINE_REPAIR +/* Decide if a repair is required. */ +static inline bool xchk_needs_repair(const struct xfs_scrub_metadata *sm) +{ + return sm->sm_flags & (XFS_SCRUB_OFLAG_CORRUPT | + XFS_SCRUB_OFLAG_XCORRUPT | + XFS_SCRUB_OFLAG_PREEN); +} +#else +# define xchk_needs_repair(sc) (false) +#endif /* CONFIG_XFS_ONLINE_REPAIR */ + int xchk_metadata_inode_forks(struct xfs_scrub *sc); /* + * Helper macros to allocate and format xfile description strings. + * Callers must kfree the pointer returned. + */ +#define xchk_xfile_descr(sc, fmt, ...) \ + kasprintf(XCHK_GFP_FLAGS, "XFS (%s): " fmt, \ + (sc)->mp->m_super->s_id, ##__VA_ARGS__) + +/* * Setting up a hook to wait for intents to drain is costly -- we have to take * the CPU hotplug lock and force an i-cache flush on all CPUs once to set it * up, and again to tear it down. These costs add up quickly, so we only want @@ -171,4 +203,7 @@ static inline bool xchk_need_intent_drain(struct xfs_scrub *sc) void xchk_fsgates_enable(struct xfs_scrub *sc, unsigned int scrub_fshooks); +int xchk_inode_is_allocated(struct xfs_scrub *sc, xfs_agino_t agino, + bool *inuse); + #endif /* __XFS_SCRUB_COMMON_H__ */ diff --git a/fs/xfs/scrub/fscounters.c b/fs/xfs/scrub/fscounters.c index e382a35e98d8..5799e9a94f1f 100644 --- a/fs/xfs/scrub/fscounters.c +++ b/fs/xfs/scrub/fscounters.c @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: GPL-2.0+ +// SPDX-License-Identifier: GPL-2.0-or-later /* * Copyright (C) 2019-2023 Oracle. All Rights Reserved. * Author: Darrick J. Wong <djwong@kernel.org> @@ -8,14 +8,17 @@ #include "xfs_shared.h" #include "xfs_format.h" #include "xfs_trans_resv.h" +#include "xfs_log_format.h" +#include "xfs_trans.h" #include "xfs_mount.h" #include "xfs_alloc.h" #include "xfs_ialloc.h" #include "xfs_health.h" #include "xfs_btree.h" #include "xfs_ag.h" -#include "xfs_rtalloc.h" +#include "xfs_rtbitmap.h" #include "xfs_inode.h" +#include "xfs_icache.h" #include "scrub/scrub.h" #include "scrub/common.h" #include "scrub/trace.h" @@ -53,6 +56,7 @@ struct xchk_fscounters { uint64_t frextents; unsigned long long icount_min; unsigned long long icount_max; + bool frozen; }; /* @@ -123,6 +127,82 @@ xchk_fscount_warmup( return error; } +static inline int +xchk_fsfreeze( + struct xfs_scrub *sc) +{ + int error; + + error = freeze_super(sc->mp->m_super, FREEZE_HOLDER_KERNEL); + trace_xchk_fsfreeze(sc, error); + return error; +} + +static inline int +xchk_fsthaw( + struct xfs_scrub *sc) +{ + int error; + + /* This should always succeed, we have a kernel freeze */ + error = thaw_super(sc->mp->m_super, FREEZE_HOLDER_KERNEL); + trace_xchk_fsthaw(sc, error); + return error; +} + +/* + * We couldn't stabilize the filesystem long enough to sample all the variables + * that comprise the summary counters and compare them to the percpu counters. + * We need to disable all writer threads, which means taking the first two + * freeze levels to put userspace to sleep, and the third freeze level to + * prevent background threads from starting new transactions. Take one level + * more to prevent other callers from unfreezing the filesystem while we run. + */ +STATIC int +xchk_fscounters_freeze( + struct xfs_scrub *sc) +{ + struct xchk_fscounters *fsc = sc->buf; + int error = 0; + + if (sc->flags & XCHK_HAVE_FREEZE_PROT) { + sc->flags &= ~XCHK_HAVE_FREEZE_PROT; + mnt_drop_write_file(sc->file); + } + + /* Try to grab a kernel freeze. */ + while ((error = xchk_fsfreeze(sc)) == -EBUSY) { + if (xchk_should_terminate(sc, &error)) + return error; + + delay(HZ / 10); + } + if (error) + return error; + + fsc->frozen = true; + return 0; +} + +/* Thaw the filesystem after checking or repairing fscounters. */ +STATIC void +xchk_fscounters_cleanup( + void *buf) +{ + struct xchk_fscounters *fsc = buf; + struct xfs_scrub *sc = fsc->sc; + int error; + + if (!fsc->frozen) + return; + + error = xchk_fsthaw(sc); + if (error) + xfs_emerg(sc->mp, "still frozen after scrub, err=%d", error); + else + fsc->frozen = false; +} + int xchk_setup_fscounters( struct xfs_scrub *sc) @@ -140,6 +220,7 @@ xchk_setup_fscounters( sc->buf = kzalloc(sizeof(struct xchk_fscounters), XCHK_GFP_FLAGS); if (!sc->buf) return -ENOMEM; + sc->buf_cleanup = xchk_fscounters_cleanup; fsc = sc->buf; fsc->sc = sc; @@ -150,7 +231,18 @@ xchk_setup_fscounters( if (error) return error; - return xchk_trans_alloc(sc, 0); + /* + * Pause all writer activity in the filesystem while we're scrubbing to + * reduce the likelihood of background perturbations to the counters + * throwing off our calculations. + */ + if (sc->flags & XCHK_TRY_HARDER) { + error = xchk_fscounters_freeze(sc); + if (error) + return error; + } + + return xfs_trans_alloc_empty(sc->mp, &sc->tp); } /* @@ -290,8 +382,7 @@ retry: if (fsc->ifree > fsc->icount) { if (tries--) goto retry; - xchk_set_incomplete(sc); - return 0; + return -EDEADLOCK; } return 0; @@ -367,6 +458,8 @@ xchk_fscount_count_frextents( * Otherwise, we /might/ have a problem. If the change in the summations is * more than we want to tolerate, the filesystem is probably busy and we should * just send back INCOMPLETE and see if userspace will try again. + * + * If we're repairing then we require an exact match. */ static inline bool xchk_fscount_within_range( @@ -396,21 +489,7 @@ xchk_fscount_within_range( if (expected >= min_value && expected <= max_value) return true; - /* - * If the difference between the two summations is too large, the fs - * might just be busy and so we'll mark the scrub incomplete. Return - * true here so that we don't mark the counter corrupt. - * - * XXX: In the future when userspace can grant scrub permission to - * quiesce the filesystem to solve the outsized variance problem, this - * check should be moved up and the return code changed to signal to - * userspace that we need quiesce permission. - */ - if (max_value - min_value >= XCHK_FSCOUNT_MIN_VARIANCE) { - xchk_set_incomplete(sc); - return true; - } - + /* Everything else is bad. */ return false; } @@ -422,6 +501,7 @@ xchk_fscounters( struct xfs_mount *mp = sc->mp; struct xchk_fscounters *fsc = sc->buf; int64_t icount, ifree, fdblocks, frextents; + bool try_again = false; int error; /* Snapshot the percpu counters. */ @@ -431,9 +511,26 @@ xchk_fscounters( frextents = percpu_counter_sum(&mp->m_frextents); /* No negative values, please! */ - if (icount < 0 || ifree < 0 || fdblocks < 0 || frextents < 0) + if (icount < 0 || ifree < 0) xchk_set_corrupt(sc); + /* + * If the filesystem is not frozen, the counter summation calls above + * can race with xfs_mod_freecounter, which subtracts a requested space + * reservation from the counter and undoes the subtraction if that made + * the counter go negative. Therefore, it's possible to see negative + * values here, and we should only flag that as a corruption if we + * froze the fs. This is much more likely to happen with frextents + * since there are no reserved pools. + */ + if (fdblocks < 0 || frextents < 0) { + if (!fsc->frozen) + return -EDEADLOCK; + + xchk_set_corrupt(sc); + return 0; + } + /* See if icount is obviously wrong. */ if (icount < fsc->icount_min || icount > fsc->icount_max) xchk_set_corrupt(sc); @@ -447,12 +544,6 @@ xchk_fscounters( xchk_set_corrupt(sc); /* - * XXX: We can't quiesce percpu counter updates, so exit early. - * This can be re-enabled when we gain exclusive freeze functionality. - */ - return 0; - - /* * If ifree exceeds icount by more than the minimum variance then * something's probably wrong with the counters. */ @@ -463,8 +554,6 @@ xchk_fscounters( error = xchk_fscount_aggregate_agcounts(sc, fsc); if (!xchk_process_error(sc, 0, XFS_SB_BLOCK(mp), &error)) return error; - if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_INCOMPLETE) - return 0; /* Count the free extents counter for rt volumes. */ error = xchk_fscount_count_frextents(sc, fsc); @@ -473,20 +562,45 @@ xchk_fscounters( if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_INCOMPLETE) return 0; - /* Compare the in-core counters with whatever we counted. */ - if (!xchk_fscount_within_range(sc, icount, &mp->m_icount, fsc->icount)) - xchk_set_corrupt(sc); + /* + * Compare the in-core counters with whatever we counted. If the fs is + * frozen, we treat the discrepancy as a corruption because the freeze + * should have stabilized the counter values. Otherwise, we need + * userspace to call us back having granted us freeze permission. + */ + if (!xchk_fscount_within_range(sc, icount, &mp->m_icount, + fsc->icount)) { + if (fsc->frozen) + xchk_set_corrupt(sc); + else + try_again = true; + } - if (!xchk_fscount_within_range(sc, ifree, &mp->m_ifree, fsc->ifree)) - xchk_set_corrupt(sc); + if (!xchk_fscount_within_range(sc, ifree, &mp->m_ifree, fsc->ifree)) { + if (fsc->frozen) + xchk_set_corrupt(sc); + else + try_again = true; + } if (!xchk_fscount_within_range(sc, fdblocks, &mp->m_fdblocks, - fsc->fdblocks)) - xchk_set_corrupt(sc); + fsc->fdblocks)) { + if (fsc->frozen) + xchk_set_corrupt(sc); + else + try_again = true; + } if (!xchk_fscount_within_range(sc, frextents, &mp->m_frextents, - fsc->frextents)) - xchk_set_corrupt(sc); + fsc->frextents)) { + if (fsc->frozen) + xchk_set_corrupt(sc); + else + try_again = true; + } + + if (try_again) + return -EDEADLOCK; return 0; } diff --git a/fs/xfs/scrub/health.c b/fs/xfs/scrub/health.c index d2b2a1cb6533..5e2b09ed6e29 100644 --- a/fs/xfs/scrub/health.c +++ b/fs/xfs/scrub/health.c @@ -226,6 +226,16 @@ xchk_ag_btree_healthy_enough( return true; } + /* + * If we just repaired some AG metadata, sc->sick_mask will reflect all + * the per-AG metadata types that were repaired. Exclude these from + * the filesystem health query because we have not yet updated the + * health status and we want everything to be scanned. + */ + if ((sc->flags & XREP_ALREADY_FIXED) && + type_to_health_flag[sc->sm->sm_type].group == XHG_AG) + mask &= ~sc->sick_mask; + if (xfs_ag_has_sickness(pag, mask)) { sc->sm->sm_flags |= XFS_SCRUB_OFLAG_XFAIL; return false; diff --git a/fs/xfs/scrub/ialloc.c b/fs/xfs/scrub/ialloc.c index 575f22a02ebe..fb7bbf47ae5d 100644 --- a/fs/xfs/scrub/ialloc.c +++ b/fs/xfs/scrub/ialloc.c @@ -328,8 +328,7 @@ xchk_iallocbt_check_cluster_ifree( goto out; } - error = xfs_icache_inode_is_allocated(mp, bs->cur->bc_tp, fsino, - &ino_inuse); + error = xchk_inode_is_allocated(bs->sc, agino, &ino_inuse); if (error == -ENODATA) { /* Not cached, just read the disk buffer */ freemask_ok = irec_free ^ !!(dip->di_mode); diff --git a/fs/xfs/scrub/inode.c b/fs/xfs/scrub/inode.c index 3e1e02e340a6..889f556bc98f 100644 --- a/fs/xfs/scrub/inode.c +++ b/fs/xfs/scrub/inode.c @@ -20,6 +20,7 @@ #include "xfs_reflink.h" #include "xfs_rmap.h" #include "xfs_bmap_util.h" +#include "xfs_rtbitmap.h" #include "scrub/scrub.h" #include "scrub/common.h" #include "scrub/btree.h" @@ -32,15 +33,13 @@ xchk_prepare_iscrub( { int error; - sc->ilock_flags = XFS_IOLOCK_EXCL; - xfs_ilock(sc->ip, sc->ilock_flags); + xchk_ilock(sc, XFS_IOLOCK_EXCL); error = xchk_trans_alloc(sc, 0); if (error) return error; - sc->ilock_flags |= XFS_ILOCK_EXCL; - xfs_ilock(sc->ip, XFS_ILOCK_EXCL); + xchk_ilock(sc, XFS_ILOCK_EXCL); return 0; } @@ -83,7 +82,10 @@ xchk_setup_inode( /* We want to scan the opened inode, so lock it and exit. */ if (sc->sm->sm_ino == 0 || sc->sm->sm_ino == ip_in->i_ino) { - sc->ip = ip_in; + error = xchk_install_live_inode(sc, ip_in); + if (error) + return error; + return xchk_prepare_iscrub(sc); } @@ -224,7 +226,7 @@ xchk_inode_extsize( */ if ((flags & XFS_DIFLAG_RTINHERIT) && (flags & XFS_DIFLAG_EXTSZINHERIT) && - value % sc->mp->m_sb.sb_rextsize > 0) + xfs_extlen_to_rtxmod(sc->mp, value) > 0) xchk_ino_set_warning(sc, ino); } diff --git a/fs/xfs/scrub/parent.c b/fs/xfs/scrub/parent.c index 58d5dfb7ea21..e6155d86f791 100644 --- a/fs/xfs/scrub/parent.c +++ b/fs/xfs/scrub/parent.c @@ -150,8 +150,8 @@ xchk_parent_validate( lock_mode = xchk_parent_ilock_dir(dp); if (!lock_mode) { - xfs_iunlock(sc->ip, XFS_ILOCK_EXCL); - xfs_ilock(sc->ip, XFS_ILOCK_EXCL); + xchk_iunlock(sc, XFS_ILOCK_EXCL); + xchk_ilock(sc, XFS_ILOCK_EXCL); error = -EAGAIN; goto out_rele; } diff --git a/fs/xfs/scrub/quota.c b/fs/xfs/scrub/quota.c index e6caa358cbda..5671c8153433 100644 --- a/fs/xfs/scrub/quota.c +++ b/fs/xfs/scrub/quota.c @@ -59,9 +59,12 @@ xchk_setup_quota( error = xchk_setup_fs(sc); if (error) return error; - sc->ip = xfs_quota_inode(sc->mp, dqtype); - xfs_ilock(sc->ip, XFS_ILOCK_EXCL); - sc->ilock_flags = XFS_ILOCK_EXCL; + + error = xchk_install_live_inode(sc, xfs_quota_inode(sc->mp, dqtype)); + if (error) + return error; + + xchk_ilock(sc, XFS_ILOCK_EXCL); return 0; } @@ -235,13 +238,11 @@ xchk_quota( * data fork we have to drop ILOCK_EXCL to use the regular dquot * functions. */ - xfs_iunlock(sc->ip, sc->ilock_flags); - sc->ilock_flags = 0; + xchk_iunlock(sc, sc->ilock_flags); sqi.sc = sc; sqi.last_id = 0; error = xfs_qm_dqiterate(mp, dqtype, xchk_quota_item, &sqi); - sc->ilock_flags = XFS_ILOCK_EXCL; - xfs_ilock(sc->ip, sc->ilock_flags); + xchk_ilock(sc, XFS_ILOCK_EXCL); if (error == -ECANCELED) error = 0; if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, diff --git a/fs/xfs/scrub/reap.c b/fs/xfs/scrub/reap.c new file mode 100644 index 000000000000..86a62420e02c --- /dev/null +++ b/fs/xfs/scrub/reap.c @@ -0,0 +1,498 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2022-2023 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_btree.h" +#include "xfs_log_format.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_inode.h" +#include "xfs_alloc.h" +#include "xfs_alloc_btree.h" +#include "xfs_ialloc.h" +#include "xfs_ialloc_btree.h" +#include "xfs_rmap.h" +#include "xfs_rmap_btree.h" +#include "xfs_refcount_btree.h" +#include "xfs_extent_busy.h" +#include "xfs_ag.h" +#include "xfs_ag_resv.h" +#include "xfs_quota.h" +#include "xfs_qm.h" +#include "xfs_bmap.h" +#include "xfs_da_format.h" +#include "xfs_da_btree.h" +#include "xfs_attr.h" +#include "xfs_attr_remote.h" +#include "scrub/scrub.h" +#include "scrub/common.h" +#include "scrub/trace.h" +#include "scrub/repair.h" +#include "scrub/bitmap.h" +#include "scrub/reap.h" + +/* + * Disposal of Blocks from Old Metadata + * + * Now that we've constructed a new btree to replace the damaged one, we want + * to dispose of the blocks that (we think) the old btree was using. + * Previously, we used the rmapbt to collect the extents (bitmap) with the + * rmap owner corresponding to the tree we rebuilt, collected extents for any + * blocks with the same rmap owner that are owned by another data structure + * (sublist), and subtracted sublist from bitmap. In theory the extents + * remaining in bitmap are the old btree's blocks. + * + * Unfortunately, it's possible that the btree was crosslinked with other + * blocks on disk. The rmap data can tell us if there are multiple owners, so + * if the rmapbt says there is an owner of this block other than @oinfo, then + * the block is crosslinked. Remove the reverse mapping and continue. + * + * If there is one rmap record, we can free the block, which removes the + * reverse mapping but doesn't add the block to the free space. Our repair + * strategy is to hope the other metadata objects crosslinked on this block + * will be rebuilt (atop different blocks), thereby removing all the cross + * links. + * + * If there are no rmap records at all, we also free the block. If the btree + * being rebuilt lives in the free space (bnobt/cntbt/rmapbt) then there isn't + * supposed to be a rmap record and everything is ok. For other btrees there + * had to have been an rmap entry for the block to have ended up on @bitmap, + * so if it's gone now there's something wrong and the fs will shut down. + * + * Note: If there are multiple rmap records with only the same rmap owner as + * the btree we're trying to rebuild and the block is indeed owned by another + * data structure with the same rmap owner, then the block will be in sublist + * and therefore doesn't need disposal. If there are multiple rmap records + * with only the same rmap owner but the block is not owned by something with + * the same rmap owner, the block will be freed. + * + * The caller is responsible for locking the AG headers for the entire rebuild + * operation so that nothing else can sneak in and change the AG state while + * we're not looking. We must also invalidate any buffers associated with + * @bitmap. + */ + +/* Information about reaping extents after a repair. */ +struct xreap_state { + struct xfs_scrub *sc; + + /* Reverse mapping owner and metadata reservation type. */ + const struct xfs_owner_info *oinfo; + enum xfs_ag_resv_type resv; + + /* If true, roll the transaction before reaping the next extent. */ + bool force_roll; + + /* Number of deferred reaps attached to the current transaction. */ + unsigned int deferred; + + /* Number of invalidated buffers logged to the current transaction. */ + unsigned int invalidated; + + /* Number of deferred reaps queued during the whole reap sequence. */ + unsigned long long total_deferred; +}; + +/* Put a block back on the AGFL. */ +STATIC int +xreap_put_freelist( + struct xfs_scrub *sc, + xfs_agblock_t agbno) +{ + struct xfs_buf *agfl_bp; + int error; + + /* Make sure there's space on the freelist. */ + error = xrep_fix_freelist(sc, true); + if (error) + return error; + + /* + * Since we're "freeing" a lost block onto the AGFL, we have to + * create an rmap for the block prior to merging it or else other + * parts will break. + */ + error = xfs_rmap_alloc(sc->tp, sc->sa.agf_bp, sc->sa.pag, agbno, 1, + &XFS_RMAP_OINFO_AG); + if (error) + return error; + + /* Put the block on the AGFL. */ + error = xfs_alloc_read_agfl(sc->sa.pag, sc->tp, &agfl_bp); + if (error) + return error; + + error = xfs_alloc_put_freelist(sc->sa.pag, sc->tp, sc->sa.agf_bp, + agfl_bp, agbno, 0); + if (error) + return error; + xfs_extent_busy_insert(sc->tp, sc->sa.pag, agbno, 1, + XFS_EXTENT_BUSY_SKIP_DISCARD); + + return 0; +} + +/* Are there any uncommitted reap operations? */ +static inline bool xreap_dirty(const struct xreap_state *rs) +{ + if (rs->force_roll) + return true; + if (rs->deferred) + return true; + if (rs->invalidated) + return true; + if (rs->total_deferred) + return true; + return false; +} + +#define XREAP_MAX_BINVAL (2048) + +/* + * Decide if we want to roll the transaction after reaping an extent. We don't + * want to overrun the transaction reservation, so we prohibit more than + * 128 EFIs per transaction. For the same reason, we limit the number + * of buffer invalidations to 2048. + */ +static inline bool xreap_want_roll(const struct xreap_state *rs) +{ + if (rs->force_roll) + return true; + if (rs->deferred > XREP_MAX_ITRUNCATE_EFIS) + return true; + if (rs->invalidated > XREAP_MAX_BINVAL) + return true; + return false; +} + +static inline void xreap_reset(struct xreap_state *rs) +{ + rs->total_deferred += rs->deferred; + rs->deferred = 0; + rs->invalidated = 0; + rs->force_roll = false; +} + +#define XREAP_MAX_DEFER_CHAIN (2048) + +/* + * Decide if we want to finish the deferred ops that are attached to the scrub + * transaction. We don't want to queue huge chains of deferred ops because + * that can consume a lot of log space and kernel memory. Hence we trigger a + * xfs_defer_finish if there are more than 2048 deferred reap operations or the + * caller did some real work. + */ +static inline bool +xreap_want_defer_finish(const struct xreap_state *rs) +{ + if (rs->force_roll) + return true; + if (rs->total_deferred > XREAP_MAX_DEFER_CHAIN) + return true; + return false; +} + +static inline void xreap_defer_finish_reset(struct xreap_state *rs) +{ + rs->total_deferred = 0; + rs->deferred = 0; + rs->invalidated = 0; + rs->force_roll = false; +} + +/* Try to invalidate the incore buffers for an extent that we're freeing. */ +STATIC void +xreap_agextent_binval( + struct xreap_state *rs, + xfs_agblock_t agbno, + xfs_extlen_t *aglenp) +{ + struct xfs_scrub *sc = rs->sc; + struct xfs_perag *pag = sc->sa.pag; + struct xfs_mount *mp = sc->mp; + xfs_agnumber_t agno = sc->sa.pag->pag_agno; + xfs_agblock_t agbno_next = agbno + *aglenp; + xfs_agblock_t bno = agbno; + + /* + * Avoid invalidating AG headers and post-EOFS blocks because we never + * own those. + */ + if (!xfs_verify_agbno(pag, agbno) || + !xfs_verify_agbno(pag, agbno_next - 1)) + return; + + /* + * If there are incore buffers for these blocks, invalidate them. We + * assume that the lack of any other known owners means that the buffer + * can be locked without risk of deadlocking. The buffer cache cannot + * detect aliasing, so employ nested loops to scan for incore buffers + * of any plausible size. + */ + while (bno < agbno_next) { + xfs_agblock_t fsbcount; + xfs_agblock_t max_fsbs; + + /* + * Max buffer size is the max remote xattr buffer size, which + * is one fs block larger than 64k. + */ + max_fsbs = min_t(xfs_agblock_t, agbno_next - bno, + xfs_attr3_rmt_blocks(mp, XFS_XATTR_SIZE_MAX)); + + for (fsbcount = 1; fsbcount < max_fsbs; fsbcount++) { + struct xfs_buf *bp = NULL; + xfs_daddr_t daddr; + int error; + + daddr = XFS_AGB_TO_DADDR(mp, agno, bno); + error = xfs_buf_incore(mp->m_ddev_targp, daddr, + XFS_FSB_TO_BB(mp, fsbcount), + XBF_LIVESCAN, &bp); + if (error) + continue; + + xfs_trans_bjoin(sc->tp, bp); + xfs_trans_binval(sc->tp, bp); + rs->invalidated++; + + /* + * Stop invalidating if we've hit the limit; we should + * still have enough reservation left to free however + * far we've gotten. + */ + if (rs->invalidated > XREAP_MAX_BINVAL) { + *aglenp -= agbno_next - bno; + goto out; + } + } + + bno++; + } + +out: + trace_xreap_agextent_binval(sc->sa.pag, agbno, *aglenp); +} + +/* + * Figure out the longest run of blocks that we can dispose of with a single + * call. Cross-linked blocks should have their reverse mappings removed, but + * single-owner extents can be freed. AGFL blocks can only be put back one at + * a time. + */ +STATIC int +xreap_agextent_select( + struct xreap_state *rs, + xfs_agblock_t agbno, + xfs_agblock_t agbno_next, + bool *crosslinked, + xfs_extlen_t *aglenp) +{ + struct xfs_scrub *sc = rs->sc; + struct xfs_btree_cur *cur; + xfs_agblock_t bno = agbno + 1; + xfs_extlen_t len = 1; + int error; + + /* + * Determine if there are any other rmap records covering the first + * block of this extent. If so, the block is crosslinked. + */ + cur = xfs_rmapbt_init_cursor(sc->mp, sc->tp, sc->sa.agf_bp, + sc->sa.pag); + error = xfs_rmap_has_other_keys(cur, agbno, 1, rs->oinfo, + crosslinked); + if (error) + goto out_cur; + + /* AGFL blocks can only be deal with one at a time. */ + if (rs->resv == XFS_AG_RESV_AGFL) + goto out_found; + + /* + * Figure out how many of the subsequent blocks have the same crosslink + * status. + */ + while (bno < agbno_next) { + bool also_crosslinked; + + error = xfs_rmap_has_other_keys(cur, bno, 1, rs->oinfo, + &also_crosslinked); + if (error) + goto out_cur; + + if (*crosslinked != also_crosslinked) + break; + + len++; + bno++; + } + +out_found: + *aglenp = len; + trace_xreap_agextent_select(sc->sa.pag, agbno, len, *crosslinked); +out_cur: + xfs_btree_del_cursor(cur, error); + return error; +} + +/* + * Dispose of as much of the beginning of this AG extent as possible. The + * number of blocks disposed of will be returned in @aglenp. + */ +STATIC int +xreap_agextent_iter( + struct xreap_state *rs, + xfs_agblock_t agbno, + xfs_extlen_t *aglenp, + bool crosslinked) +{ + struct xfs_scrub *sc = rs->sc; + xfs_fsblock_t fsbno; + int error = 0; + + fsbno = XFS_AGB_TO_FSB(sc->mp, sc->sa.pag->pag_agno, agbno); + + /* + * If there are other rmappings, this block is cross linked and must + * not be freed. Remove the reverse mapping and move on. Otherwise, + * we were the only owner of the block, so free the extent, which will + * also remove the rmap. + * + * XXX: XFS doesn't support detecting the case where a single block + * metadata structure is crosslinked with a multi-block structure + * because the buffer cache doesn't detect aliasing problems, so we + * can't fix 100% of crosslinking problems (yet). The verifiers will + * blow on writeout, the filesystem will shut down, and the admin gets + * to run xfs_repair. + */ + if (crosslinked) { + trace_xreap_dispose_unmap_extent(sc->sa.pag, agbno, *aglenp); + + rs->force_roll = true; + return xfs_rmap_free(sc->tp, sc->sa.agf_bp, sc->sa.pag, agbno, + *aglenp, rs->oinfo); + } + + trace_xreap_dispose_free_extent(sc->sa.pag, agbno, *aglenp); + + /* + * Invalidate as many buffers as we can, starting at agbno. If this + * function sets *aglenp to zero, the transaction is full of logged + * buffer invalidations, so we need to return early so that we can + * roll and retry. + */ + xreap_agextent_binval(rs, agbno, aglenp); + if (*aglenp == 0) { + ASSERT(xreap_want_roll(rs)); + return 0; + } + + /* Put blocks back on the AGFL one at a time. */ + if (rs->resv == XFS_AG_RESV_AGFL) { + ASSERT(*aglenp == 1); + error = xreap_put_freelist(sc, agbno); + if (error) + return error; + + rs->force_roll = true; + return 0; + } + + /* + * Use deferred frees to get rid of the old btree blocks to try to + * minimize the window in which we could crash and lose the old blocks. + */ + error = __xfs_free_extent_later(sc->tp, fsbno, *aglenp, rs->oinfo, + rs->resv, true); + if (error) + return error; + + rs->deferred++; + return 0; +} + +/* + * Break an AG metadata extent into sub-extents by fate (crosslinked, not + * crosslinked), and dispose of each sub-extent separately. + */ +STATIC int +xreap_agmeta_extent( + uint64_t fsbno, + uint64_t len, + void *priv) +{ + struct xreap_state *rs = priv; + struct xfs_scrub *sc = rs->sc; + xfs_agblock_t agbno = fsbno; + xfs_agblock_t agbno_next = agbno + len; + int error = 0; + + ASSERT(len <= XFS_MAX_BMBT_EXTLEN); + ASSERT(sc->ip == NULL); + + while (agbno < agbno_next) { + xfs_extlen_t aglen; + bool crosslinked; + + error = xreap_agextent_select(rs, agbno, agbno_next, + &crosslinked, &aglen); + if (error) + return error; + + error = xreap_agextent_iter(rs, agbno, &aglen, crosslinked); + if (error) + return error; + + if (xreap_want_defer_finish(rs)) { + error = xrep_defer_finish(sc); + if (error) + return error; + xreap_defer_finish_reset(rs); + } else if (xreap_want_roll(rs)) { + error = xrep_roll_ag_trans(sc); + if (error) + return error; + xreap_reset(rs); + } + + agbno += aglen; + } + + return 0; +} + +/* Dispose of every block of every AG metadata extent in the bitmap. */ +int +xrep_reap_agblocks( + struct xfs_scrub *sc, + struct xagb_bitmap *bitmap, + const struct xfs_owner_info *oinfo, + enum xfs_ag_resv_type type) +{ + struct xreap_state rs = { + .sc = sc, + .oinfo = oinfo, + .resv = type, + }; + int error; + + ASSERT(xfs_has_rmapbt(sc->mp)); + ASSERT(sc->ip == NULL); + + error = xagb_bitmap_walk(bitmap, xreap_agmeta_extent, &rs); + if (error) + return error; + + if (xreap_dirty(&rs)) + return xrep_defer_finish(sc); + + return 0; +} diff --git a/fs/xfs/scrub/reap.h b/fs/xfs/scrub/reap.h new file mode 100644 index 000000000000..fe24626af164 --- /dev/null +++ b/fs/xfs/scrub/reap.h @@ -0,0 +1,12 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2022-2023 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> + */ +#ifndef __XFS_SCRUB_REAP_H__ +#define __XFS_SCRUB_REAP_H__ + +int xrep_reap_agblocks(struct xfs_scrub *sc, struct xagb_bitmap *bitmap, + const struct xfs_owner_info *oinfo, enum xfs_ag_resv_type type); + +#endif /* __XFS_SCRUB_REAP_H__ */ diff --git a/fs/xfs/scrub/repair.c b/fs/xfs/scrub/repair.c index ac6d8803e660..1b8b5439f2d7 100644 --- a/fs/xfs/scrub/repair.c +++ b/fs/xfs/scrub/repair.c @@ -26,11 +26,13 @@ #include "xfs_ag_resv.h" #include "xfs_quota.h" #include "xfs_qm.h" +#include "xfs_defer.h" #include "scrub/scrub.h" #include "scrub/common.h" #include "scrub/trace.h" #include "scrub/repair.h" #include "scrub/bitmap.h" +#include "scrub/stats.h" /* * Attempt to repair some metadata, if the metadata is corrupt and userspace @@ -39,8 +41,10 @@ */ int xrep_attempt( - struct xfs_scrub *sc) + struct xfs_scrub *sc, + struct xchk_stats_run *run) { + u64 repair_start; int error = 0; trace_xrep_attempt(XFS_I(file_inode(sc->file)), sc->sm, error); @@ -49,8 +53,11 @@ xrep_attempt( /* Repair whatever's broken. */ ASSERT(sc->ops->repair); + run->repair_attempted = true; + repair_start = xchk_stats_now(); error = sc->ops->repair(sc); trace_xrep_done(XFS_I(file_inode(sc->file)), sc->sm, error); + run->repair_ns += xchk_stats_elapsed_ns(repair_start); switch (error) { case 0: /* @@ -59,14 +66,17 @@ xrep_attempt( */ sc->sm->sm_flags &= ~XFS_SCRUB_FLAGS_OUT; sc->flags |= XREP_ALREADY_FIXED; + run->repair_succeeded = true; return -EAGAIN; case -ECHRNG: sc->flags |= XCHK_NEED_DRAIN; + run->retries++; return -EAGAIN; case -EDEADLOCK: /* Tell the caller to try again having grabbed all the locks. */ if (!(sc->flags & XCHK_TRY_HARDER)) { sc->flags |= XCHK_TRY_HARDER; + run->retries++; return -EAGAIN; } /* @@ -166,6 +176,56 @@ xrep_roll_ag_trans( return 0; } +/* Finish all deferred work attached to the repair transaction. */ +int +xrep_defer_finish( + struct xfs_scrub *sc) +{ + int error; + + /* + * Keep the AG header buffers locked while we complete deferred work + * items. Ensure that both AG buffers are dirty and held when we roll + * the transaction so that they move forward in the log without losing + * the bli (and hence the bli type) when the transaction commits. + * + * Normal code would never hold clean buffers across a roll, but repair + * needs both buffers to maintain a total lock on the AG. + */ + if (sc->sa.agi_bp) { + xfs_ialloc_log_agi(sc->tp, sc->sa.agi_bp, XFS_AGI_MAGICNUM); + xfs_trans_bhold(sc->tp, sc->sa.agi_bp); + } + + if (sc->sa.agf_bp) { + xfs_alloc_log_agf(sc->tp, sc->sa.agf_bp, XFS_AGF_MAGICNUM); + xfs_trans_bhold(sc->tp, sc->sa.agf_bp); + } + + /* + * Finish all deferred work items. We still hold the AG header buffers + * locked regardless of whether or not that succeeds. On failure, the + * buffers will be released during teardown on our way out of the + * kernel. If successful, join the buffers to the new transaction + * and move on. + */ + error = xfs_defer_finish(&sc->tp); + if (error) + return error; + + /* + * Release the hold that we set above because defer_finish won't do + * that for us. The defer roll code redirties held buffers after each + * roll, so the AG header buffers should be ready for logging. + */ + if (sc->sa.agi_bp) + xfs_trans_bhold_release(sc->tp, sc->sa.agi_bp); + if (sc->sa.agf_bp) + xfs_trans_bhold_release(sc->tp, sc->sa.agf_bp); + + return 0; +} + /* * Does the given AG have enough space to rebuild a btree? Neither AG * reservation can be critical, and we must have enough space (factoring @@ -297,89 +357,6 @@ xrep_calc_ag_resblks( return max(max(bnobt_sz, inobt_sz), max(rmapbt_sz, refcbt_sz)); } -/* Allocate a block in an AG. */ -int -xrep_alloc_ag_block( - struct xfs_scrub *sc, - const struct xfs_owner_info *oinfo, - xfs_fsblock_t *fsbno, - enum xfs_ag_resv_type resv) -{ - struct xfs_alloc_arg args = {0}; - xfs_agblock_t bno; - int error; - - switch (resv) { - case XFS_AG_RESV_AGFL: - case XFS_AG_RESV_RMAPBT: - error = xfs_alloc_get_freelist(sc->sa.pag, sc->tp, - sc->sa.agf_bp, &bno, 1); - if (error) - return error; - if (bno == NULLAGBLOCK) - return -ENOSPC; - xfs_extent_busy_reuse(sc->mp, sc->sa.pag, bno, 1, false); - *fsbno = XFS_AGB_TO_FSB(sc->mp, sc->sa.pag->pag_agno, bno); - if (resv == XFS_AG_RESV_RMAPBT) - xfs_ag_resv_rmapbt_alloc(sc->mp, sc->sa.pag->pag_agno); - return 0; - default: - break; - } - - args.tp = sc->tp; - args.mp = sc->mp; - args.pag = sc->sa.pag; - args.oinfo = *oinfo; - args.minlen = 1; - args.maxlen = 1; - args.prod = 1; - args.resv = resv; - - error = xfs_alloc_vextent_this_ag(&args, sc->sa.pag->pag_agno); - if (error) - return error; - if (args.fsbno == NULLFSBLOCK) - return -ENOSPC; - ASSERT(args.len == 1); - *fsbno = args.fsbno; - - return 0; -} - -/* Initialize a new AG btree root block with zero entries. */ -int -xrep_init_btblock( - struct xfs_scrub *sc, - xfs_fsblock_t fsb, - struct xfs_buf **bpp, - xfs_btnum_t btnum, - const struct xfs_buf_ops *ops) -{ - struct xfs_trans *tp = sc->tp; - struct xfs_mount *mp = sc->mp; - struct xfs_buf *bp; - int error; - - trace_xrep_init_btblock(mp, XFS_FSB_TO_AGNO(mp, fsb), - XFS_FSB_TO_AGBNO(mp, fsb), btnum); - - ASSERT(XFS_FSB_TO_AGNO(mp, fsb) == sc->sa.pag->pag_agno); - error = xfs_trans_get_buf(tp, mp->m_ddev_targp, - XFS_FSB_TO_DADDR(mp, fsb), XFS_FSB_TO_BB(mp, 1), 0, - &bp); - if (error) - return error; - xfs_buf_zero(bp, 0, BBTOB(bp->b_length)); - xfs_btree_init_block(mp, bp, btnum, 0, 0, sc->sa.pag->pag_agno); - xfs_trans_buf_set_type(tp, bp, XFS_BLFT_BTREE_BUF); - xfs_trans_log_buf(tp, bp, 0, BBTOB(bp->b_length) - 1); - bp->b_ops = ops; - *bpp = bp; - - return 0; -} - /* * Reconstructing per-AG Btrees * @@ -404,91 +381,8 @@ xrep_init_btblock( * sublist. As with the other btrees we subtract sublist from bitmap, and the * result (since the rmapbt lives in the free space) are the blocks from the * old rmapbt. - * - * Disposal of Blocks from Old per-AG Btrees - * - * Now that we've constructed a new btree to replace the damaged one, we want - * to dispose of the blocks that (we think) the old btree was using. - * Previously, we used the rmapbt to collect the extents (bitmap) with the - * rmap owner corresponding to the tree we rebuilt, collected extents for any - * blocks with the same rmap owner that are owned by another data structure - * (sublist), and subtracted sublist from bitmap. In theory the extents - * remaining in bitmap are the old btree's blocks. - * - * Unfortunately, it's possible that the btree was crosslinked with other - * blocks on disk. The rmap data can tell us if there are multiple owners, so - * if the rmapbt says there is an owner of this block other than @oinfo, then - * the block is crosslinked. Remove the reverse mapping and continue. - * - * If there is one rmap record, we can free the block, which removes the - * reverse mapping but doesn't add the block to the free space. Our repair - * strategy is to hope the other metadata objects crosslinked on this block - * will be rebuilt (atop different blocks), thereby removing all the cross - * links. - * - * If there are no rmap records at all, we also free the block. If the btree - * being rebuilt lives in the free space (bnobt/cntbt/rmapbt) then there isn't - * supposed to be a rmap record and everything is ok. For other btrees there - * had to have been an rmap entry for the block to have ended up on @bitmap, - * so if it's gone now there's something wrong and the fs will shut down. - * - * Note: If there are multiple rmap records with only the same rmap owner as - * the btree we're trying to rebuild and the block is indeed owned by another - * data structure with the same rmap owner, then the block will be in sublist - * and therefore doesn't need disposal. If there are multiple rmap records - * with only the same rmap owner but the block is not owned by something with - * the same rmap owner, the block will be freed. - * - * The caller is responsible for locking the AG headers for the entire rebuild - * operation so that nothing else can sneak in and change the AG state while - * we're not looking. We also assume that the caller already invalidated any - * buffers associated with @bitmap. */ -static int -xrep_invalidate_block( - uint64_t fsbno, - void *priv) -{ - struct xfs_scrub *sc = priv; - struct xfs_buf *bp; - int error; - - /* Skip AG headers and post-EOFS blocks */ - if (!xfs_verify_fsbno(sc->mp, fsbno)) - return 0; - - error = xfs_buf_incore(sc->mp->m_ddev_targp, - XFS_FSB_TO_DADDR(sc->mp, fsbno), - XFS_FSB_TO_BB(sc->mp, 1), XBF_TRYLOCK, &bp); - if (error) - return 0; - - xfs_trans_bjoin(sc->tp, bp); - xfs_trans_binval(sc->tp, bp); - return 0; -} - -/* - * Invalidate buffers for per-AG btree blocks we're dumping. This function - * is not intended for use with file data repairs; we have bunmapi for that. - */ -int -xrep_invalidate_blocks( - struct xfs_scrub *sc, - struct xbitmap *bitmap) -{ - /* - * For each block in each extent, see if there's an incore buffer for - * exactly that block; if so, invalidate it. The buffer cache only - * lets us look for one buffer at a time, so we have to look one block - * at a time. Avoid invalidating AG headers and post-EOFS blocks - * because we never own those; and if we can't TRYLOCK the buffer we - * assume it's owned by someone else. - */ - return xbitmap_walk_bits(bitmap, xrep_invalidate_block, sc); -} - /* Ensure the freelist is the correct size. */ int xrep_fix_freelist( @@ -507,155 +401,6 @@ xrep_fix_freelist( can_shrink ? 0 : XFS_ALLOC_FLAG_NOSHRINK); } -/* Information about reaping extents after a repair. */ -struct xrep_reap_state { - struct xfs_scrub *sc; - - /* Reverse mapping owner and metadata reservation type. */ - const struct xfs_owner_info *oinfo; - enum xfs_ag_resv_type resv; -}; - -/* - * Put a block back on the AGFL. - */ -STATIC int -xrep_put_freelist( - struct xfs_scrub *sc, - xfs_agblock_t agbno) -{ - struct xfs_buf *agfl_bp; - int error; - - /* Make sure there's space on the freelist. */ - error = xrep_fix_freelist(sc, true); - if (error) - return error; - - /* - * Since we're "freeing" a lost block onto the AGFL, we have to - * create an rmap for the block prior to merging it or else other - * parts will break. - */ - error = xfs_rmap_alloc(sc->tp, sc->sa.agf_bp, sc->sa.pag, agbno, 1, - &XFS_RMAP_OINFO_AG); - if (error) - return error; - - /* Put the block on the AGFL. */ - error = xfs_alloc_read_agfl(sc->sa.pag, sc->tp, &agfl_bp); - if (error) - return error; - - error = xfs_alloc_put_freelist(sc->sa.pag, sc->tp, sc->sa.agf_bp, - agfl_bp, agbno, 0); - if (error) - return error; - xfs_extent_busy_insert(sc->tp, sc->sa.pag, agbno, 1, - XFS_EXTENT_BUSY_SKIP_DISCARD); - - return 0; -} - -/* Dispose of a single block. */ -STATIC int -xrep_reap_block( - uint64_t fsbno, - void *priv) -{ - struct xrep_reap_state *rs = priv; - struct xfs_scrub *sc = rs->sc; - struct xfs_btree_cur *cur; - struct xfs_buf *agf_bp = NULL; - xfs_agblock_t agbno; - bool has_other_rmap; - int error; - - ASSERT(sc->ip != NULL || - XFS_FSB_TO_AGNO(sc->mp, fsbno) == sc->sa.pag->pag_agno); - trace_xrep_dispose_btree_extent(sc->mp, - XFS_FSB_TO_AGNO(sc->mp, fsbno), - XFS_FSB_TO_AGBNO(sc->mp, fsbno), 1); - - agbno = XFS_FSB_TO_AGBNO(sc->mp, fsbno); - ASSERT(XFS_FSB_TO_AGNO(sc->mp, fsbno) == sc->sa.pag->pag_agno); - - /* - * If we are repairing per-inode metadata, we need to read in the AGF - * buffer. Otherwise, we're repairing a per-AG structure, so reuse - * the AGF buffer that the setup functions already grabbed. - */ - if (sc->ip) { - error = xfs_alloc_read_agf(sc->sa.pag, sc->tp, 0, &agf_bp); - if (error) - return error; - } else { - agf_bp = sc->sa.agf_bp; - } - cur = xfs_rmapbt_init_cursor(sc->mp, sc->tp, agf_bp, sc->sa.pag); - - /* Can we find any other rmappings? */ - error = xfs_rmap_has_other_keys(cur, agbno, 1, rs->oinfo, - &has_other_rmap); - xfs_btree_del_cursor(cur, error); - if (error) - goto out_free; - - /* - * If there are other rmappings, this block is cross linked and must - * not be freed. Remove the reverse mapping and move on. Otherwise, - * we were the only owner of the block, so free the extent, which will - * also remove the rmap. - * - * XXX: XFS doesn't support detecting the case where a single block - * metadata structure is crosslinked with a multi-block structure - * because the buffer cache doesn't detect aliasing problems, so we - * can't fix 100% of crosslinking problems (yet). The verifiers will - * blow on writeout, the filesystem will shut down, and the admin gets - * to run xfs_repair. - */ - if (has_other_rmap) - error = xfs_rmap_free(sc->tp, agf_bp, sc->sa.pag, agbno, - 1, rs->oinfo); - else if (rs->resv == XFS_AG_RESV_AGFL) - error = xrep_put_freelist(sc, agbno); - else - error = xfs_free_extent(sc->tp, sc->sa.pag, agbno, 1, rs->oinfo, - rs->resv); - if (agf_bp != sc->sa.agf_bp) - xfs_trans_brelse(sc->tp, agf_bp); - if (error) - return error; - - if (sc->ip) - return xfs_trans_roll_inode(&sc->tp, sc->ip); - return xrep_roll_ag_trans(sc); - -out_free: - if (agf_bp != sc->sa.agf_bp) - xfs_trans_brelse(sc->tp, agf_bp); - return error; -} - -/* Dispose of every block of every extent in the bitmap. */ -int -xrep_reap_extents( - struct xfs_scrub *sc, - struct xbitmap *bitmap, - const struct xfs_owner_info *oinfo, - enum xfs_ag_resv_type type) -{ - struct xrep_reap_state rs = { - .sc = sc, - .oinfo = oinfo, - .resv = type, - }; - - ASSERT(xfs_has_rmapbt(sc->mp)); - - return xbitmap_walk_bits(bitmap, xrep_reap_block, &rs); -} - /* * Finding per-AG Btree Roots for AGF/AGI Reconstruction * diff --git a/fs/xfs/scrub/repair.h b/fs/xfs/scrub/repair.h index dce791c679ee..60d2a9ae5f2e 100644 --- a/fs/xfs/scrub/repair.h +++ b/fs/xfs/scrub/repair.h @@ -8,6 +8,8 @@ #include "xfs_quota_defs.h" +struct xchk_stats_run; + static inline int xrep_notsupported(struct xfs_scrub *sc) { return -EOPNOTSUPP; @@ -15,28 +17,28 @@ static inline int xrep_notsupported(struct xfs_scrub *sc) #ifdef CONFIG_XFS_ONLINE_REPAIR +/* + * This is the maximum number of deferred extent freeing item extents (EFIs) + * that we'll attach to a transaction without rolling the transaction to avoid + * overrunning a tr_itruncate reservation. + */ +#define XREP_MAX_ITRUNCATE_EFIS (128) + + /* Repair helpers */ -int xrep_attempt(struct xfs_scrub *sc); +int xrep_attempt(struct xfs_scrub *sc, struct xchk_stats_run *run); void xrep_failure(struct xfs_mount *mp); int xrep_roll_ag_trans(struct xfs_scrub *sc); +int xrep_defer_finish(struct xfs_scrub *sc); bool xrep_ag_has_space(struct xfs_perag *pag, xfs_extlen_t nr_blocks, enum xfs_ag_resv_type type); xfs_extlen_t xrep_calc_ag_resblks(struct xfs_scrub *sc); -int xrep_alloc_ag_block(struct xfs_scrub *sc, - const struct xfs_owner_info *oinfo, xfs_fsblock_t *fsbno, - enum xfs_ag_resv_type resv); -int xrep_init_btblock(struct xfs_scrub *sc, xfs_fsblock_t fsb, - struct xfs_buf **bpp, xfs_btnum_t btnum, - const struct xfs_buf_ops *ops); struct xbitmap; struct xagb_bitmap; int xrep_fix_freelist(struct xfs_scrub *sc, bool can_shrink); -int xrep_invalidate_blocks(struct xfs_scrub *sc, struct xbitmap *btlist); -int xrep_reap_extents(struct xfs_scrub *sc, struct xbitmap *exlist, - const struct xfs_owner_info *oinfo, enum xfs_ag_resv_type type); struct xrep_find_ag_btree { /* in: rmap owner of the btree we're looking for */ @@ -70,7 +72,8 @@ int xrep_agi(struct xfs_scrub *sc); static inline int xrep_attempt( - struct xfs_scrub *sc) + struct xfs_scrub *sc, + struct xchk_stats_run *run) { return -EOPNOTSUPP; } diff --git a/fs/xfs/scrub/rtbitmap.c b/fs/xfs/scrub/rtbitmap.c index e7dace7b4be8..41a1d89ae8e6 100644 --- a/fs/xfs/scrub/rtbitmap.c +++ b/fs/xfs/scrub/rtbitmap.c @@ -11,7 +11,7 @@ #include "xfs_mount.h" #include "xfs_log_format.h" #include "xfs_trans.h" -#include "xfs_rtalloc.h" +#include "xfs_rtbitmap.h" #include "xfs_inode.h" #include "xfs_bmap.h" #include "scrub/scrub.h" @@ -19,19 +19,20 @@ /* Set us up with the realtime metadata locked. */ int -xchk_setup_rt( +xchk_setup_rtbitmap( struct xfs_scrub *sc) { int error; - error = xchk_setup_fs(sc); + error = xchk_trans_alloc(sc, 0); if (error) return error; - sc->ilock_flags = XFS_ILOCK_EXCL | XFS_ILOCK_RTBITMAP; - sc->ip = sc->mp->m_rbmip; - xfs_ilock(sc->ip, sc->ilock_flags); + error = xchk_install_live_inode(sc, sc->mp->m_rbmip); + if (error) + return error; + xchk_ilock(sc, XFS_ILOCK_EXCL | XFS_ILOCK_RTBITMAP); return 0; } @@ -47,12 +48,12 @@ xchk_rtbitmap_rec( { struct xfs_scrub *sc = priv; xfs_rtblock_t startblock; - xfs_rtblock_t blockcount; + xfs_filblks_t blockcount; - startblock = rec->ar_startext * mp->m_sb.sb_rextsize; - blockcount = rec->ar_extcount * mp->m_sb.sb_rextsize; + startblock = xfs_rtx_to_rtb(mp, rec->ar_startext); + blockcount = xfs_rtx_to_rtb(mp, rec->ar_extcount); - if (!xfs_verify_rtext(mp, startblock, blockcount)) + if (!xfs_verify_rtbext(mp, startblock, blockcount)) xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, 0); return 0; } @@ -123,67 +124,26 @@ out: return error; } -/* Scrub the realtime summary. */ -int -xchk_rtsummary( - struct xfs_scrub *sc) -{ - struct xfs_inode *rsumip = sc->mp->m_rsumip; - struct xfs_inode *old_ip = sc->ip; - uint old_ilock_flags = sc->ilock_flags; - int error = 0; - - /* - * We ILOCK'd the rt bitmap ip in the setup routine, now lock the - * rt summary ip in compliance with the rt inode locking rules. - * - * Since we switch sc->ip to rsumip we have to save the old ilock - * flags so that we don't mix up the inode state that @sc tracks. - */ - sc->ip = rsumip; - sc->ilock_flags = XFS_ILOCK_EXCL | XFS_ILOCK_RTSUM; - xfs_ilock(sc->ip, sc->ilock_flags); - - /* Invoke the fork scrubber. */ - error = xchk_metadata_inode_forks(sc); - if (error || (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)) - goto out; - - /* XXX: implement this some day */ - xchk_set_incomplete(sc); -out: - /* Switch back to the rtbitmap inode and lock flags. */ - xfs_iunlock(sc->ip, sc->ilock_flags); - sc->ilock_flags = old_ilock_flags; - sc->ip = old_ip; - return error; -} - - /* xref check that the extent is not free in the rtbitmap */ void xchk_xref_is_used_rt_space( struct xfs_scrub *sc, - xfs_rtblock_t fsbno, + xfs_rtblock_t rtbno, xfs_extlen_t len) { - xfs_rtblock_t startext; - xfs_rtblock_t endext; - xfs_rtblock_t extcount; + xfs_rtxnum_t startext; + xfs_rtxnum_t endext; bool is_free; int error; if (xchk_skip_xref(sc->sm)) return; - startext = fsbno; - endext = fsbno + len - 1; - do_div(startext, sc->mp->m_sb.sb_rextsize); - do_div(endext, sc->mp->m_sb.sb_rextsize); - extcount = endext - startext + 1; + startext = xfs_rtb_to_rtx(sc->mp, rtbno); + endext = xfs_rtb_to_rtx(sc->mp, rtbno + len - 1); xfs_ilock(sc->mp->m_rbmip, XFS_ILOCK_SHARED | XFS_ILOCK_RTBITMAP); - error = xfs_rtalloc_extent_is_free(sc->mp, sc->tp, startext, extcount, - &is_free); + error = xfs_rtalloc_extent_is_free(sc->mp, sc->tp, startext, + endext - startext + 1, &is_free); if (!xchk_should_check_xref(sc, &error, NULL)) goto out_unlock; if (is_free) diff --git a/fs/xfs/scrub/rtsummary.c b/fs/xfs/scrub/rtsummary.c new file mode 100644 index 000000000000..8b15c47408d0 --- /dev/null +++ b/fs/xfs/scrub/rtsummary.c @@ -0,0 +1,280 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2017-2023 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_btree.h" +#include "xfs_inode.h" +#include "xfs_log_format.h" +#include "xfs_trans.h" +#include "xfs_rtbitmap.h" +#include "xfs_bit.h" +#include "xfs_bmap.h" +#include "scrub/scrub.h" +#include "scrub/common.h" +#include "scrub/trace.h" +#include "scrub/xfile.h" + +/* + * Realtime Summary + * ================ + * + * We check the realtime summary by scanning the realtime bitmap file to create + * a new summary file incore, and then we compare the computed version against + * the ondisk version. We use the 'xfile' functionality to store this + * (potentially large) amount of data in pageable memory. + */ + +/* Set us up to check the rtsummary file. */ +int +xchk_setup_rtsummary( + struct xfs_scrub *sc) +{ + struct xfs_mount *mp = sc->mp; + char *descr; + int error; + + /* + * Create an xfile to construct a new rtsummary file. The xfile allows + * us to avoid pinning kernel memory for this purpose. + */ + descr = xchk_xfile_descr(sc, "realtime summary file"); + error = xfile_create(descr, mp->m_rsumsize, &sc->xfile); + kfree(descr); + if (error) + return error; + + error = xchk_trans_alloc(sc, 0); + if (error) + return error; + + /* Allocate a memory buffer for the summary comparison. */ + sc->buf = kvmalloc(mp->m_sb.sb_blocksize, XCHK_GFP_FLAGS); + if (!sc->buf) + return -ENOMEM; + + error = xchk_install_live_inode(sc, mp->m_rsumip); + if (error) + return error; + + /* + * Locking order requires us to take the rtbitmap first. We must be + * careful to unlock it ourselves when we are done with the rtbitmap + * file since the scrub infrastructure won't do that for us. Only + * then we can lock the rtsummary inode. + */ + xfs_ilock(mp->m_rbmip, XFS_ILOCK_SHARED | XFS_ILOCK_RTBITMAP); + xchk_ilock(sc, XFS_ILOCK_EXCL | XFS_ILOCK_RTSUM); + return 0; +} + +/* Helper functions to record suminfo words in an xfile. */ + +typedef unsigned int xchk_rtsumoff_t; + +static inline int +xfsum_load( + struct xfs_scrub *sc, + xfs_rtsumoff_t sumoff, + union xfs_suminfo_raw *rawinfo) +{ + return xfile_obj_load(sc->xfile, rawinfo, + sizeof(union xfs_suminfo_raw), + sumoff << XFS_WORDLOG); +} + +static inline int +xfsum_store( + struct xfs_scrub *sc, + xfs_rtsumoff_t sumoff, + const union xfs_suminfo_raw rawinfo) +{ + return xfile_obj_store(sc->xfile, &rawinfo, + sizeof(union xfs_suminfo_raw), + sumoff << XFS_WORDLOG); +} + +static inline int +xfsum_copyout( + struct xfs_scrub *sc, + xfs_rtsumoff_t sumoff, + union xfs_suminfo_raw *rawinfo, + unsigned int nr_words) +{ + return xfile_obj_load(sc->xfile, rawinfo, nr_words << XFS_WORDLOG, + sumoff << XFS_WORDLOG); +} + +static inline xfs_suminfo_t +xchk_rtsum_inc( + struct xfs_mount *mp, + union xfs_suminfo_raw *v) +{ + v->old += 1; + return v->old; +} + +/* Update the summary file to reflect the free extent that we've accumulated. */ +STATIC int +xchk_rtsum_record_free( + struct xfs_mount *mp, + struct xfs_trans *tp, + const struct xfs_rtalloc_rec *rec, + void *priv) +{ + struct xfs_scrub *sc = priv; + xfs_fileoff_t rbmoff; + xfs_rtblock_t rtbno; + xfs_filblks_t rtlen; + xfs_rtsumoff_t offs; + unsigned int lenlog; + union xfs_suminfo_raw v; + xfs_suminfo_t value; + int error = 0; + + if (xchk_should_terminate(sc, &error)) + return error; + + /* Compute the relevant location in the rtsum file. */ + rbmoff = xfs_rtx_to_rbmblock(mp, rec->ar_startext); + lenlog = XFS_RTBLOCKLOG(rec->ar_extcount); + offs = xfs_rtsumoffs(mp, lenlog, rbmoff); + + rtbno = xfs_rtx_to_rtb(mp, rec->ar_startext); + rtlen = xfs_rtx_to_rtb(mp, rec->ar_extcount); + + if (!xfs_verify_rtbext(mp, rtbno, rtlen)) { + xchk_ino_xref_set_corrupt(sc, mp->m_rbmip->i_ino); + return -EFSCORRUPTED; + } + + /* Bump the summary count. */ + error = xfsum_load(sc, offs, &v); + if (error) + return error; + + value = xchk_rtsum_inc(sc->mp, &v); + trace_xchk_rtsum_record_free(mp, rec->ar_startext, rec->ar_extcount, + lenlog, offs, value); + + return xfsum_store(sc, offs, v); +} + +/* Compute the realtime summary from the realtime bitmap. */ +STATIC int +xchk_rtsum_compute( + struct xfs_scrub *sc) +{ + struct xfs_mount *mp = sc->mp; + unsigned long long rtbmp_blocks; + + /* If the bitmap size doesn't match the computed size, bail. */ + rtbmp_blocks = xfs_rtbitmap_blockcount(mp, mp->m_sb.sb_rextents); + if (XFS_FSB_TO_B(mp, rtbmp_blocks) != mp->m_rbmip->i_disk_size) + return -EFSCORRUPTED; + + return xfs_rtalloc_query_all(sc->mp, sc->tp, xchk_rtsum_record_free, + sc); +} + +/* Compare the rtsummary file against the one we computed. */ +STATIC int +xchk_rtsum_compare( + struct xfs_scrub *sc) +{ + struct xfs_rtalloc_args args = { + .mp = sc->mp, + .tp = sc->tp, + }; + struct xfs_mount *mp = sc->mp; + struct xfs_bmbt_irec map; + xfs_fileoff_t off; + xchk_rtsumoff_t sumoff = 0; + int nmap; + + for (off = 0; off < XFS_B_TO_FSB(mp, mp->m_rsumsize); off++) { + union xfs_suminfo_raw *ondisk_info; + int error = 0; + + if (xchk_should_terminate(sc, &error)) + return error; + if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) + return 0; + + /* Make sure we have a written extent. */ + nmap = 1; + error = xfs_bmapi_read(mp->m_rsumip, off, 1, &map, &nmap, + XFS_DATA_FORK); + if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, off, &error)) + return error; + + if (nmap != 1 || !xfs_bmap_is_written_extent(&map)) { + xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, off); + return 0; + } + + /* Read a block's worth of ondisk rtsummary file. */ + error = xfs_rtsummary_read_buf(&args, off); + if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, off, &error)) + return error; + + /* Read a block's worth of computed rtsummary file. */ + error = xfsum_copyout(sc, sumoff, sc->buf, mp->m_blockwsize); + if (error) { + xfs_rtbuf_cache_relse(&args); + return error; + } + + ondisk_info = xfs_rsumblock_infoptr(&args, 0); + if (memcmp(ondisk_info, sc->buf, + mp->m_blockwsize << XFS_WORDLOG) != 0) + xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, off); + + xfs_rtbuf_cache_relse(&args); + sumoff += mp->m_blockwsize; + } + + return 0; +} + +/* Scrub the realtime summary. */ +int +xchk_rtsummary( + struct xfs_scrub *sc) +{ + struct xfs_mount *mp = sc->mp; + int error = 0; + + /* Invoke the fork scrubber. */ + error = xchk_metadata_inode_forks(sc); + if (error || (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)) + goto out_rbm; + + /* Construct the new summary file from the rtbitmap. */ + error = xchk_rtsum_compute(sc); + if (error == -EFSCORRUPTED) { + /* + * EFSCORRUPTED means the rtbitmap is corrupt, which is an xref + * error since we're checking the summary file. + */ + xchk_ino_xref_set_corrupt(sc, mp->m_rbmip->i_ino); + error = 0; + goto out_rbm; + } + if (error) + goto out_rbm; + + /* Does the computed summary file match the actual rtsummary file? */ + error = xchk_rtsum_compare(sc); + +out_rbm: + /* Unlock the rtbitmap since we're done with it. */ + xfs_iunlock(mp->m_rbmip, XFS_ILOCK_SHARED | XFS_ILOCK_RTBITMAP); + return error; +} diff --git a/fs/xfs/scrub/scrub.c b/fs/xfs/scrub/scrub.c index 3d98f604765e..4849efcaa33a 100644 --- a/fs/xfs/scrub/scrub.c +++ b/fs/xfs/scrub/scrub.c @@ -22,6 +22,8 @@ #include "scrub/trace.h" #include "scrub/repair.h" #include "scrub/health.h" +#include "scrub/stats.h" +#include "scrub/xfile.h" /* * Online Scrub and Repair @@ -166,8 +168,6 @@ xchk_teardown( struct xfs_scrub *sc, int error) { - struct xfs_inode *ip_in = XFS_I(file_inode(sc->file)); - xchk_ag_free(sc, &sc->sa); if (sc->tp) { if (error == 0 && (sc->sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR)) @@ -178,14 +178,18 @@ xchk_teardown( } if (sc->ip) { if (sc->ilock_flags) - xfs_iunlock(sc->ip, sc->ilock_flags); - if (sc->ip != ip_in && - !xfs_internal_inum(sc->mp, sc->ip->i_ino)) - xchk_irele(sc, sc->ip); + xchk_iunlock(sc, sc->ilock_flags); + xchk_irele(sc, sc->ip); sc->ip = NULL; } - if (sc->sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR) + if (sc->flags & XCHK_HAVE_FREEZE_PROT) { + sc->flags &= ~XCHK_HAVE_FREEZE_PROT; mnt_drop_write_file(sc->file); + } + if (sc->xfile) { + xfile_destroy(sc->xfile); + sc->xfile = NULL; + } if (sc->buf) { if (sc->buf_cleanup) sc->buf_cleanup(sc->buf); @@ -320,14 +324,14 @@ static const struct xchk_meta_ops meta_scrub_ops[] = { }, [XFS_SCRUB_TYPE_RTBITMAP] = { /* realtime bitmap */ .type = ST_FS, - .setup = xchk_setup_rt, + .setup = xchk_setup_rtbitmap, .scrub = xchk_rtbitmap, .has = xfs_has_realtime, .repair = xrep_notsupported, }, [XFS_SCRUB_TYPE_RTSUM] = { /* realtime summary */ .type = ST_FS, - .setup = xchk_setup_rt, + .setup = xchk_setup_rtsummary, .scrub = xchk_rtsummary, .has = xfs_has_realtime, .repair = xrep_notsupported, @@ -407,6 +411,11 @@ xchk_validate_inputs( goto out; } + /* No rebuild without repair. */ + if ((sm->sm_flags & XFS_SCRUB_IFLAG_FORCE_REBUILD) && + !(sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR)) + return -EINVAL; + /* * We only want to repair read-write v5+ filesystems. Defer the check * for ops->repair until after our scrub confirms that we need to @@ -461,8 +470,10 @@ xfs_scrub_metadata( struct file *file, struct xfs_scrub_metadata *sm) { + struct xchk_stats_run run = { }; struct xfs_scrub *sc; struct xfs_mount *mp = XFS_I(file_inode(file))->i_mount; + u64 check_start; int error = 0; BUILD_BUG_ON(sizeof(meta_scrub_ops) != @@ -505,6 +516,8 @@ retry_op: error = mnt_want_write_file(sc->file); if (error) goto out_sc; + + sc->flags |= XCHK_HAVE_FREEZE_PROT; } /* Set up for the operation. */ @@ -517,7 +530,9 @@ retry_op: goto out_teardown; /* Scrub for errors. */ + check_start = xchk_stats_now(); error = sc->ops->scrub(sc); + run.scrub_ns += xchk_stats_elapsed_ns(check_start); if (error == -EDEADLOCK && !(sc->flags & XCHK_TRY_HARDER)) goto try_harder; if (error == -ECHRNG && !(sc->flags & XCHK_NEED_DRAIN)) @@ -529,15 +544,16 @@ retry_op: if ((sc->sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR) && !(sc->flags & XREP_ALREADY_FIXED)) { - bool needs_fix; + bool needs_fix = xchk_needs_repair(sc->sm); + + /* Userspace asked us to rebuild the structure regardless. */ + if (sc->sm->sm_flags & XFS_SCRUB_IFLAG_FORCE_REBUILD) + needs_fix = true; /* Let debug users force us into the repair routines. */ - if (XFS_TEST_ERROR(false, mp, XFS_ERRTAG_FORCE_SCRUB_REPAIR)) - sc->sm->sm_flags |= XFS_SCRUB_OFLAG_CORRUPT; + if (XFS_TEST_ERROR(needs_fix, mp, XFS_ERRTAG_FORCE_SCRUB_REPAIR)) + needs_fix = true; - needs_fix = (sc->sm->sm_flags & (XFS_SCRUB_OFLAG_CORRUPT | - XFS_SCRUB_OFLAG_XCORRUPT | - XFS_SCRUB_OFLAG_PREEN)); /* * If userspace asked for a repair but it wasn't necessary, * report that back to userspace. @@ -551,7 +567,7 @@ retry_op: * If it's broken, userspace wants us to fix it, and we haven't * already tried to fix it, then attempt a repair. */ - error = xrep_attempt(sc); + error = xrep_attempt(sc, &run); if (error == -EAGAIN) { /* * Either the repair function succeeded or it couldn't @@ -572,6 +588,8 @@ out_nofix: out_teardown: error = xchk_teardown(sc, error); out_sc: + if (error != -ENOENT) + xchk_stats_merge(mp, sm, &run); kfree(sc); out: trace_xchk_done(XFS_I(file_inode(file)), sm, error); @@ -585,6 +603,7 @@ need_drain: if (error) goto out_sc; sc->flags |= XCHK_NEED_DRAIN; + run.retries++; goto retry_op; try_harder: /* @@ -596,5 +615,6 @@ try_harder: if (error) goto out_sc; sc->flags |= XCHK_TRY_HARDER; + run.retries++; goto retry_op; } diff --git a/fs/xfs/scrub/scrub.h b/fs/xfs/scrub/scrub.h index e113f2f5c254..1ef9c6b4842a 100644 --- a/fs/xfs/scrub/scrub.h +++ b/fs/xfs/scrub/scrub.h @@ -88,6 +88,10 @@ struct xfs_scrub { */ void (*buf_cleanup)(void *buf); + /* xfile used by the scrubbers; freed at teardown. */ + struct xfile *xfile; + + /* Lock flags for @ip. */ uint ilock_flags; /* See the XCHK/XREP state flags below. */ @@ -106,6 +110,7 @@ struct xfs_scrub { /* XCHK state flags grow up from zero, XREP state flags grown down from 2^31 */ #define XCHK_TRY_HARDER (1U << 0) /* can't get resources, try again */ +#define XCHK_HAVE_FREEZE_PROT (1U << 1) /* do we have freeze protection? */ #define XCHK_FSGATES_DRAIN (1U << 2) /* defer ops draining enabled */ #define XCHK_NEED_DRAIN (1U << 3) /* scrub needs to drain defer ops */ #define XREP_ALREADY_FIXED (1U << 31) /* checking our repair work */ diff --git a/fs/xfs/scrub/stats.c b/fs/xfs/scrub/stats.c new file mode 100644 index 000000000000..cd91db4a5548 --- /dev/null +++ b/fs/xfs/scrub/stats.c @@ -0,0 +1,408 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2023 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_sysfs.h" +#include "xfs_btree.h" +#include "xfs_super.h" +#include "scrub/scrub.h" +#include "scrub/stats.h" +#include "scrub/trace.h" + +struct xchk_scrub_stats { + /* all 32-bit counters here */ + + /* checking stats */ + uint32_t invocations; + uint32_t clean; + uint32_t corrupt; + uint32_t preen; + uint32_t xfail; + uint32_t xcorrupt; + uint32_t incomplete; + uint32_t warning; + uint32_t retries; + + /* repair stats */ + uint32_t repair_invocations; + uint32_t repair_success; + + /* all 64-bit items here */ + + /* runtimes */ + uint64_t checktime_us; + uint64_t repairtime_us; + + /* non-counter state must go at the end for clearall */ + spinlock_t css_lock; +}; + +struct xchk_stats { + struct dentry *cs_debugfs; + struct xchk_scrub_stats cs_stats[XFS_SCRUB_TYPE_NR]; +}; + + +static struct xchk_stats global_stats; + +static const char *name_map[XFS_SCRUB_TYPE_NR] = { + [XFS_SCRUB_TYPE_SB] = "sb", + [XFS_SCRUB_TYPE_AGF] = "agf", + [XFS_SCRUB_TYPE_AGFL] = "agfl", + [XFS_SCRUB_TYPE_AGI] = "agi", + [XFS_SCRUB_TYPE_BNOBT] = "bnobt", + [XFS_SCRUB_TYPE_CNTBT] = "cntbt", + [XFS_SCRUB_TYPE_INOBT] = "inobt", + [XFS_SCRUB_TYPE_FINOBT] = "finobt", + [XFS_SCRUB_TYPE_RMAPBT] = "rmapbt", + [XFS_SCRUB_TYPE_REFCNTBT] = "refcountbt", + [XFS_SCRUB_TYPE_INODE] = "inode", + [XFS_SCRUB_TYPE_BMBTD] = "bmapbtd", + [XFS_SCRUB_TYPE_BMBTA] = "bmapbta", + [XFS_SCRUB_TYPE_BMBTC] = "bmapbtc", + [XFS_SCRUB_TYPE_DIR] = "directory", + [XFS_SCRUB_TYPE_XATTR] = "xattr", + [XFS_SCRUB_TYPE_SYMLINK] = "symlink", + [XFS_SCRUB_TYPE_PARENT] = "parent", + [XFS_SCRUB_TYPE_RTBITMAP] = "rtbitmap", + [XFS_SCRUB_TYPE_RTSUM] = "rtsummary", + [XFS_SCRUB_TYPE_UQUOTA] = "usrquota", + [XFS_SCRUB_TYPE_GQUOTA] = "grpquota", + [XFS_SCRUB_TYPE_PQUOTA] = "prjquota", + [XFS_SCRUB_TYPE_FSCOUNTERS] = "fscounters", +}; + +/* Format the scrub stats into a text buffer, similar to pcp style. */ +STATIC ssize_t +xchk_stats_format( + struct xchk_stats *cs, + char *buf, + size_t remaining) +{ + struct xchk_scrub_stats *css = &cs->cs_stats[0]; + unsigned int i; + ssize_t copied = 0; + int ret = 0; + + for (i = 0; i < XFS_SCRUB_TYPE_NR; i++, css++) { + if (!name_map[i]) + continue; + + ret = scnprintf(buf, remaining, + "%s %u %u %u %u %u %u %u %u %u %llu %u %u %llu\n", + name_map[i], + (unsigned int)css->invocations, + (unsigned int)css->clean, + (unsigned int)css->corrupt, + (unsigned int)css->preen, + (unsigned int)css->xfail, + (unsigned int)css->xcorrupt, + (unsigned int)css->incomplete, + (unsigned int)css->warning, + (unsigned int)css->retries, + (unsigned long long)css->checktime_us, + (unsigned int)css->repair_invocations, + (unsigned int)css->repair_success, + (unsigned long long)css->repairtime_us); + if (ret <= 0) + break; + + remaining -= ret; + copied += ret; + buf += ret; + } + + return copied > 0 ? copied : ret; +} + +/* Estimate the worst case buffer size required to hold the whole report. */ +STATIC size_t +xchk_stats_estimate_bufsize( + struct xchk_stats *cs) +{ + struct xchk_scrub_stats *css = &cs->cs_stats[0]; + unsigned int i; + size_t field_width; + size_t ret = 0; + + /* 4294967296 plus one space for each u32 field */ + field_width = 11 * (offsetof(struct xchk_scrub_stats, checktime_us) / + sizeof(uint32_t)); + + /* 18446744073709551615 plus one space for each u64 field */ + field_width += 21 * ((offsetof(struct xchk_scrub_stats, css_lock) - + offsetof(struct xchk_scrub_stats, checktime_us)) / + sizeof(uint64_t)); + + for (i = 0; i < XFS_SCRUB_TYPE_NR; i++, css++) { + if (!name_map[i]) + continue; + + /* name plus one space */ + ret += 1 + strlen(name_map[i]); + + /* all fields, plus newline */ + ret += field_width + 1; + } + + return ret; +} + +/* Clear all counters. */ +STATIC void +xchk_stats_clearall( + struct xchk_stats *cs) +{ + struct xchk_scrub_stats *css = &cs->cs_stats[0]; + unsigned int i; + + for (i = 0; i < XFS_SCRUB_TYPE_NR; i++, css++) { + spin_lock(&css->css_lock); + memset(css, 0, offsetof(struct xchk_scrub_stats, css_lock)); + spin_unlock(&css->css_lock); + } +} + +#define XFS_SCRUB_OFLAG_UNCLEAN (XFS_SCRUB_OFLAG_CORRUPT | \ + XFS_SCRUB_OFLAG_PREEN | \ + XFS_SCRUB_OFLAG_XFAIL | \ + XFS_SCRUB_OFLAG_XCORRUPT | \ + XFS_SCRUB_OFLAG_INCOMPLETE | \ + XFS_SCRUB_OFLAG_WARNING) + +STATIC void +xchk_stats_merge_one( + struct xchk_stats *cs, + const struct xfs_scrub_metadata *sm, + const struct xchk_stats_run *run) +{ + struct xchk_scrub_stats *css; + + if (sm->sm_type >= XFS_SCRUB_TYPE_NR) { + ASSERT(sm->sm_type < XFS_SCRUB_TYPE_NR); + return; + } + + css = &cs->cs_stats[sm->sm_type]; + spin_lock(&css->css_lock); + css->invocations++; + if (!(sm->sm_flags & XFS_SCRUB_OFLAG_UNCLEAN)) + css->clean++; + if (sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) + css->corrupt++; + if (sm->sm_flags & XFS_SCRUB_OFLAG_PREEN) + css->preen++; + if (sm->sm_flags & XFS_SCRUB_OFLAG_XFAIL) + css->xfail++; + if (sm->sm_flags & XFS_SCRUB_OFLAG_XCORRUPT) + css->xcorrupt++; + if (sm->sm_flags & XFS_SCRUB_OFLAG_INCOMPLETE) + css->incomplete++; + if (sm->sm_flags & XFS_SCRUB_OFLAG_WARNING) + css->warning++; + css->retries += run->retries; + css->checktime_us += howmany_64(run->scrub_ns, NSEC_PER_USEC); + + if (run->repair_attempted) + css->repair_invocations++; + if (run->repair_succeeded) + css->repair_success++; + css->repairtime_us += howmany_64(run->repair_ns, NSEC_PER_USEC); + spin_unlock(&css->css_lock); +} + +/* Merge these scrub-run stats into the global and mount stat data. */ +void +xchk_stats_merge( + struct xfs_mount *mp, + const struct xfs_scrub_metadata *sm, + const struct xchk_stats_run *run) +{ + xchk_stats_merge_one(&global_stats, sm, run); + xchk_stats_merge_one(mp->m_scrub_stats, sm, run); +} + +/* debugfs boilerplate */ + +static ssize_t +xchk_scrub_stats_read( + struct file *file, + char __user *ubuf, + size_t count, + loff_t *ppos) +{ + struct xchk_stats *cs = file->private_data; + char *buf; + size_t bufsize; + ssize_t avail, ret; + + /* + * This generates stringly snapshot of all the scrub counters, so we + * do not want userspace to receive garbled text from multiple calls. + * If the file position is greater than 0, return a short read. + */ + if (*ppos > 0) + return 0; + + bufsize = xchk_stats_estimate_bufsize(cs); + + buf = kvmalloc(bufsize, XCHK_GFP_FLAGS); + if (!buf) + return -ENOMEM; + + avail = xchk_stats_format(cs, buf, bufsize); + if (avail < 0) { + ret = avail; + goto out; + } + + ret = simple_read_from_buffer(ubuf, count, ppos, buf, avail); +out: + kvfree(buf); + return ret; +} + +static const struct file_operations scrub_stats_fops = { + .open = simple_open, + .read = xchk_scrub_stats_read, +}; + +static ssize_t +xchk_clear_scrub_stats_write( + struct file *file, + const char __user *ubuf, + size_t count, + loff_t *ppos) +{ + struct xchk_stats *cs = file->private_data; + unsigned int val; + int ret; + + ret = kstrtouint_from_user(ubuf, count, 0, &val); + if (ret) + return ret; + + if (val != 1) + return -EINVAL; + + xchk_stats_clearall(cs); + return count; +} + +static const struct file_operations clear_scrub_stats_fops = { + .open = simple_open, + .write = xchk_clear_scrub_stats_write, +}; + +/* Initialize the stats object. */ +STATIC int +xchk_stats_init( + struct xchk_stats *cs, + struct xfs_mount *mp) +{ + struct xchk_scrub_stats *css = &cs->cs_stats[0]; + unsigned int i; + + for (i = 0; i < XFS_SCRUB_TYPE_NR; i++, css++) + spin_lock_init(&css->css_lock); + + return 0; +} + +/* Connect the stats object to debugfs. */ +void +xchk_stats_register( + struct xchk_stats *cs, + struct dentry *parent) +{ + if (!parent) + return; + + cs->cs_debugfs = xfs_debugfs_mkdir("scrub", parent); + if (!cs->cs_debugfs) + return; + + debugfs_create_file("stats", 0644, cs->cs_debugfs, cs, + &scrub_stats_fops); + debugfs_create_file("clear_stats", 0400, cs->cs_debugfs, cs, + &clear_scrub_stats_fops); +} + +/* Free all resources related to the stats object. */ +STATIC int +xchk_stats_teardown( + struct xchk_stats *cs) +{ + return 0; +} + +/* Disconnect the stats object from debugfs. */ +void +xchk_stats_unregister( + struct xchk_stats *cs) +{ + debugfs_remove(cs->cs_debugfs); +} + +/* Initialize global stats and register them */ +int __init +xchk_global_stats_setup( + struct dentry *parent) +{ + int error; + + error = xchk_stats_init(&global_stats, NULL); + if (error) + return error; + + xchk_stats_register(&global_stats, parent); + return 0; +} + +/* Unregister global stats and tear them down */ +void +xchk_global_stats_teardown(void) +{ + xchk_stats_unregister(&global_stats); + xchk_stats_teardown(&global_stats); +} + +/* Allocate per-mount stats */ +int +xchk_mount_stats_alloc( + struct xfs_mount *mp) +{ + struct xchk_stats *cs; + int error; + + cs = kvzalloc(sizeof(struct xchk_stats), GFP_KERNEL); + if (!cs) + return -ENOMEM; + + error = xchk_stats_init(cs, mp); + if (error) + goto out_free; + + mp->m_scrub_stats = cs; + return 0; +out_free: + kvfree(cs); + return error; +} + +/* Free per-mount stats */ +void +xchk_mount_stats_free( + struct xfs_mount *mp) +{ + xchk_stats_teardown(mp->m_scrub_stats); + kvfree(mp->m_scrub_stats); + mp->m_scrub_stats = NULL; +} diff --git a/fs/xfs/scrub/stats.h b/fs/xfs/scrub/stats.h new file mode 100644 index 000000000000..b358ad8d8b90 --- /dev/null +++ b/fs/xfs/scrub/stats.h @@ -0,0 +1,59 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2023 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> + */ +#ifndef __XFS_SCRUB_STATS_H__ +#define __XFS_SCRUB_STATS_H__ + +struct xchk_stats_run { + u64 scrub_ns; + u64 repair_ns; + unsigned int retries; + bool repair_attempted; + bool repair_succeeded; +}; + +#ifdef CONFIG_XFS_ONLINE_SCRUB_STATS +struct xchk_stats; + +int __init xchk_global_stats_setup(struct dentry *parent); +void xchk_global_stats_teardown(void); + +int xchk_mount_stats_alloc(struct xfs_mount *mp); +void xchk_mount_stats_free(struct xfs_mount *mp); + +void xchk_stats_register(struct xchk_stats *cs, struct dentry *parent); +void xchk_stats_unregister(struct xchk_stats *cs); + +void xchk_stats_merge(struct xfs_mount *mp, const struct xfs_scrub_metadata *sm, + const struct xchk_stats_run *run); + +static inline u64 xchk_stats_now(void) { return ktime_get_ns(); } +static inline u64 xchk_stats_elapsed_ns(u64 since) +{ + u64 now = xchk_stats_now(); + + /* + * If the system doesn't have a high enough resolution clock, charge at + * least one nanosecond so that our stats don't report instantaneous + * runtimes. + */ + if (now == since) + return 1; + + return now - since; +} +#else +# define xchk_global_stats_setup(parent) (0) +# define xchk_global_stats_teardown() ((void)0) +# define xchk_mount_stats_alloc(mp) (0) +# define xchk_mount_stats_free(mp) ((void)0) +# define xchk_stats_register(cs, parent) ((void)0) +# define xchk_stats_unregister(cs) ((void)0) +# define xchk_stats_now() (0) +# define xchk_stats_elapsed_ns(x) (0 * (x)) +# define xchk_stats_merge(mp, sm, run) ((void)0) +#endif /* CONFIG_XFS_ONLINE_SCRUB_STATS */ + +#endif /* __XFS_SCRUB_STATS_H__ */ diff --git a/fs/xfs/scrub/trace.c b/fs/xfs/scrub/trace.c index 0a975439d2b6..29afa4851235 100644 --- a/fs/xfs/scrub/trace.c +++ b/fs/xfs/scrub/trace.c @@ -12,8 +12,11 @@ #include "xfs_mount.h" #include "xfs_inode.h" #include "xfs_btree.h" -#include "scrub/scrub.h" #include "xfs_ag.h" +#include "xfs_rtbitmap.h" +#include "scrub/scrub.h" +#include "scrub/xfile.h" +#include "scrub/xfarray.h" /* Figure out which block the btree cursor was pointing to. */ static inline xfs_fsblock_t diff --git a/fs/xfs/scrub/trace.h b/fs/xfs/scrub/trace.h index b3894daeb86a..4a8bc6f3c8f2 100644 --- a/fs/xfs/scrub/trace.h +++ b/fs/xfs/scrub/trace.h @@ -16,6 +16,10 @@ #include <linux/tracepoint.h> #include "xfs_bit.h" +struct xfile; +struct xfarray; +struct xfarray_sortinfo; + /* * ftrace's __print_symbolic requires that all enum values be wrapped in the * TRACE_DEFINE_ENUM macro so that the enum value can be encoded in the ftrace @@ -94,10 +98,12 @@ TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_FSCOUNTERS); { XFS_SCRUB_OFLAG_XCORRUPT, "xcorrupt" }, \ { XFS_SCRUB_OFLAG_INCOMPLETE, "incomplete" }, \ { XFS_SCRUB_OFLAG_WARNING, "warning" }, \ - { XFS_SCRUB_OFLAG_NO_REPAIR_NEEDED, "norepair" } + { XFS_SCRUB_OFLAG_NO_REPAIR_NEEDED, "norepair" }, \ + { XFS_SCRUB_IFLAG_FORCE_REBUILD, "rebuild" } #define XFS_SCRUB_STATE_STRINGS \ { XCHK_TRY_HARDER, "try_harder" }, \ + { XCHK_HAVE_FREEZE_PROT, "nofreeze" }, \ { XCHK_FSGATES_DRAIN, "fsgates_drain" }, \ { XCHK_NEED_DRAIN, "need_drain" }, \ { XREP_ALREADY_FIXED, "already_fixed" } @@ -635,6 +641,28 @@ TRACE_EVENT(xchk_iallocbt_check_cluster, __entry->cluster_ino) ) +TRACE_EVENT(xchk_inode_is_allocated, + TP_PROTO(struct xfs_inode *ip), + TP_ARGS(ip), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(unsigned long, iflags) + __field(umode_t, mode) + ), + TP_fast_assign( + __entry->dev = VFS_I(ip)->i_sb->s_dev; + __entry->ino = ip->i_ino; + __entry->iflags = ip->i_flags; + __entry->mode = VFS_I(ip)->i_mode; + ), + TP_printk("dev %d:%d ino 0x%llx iflags 0x%lx mode 0x%x", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __entry->iflags, + __entry->mode) +); + TRACE_EVENT(xchk_fscounters_calc, TP_PROTO(struct xfs_mount *mp, uint64_t icount, uint64_t ifree, uint64_t fdblocks, uint64_t delalloc), @@ -693,6 +721,31 @@ TRACE_EVENT(xchk_fscounters_within_range, __entry->old_value) ) +DECLARE_EVENT_CLASS(xchk_fsfreeze_class, + TP_PROTO(struct xfs_scrub *sc, int error), + TP_ARGS(sc, error), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(unsigned int, type) + __field(int, error) + ), + TP_fast_assign( + __entry->dev = sc->mp->m_super->s_dev; + __entry->type = sc->sm->sm_type; + __entry->error = error; + ), + TP_printk("dev %d:%d type %s error %d", + MAJOR(__entry->dev), MINOR(__entry->dev), + __print_symbolic(__entry->type, XFS_SCRUB_TYPE_STRINGS), + __entry->error) +); +#define DEFINE_XCHK_FSFREEZE_EVENT(name) \ +DEFINE_EVENT(xchk_fsfreeze_class, name, \ + TP_PROTO(struct xfs_scrub *sc, int error), \ + TP_ARGS(sc, error)) +DEFINE_XCHK_FSFREEZE_EVENT(xchk_fsfreeze); +DEFINE_XCHK_FSFREEZE_EVENT(xchk_fsthaw); + TRACE_EVENT(xchk_refcount_incorrect, TP_PROTO(struct xfs_perag *pag, const struct xfs_refcount_irec *irec, xfs_nlink_t seen), @@ -725,13 +778,303 @@ TRACE_EVENT(xchk_refcount_incorrect, __entry->seen) ) +TRACE_EVENT(xfile_create, + TP_PROTO(struct xfile *xf), + TP_ARGS(xf), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(unsigned long, ino) + __array(char, pathname, 256) + ), + TP_fast_assign( + char pathname[257]; + char *path; + + __entry->ino = file_inode(xf->file)->i_ino; + memset(pathname, 0, sizeof(pathname)); + path = file_path(xf->file, pathname, sizeof(pathname) - 1); + if (IS_ERR(path)) + path = "(unknown)"; + strncpy(__entry->pathname, path, sizeof(__entry->pathname)); + ), + TP_printk("xfino 0x%lx path '%s'", + __entry->ino, + __entry->pathname) +); + +TRACE_EVENT(xfile_destroy, + TP_PROTO(struct xfile *xf), + TP_ARGS(xf), + TP_STRUCT__entry( + __field(unsigned long, ino) + __field(unsigned long long, bytes) + __field(loff_t, size) + ), + TP_fast_assign( + struct xfile_stat statbuf; + int ret; + + ret = xfile_stat(xf, &statbuf); + if (!ret) { + __entry->bytes = statbuf.bytes; + __entry->size = statbuf.size; + } else { + __entry->bytes = -1; + __entry->size = -1; + } + __entry->ino = file_inode(xf->file)->i_ino; + ), + TP_printk("xfino 0x%lx mem_bytes 0x%llx isize 0x%llx", + __entry->ino, + __entry->bytes, + __entry->size) +); + +DECLARE_EVENT_CLASS(xfile_class, + TP_PROTO(struct xfile *xf, loff_t pos, unsigned long long bytecount), + TP_ARGS(xf, pos, bytecount), + TP_STRUCT__entry( + __field(unsigned long, ino) + __field(unsigned long long, bytes_used) + __field(loff_t, pos) + __field(loff_t, size) + __field(unsigned long long, bytecount) + ), + TP_fast_assign( + struct xfile_stat statbuf; + int ret; + + ret = xfile_stat(xf, &statbuf); + if (!ret) { + __entry->bytes_used = statbuf.bytes; + __entry->size = statbuf.size; + } else { + __entry->bytes_used = -1; + __entry->size = -1; + } + __entry->ino = file_inode(xf->file)->i_ino; + __entry->pos = pos; + __entry->bytecount = bytecount; + ), + TP_printk("xfino 0x%lx mem_bytes 0x%llx pos 0x%llx bytecount 0x%llx isize 0x%llx", + __entry->ino, + __entry->bytes_used, + __entry->pos, + __entry->bytecount, + __entry->size) +); +#define DEFINE_XFILE_EVENT(name) \ +DEFINE_EVENT(xfile_class, name, \ + TP_PROTO(struct xfile *xf, loff_t pos, unsigned long long bytecount), \ + TP_ARGS(xf, pos, bytecount)) +DEFINE_XFILE_EVENT(xfile_pread); +DEFINE_XFILE_EVENT(xfile_pwrite); +DEFINE_XFILE_EVENT(xfile_seek_data); +DEFINE_XFILE_EVENT(xfile_get_page); +DEFINE_XFILE_EVENT(xfile_put_page); + +TRACE_EVENT(xfarray_create, + TP_PROTO(struct xfarray *xfa, unsigned long long required_capacity), + TP_ARGS(xfa, required_capacity), + TP_STRUCT__entry( + __field(unsigned long, ino) + __field(uint64_t, max_nr) + __field(size_t, obj_size) + __field(int, obj_size_log) + __field(unsigned long long, required_capacity) + ), + TP_fast_assign( + __entry->max_nr = xfa->max_nr; + __entry->obj_size = xfa->obj_size; + __entry->obj_size_log = xfa->obj_size_log; + __entry->ino = file_inode(xfa->xfile->file)->i_ino; + __entry->required_capacity = required_capacity; + ), + TP_printk("xfino 0x%lx max_nr %llu reqd_nr %llu objsz %zu objszlog %d", + __entry->ino, + __entry->max_nr, + __entry->required_capacity, + __entry->obj_size, + __entry->obj_size_log) +); + +TRACE_EVENT(xfarray_isort, + TP_PROTO(struct xfarray_sortinfo *si, uint64_t lo, uint64_t hi), + TP_ARGS(si, lo, hi), + TP_STRUCT__entry( + __field(unsigned long, ino) + __field(unsigned long long, lo) + __field(unsigned long long, hi) + ), + TP_fast_assign( + __entry->ino = file_inode(si->array->xfile->file)->i_ino; + __entry->lo = lo; + __entry->hi = hi; + ), + TP_printk("xfino 0x%lx lo %llu hi %llu elts %llu", + __entry->ino, + __entry->lo, + __entry->hi, + __entry->hi - __entry->lo) +); + +TRACE_EVENT(xfarray_pagesort, + TP_PROTO(struct xfarray_sortinfo *si, uint64_t lo, uint64_t hi), + TP_ARGS(si, lo, hi), + TP_STRUCT__entry( + __field(unsigned long, ino) + __field(unsigned long long, lo) + __field(unsigned long long, hi) + ), + TP_fast_assign( + __entry->ino = file_inode(si->array->xfile->file)->i_ino; + __entry->lo = lo; + __entry->hi = hi; + ), + TP_printk("xfino 0x%lx lo %llu hi %llu elts %llu", + __entry->ino, + __entry->lo, + __entry->hi, + __entry->hi - __entry->lo) +); + +TRACE_EVENT(xfarray_qsort, + TP_PROTO(struct xfarray_sortinfo *si, uint64_t lo, uint64_t hi), + TP_ARGS(si, lo, hi), + TP_STRUCT__entry( + __field(unsigned long, ino) + __field(unsigned long long, lo) + __field(unsigned long long, hi) + __field(int, stack_depth) + __field(int, max_stack_depth) + ), + TP_fast_assign( + __entry->ino = file_inode(si->array->xfile->file)->i_ino; + __entry->lo = lo; + __entry->hi = hi; + __entry->stack_depth = si->stack_depth; + __entry->max_stack_depth = si->max_stack_depth; + ), + TP_printk("xfino 0x%lx lo %llu hi %llu elts %llu stack %d/%d", + __entry->ino, + __entry->lo, + __entry->hi, + __entry->hi - __entry->lo, + __entry->stack_depth, + __entry->max_stack_depth) +); + +TRACE_EVENT(xfarray_sort, + TP_PROTO(struct xfarray_sortinfo *si, size_t bytes), + TP_ARGS(si, bytes), + TP_STRUCT__entry( + __field(unsigned long, ino) + __field(unsigned long long, nr) + __field(size_t, obj_size) + __field(size_t, bytes) + __field(unsigned int, max_stack_depth) + ), + TP_fast_assign( + __entry->nr = si->array->nr; + __entry->obj_size = si->array->obj_size; + __entry->ino = file_inode(si->array->xfile->file)->i_ino; + __entry->bytes = bytes; + __entry->max_stack_depth = si->max_stack_depth; + ), + TP_printk("xfino 0x%lx nr %llu objsz %zu stack %u bytes %zu", + __entry->ino, + __entry->nr, + __entry->obj_size, + __entry->max_stack_depth, + __entry->bytes) +); + +TRACE_EVENT(xfarray_sort_stats, + TP_PROTO(struct xfarray_sortinfo *si, int error), + TP_ARGS(si, error), + TP_STRUCT__entry( + __field(unsigned long, ino) +#ifdef DEBUG + __field(unsigned long long, loads) + __field(unsigned long long, stores) + __field(unsigned long long, compares) + __field(unsigned long long, heapsorts) +#endif + __field(unsigned int, max_stack_depth) + __field(unsigned int, max_stack_used) + __field(int, error) + ), + TP_fast_assign( + __entry->ino = file_inode(si->array->xfile->file)->i_ino; +#ifdef DEBUG + __entry->loads = si->loads; + __entry->stores = si->stores; + __entry->compares = si->compares; + __entry->heapsorts = si->heapsorts; +#endif + __entry->max_stack_depth = si->max_stack_depth; + __entry->max_stack_used = si->max_stack_used; + __entry->error = error; + ), + TP_printk( +#ifdef DEBUG + "xfino 0x%lx loads %llu stores %llu compares %llu heapsorts %llu stack_depth %u/%u error %d", +#else + "xfino 0x%lx stack_depth %u/%u error %d", +#endif + __entry->ino, +#ifdef DEBUG + __entry->loads, + __entry->stores, + __entry->compares, + __entry->heapsorts, +#endif + __entry->max_stack_used, + __entry->max_stack_depth, + __entry->error) +); + +#ifdef CONFIG_XFS_RT +TRACE_EVENT(xchk_rtsum_record_free, + TP_PROTO(struct xfs_mount *mp, xfs_rtxnum_t start, + xfs_rtbxlen_t len, unsigned int log, loff_t pos, + xfs_suminfo_t value), + TP_ARGS(mp, start, len, log, pos, value), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(dev_t, rtdev) + __field(xfs_rtxnum_t, start) + __field(unsigned long long, len) + __field(unsigned int, log) + __field(loff_t, pos) + __field(xfs_suminfo_t, value) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->rtdev = mp->m_rtdev_targp->bt_dev; + __entry->start = start; + __entry->len = len; + __entry->log = log; + __entry->pos = pos; + __entry->value = value; + ), + TP_printk("dev %d:%d rtdev %d:%d rtx 0x%llx rtxcount 0x%llx log %u rsumpos 0x%llx sumcount %u", + MAJOR(__entry->dev), MINOR(__entry->dev), + MAJOR(__entry->rtdev), MINOR(__entry->rtdev), + __entry->start, + __entry->len, + __entry->log, + __entry->pos, + __entry->value) +); +#endif /* CONFIG_XFS_RT */ + /* repair tracepoints */ #if IS_ENABLED(CONFIG_XFS_ONLINE_REPAIR) DECLARE_EVENT_CLASS(xrep_extent_class, - TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, - xfs_agblock_t agbno, xfs_extlen_t len), - TP_ARGS(mp, agno, agbno, len), + TP_PROTO(struct xfs_perag *pag, xfs_agblock_t agbno, xfs_extlen_t len), + TP_ARGS(pag, agbno, len), TP_STRUCT__entry( __field(dev_t, dev) __field(xfs_agnumber_t, agno) @@ -739,8 +1082,8 @@ DECLARE_EVENT_CLASS(xrep_extent_class, __field(xfs_extlen_t, len) ), TP_fast_assign( - __entry->dev = mp->m_super->s_dev; - __entry->agno = agno; + __entry->dev = pag->pag_mount->m_super->s_dev; + __entry->agno = pag->pag_agno; __entry->agbno = agbno; __entry->len = len; ), @@ -752,12 +1095,45 @@ DECLARE_EVENT_CLASS(xrep_extent_class, ); #define DEFINE_REPAIR_EXTENT_EVENT(name) \ DEFINE_EVENT(xrep_extent_class, name, \ - TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, \ - xfs_agblock_t agbno, xfs_extlen_t len), \ - TP_ARGS(mp, agno, agbno, len)) -DEFINE_REPAIR_EXTENT_EVENT(xrep_dispose_btree_extent); + TP_PROTO(struct xfs_perag *pag, xfs_agblock_t agbno, xfs_extlen_t len), \ + TP_ARGS(pag, agbno, len)) +DEFINE_REPAIR_EXTENT_EVENT(xreap_dispose_unmap_extent); +DEFINE_REPAIR_EXTENT_EVENT(xreap_dispose_free_extent); +DEFINE_REPAIR_EXTENT_EVENT(xreap_agextent_binval); DEFINE_REPAIR_EXTENT_EVENT(xrep_agfl_insert); +DECLARE_EVENT_CLASS(xrep_reap_find_class, + TP_PROTO(struct xfs_perag *pag, xfs_agblock_t agbno, xfs_extlen_t len, + bool crosslinked), + TP_ARGS(pag, agbno, len, crosslinked), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_agnumber_t, agno) + __field(xfs_agblock_t, agbno) + __field(xfs_extlen_t, len) + __field(bool, crosslinked) + ), + TP_fast_assign( + __entry->dev = pag->pag_mount->m_super->s_dev; + __entry->agno = pag->pag_agno; + __entry->agbno = agbno; + __entry->len = len; + __entry->crosslinked = crosslinked; + ), + TP_printk("dev %d:%d agno 0x%x agbno 0x%x fsbcount 0x%x crosslinked %d", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->agno, + __entry->agbno, + __entry->len, + __entry->crosslinked ? 1 : 0) +); +#define DEFINE_REPAIR_REAP_FIND_EVENT(name) \ +DEFINE_EVENT(xrep_reap_find_class, name, \ + TP_PROTO(struct xfs_perag *pag, xfs_agblock_t agbno, xfs_extlen_t len, \ + bool crosslinked), \ + TP_ARGS(pag, agbno, len, crosslinked)) +DEFINE_REPAIR_REAP_FIND_EVENT(xreap_agextent_select); + DECLARE_EVENT_CLASS(xrep_rmap_class, TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, xfs_agblock_t agbno, xfs_extlen_t len, @@ -827,28 +1203,6 @@ TRACE_EVENT(xrep_refcount_extent_fn, __entry->refcount) ) -TRACE_EVENT(xrep_init_btblock, - TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, xfs_agblock_t agbno, - xfs_btnum_t btnum), - TP_ARGS(mp, agno, agbno, btnum), - TP_STRUCT__entry( - __field(dev_t, dev) - __field(xfs_agnumber_t, agno) - __field(xfs_agblock_t, agbno) - __field(uint32_t, btnum) - ), - TP_fast_assign( - __entry->dev = mp->m_super->s_dev; - __entry->agno = agno; - __entry->agbno = agbno; - __entry->btnum = btnum; - ), - TP_printk("dev %d:%d agno 0x%x agbno 0x%x btree %s", - MAJOR(__entry->dev), MINOR(__entry->dev), - __entry->agno, - __entry->agbno, - __print_symbolic(__entry->btnum, XFS_BTNUM_STRINGS)) -) TRACE_EVENT(xrep_findroot_block, TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, xfs_agblock_t agbno, uint32_t magic, uint16_t level), diff --git a/fs/xfs/scrub/xfarray.c b/fs/xfs/scrub/xfarray.c new file mode 100644 index 000000000000..f0f532c10a5a --- /dev/null +++ b/fs/xfs/scrub/xfarray.c @@ -0,0 +1,1083 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2021-2023 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "scrub/xfile.h" +#include "scrub/xfarray.h" +#include "scrub/scrub.h" +#include "scrub/trace.h" + +/* + * Large Arrays of Fixed-Size Records + * ================================== + * + * This memory array uses an xfile (which itself is a memfd "file") to store + * large numbers of fixed-size records in memory that can be paged out. This + * puts less stress on the memory reclaim algorithms during an online repair + * because we don't have to pin so much memory. However, array access is less + * direct than would be in a regular memory array. Access to the array is + * performed via indexed load and store methods, and an append method is + * provided for convenience. Array elements can be unset, which sets them to + * all zeroes. Unset entries are skipped during iteration, though direct loads + * will return a zeroed buffer. Callers are responsible for concurrency + * control. + */ + +/* + * Pointer to scratch space. Because we can't access the xfile data directly, + * we allocate a small amount of memory on the end of the xfarray structure to + * buffer array items when we need space to store values temporarily. + */ +static inline void *xfarray_scratch(struct xfarray *array) +{ + return (array + 1); +} + +/* Compute array index given an xfile offset. */ +static xfarray_idx_t +xfarray_idx( + struct xfarray *array, + loff_t pos) +{ + if (array->obj_size_log >= 0) + return (xfarray_idx_t)pos >> array->obj_size_log; + + return div_u64((xfarray_idx_t)pos, array->obj_size); +} + +/* Compute xfile offset of array element. */ +static inline loff_t xfarray_pos(struct xfarray *array, xfarray_idx_t idx) +{ + if (array->obj_size_log >= 0) + return idx << array->obj_size_log; + + return idx * array->obj_size; +} + +/* + * Initialize a big memory array. Array records cannot be larger than a + * page, and the array cannot span more bytes than the page cache supports. + * If @required_capacity is nonzero, the maximum array size will be set to this + * quantity and the array creation will fail if the underlying storage cannot + * support that many records. + */ +int +xfarray_create( + const char *description, + unsigned long long required_capacity, + size_t obj_size, + struct xfarray **arrayp) +{ + struct xfarray *array; + struct xfile *xfile; + int error; + + ASSERT(obj_size < PAGE_SIZE); + + error = xfile_create(description, 0, &xfile); + if (error) + return error; + + error = -ENOMEM; + array = kzalloc(sizeof(struct xfarray) + obj_size, XCHK_GFP_FLAGS); + if (!array) + goto out_xfile; + + array->xfile = xfile; + array->obj_size = obj_size; + + if (is_power_of_2(obj_size)) + array->obj_size_log = ilog2(obj_size); + else + array->obj_size_log = -1; + + array->max_nr = xfarray_idx(array, MAX_LFS_FILESIZE); + trace_xfarray_create(array, required_capacity); + + if (required_capacity > 0) { + if (array->max_nr < required_capacity) { + error = -ENOMEM; + goto out_xfarray; + } + array->max_nr = required_capacity; + } + + *arrayp = array; + return 0; + +out_xfarray: + kfree(array); +out_xfile: + xfile_destroy(xfile); + return error; +} + +/* Destroy the array. */ +void +xfarray_destroy( + struct xfarray *array) +{ + xfile_destroy(array->xfile); + kfree(array); +} + +/* Load an element from the array. */ +int +xfarray_load( + struct xfarray *array, + xfarray_idx_t idx, + void *ptr) +{ + if (idx >= array->nr) + return -ENODATA; + + return xfile_obj_load(array->xfile, ptr, array->obj_size, + xfarray_pos(array, idx)); +} + +/* Is this array element potentially unset? */ +static inline bool +xfarray_is_unset( + struct xfarray *array, + loff_t pos) +{ + void *temp = xfarray_scratch(array); + int error; + + if (array->unset_slots == 0) + return false; + + error = xfile_obj_load(array->xfile, temp, array->obj_size, pos); + if (!error && xfarray_element_is_null(array, temp)) + return true; + + return false; +} + +/* + * Unset an array element. If @idx is the last element in the array, the + * array will be truncated. Otherwise, the entry will be zeroed. + */ +int +xfarray_unset( + struct xfarray *array, + xfarray_idx_t idx) +{ + void *temp = xfarray_scratch(array); + loff_t pos = xfarray_pos(array, idx); + int error; + + if (idx >= array->nr) + return -ENODATA; + + if (idx == array->nr - 1) { + array->nr--; + return 0; + } + + if (xfarray_is_unset(array, pos)) + return 0; + + memset(temp, 0, array->obj_size); + error = xfile_obj_store(array->xfile, temp, array->obj_size, pos); + if (error) + return error; + + array->unset_slots++; + return 0; +} + +/* + * Store an element in the array. The element must not be completely zeroed, + * because those are considered unset sparse elements. + */ +int +xfarray_store( + struct xfarray *array, + xfarray_idx_t idx, + const void *ptr) +{ + int ret; + + if (idx >= array->max_nr) + return -EFBIG; + + ASSERT(!xfarray_element_is_null(array, ptr)); + + ret = xfile_obj_store(array->xfile, ptr, array->obj_size, + xfarray_pos(array, idx)); + if (ret) + return ret; + + array->nr = max(array->nr, idx + 1); + return 0; +} + +/* Is this array element NULL? */ +bool +xfarray_element_is_null( + struct xfarray *array, + const void *ptr) +{ + return !memchr_inv(ptr, 0, array->obj_size); +} + +/* + * Store an element anywhere in the array that is unset. If there are no + * unset slots, append the element to the array. + */ +int +xfarray_store_anywhere( + struct xfarray *array, + const void *ptr) +{ + void *temp = xfarray_scratch(array); + loff_t endpos = xfarray_pos(array, array->nr); + loff_t pos; + int error; + + /* Find an unset slot to put it in. */ + for (pos = 0; + pos < endpos && array->unset_slots > 0; + pos += array->obj_size) { + error = xfile_obj_load(array->xfile, temp, array->obj_size, + pos); + if (error || !xfarray_element_is_null(array, temp)) + continue; + + error = xfile_obj_store(array->xfile, ptr, array->obj_size, + pos); + if (error) + return error; + + array->unset_slots--; + return 0; + } + + /* No unset slots found; attach it on the end. */ + array->unset_slots = 0; + return xfarray_append(array, ptr); +} + +/* Return length of array. */ +uint64_t +xfarray_length( + struct xfarray *array) +{ + return array->nr; +} + +/* + * Decide which array item we're going to read as part of an _iter_get. + * @cur is the array index, and @pos is the file offset of that array index in + * the backing xfile. Returns ENODATA if we reach the end of the records. + * + * Reading from a hole in a sparse xfile causes page instantiation, so for + * iterating a (possibly sparse) array we need to figure out if the cursor is + * pointing at a totally uninitialized hole and move the cursor up if + * necessary. + */ +static inline int +xfarray_find_data( + struct xfarray *array, + xfarray_idx_t *cur, + loff_t *pos) +{ + unsigned int pgoff = offset_in_page(*pos); + loff_t end_pos = *pos + array->obj_size - 1; + loff_t new_pos; + + /* + * If the current array record is not adjacent to a page boundary, we + * are in the middle of the page. We do not need to move the cursor. + */ + if (pgoff != 0 && pgoff + array->obj_size - 1 < PAGE_SIZE) + return 0; + + /* + * Call SEEK_DATA on the last byte in the record we're about to read. + * If the record ends at (or crosses) the end of a page then we know + * that the first byte of the record is backed by pages and don't need + * to query it. If instead the record begins at the start of the page + * then we know that querying the last byte is just as good as querying + * the first byte, since records cannot be larger than a page. + * + * If the call returns the same file offset, we know this record is + * backed by real pages. We do not need to move the cursor. + */ + new_pos = xfile_seek_data(array->xfile, end_pos); + if (new_pos == -ENXIO) + return -ENODATA; + if (new_pos < 0) + return new_pos; + if (new_pos == end_pos) + return 0; + + /* + * Otherwise, SEEK_DATA told us how far up to move the file pointer to + * find more data. Move the array index to the first record past the + * byte offset we were given. + */ + new_pos = roundup_64(new_pos, array->obj_size); + *cur = xfarray_idx(array, new_pos); + *pos = xfarray_pos(array, *cur); + return 0; +} + +/* + * Starting at *idx, fetch the next non-null array entry and advance the index + * to set up the next _load_next call. Returns ENODATA if we reach the end of + * the array. Callers must set @*idx to XFARRAY_CURSOR_INIT before the first + * call to this function. + */ +int +xfarray_load_next( + struct xfarray *array, + xfarray_idx_t *idx, + void *rec) +{ + xfarray_idx_t cur = *idx; + loff_t pos = xfarray_pos(array, cur); + int error; + + do { + if (cur >= array->nr) + return -ENODATA; + + /* + * Ask the backing store for the location of next possible + * written record, then retrieve that record. + */ + error = xfarray_find_data(array, &cur, &pos); + if (error) + return error; + error = xfarray_load(array, cur, rec); + if (error) + return error; + + cur++; + pos += array->obj_size; + } while (xfarray_element_is_null(array, rec)); + + *idx = cur; + return 0; +} + +/* Sorting functions */ + +#ifdef DEBUG +# define xfarray_sort_bump_loads(si) do { (si)->loads++; } while (0) +# define xfarray_sort_bump_stores(si) do { (si)->stores++; } while (0) +# define xfarray_sort_bump_compares(si) do { (si)->compares++; } while (0) +# define xfarray_sort_bump_heapsorts(si) do { (si)->heapsorts++; } while (0) +#else +# define xfarray_sort_bump_loads(si) +# define xfarray_sort_bump_stores(si) +# define xfarray_sort_bump_compares(si) +# define xfarray_sort_bump_heapsorts(si) +#endif /* DEBUG */ + +/* Load an array element for sorting. */ +static inline int +xfarray_sort_load( + struct xfarray_sortinfo *si, + xfarray_idx_t idx, + void *ptr) +{ + xfarray_sort_bump_loads(si); + return xfarray_load(si->array, idx, ptr); +} + +/* Store an array element for sorting. */ +static inline int +xfarray_sort_store( + struct xfarray_sortinfo *si, + xfarray_idx_t idx, + void *ptr) +{ + xfarray_sort_bump_stores(si); + return xfarray_store(si->array, idx, ptr); +} + +/* Compare an array element for sorting. */ +static inline int +xfarray_sort_cmp( + struct xfarray_sortinfo *si, + const void *a, + const void *b) +{ + xfarray_sort_bump_compares(si); + return si->cmp_fn(a, b); +} + +/* Return a pointer to the low index stack for quicksort partitioning. */ +static inline xfarray_idx_t *xfarray_sortinfo_lo(struct xfarray_sortinfo *si) +{ + return (xfarray_idx_t *)(si + 1); +} + +/* Return a pointer to the high index stack for quicksort partitioning. */ +static inline xfarray_idx_t *xfarray_sortinfo_hi(struct xfarray_sortinfo *si) +{ + return xfarray_sortinfo_lo(si) + si->max_stack_depth; +} + +/* Size of each element in the quicksort pivot array. */ +static inline size_t +xfarray_pivot_rec_sz( + struct xfarray *array) +{ + return round_up(array->obj_size, 8) + sizeof(xfarray_idx_t); +} + +/* Allocate memory to handle the sort. */ +static inline int +xfarray_sortinfo_alloc( + struct xfarray *array, + xfarray_cmp_fn cmp_fn, + unsigned int flags, + struct xfarray_sortinfo **infop) +{ + struct xfarray_sortinfo *si; + size_t nr_bytes = sizeof(struct xfarray_sortinfo); + size_t pivot_rec_sz = xfarray_pivot_rec_sz(array); + int max_stack_depth; + + /* + * The median-of-nine pivot algorithm doesn't work if a subset has + * fewer than 9 items. Make sure the in-memory sort will always take + * over for subsets where this wouldn't be the case. + */ + BUILD_BUG_ON(XFARRAY_QSORT_PIVOT_NR >= XFARRAY_ISORT_NR); + + /* + * Tail-call recursion during the partitioning phase means that + * quicksort will never recurse more than log2(nr) times. We need one + * extra level of stack to hold the initial parameters. In-memory + * sort will always take care of the last few levels of recursion for + * us, so we can reduce the stack depth by that much. + */ + max_stack_depth = ilog2(array->nr) + 1 - (XFARRAY_ISORT_SHIFT - 1); + if (max_stack_depth < 1) + max_stack_depth = 1; + + /* Each level of quicksort uses a lo and a hi index */ + nr_bytes += max_stack_depth * sizeof(xfarray_idx_t) * 2; + + /* Scratchpad for in-memory sort, or finding the pivot */ + nr_bytes += max_t(size_t, + (XFARRAY_QSORT_PIVOT_NR + 1) * pivot_rec_sz, + XFARRAY_ISORT_NR * array->obj_size); + + si = kvzalloc(nr_bytes, XCHK_GFP_FLAGS); + if (!si) + return -ENOMEM; + + si->array = array; + si->cmp_fn = cmp_fn; + si->flags = flags; + si->max_stack_depth = max_stack_depth; + si->max_stack_used = 1; + + xfarray_sortinfo_lo(si)[0] = 0; + xfarray_sortinfo_hi(si)[0] = array->nr - 1; + + trace_xfarray_sort(si, nr_bytes); + *infop = si; + return 0; +} + +/* Should this sort be terminated by a fatal signal? */ +static inline bool +xfarray_sort_terminated( + struct xfarray_sortinfo *si, + int *error) +{ + /* + * If preemption is disabled, we need to yield to the scheduler every + * few seconds so that we don't run afoul of the soft lockup watchdog + * or RCU stall detector. + */ + cond_resched(); + + if ((si->flags & XFARRAY_SORT_KILLABLE) && + fatal_signal_pending(current)) { + if (*error == 0) + *error = -EINTR; + return true; + } + return false; +} + +/* Do we want an in-memory sort? */ +static inline bool +xfarray_want_isort( + struct xfarray_sortinfo *si, + xfarray_idx_t start, + xfarray_idx_t end) +{ + /* + * For array subsets that fit in the scratchpad, it's much faster to + * use the kernel's heapsort than quicksort's stack machine. + */ + return (end - start) < XFARRAY_ISORT_NR; +} + +/* Return the scratch space within the sortinfo structure. */ +static inline void *xfarray_sortinfo_isort_scratch(struct xfarray_sortinfo *si) +{ + return xfarray_sortinfo_hi(si) + si->max_stack_depth; +} + +/* + * Sort a small number of array records using scratchpad memory. The records + * need not be contiguous in the xfile's memory pages. + */ +STATIC int +xfarray_isort( + struct xfarray_sortinfo *si, + xfarray_idx_t lo, + xfarray_idx_t hi) +{ + void *scratch = xfarray_sortinfo_isort_scratch(si); + loff_t lo_pos = xfarray_pos(si->array, lo); + loff_t len = xfarray_pos(si->array, hi - lo + 1); + int error; + + trace_xfarray_isort(si, lo, hi); + + xfarray_sort_bump_loads(si); + error = xfile_obj_load(si->array->xfile, scratch, len, lo_pos); + if (error) + return error; + + xfarray_sort_bump_heapsorts(si); + sort(scratch, hi - lo + 1, si->array->obj_size, si->cmp_fn, NULL); + + xfarray_sort_bump_stores(si); + return xfile_obj_store(si->array->xfile, scratch, len, lo_pos); +} + +/* Grab a page for sorting records. */ +static inline int +xfarray_sort_get_page( + struct xfarray_sortinfo *si, + loff_t pos, + uint64_t len) +{ + int error; + + error = xfile_get_page(si->array->xfile, pos, len, &si->xfpage); + if (error) + return error; + + /* + * xfile pages must never be mapped into userspace, so we skip the + * dcache flush when mapping the page. + */ + si->page_kaddr = kmap_local_page(si->xfpage.page); + return 0; +} + +/* Release a page we grabbed for sorting records. */ +static inline int +xfarray_sort_put_page( + struct xfarray_sortinfo *si) +{ + if (!si->page_kaddr) + return 0; + + kunmap_local(si->page_kaddr); + si->page_kaddr = NULL; + + return xfile_put_page(si->array->xfile, &si->xfpage); +} + +/* Decide if these records are eligible for in-page sorting. */ +static inline bool +xfarray_want_pagesort( + struct xfarray_sortinfo *si, + xfarray_idx_t lo, + xfarray_idx_t hi) +{ + pgoff_t lo_page; + pgoff_t hi_page; + loff_t end_pos; + + /* We can only map one page at a time. */ + lo_page = xfarray_pos(si->array, lo) >> PAGE_SHIFT; + end_pos = xfarray_pos(si->array, hi) + si->array->obj_size - 1; + hi_page = end_pos >> PAGE_SHIFT; + + return lo_page == hi_page; +} + +/* Sort a bunch of records that all live in the same memory page. */ +STATIC int +xfarray_pagesort( + struct xfarray_sortinfo *si, + xfarray_idx_t lo, + xfarray_idx_t hi) +{ + void *startp; + loff_t lo_pos = xfarray_pos(si->array, lo); + uint64_t len = xfarray_pos(si->array, hi - lo); + int error = 0; + + trace_xfarray_pagesort(si, lo, hi); + + xfarray_sort_bump_loads(si); + error = xfarray_sort_get_page(si, lo_pos, len); + if (error) + return error; + + xfarray_sort_bump_heapsorts(si); + startp = si->page_kaddr + offset_in_page(lo_pos); + sort(startp, hi - lo + 1, si->array->obj_size, si->cmp_fn, NULL); + + xfarray_sort_bump_stores(si); + return xfarray_sort_put_page(si); +} + +/* Return a pointer to the xfarray pivot record within the sortinfo struct. */ +static inline void *xfarray_sortinfo_pivot(struct xfarray_sortinfo *si) +{ + return xfarray_sortinfo_hi(si) + si->max_stack_depth; +} + +/* Return a pointer to the start of the pivot array. */ +static inline void * +xfarray_sortinfo_pivot_array( + struct xfarray_sortinfo *si) +{ + return xfarray_sortinfo_pivot(si) + si->array->obj_size; +} + +/* The xfarray record is stored at the start of each pivot array element. */ +static inline void * +xfarray_pivot_array_rec( + void *pa, + size_t pa_recsz, + unsigned int pa_idx) +{ + return pa + (pa_recsz * pa_idx); +} + +/* The xfarray index is stored at the end of each pivot array element. */ +static inline xfarray_idx_t * +xfarray_pivot_array_idx( + void *pa, + size_t pa_recsz, + unsigned int pa_idx) +{ + return xfarray_pivot_array_rec(pa, pa_recsz, pa_idx + 1) - + sizeof(xfarray_idx_t); +} + +/* + * Find a pivot value for quicksort partitioning, swap it with a[lo], and save + * the cached pivot record for the next step. + * + * Load evenly-spaced records within the given range into memory, sort them, + * and choose the pivot from the median record. Using multiple points will + * improve the quality of the pivot selection, and hopefully avoid the worst + * quicksort behavior, since our array values are nearly always evenly sorted. + */ +STATIC int +xfarray_qsort_pivot( + struct xfarray_sortinfo *si, + xfarray_idx_t lo, + xfarray_idx_t hi) +{ + void *pivot = xfarray_sortinfo_pivot(si); + void *parray = xfarray_sortinfo_pivot_array(si); + void *recp; + xfarray_idx_t *idxp; + xfarray_idx_t step = (hi - lo) / (XFARRAY_QSORT_PIVOT_NR - 1); + size_t pivot_rec_sz = xfarray_pivot_rec_sz(si->array); + int i, j; + int error; + + ASSERT(step > 0); + + /* + * Load the xfarray indexes of the records we intend to sample into the + * pivot array. + */ + idxp = xfarray_pivot_array_idx(parray, pivot_rec_sz, 0); + *idxp = lo; + for (i = 1; i < XFARRAY_QSORT_PIVOT_NR - 1; i++) { + idxp = xfarray_pivot_array_idx(parray, pivot_rec_sz, i); + *idxp = lo + (i * step); + } + idxp = xfarray_pivot_array_idx(parray, pivot_rec_sz, + XFARRAY_QSORT_PIVOT_NR - 1); + *idxp = hi; + + /* Load the selected xfarray records into the pivot array. */ + for (i = 0; i < XFARRAY_QSORT_PIVOT_NR; i++) { + xfarray_idx_t idx; + + recp = xfarray_pivot_array_rec(parray, pivot_rec_sz, i); + idxp = xfarray_pivot_array_idx(parray, pivot_rec_sz, i); + + /* No unset records; load directly into the array. */ + if (likely(si->array->unset_slots == 0)) { + error = xfarray_sort_load(si, *idxp, recp); + if (error) + return error; + continue; + } + + /* + * Load non-null records into the scratchpad without changing + * the xfarray_idx_t in the pivot array. + */ + idx = *idxp; + xfarray_sort_bump_loads(si); + error = xfarray_load_next(si->array, &idx, recp); + if (error) + return error; + } + + xfarray_sort_bump_heapsorts(si); + sort(parray, XFARRAY_QSORT_PIVOT_NR, pivot_rec_sz, si->cmp_fn, NULL); + + /* + * We sorted the pivot array records (which includes the xfarray + * indices) in xfarray record order. The median element of the pivot + * array contains the xfarray record that we will use as the pivot. + * Copy that xfarray record to the designated space. + */ + recp = xfarray_pivot_array_rec(parray, pivot_rec_sz, + XFARRAY_QSORT_PIVOT_NR / 2); + memcpy(pivot, recp, si->array->obj_size); + + /* If the pivot record we chose was already in a[lo] then we're done. */ + idxp = xfarray_pivot_array_idx(parray, pivot_rec_sz, + XFARRAY_QSORT_PIVOT_NR / 2); + if (*idxp == lo) + return 0; + + /* + * Find the cached copy of a[lo] in the pivot array so that we can swap + * a[lo] and a[pivot]. + */ + for (i = 0, j = -1; i < XFARRAY_QSORT_PIVOT_NR; i++) { + idxp = xfarray_pivot_array_idx(parray, pivot_rec_sz, i); + if (*idxp == lo) + j = i; + } + if (j < 0) { + ASSERT(j >= 0); + return -EFSCORRUPTED; + } + + /* Swap a[lo] and a[pivot]. */ + error = xfarray_sort_store(si, lo, pivot); + if (error) + return error; + + recp = xfarray_pivot_array_rec(parray, pivot_rec_sz, j); + idxp = xfarray_pivot_array_idx(parray, pivot_rec_sz, + XFARRAY_QSORT_PIVOT_NR / 2); + return xfarray_sort_store(si, *idxp, recp); +} + +/* + * Set up the pointers for the next iteration. We push onto the stack all of + * the unsorted values between a[lo + 1] and a[end[i]], and we tweak the + * current stack frame to point to the unsorted values between a[beg[i]] and + * a[lo] so that those values will be sorted when we pop the stack. + */ +static inline int +xfarray_qsort_push( + struct xfarray_sortinfo *si, + xfarray_idx_t *si_lo, + xfarray_idx_t *si_hi, + xfarray_idx_t lo, + xfarray_idx_t hi) +{ + /* Check for stack overflows */ + if (si->stack_depth >= si->max_stack_depth - 1) { + ASSERT(si->stack_depth < si->max_stack_depth - 1); + return -EFSCORRUPTED; + } + + si->max_stack_used = max_t(uint8_t, si->max_stack_used, + si->stack_depth + 2); + + si_lo[si->stack_depth + 1] = lo + 1; + si_hi[si->stack_depth + 1] = si_hi[si->stack_depth]; + si_hi[si->stack_depth++] = lo - 1; + + /* + * Always start with the smaller of the two partitions to keep the + * amount of recursion in check. + */ + if (si_hi[si->stack_depth] - si_lo[si->stack_depth] > + si_hi[si->stack_depth - 1] - si_lo[si->stack_depth - 1]) { + swap(si_lo[si->stack_depth], si_lo[si->stack_depth - 1]); + swap(si_hi[si->stack_depth], si_hi[si->stack_depth - 1]); + } + + return 0; +} + +/* + * Load an element from the array into the first scratchpad and cache the page, + * if possible. + */ +static inline int +xfarray_sort_load_cached( + struct xfarray_sortinfo *si, + xfarray_idx_t idx, + void *ptr) +{ + loff_t idx_pos = xfarray_pos(si->array, idx); + pgoff_t startpage; + pgoff_t endpage; + int error = 0; + + /* + * If this load would split a page, release the cached page, if any, + * and perform a traditional read. + */ + startpage = idx_pos >> PAGE_SHIFT; + endpage = (idx_pos + si->array->obj_size - 1) >> PAGE_SHIFT; + if (startpage != endpage) { + error = xfarray_sort_put_page(si); + if (error) + return error; + + if (xfarray_sort_terminated(si, &error)) + return error; + + return xfile_obj_load(si->array->xfile, ptr, + si->array->obj_size, idx_pos); + } + + /* If the cached page is not the one we want, release it. */ + if (xfile_page_cached(&si->xfpage) && + xfile_page_index(&si->xfpage) != startpage) { + error = xfarray_sort_put_page(si); + if (error) + return error; + } + + /* + * If we don't have a cached page (and we know the load is contained + * in a single page) then grab it. + */ + if (!xfile_page_cached(&si->xfpage)) { + if (xfarray_sort_terminated(si, &error)) + return error; + + error = xfarray_sort_get_page(si, startpage << PAGE_SHIFT, + PAGE_SIZE); + if (error) + return error; + } + + memcpy(ptr, si->page_kaddr + offset_in_page(idx_pos), + si->array->obj_size); + return 0; +} + +/* + * Sort the array elements via quicksort. This implementation incorporates + * four optimizations discussed in Sedgewick: + * + * 1. Use an explicit stack of array indices to store the next array partition + * to sort. This helps us to avoid recursion in the call stack, which is + * particularly expensive in the kernel. + * + * 2. For arrays with records in arbitrary or user-controlled order, choose the + * pivot element using a median-of-nine decision tree. This reduces the + * probability of selecting a bad pivot value which causes worst case + * behavior (i.e. partition sizes of 1). + * + * 3. The smaller of the two sub-partitions is pushed onto the stack to start + * the next level of recursion, and the larger sub-partition replaces the + * current stack frame. This guarantees that we won't need more than + * log2(nr) stack space. + * + * 4. For small sets, load the records into the scratchpad and run heapsort on + * them because that is very fast. In the author's experience, this yields + * a ~10% reduction in runtime. + * + * If a small set is contained entirely within a single xfile memory page, + * map the page directly and run heap sort directly on the xfile page + * instead of using the load/store interface. This halves the runtime. + * + * 5. This optimization is specific to the implementation. When converging lo + * and hi after selecting a pivot, we will try to retain the xfile memory + * page between load calls, which reduces run time by 50%. + */ + +/* + * Due to the use of signed indices, we can only support up to 2^63 records. + * Files can only grow to 2^63 bytes, so this is not much of a limitation. + */ +#define QSORT_MAX_RECS (1ULL << 63) + +int +xfarray_sort( + struct xfarray *array, + xfarray_cmp_fn cmp_fn, + unsigned int flags) +{ + struct xfarray_sortinfo *si; + xfarray_idx_t *si_lo, *si_hi; + void *pivot; + void *scratch = xfarray_scratch(array); + xfarray_idx_t lo, hi; + int error = 0; + + if (array->nr < 2) + return 0; + if (array->nr >= QSORT_MAX_RECS) + return -E2BIG; + + error = xfarray_sortinfo_alloc(array, cmp_fn, flags, &si); + if (error) + return error; + si_lo = xfarray_sortinfo_lo(si); + si_hi = xfarray_sortinfo_hi(si); + pivot = xfarray_sortinfo_pivot(si); + + while (si->stack_depth >= 0) { + lo = si_lo[si->stack_depth]; + hi = si_hi[si->stack_depth]; + + trace_xfarray_qsort(si, lo, hi); + + /* Nothing left in this partition to sort; pop stack. */ + if (lo >= hi) { + si->stack_depth--; + continue; + } + + /* + * If directly mapping the page and sorting can solve our + * problems, we're done. + */ + if (xfarray_want_pagesort(si, lo, hi)) { + error = xfarray_pagesort(si, lo, hi); + if (error) + goto out_free; + si->stack_depth--; + continue; + } + + /* If insertion sort can solve our problems, we're done. */ + if (xfarray_want_isort(si, lo, hi)) { + error = xfarray_isort(si, lo, hi); + if (error) + goto out_free; + si->stack_depth--; + continue; + } + + /* Pick a pivot, move it to a[lo] and stash it. */ + error = xfarray_qsort_pivot(si, lo, hi); + if (error) + goto out_free; + + /* + * Rearrange a[lo..hi] such that everything smaller than the + * pivot is on the left side of the range and everything larger + * than the pivot is on the right side of the range. + */ + while (lo < hi) { + /* + * Decrement hi until it finds an a[hi] less than the + * pivot value. + */ + error = xfarray_sort_load_cached(si, hi, scratch); + if (error) + goto out_free; + while (xfarray_sort_cmp(si, scratch, pivot) >= 0 && + lo < hi) { + hi--; + error = xfarray_sort_load_cached(si, hi, + scratch); + if (error) + goto out_free; + } + error = xfarray_sort_put_page(si); + if (error) + goto out_free; + + if (xfarray_sort_terminated(si, &error)) + goto out_free; + + /* Copy that item (a[hi]) to a[lo]. */ + if (lo < hi) { + error = xfarray_sort_store(si, lo++, scratch); + if (error) + goto out_free; + } + + /* + * Increment lo until it finds an a[lo] greater than + * the pivot value. + */ + error = xfarray_sort_load_cached(si, lo, scratch); + if (error) + goto out_free; + while (xfarray_sort_cmp(si, scratch, pivot) <= 0 && + lo < hi) { + lo++; + error = xfarray_sort_load_cached(si, lo, + scratch); + if (error) + goto out_free; + } + error = xfarray_sort_put_page(si); + if (error) + goto out_free; + + if (xfarray_sort_terminated(si, &error)) + goto out_free; + + /* Copy that item (a[lo]) to a[hi]. */ + if (lo < hi) { + error = xfarray_sort_store(si, hi--, scratch); + if (error) + goto out_free; + } + + if (xfarray_sort_terminated(si, &error)) + goto out_free; + } + + /* + * Put our pivot value in the correct place at a[lo]. All + * values between a[beg[i]] and a[lo - 1] should be less than + * the pivot; and all values between a[lo + 1] and a[end[i]-1] + * should be greater than the pivot. + */ + error = xfarray_sort_store(si, lo, pivot); + if (error) + goto out_free; + + /* Set up the stack frame to process the two partitions. */ + error = xfarray_qsort_push(si, si_lo, si_hi, lo, hi); + if (error) + goto out_free; + + if (xfarray_sort_terminated(si, &error)) + goto out_free; + } + +out_free: + trace_xfarray_sort_stats(si, error); + kvfree(si); + return error; +} diff --git a/fs/xfs/scrub/xfarray.h b/fs/xfs/scrub/xfarray.h new file mode 100644 index 000000000000..4ecac01363d9 --- /dev/null +++ b/fs/xfs/scrub/xfarray.h @@ -0,0 +1,141 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Copyright (C) 2021-2023 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> + */ +#ifndef __XFS_SCRUB_XFARRAY_H__ +#define __XFS_SCRUB_XFARRAY_H__ + +/* xfile array index type, along with cursor initialization */ +typedef uint64_t xfarray_idx_t; +#define XFARRAY_CURSOR_INIT ((__force xfarray_idx_t)0) + +/* Iterate each index of an xfile array. */ +#define foreach_xfarray_idx(array, idx) \ + for ((idx) = XFARRAY_CURSOR_INIT; \ + (idx) < xfarray_length(array); \ + (idx)++) + +struct xfarray { + /* Underlying file that backs the array. */ + struct xfile *xfile; + + /* Number of array elements. */ + xfarray_idx_t nr; + + /* Maximum possible array size. */ + xfarray_idx_t max_nr; + + /* Number of unset slots in the array below @nr. */ + uint64_t unset_slots; + + /* Size of an array element. */ + size_t obj_size; + + /* log2 of array element size, if possible. */ + int obj_size_log; +}; + +int xfarray_create(const char *descr, unsigned long long required_capacity, + size_t obj_size, struct xfarray **arrayp); +void xfarray_destroy(struct xfarray *array); +int xfarray_load(struct xfarray *array, xfarray_idx_t idx, void *ptr); +int xfarray_unset(struct xfarray *array, xfarray_idx_t idx); +int xfarray_store(struct xfarray *array, xfarray_idx_t idx, const void *ptr); +int xfarray_store_anywhere(struct xfarray *array, const void *ptr); +bool xfarray_element_is_null(struct xfarray *array, const void *ptr); + +/* Append an element to the array. */ +static inline int xfarray_append(struct xfarray *array, const void *ptr) +{ + return xfarray_store(array, array->nr, ptr); +} + +uint64_t xfarray_length(struct xfarray *array); +int xfarray_load_next(struct xfarray *array, xfarray_idx_t *idx, void *rec); + +/* Declarations for xfile array sort functionality. */ + +typedef cmp_func_t xfarray_cmp_fn; + +/* Perform an in-memory heapsort for small subsets. */ +#define XFARRAY_ISORT_SHIFT (4) +#define XFARRAY_ISORT_NR (1U << XFARRAY_ISORT_SHIFT) + +/* Evalulate this many points to find the qsort pivot. */ +#define XFARRAY_QSORT_PIVOT_NR (9) + +struct xfarray_sortinfo { + struct xfarray *array; + + /* Comparison function for the sort. */ + xfarray_cmp_fn cmp_fn; + + /* Maximum height of the partition stack. */ + uint8_t max_stack_depth; + + /* Current height of the partition stack. */ + int8_t stack_depth; + + /* Maximum stack depth ever used. */ + uint8_t max_stack_used; + + /* XFARRAY_SORT_* flags; see below. */ + unsigned int flags; + + /* Cache a page here for faster access. */ + struct xfile_page xfpage; + void *page_kaddr; + +#ifdef DEBUG + /* Performance statistics. */ + uint64_t loads; + uint64_t stores; + uint64_t compares; + uint64_t heapsorts; +#endif + /* + * Extra bytes are allocated beyond the end of the structure to store + * quicksort information. C does not permit multiple VLAs per struct, + * so we document all of this in a comment. + * + * Pretend that we have a typedef for array records: + * + * typedef char[array->obj_size] xfarray_rec_t; + * + * First comes the quicksort partition stack: + * + * xfarray_idx_t lo[max_stack_depth]; + * xfarray_idx_t hi[max_stack_depth]; + * + * union { + * + * If for a given subset we decide to use an in-memory sort, we use a + * block of scratchpad records here to compare items: + * + * xfarray_rec_t scratch[ISORT_NR]; + * + * Otherwise, we want to partition the records to partition the array. + * We store the chosen pivot record at the start of the scratchpad area + * and use the rest to sample some records to estimate the median. + * The format of the qsort_pivot array enables us to use the kernel + * heapsort function to place the median value in the middle. + * + * struct { + * xfarray_rec_t pivot; + * struct { + * xfarray_rec_t rec; (rounded up to 8 bytes) + * xfarray_idx_t idx; + * } qsort_pivot[QSORT_PIVOT_NR]; + * }; + * } + */ +}; + +/* Sort can be interrupted by a fatal signal. */ +#define XFARRAY_SORT_KILLABLE (1U << 0) + +int xfarray_sort(struct xfarray *array, xfarray_cmp_fn cmp_fn, + unsigned int flags); + +#endif /* __XFS_SCRUB_XFARRAY_H__ */ diff --git a/fs/xfs/scrub/xfile.c b/fs/xfs/scrub/xfile.c new file mode 100644 index 000000000000..090c3ead43fd --- /dev/null +++ b/fs/xfs/scrub/xfile.c @@ -0,0 +1,419 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2018-2023 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "scrub/xfile.h" +#include "scrub/xfarray.h" +#include "scrub/scrub.h" +#include "scrub/trace.h" +#include <linux/shmem_fs.h> + +/* + * Swappable Temporary Memory + * ========================== + * + * Online checking sometimes needs to be able to stage a large amount of data + * in memory. This information might not fit in the available memory and it + * doesn't all need to be accessible at all times. In other words, we want an + * indexed data buffer to store data that can be paged out. + * + * When CONFIG_TMPFS=y, shmemfs is enough of a filesystem to meet those + * requirements. Therefore, the xfile mechanism uses an unlinked shmem file to + * store our staging data. This file is not installed in the file descriptor + * table so that user programs cannot access the data, which means that the + * xfile must be freed with xfile_destroy. + * + * xfiles assume that the caller will handle all required concurrency + * management; standard vfs locks (freezer and inode) are not taken. Reads + * and writes are satisfied directly from the page cache. + * + * NOTE: The current shmemfs implementation has a quirk that in-kernel reads + * of a hole cause a page to be mapped into the file. If you are going to + * create a sparse xfile, please be careful about reading from uninitialized + * parts of the file. These pages are !Uptodate and will eventually be + * reclaimed if not written, but in the short term this boosts memory + * consumption. + */ + +/* + * xfiles must not be exposed to userspace and require upper layers to + * coordinate access to the one handle returned by the constructor, so + * establish a separate lock class for xfiles to avoid confusing lockdep. + */ +static struct lock_class_key xfile_i_mutex_key; + +/* + * Create an xfile of the given size. The description will be used in the + * trace output. + */ +int +xfile_create( + const char *description, + loff_t isize, + struct xfile **xfilep) +{ + struct inode *inode; + struct xfile *xf; + int error = -ENOMEM; + + xf = kmalloc(sizeof(struct xfile), XCHK_GFP_FLAGS); + if (!xf) + return -ENOMEM; + + xf->file = shmem_file_setup(description, isize, 0); + if (!xf->file) + goto out_xfile; + if (IS_ERR(xf->file)) { + error = PTR_ERR(xf->file); + goto out_xfile; + } + + /* + * We want a large sparse file that we can pread, pwrite, and seek. + * xfile users are responsible for keeping the xfile hidden away from + * all other callers, so we skip timestamp updates and security checks. + * Make the inode only accessible by root, just in case the xfile ever + * escapes. + */ + xf->file->f_mode |= FMODE_PREAD | FMODE_PWRITE | FMODE_NOCMTIME | + FMODE_LSEEK; + xf->file->f_flags |= O_RDWR | O_LARGEFILE | O_NOATIME; + inode = file_inode(xf->file); + inode->i_flags |= S_PRIVATE | S_NOCMTIME | S_NOATIME; + inode->i_mode &= ~0177; + inode->i_uid = GLOBAL_ROOT_UID; + inode->i_gid = GLOBAL_ROOT_GID; + + lockdep_set_class(&inode->i_rwsem, &xfile_i_mutex_key); + + trace_xfile_create(xf); + + *xfilep = xf; + return 0; +out_xfile: + kfree(xf); + return error; +} + +/* Close the file and release all resources. */ +void +xfile_destroy( + struct xfile *xf) +{ + struct inode *inode = file_inode(xf->file); + + trace_xfile_destroy(xf); + + lockdep_set_class(&inode->i_rwsem, &inode->i_sb->s_type->i_mutex_key); + fput(xf->file); + kfree(xf); +} + +/* + * Read a memory object directly from the xfile's page cache. Unlike regular + * pread, we return -E2BIG and -EFBIG for reads that are too large or at too + * high an offset, instead of truncating the read. Otherwise, we return + * bytes read or an error code, like regular pread. + */ +ssize_t +xfile_pread( + struct xfile *xf, + void *buf, + size_t count, + loff_t pos) +{ + struct inode *inode = file_inode(xf->file); + struct address_space *mapping = inode->i_mapping; + struct page *page = NULL; + ssize_t read = 0; + unsigned int pflags; + int error = 0; + + if (count > MAX_RW_COUNT) + return -E2BIG; + if (inode->i_sb->s_maxbytes - pos < count) + return -EFBIG; + + trace_xfile_pread(xf, pos, count); + + pflags = memalloc_nofs_save(); + while (count > 0) { + void *p, *kaddr; + unsigned int len; + + len = min_t(ssize_t, count, PAGE_SIZE - offset_in_page(pos)); + + /* + * In-kernel reads of a shmem file cause it to allocate a page + * if the mapping shows a hole. Therefore, if we hit ENOMEM + * we can continue by zeroing the caller's buffer. + */ + page = shmem_read_mapping_page_gfp(mapping, pos >> PAGE_SHIFT, + __GFP_NOWARN); + if (IS_ERR(page)) { + error = PTR_ERR(page); + if (error != -ENOMEM) + break; + + memset(buf, 0, len); + goto advance; + } + + if (PageUptodate(page)) { + /* + * xfile pages must never be mapped into userspace, so + * we skip the dcache flush. + */ + kaddr = kmap_local_page(page); + p = kaddr + offset_in_page(pos); + memcpy(buf, p, len); + kunmap_local(kaddr); + } else { + memset(buf, 0, len); + } + put_page(page); + +advance: + count -= len; + pos += len; + buf += len; + read += len; + } + memalloc_nofs_restore(pflags); + + if (read > 0) + return read; + return error; +} + +/* + * Write a memory object directly to the xfile's page cache. Unlike regular + * pwrite, we return -E2BIG and -EFBIG for writes that are too large or at too + * high an offset, instead of truncating the write. Otherwise, we return + * bytes written or an error code, like regular pwrite. + */ +ssize_t +xfile_pwrite( + struct xfile *xf, + const void *buf, + size_t count, + loff_t pos) +{ + struct inode *inode = file_inode(xf->file); + struct address_space *mapping = inode->i_mapping; + const struct address_space_operations *aops = mapping->a_ops; + struct page *page = NULL; + ssize_t written = 0; + unsigned int pflags; + int error = 0; + + if (count > MAX_RW_COUNT) + return -E2BIG; + if (inode->i_sb->s_maxbytes - pos < count) + return -EFBIG; + + trace_xfile_pwrite(xf, pos, count); + + pflags = memalloc_nofs_save(); + while (count > 0) { + void *fsdata = NULL; + void *p, *kaddr; + unsigned int len; + int ret; + + len = min_t(ssize_t, count, PAGE_SIZE - offset_in_page(pos)); + + /* + * We call write_begin directly here to avoid all the freezer + * protection lock-taking that happens in the normal path. + * shmem doesn't support fs freeze, but lockdep doesn't know + * that and will trip over that. + */ + error = aops->write_begin(NULL, mapping, pos, len, &page, + &fsdata); + if (error) + break; + + /* + * xfile pages must never be mapped into userspace, so we skip + * the dcache flush. If the page is not uptodate, zero it + * before writing data. + */ + kaddr = kmap_local_page(page); + if (!PageUptodate(page)) { + memset(kaddr, 0, PAGE_SIZE); + SetPageUptodate(page); + } + p = kaddr + offset_in_page(pos); + memcpy(p, buf, len); + kunmap_local(kaddr); + + ret = aops->write_end(NULL, mapping, pos, len, len, page, + fsdata); + if (ret < 0) { + error = ret; + break; + } + + written += ret; + if (ret != len) + break; + + count -= ret; + pos += ret; + buf += ret; + } + memalloc_nofs_restore(pflags); + + if (written > 0) + return written; + return error; +} + +/* Find the next written area in the xfile data for a given offset. */ +loff_t +xfile_seek_data( + struct xfile *xf, + loff_t pos) +{ + loff_t ret; + + ret = vfs_llseek(xf->file, pos, SEEK_DATA); + trace_xfile_seek_data(xf, pos, ret); + return ret; +} + +/* Query stat information for an xfile. */ +int +xfile_stat( + struct xfile *xf, + struct xfile_stat *statbuf) +{ + struct kstat ks; + int error; + + error = vfs_getattr_nosec(&xf->file->f_path, &ks, + STATX_SIZE | STATX_BLOCKS, AT_STATX_DONT_SYNC); + if (error) + return error; + + statbuf->size = ks.size; + statbuf->bytes = ks.blocks << SECTOR_SHIFT; + return 0; +} + +/* + * Grab the (locked) page for a memory object. The object cannot span a page + * boundary. Returns 0 (and a locked page) if successful, -ENOTBLK if we + * cannot grab the page, or the usual negative errno. + */ +int +xfile_get_page( + struct xfile *xf, + loff_t pos, + unsigned int len, + struct xfile_page *xfpage) +{ + struct inode *inode = file_inode(xf->file); + struct address_space *mapping = inode->i_mapping; + const struct address_space_operations *aops = mapping->a_ops; + struct page *page = NULL; + void *fsdata = NULL; + loff_t key = round_down(pos, PAGE_SIZE); + unsigned int pflags; + int error; + + if (inode->i_sb->s_maxbytes - pos < len) + return -ENOMEM; + if (len > PAGE_SIZE - offset_in_page(pos)) + return -ENOTBLK; + + trace_xfile_get_page(xf, pos, len); + + pflags = memalloc_nofs_save(); + + /* + * We call write_begin directly here to avoid all the freezer + * protection lock-taking that happens in the normal path. shmem + * doesn't support fs freeze, but lockdep doesn't know that and will + * trip over that. + */ + error = aops->write_begin(NULL, mapping, key, PAGE_SIZE, &page, + &fsdata); + if (error) + goto out_pflags; + + /* We got the page, so make sure we push out EOF. */ + if (i_size_read(inode) < pos + len) + i_size_write(inode, pos + len); + + /* + * If the page isn't up to date, fill it with zeroes before we hand it + * to the caller and make sure the backing store will hold on to them. + */ + if (!PageUptodate(page)) { + void *kaddr; + + kaddr = kmap_local_page(page); + memset(kaddr, 0, PAGE_SIZE); + kunmap_local(kaddr); + SetPageUptodate(page); + } + + /* + * Mark each page dirty so that the contents are written to some + * backing store when we drop this buffer, and take an extra reference + * to prevent the xfile page from being swapped or removed from the + * page cache by reclaim if the caller unlocks the page. + */ + set_page_dirty(page); + get_page(page); + + xfpage->page = page; + xfpage->fsdata = fsdata; + xfpage->pos = key; +out_pflags: + memalloc_nofs_restore(pflags); + return error; +} + +/* + * Release the (locked) page for a memory object. Returns 0 or a negative + * errno. + */ +int +xfile_put_page( + struct xfile *xf, + struct xfile_page *xfpage) +{ + struct inode *inode = file_inode(xf->file); + struct address_space *mapping = inode->i_mapping; + const struct address_space_operations *aops = mapping->a_ops; + unsigned int pflags; + int ret; + + trace_xfile_put_page(xf, xfpage->pos, PAGE_SIZE); + + /* Give back the reference that we took in xfile_get_page. */ + put_page(xfpage->page); + + pflags = memalloc_nofs_save(); + ret = aops->write_end(NULL, mapping, xfpage->pos, PAGE_SIZE, PAGE_SIZE, + xfpage->page, xfpage->fsdata); + memalloc_nofs_restore(pflags); + memset(xfpage, 0, sizeof(struct xfile_page)); + + if (ret < 0) + return ret; + if (ret != PAGE_SIZE) + return -EIO; + return 0; +} diff --git a/fs/xfs/scrub/xfile.h b/fs/xfs/scrub/xfile.h new file mode 100644 index 000000000000..d56643b0f429 --- /dev/null +++ b/fs/xfs/scrub/xfile.h @@ -0,0 +1,77 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Copyright (C) 2018-2023 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> + */ +#ifndef __XFS_SCRUB_XFILE_H__ +#define __XFS_SCRUB_XFILE_H__ + +struct xfile_page { + struct page *page; + void *fsdata; + loff_t pos; +}; + +static inline bool xfile_page_cached(const struct xfile_page *xfpage) +{ + return xfpage->page != NULL; +} + +static inline pgoff_t xfile_page_index(const struct xfile_page *xfpage) +{ + return xfpage->page->index; +} + +struct xfile { + struct file *file; +}; + +int xfile_create(const char *description, loff_t isize, struct xfile **xfilep); +void xfile_destroy(struct xfile *xf); + +ssize_t xfile_pread(struct xfile *xf, void *buf, size_t count, loff_t pos); +ssize_t xfile_pwrite(struct xfile *xf, const void *buf, size_t count, + loff_t pos); + +/* + * Load an object. Since we're treating this file as "memory", any error or + * short IO is treated as a failure to allocate memory. + */ +static inline int +xfile_obj_load(struct xfile *xf, void *buf, size_t count, loff_t pos) +{ + ssize_t ret = xfile_pread(xf, buf, count, pos); + + if (ret < 0 || ret != count) + return -ENOMEM; + return 0; +} + +/* + * Store an object. Since we're treating this file as "memory", any error or + * short IO is treated as a failure to allocate memory. + */ +static inline int +xfile_obj_store(struct xfile *xf, const void *buf, size_t count, loff_t pos) +{ + ssize_t ret = xfile_pwrite(xf, buf, count, pos); + + if (ret < 0 || ret != count) + return -ENOMEM; + return 0; +} + +loff_t xfile_seek_data(struct xfile *xf, loff_t pos); + +struct xfile_stat { + loff_t size; + unsigned long long bytes; +}; + +int xfile_stat(struct xfile *xf, struct xfile_stat *statbuf); + +int xfile_get_page(struct xfile *xf, loff_t offset, unsigned int len, + struct xfile_page *xbuf); +int xfile_put_page(struct xfile *xf, struct xfile_page *xbuf); + +#endif /* __XFS_SCRUB_XFILE_H__ */ diff --git a/fs/xfs/xfs_acl.c b/fs/xfs/xfs_acl.c index 791db7d9c849..6b840301817a 100644 --- a/fs/xfs/xfs_acl.c +++ b/fs/xfs/xfs_acl.c @@ -233,7 +233,7 @@ xfs_acl_set_mode( xfs_ilock(ip, XFS_ILOCK_EXCL); xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); inode->i_mode = mode; - inode->i_ctime = current_time(inode); + inode_set_ctime_current(inode); xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); if (xfs_has_wsync(mp)) diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 451942fb38ec..465d7630bb21 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -478,7 +478,7 @@ xfs_discard_folio( folio, ip->i_ino, pos); /* - * The end of the punch range is always the offset of the the first + * The end of the punch range is always the offset of the first * byte of the next folio. Hence the end offset is only dependent on the * folio itself and not the start offset that is passed in. */ @@ -578,7 +578,7 @@ const struct address_space_operations xfs_address_space_operations = { .read_folio = xfs_vm_read_folio, .readahead = xfs_vm_readahead, .writepages = xfs_vm_writepages, - .dirty_folio = filemap_dirty_folio, + .dirty_folio = iomap_dirty_folio, .release_folio = iomap_release_folio, .invalidate_folio = iomap_invalidate_folio, .bmap = xfs_vm_bmap, diff --git a/fs/xfs/xfs_attr_inactive.c b/fs/xfs/xfs_attr_inactive.c index 5db87b34fb6e..89c7a9f4f930 100644 --- a/fs/xfs/xfs_attr_inactive.c +++ b/fs/xfs/xfs_attr_inactive.c @@ -333,7 +333,6 @@ xfs_attr_inactive( int error = 0; mp = dp->i_mount; - ASSERT(! XFS_NOT_DQATTACHED(mp, dp)); xfs_ilock(dp, lock_mode); if (!xfs_inode_has_attr_fork(dp)) diff --git a/fs/xfs/xfs_attr_item.c b/fs/xfs/xfs_attr_item.c index 2788a6f2edcd..36fe2abb16e6 100644 --- a/fs/xfs/xfs_attr_item.c +++ b/fs/xfs/xfs_attr_item.c @@ -547,7 +547,7 @@ xfs_attri_item_recover( struct xfs_inode *ip; struct xfs_da_args *args; struct xfs_trans *tp; - struct xfs_trans_res tres; + struct xfs_trans_res resv; struct xfs_attri_log_format *attrp; struct xfs_attri_log_nameval *nv = attrip->attri_nameval; int error; @@ -618,8 +618,9 @@ xfs_attri_item_recover( goto out; } - xfs_init_attr_trans(args, &tres, &total); - error = xfs_trans_alloc(mp, &tres, total, 0, XFS_TRANS_RESERVE, &tp); + xfs_init_attr_trans(args, &resv, &total); + resv = xlog_recover_resv(&resv); + error = xfs_trans_alloc(mp, &resv, total, 0, XFS_TRANS_RESERVE, &tp); if (error) goto out; diff --git a/fs/xfs/xfs_bmap_item.c b/fs/xfs/xfs_bmap_item.c index 7551c3ec4ea5..e736a0844c89 100644 --- a/fs/xfs/xfs_bmap_item.c +++ b/fs/xfs/xfs_bmap_item.c @@ -490,6 +490,7 @@ xfs_bui_item_recover( struct list_head *capture_list) { struct xfs_bmap_intent fake = { }; + struct xfs_trans_res resv; struct xfs_bui_log_item *buip = BUI_ITEM(lip); struct xfs_trans *tp; struct xfs_inode *ip = NULL; @@ -515,7 +516,8 @@ xfs_bui_item_recover( return error; /* Allocate transaction and do the work. */ - error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, + resv = xlog_recover_resv(&M_RES(mp)->tr_itruncate); + error = xfs_trans_alloc(mp, &resv, XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK), 0, 0, &tp); if (error) goto err_rele; diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index fbb675563208..731260a5af6d 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -28,6 +28,7 @@ #include "xfs_icache.h" #include "xfs_iomap.h" #include "xfs_reflink.h" +#include "xfs_rtbitmap.h" /* Kernel only BMAP related definitions and functions */ @@ -75,28 +76,28 @@ xfs_bmap_rtalloc( { struct xfs_mount *mp = ap->ip->i_mount; xfs_fileoff_t orig_offset = ap->offset; - xfs_rtblock_t rtb; - xfs_extlen_t prod = 0; /* product factor for allocators */ + xfs_rtxnum_t rtx; + xfs_rtxlen_t prod = 0; /* product factor for allocators */ xfs_extlen_t mod = 0; /* product factor for allocators */ - xfs_extlen_t ralen = 0; /* realtime allocation length */ + xfs_rtxlen_t ralen = 0; /* realtime allocation length */ xfs_extlen_t align; /* minimum allocation alignment */ xfs_extlen_t orig_length = ap->length; xfs_extlen_t minlen = mp->m_sb.sb_rextsize; - xfs_extlen_t raminlen; + xfs_rtxlen_t raminlen; bool rtlocked = false; bool ignore_locality = false; int error; align = xfs_get_extsz_hint(ap->ip); retry: - prod = align / mp->m_sb.sb_rextsize; + prod = xfs_extlen_to_rtxlen(mp, align); error = xfs_bmap_extsize_align(mp, &ap->got, &ap->prev, align, 1, ap->eof, 0, ap->conv, &ap->offset, &ap->length); if (error) return error; ASSERT(ap->length); - ASSERT(ap->length % mp->m_sb.sb_rextsize == 0); + ASSERT(xfs_extlen_to_rtxmod(mp, ap->length) == 0); /* * If we shifted the file offset downward to satisfy an extent size @@ -116,17 +117,14 @@ retry: prod = 1; /* * Set ralen to be the actual requested length in rtextents. - */ - ralen = ap->length / mp->m_sb.sb_rextsize; - /* + * * If the old value was close enough to XFS_BMBT_MAX_EXTLEN that * we rounded up to it, cut it back so it's valid again. * Note that if it's a really large request (bigger than * XFS_BMBT_MAX_EXTLEN), we don't hear about that number, and can't * adjust the starting point to match it. */ - if (ralen * mp->m_sb.sb_rextsize >= XFS_MAX_BMBT_EXTLEN) - ralen = XFS_MAX_BMBT_EXTLEN / mp->m_sb.sb_rextsize; + ralen = xfs_extlen_to_rtxlen(mp, min(ap->length, XFS_MAX_BMBT_EXTLEN)); /* * Lock out modifications to both the RT bitmap and summary inodes @@ -144,12 +142,10 @@ retry: * pick an extent that will space things out in the rt area. */ if (ap->eof && ap->offset == 0) { - xfs_rtblock_t rtx; /* realtime extent no */ - error = xfs_rtpick_extent(mp, ap->tp, ralen, &rtx); if (error) return error; - ap->blkno = rtx * mp->m_sb.sb_rextsize; + ap->blkno = xfs_rtx_to_rtb(mp, rtx); } else { ap->blkno = 0; } @@ -160,20 +156,18 @@ retry: * Realtime allocation, done through xfs_rtallocate_extent. */ if (ignore_locality) - ap->blkno = 0; + rtx = 0; else - do_div(ap->blkno, mp->m_sb.sb_rextsize); - rtb = ap->blkno; - ap->length = ralen; - raminlen = max_t(xfs_extlen_t, 1, minlen / mp->m_sb.sb_rextsize); - error = xfs_rtallocate_extent(ap->tp, ap->blkno, raminlen, ap->length, - &ralen, ap->wasdel, prod, &rtb); + rtx = xfs_rtb_to_rtx(mp, ap->blkno); + raminlen = max_t(xfs_rtxlen_t, 1, xfs_extlen_to_rtxlen(mp, minlen)); + error = xfs_rtallocate_extent(ap->tp, rtx, raminlen, ralen, &ralen, + ap->wasdel, prod, &rtx); if (error) return error; - if (rtb != NULLRTBLOCK) { - ap->blkno = rtb * mp->m_sb.sb_rextsize; - ap->length = ralen * mp->m_sb.sb_rextsize; + if (rtx != NULLRTEXTNO) { + ap->blkno = xfs_rtx_to_rtb(mp, rtx); + ap->length = xfs_rtxlen_to_extlen(mp, ralen); ap->ip->i_nblocks += ap->length; xfs_trans_log_inode(ap->tp, ap->ip, XFS_ILOG_CORE); if (ap->wasdel) @@ -690,7 +684,7 @@ xfs_can_free_eofblocks( */ end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)XFS_ISIZE(ip)); if (XFS_IS_REALTIME_INODE(ip) && mp->m_sb.sb_rextsize > 1) - end_fsb = roundup_64(end_fsb, mp->m_sb.sb_rextsize); + end_fsb = xfs_rtb_roundup_rtx(mp, end_fsb); last_fsb = XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes); if (last_fsb <= end_fsb) return false; @@ -780,12 +774,10 @@ xfs_alloc_file_space( { xfs_mount_t *mp = ip->i_mount; xfs_off_t count; - xfs_filblks_t allocated_fsb; xfs_filblks_t allocatesize_fsb; xfs_extlen_t extsz, temp; xfs_fileoff_t startoffset_fsb; xfs_fileoff_t endoffset_fsb; - int nimaps; int rt; xfs_trans_t *tp; xfs_bmbt_irec_t imaps[1], *imapp; @@ -808,7 +800,6 @@ xfs_alloc_file_space( count = len; imapp = &imaps[0]; - nimaps = 1; startoffset_fsb = XFS_B_TO_FSBT(mp, offset); endoffset_fsb = XFS_B_TO_FSB(mp, offset + count); allocatesize_fsb = endoffset_fsb - startoffset_fsb; @@ -819,6 +810,7 @@ xfs_alloc_file_space( while (allocatesize_fsb && !error) { xfs_fileoff_t s, e; unsigned int dblocks, rblocks, resblks; + int nimaps = 1; /* * Determine space reservations for data/realtime. @@ -884,15 +876,19 @@ xfs_alloc_file_space( if (error) break; - allocated_fsb = imapp->br_blockcount; - - if (nimaps == 0) { - error = -ENOSPC; - break; + /* + * If the allocator cannot find a single free extent large + * enough to cover the start block of the requested range, + * xfs_bmapi_write will return 0 but leave *nimaps set to 0. + * + * In that case we simply need to keep looping with the same + * startoffset_fsb so that one of the following allocations + * will eventually reach the requested range. + */ + if (nimaps) { + startoffset_fsb += imapp->br_blockcount; + allocatesize_fsb -= imapp->br_blockcount; } - - startoffset_fsb += allocated_fsb; - allocatesize_fsb -= allocated_fsb; } return error; @@ -989,10 +985,8 @@ xfs_free_file_space( /* We can only free complete realtime extents. */ if (XFS_IS_REALTIME_INODE(ip) && mp->m_sb.sb_rextsize > 1) { - startoffset_fsb = roundup_64(startoffset_fsb, - mp->m_sb.sb_rextsize); - endoffset_fsb = rounddown_64(endoffset_fsb, - mp->m_sb.sb_rextsize); + startoffset_fsb = xfs_rtb_roundup_rtx(mp, startoffset_fsb); + endoffset_fsb = xfs_rtb_rounddown_rtx(mp, endoffset_fsb); } /* @@ -1644,6 +1638,7 @@ xfs_swap_extents( uint64_t f; int resblks = 0; unsigned int flags = 0; + struct timespec64 ctime, mtime; /* * Lock the inodes against other IO, page faults and truncate to @@ -1756,10 +1751,12 @@ xfs_swap_extents( * process that the file was not changed out from * under it. */ - if ((sbp->bs_ctime.tv_sec != VFS_I(ip)->i_ctime.tv_sec) || - (sbp->bs_ctime.tv_nsec != VFS_I(ip)->i_ctime.tv_nsec) || - (sbp->bs_mtime.tv_sec != VFS_I(ip)->i_mtime.tv_sec) || - (sbp->bs_mtime.tv_nsec != VFS_I(ip)->i_mtime.tv_nsec)) { + ctime = inode_get_ctime(VFS_I(ip)); + mtime = inode_get_mtime(VFS_I(ip)); + if ((sbp->bs_ctime.tv_sec != ctime.tv_sec) || + (sbp->bs_ctime.tv_nsec != ctime.tv_nsec) || + (sbp->bs_mtime.tv_sec != mtime.tv_sec) || + (sbp->bs_mtime.tv_nsec != mtime.tv_nsec)) { error = -EBUSY; goto out_trans_cancel; } diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 15d1e5a7c2d3..545c7991b9b5 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -481,7 +481,8 @@ _xfs_buf_obj_cmp( * reallocating a busy extent. Skip this buffer and * continue searching for an exact match. */ - ASSERT(bp->b_flags & XBF_STALE); + if (!(map->bm_flags & XBM_LIVESCAN)) + ASSERT(bp->b_flags & XBF_STALE); return 1; } return 0; @@ -559,6 +560,10 @@ xfs_buf_find_lock( * intact here. */ if (bp->b_flags & XBF_STALE) { + if (flags & XBF_LIVESCAN) { + xfs_buf_unlock(bp); + return -ENOENT; + } ASSERT((bp->b_flags & _XBF_DELWRI_Q) == 0); bp->b_flags &= _XBF_KMEM | _XBF_PAGES; bp->b_ops = NULL; @@ -682,6 +687,8 @@ xfs_buf_get_map( int error; int i; + if (flags & XBF_LIVESCAN) + cmap.bm_flags |= XBM_LIVESCAN; for (i = 0; i < nmaps; i++) cmap.bm_len += map[i].bm_len; @@ -1906,8 +1913,7 @@ xfs_buftarg_shrink_scan( struct shrinker *shrink, struct shrink_control *sc) { - struct xfs_buftarg *btp = container_of(shrink, - struct xfs_buftarg, bt_shrinker); + struct xfs_buftarg *btp = shrink->private_data; LIST_HEAD(dispose); unsigned long freed; @@ -1929,8 +1935,7 @@ xfs_buftarg_shrink_count( struct shrinker *shrink, struct shrink_control *sc) { - struct xfs_buftarg *btp = container_of(shrink, - struct xfs_buftarg, bt_shrinker); + struct xfs_buftarg *btp = shrink->private_data; return list_lru_shrink_count(&btp->bt_lru, sc); } @@ -1938,14 +1943,15 @@ void xfs_free_buftarg( struct xfs_buftarg *btp) { - unregister_shrinker(&btp->bt_shrinker); + shrinker_free(btp->bt_shrinker); ASSERT(percpu_counter_sum(&btp->bt_io_count) == 0); percpu_counter_destroy(&btp->bt_io_count); list_lru_destroy(&btp->bt_lru); - blkdev_issue_flush(btp->bt_bdev); - invalidate_bdev(btp->bt_bdev); fs_put_dax(btp->bt_daxdev, btp->bt_mount); + /* the main block device is closed by kill_block_super */ + if (btp->bt_bdev != btp->bt_mount->m_super->s_bdev) + bdev_release(btp->bt_bdev_handle); kmem_free(btp); } @@ -1980,16 +1986,15 @@ xfs_setsize_buftarg( */ STATIC int xfs_setsize_buftarg_early( - xfs_buftarg_t *btp, - struct block_device *bdev) + xfs_buftarg_t *btp) { - return xfs_setsize_buftarg(btp, bdev_logical_block_size(bdev)); + return xfs_setsize_buftarg(btp, bdev_logical_block_size(btp->bt_bdev)); } struct xfs_buftarg * xfs_alloc_buftarg( struct xfs_mount *mp, - struct block_device *bdev) + struct bdev_handle *bdev_handle) { xfs_buftarg_t *btp; const struct dax_holder_operations *ops = NULL; @@ -2000,9 +2005,10 @@ xfs_alloc_buftarg( btp = kmem_zalloc(sizeof(*btp), KM_NOFS); btp->bt_mount = mp; - btp->bt_dev = bdev->bd_dev; - btp->bt_bdev = bdev; - btp->bt_daxdev = fs_dax_get_by_bdev(bdev, &btp->bt_dax_part_off, + btp->bt_bdev_handle = bdev_handle; + btp->bt_dev = bdev_handle->bdev->bd_dev; + btp->bt_bdev = bdev_handle->bdev; + btp->bt_daxdev = fs_dax_get_by_bdev(btp->bt_bdev, &btp->bt_dax_part_off, mp, ops); /* @@ -2012,7 +2018,7 @@ xfs_alloc_buftarg( ratelimit_state_init(&btp->bt_ioerror_rl, 30 * HZ, DEFAULT_RATELIMIT_BURST); - if (xfs_setsize_buftarg_early(btp, bdev)) + if (xfs_setsize_buftarg_early(btp)) goto error_free; if (list_lru_init(&btp->bt_lru)) @@ -2021,13 +2027,17 @@ xfs_alloc_buftarg( if (percpu_counter_init(&btp->bt_io_count, 0, GFP_KERNEL)) goto error_lru; - btp->bt_shrinker.count_objects = xfs_buftarg_shrink_count; - btp->bt_shrinker.scan_objects = xfs_buftarg_shrink_scan; - btp->bt_shrinker.seeks = DEFAULT_SEEKS; - btp->bt_shrinker.flags = SHRINKER_NUMA_AWARE; - if (register_shrinker(&btp->bt_shrinker, "xfs-buf:%s", - mp->m_super->s_id)) + btp->bt_shrinker = shrinker_alloc(SHRINKER_NUMA_AWARE, "xfs-buf:%s", + mp->m_super->s_id); + if (!btp->bt_shrinker) goto error_pcpu; + + btp->bt_shrinker->count_objects = xfs_buftarg_shrink_count; + btp->bt_shrinker->scan_objects = xfs_buftarg_shrink_scan; + btp->bt_shrinker->private_data = btp; + + shrinker_register(btp->bt_shrinker); + return btp; error_pcpu: diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h index 549c60942208..c86e16419656 100644 --- a/fs/xfs/xfs_buf.h +++ b/fs/xfs/xfs_buf.h @@ -44,6 +44,11 @@ struct xfs_buf; #define _XBF_DELWRI_Q (1u << 22)/* buffer on a delwri queue */ /* flags used only as arguments to access routines */ +/* + * Online fsck is scanning the buffer cache for live buffers. Do not warn + * about length mismatches during lookups and do not return stale buffers. + */ +#define XBF_LIVESCAN (1u << 28) #define XBF_INCORE (1u << 29)/* lookup only, return if found in cache */ #define XBF_TRYLOCK (1u << 30)/* lock requested, but do not wait */ #define XBF_UNMAPPED (1u << 31)/* do not map the buffer */ @@ -67,6 +72,7 @@ typedef unsigned int xfs_buf_flags_t; { _XBF_KMEM, "KMEM" }, \ { _XBF_DELWRI_Q, "DELWRI_Q" }, \ /* The following interface flags should never be set */ \ + { XBF_LIVESCAN, "LIVESCAN" }, \ { XBF_INCORE, "INCORE" }, \ { XBF_TRYLOCK, "TRYLOCK" }, \ { XBF_UNMAPPED, "UNMAPPED" } @@ -92,6 +98,7 @@ typedef unsigned int xfs_buf_flags_t; */ typedef struct xfs_buftarg { dev_t bt_dev; + struct bdev_handle *bt_bdev_handle; struct block_device *bt_bdev; struct dax_device *bt_daxdev; u64 bt_dax_part_off; @@ -102,7 +109,7 @@ typedef struct xfs_buftarg { size_t bt_logical_sectormask; /* LRU control structures */ - struct shrinker bt_shrinker; + struct shrinker *bt_shrinker; struct list_lru bt_lru; struct percpu_counter bt_io_count; @@ -114,8 +121,15 @@ typedef struct xfs_buftarg { struct xfs_buf_map { xfs_daddr_t bm_bn; /* block number for I/O */ int bm_len; /* size of I/O */ + unsigned int bm_flags; }; +/* + * Online fsck is scanning the buffer cache for live buffers. Do not warn + * about length mismatches during lookups and do not return stale buffers. + */ +#define XBM_LIVESCAN (1U << 0) + #define DEFINE_SINGLE_BUF_MAP(map, blkno, numblk) \ struct xfs_buf_map (map) = { .bm_bn = (blkno), .bm_len = (numblk) }; @@ -351,7 +365,7 @@ xfs_buf_update_cksum(struct xfs_buf *bp, unsigned long cksum_offset) * Handling of buftargs. */ struct xfs_buftarg *xfs_alloc_buftarg(struct xfs_mount *mp, - struct block_device *bdev); + struct bdev_handle *bdev_handle); extern void xfs_free_buftarg(struct xfs_buftarg *); extern void xfs_buftarg_wait(struct xfs_buftarg *); extern void xfs_buftarg_drain(struct xfs_buftarg *); diff --git a/fs/xfs/xfs_discard.c b/fs/xfs/xfs_discard.c index afc4c78b9eed..d5787991bb5b 100644 --- a/fs/xfs/xfs_discard.c +++ b/fs/xfs/xfs_discard.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 /* - * Copyright (C) 2010 Red Hat, Inc. + * Copyright (C) 2010, 2023 Red Hat, Inc. * All Rights Reserved. */ #include "xfs.h" @@ -19,21 +19,147 @@ #include "xfs_log.h" #include "xfs_ag.h" -STATIC int -xfs_trim_extents( +/* + * Notes on an efficient, low latency fstrim algorithm + * + * We need to walk the filesystem free space and issue discards on the free + * space that meet the search criteria (size and location). We cannot issue + * discards on extents that might be in use, or are so recently in use they are + * still marked as busy. To serialise against extent state changes whilst we are + * gathering extents to trim, we must hold the AGF lock to lock out other + * allocations and extent free operations that might change extent state. + * + * However, we cannot just hold the AGF for the entire AG free space walk whilst + * we issue discards on each free space that is found. Storage devices can have + * extremely slow discard implementations (e.g. ceph RBD) and so walking a + * couple of million free extents and issuing synchronous discards on each + * extent can take a *long* time. Whilst we are doing this walk, nothing else + * can access the AGF, and we can stall transactions and hence the log whilst + * modifications wait for the AGF lock to be released. This can lead hung tasks + * kicking the hung task timer and rebooting the system. This is bad. + * + * Hence we need to take a leaf from the bulkstat playbook. It takes the AGI + * lock, gathers a range of inode cluster buffers that are allocated, drops the + * AGI lock and then reads all the inode cluster buffers and processes them. It + * loops doing this, using a cursor to keep track of where it is up to in the AG + * for each iteration to restart the INOBT lookup from. + * + * We can't do this exactly with free space - once we drop the AGF lock, the + * state of the free extent is out of our control and we cannot run a discard + * safely on it in this situation. Unless, of course, we've marked the free + * extent as busy and undergoing a discard operation whilst we held the AGF + * locked. + * + * This is exactly how online discard works - free extents are marked busy when + * they are freed, and once the extent free has been committed to the journal, + * the busy extent record is marked as "undergoing discard" and the discard is + * then issued on the free extent. Once the discard completes, the busy extent + * record is removed and the extent is able to be allocated again. + * + * In the context of fstrim, if we find a free extent we need to discard, we + * don't have to discard it immediately. All we need to do it record that free + * extent as being busy and under discard, and all the allocation routines will + * now avoid trying to allocate it. Hence if we mark the extent as busy under + * the AGF lock, we can safely discard it without holding the AGF lock because + * nothing will attempt to allocate that free space until the discard completes. + * + * This also allows us to issue discards asynchronously like we do with online + * discard, and so for fast devices fstrim will run much faster as we can have + * multiple discard operations in flight at once, as well as pipeline the free + * extent search so that it overlaps in flight discard IO. + */ + +struct workqueue_struct *xfs_discard_wq; + +static void +xfs_discard_endio_work( + struct work_struct *work) +{ + struct xfs_busy_extents *extents = + container_of(work, struct xfs_busy_extents, endio_work); + + xfs_extent_busy_clear(extents->mount, &extents->extent_list, false); + kmem_free(extents->owner); +} + +/* + * Queue up the actual completion to a thread to avoid IRQ-safe locking for + * pagb_lock. + */ +static void +xfs_discard_endio( + struct bio *bio) +{ + struct xfs_busy_extents *extents = bio->bi_private; + + INIT_WORK(&extents->endio_work, xfs_discard_endio_work); + queue_work(xfs_discard_wq, &extents->endio_work); + bio_put(bio); +} + +/* + * Walk the discard list and issue discards on all the busy extents in the + * list. We plug and chain the bios so that we only need a single completion + * call to clear all the busy extents once the discards are complete. + */ +int +xfs_discard_extents( + struct xfs_mount *mp, + struct xfs_busy_extents *extents) +{ + struct xfs_extent_busy *busyp; + struct bio *bio = NULL; + struct blk_plug plug; + int error = 0; + + blk_start_plug(&plug); + list_for_each_entry(busyp, &extents->extent_list, list) { + trace_xfs_discard_extent(mp, busyp->agno, busyp->bno, + busyp->length); + + error = __blkdev_issue_discard(mp->m_ddev_targp->bt_bdev, + XFS_AGB_TO_DADDR(mp, busyp->agno, busyp->bno), + XFS_FSB_TO_BB(mp, busyp->length), + GFP_NOFS, &bio); + if (error && error != -EOPNOTSUPP) { + xfs_info(mp, + "discard failed for extent [0x%llx,%u], error %d", + (unsigned long long)busyp->bno, + busyp->length, + error); + break; + } + } + + if (bio) { + bio->bi_private = extents; + bio->bi_end_io = xfs_discard_endio; + submit_bio(bio); + } else { + xfs_discard_endio_work(&extents->endio_work); + } + blk_finish_plug(&plug); + + return error; +} + + +static int +xfs_trim_gather_extents( struct xfs_perag *pag, xfs_daddr_t start, xfs_daddr_t end, xfs_daddr_t minlen, + struct xfs_alloc_rec_incore *tcur, + struct xfs_busy_extents *extents, uint64_t *blocks_trimmed) { struct xfs_mount *mp = pag->pag_mount; - struct block_device *bdev = mp->m_ddev_targp->bt_bdev; struct xfs_btree_cur *cur; struct xfs_buf *agbp; - struct xfs_agf *agf; int error; int i; + int batch = 100; /* * Force out the log. This means any transactions that might have freed @@ -45,20 +171,28 @@ xfs_trim_extents( error = xfs_alloc_read_agf(pag, NULL, 0, &agbp); if (error) return error; - agf = agbp->b_addr; cur = xfs_allocbt_init_cursor(mp, NULL, agbp, pag, XFS_BTNUM_CNT); /* - * Look up the longest btree in the AGF and start with it. + * Look up the extent length requested in the AGF and start with it. */ - error = xfs_alloc_lookup_ge(cur, 0, be32_to_cpu(agf->agf_longest), &i); + if (tcur->ar_startblock == NULLAGBLOCK) + error = xfs_alloc_lookup_ge(cur, 0, tcur->ar_blockcount, &i); + else + error = xfs_alloc_lookup_le(cur, tcur->ar_startblock, + tcur->ar_blockcount, &i); if (error) goto out_del_cursor; + if (i == 0) { + /* nothing of that length left in the AG, we are done */ + tcur->ar_blockcount = 0; + goto out_del_cursor; + } /* * Loop until we are done with all extents that are large - * enough to be worth discarding. + * enough to be worth discarding or we hit batch limits. */ while (i) { xfs_agblock_t fbno; @@ -73,7 +207,16 @@ xfs_trim_extents( error = -EFSCORRUPTED; break; } - ASSERT(flen <= be32_to_cpu(agf->agf_longest)); + + if (--batch <= 0) { + /* + * Update the cursor to point at this extent so we + * restart the next batch from this extent. + */ + tcur->ar_startblock = fbno; + tcur->ar_blockcount = flen; + break; + } /* * use daddr format for all range/len calculations as that is @@ -88,6 +231,7 @@ xfs_trim_extents( */ if (dlen < minlen) { trace_xfs_discard_toosmall(mp, pag->pag_agno, fbno, flen); + tcur->ar_blockcount = 0; break; } @@ -110,29 +254,103 @@ xfs_trim_extents( goto next_extent; } - trace_xfs_discard_extent(mp, pag->pag_agno, fbno, flen); - error = blkdev_issue_discard(bdev, dbno, dlen, GFP_NOFS); - if (error) - break; + xfs_extent_busy_insert_discard(pag, fbno, flen, + &extents->extent_list); *blocks_trimmed += flen; - next_extent: error = xfs_btree_decrement(cur, 0, &i); if (error) break; - if (fatal_signal_pending(current)) { - error = -ERESTARTSYS; - break; - } + /* + * If there's no more records in the tree, we are done. Set the + * cursor block count to 0 to indicate to the caller that there + * is no more extents to search. + */ + if (i == 0) + tcur->ar_blockcount = 0; } + /* + * If there was an error, release all the gathered busy extents because + * we aren't going to issue a discard on them any more. + */ + if (error) + xfs_extent_busy_clear(mp, &extents->extent_list, false); out_del_cursor: xfs_btree_del_cursor(cur, error); xfs_buf_relse(agbp); return error; } +static bool +xfs_trim_should_stop(void) +{ + return fatal_signal_pending(current) || freezing(current); +} + +/* + * Iterate the free list gathering extents and discarding them. We need a cursor + * for the repeated iteration of gather/discard loop, so use the longest extent + * we found in the last batch as the key to start the next. + */ +static int +xfs_trim_extents( + struct xfs_perag *pag, + xfs_daddr_t start, + xfs_daddr_t end, + xfs_daddr_t minlen, + uint64_t *blocks_trimmed) +{ + struct xfs_alloc_rec_incore tcur = { + .ar_blockcount = pag->pagf_longest, + .ar_startblock = NULLAGBLOCK, + }; + int error = 0; + + do { + struct xfs_busy_extents *extents; + + extents = kzalloc(sizeof(*extents), GFP_KERNEL); + if (!extents) { + error = -ENOMEM; + break; + } + + extents->mount = pag->pag_mount; + extents->owner = extents; + INIT_LIST_HEAD(&extents->extent_list); + + error = xfs_trim_gather_extents(pag, start, end, minlen, + &tcur, extents, blocks_trimmed); + if (error) { + kfree(extents); + break; + } + + /* + * We hand the extent list to the discard function here so the + * discarded extents can be removed from the busy extent list. + * This allows the discards to run asynchronously with gathering + * the next round of extents to discard. + * + * However, we must ensure that we do not reference the extent + * list after this function call, as it may have been freed by + * the time control returns to us. + */ + error = xfs_discard_extents(pag->pag_mount, extents); + if (error) + break; + + if (xfs_trim_should_stop()) + break; + + } while (tcur.ar_blockcount != 0); + + return error; + +} + /* * trim a range of the filesystem. * @@ -195,12 +413,12 @@ xfs_ioc_trim( for_each_perag_range(mp, agno, xfs_daddr_to_agno(mp, end), pag) { error = xfs_trim_extents(pag, start, end, minlen, &blocks_trimmed); - if (error) { + if (error) last_error = error; - if (error == -ERESTARTSYS) { - xfs_perag_rele(pag); - break; - } + + if (xfs_trim_should_stop()) { + xfs_perag_rele(pag); + break; } } diff --git a/fs/xfs/xfs_discard.h b/fs/xfs/xfs_discard.h index de92d9cc958f..2b1a85223a56 100644 --- a/fs/xfs/xfs_discard.h +++ b/fs/xfs/xfs_discard.h @@ -3,8 +3,10 @@ #define XFS_DISCARD_H 1 struct fstrim_range; -struct list_head; +struct xfs_mount; +struct xfs_busy_extents; -extern int xfs_ioc_trim(struct xfs_mount *, struct fstrim_range __user *); +int xfs_discard_extents(struct xfs_mount *mp, struct xfs_busy_extents *busy); +int xfs_ioc_trim(struct xfs_mount *mp, struct fstrim_range __user *fstrim); #endif /* XFS_DISCARD_H */ diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c index 7f071757f278..a013b87ab8d5 100644 --- a/fs/xfs/xfs_dquot.c +++ b/fs/xfs/xfs_dquot.c @@ -562,7 +562,8 @@ xfs_dquot_from_disk( struct xfs_dquot *dqp, struct xfs_buf *bp) { - struct xfs_disk_dquot *ddqp = bp->b_addr + dqp->q_bufoffset; + struct xfs_dqblk *dqb = xfs_buf_offset(bp, dqp->q_bufoffset); + struct xfs_disk_dquot *ddqp = &dqb->dd_diskdq; /* * Ensure that we got the type and ID we were looking for. @@ -1250,7 +1251,7 @@ xfs_qm_dqflush( } /* Flush the incore dquot to the ondisk buffer. */ - dqblk = bp->b_addr + dqp->q_bufoffset; + dqblk = xfs_buf_offset(bp, dqp->q_bufoffset); xfs_dquot_to_disk(&dqblk->dd_diskdq, dqp); /* @@ -1386,7 +1387,7 @@ xfs_qm_dqiterate( return error; error = iter_fn(dq, type, priv); - id = dq->q_id; + id = dq->q_id + 1; xfs_qm_dqput(dq); } while (error == 0 && id != 0); diff --git a/fs/xfs/xfs_dquot_item_recover.c b/fs/xfs/xfs_dquot_item_recover.c index 8966ba842395..2c2720ce6923 100644 --- a/fs/xfs/xfs_dquot_item_recover.c +++ b/fs/xfs/xfs_dquot_item_recover.c @@ -19,6 +19,7 @@ #include "xfs_log.h" #include "xfs_log_priv.h" #include "xfs_log_recover.h" +#include "xfs_error.h" STATIC void xlog_recover_dquot_ra_pass2( @@ -65,6 +66,7 @@ xlog_recover_dquot_commit_pass2( { struct xfs_mount *mp = log->l_mp; struct xfs_buf *bp; + struct xfs_dqblk *dqb; struct xfs_disk_dquot *ddq, *recddq; struct xfs_dq_logformat *dq_f; xfs_failaddr_t fa; @@ -130,14 +132,14 @@ xlog_recover_dquot_commit_pass2( return error; ASSERT(bp); - ddq = xfs_buf_offset(bp, dq_f->qlf_boffset); + dqb = xfs_buf_offset(bp, dq_f->qlf_boffset); + ddq = &dqb->dd_diskdq; /* * If the dquot has an LSN in it, recover the dquot only if it's less * than the lsn of the transaction we are replaying. */ if (xfs_has_crc(mp)) { - struct xfs_dqblk *dqb = (struct xfs_dqblk *)ddq; xfs_lsn_t lsn = be64_to_cpu(dqb->dd_lsn); if (lsn && lsn != -1 && XFS_LSN_CMP(lsn, current_lsn) >= 0) { @@ -147,10 +149,23 @@ xlog_recover_dquot_commit_pass2( memcpy(ddq, recddq, item->ri_buf[1].i_len); if (xfs_has_crc(mp)) { - xfs_update_cksum((char *)ddq, sizeof(struct xfs_dqblk), + xfs_update_cksum((char *)dqb, sizeof(struct xfs_dqblk), XFS_DQUOT_CRC_OFF); } + /* Validate the recovered dquot. */ + fa = xfs_dqblk_verify(log->l_mp, dqb, dq_f->qlf_id); + if (fa) { + XFS_CORRUPTION_ERROR("Bad dquot after recovery", + XFS_ERRLEVEL_LOW, mp, dqb, + sizeof(struct xfs_dqblk)); + xfs_alert(mp, + "Metadata corruption detected at %pS, dquot 0x%x", + fa, dq_f->qlf_id); + error = -EFSCORRUPTED; + goto out_release; + } + ASSERT(dq_f->qlf_size == 2); ASSERT(bp->b_mount == mp); bp->b_flags |= _XBF_LOGRECOVERY; diff --git a/fs/xfs/xfs_export.c b/fs/xfs/xfs_export.c index 1064c2342876..7cd09c3a82cb 100644 --- a/fs/xfs/xfs_export.c +++ b/fs/xfs/xfs_export.c @@ -146,6 +146,20 @@ xfs_nfs_get_inode( return ERR_PTR(error); } + /* + * Reload the incore unlinked list to avoid failure in inodegc. + * Use an unlocked check here because unrecovered unlinked inodes + * should be somewhat rare. + */ + if (xfs_inode_unlinked_incomplete(ip)) { + error = xfs_inode_reload_unlinked(ip); + if (error) { + xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); + xfs_irele(ip); + return ERR_PTR(error); + } + } + if (VFS_I(ip)->i_generation != generation) { xfs_irele(ip); return ERR_PTR(-ESTALE); diff --git a/fs/xfs/xfs_extent_busy.c b/fs/xfs/xfs_extent_busy.c index 7c2fdc71e42d..9ecfdcdc752f 100644 --- a/fs/xfs/xfs_extent_busy.c +++ b/fs/xfs/xfs_extent_busy.c @@ -19,13 +19,13 @@ #include "xfs_log.h" #include "xfs_ag.h" -void -xfs_extent_busy_insert( - struct xfs_trans *tp, +static void +xfs_extent_busy_insert_list( struct xfs_perag *pag, xfs_agblock_t bno, xfs_extlen_t len, - unsigned int flags) + unsigned int flags, + struct list_head *busy_list) { struct xfs_extent_busy *new; struct xfs_extent_busy *busyp; @@ -40,7 +40,7 @@ xfs_extent_busy_insert( new->flags = flags; /* trace before insert to be able to see failed inserts */ - trace_xfs_extent_busy(tp->t_mountp, pag->pag_agno, bno, len); + trace_xfs_extent_busy(pag->pag_mount, pag->pag_agno, bno, len); spin_lock(&pag->pagb_lock); rbp = &pag->pagb_tree.rb_node; @@ -62,10 +62,33 @@ xfs_extent_busy_insert( rb_link_node(&new->rb_node, parent, rbp); rb_insert_color(&new->rb_node, &pag->pagb_tree); - list_add(&new->list, &tp->t_busy); + /* always process discard lists in fifo order */ + list_add_tail(&new->list, busy_list); spin_unlock(&pag->pagb_lock); } +void +xfs_extent_busy_insert( + struct xfs_trans *tp, + struct xfs_perag *pag, + xfs_agblock_t bno, + xfs_extlen_t len, + unsigned int flags) +{ + xfs_extent_busy_insert_list(pag, bno, len, flags, &tp->t_busy); +} + +void +xfs_extent_busy_insert_discard( + struct xfs_perag *pag, + xfs_agblock_t bno, + xfs_extlen_t len, + struct list_head *busy_list) +{ + xfs_extent_busy_insert_list(pag, bno, len, XFS_EXTENT_BUSY_DISCARDED, + busy_list); +} + /* * Search for a busy extent within the range of the extent we are about to * allocate. You need to be holding the busy extent tree lock when calling diff --git a/fs/xfs/xfs_extent_busy.h b/fs/xfs/xfs_extent_busy.h index c37bf87e6781..0639aab336f3 100644 --- a/fs/xfs/xfs_extent_busy.h +++ b/fs/xfs/xfs_extent_busy.h @@ -16,9 +16,6 @@ struct xfs_alloc_arg; /* * Busy block/extent entry. Indexed by a rbtree in perag to mark blocks that * have been freed but whose transactions aren't committed to disk yet. - * - * Note that we use the transaction ID to record the transaction, not the - * transaction structure itself. See xfs_extent_busy_insert() for details. */ struct xfs_extent_busy { struct rb_node rb_node; /* ag by-bno indexed search tree */ @@ -31,11 +28,32 @@ struct xfs_extent_busy { #define XFS_EXTENT_BUSY_SKIP_DISCARD 0x02 /* do not discard */ }; +/* + * List used to track groups of related busy extents all the way through + * to discard completion. + */ +struct xfs_busy_extents { + struct xfs_mount *mount; + struct list_head extent_list; + struct work_struct endio_work; + + /* + * Owner is the object containing the struct xfs_busy_extents to free + * once the busy extents have been processed. If only the + * xfs_busy_extents object needs freeing, then point this at itself. + */ + void *owner; +}; + void xfs_extent_busy_insert(struct xfs_trans *tp, struct xfs_perag *pag, xfs_agblock_t bno, xfs_extlen_t len, unsigned int flags); void +xfs_extent_busy_insert_discard(struct xfs_perag *pag, xfs_agblock_t bno, + xfs_extlen_t len, struct list_head *busy_list); + +void xfs_extent_busy_clear(struct xfs_mount *mp, struct list_head *list, bool do_discard); diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c index f1a5ecf099aa..3fa8789820ad 100644 --- a/fs/xfs/xfs_extfree_item.c +++ b/fs/xfs/xfs_extfree_item.c @@ -660,6 +660,7 @@ xfs_efi_item_recover( struct xfs_log_item *lip, struct list_head *capture_list) { + struct xfs_trans_res resv; struct xfs_efi_log_item *efip = EFI_ITEM(lip); struct xfs_mount *mp = lip->li_log->l_mp; struct xfs_efd_log_item *efdp; @@ -683,7 +684,8 @@ xfs_efi_item_recover( } } - error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, 0, 0, 0, &tp); + resv = xlog_recover_resv(&M_RES(mp)->tr_itruncate); + error = xfs_trans_alloc(mp, &resv, 0, 0, 0, &tp); if (error) return error; efdp = xfs_trans_get_efd(tp, efip, efip->efi_format.efi_nextents); diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 4f502219ae4f..e33e5e13b95f 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -214,6 +214,43 @@ xfs_ilock_iocb( return 0; } +static int +xfs_ilock_iocb_for_write( + struct kiocb *iocb, + unsigned int *lock_mode) +{ + ssize_t ret; + struct xfs_inode *ip = XFS_I(file_inode(iocb->ki_filp)); + + ret = xfs_ilock_iocb(iocb, *lock_mode); + if (ret) + return ret; + + if (*lock_mode == XFS_IOLOCK_EXCL) + return 0; + if (!xfs_iflags_test(ip, XFS_IREMAPPING)) + return 0; + + xfs_iunlock(ip, *lock_mode); + *lock_mode = XFS_IOLOCK_EXCL; + return xfs_ilock_iocb(iocb, *lock_mode); +} + +static unsigned int +xfs_ilock_for_write_fault( + struct xfs_inode *ip) +{ + /* get a shared lock if no remapping in progress */ + xfs_ilock(ip, XFS_MMAPLOCK_SHARED); + if (!xfs_iflags_test(ip, XFS_IREMAPPING)) + return XFS_MMAPLOCK_SHARED; + + /* wait for remapping to complete */ + xfs_iunlock(ip, XFS_MMAPLOCK_SHARED); + xfs_ilock(ip, XFS_MMAPLOCK_EXCL); + return XFS_MMAPLOCK_EXCL; +} + STATIC ssize_t xfs_file_dio_read( struct kiocb *iocb, @@ -551,7 +588,7 @@ xfs_file_dio_write_aligned( unsigned int iolock = XFS_IOLOCK_SHARED; ssize_t ret; - ret = xfs_ilock_iocb(iocb, iolock); + ret = xfs_ilock_iocb_for_write(iocb, &iolock); if (ret) return ret; ret = xfs_file_write_checks(iocb, from, &iolock); @@ -618,7 +655,7 @@ retry_exclusive: flags = IOMAP_DIO_FORCE_WAIT; } - ret = xfs_ilock_iocb(iocb, iolock); + ret = xfs_ilock_iocb_for_write(iocb, &iolock); if (ret) return ret; @@ -1180,7 +1217,7 @@ xfs_file_remap_range( if (xfs_file_sync_writes(file_in) || xfs_file_sync_writes(file_out)) xfs_log_force_inode(dest); out_unlock: - xfs_iunlock2_io_mmap(src, dest); + xfs_iunlock2_remapping(src, dest); if (ret) trace_xfs_reflink_remap_range_error(dest, ret, _RET_IP_); return remapped > 0 ? remapped : ret; @@ -1287,11 +1324,11 @@ xfs_file_llseek( static inline vm_fault_t xfs_dax_fault( struct vm_fault *vmf, - enum page_entry_size pe_size, + unsigned int order, bool write_fault, pfn_t *pfn) { - return dax_iomap_fault(vmf, pe_size, pfn, NULL, + return dax_iomap_fault(vmf, order, pfn, NULL, (write_fault && !vmf->cow_page) ? &xfs_dax_write_iomap_ops : &xfs_read_iomap_ops); @@ -1300,7 +1337,7 @@ xfs_dax_fault( static inline vm_fault_t xfs_dax_fault( struct vm_fault *vmf, - enum page_entry_size pe_size, + unsigned int order, bool write_fault, pfn_t *pfn) { @@ -1322,39 +1359,39 @@ xfs_dax_fault( static vm_fault_t __xfs_filemap_fault( struct vm_fault *vmf, - enum page_entry_size pe_size, + unsigned int order, bool write_fault) { struct inode *inode = file_inode(vmf->vma->vm_file); struct xfs_inode *ip = XFS_I(inode); vm_fault_t ret; + unsigned int lock_mode = 0; - trace_xfs_filemap_fault(ip, pe_size, write_fault); + trace_xfs_filemap_fault(ip, order, write_fault); if (write_fault) { sb_start_pagefault(inode->i_sb); file_update_time(vmf->vma->vm_file); } + if (IS_DAX(inode) || write_fault) + lock_mode = xfs_ilock_for_write_fault(XFS_I(inode)); + if (IS_DAX(inode)) { pfn_t pfn; - xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED); - ret = xfs_dax_fault(vmf, pe_size, write_fault, &pfn); + ret = xfs_dax_fault(vmf, order, write_fault, &pfn); if (ret & VM_FAULT_NEEDDSYNC) - ret = dax_finish_sync_fault(vmf, pe_size, pfn); - xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED); + ret = dax_finish_sync_fault(vmf, order, pfn); + } else if (write_fault) { + ret = iomap_page_mkwrite(vmf, &xfs_page_mkwrite_iomap_ops); } else { - if (write_fault) { - xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED); - ret = iomap_page_mkwrite(vmf, - &xfs_page_mkwrite_iomap_ops); - xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED); - } else { - ret = filemap_fault(vmf); - } + ret = filemap_fault(vmf); } + if (lock_mode) + xfs_iunlock(XFS_I(inode), lock_mode); + if (write_fault) sb_end_pagefault(inode->i_sb); return ret; @@ -1373,7 +1410,7 @@ xfs_filemap_fault( struct vm_fault *vmf) { /* DAX can shortcut the normal fault path on write faults! */ - return __xfs_filemap_fault(vmf, PE_SIZE_PTE, + return __xfs_filemap_fault(vmf, 0, IS_DAX(file_inode(vmf->vma->vm_file)) && xfs_is_write_fault(vmf)); } @@ -1381,13 +1418,13 @@ xfs_filemap_fault( static vm_fault_t xfs_filemap_huge_fault( struct vm_fault *vmf, - enum page_entry_size pe_size) + unsigned int order) { if (!IS_DAX(file_inode(vmf->vma->vm_file))) return VM_FAULT_FALLBACK; /* DAX can shortcut the normal fault path on write faults! */ - return __xfs_filemap_fault(vmf, pe_size, + return __xfs_filemap_fault(vmf, order, xfs_is_write_fault(vmf)); } @@ -1395,7 +1432,7 @@ static vm_fault_t xfs_filemap_page_mkwrite( struct vm_fault *vmf) { - return __xfs_filemap_fault(vmf, PE_SIZE_PTE, true); + return __xfs_filemap_fault(vmf, 0, true); } /* @@ -1408,7 +1445,7 @@ xfs_filemap_pfn_mkwrite( struct vm_fault *vmf) { - return __xfs_filemap_fault(vmf, PE_SIZE_PTE, true); + return __xfs_filemap_fault(vmf, 0, true); } static const struct vm_operations_struct xfs_file_vm_ops = { diff --git a/fs/xfs/xfs_fsmap.c b/fs/xfs/xfs_fsmap.c index 10403ba9b58f..5a72217f5feb 100644 --- a/fs/xfs/xfs_fsmap.c +++ b/fs/xfs/xfs_fsmap.c @@ -23,7 +23,7 @@ #include "xfs_refcount.h" #include "xfs_refcount_btree.h" #include "xfs_alloc_btree.h" -#include "xfs_rtalloc.h" +#include "xfs_rtbitmap.h" #include "xfs_ag.h" /* Convert an xfs_fsmap to an fsmap. */ @@ -483,11 +483,11 @@ xfs_getfsmap_rtdev_rtbitmap_helper( xfs_rtblock_t rtbno; xfs_daddr_t rec_daddr, len_daddr; - rtbno = rec->ar_startext * mp->m_sb.sb_rextsize; + rtbno = xfs_rtx_to_rtb(mp, rec->ar_startext); rec_daddr = XFS_FSB_TO_BB(mp, rtbno); irec.rm_startblock = rtbno; - rtbno = rec->ar_extcount * mp->m_sb.sb_rextsize; + rtbno = xfs_rtx_to_rtb(mp, rec->ar_extcount); len_daddr = XFS_FSB_TO_BB(mp, rtbno); irec.rm_blockcount = rtbno; @@ -514,7 +514,7 @@ xfs_getfsmap_rtdev_rtbitmap( uint64_t eofs; int error; - eofs = XFS_FSB_TO_BB(mp, mp->m_sb.sb_rextents * mp->m_sb.sb_rextsize); + eofs = XFS_FSB_TO_BB(mp, xfs_rtx_to_rtb(mp, mp->m_sb.sb_rextents)); if (keys[0].fmr_physical >= eofs) return 0; start_rtb = XFS_BB_TO_FSBT(mp, @@ -539,11 +539,8 @@ xfs_getfsmap_rtdev_rtbitmap( * Set up query parameters to return free rtextents covering the range * we want. */ - alow.ar_startext = start_rtb; - ahigh.ar_startext = end_rtb; - do_div(alow.ar_startext, mp->m_sb.sb_rextsize); - if (do_div(ahigh.ar_startext, mp->m_sb.sb_rextsize)) - ahigh.ar_startext++; + alow.ar_startext = xfs_rtb_to_rtx(mp, start_rtb); + ahigh.ar_startext = xfs_rtb_to_rtxup(mp, end_rtb); error = xfs_rtalloc_query_range(mp, tp, &alow, &ahigh, xfs_getfsmap_rtdev_rtbitmap_helper, info); if (error) @@ -565,6 +562,19 @@ err: } #endif /* CONFIG_XFS_RT */ +static inline bool +rmap_not_shareable(struct xfs_mount *mp, const struct xfs_rmap_irec *r) +{ + if (!xfs_has_reflink(mp)) + return true; + if (XFS_RMAP_NON_INODE_OWNER(r->rm_owner)) + return true; + if (r->rm_flags & (XFS_RMAP_ATTR_FORK | XFS_RMAP_BMBT_BLOCK | + XFS_RMAP_UNWRITTEN)) + return true; + return false; +} + /* Execute a getfsmap query against the regular data device. */ STATIC int __xfs_getfsmap_datadev( @@ -598,7 +608,6 @@ __xfs_getfsmap_datadev( * low to the fsmap low key and max out the high key to the end * of the AG. */ - info->low.rm_startblock = XFS_FSB_TO_AGBNO(mp, start_fsb); info->low.rm_offset = XFS_BB_TO_FSBT(mp, keys[0].fmr_offset); error = xfs_fsmap_owner_to_rmap(&info->low, &keys[0]); if (error) @@ -608,12 +617,9 @@ __xfs_getfsmap_datadev( /* Adjust the low key if we are continuing from where we left off. */ if (info->low.rm_blockcount == 0) { - /* empty */ - } else if (XFS_RMAP_NON_INODE_OWNER(info->low.rm_owner) || - (info->low.rm_flags & (XFS_RMAP_ATTR_FORK | - XFS_RMAP_BMBT_BLOCK | - XFS_RMAP_UNWRITTEN))) { - info->low.rm_startblock += info->low.rm_blockcount; + /* No previous record from which to continue */ + } else if (rmap_not_shareable(mp, &info->low)) { + /* Last record seen was an unshareable extent */ info->low.rm_owner = 0; info->low.rm_offset = 0; @@ -621,8 +627,10 @@ __xfs_getfsmap_datadev( if (XFS_FSB_TO_DADDR(mp, start_fsb) >= eofs) return 0; } else { + /* Last record seen was a shareable file data extent */ info->low.rm_offset += info->low.rm_blockcount; } + info->low.rm_startblock = XFS_FSB_TO_AGBNO(mp, start_fsb); info->high.rm_startblock = -1U; info->high.rm_owner = ULLONG_MAX; diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index 453890942d9f..dba514a2c84d 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -113,7 +113,7 @@ xfs_inode_alloc( INIT_LIST_HEAD(&ip->i_ioend_list); spin_lock_init(&ip->i_ioend_lock); ip->i_next_unlinked = NULLAGINO; - ip->i_prev_unlinked = NULLAGINO; + ip->i_prev_unlinked = 0; return ip; } @@ -443,7 +443,7 @@ xfs_inodegc_queue_all( int cpu; bool ret = false; - for_each_online_cpu(cpu) { + for_each_cpu(cpu, &mp->m_inodegc_cpumask) { gc = per_cpu_ptr(mp->m_inodegc, cpu); if (!llist_empty(&gc->list)) { mod_delayed_work_on(cpu, mp->m_inodegc_wq, &gc->work, 0); @@ -463,7 +463,7 @@ xfs_inodegc_wait_all( int error = 0; flush_workqueue(mp->m_inodegc_wq); - for_each_online_cpu(cpu) { + for_each_cpu(cpu, &mp->m_inodegc_cpumask) { struct xfs_inodegc *gc; gc = per_cpu_ptr(mp->m_inodegc, cpu); @@ -803,44 +803,6 @@ out_error_or_again: } /* - * "Is this a cached inode that's also allocated?" - * - * Look up an inode by number in the given file system. If the inode is - * in cache and isn't in purgatory, return 1 if the inode is allocated - * and 0 if it is not. For all other cases (not in cache, being torn - * down, etc.), return a negative error code. - * - * The caller has to prevent inode allocation and freeing activity, - * presumably by locking the AGI buffer. This is to ensure that an - * inode cannot transition from allocated to freed until the caller is - * ready to allow that. If the inode is in an intermediate state (new, - * reclaimable, or being reclaimed), -EAGAIN will be returned; if the - * inode is not in the cache, -ENOENT will be returned. The caller must - * deal with these scenarios appropriately. - * - * This is a specialized use case for the online scrubber; if you're - * reading this, you probably want xfs_iget. - */ -int -xfs_icache_inode_is_allocated( - struct xfs_mount *mp, - struct xfs_trans *tp, - xfs_ino_t ino, - bool *inuse) -{ - struct xfs_inode *ip; - int error; - - error = xfs_iget(mp, tp, ino, XFS_IGET_INCORE, 0, &ip); - if (error) - return error; - - *inuse = !!(VFS_I(ip)->i_mode); - xfs_irele(ip); - return 0; -} - -/* * Grab the inode for reclaim exclusively. * * We have found this inode via a lookup under RCU, so the inode may have @@ -1883,9 +1845,17 @@ xfs_inodegc_worker( struct xfs_inodegc, work); struct llist_node *node = llist_del_all(&gc->list); struct xfs_inode *ip, *n; + struct xfs_mount *mp = gc->mp; unsigned int nofs_flag; - ASSERT(gc->cpu == smp_processor_id()); + /* + * Clear the cpu mask bit and ensure that we have seen the latest + * update of the gc structure associated with this CPU. This matches + * with the release semantics used when setting the cpumask bit in + * xfs_inodegc_queue. + */ + cpumask_clear_cpu(gc->cpu, &mp->m_inodegc_cpumask); + smp_mb__after_atomic(); WRITE_ONCE(gc->items, 0); @@ -1900,7 +1870,7 @@ xfs_inodegc_worker( nofs_flag = memalloc_nofs_save(); ip = llist_entry(node, struct xfs_inode, i_gclist); - trace_xfs_inodegc_worker(ip->i_mount, READ_ONCE(gc->shrinker_hits)); + trace_xfs_inodegc_worker(mp, READ_ONCE(gc->shrinker_hits)); WRITE_ONCE(gc->shrinker_hits, 0); llist_for_each_entry_safe(ip, n, node, i_gclist) { @@ -2095,6 +2065,7 @@ xfs_inodegc_queue( struct xfs_inodegc *gc; int items; unsigned int shrinker_hits; + unsigned int cpu_nr; unsigned long queue_delay = 1; trace_xfs_inode_set_need_inactive(ip); @@ -2102,18 +2073,28 @@ xfs_inodegc_queue( ip->i_flags |= XFS_NEED_INACTIVE; spin_unlock(&ip->i_flags_lock); - gc = get_cpu_ptr(mp->m_inodegc); + cpu_nr = get_cpu(); + gc = this_cpu_ptr(mp->m_inodegc); llist_add(&ip->i_gclist, &gc->list); items = READ_ONCE(gc->items); WRITE_ONCE(gc->items, items + 1); shrinker_hits = READ_ONCE(gc->shrinker_hits); /* + * Ensure the list add is always seen by anyone who finds the cpumask + * bit set. This effectively gives the cpumask bit set operation + * release ordering semantics. + */ + smp_mb__before_atomic(); + if (!cpumask_test_cpu(cpu_nr, &mp->m_inodegc_cpumask)) + cpumask_test_and_set_cpu(cpu_nr, &mp->m_inodegc_cpumask); + + /* * We queue the work while holding the current CPU so that the work * is scheduled to run on this CPU. */ if (!xfs_is_inodegc_enabled(mp)) { - put_cpu_ptr(gc); + put_cpu(); return; } @@ -2123,7 +2104,7 @@ xfs_inodegc_queue( trace_xfs_inodegc_queue(mp, __return_address); mod_delayed_work_on(current_cpu(), mp->m_inodegc_wq, &gc->work, queue_delay); - put_cpu_ptr(gc); + put_cpu(); if (xfs_inodegc_want_flush_work(ip, items, shrinker_hits)) { trace_xfs_inodegc_throttle(mp, __return_address); @@ -2132,47 +2113,6 @@ xfs_inodegc_queue( } /* - * Fold the dead CPU inodegc queue into the current CPUs queue. - */ -void -xfs_inodegc_cpu_dead( - struct xfs_mount *mp, - unsigned int dead_cpu) -{ - struct xfs_inodegc *dead_gc, *gc; - struct llist_node *first, *last; - unsigned int count = 0; - - dead_gc = per_cpu_ptr(mp->m_inodegc, dead_cpu); - cancel_delayed_work_sync(&dead_gc->work); - - if (llist_empty(&dead_gc->list)) - return; - - first = dead_gc->list.first; - last = first; - while (last->next) { - last = last->next; - count++; - } - dead_gc->list.first = NULL; - dead_gc->items = 0; - - /* Add pending work to current CPU */ - gc = get_cpu_ptr(mp->m_inodegc); - llist_add_batch(first, last, &gc->list); - count += READ_ONCE(gc->items); - WRITE_ONCE(gc->items, count); - - if (xfs_is_inodegc_enabled(mp)) { - trace_xfs_inodegc_queue(mp, __return_address); - mod_delayed_work_on(current_cpu(), mp->m_inodegc_wq, &gc->work, - 0); - } - put_cpu_ptr(gc); -} - -/* * We set the inode flag atomically with the radix tree tag. Once we get tag * lookups on the radix tree, this inode flag can go away. * @@ -2225,15 +2165,14 @@ xfs_inodegc_shrinker_count( struct shrinker *shrink, struct shrink_control *sc) { - struct xfs_mount *mp = container_of(shrink, struct xfs_mount, - m_inodegc_shrinker); + struct xfs_mount *mp = shrink->private_data; struct xfs_inodegc *gc; int cpu; if (!xfs_is_inodegc_enabled(mp)) return 0; - for_each_online_cpu(cpu) { + for_each_cpu(cpu, &mp->m_inodegc_cpumask) { gc = per_cpu_ptr(mp->m_inodegc, cpu); if (!llist_empty(&gc->list)) return XFS_INODEGC_SHRINKER_COUNT; @@ -2247,8 +2186,7 @@ xfs_inodegc_shrinker_scan( struct shrinker *shrink, struct shrink_control *sc) { - struct xfs_mount *mp = container_of(shrink, struct xfs_mount, - m_inodegc_shrinker); + struct xfs_mount *mp = shrink->private_data; struct xfs_inodegc *gc; int cpu; bool no_items = true; @@ -2258,7 +2196,7 @@ xfs_inodegc_shrinker_scan( trace_xfs_inodegc_shrinker_scan(mp, sc, __return_address); - for_each_online_cpu(cpu) { + for_each_cpu(cpu, &mp->m_inodegc_cpumask) { gc = per_cpu_ptr(mp->m_inodegc, cpu); if (!llist_empty(&gc->list)) { unsigned int h = READ_ONCE(gc->shrinker_hits); @@ -2284,13 +2222,19 @@ int xfs_inodegc_register_shrinker( struct xfs_mount *mp) { - struct shrinker *shrink = &mp->m_inodegc_shrinker; + mp->m_inodegc_shrinker = shrinker_alloc(SHRINKER_NONSLAB, + "xfs-inodegc:%s", + mp->m_super->s_id); + if (!mp->m_inodegc_shrinker) + return -ENOMEM; - shrink->count_objects = xfs_inodegc_shrinker_count; - shrink->scan_objects = xfs_inodegc_shrinker_scan; - shrink->seeks = 0; - shrink->flags = SHRINKER_NONSLAB; - shrink->batch = XFS_INODEGC_SHRINKER_BATCH; + mp->m_inodegc_shrinker->count_objects = xfs_inodegc_shrinker_count; + mp->m_inodegc_shrinker->scan_objects = xfs_inodegc_shrinker_scan; + mp->m_inodegc_shrinker->seeks = 0; + mp->m_inodegc_shrinker->batch = XFS_INODEGC_SHRINKER_BATCH; + mp->m_inodegc_shrinker->private_data = mp; - return register_shrinker(shrink, "xfs-inodegc:%s", mp->m_super->s_id); + shrinker_register(mp->m_inodegc_shrinker); + + return 0; } diff --git a/fs/xfs/xfs_icache.h b/fs/xfs/xfs_icache.h index 1dcdcb23796e..905944dafbe5 100644 --- a/fs/xfs/xfs_icache.h +++ b/fs/xfs/xfs_icache.h @@ -71,10 +71,6 @@ void xfs_inode_set_cowblocks_tag(struct xfs_inode *ip); void xfs_inode_clear_cowblocks_tag(struct xfs_inode *ip); void xfs_blockgc_worker(struct work_struct *work); - -int xfs_icache_inode_is_allocated(struct xfs_mount *mp, struct xfs_trans *tp, - xfs_ino_t ino, bool *inuse); - void xfs_blockgc_stop(struct xfs_mount *mp); void xfs_blockgc_start(struct xfs_mount *mp); @@ -83,7 +79,6 @@ void xfs_inodegc_push(struct xfs_mount *mp); int xfs_inodegc_flush(struct xfs_mount *mp); void xfs_inodegc_stop(struct xfs_mount *mp); void xfs_inodegc_start(struct xfs_mount *mp); -void xfs_inodegc_cpu_dead(struct xfs_mount *mp, unsigned int cpu); int xfs_inodegc_register_shrinker(struct xfs_mount *mp); #endif diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 9e62cc500140..c0f1c89786c2 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -843,10 +843,9 @@ xfs_init_new_inode( ip->i_df.if_nextents = 0; ASSERT(ip->i_nblocks == 0); - tv = current_time(inode); - inode->i_mtime = tv; - inode->i_atime = tv; - inode->i_ctime = tv; + tv = inode_set_ctime_current(inode); + inode_set_mtime_to_ts(inode, tv); + inode_set_atime_to_ts(inode, tv); ip->i_extsize = 0; ip->i_diflags = 0; @@ -919,6 +918,13 @@ xfs_droplink( xfs_trans_t *tp, xfs_inode_t *ip) { + if (VFS_I(ip)->i_nlink == 0) { + xfs_alert(ip->i_mount, + "%s: Attempt to drop inode (%llu) with nlink zero.", + __func__, ip->i_ino); + return -EFSCORRUPTED; + } + xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG); drop_nlink(VFS_I(ip)); @@ -1643,8 +1649,11 @@ xfs_inode_needs_inactive( if (VFS_I(ip)->i_mode == 0) return false; - /* If this is a read-only mount, don't do this (would generate I/O) */ - if (xfs_is_readonly(mp)) + /* + * If this is a read-only mount, don't do this (would generate I/O) + * unless we're in log recovery and cleaning the iunlinked list. + */ + if (xfs_is_readonly(mp) && !xlog_recovery_needed(mp->m_log)) return false; /* If the log isn't running, push inodes straight to reclaim. */ @@ -1704,8 +1713,11 @@ xfs_inactive( mp = ip->i_mount; ASSERT(!xfs_iflags_test(ip, XFS_IRECOVERY)); - /* If this is a read-only mount, don't do this (would generate I/O) */ - if (xfs_is_readonly(mp)) + /* + * If this is a read-only mount, don't do this (would generate I/O) + * unless we're in log recovery and cleaning the iunlinked list. + */ + if (xfs_is_readonly(mp) && !xlog_recovery_needed(mp->m_log)) goto out; /* Metadata inodes require explicit resource cleanup. */ @@ -1737,9 +1749,21 @@ xfs_inactive( ip->i_df.if_nextents > 0 || ip->i_delayed_blks > 0)) truncate = 1; - error = xfs_qm_dqattach(ip); - if (error) - goto out; + if (xfs_iflags_test(ip, XFS_IQUOTAUNCHECKED)) { + /* + * If this inode is being inactivated during a quotacheck and + * has not yet been scanned by quotacheck, we /must/ remove + * the dquots from the inode before inactivation changes the + * block and inode counts. Most probably this is a result of + * reloading the incore iunlinked list to purge unrecovered + * unlinked inodes. + */ + xfs_qm_dqdetach(ip); + } else { + error = xfs_qm_dqattach(ip); + if (error) + goto out; + } if (S_ISLNK(VFS_I(ip)->i_mode)) error = xfs_inactive_symlink(ip); @@ -1823,12 +1847,17 @@ xfs_iunlink_lookup( rcu_read_lock(); ip = radix_tree_lookup(&pag->pag_ici_root, agino); + if (!ip) { + /* Caller can handle inode not being in memory. */ + rcu_read_unlock(); + return NULL; + } /* - * Inode not in memory or in RCU freeing limbo should not happen. - * Warn about this and let the caller handle the failure. + * Inode in RCU freeing limbo should not happen. Warn about this and + * let the caller handle the failure. */ - if (WARN_ON_ONCE(!ip || !ip->i_ino)) { + if (WARN_ON_ONCE(!ip->i_ino)) { rcu_read_unlock(); return NULL; } @@ -1837,7 +1866,10 @@ xfs_iunlink_lookup( return ip; } -/* Update the prev pointer of the next agino. */ +/* + * Update the prev pointer of the next agino. Returns -ENOLINK if the inode + * is not in cache. + */ static int xfs_iunlink_update_backref( struct xfs_perag *pag, @@ -1852,7 +1884,8 @@ xfs_iunlink_update_backref( ip = xfs_iunlink_lookup(pag, next_agino); if (!ip) - return -EFSCORRUPTED; + return -ENOLINK; + ip->i_prev_unlinked = prev_agino; return 0; } @@ -1896,6 +1929,64 @@ xfs_iunlink_update_bucket( return 0; } +/* + * Load the inode @next_agino into the cache and set its prev_unlinked pointer + * to @prev_agino. Caller must hold the AGI to synchronize with other changes + * to the unlinked list. + */ +STATIC int +xfs_iunlink_reload_next( + struct xfs_trans *tp, + struct xfs_buf *agibp, + xfs_agino_t prev_agino, + xfs_agino_t next_agino) +{ + struct xfs_perag *pag = agibp->b_pag; + struct xfs_mount *mp = pag->pag_mount; + struct xfs_inode *next_ip = NULL; + xfs_ino_t ino; + int error; + + ASSERT(next_agino != NULLAGINO); + +#ifdef DEBUG + rcu_read_lock(); + next_ip = radix_tree_lookup(&pag->pag_ici_root, next_agino); + ASSERT(next_ip == NULL); + rcu_read_unlock(); +#endif + + xfs_info_ratelimited(mp, + "Found unrecovered unlinked inode 0x%x in AG 0x%x. Initiating recovery.", + next_agino, pag->pag_agno); + + /* + * Use an untrusted lookup just to be cautious in case the AGI has been + * corrupted and now points at a free inode. That shouldn't happen, + * but we'd rather shut down now since we're already running in a weird + * situation. + */ + ino = XFS_AGINO_TO_INO(mp, pag->pag_agno, next_agino); + error = xfs_iget(mp, tp, ino, XFS_IGET_UNTRUSTED, 0, &next_ip); + if (error) + return error; + + /* If this is not an unlinked inode, something is very wrong. */ + if (VFS_I(next_ip)->i_nlink != 0) { + error = -EFSCORRUPTED; + goto rele; + } + + next_ip->i_prev_unlinked = prev_agino; + trace_xfs_iunlink_reload_next(next_ip); +rele: + ASSERT(!(VFS_I(next_ip)->i_state & I_DONTCACHE)); + if (xfs_is_quotacheck_running(mp) && next_ip) + xfs_iflags_set(next_ip, XFS_IQUOTAUNCHECKED); + xfs_irele(next_ip); + return error; +} + static int xfs_iunlink_insert_inode( struct xfs_trans *tp, @@ -1927,6 +2018,8 @@ xfs_iunlink_insert_inode( * inode. */ error = xfs_iunlink_update_backref(pag, agino, next_agino); + if (error == -ENOLINK) + error = xfs_iunlink_reload_next(tp, agibp, agino, next_agino); if (error) return error; @@ -1942,6 +2035,7 @@ xfs_iunlink_insert_inode( } /* Point the head of the list to point to this inode. */ + ip->i_prev_unlinked = NULLAGINO; return xfs_iunlink_update_bucket(tp, pag, agibp, bucket_index, agino); } @@ -2021,6 +2115,9 @@ xfs_iunlink_remove_inode( */ error = xfs_iunlink_update_backref(pag, ip->i_prev_unlinked, ip->i_next_unlinked); + if (error == -ENOLINK) + error = xfs_iunlink_reload_next(tp, agibp, ip->i_prev_unlinked, + ip->i_next_unlinked); if (error) return error; @@ -2041,7 +2138,7 @@ xfs_iunlink_remove_inode( } ip->i_next_unlinked = NULLAGINO; - ip->i_prev_unlinked = NULLAGINO; + ip->i_prev_unlinked = 0; return error; } @@ -3530,3 +3627,134 @@ xfs_iunlock2_io_mmap( if (ip1 != ip2) inode_unlock(VFS_I(ip1)); } + +/* Drop the MMAPLOCK and the IOLOCK after a remap completes. */ +void +xfs_iunlock2_remapping( + struct xfs_inode *ip1, + struct xfs_inode *ip2) +{ + xfs_iflags_clear(ip1, XFS_IREMAPPING); + + if (ip1 != ip2) + xfs_iunlock(ip1, XFS_MMAPLOCK_SHARED); + xfs_iunlock(ip2, XFS_MMAPLOCK_EXCL); + + if (ip1 != ip2) + inode_unlock_shared(VFS_I(ip1)); + inode_unlock(VFS_I(ip2)); +} + +/* + * Reload the incore inode list for this inode. Caller should ensure that + * the link count cannot change, either by taking ILOCK_SHARED or otherwise + * preventing other threads from executing. + */ +int +xfs_inode_reload_unlinked_bucket( + struct xfs_trans *tp, + struct xfs_inode *ip) +{ + struct xfs_mount *mp = tp->t_mountp; + struct xfs_buf *agibp; + struct xfs_agi *agi; + struct xfs_perag *pag; + xfs_agnumber_t agno = XFS_INO_TO_AGNO(mp, ip->i_ino); + xfs_agino_t agino = XFS_INO_TO_AGINO(mp, ip->i_ino); + xfs_agino_t prev_agino, next_agino; + unsigned int bucket; + bool foundit = false; + int error; + + /* Grab the first inode in the list */ + pag = xfs_perag_get(mp, agno); + error = xfs_ialloc_read_agi(pag, tp, &agibp); + xfs_perag_put(pag); + if (error) + return error; + + /* + * We've taken ILOCK_SHARED and the AGI buffer lock to stabilize the + * incore unlinked list pointers for this inode. Check once more to + * see if we raced with anyone else to reload the unlinked list. + */ + if (!xfs_inode_unlinked_incomplete(ip)) { + foundit = true; + goto out_agibp; + } + + bucket = agino % XFS_AGI_UNLINKED_BUCKETS; + agi = agibp->b_addr; + + trace_xfs_inode_reload_unlinked_bucket(ip); + + xfs_info_ratelimited(mp, + "Found unrecovered unlinked inode 0x%x in AG 0x%x. Initiating list recovery.", + agino, agno); + + prev_agino = NULLAGINO; + next_agino = be32_to_cpu(agi->agi_unlinked[bucket]); + while (next_agino != NULLAGINO) { + struct xfs_inode *next_ip = NULL; + + /* Found this caller's inode, set its backlink. */ + if (next_agino == agino) { + next_ip = ip; + next_ip->i_prev_unlinked = prev_agino; + foundit = true; + goto next_inode; + } + + /* Try in-memory lookup first. */ + next_ip = xfs_iunlink_lookup(pag, next_agino); + if (next_ip) + goto next_inode; + + /* Inode not in memory, try reloading it. */ + error = xfs_iunlink_reload_next(tp, agibp, prev_agino, + next_agino); + if (error) + break; + + /* Grab the reloaded inode. */ + next_ip = xfs_iunlink_lookup(pag, next_agino); + if (!next_ip) { + /* No incore inode at all? We reloaded it... */ + ASSERT(next_ip != NULL); + error = -EFSCORRUPTED; + break; + } + +next_inode: + prev_agino = next_agino; + next_agino = next_ip->i_next_unlinked; + } + +out_agibp: + xfs_trans_brelse(tp, agibp); + /* Should have found this inode somewhere in the iunlinked bucket. */ + if (!error && !foundit) + error = -EFSCORRUPTED; + return error; +} + +/* Decide if this inode is missing its unlinked list and reload it. */ +int +xfs_inode_reload_unlinked( + struct xfs_inode *ip) +{ + struct xfs_trans *tp; + int error; + + error = xfs_trans_alloc_empty(ip->i_mount, &tp); + if (error) + return error; + + xfs_ilock(ip, XFS_ILOCK_SHARED); + if (xfs_inode_unlinked_incomplete(ip)) + error = xfs_inode_reload_unlinked_bucket(tp, ip); + xfs_iunlock(ip, XFS_ILOCK_SHARED); + xfs_trans_cancel(tp); + + return error; +} diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index 7547caf2f2ab..3beb470f1892 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -68,8 +68,21 @@ typedef struct xfs_inode { uint64_t i_diflags2; /* XFS_DIFLAG2_... */ struct timespec64 i_crtime; /* time created */ - /* unlinked list pointers */ + /* + * Unlinked list pointers. These point to the next and previous inodes + * in the AGI unlinked bucket list, respectively. These fields can + * only be updated with the AGI locked. + * + * i_next_unlinked caches di_next_unlinked. + */ xfs_agino_t i_next_unlinked; + + /* + * If the inode is not on an unlinked list, this field is zero. If the + * inode is the first element in an unlinked list, this field is + * NULLAGINO. Otherwise, i_prev_unlinked points to the previous inode + * in the unlinked list. + */ xfs_agino_t i_prev_unlinked; /* VFS inode */ @@ -81,6 +94,11 @@ typedef struct xfs_inode { struct list_head i_ioend_list; } xfs_inode_t; +static inline bool xfs_inode_on_unlinked_list(const struct xfs_inode *ip) +{ + return ip->i_prev_unlinked != 0; +} + static inline bool xfs_inode_has_attr_fork(struct xfs_inode *ip) { return ip->i_forkoff > 0; @@ -326,6 +344,17 @@ static inline bool xfs_inode_has_large_extent_counts(struct xfs_inode *ip) */ #define XFS_INACTIVATING (1 << 13) +/* Quotacheck is running but inode has not been added to quota counts. */ +#define XFS_IQUOTAUNCHECKED (1 << 14) + +/* + * Remap in progress. Callers that wish to update file data while + * holding a shared IOLOCK or MMAPLOCK must drop the lock and retake + * the lock in exclusive mode. Relocking the file will block until + * IREMAPPING is cleared. + */ +#define XFS_IREMAPPING (1U << 15) + /* All inode state flags related to inode reclaim. */ #define XFS_ALL_IRECLAIM_FLAGS (XFS_IRECLAIMABLE | \ XFS_IRECLAIM | \ @@ -340,7 +369,7 @@ static inline bool xfs_inode_has_large_extent_counts(struct xfs_inode *ip) #define XFS_IRECLAIM_RESET_FLAGS \ (XFS_IRECLAIMABLE | XFS_IRECLAIM | \ XFS_IDIRTY_RELEASE | XFS_ITRUNCATED | XFS_NEED_INACTIVE | \ - XFS_INACTIVATING) + XFS_INACTIVATING | XFS_IQUOTAUNCHECKED) /* * Flags for inode locking. @@ -540,6 +569,14 @@ extern void xfs_setup_inode(struct xfs_inode *ip); extern void xfs_setup_iops(struct xfs_inode *ip); extern void xfs_diflags_to_iflags(struct xfs_inode *ip, bool init); +static inline void xfs_update_stable_writes(struct xfs_inode *ip) +{ + if (bdev_stable_writes(xfs_inode_buftarg(ip)->bt_bdev)) + mapping_set_stable_writes(VFS_I(ip)->i_mapping); + else + mapping_clear_stable_writes(VFS_I(ip)->i_mapping); +} + /* * When setting up a newly allocated inode, we need to call * xfs_finish_inode_setup() once the inode is fully instantiated at @@ -574,5 +611,15 @@ void xfs_end_io(struct work_struct *work); int xfs_ilock2_io_mmap(struct xfs_inode *ip1, struct xfs_inode *ip2); void xfs_iunlock2_io_mmap(struct xfs_inode *ip1, struct xfs_inode *ip2); +void xfs_iunlock2_remapping(struct xfs_inode *ip1, struct xfs_inode *ip2); + +static inline bool +xfs_inode_unlinked_incomplete( + struct xfs_inode *ip) +{ + return VFS_I(ip)->i_nlink == 0 && !xfs_inode_on_unlinked_list(ip); +} +int xfs_inode_reload_unlinked_bucket(struct xfs_trans *tp, struct xfs_inode *ip); +int xfs_inode_reload_unlinked(struct xfs_inode *ip); #endif /* __XFS_INODE_H__ */ diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c index 91c847a84e10..cd7803fda8b1 100644 --- a/fs/xfs/xfs_inode_item.c +++ b/fs/xfs/xfs_inode_item.c @@ -19,6 +19,7 @@ #include "xfs_log.h" #include "xfs_log_priv.h" #include "xfs_error.h" +#include "xfs_rtbitmap.h" #include <linux/iversion.h> @@ -107,7 +108,7 @@ xfs_inode_item_precommit( */ if ((ip->i_diflags & XFS_DIFLAG_RTINHERIT) && (ip->i_diflags & XFS_DIFLAG_EXTSZINHERIT) && - (ip->i_extsize % ip->i_mount->m_sb.sb_rextsize) > 0) { + xfs_extlen_to_rtxmod(ip->i_mount, ip->i_extsize) > 0) { ip->i_diflags &= ~(XFS_DIFLAG_EXTSIZE | XFS_DIFLAG_EXTSZINHERIT); ip->i_extsize = 0; @@ -526,9 +527,9 @@ xfs_inode_to_log_dinode( to->di_projid_hi = ip->i_projid >> 16; memset(to->di_pad3, 0, sizeof(to->di_pad3)); - to->di_atime = xfs_inode_to_log_dinode_ts(ip, inode->i_atime); - to->di_mtime = xfs_inode_to_log_dinode_ts(ip, inode->i_mtime); - to->di_ctime = xfs_inode_to_log_dinode_ts(ip, inode->i_ctime); + to->di_atime = xfs_inode_to_log_dinode_ts(ip, inode_get_atime(inode)); + to->di_mtime = xfs_inode_to_log_dinode_ts(ip, inode_get_mtime(inode)); + to->di_ctime = xfs_inode_to_log_dinode_ts(ip, inode_get_ctime(inode)); to->di_nlink = inode->i_nlink; to->di_gen = inode->i_generation; to->di_mode = inode->i_mode; diff --git a/fs/xfs/xfs_inode_item_recover.c b/fs/xfs/xfs_inode_item_recover.c index 0e5dba2343ea..144198a6b270 100644 --- a/fs/xfs/xfs_inode_item_recover.c +++ b/fs/xfs/xfs_inode_item_recover.c @@ -286,6 +286,7 @@ xlog_recover_inode_commit_pass2( struct xfs_log_dinode *ldip; uint isize; int need_free = 0; + xfs_failaddr_t fa; if (item->ri_buf[0].i_len == sizeof(struct xfs_inode_log_format)) { in_f = item->ri_buf[0].i_addr; @@ -369,24 +370,26 @@ xlog_recover_inode_commit_pass2( * superblock flag to determine whether we need to look at di_flushiter * to skip replay when the on disk inode is newer than the log one */ - if (!xfs_has_v3inodes(mp) && - ldip->di_flushiter < be16_to_cpu(dip->di_flushiter)) { - /* - * Deal with the wrap case, DI_MAX_FLUSH is less - * than smaller numbers - */ - if (be16_to_cpu(dip->di_flushiter) == DI_MAX_FLUSH && - ldip->di_flushiter < (DI_MAX_FLUSH >> 1)) { - /* do nothing */ - } else { - trace_xfs_log_recover_inode_skip(log, in_f); - error = 0; - goto out_release; + if (!xfs_has_v3inodes(mp)) { + if (ldip->di_flushiter < be16_to_cpu(dip->di_flushiter)) { + /* + * Deal with the wrap case, DI_MAX_FLUSH is less + * than smaller numbers + */ + if (be16_to_cpu(dip->di_flushiter) == DI_MAX_FLUSH && + ldip->di_flushiter < (DI_MAX_FLUSH >> 1)) { + /* do nothing */ + } else { + trace_xfs_log_recover_inode_skip(log, in_f); + error = 0; + goto out_release; + } } + + /* Take the opportunity to reset the flush iteration count */ + ldip->di_flushiter = 0; } - /* Take the opportunity to reset the flush iteration count */ - ldip->di_flushiter = 0; if (unlikely(S_ISREG(ldip->di_mode))) { if ((ldip->di_format != XFS_DINODE_FMT_EXTENTS) && @@ -528,8 +531,19 @@ out_owner_change: (dip->di_mode != 0)) error = xfs_recover_inode_owner_change(mp, dip, in_f, buffer_list); - /* re-generate the checksum. */ + /* re-generate the checksum and validate the recovered inode. */ xfs_dinode_calc_crc(log->l_mp, dip); + fa = xfs_dinode_verify(log->l_mp, in_f->ilf_ino, dip); + if (fa) { + XFS_CORRUPTION_ERROR( + "Bad dinode after recovery", + XFS_ERRLEVEL_LOW, mp, dip, sizeof(*dip)); + xfs_alert(mp, + "Metadata corruption detected at %pS, inode 0x%llx", + fa, in_f->ilf_ino); + error = -EFSCORRUPTED; + goto out_release; + } ASSERT(bp->b_mount == mp); bp->b_flags |= _XBF_LOGRECOVERY; diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index 55bb01173cde..6c3919687ea6 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -38,6 +38,7 @@ #include "xfs_reflink.h" #include "xfs_ioctl.h" #include "xfs_xattr.h" +#include "xfs_rtbitmap.h" #include <linux/mount.h> #include <linux/namei.h> @@ -1004,7 +1005,7 @@ xfs_fill_fsxattr( * later. */ if ((ip->i_diflags & XFS_DIFLAG_RTINHERIT) && - ip->i_extsize % mp->m_sb.sb_rextsize > 0) { + xfs_extlen_to_rtxmod(mp, ip->i_extsize) > 0) { fa->fsx_xflags &= ~(FS_XFLAG_EXTSIZE | FS_XFLAG_EXTSZINHERIT); fa->fsx_extsize = 0; @@ -1120,23 +1121,25 @@ xfs_ioctl_setattr_xflags( struct fileattr *fa) { struct xfs_mount *mp = ip->i_mount; + bool rtflag = (fa->fsx_xflags & FS_XFLAG_REALTIME); uint64_t i_flags2; - /* Can't change realtime flag if any extents are allocated. */ - if ((ip->i_df.if_nextents || ip->i_delayed_blks) && - XFS_IS_REALTIME_INODE(ip) != (fa->fsx_xflags & FS_XFLAG_REALTIME)) - return -EINVAL; + if (rtflag != XFS_IS_REALTIME_INODE(ip)) { + /* Can't change realtime flag if any extents are allocated. */ + if (ip->i_df.if_nextents || ip->i_delayed_blks) + return -EINVAL; + } - /* If realtime flag is set then must have realtime device */ - if (fa->fsx_xflags & FS_XFLAG_REALTIME) { + if (rtflag) { + /* If realtime flag is set then must have realtime device */ if (mp->m_sb.sb_rblocks == 0 || mp->m_sb.sb_rextsize == 0 || - (ip->i_extsize % mp->m_sb.sb_rextsize)) + xfs_extlen_to_rtxmod(mp, ip->i_extsize)) return -EINVAL; - } - /* Clear reflink if we are actually able to set the rt flag. */ - if ((fa->fsx_xflags & FS_XFLAG_REALTIME) && xfs_is_reflink_inode(ip)) - ip->i_diflags2 &= ~XFS_DIFLAG2_REFLINK; + /* Clear reflink if we are actually able to set the rt flag. */ + if (xfs_is_reflink_inode(ip)) + ip->i_diflags2 &= ~XFS_DIFLAG2_REFLINK; + } /* diflags2 only valid for v3 inodes. */ i_flags2 = xfs_flags2diflags2(ip, fa->fsx_xflags); @@ -1147,6 +1150,14 @@ xfs_ioctl_setattr_xflags( ip->i_diflags2 = i_flags2; xfs_diflags_to_iflags(ip, false); + + /* + * Make the stable writes flag match that of the device the inode + * resides on when flipping the RT flag. + */ + if (rtflag != XFS_IS_REALTIME_INODE(ip) && S_ISREG(VFS_I(ip)->i_mode)) + xfs_update_stable_writes(ip); + xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG); xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); XFS_STATS_INC(mp, xs_ig_attrchg); diff --git a/fs/xfs/xfs_ioctl32.h b/fs/xfs/xfs_ioctl32.h index c14852362fce..052d0e888c27 100644 --- a/fs/xfs/xfs_ioctl32.h +++ b/fs/xfs/xfs_ioctl32.h @@ -22,7 +22,7 @@ /* * On intel, even if sizes match, alignment and/or padding may differ. */ -#if defined(CONFIG_IA64) || defined(CONFIG_X86_64) +#if defined(CONFIG_X86_64) #define BROKEN_X86_ALIGNMENT #define __compat_packed __attribute__((packed)) #else diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index 24718adb3c16..a0d77f5f512e 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -572,9 +572,9 @@ xfs_vn_getattr( stat->uid = vfsuid_into_kuid(vfsuid); stat->gid = vfsgid_into_kgid(vfsgid); stat->ino = ip->i_ino; - stat->atime = inode->i_atime; - stat->mtime = inode->i_mtime; - stat->ctime = inode->i_ctime; + stat->atime = inode_get_atime(inode); + stat->mtime = inode_get_mtime(inode); + stat->ctime = inode_get_ctime(inode); stat->blocks = XFS_FSB_TO_BB(mp, ip->i_nblocks + ip->i_delayed_blks); if (xfs_has_v3inodes(mp)) { @@ -584,6 +584,11 @@ xfs_vn_getattr( } } + if ((request_mask & STATX_CHANGE_COOKIE) && IS_I_VERSION(inode)) { + stat->change_cookie = inode_query_iversion(inode); + stat->result_mask |= STATX_CHANGE_COOKIE; + } + /* * Note: If you add another clause to set an attribute flag, please * update attributes_mask below. @@ -1029,7 +1034,6 @@ xfs_vn_setattr( STATIC int xfs_vn_update_time( struct inode *inode, - struct timespec64 *now, int flags) { struct xfs_inode *ip = XFS_I(inode); @@ -1037,13 +1041,16 @@ xfs_vn_update_time( int log_flags = XFS_ILOG_TIMESTAMP; struct xfs_trans *tp; int error; + struct timespec64 now; trace_xfs_update_time(ip); if (inode->i_sb->s_flags & SB_LAZYTIME) { if (!((flags & S_VERSION) && - inode_maybe_inc_iversion(inode, false))) - return generic_update_time(inode, now, flags); + inode_maybe_inc_iversion(inode, false))) { + generic_update_time(inode, flags); + return 0; + } /* Capture the iversion update that just occurred */ log_flags |= XFS_ILOG_CORE; @@ -1054,12 +1061,15 @@ xfs_vn_update_time( return error; xfs_ilock(ip, XFS_ILOCK_EXCL); - if (flags & S_CTIME) - inode->i_ctime = *now; + if (flags & (S_CTIME|S_MTIME)) + now = inode_set_ctime_current(inode); + else + now = current_time(inode); + if (flags & S_MTIME) - inode->i_mtime = *now; + inode_set_mtime_to_ts(inode, now); if (flags & S_ATIME) - inode->i_atime = *now; + inode_set_atime_to_ts(inode, now); xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); xfs_trans_log_inode(tp, ip, log_flags); @@ -1289,6 +1299,13 @@ xfs_setup_inode( mapping_set_gfp_mask(inode->i_mapping, (gfp_mask & ~(__GFP_FS))); /* + * For real-time inodes update the stable write flags to that of the RT + * device instead of the data device. + */ + if (S_ISREG(inode->i_mode) && XFS_IS_REALTIME_INODE(ip)) + xfs_update_stable_writes(ip); + + /* * If there is no attribute fork no ACL can exist on this inode, * and it can't have any file capabilities attached to it either. */ diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c index f225413a993c..14462614fcc8 100644 --- a/fs/xfs/xfs_itable.c +++ b/fs/xfs/xfs_itable.c @@ -80,6 +80,17 @@ xfs_bulkstat_one_int( if (error) goto out; + /* Reload the incore unlinked list to avoid failure in inodegc. */ + if (xfs_inode_unlinked_incomplete(ip)) { + error = xfs_inode_reload_unlinked_bucket(tp, ip); + if (error) { + xfs_iunlock(ip, XFS_ILOCK_SHARED); + xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); + xfs_irele(ip); + return error; + } + } + ASSERT(ip != NULL); ASSERT(ip->i_imap.im_blkno != 0); inode = VFS_I(ip); @@ -96,12 +107,12 @@ xfs_bulkstat_one_int( buf->bs_size = ip->i_disk_size; buf->bs_nlink = inode->i_nlink; - buf->bs_atime = inode->i_atime.tv_sec; - buf->bs_atime_nsec = inode->i_atime.tv_nsec; - buf->bs_mtime = inode->i_mtime.tv_sec; - buf->bs_mtime_nsec = inode->i_mtime.tv_nsec; - buf->bs_ctime = inode->i_ctime.tv_sec; - buf->bs_ctime_nsec = inode->i_ctime.tv_nsec; + buf->bs_atime = inode_get_atime_sec(inode); + buf->bs_atime_nsec = inode_get_atime_nsec(inode); + buf->bs_mtime = inode_get_mtime_sec(inode); + buf->bs_mtime_nsec = inode_get_mtime_nsec(inode); + buf->bs_ctime = inode_get_ctime_sec(inode); + buf->bs_ctime_nsec = inode_get_ctime_nsec(inode); buf->bs_gen = inode->i_generation; buf->bs_mode = inode->i_mode; diff --git a/fs/xfs/xfs_linux.h b/fs/xfs/xfs_linux.h index 74dcb05069e8..d7873e0360f0 100644 --- a/fs/xfs/xfs_linux.h +++ b/fs/xfs/xfs_linux.h @@ -63,6 +63,7 @@ typedef __u32 xfs_nlink_t; #include <linux/rhashtable.h> #include <linux/xattr.h> #include <linux/mnt_idmapping.h> +#include <linux/debugfs.h> #include <asm/page.h> #include <asm/div64.h> @@ -197,6 +198,18 @@ static inline uint64_t howmany_64(uint64_t x, uint32_t y) return x; } +/* If @b is a power of 2, return log2(b). Else return -1. */ +static inline int8_t log2_if_power2(unsigned long b) +{ + return is_power_of_2(b) ? ilog2(b) : -1; +} + +/* If @b is a power of 2, return a mask of the lower bits, else return zero. */ +static inline unsigned long long mask64_if_power2(unsigned long b) +{ + return is_power_of_2(b) ? b - 1 : 0; +} + int xfs_rw_bdev(struct block_device *bdev, sector_t sector, unsigned int count, char *data, enum req_op op); diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 79004d193e54..ee206facf0dc 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -715,15 +715,7 @@ xfs_log_mount( * just worked. */ if (!xfs_has_norecovery(mp)) { - /* - * log recovery ignores readonly state and so we need to clear - * mount-based read only state so it can write to disk. - */ - bool readonly = test_and_clear_bit(XFS_OPSTATE_READONLY, - &mp->m_opstate); error = xlog_recover(log); - if (readonly) - set_bit(XFS_OPSTATE_READONLY, &mp->m_opstate); if (error) { xfs_warn(mp, "log mount/recovery failed: error %d", error); @@ -772,7 +764,6 @@ xfs_log_mount_finish( struct xfs_mount *mp) { struct xlog *log = mp->m_log; - bool readonly; int error = 0; if (xfs_has_norecovery(mp)) { @@ -781,12 +772,6 @@ xfs_log_mount_finish( } /* - * log recovery ignores readonly state and so we need to clear - * mount-based read only state so it can write to disk. - */ - readonly = test_and_clear_bit(XFS_OPSTATE_READONLY, &mp->m_opstate); - - /* * During the second phase of log recovery, we need iget and * iput to behave like they do for an active filesystem. * xfs_fs_drop_inode needs to be able to prevent the deletion @@ -835,8 +820,6 @@ xfs_log_mount_finish( xfs_buftarg_drain(mp->m_ddev_targp); clear_bit(XLOG_RECOVERY_NEEDED, &log->l_opstate); - if (readonly) - set_bit(XFS_OPSTATE_READONLY, &mp->m_opstate); /* Make sure the log is dead if we're returning failure. */ ASSERT(!error || xlog_is_shutdown(log)); @@ -1910,9 +1893,7 @@ xlog_write_iclog( * the buffer manually, the code needs to be kept in sync * with the I/O completion path. */ - xlog_state_done_syncing(iclog); - up(&iclog->ic_sema); - return; + goto sync; } /* @@ -1942,20 +1923,17 @@ xlog_write_iclog( * avoid shutdown re-entering this path and erroring out again. */ if (log->l_targ != log->l_mp->m_ddev_targp && - blkdev_issue_flush(log->l_mp->m_ddev_targp->bt_bdev)) { - xlog_force_shutdown(log, SHUTDOWN_LOG_IO_ERROR); - return; - } + blkdev_issue_flush(log->l_mp->m_ddev_targp->bt_bdev)) + goto shutdown; } if (iclog->ic_flags & XLOG_ICL_NEED_FUA) iclog->ic_bio.bi_opf |= REQ_FUA; iclog->ic_flags &= ~(XLOG_ICL_NEED_FLUSH | XLOG_ICL_NEED_FUA); - if (xlog_map_iclog_data(&iclog->ic_bio, iclog->ic_data, count)) { - xlog_force_shutdown(log, SHUTDOWN_LOG_IO_ERROR); - return; - } + if (xlog_map_iclog_data(&iclog->ic_bio, iclog->ic_data, count)) + goto shutdown; + if (is_vmalloc_addr(iclog->ic_data)) flush_kernel_vmap_range(iclog->ic_data, count); @@ -1976,6 +1954,12 @@ xlog_write_iclog( } submit_bio(&iclog->ic_bio); + return; +shutdown: + xlog_force_shutdown(log, SHUTDOWN_LOG_IO_ERROR); +sync: + xlog_state_done_syncing(iclog); + up(&iclog->ic_sema); } /* diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c index eccbfb99e894..67a99d94701e 100644 --- a/fs/xfs/xfs_log_cil.c +++ b/fs/xfs/xfs_log_cil.c @@ -16,8 +16,7 @@ #include "xfs_log.h" #include "xfs_log_priv.h" #include "xfs_trace.h" - -struct workqueue_struct *xfs_discard_wq; +#include "xfs_discard.h" /* * Allocate a new ticket. Failing to get a new ticket makes it really hard to @@ -103,7 +102,7 @@ xlog_cil_ctx_alloc(void) ctx = kmem_zalloc(sizeof(*ctx), KM_NOFS); INIT_LIST_HEAD(&ctx->committing); - INIT_LIST_HEAD(&ctx->busy_extents); + INIT_LIST_HEAD(&ctx->busy_extents.extent_list); INIT_LIST_HEAD(&ctx->log_items); INIT_LIST_HEAD(&ctx->lv_chain); INIT_WORK(&ctx->push_work, xlog_cil_push_work); @@ -124,7 +123,7 @@ xlog_cil_push_pcp_aggregate( struct xlog_cil_pcp *cilpcp; int cpu; - for_each_online_cpu(cpu) { + for_each_cpu(cpu, &ctx->cil_pcpmask) { cilpcp = per_cpu_ptr(cil->xc_pcp, cpu); ctx->ticket->t_curr_res += cilpcp->space_reserved; @@ -132,7 +131,7 @@ xlog_cil_push_pcp_aggregate( if (!list_empty(&cilpcp->busy_extents)) { list_splice_init(&cilpcp->busy_extents, - &ctx->busy_extents); + &ctx->busy_extents.extent_list); } if (!list_empty(&cilpcp->log_items)) list_splice_init(&cilpcp->log_items, &ctx->log_items); @@ -165,7 +164,13 @@ xlog_cil_insert_pcp_aggregate( if (!test_and_clear_bit(XLOG_CIL_PCP_SPACE, &cil->xc_flags)) return; - for_each_online_cpu(cpu) { + /* + * We can race with other cpus setting cil_pcpmask. However, we've + * atomically cleared PCP_SPACE which forces other threads to add to + * the global space used count. cil_pcpmask is a superset of cilpcp + * structures that could have a nonzero space_used. + */ + for_each_cpu(cpu, &ctx->cil_pcpmask) { int old, prev; cilpcp = per_cpu_ptr(cil->xc_pcp, cpu); @@ -554,6 +559,7 @@ xlog_cil_insert_items( int iovhdr_res = 0, split_res = 0, ctx_res = 0; int space_used; int order; + unsigned int cpu_nr; struct xlog_cil_pcp *cilpcp; ASSERT(tp); @@ -577,7 +583,12 @@ xlog_cil_insert_items( * can't be scheduled away between split sample/update operations that * are done without outside locking to serialise them. */ - cilpcp = get_cpu_ptr(cil->xc_pcp); + cpu_nr = get_cpu(); + cilpcp = this_cpu_ptr(cil->xc_pcp); + + /* Tell the future push that there was work added by this CPU. */ + if (!cpumask_test_cpu(cpu_nr, &ctx->cil_pcpmask)) + cpumask_test_and_set_cpu(cpu_nr, &ctx->cil_pcpmask); /* * We need to take the CIL checkpoint unit reservation on the first @@ -663,7 +674,7 @@ xlog_cil_insert_items( continue; list_add_tail(&lip->li_cil, &cilpcp->log_items); } - put_cpu_ptr(cilpcp); + put_cpu(); /* * If we've overrun the reservation, dump the tx details before we move @@ -696,76 +707,6 @@ xlog_cil_free_logvec( } } -static void -xlog_discard_endio_work( - struct work_struct *work) -{ - struct xfs_cil_ctx *ctx = - container_of(work, struct xfs_cil_ctx, discard_endio_work); - struct xfs_mount *mp = ctx->cil->xc_log->l_mp; - - xfs_extent_busy_clear(mp, &ctx->busy_extents, false); - kmem_free(ctx); -} - -/* - * Queue up the actual completion to a thread to avoid IRQ-safe locking for - * pagb_lock. Note that we need a unbounded workqueue, otherwise we might - * get the execution delayed up to 30 seconds for weird reasons. - */ -static void -xlog_discard_endio( - struct bio *bio) -{ - struct xfs_cil_ctx *ctx = bio->bi_private; - - INIT_WORK(&ctx->discard_endio_work, xlog_discard_endio_work); - queue_work(xfs_discard_wq, &ctx->discard_endio_work); - bio_put(bio); -} - -static void -xlog_discard_busy_extents( - struct xfs_mount *mp, - struct xfs_cil_ctx *ctx) -{ - struct list_head *list = &ctx->busy_extents; - struct xfs_extent_busy *busyp; - struct bio *bio = NULL; - struct blk_plug plug; - int error = 0; - - ASSERT(xfs_has_discard(mp)); - - blk_start_plug(&plug); - list_for_each_entry(busyp, list, list) { - trace_xfs_discard_extent(mp, busyp->agno, busyp->bno, - busyp->length); - - error = __blkdev_issue_discard(mp->m_ddev_targp->bt_bdev, - XFS_AGB_TO_DADDR(mp, busyp->agno, busyp->bno), - XFS_FSB_TO_BB(mp, busyp->length), - GFP_NOFS, &bio); - if (error && error != -EOPNOTSUPP) { - xfs_info(mp, - "discard failed for extent [0x%llx,%u], error %d", - (unsigned long long)busyp->bno, - busyp->length, - error); - break; - } - } - - if (bio) { - bio->bi_private = ctx; - bio->bi_end_io = xlog_discard_endio; - submit_bio(bio); - } else { - xlog_discard_endio_work(&ctx->discard_endio_work); - } - blk_finish_plug(&plug); -} - /* * Mark all items committed and clear busy extents. We free the log vector * chains in a separate pass so that we unpin the log items as quickly as @@ -795,8 +736,8 @@ xlog_cil_committed( xfs_trans_committed_bulk(ctx->cil->xc_log->l_ailp, &ctx->lv_chain, ctx->start_lsn, abort); - xfs_extent_busy_sort(&ctx->busy_extents); - xfs_extent_busy_clear(mp, &ctx->busy_extents, + xfs_extent_busy_sort(&ctx->busy_extents.extent_list); + xfs_extent_busy_clear(mp, &ctx->busy_extents.extent_list, xfs_has_discard(mp) && !abort); spin_lock(&ctx->cil->xc_push_lock); @@ -805,10 +746,14 @@ xlog_cil_committed( xlog_cil_free_logvec(&ctx->lv_chain); - if (!list_empty(&ctx->busy_extents)) - xlog_discard_busy_extents(mp, ctx); - else - kmem_free(ctx); + if (!list_empty(&ctx->busy_extents.extent_list)) { + ctx->busy_extents.mount = mp; + ctx->busy_extents.owner = ctx; + xfs_discard_extents(mp, &ctx->busy_extents); + return; + } + + kmem_free(ctx); } void @@ -1791,38 +1736,6 @@ out_shutdown: } /* - * Move dead percpu state to the relevant CIL context structures. - * - * We have to lock the CIL context here to ensure that nothing is modifying - * the percpu state, either addition or removal. Both of these are done under - * the CIL context lock, so grabbing that exclusively here will ensure we can - * safely drain the cilpcp for the CPU that is dying. - */ -void -xlog_cil_pcp_dead( - struct xlog *log, - unsigned int cpu) -{ - struct xfs_cil *cil = log->l_cilp; - struct xlog_cil_pcp *cilpcp = per_cpu_ptr(cil->xc_pcp, cpu); - struct xfs_cil_ctx *ctx; - - down_write(&cil->xc_ctx_lock); - ctx = cil->xc_ctx; - if (ctx->ticket) - ctx->ticket->t_curr_res += cilpcp->space_reserved; - cilpcp->space_reserved = 0; - - if (!list_empty(&cilpcp->log_items)) - list_splice_init(&cilpcp->log_items, &ctx->log_items); - if (!list_empty(&cilpcp->busy_extents)) - list_splice_init(&cilpcp->busy_extents, &ctx->busy_extents); - atomic_add(cilpcp->space_used, &ctx->space_used); - cilpcp->space_used = 0; - up_write(&cil->xc_ctx_lock); -} - -/* * Perform initial CIL structure initialisation. */ int diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h index 1bd2963e8fbd..fa3ad1d7b31c 100644 --- a/fs/xfs/xfs_log_priv.h +++ b/fs/xfs/xfs_log_priv.h @@ -6,6 +6,8 @@ #ifndef __XFS_LOG_PRIV_H__ #define __XFS_LOG_PRIV_H__ +#include "xfs_extent_busy.h" /* for struct xfs_busy_extents */ + struct xfs_buf; struct xlog; struct xlog_ticket; @@ -223,14 +225,19 @@ struct xfs_cil_ctx { struct xlog_in_core *commit_iclog; struct xlog_ticket *ticket; /* chkpt ticket */ atomic_t space_used; /* aggregate size of regions */ - struct list_head busy_extents; /* busy extents in chkpt */ + struct xfs_busy_extents busy_extents; struct list_head log_items; /* log items in chkpt */ struct list_head lv_chain; /* logvecs being pushed */ struct list_head iclog_entry; struct list_head committing; /* ctx committing list */ - struct work_struct discard_endio_work; struct work_struct push_work; atomic_t order_id; + + /* + * CPUs that could have added items to the percpu CIL data. Access is + * coordinated with xc_ctx_lock. + */ + struct cpumask cil_pcpmask; }; /* @@ -278,9 +285,6 @@ struct xfs_cil { wait_queue_head_t xc_push_wait; /* background push throttle */ void __percpu *xc_pcp; /* percpu CIL structures */ -#ifdef CONFIG_HOTPLUG_CPU - struct list_head xc_pcp_list; -#endif } ____cacheline_aligned_in_smp; /* xc_flags bit values */ @@ -705,9 +709,4 @@ xlog_kvmalloc( return p; } -/* - * CIL CPU dead notifier - */ -void xlog_cil_pcp_dead(struct xlog *log, unsigned int cpu); - #endif /* __XFS_LOG_PRIV_H__ */ diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 82c81d20459d..a1e18b24971a 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -329,7 +329,7 @@ xlog_find_verify_cycle( * try a smaller size. We need to be able to read at least * a log sector, or we're out of luck. */ - bufblks = 1 << ffs(nbblks); + bufblks = roundup_pow_of_two(nbblks); while (bufblks > log->l_logBBsize) bufblks >>= 1; while (!(buffer = xlog_alloc_buffer(log, bufblks))) { @@ -1528,7 +1528,7 @@ xlog_write_log_records( * a smaller size. We need to be able to write at least a * log sector, or we're out of luck. */ - bufblks = 1 << ffs(blocks); + bufblks = roundup_pow_of_two(blocks); while (bufblks > log->l_logBBsize) bufblks >>= 1; while (!(buffer = xlog_alloc_buffer(log, bufblks))) { @@ -2511,7 +2511,7 @@ xlog_abort_defer_ops( list_for_each_entry_safe(dfc, next, capture_list, dfc_list) { list_del_init(&dfc->dfc_list); - xfs_defer_ops_capture_free(mp, dfc); + xfs_defer_ops_capture_abort(mp, dfc); } } diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index fb87ffb48f7f..aed5be5508fe 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -34,6 +34,7 @@ #include "xfs_health.h" #include "xfs_trace.h" #include "xfs_ag.h" +#include "scrub/stats.h" static DEFINE_MUTEX(xfs_uuid_table_mutex); static int xfs_uuid_table_size; @@ -716,9 +717,11 @@ xfs_mountfs( if (error) goto out_remove_sysfs; + xchk_stats_register(mp->m_scrub_stats, mp->m_debugfs); + error = xfs_error_sysfs_init(mp); if (error) - goto out_del_stats; + goto out_remove_scrub_stats; error = xfs_errortag_init(mp); if (error) @@ -1018,7 +1021,7 @@ xfs_mountfs( out_log_dealloc: xfs_log_mount_cancel(mp); out_inodegc_shrinker: - unregister_shrinker(&mp->m_inodegc_shrinker); + shrinker_free(mp->m_inodegc_shrinker); out_fail_wait: if (mp->m_logdev_targp && mp->m_logdev_targp != mp->m_ddev_targp) xfs_buftarg_drain(mp->m_logdev_targp); @@ -1033,7 +1036,8 @@ xfs_mountfs( xfs_errortag_del(mp); out_remove_error_sysfs: xfs_error_sysfs_del(mp); - out_del_stats: + out_remove_scrub_stats: + xchk_stats_unregister(mp->m_scrub_stats); xfs_sysfs_del(&mp->m_stats.xs_kobj); out_remove_sysfs: xfs_sysfs_del(&mp->m_kobj); @@ -1100,11 +1104,12 @@ xfs_unmountfs( #if defined(DEBUG) xfs_errortag_clearall(mp); #endif - unregister_shrinker(&mp->m_inodegc_shrinker); + shrinker_free(mp->m_inodegc_shrinker); xfs_free_perag(mp); xfs_errortag_del(mp); xfs_error_sysfs_del(mp); + xchk_stats_unregister(mp->m_scrub_stats); xfs_sysfs_del(&mp->m_stats.xs_kobj); xfs_sysfs_del(&mp->m_kobj); } diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index e2866e7fa60c..503fe3c7edbf 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -60,6 +60,7 @@ struct xfs_error_cfg { * Per-cpu deferred inode inactivation GC lists. */ struct xfs_inodegc { + struct xfs_mount *mp; struct llist_head list; struct delayed_work work; int error; @@ -67,9 +68,7 @@ struct xfs_inodegc { /* approximate count of inodes in the list */ unsigned int items; unsigned int shrinker_hits; -#if defined(DEBUG) || defined(XFS_WARN) unsigned int cpu; -#endif }; /* @@ -98,14 +97,13 @@ typedef struct xfs_mount { xfs_buftarg_t *m_ddev_targp; /* saves taking the address */ xfs_buftarg_t *m_logdev_targp;/* ptr to log device */ xfs_buftarg_t *m_rtdev_targp; /* ptr to rt device */ - struct list_head m_mount_list; /* global mount list */ void __percpu *m_inodegc; /* percpu inodegc structures */ /* * Optional cache of rt summary level per bitmap block with the - * invariant that m_rsum_cache[bbno] <= the minimum i for which - * rsum[i][bbno] != 0. Reads and writes are serialized by the rsumip - * inode lock. + * invariant that m_rsum_cache[bbno] > the maximum i for which + * rsum[i][bbno] != 0, or 0 if rsum[i][bbno] == 0 for all i. + * Reads and writes are serialized by the rsumip inode lock. */ uint8_t *m_rsum_cache; struct xfs_mru_cache *m_filestream; /* per-mount filestream data */ @@ -121,6 +119,7 @@ typedef struct xfs_mount { uint8_t m_blkbb_log; /* blocklog - BBSHIFT */ uint8_t m_agno_log; /* log #ag's */ uint8_t m_sectbb_log; /* sectlog - BBSHIFT */ + int8_t m_rtxblklog; /* log2 of rextsize, if possible */ uint m_blockmask; /* sb_blocksize-1 */ uint m_blockwsize; /* sb_blocksize in words */ uint m_blockwmask; /* blockwsize-1 */ @@ -154,6 +153,7 @@ typedef struct xfs_mount { uint64_t m_features; /* active filesystem features */ uint64_t m_low_space[XFS_LOWSP_MAX]; uint64_t m_low_rtexts[XFS_LOWSP_MAX]; + uint64_t m_rtxblkmask; /* rt extent block mask */ struct xfs_ino_geometry m_ino_geo; /* inode geometry */ struct xfs_trans_resv m_resv; /* precomputed res values */ /* low free space thresholds */ @@ -208,16 +208,20 @@ typedef struct xfs_mount { uint64_t m_resblks_avail;/* available reserved blocks */ uint64_t m_resblks_save; /* reserved blks @ remount,ro */ struct delayed_work m_reclaim_work; /* background inode reclaim */ + struct dentry *m_debugfs; /* debugfs parent */ struct xfs_kobj m_kobj; struct xfs_kobj m_error_kobj; struct xfs_kobj m_error_meta_kobj; struct xfs_error_cfg m_error_cfg[XFS_ERR_CLASS_MAX][XFS_ERR_ERRNO_MAX]; struct xstats m_stats; /* per-fs stats */ +#ifdef CONFIG_XFS_ONLINE_SCRUB_STATS + struct xchk_stats *m_scrub_stats; +#endif xfs_agnumber_t m_agfrotor; /* last ag where space found */ atomic_t m_agirotor; /* last ag dir inode alloced */ /* Memory shrinker to throttle and reprioritize inodegc */ - struct shrinker m_inodegc_shrinker; + struct shrinker *m_inodegc_shrinker; /* * Workqueue item so that we can coalesce multiple inode flush attempts * into a single flush. @@ -245,6 +249,9 @@ typedef struct xfs_mount { unsigned int *m_errortag; struct xfs_kobj m_errortag_kobj; #endif + + /* cpus that have inodes queued for inactivation */ + struct cpumask m_inodegc_cpumask; } xfs_mount_t; #define M_IGEO(mp) (&(mp)->m_ino_geo) @@ -400,6 +407,8 @@ __XFS_HAS_FEAT(nouuid, NOUUID) #define XFS_OPSTATE_WARNED_SHRINK 8 /* Kernel has logged a warning about logged xattr updates being used. */ #define XFS_OPSTATE_WARNED_LARP 9 +/* Mount time quotacheck is running */ +#define XFS_OPSTATE_QUOTACHECK_RUNNING 10 #define __XFS_IS_OPSTATE(name, NAME) \ static inline bool xfs_is_ ## name (struct xfs_mount *mp) \ @@ -422,6 +431,11 @@ __XFS_IS_OPSTATE(inode32, INODE32) __XFS_IS_OPSTATE(readonly, READONLY) __XFS_IS_OPSTATE(inodegc_enabled, INODEGC_ENABLED) __XFS_IS_OPSTATE(blockgc_enabled, BLOCKGC_ENABLED) +#ifdef CONFIG_XFS_QUOTA +__XFS_IS_OPSTATE(quotacheck_running, QUOTACHECK_RUNNING) +#else +# define xfs_is_quotacheck_running(mp) (false) +#endif static inline bool xfs_should_warn(struct xfs_mount *mp, long nr) @@ -439,7 +453,8 @@ xfs_should_warn(struct xfs_mount *mp, long nr) { (1UL << XFS_OPSTATE_BLOCKGC_ENABLED), "blockgc" }, \ { (1UL << XFS_OPSTATE_WARNED_SCRUB), "wscrub" }, \ { (1UL << XFS_OPSTATE_WARNED_SHRINK), "wshrink" }, \ - { (1UL << XFS_OPSTATE_WARNED_LARP), "wlarp" } + { (1UL << XFS_OPSTATE_WARNED_LARP), "wlarp" }, \ + { (1UL << XFS_OPSTATE_QUOTACHECK_RUNNING), "quotacheck" } /* * Max and min values for mount-option defined I/O diff --git a/fs/xfs/xfs_notify_failure.c b/fs/xfs/xfs_notify_failure.c index 4a9bbd3fe120..a7daa522e00f 100644 --- a/fs/xfs/xfs_notify_failure.c +++ b/fs/xfs/xfs_notify_failure.c @@ -126,8 +126,8 @@ xfs_dax_notify_ddev_failure( struct xfs_rmap_irec ri_low = { }; struct xfs_rmap_irec ri_high; struct xfs_agf *agf; - xfs_agblock_t agend; struct xfs_perag *pag; + xfs_agblock_t range_agend; pag = xfs_perag_get(mp, agno); error = xfs_alloc_read_agf(pag, tp, 0, &agf_bp); @@ -148,10 +148,10 @@ xfs_dax_notify_ddev_failure( ri_high.rm_startblock = XFS_FSB_TO_AGBNO(mp, end_fsbno); agf = agf_bp->b_addr; - agend = min(be32_to_cpu(agf->agf_length), + range_agend = min(be32_to_cpu(agf->agf_length) - 1, ri_high.rm_startblock); notify.startblock = ri_low.rm_startblock; - notify.blockcount = agend - ri_low.rm_startblock; + notify.blockcount = range_agend + 1 - ri_low.rm_startblock; error = xfs_rmap_query_range(cur, &ri_low, &ri_high, xfs_dax_failure_fn, ¬ify); diff --git a/fs/xfs/xfs_ondisk.h b/fs/xfs/xfs_ondisk.h index c4cc99b70dd3..21a7e350b4c5 100644 --- a/fs/xfs/xfs_ondisk.h +++ b/fs/xfs/xfs_ondisk.h @@ -72,6 +72,10 @@ xfs_check_ondisk_structs(void) XFS_CHECK_STRUCT_SIZE(xfs_attr_leaf_map_t, 4); XFS_CHECK_STRUCT_SIZE(xfs_attr_leaf_name_local_t, 4); + /* realtime structures */ + XFS_CHECK_STRUCT_SIZE(union xfs_rtword_raw, 4); + XFS_CHECK_STRUCT_SIZE(union xfs_suminfo_raw, 4); + /* * m68k has problems with xfs_attr_leaf_name_remote_t, but we pad it to * 4 bytes anyway so it's not obviously a problem. Hence for the moment diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c index 6abcc34fafd8..94a7932ac570 100644 --- a/fs/xfs/xfs_qm.c +++ b/fs/xfs/xfs_qm.c @@ -504,8 +504,7 @@ xfs_qm_shrink_scan( struct shrinker *shrink, struct shrink_control *sc) { - struct xfs_quotainfo *qi = container_of(shrink, - struct xfs_quotainfo, qi_shrinker); + struct xfs_quotainfo *qi = shrink->private_data; struct xfs_qm_isolate isol; unsigned long freed; int error; @@ -539,8 +538,7 @@ xfs_qm_shrink_count( struct shrinker *shrink, struct shrink_control *sc) { - struct xfs_quotainfo *qi = container_of(shrink, - struct xfs_quotainfo, qi_shrinker); + struct xfs_quotainfo *qi = shrink->private_data; return list_lru_shrink_count(&qi->qi_lru, sc); } @@ -680,15 +678,18 @@ xfs_qm_init_quotainfo( if (XFS_IS_PQUOTA_ON(mp)) xfs_qm_set_defquota(mp, XFS_DQTYPE_PROJ, qinf); - qinf->qi_shrinker.count_objects = xfs_qm_shrink_count; - qinf->qi_shrinker.scan_objects = xfs_qm_shrink_scan; - qinf->qi_shrinker.seeks = DEFAULT_SEEKS; - qinf->qi_shrinker.flags = SHRINKER_NUMA_AWARE; - - error = register_shrinker(&qinf->qi_shrinker, "xfs-qm:%s", - mp->m_super->s_id); - if (error) + qinf->qi_shrinker = shrinker_alloc(SHRINKER_NUMA_AWARE, "xfs-qm:%s", + mp->m_super->s_id); + if (!qinf->qi_shrinker) { + error = -ENOMEM; goto out_free_inos; + } + + qinf->qi_shrinker->count_objects = xfs_qm_shrink_count; + qinf->qi_shrinker->scan_objects = xfs_qm_shrink_scan; + qinf->qi_shrinker->private_data = qinf; + + shrinker_register(qinf->qi_shrinker); return 0; @@ -718,7 +719,7 @@ xfs_qm_destroy_quotainfo( qi = mp->m_quotainfo; ASSERT(qi != NULL); - unregister_shrinker(&qi->qi_shrinker); + shrinker_free(qi->qi_shrinker); list_lru_destroy(&qi->qi_lru); xfs_qm_destroy_quotainos(qi); mutex_destroy(&qi->qi_tree_lock); @@ -1160,6 +1161,19 @@ xfs_qm_dqusage_adjust( if (error) return error; + /* + * Reload the incore unlinked list to avoid failure in inodegc. + * Use an unlocked check here because unrecovered unlinked inodes + * should be somewhat rare. + */ + if (xfs_inode_unlinked_incomplete(ip)) { + error = xfs_inode_reload_unlinked(ip); + if (error) { + xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); + goto error0; + } + } + ASSERT(ip->i_delayed_blks == 0); if (XFS_IS_REALTIME_INODE(ip)) { @@ -1173,6 +1187,7 @@ xfs_qm_dqusage_adjust( } nblks = (xfs_qcnt_t)ip->i_nblocks - rtblks; + xfs_iflags_clear(ip, XFS_IQUOTAUNCHECKED); /* * Add the (disk blocks and inode) resources occupied by this @@ -1319,8 +1334,10 @@ xfs_qm_quotacheck( flags |= XFS_PQUOTA_CHKD; } + xfs_set_quotacheck_running(mp); error = xfs_iwalk_threaded(mp, 0, 0, xfs_qm_dqusage_adjust, 0, true, NULL); + xfs_clear_quotacheck_running(mp); /* * On error, the inode walk may have partially populated the dquot diff --git a/fs/xfs/xfs_qm.h b/fs/xfs/xfs_qm.h index 9683f0457d19..d5c9fc4ba591 100644 --- a/fs/xfs/xfs_qm.h +++ b/fs/xfs/xfs_qm.h @@ -63,7 +63,7 @@ struct xfs_quotainfo { struct xfs_def_quota qi_usr_default; struct xfs_def_quota qi_grp_default; struct xfs_def_quota qi_prj_default; - struct shrinker qi_shrinker; + struct shrinker *qi_shrinker; /* Minimum and maximum quota expiration timestamp values. */ time64_t qi_expiry_min; diff --git a/fs/xfs/xfs_refcount_item.c b/fs/xfs/xfs_refcount_item.c index edd8587658d5..2d4444d61e98 100644 --- a/fs/xfs/xfs_refcount_item.c +++ b/fs/xfs/xfs_refcount_item.c @@ -477,6 +477,7 @@ xfs_cui_item_recover( struct xfs_log_item *lip, struct list_head *capture_list) { + struct xfs_trans_res resv; struct xfs_cui_log_item *cuip = CUI_ITEM(lip); struct xfs_cud_log_item *cudp; struct xfs_trans *tp; @@ -514,8 +515,9 @@ xfs_cui_item_recover( * doesn't fit. We need to reserve enough blocks to handle a * full btree split on either end of the refcount range. */ - error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, - mp->m_refc_maxlevels * 2, 0, XFS_TRANS_RESERVE, &tp); + resv = xlog_recover_resv(&M_RES(mp)->tr_itruncate); + error = xfs_trans_alloc(mp, &resv, mp->m_refc_maxlevels * 2, 0, + XFS_TRANS_RESERVE, &tp); if (error) return error; diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c index eb9102453aff..e5b62dc28466 100644 --- a/fs/xfs/xfs_reflink.c +++ b/fs/xfs/xfs_reflink.c @@ -784,6 +784,7 @@ xfs_reflink_end_cow_extent( } } del = got; + xfs_trim_extent(&del, *offset_fsb, end_fsb - *offset_fsb); /* Grab the corresponding mapping in the data fork. */ nmaps = 1; @@ -1540,6 +1541,10 @@ xfs_reflink_remap_prep( if (ret) goto out_unlock; + xfs_iflags_set(src, XFS_IREMAPPING); + if (inode_in != inode_out) + xfs_ilock_demote(src, XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL); + return 0; out_unlock: xfs_iunlock2_io_mmap(src, dest); diff --git a/fs/xfs/xfs_rmap_item.c b/fs/xfs/xfs_rmap_item.c index 520c7ebdfed8..0e0e747028da 100644 --- a/fs/xfs/xfs_rmap_item.c +++ b/fs/xfs/xfs_rmap_item.c @@ -507,6 +507,7 @@ xfs_rui_item_recover( struct xfs_log_item *lip, struct list_head *capture_list) { + struct xfs_trans_res resv; struct xfs_rui_log_item *ruip = RUI_ITEM(lip); struct xfs_rud_log_item *rudp; struct xfs_trans *tp; @@ -530,8 +531,9 @@ xfs_rui_item_recover( } } - error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, - mp->m_rmap_maxlevels, 0, XFS_TRANS_RESERVE, &tp); + resv = xlog_recover_resv(&M_RES(mp)->tr_itruncate); + error = xfs_trans_alloc(mp, &resv, mp->m_rmap_maxlevels, 0, + XFS_TRANS_RESERVE, &tp); if (error) return error; rudp = xfs_trans_get_rud(tp, ruip); diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c index 16534e9873f6..88c48de5c9c8 100644 --- a/fs/xfs/xfs_rtalloc.c +++ b/fs/xfs/xfs_rtalloc.c @@ -19,6 +19,7 @@ #include "xfs_icache.h" #include "xfs_rtalloc.h" #include "xfs_sb.h" +#include "xfs_rtbitmap.h" /* * Read and return the summary information for a given extent size, @@ -28,48 +29,48 @@ */ static int xfs_rtget_summary( - xfs_mount_t *mp, /* file system mount structure */ - xfs_trans_t *tp, /* transaction pointer */ - int log, /* log2 of extent size */ - xfs_rtblock_t bbno, /* bitmap block number */ - struct xfs_buf **rbpp, /* in/out: summary block buffer */ - xfs_fsblock_t *rsb, /* in/out: summary block number */ - xfs_suminfo_t *sum) /* out: summary info for this block */ + struct xfs_rtalloc_args *args, + int log, /* log2 of extent size */ + xfs_fileoff_t bbno, /* bitmap block number */ + xfs_suminfo_t *sum) /* out: summary info for this block */ { - return xfs_rtmodify_summary_int(mp, tp, log, bbno, 0, rbpp, rsb, sum); + return xfs_rtmodify_summary_int(args, log, bbno, 0, sum); } /* * Return whether there are any free extents in the size range given * by low and high, for the bitmap block bbno. */ -STATIC int /* error */ +STATIC int xfs_rtany_summary( - xfs_mount_t *mp, /* file system mount structure */ - xfs_trans_t *tp, /* transaction pointer */ - int low, /* low log2 extent size */ - int high, /* high log2 extent size */ - xfs_rtblock_t bbno, /* bitmap block number */ - struct xfs_buf **rbpp, /* in/out: summary block buffer */ - xfs_fsblock_t *rsb, /* in/out: summary block number */ - int *stat) /* out: any good extents here? */ + struct xfs_rtalloc_args *args, + int low, /* low log2 extent size */ + int high, /* high log2 extent size */ + xfs_fileoff_t bbno, /* bitmap block number */ + int *maxlog) /* out: max log2 extent size free */ { - int error; /* error value */ - int log; /* loop counter, log2 of ext. size */ - xfs_suminfo_t sum; /* summary data */ - - /* There are no extents at levels < m_rsum_cache[bbno]. */ - if (mp->m_rsum_cache && low < mp->m_rsum_cache[bbno]) - low = mp->m_rsum_cache[bbno]; + struct xfs_mount *mp = args->mp; + int error; + int log; /* loop counter, log2 of ext. size */ + xfs_suminfo_t sum; /* summary data */ + + /* There are no extents at levels >= m_rsum_cache[bbno]. */ + if (mp->m_rsum_cache) { + high = min(high, mp->m_rsum_cache[bbno] - 1); + if (low > high) { + *maxlog = -1; + return 0; + } + } /* * Loop over logs of extent sizes. */ - for (log = low; log <= high; log++) { + for (log = high; log >= low; log--) { /* * Get one summary datum. */ - error = xfs_rtget_summary(mp, tp, log, bbno, rbpp, rsb, &sum); + error = xfs_rtget_summary(args, log, bbno, &sum); if (error) { return error; } @@ -77,18 +78,18 @@ xfs_rtany_summary( * If there are any, return success. */ if (sum) { - *stat = 1; + *maxlog = log; goto out; } } /* * Found nothing, return failure. */ - *stat = 0; + *maxlog = -1; out: - /* There were no extents at levels < log. */ - if (mp->m_rsum_cache && log > mp->m_rsum_cache[bbno]) - mp->m_rsum_cache[bbno] = log; + /* There were no extents at levels > log. */ + if (mp->m_rsum_cache && log + 1 < mp->m_rsum_cache[bbno]) + mp->m_rsum_cache[bbno] = log + 1; return 0; } @@ -97,60 +98,54 @@ out: * Copy and transform the summary file, given the old and new * parameters in the mount structures. */ -STATIC int /* error */ +STATIC int xfs_rtcopy_summary( - xfs_mount_t *omp, /* old file system mount point */ - xfs_mount_t *nmp, /* new file system mount point */ - xfs_trans_t *tp) /* transaction pointer */ + struct xfs_rtalloc_args *oargs, + struct xfs_rtalloc_args *nargs) { - xfs_rtblock_t bbno; /* bitmap block number */ - struct xfs_buf *bp; /* summary buffer */ - int error; /* error return value */ - int log; /* summary level number (log length) */ - xfs_suminfo_t sum; /* summary data */ - xfs_fsblock_t sumbno; /* summary block number */ + xfs_fileoff_t bbno; /* bitmap block number */ + int error; + int log; /* summary level number (log length) */ + xfs_suminfo_t sum; /* summary data */ - bp = NULL; - for (log = omp->m_rsumlevels - 1; log >= 0; log--) { - for (bbno = omp->m_sb.sb_rbmblocks - 1; + for (log = oargs->mp->m_rsumlevels - 1; log >= 0; log--) { + for (bbno = oargs->mp->m_sb.sb_rbmblocks - 1; (xfs_srtblock_t)bbno >= 0; bbno--) { - error = xfs_rtget_summary(omp, tp, log, bbno, &bp, - &sumbno, &sum); + error = xfs_rtget_summary(oargs, log, bbno, &sum); if (error) - return error; + goto out; if (sum == 0) continue; - error = xfs_rtmodify_summary(omp, tp, log, bbno, -sum, - &bp, &sumbno); + error = xfs_rtmodify_summary(oargs, log, bbno, -sum); if (error) - return error; - error = xfs_rtmodify_summary(nmp, tp, log, bbno, sum, - &bp, &sumbno); + goto out; + error = xfs_rtmodify_summary(nargs, log, bbno, sum); if (error) - return error; + goto out; ASSERT(sum > 0); } } + error = 0; +out: + xfs_rtbuf_cache_relse(oargs); return 0; } /* * Mark an extent specified by start and len allocated. * Updates all the summary information as well as the bitmap. */ -STATIC int /* error */ +STATIC int xfs_rtallocate_range( - xfs_mount_t *mp, /* file system mount point */ - xfs_trans_t *tp, /* transaction pointer */ - xfs_rtblock_t start, /* start block to allocate */ - xfs_extlen_t len, /* length to allocate */ - struct xfs_buf **rbpp, /* in/out: summary block buffer */ - xfs_fsblock_t *rsb) /* in/out: summary block number */ + struct xfs_rtalloc_args *args, + xfs_rtxnum_t start, /* start rtext to allocate */ + xfs_rtxlen_t len) /* in/out: summary block number */ { - xfs_rtblock_t end; /* end of the allocated extent */ - int error; /* error value */ - xfs_rtblock_t postblock = 0; /* first block allocated > end */ - xfs_rtblock_t preblock = 0; /* first block allocated < start */ + struct xfs_mount *mp = args->mp; + xfs_rtxnum_t end; /* end of the allocated rtext */ + int error; + xfs_rtxnum_t postblock = 0; /* first rtext allocated > end */ + xfs_rtxnum_t preblock = 0; /* first rtext allocated < start */ end = start + len - 1; /* @@ -158,15 +153,15 @@ xfs_rtallocate_range( * We need to find the beginning and end of the extent so we can * properly update the summary. */ - error = xfs_rtfind_back(mp, tp, start, 0, &preblock); + error = xfs_rtfind_back(args, start, 0, &preblock); if (error) { return error; } /* * Find the next allocated block (end of free extent). */ - error = xfs_rtfind_forw(mp, tp, end, mp->m_sb.sb_rextents - 1, - &postblock); + error = xfs_rtfind_forw(args, end, mp->m_sb.sb_rextents - 1, + &postblock); if (error) { return error; } @@ -174,9 +169,9 @@ xfs_rtallocate_range( * Decrement the summary information corresponding to the entire * (old) free extent. */ - error = xfs_rtmodify_summary(mp, tp, - XFS_RTBLOCKLOG(postblock + 1 - preblock), - XFS_BITTOBLOCK(mp, preblock), -1, rbpp, rsb); + error = xfs_rtmodify_summary(args, + XFS_RTBLOCKLOG(postblock + 1 - preblock), + xfs_rtx_to_rbmblock(mp, preblock), -1); if (error) { return error; } @@ -185,9 +180,9 @@ xfs_rtallocate_range( * old extent, add summary data for them to be free. */ if (preblock < start) { - error = xfs_rtmodify_summary(mp, tp, - XFS_RTBLOCKLOG(start - preblock), - XFS_BITTOBLOCK(mp, preblock), 1, rbpp, rsb); + error = xfs_rtmodify_summary(args, + XFS_RTBLOCKLOG(start - preblock), + xfs_rtx_to_rbmblock(mp, preblock), 1); if (error) { return error; } @@ -197,9 +192,9 @@ xfs_rtallocate_range( * old extent, add summary data for them to be free. */ if (postblock > end) { - error = xfs_rtmodify_summary(mp, tp, - XFS_RTBLOCKLOG(postblock - end), - XFS_BITTOBLOCK(mp, end + 1), 1, rbpp, rsb); + error = xfs_rtmodify_summary(args, + XFS_RTBLOCKLOG(postblock - end), + xfs_rtx_to_rbmblock(mp, end + 1), 1); if (error) { return error; } @@ -207,54 +202,69 @@ xfs_rtallocate_range( /* * Modify the bitmap to mark this extent allocated. */ - error = xfs_rtmodify_range(mp, tp, start, len, 0); + error = xfs_rtmodify_range(args, start, len, 0); return error; } /* + * Make sure we don't run off the end of the rt volume. Be careful that + * adjusting maxlen downwards doesn't cause us to fail the alignment checks. + */ +static inline xfs_rtxlen_t +xfs_rtallocate_clamp_len( + struct xfs_mount *mp, + xfs_rtxnum_t startrtx, + xfs_rtxlen_t rtxlen, + xfs_rtxlen_t prod) +{ + xfs_rtxlen_t ret; + + ret = min(mp->m_sb.sb_rextents, startrtx + rtxlen) - startrtx; + return rounddown(ret, prod); +} + +/* * Attempt to allocate an extent minlen<=len<=maxlen starting from * bitmap block bbno. If we don't get maxlen then use prod to trim - * the length, if given. Returns error; returns starting block in *rtblock. + * the length, if given. Returns error; returns starting block in *rtx. * The lengths are all in rtextents. */ -STATIC int /* error */ +STATIC int xfs_rtallocate_extent_block( - xfs_mount_t *mp, /* file system mount point */ - xfs_trans_t *tp, /* transaction pointer */ - xfs_rtblock_t bbno, /* bitmap block number */ - xfs_extlen_t minlen, /* minimum length to allocate */ - xfs_extlen_t maxlen, /* maximum length to allocate */ - xfs_extlen_t *len, /* out: actual length allocated */ - xfs_rtblock_t *nextp, /* out: next block to try */ - struct xfs_buf **rbpp, /* in/out: summary block buffer */ - xfs_fsblock_t *rsb, /* in/out: summary block number */ - xfs_extlen_t prod, /* extent product factor */ - xfs_rtblock_t *rtblock) /* out: start block allocated */ + struct xfs_rtalloc_args *args, + xfs_fileoff_t bbno, /* bitmap block number */ + xfs_rtxlen_t minlen, /* minimum length to allocate */ + xfs_rtxlen_t maxlen, /* maximum length to allocate */ + xfs_rtxlen_t *len, /* out: actual length allocated */ + xfs_rtxnum_t *nextp, /* out: next rtext to try */ + xfs_rtxlen_t prod, /* extent product factor */ + xfs_rtxnum_t *rtx) /* out: start rtext allocated */ { - xfs_rtblock_t besti; /* best rtblock found so far */ - xfs_rtblock_t bestlen; /* best length found so far */ - xfs_rtblock_t end; /* last rtblock in chunk */ - int error; /* error value */ - xfs_rtblock_t i; /* current rtblock trying */ - xfs_rtblock_t next; /* next rtblock to try */ - int stat; /* status from internal calls */ + struct xfs_mount *mp = args->mp; + xfs_rtxnum_t besti; /* best rtext found so far */ + xfs_rtxnum_t bestlen;/* best length found so far */ + xfs_rtxnum_t end; /* last rtext in chunk */ + int error; + xfs_rtxnum_t i; /* current rtext trying */ + xfs_rtxnum_t next; /* next rtext to try */ + int stat; /* status from internal calls */ /* * Loop over all the extents starting in this bitmap block, * looking for one that's long enough. */ - for (i = XFS_BLOCKTOBIT(mp, bbno), besti = -1, bestlen = 0, - end = XFS_BLOCKTOBIT(mp, bbno + 1) - 1; + for (i = xfs_rbmblock_to_rtx(mp, bbno), besti = -1, bestlen = 0, + end = xfs_rbmblock_to_rtx(mp, bbno + 1) - 1; i <= end; i++) { /* Make sure we don't scan off the end of the rt volume. */ - maxlen = min(mp->m_sb.sb_rextents, i + maxlen) - i; + maxlen = xfs_rtallocate_clamp_len(mp, i, maxlen, prod); /* * See if there's a free extent of maxlen starting at i. * If it's not so then next will contain the first non-free. */ - error = xfs_rtcheck_range(mp, tp, i, maxlen, 1, &next, &stat); + error = xfs_rtcheck_range(args, i, maxlen, 1, &next, &stat); if (error) { return error; } @@ -262,13 +272,12 @@ xfs_rtallocate_extent_block( /* * i for maxlen is all free, allocate and return that. */ - error = xfs_rtallocate_range(mp, tp, i, maxlen, rbpp, - rsb); + error = xfs_rtallocate_range(args, i, maxlen); if (error) { return error; } *len = maxlen; - *rtblock = i; + *rtx = i; return 0; } /* @@ -278,7 +287,7 @@ xfs_rtallocate_extent_block( * so far, remember it. */ if (minlen < maxlen) { - xfs_rtblock_t thislen; /* this extent size */ + xfs_rtxnum_t thislen; /* this extent size */ thislen = next - i; if (thislen >= minlen && thislen > bestlen) { @@ -290,7 +299,7 @@ xfs_rtallocate_extent_block( * If not done yet, find the start of the next free space. */ if (next < end) { - error = xfs_rtfind_forw(mp, tp, next, end, &i); + error = xfs_rtfind_forw(args, next, end, &i); if (error) { return error; } @@ -301,7 +310,7 @@ xfs_rtallocate_extent_block( * Searched the whole thing & didn't find a maxlen free extent. */ if (minlen < maxlen && besti != -1) { - xfs_extlen_t p; /* amount to trim length by */ + xfs_rtxlen_t p; /* amount to trim length by */ /* * If size should be a multiple of prod, make that so. @@ -315,51 +324,49 @@ xfs_rtallocate_extent_block( /* * Allocate besti for bestlen & return that. */ - error = xfs_rtallocate_range(mp, tp, besti, bestlen, rbpp, rsb); + error = xfs_rtallocate_range(args, besti, bestlen); if (error) { return error; } *len = bestlen; - *rtblock = besti; + *rtx = besti; return 0; } /* * Allocation failed. Set *nextp to the next block to try. */ *nextp = next; - *rtblock = NULLRTBLOCK; + *rtx = NULLRTEXTNO; return 0; } /* * Allocate an extent of length minlen<=len<=maxlen, starting at block * bno. If we don't get maxlen then use prod to trim the length, if given. - * Returns error; returns starting block in *rtblock. + * Returns error; returns starting block in *rtx. * The lengths are all in rtextents. */ -STATIC int /* error */ +STATIC int xfs_rtallocate_extent_exact( - xfs_mount_t *mp, /* file system mount point */ - xfs_trans_t *tp, /* transaction pointer */ - xfs_rtblock_t bno, /* starting block number to allocate */ - xfs_extlen_t minlen, /* minimum length to allocate */ - xfs_extlen_t maxlen, /* maximum length to allocate */ - xfs_extlen_t *len, /* out: actual length allocated */ - struct xfs_buf **rbpp, /* in/out: summary block buffer */ - xfs_fsblock_t *rsb, /* in/out: summary block number */ - xfs_extlen_t prod, /* extent product factor */ - xfs_rtblock_t *rtblock) /* out: start block allocated */ + struct xfs_rtalloc_args *args, + xfs_rtxnum_t start, /* starting rtext number to allocate */ + xfs_rtxlen_t minlen, /* minimum length to allocate */ + xfs_rtxlen_t maxlen, /* maximum length to allocate */ + xfs_rtxlen_t *len, /* out: actual length allocated */ + xfs_rtxlen_t prod, /* extent product factor */ + xfs_rtxnum_t *rtx) /* out: start rtext allocated */ { - int error; /* error value */ - xfs_extlen_t i; /* extent length trimmed due to prod */ - int isfree; /* extent is free */ - xfs_rtblock_t next; /* next block to try (dummy) */ + int error; + xfs_rtxlen_t i; /* extent length trimmed due to prod */ + int isfree; /* extent is free */ + xfs_rtxnum_t next; /* next rtext to try (dummy) */ - ASSERT(minlen % prod == 0 && maxlen % prod == 0); + ASSERT(minlen % prod == 0); + ASSERT(maxlen % prod == 0); /* * Check if the range in question (for maxlen) is free. */ - error = xfs_rtcheck_range(mp, tp, bno, maxlen, 1, &next, &isfree); + error = xfs_rtcheck_range(args, start, maxlen, 1, &next, &isfree); if (error) { return error; } @@ -367,23 +374,23 @@ xfs_rtallocate_extent_exact( /* * If it is, allocate it and return success. */ - error = xfs_rtallocate_range(mp, tp, bno, maxlen, rbpp, rsb); + error = xfs_rtallocate_range(args, start, maxlen); if (error) { return error; } *len = maxlen; - *rtblock = bno; + *rtx = start; return 0; } /* * If not, allocate what there is, if it's at least minlen. */ - maxlen = next - bno; + maxlen = next - start; if (maxlen < minlen) { /* * Failed, return failure status. */ - *rtblock = NULLRTBLOCK; + *rtx = NULLRTEXTNO; return 0; } /* @@ -395,81 +402,82 @@ xfs_rtallocate_extent_exact( /* * Now we can't do it, return failure status. */ - *rtblock = NULLRTBLOCK; + *rtx = NULLRTEXTNO; return 0; } } /* * Allocate what we can and return it. */ - error = xfs_rtallocate_range(mp, tp, bno, maxlen, rbpp, rsb); + error = xfs_rtallocate_range(args, start, maxlen); if (error) { return error; } *len = maxlen; - *rtblock = bno; + *rtx = start; return 0; } /* * Allocate an extent of length minlen<=len<=maxlen, starting as near - * to bno as possible. If we don't get maxlen then use prod to trim + * to start as possible. If we don't get maxlen then use prod to trim * the length, if given. The lengths are all in rtextents. */ -STATIC int /* error */ +STATIC int xfs_rtallocate_extent_near( - xfs_mount_t *mp, /* file system mount point */ - xfs_trans_t *tp, /* transaction pointer */ - xfs_rtblock_t bno, /* starting block number to allocate */ - xfs_extlen_t minlen, /* minimum length to allocate */ - xfs_extlen_t maxlen, /* maximum length to allocate */ - xfs_extlen_t *len, /* out: actual length allocated */ - struct xfs_buf **rbpp, /* in/out: summary block buffer */ - xfs_fsblock_t *rsb, /* in/out: summary block number */ - xfs_extlen_t prod, /* extent product factor */ - xfs_rtblock_t *rtblock) /* out: start block allocated */ + struct xfs_rtalloc_args *args, + xfs_rtxnum_t start, /* starting rtext number to allocate */ + xfs_rtxlen_t minlen, /* minimum length to allocate */ + xfs_rtxlen_t maxlen, /* maximum length to allocate */ + xfs_rtxlen_t *len, /* out: actual length allocated */ + xfs_rtxlen_t prod, /* extent product factor */ + xfs_rtxnum_t *rtx) /* out: start rtext allocated */ { - int any; /* any useful extents from summary */ - xfs_rtblock_t bbno; /* bitmap block number */ - int error; /* error value */ - int i; /* bitmap block offset (loop control) */ - int j; /* secondary loop control */ - int log2len; /* log2 of minlen */ - xfs_rtblock_t n; /* next block to try */ - xfs_rtblock_t r; /* result block */ - - ASSERT(minlen % prod == 0 && maxlen % prod == 0); + struct xfs_mount *mp = args->mp; + int maxlog; /* max useful extent from summary */ + xfs_fileoff_t bbno; /* bitmap block number */ + int error; + int i; /* bitmap block offset (loop control) */ + int j; /* secondary loop control */ + int log2len; /* log2 of minlen */ + xfs_rtxnum_t n; /* next rtext to try */ + xfs_rtxnum_t r; /* result rtext */ + + ASSERT(minlen % prod == 0); + ASSERT(maxlen % prod == 0); + /* * If the block number given is off the end, silently set it to * the last block. */ - if (bno >= mp->m_sb.sb_rextents) - bno = mp->m_sb.sb_rextents - 1; + if (start >= mp->m_sb.sb_rextents) + start = mp->m_sb.sb_rextents - 1; /* Make sure we don't run off the end of the rt volume. */ - maxlen = min(mp->m_sb.sb_rextents, bno + maxlen) - bno; + maxlen = xfs_rtallocate_clamp_len(mp, start, maxlen, prod); if (maxlen < minlen) { - *rtblock = NULLRTBLOCK; + *rtx = NULLRTEXTNO; return 0; } /* * Try the exact allocation first. */ - error = xfs_rtallocate_extent_exact(mp, tp, bno, minlen, maxlen, len, - rbpp, rsb, prod, &r); + error = xfs_rtallocate_extent_exact(args, start, minlen, maxlen, len, + prod, &r); if (error) { return error; } /* * If the exact allocation worked, return that. */ - if (r != NULLRTBLOCK) { - *rtblock = r; + if (r != NULLRTEXTNO) { + *rtx = r; return 0; } - bbno = XFS_BITTOBLOCK(mp, bno); + bbno = xfs_rtx_to_rbmblock(mp, start); i = 0; + j = -1; ASSERT(minlen != 0); log2len = xfs_highbit32(minlen); /* @@ -480,8 +488,8 @@ xfs_rtallocate_extent_near( * Get summary information of extents of all useful levels * starting in this bitmap block. */ - error = xfs_rtany_summary(mp, tp, log2len, mp->m_rsumlevels - 1, - bbno + i, rbpp, rsb, &any); + error = xfs_rtany_summary(args, log2len, mp->m_rsumlevels - 1, + bbno + i, &maxlog); if (error) { return error; } @@ -489,7 +497,10 @@ xfs_rtallocate_extent_near( * If there are any useful extents starting here, try * allocating one. */ - if (any) { + if (maxlog >= 0) { + xfs_extlen_t maxavail = + min_t(xfs_rtblock_t, maxlen, + (1ULL << (maxlog + 1)) - 1); /* * On the positive side of the starting location. */ @@ -498,17 +509,17 @@ xfs_rtallocate_extent_near( * Try to allocate an extent starting in * this block. */ - error = xfs_rtallocate_extent_block(mp, tp, - bbno + i, minlen, maxlen, len, &n, rbpp, - rsb, prod, &r); + error = xfs_rtallocate_extent_block(args, + bbno + i, minlen, maxavail, len, + &n, prod, &r); if (error) { return error; } /* * If it worked, return it. */ - if (r != NULLRTBLOCK) { - *rtblock = r; + if (r != NULLRTEXTNO) { + *rtx = r; return 0; } } @@ -516,68 +527,46 @@ xfs_rtallocate_extent_near( * On the negative side of the starting location. */ else { /* i < 0 */ + int maxblocks; + /* - * Loop backwards through the bitmap blocks from - * the starting point-1 up to where we are now. - * There should be an extent which ends in this - * bitmap block and is long enough. + * Loop backwards to find the end of the extent + * we found in the realtime summary. + * + * maxblocks is the maximum possible number of + * bitmap blocks from the start of the extent + * to the end of the extent. */ - for (j = -1; j > i; j--) { - /* - * Grab the summary information for - * this bitmap block. - */ - error = xfs_rtany_summary(mp, tp, - log2len, mp->m_rsumlevels - 1, - bbno + j, rbpp, rsb, &any); - if (error) { - return error; - } - /* - * If there's no extent given in the - * summary that means the extent we - * found must carry over from an - * earlier block. If there is an - * extent given, we've already tried - * that allocation, don't do it again. - */ - if (any) - continue; - error = xfs_rtallocate_extent_block(mp, - tp, bbno + j, minlen, maxlen, - len, &n, rbpp, rsb, prod, &r); + if (maxlog == 0) + maxblocks = 0; + else if (maxlog < mp->m_blkbit_log) + maxblocks = 1; + else + maxblocks = 2 << (maxlog - mp->m_blkbit_log); + + /* + * We need to check bbno + i + maxblocks down to + * bbno + i. We already checked bbno down to + * bbno + j + 1, so we don't need to check those + * again. + */ + j = min(i + maxblocks, j); + for (; j >= i; j--) { + error = xfs_rtallocate_extent_block(args, + bbno + j, minlen, + maxavail, len, &n, prod, + &r); if (error) { return error; } /* * If it works, return the extent. */ - if (r != NULLRTBLOCK) { - *rtblock = r; + if (r != NULLRTEXTNO) { + *rtx = r; return 0; } } - /* - * There weren't intervening bitmap blocks - * with a long enough extent, or the - * allocation didn't work for some reason - * (i.e. it's a little * too short). - * Try to allocate from the summary block - * that we found. - */ - error = xfs_rtallocate_extent_block(mp, tp, - bbno + i, minlen, maxlen, len, &n, rbpp, - rsb, prod, &r); - if (error) { - return error; - } - /* - * If it works, return the extent. - */ - if (r != NULLRTBLOCK) { - *rtblock = r; - return 0; - } } } /* @@ -610,7 +599,7 @@ xfs_rtallocate_extent_near( else break; } - *rtblock = NULLRTBLOCK; + *rtx = NULLRTEXTNO; return 0; } @@ -619,26 +608,25 @@ xfs_rtallocate_extent_near( * specified. If we don't get maxlen then use prod to trim * the length, if given. The lengths are all in rtextents. */ -STATIC int /* error */ +STATIC int xfs_rtallocate_extent_size( - xfs_mount_t *mp, /* file system mount point */ - xfs_trans_t *tp, /* transaction pointer */ - xfs_extlen_t minlen, /* minimum length to allocate */ - xfs_extlen_t maxlen, /* maximum length to allocate */ - xfs_extlen_t *len, /* out: actual length allocated */ - struct xfs_buf **rbpp, /* in/out: summary block buffer */ - xfs_fsblock_t *rsb, /* in/out: summary block number */ - xfs_extlen_t prod, /* extent product factor */ - xfs_rtblock_t *rtblock) /* out: start block allocated */ + struct xfs_rtalloc_args *args, + xfs_rtxlen_t minlen, /* minimum length to allocate */ + xfs_rtxlen_t maxlen, /* maximum length to allocate */ + xfs_rtxlen_t *len, /* out: actual length allocated */ + xfs_rtxlen_t prod, /* extent product factor */ + xfs_rtxnum_t *rtx) /* out: start rtext allocated */ { - int error; /* error value */ - int i; /* bitmap block number */ - int l; /* level number (loop control) */ - xfs_rtblock_t n; /* next block to be tried */ - xfs_rtblock_t r; /* result block number */ - xfs_suminfo_t sum; /* summary information for extents */ - - ASSERT(minlen % prod == 0 && maxlen % prod == 0); + struct xfs_mount *mp = args->mp; + int error; + xfs_fileoff_t i; /* bitmap block number */ + int l; /* level number (loop control) */ + xfs_rtxnum_t n; /* next rtext to be tried */ + xfs_rtxnum_t r; /* result rtext number */ + xfs_suminfo_t sum; /* summary information for extents */ + + ASSERT(minlen % prod == 0); + ASSERT(maxlen % prod == 0); ASSERT(maxlen != 0); /* @@ -656,8 +644,7 @@ xfs_rtallocate_extent_size( /* * Get the summary for this level/block. */ - error = xfs_rtget_summary(mp, tp, l, i, rbpp, rsb, - &sum); + error = xfs_rtget_summary(args, l, i, &sum); if (error) { return error; } @@ -669,16 +656,16 @@ xfs_rtallocate_extent_size( /* * Try allocating the extent. */ - error = xfs_rtallocate_extent_block(mp, tp, i, maxlen, - maxlen, len, &n, rbpp, rsb, prod, &r); + error = xfs_rtallocate_extent_block(args, i, maxlen, + maxlen, len, &n, prod, &r); if (error) { return error; } /* * If it worked, return that. */ - if (r != NULLRTBLOCK) { - *rtblock = r; + if (r != NULLRTEXTNO) { + *rtx = r; return 0; } /* @@ -686,8 +673,8 @@ xfs_rtallocate_extent_size( * allocator is beyond the next bitmap block, * skip to that bitmap block. */ - if (XFS_BITTOBLOCK(mp, n) > i + 1) - i = XFS_BITTOBLOCK(mp, n) - 1; + if (xfs_rtx_to_rbmblock(mp, n) > i + 1) + i = xfs_rtx_to_rbmblock(mp, n) - 1; } } /* @@ -695,7 +682,7 @@ xfs_rtallocate_extent_size( * we're asking for a fixed size extent. */ if (minlen > --maxlen) { - *rtblock = NULLRTBLOCK; + *rtx = NULLRTEXTNO; return 0; } ASSERT(minlen != 0); @@ -715,8 +702,7 @@ xfs_rtallocate_extent_size( /* * Get the summary information for this level/block. */ - error = xfs_rtget_summary(mp, tp, l, i, rbpp, rsb, - &sum); + error = xfs_rtget_summary(args, l, i, &sum); if (error) { return error; } @@ -730,18 +716,18 @@ xfs_rtallocate_extent_size( * minlen/maxlen are in the possible range for * this summary level. */ - error = xfs_rtallocate_extent_block(mp, tp, i, + error = xfs_rtallocate_extent_block(args, i, XFS_RTMAX(minlen, 1 << l), XFS_RTMIN(maxlen, (1 << (l + 1)) - 1), - len, &n, rbpp, rsb, prod, &r); + len, &n, prod, &r); if (error) { return error; } /* * If it worked, return that extent. */ - if (r != NULLRTBLOCK) { - *rtblock = r; + if (r != NULLRTEXTNO) { + *rtx = r; return 0; } /* @@ -749,14 +735,14 @@ xfs_rtallocate_extent_size( * allocator is beyond the next bitmap block, * skip to that bitmap block. */ - if (XFS_BITTOBLOCK(mp, n) > i + 1) - i = XFS_BITTOBLOCK(mp, n) - 1; + if (xfs_rtx_to_rbmblock(mp, n) > i + 1) + i = xfs_rtx_to_rbmblock(mp, n) - 1; } } /* * Got nothing, return failure. */ - *rtblock = NULLRTBLOCK; + *rtx = NULLRTEXTNO; return 0; } @@ -886,12 +872,14 @@ xfs_alloc_rsum_cache( xfs_extlen_t rbmblocks) /* number of rt bitmap blocks */ { /* - * The rsum cache is initialized to all zeroes, which is trivially a - * lower bound on the minimum level with any free extents. We can - * continue without the cache if it couldn't be allocated. + * The rsum cache is initialized to the maximum value, which is + * trivially an upper bound on the maximum level with any free extents. + * We can continue without the cache if it couldn't be allocated. */ - mp->m_rsum_cache = kvzalloc(rbmblocks, GFP_KERNEL); - if (!mp->m_rsum_cache) + mp->m_rsum_cache = kvmalloc(rbmblocks, GFP_KERNEL); + if (mp->m_rsum_cache) + memset(mp->m_rsum_cache, -1, rbmblocks); + else xfs_warn(mp, "could not allocate realtime summary cache"); } @@ -907,13 +895,13 @@ xfs_growfs_rt( xfs_mount_t *mp, /* mount point for filesystem */ xfs_growfs_rt_t *in) /* growfs rt input struct */ { - xfs_rtblock_t bmbno; /* bitmap block number */ + xfs_fileoff_t bmbno; /* bitmap block number */ struct xfs_buf *bp; /* temporary buffer */ int error; /* error return value */ xfs_mount_t *nmp; /* new (fake) mount structure */ xfs_rfsblock_t nrblocks; /* new number of realtime blocks */ xfs_extlen_t nrbmblocks; /* new number of rt bitmap blocks */ - xfs_rtblock_t nrextents; /* new number of realtime extents */ + xfs_rtxnum_t nrextents; /* new number of realtime extents */ uint8_t nrextslog; /* new log2 of sb_rextents */ xfs_extlen_t nrsumblocks; /* new number of summary blocks */ uint nrsumlevels; /* new rt summary levels */ @@ -922,7 +910,6 @@ xfs_growfs_rt( xfs_extlen_t rbmblocks; /* current number of rt bitmap blocks */ xfs_extlen_t rsumblocks; /* current number of rt summary blks */ xfs_sb_t *sbp; /* old superblock */ - xfs_fsblock_t sumbno; /* summary block number */ uint8_t *rsum_cache; /* old summary cache */ sbp = &mp->m_sb; @@ -954,7 +941,7 @@ xfs_growfs_rt( return -EINVAL; /* Unsupported realtime features. */ - if (xfs_has_rmapbt(mp) || xfs_has_reflink(mp)) + if (xfs_has_rmapbt(mp) || xfs_has_reflink(mp) || xfs_has_quota(mp)) return -EOPNOTSUPP; nrblocks = in->newblocks; @@ -976,11 +963,10 @@ xfs_growfs_rt( */ nrextents = nrblocks; do_div(nrextents, in->extsize); - nrbmblocks = howmany_64(nrextents, NBBY * sbp->sb_blocksize); + nrbmblocks = xfs_rtbitmap_blockcount(mp, nrextents); nrextslog = xfs_highbit32(nrextents); nrsumlevels = nrextslog + 1; - nrsumsize = (uint)sizeof(xfs_suminfo_t) * nrsumlevels * nrbmblocks; - nrsumblocks = XFS_B_TO_FSB(mp, nrsumsize); + nrsumblocks = xfs_rtsummary_blockcount(mp, nrsumlevels, nrbmblocks); nrsumsize = XFS_FSB_TO_B(mp, nrsumblocks); /* * New summary size can't be more than half the size of @@ -1023,6 +1009,12 @@ xfs_growfs_rt( ((sbp->sb_rextents & ((1 << mp->m_blkbit_log) - 1)) != 0); bmbno < nrbmblocks; bmbno++) { + struct xfs_rtalloc_args args = { + .mp = mp, + }; + struct xfs_rtalloc_args nargs = { + .mp = nmp, + }; struct xfs_trans *tp; xfs_rfsblock_t nrblocks_step; @@ -1032,19 +1024,17 @@ xfs_growfs_rt( * Calculate new sb and mount fields for this round. */ nsbp->sb_rextsize = in->extsize; + nmp->m_rtxblklog = -1; /* don't use shift or masking */ nsbp->sb_rbmblocks = bmbno + 1; nrblocks_step = (bmbno + 1) * NBBY * nsbp->sb_blocksize * nsbp->sb_rextsize; nsbp->sb_rblocks = min(nrblocks, nrblocks_step); - nsbp->sb_rextents = nsbp->sb_rblocks; - do_div(nsbp->sb_rextents, nsbp->sb_rextsize); + nsbp->sb_rextents = xfs_rtb_to_rtx(nmp, nsbp->sb_rblocks); ASSERT(nsbp->sb_rextents != 0); nsbp->sb_rextslog = xfs_highbit32(nsbp->sb_rextents); nrsumlevels = nmp->m_rsumlevels = nsbp->sb_rextslog + 1; - nrsumsize = - (uint)sizeof(xfs_suminfo_t) * nrsumlevels * - nsbp->sb_rbmblocks; - nrsumblocks = XFS_B_TO_FSB(mp, nrsumsize); + nrsumblocks = xfs_rtsummary_blockcount(mp, nrsumlevels, + nsbp->sb_rbmblocks); nmp->m_rsumsize = nrsumsize = XFS_FSB_TO_B(mp, nrsumblocks); /* * Start a transaction, get the log reservation. @@ -1053,6 +1043,9 @@ xfs_growfs_rt( &tp); if (error) break; + args.tp = tp; + nargs.tp = tp; + /* * Lock out other callers by grabbing the bitmap inode lock. */ @@ -1086,7 +1079,7 @@ xfs_growfs_rt( */ if (sbp->sb_rbmblocks != nsbp->sb_rbmblocks || mp->m_rsumlevels != nmp->m_rsumlevels) { - error = xfs_rtcopy_summary(mp, nmp, tp); + error = xfs_rtcopy_summary(&args, &nargs); if (error) goto error_cancel; } @@ -1111,9 +1104,9 @@ xfs_growfs_rt( /* * Free new extent. */ - bp = NULL; - error = xfs_rtfree_range(nmp, tp, sbp->sb_rextents, - nsbp->sb_rextents - sbp->sb_rextents, &bp, &sumbno); + error = xfs_rtfree_range(&nargs, sbp->sb_rextents, + nsbp->sb_rextents - sbp->sb_rextents); + xfs_rtbuf_cache_relse(&nargs); if (error) { error_cancel: xfs_trans_cancel(tp); @@ -1171,59 +1164,60 @@ out_free: * parameters. The length units are all in realtime extents, as is the * result block number. */ -int /* error */ +int xfs_rtallocate_extent( - xfs_trans_t *tp, /* transaction pointer */ - xfs_rtblock_t bno, /* starting block number to allocate */ - xfs_extlen_t minlen, /* minimum length to allocate */ - xfs_extlen_t maxlen, /* maximum length to allocate */ - xfs_extlen_t *len, /* out: actual length allocated */ - int wasdel, /* was a delayed allocation extent */ - xfs_extlen_t prod, /* extent product factor */ - xfs_rtblock_t *rtblock) /* out: start block allocated */ + struct xfs_trans *tp, + xfs_rtxnum_t start, /* starting rtext number to allocate */ + xfs_rtxlen_t minlen, /* minimum length to allocate */ + xfs_rtxlen_t maxlen, /* maximum length to allocate */ + xfs_rtxlen_t *len, /* out: actual length allocated */ + int wasdel, /* was a delayed allocation extent */ + xfs_rtxlen_t prod, /* extent product factor */ + xfs_rtxnum_t *rtblock) /* out: start rtext allocated */ { - xfs_mount_t *mp = tp->t_mountp; - int error; /* error value */ - xfs_rtblock_t r; /* result allocated block */ - xfs_fsblock_t sb; /* summary file block number */ - struct xfs_buf *sumbp; /* summary file block buffer */ - - ASSERT(xfs_isilocked(mp->m_rbmip, XFS_ILOCK_EXCL)); + struct xfs_rtalloc_args args = { + .mp = tp->t_mountp, + .tp = tp, + }; + int error; /* error value */ + xfs_rtxnum_t r; /* result allocated rtext */ + + ASSERT(xfs_isilocked(args.mp->m_rbmip, XFS_ILOCK_EXCL)); ASSERT(minlen > 0 && minlen <= maxlen); /* * If prod is set then figure out what to do to minlen and maxlen. */ if (prod > 1) { - xfs_extlen_t i; + xfs_rtxlen_t i; if ((i = maxlen % prod)) maxlen -= i; if ((i = minlen % prod)) minlen += prod - i; if (maxlen < minlen) { - *rtblock = NULLRTBLOCK; + *rtblock = NULLRTEXTNO; return 0; } } retry: - sumbp = NULL; - if (bno == 0) { - error = xfs_rtallocate_extent_size(mp, tp, minlen, maxlen, len, - &sumbp, &sb, prod, &r); + if (start == 0) { + error = xfs_rtallocate_extent_size(&args, minlen, + maxlen, len, prod, &r); } else { - error = xfs_rtallocate_extent_near(mp, tp, bno, minlen, maxlen, - len, &sumbp, &sb, prod, &r); + error = xfs_rtallocate_extent_near(&args, start, minlen, + maxlen, len, prod, &r); } + xfs_rtbuf_cache_relse(&args); if (error) return error; /* * If it worked, update the superblock. */ - if (r != NULLRTBLOCK) { + if (r != NULLRTEXTNO) { long slen = (long)*len; ASSERT(*len >= minlen && *len <= maxlen); @@ -1250,6 +1244,7 @@ xfs_rtmount_init( struct xfs_buf *bp; /* buffer for last block of subvolume */ struct xfs_sb *sbp; /* filesystem superblock copy in mount */ xfs_daddr_t d; /* address of last block of subvolume */ + unsigned int rsumblocks; int error; sbp = &mp->m_sb; @@ -1261,10 +1256,9 @@ xfs_rtmount_init( return -ENODEV; } mp->m_rsumlevels = sbp->sb_rextslog + 1; - mp->m_rsumsize = - (uint)sizeof(xfs_suminfo_t) * mp->m_rsumlevels * - sbp->sb_rbmblocks; - mp->m_rsumsize = roundup(mp->m_rsumsize, sbp->sb_blocksize); + rsumblocks = xfs_rtsummary_blockcount(mp, mp->m_rsumlevels, + mp->m_sb.sb_rbmblocks); + mp->m_rsumsize = XFS_FSB_TO_B(mp, rsumblocks); mp->m_rbmip = mp->m_rsumip = NULL; /* * Check that the realtime section is an ok size. @@ -1418,27 +1412,28 @@ xfs_rtunmount_inodes( * of rtextents and the fraction. * The fraction sequence is 0, 1/2, 1/4, 3/4, 1/8, ..., 7/8, 1/16, ... */ -int /* error */ +int /* error */ xfs_rtpick_extent( - xfs_mount_t *mp, /* file system mount point */ - xfs_trans_t *tp, /* transaction pointer */ - xfs_extlen_t len, /* allocation length (rtextents) */ - xfs_rtblock_t *pick) /* result rt extent */ + xfs_mount_t *mp, /* file system mount point */ + xfs_trans_t *tp, /* transaction pointer */ + xfs_rtxlen_t len, /* allocation length (rtextents) */ + xfs_rtxnum_t *pick) /* result rt extent */ { - xfs_rtblock_t b; /* result block */ - int log2; /* log of sequence number */ - uint64_t resid; /* residual after log removed */ - uint64_t seq; /* sequence number of file creation */ - uint64_t *seqp; /* pointer to seqno in inode */ + xfs_rtxnum_t b; /* result rtext */ + int log2; /* log of sequence number */ + uint64_t resid; /* residual after log removed */ + uint64_t seq; /* sequence number of file creation */ + struct timespec64 ts; /* timespec in inode */ ASSERT(xfs_isilocked(mp->m_rbmip, XFS_ILOCK_EXCL)); - seqp = (uint64_t *)&VFS_I(mp->m_rbmip)->i_atime; + ts = inode_get_atime(VFS_I(mp->m_rbmip)); if (!(mp->m_rbmip->i_diflags & XFS_DIFLAG_NEWRTBM)) { mp->m_rbmip->i_diflags |= XFS_DIFLAG_NEWRTBM; - *seqp = 0; + seq = 0; + } else { + seq = ts.tv_sec; } - seq = *seqp; if ((log2 = xfs_highbit64(seq)) == -1) b = 0; else { @@ -1450,7 +1445,8 @@ xfs_rtpick_extent( if (b + len > mp->m_sb.sb_rextents) b = mp->m_sb.sb_rextents - len; } - *seqp = seq + 1; + ts.tv_sec = seq + 1; + inode_set_atime_to_ts(VFS_I(mp->m_rbmip), ts); xfs_trans_log_inode(tp, mp->m_rbmip, XFS_ILOG_CORE); *pick = b; return 0; diff --git a/fs/xfs/xfs_rtalloc.h b/fs/xfs/xfs_rtalloc.h index 62c7ad79cbb6..f7cb9ffe51ca 100644 --- a/fs/xfs/xfs_rtalloc.h +++ b/fs/xfs/xfs_rtalloc.h @@ -11,22 +11,6 @@ struct xfs_mount; struct xfs_trans; -/* - * XXX: Most of the realtime allocation functions deal in units of realtime - * extents, not realtime blocks. This looks funny when paired with the type - * name and screams for a larger cleanup. - */ -struct xfs_rtalloc_rec { - xfs_rtblock_t ar_startext; - xfs_rtblock_t ar_extcount; -}; - -typedef int (*xfs_rtalloc_query_range_fn)( - struct xfs_mount *mp, - struct xfs_trans *tp, - const struct xfs_rtalloc_rec *rec, - void *priv); - #ifdef CONFIG_XFS_RT /* * Function prototypes for exported functions. @@ -40,23 +24,14 @@ typedef int (*xfs_rtalloc_query_range_fn)( int /* error */ xfs_rtallocate_extent( struct xfs_trans *tp, /* transaction pointer */ - xfs_rtblock_t bno, /* starting block number to allocate */ - xfs_extlen_t minlen, /* minimum length to allocate */ - xfs_extlen_t maxlen, /* maximum length to allocate */ - xfs_extlen_t *len, /* out: actual length allocated */ + xfs_rtxnum_t start, /* starting rtext number to allocate */ + xfs_rtxlen_t minlen, /* minimum length to allocate */ + xfs_rtxlen_t maxlen, /* maximum length to allocate */ + xfs_rtxlen_t *len, /* out: actual length allocated */ int wasdel, /* was a delayed allocation extent */ - xfs_extlen_t prod, /* extent product factor */ - xfs_rtblock_t *rtblock); /* out: start block allocated */ + xfs_rtxlen_t prod, /* extent product factor */ + xfs_rtxnum_t *rtblock); /* out: start rtext allocated */ -/* - * Free an extent in the realtime subvolume. Length is expressed in - * realtime extents, as is the block number. - */ -int /* error */ -xfs_rtfree_extent( - struct xfs_trans *tp, /* transaction pointer */ - xfs_rtblock_t bno, /* starting block number to free */ - xfs_extlen_t len); /* length of extent freed */ /* * Initialize realtime fields in the mount structure. @@ -87,8 +62,8 @@ int /* error */ xfs_rtpick_extent( struct xfs_mount *mp, /* file system mount point */ struct xfs_trans *tp, /* transaction pointer */ - xfs_extlen_t len, /* allocation length (rtextents) */ - xfs_rtblock_t *pick); /* result rt extent */ + xfs_rtxlen_t len, /* allocation length (rtextents) */ + xfs_rtxnum_t *pick); /* result rt extent */ /* * Grow the realtime area of the filesystem. @@ -98,55 +73,12 @@ xfs_growfs_rt( struct xfs_mount *mp, /* file system mount structure */ xfs_growfs_rt_t *in); /* user supplied growfs struct */ -/* - * From xfs_rtbitmap.c - */ -int xfs_rtbuf_get(struct xfs_mount *mp, struct xfs_trans *tp, - xfs_rtblock_t block, int issum, struct xfs_buf **bpp); -int xfs_rtcheck_range(struct xfs_mount *mp, struct xfs_trans *tp, - xfs_rtblock_t start, xfs_extlen_t len, int val, - xfs_rtblock_t *new, int *stat); -int xfs_rtfind_back(struct xfs_mount *mp, struct xfs_trans *tp, - xfs_rtblock_t start, xfs_rtblock_t limit, - xfs_rtblock_t *rtblock); -int xfs_rtfind_forw(struct xfs_mount *mp, struct xfs_trans *tp, - xfs_rtblock_t start, xfs_rtblock_t limit, - xfs_rtblock_t *rtblock); -int xfs_rtmodify_range(struct xfs_mount *mp, struct xfs_trans *tp, - xfs_rtblock_t start, xfs_extlen_t len, int val); -int xfs_rtmodify_summary_int(struct xfs_mount *mp, struct xfs_trans *tp, - int log, xfs_rtblock_t bbno, int delta, - struct xfs_buf **rbpp, xfs_fsblock_t *rsb, - xfs_suminfo_t *sum); -int xfs_rtmodify_summary(struct xfs_mount *mp, struct xfs_trans *tp, int log, - xfs_rtblock_t bbno, int delta, struct xfs_buf **rbpp, - xfs_fsblock_t *rsb); -int xfs_rtfree_range(struct xfs_mount *mp, struct xfs_trans *tp, - xfs_rtblock_t start, xfs_extlen_t len, - struct xfs_buf **rbpp, xfs_fsblock_t *rsb); -int xfs_rtalloc_query_range(struct xfs_mount *mp, struct xfs_trans *tp, - const struct xfs_rtalloc_rec *low_rec, - const struct xfs_rtalloc_rec *high_rec, - xfs_rtalloc_query_range_fn fn, void *priv); -int xfs_rtalloc_query_all(struct xfs_mount *mp, struct xfs_trans *tp, - xfs_rtalloc_query_range_fn fn, - void *priv); -bool xfs_verify_rtbno(struct xfs_mount *mp, xfs_rtblock_t rtbno); -int xfs_rtalloc_extent_is_free(struct xfs_mount *mp, struct xfs_trans *tp, - xfs_rtblock_t start, xfs_extlen_t len, - bool *is_free); int xfs_rtalloc_reinit_frextents(struct xfs_mount *mp); #else -# define xfs_rtallocate_extent(t,b,min,max,l,f,p,rb) (ENOSYS) -# define xfs_rtfree_extent(t,b,l) (ENOSYS) -# define xfs_rtpick_extent(m,t,l,rb) (ENOSYS) -# define xfs_growfs_rt(mp,in) (ENOSYS) -# define xfs_rtalloc_query_range(t,l,h,f,p) (ENOSYS) -# define xfs_rtalloc_query_all(m,t,f,p) (ENOSYS) -# define xfs_rtbuf_get(m,t,b,i,p) (ENOSYS) -# define xfs_verify_rtbno(m, r) (false) -# define xfs_rtalloc_extent_is_free(m,t,s,l,i) (ENOSYS) -# define xfs_rtalloc_reinit_frextents(m) (0) +# define xfs_rtallocate_extent(t,b,min,max,l,f,p,rb) (-ENOSYS) +# define xfs_rtpick_extent(m,t,l,rb) (-ENOSYS) +# define xfs_growfs_rt(mp,in) (-ENOSYS) +# define xfs_rtalloc_reinit_frextents(m) (0) static inline int /* error */ xfs_rtmount_init( xfs_mount_t *mp) /* file system mount structure */ @@ -157,7 +89,7 @@ xfs_rtmount_init( xfs_warn(mp, "Not built with CONFIG_XFS_RT"); return -ENOSYS; } -# define xfs_rtmount_inodes(m) (((mp)->m_sb.sb_rblocks == 0)? 0 : (ENOSYS)) +# define xfs_rtmount_inodes(m) (((mp)->m_sb.sb_rblocks == 0)? 0 : (-ENOSYS)) # define xfs_rtunmount_inodes(m) #endif /* CONFIG_XFS_RT */ diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 818510243130..764304595e8b 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -42,6 +42,8 @@ #include "xfs_xattr.h" #include "xfs_iunlink_item.h" #include "xfs_dahash_test.h" +#include "xfs_rtbitmap.h" +#include "scrub/stats.h" #include <linux/magic.h> #include <linux/fs_context.h> @@ -49,33 +51,12 @@ static const struct super_operations xfs_super_operations; +static struct dentry *xfs_debugfs; /* top-level xfs debugfs dir */ static struct kset *xfs_kset; /* top-level xfs sysfs dir */ #ifdef DEBUG static struct xfs_kobj xfs_dbg_kobj; /* global debug sysfs attrs */ #endif -#ifdef CONFIG_HOTPLUG_CPU -static LIST_HEAD(xfs_mount_list); -static DEFINE_SPINLOCK(xfs_mount_list_lock); - -static inline void xfs_mount_list_add(struct xfs_mount *mp) -{ - spin_lock(&xfs_mount_list_lock); - list_add(&mp->m_mount_list, &xfs_mount_list); - spin_unlock(&xfs_mount_list_lock); -} - -static inline void xfs_mount_list_del(struct xfs_mount *mp) -{ - spin_lock(&xfs_mount_list_lock); - list_del(&mp->m_mount_list); - spin_unlock(&xfs_mount_list_lock); -} -#else /* !CONFIG_HOTPLUG_CPU */ -static inline void xfs_mount_list_add(struct xfs_mount *mp) {} -static inline void xfs_mount_list_del(struct xfs_mount *mp) {} -#endif - enum xfs_dax_mode { XFS_DAX_INODE = 0, XFS_DAX_ALWAYS = 1, @@ -377,29 +358,19 @@ disable_dax: return 0; } -static void -xfs_bdev_mark_dead( - struct block_device *bdev) -{ - xfs_force_shutdown(bdev->bd_holder, SHUTDOWN_DEVICE_REMOVED); -} - -static const struct blk_holder_ops xfs_holder_ops = { - .mark_dead = xfs_bdev_mark_dead, -}; - STATIC int xfs_blkdev_get( xfs_mount_t *mp, const char *name, - struct block_device **bdevp) + struct bdev_handle **handlep) { int error = 0; - *bdevp = blkdev_get_by_path(name, BLK_OPEN_READ | BLK_OPEN_WRITE, mp, - &xfs_holder_ops); - if (IS_ERR(*bdevp)) { - error = PTR_ERR(*bdevp); + *handlep = bdev_open_by_path(name, BLK_OPEN_READ | BLK_OPEN_WRITE, + mp->m_super, &fs_holder_ops); + if (IS_ERR(*handlep)) { + error = PTR_ERR(*handlep); + *handlep = NULL; xfs_warn(mp, "Invalid device [%s], error=%d", name, error); } @@ -407,31 +378,45 @@ xfs_blkdev_get( } STATIC void -xfs_blkdev_put( - struct xfs_mount *mp, - struct block_device *bdev) -{ - if (bdev) - blkdev_put(bdev, mp); -} - -STATIC void -xfs_close_devices( +xfs_shutdown_devices( struct xfs_mount *mp) { + /* + * Udev is triggered whenever anyone closes a block device or unmounts + * a file systemm on a block device. + * The default udev rules invoke blkid to read the fs super and create + * symlinks to the bdev under /dev/disk. For this, it uses buffered + * reads through the page cache. + * + * xfs_db also uses buffered reads to examine metadata. There is no + * coordination between xfs_db and udev, which means that they can run + * concurrently. Note there is no coordination between the kernel and + * blkid either. + * + * On a system with 64k pages, the page cache can cache the superblock + * and the root inode (and hence the root directory) with the same 64k + * page. If udev spawns blkid after the mkfs and the system is busy + * enough that it is still running when xfs_db starts up, they'll both + * read from the same page in the pagecache. + * + * The unmount writes updated inode metadata to disk directly. The XFS + * buffer cache does not use the bdev pagecache, so it needs to + * invalidate that pagecache on unmount. If the above scenario occurs, + * the pagecache no longer reflects what's on disk, xfs_db reads the + * stale metadata, and fails to find /a. Most of the time this succeeds + * because closing a bdev invalidates the page cache, but when processes + * race, everyone loses. + */ if (mp->m_logdev_targp && mp->m_logdev_targp != mp->m_ddev_targp) { - struct block_device *logdev = mp->m_logdev_targp->bt_bdev; - - xfs_free_buftarg(mp->m_logdev_targp); - xfs_blkdev_put(mp, logdev); + blkdev_issue_flush(mp->m_logdev_targp->bt_bdev); + invalidate_bdev(mp->m_logdev_targp->bt_bdev); } if (mp->m_rtdev_targp) { - struct block_device *rtdev = mp->m_rtdev_targp->bt_bdev; - - xfs_free_buftarg(mp->m_rtdev_targp); - xfs_blkdev_put(mp, rtdev); + blkdev_issue_flush(mp->m_rtdev_targp->bt_bdev); + invalidate_bdev(mp->m_rtdev_targp->bt_bdev); } - xfs_free_buftarg(mp->m_ddev_targp); + blkdev_issue_flush(mp->m_ddev_targp->bt_bdev); + invalidate_bdev(mp->m_ddev_targp->bt_bdev); } /* @@ -448,25 +433,34 @@ STATIC int xfs_open_devices( struct xfs_mount *mp) { - struct block_device *ddev = mp->m_super->s_bdev; - struct block_device *logdev = NULL, *rtdev = NULL; + struct super_block *sb = mp->m_super; + struct block_device *ddev = sb->s_bdev; + struct bdev_handle *logdev_handle = NULL, *rtdev_handle = NULL; int error; /* + * blkdev_put() can't be called under s_umount, see the comment + * in get_tree_bdev() for more details + */ + up_write(&sb->s_umount); + + /* * Open real time and log devices - order is important. */ if (mp->m_logname) { - error = xfs_blkdev_get(mp, mp->m_logname, &logdev); + error = xfs_blkdev_get(mp, mp->m_logname, &logdev_handle); if (error) - return error; + goto out_relock; } if (mp->m_rtname) { - error = xfs_blkdev_get(mp, mp->m_rtname, &rtdev); + error = xfs_blkdev_get(mp, mp->m_rtname, &rtdev_handle); if (error) goto out_close_logdev; - if (rtdev == ddev || rtdev == logdev) { + if (rtdev_handle->bdev == ddev || + (logdev_handle && + rtdev_handle->bdev == logdev_handle->bdev)) { xfs_warn(mp, "Cannot mount filesystem with identical rtdev and ddev/logdev."); error = -EINVAL; @@ -478,25 +472,31 @@ xfs_open_devices( * Setup xfs_mount buffer target pointers */ error = -ENOMEM; - mp->m_ddev_targp = xfs_alloc_buftarg(mp, ddev); + mp->m_ddev_targp = xfs_alloc_buftarg(mp, sb->s_bdev_handle); if (!mp->m_ddev_targp) goto out_close_rtdev; - if (rtdev) { - mp->m_rtdev_targp = xfs_alloc_buftarg(mp, rtdev); + if (rtdev_handle) { + mp->m_rtdev_targp = xfs_alloc_buftarg(mp, rtdev_handle); if (!mp->m_rtdev_targp) goto out_free_ddev_targ; } - if (logdev && logdev != ddev) { - mp->m_logdev_targp = xfs_alloc_buftarg(mp, logdev); + if (logdev_handle && logdev_handle->bdev != ddev) { + mp->m_logdev_targp = xfs_alloc_buftarg(mp, logdev_handle); if (!mp->m_logdev_targp) goto out_free_rtdev_targ; } else { mp->m_logdev_targp = mp->m_ddev_targp; + /* Handle won't be used, drop it */ + if (logdev_handle) + bdev_release(logdev_handle); } - return 0; + error = 0; +out_relock: + down_write(&sb->s_umount); + return error; out_free_rtdev_targ: if (mp->m_rtdev_targp) @@ -504,11 +504,12 @@ xfs_open_devices( out_free_ddev_targ: xfs_free_buftarg(mp->m_ddev_targp); out_close_rtdev: - xfs_blkdev_put(mp, rtdev); + if (rtdev_handle) + bdev_release(rtdev_handle); out_close_logdev: - if (logdev && logdev != ddev) - xfs_blkdev_put(mp, logdev); - return error; + if (logdev_handle) + bdev_release(logdev_handle); + goto out_relock; } /* @@ -758,6 +759,18 @@ static void xfs_mount_free( struct xfs_mount *mp) { + /* + * Free the buftargs here because blkdev_put needs to be called outside + * of sb->s_umount, which is held around the call to ->put_super. + */ + if (mp->m_logdev_targp && mp->m_logdev_targp != mp->m_ddev_targp) + xfs_free_buftarg(mp->m_logdev_targp); + if (mp->m_rtdev_targp) + xfs_free_buftarg(mp->m_rtdev_targp); + if (mp->m_ddev_targp) + xfs_free_buftarg(mp->m_ddev_targp); + + debugfs_remove(mp->m_debugfs); kfree(mp->m_rtname); kfree(mp->m_logname); kmem_free(mp); @@ -884,7 +897,7 @@ xfs_fs_statfs( statp->f_blocks = sbp->sb_rblocks; freertx = percpu_counter_sum_positive(&mp->m_frextents); - statp->f_bavail = statp->f_bfree = freertx * sbp->sb_rextsize; + statp->f_bavail = statp->f_bfree = xfs_rtx_to_rtb(mp, freertx); } return 0; @@ -1107,9 +1120,8 @@ xfs_inodegc_init_percpu( for_each_possible_cpu(cpu) { gc = per_cpu_ptr(mp->m_inodegc, cpu); -#if defined(DEBUG) || defined(XFS_WARN) gc->cpu = cpu; -#endif + gc->mp = mp; init_llist_head(&gc->list); gc->items = 0; gc->error = 0; @@ -1133,24 +1145,17 @@ xfs_fs_put_super( { struct xfs_mount *mp = XFS_M(sb); - /* if ->fill_super failed, we have no mount to tear down */ - if (!sb->s_fs_info) - return; - xfs_notice(mp, "Unmounting Filesystem %pU", &mp->m_sb.sb_uuid); xfs_filestream_unmount(mp); xfs_unmountfs(mp); xfs_freesb(mp); + xchk_mount_stats_free(mp); free_percpu(mp->m_stats.xs_stats); - xfs_mount_list_del(mp); xfs_inodegc_free_percpu(mp); xfs_destroy_percpu_counters(mp); xfs_destroy_mount_workqueues(mp); - xfs_close_devices(mp); - - sb->s_fs_info = NULL; - xfs_mount_free(mp); + xfs_shutdown_devices(mp); } static long @@ -1479,6 +1484,21 @@ xfs_fs_validate_params( return 0; } +struct dentry * +xfs_debugfs_mkdir( + const char *name, + struct dentry *parent) +{ + struct dentry *child; + + /* Apparently we're expected to ignore error returns?? */ + child = debugfs_create_dir(name, parent); + if (IS_ERR(child)) + return NULL; + + return child; +} + static int xfs_fs_fill_super( struct super_block *sb, @@ -1492,7 +1512,7 @@ xfs_fs_fill_super( error = xfs_fs_validate_params(mp); if (error) - goto out_free_names; + return error; sb_min_blocksize(sb, BBSIZE); sb->s_xattr = xfs_xattr_handlers; @@ -1519,11 +1539,18 @@ xfs_fs_fill_super( error = xfs_open_devices(mp); if (error) - goto out_free_names; + return error; + + if (xfs_debugfs) { + mp->m_debugfs = xfs_debugfs_mkdir(mp->m_super->s_id, + xfs_debugfs); + } else { + mp->m_debugfs = NULL; + } error = xfs_init_mount_workqueues(mp); if (error) - goto out_close_devices; + goto out_shutdown_devices; error = xfs_init_percpu_counters(mp); if (error) @@ -1533,13 +1560,6 @@ xfs_fs_fill_super( if (error) goto out_destroy_counters; - /* - * All percpu data structures requiring cleanup when a cpu goes offline - * must be allocated before adding this @mp to the cpu-dead handler's - * mount list. - */ - xfs_mount_list_add(mp); - /* Allocate stats memory before we do operations that might use it */ mp->m_stats.xs_stats = alloc_percpu(struct xfsstats); if (!mp->m_stats.xs_stats) { @@ -1547,10 +1567,14 @@ xfs_fs_fill_super( goto out_destroy_inodegc; } - error = xfs_readsb(mp, flags); + error = xchk_mount_stats_alloc(mp); if (error) goto out_free_stats; + error = xfs_readsb(mp, flags); + if (error) + goto out_free_scrub_stats; + error = xfs_finish_flags(mp); if (error) goto out_free_sb; @@ -1728,20 +1752,18 @@ xfs_fs_fill_super( xfs_filestream_unmount(mp); out_free_sb: xfs_freesb(mp); + out_free_scrub_stats: + xchk_mount_stats_free(mp); out_free_stats: free_percpu(mp->m_stats.xs_stats); out_destroy_inodegc: - xfs_mount_list_del(mp); xfs_inodegc_free_percpu(mp); out_destroy_counters: xfs_destroy_percpu_counters(mp); out_destroy_workqueues: xfs_destroy_mount_workqueues(mp); - out_close_devices: - xfs_close_devices(mp); - out_free_names: - sb->s_fs_info = NULL; - xfs_mount_free(mp); + out_shutdown_devices: + xfs_shutdown_devices(mp); return error; out_unmount: @@ -1934,7 +1956,8 @@ xfs_fs_reconfigure( return 0; } -static void xfs_fs_free( +static void +xfs_fs_free( struct fs_context *fc) { struct xfs_mount *mp = fc->s_fs_info; @@ -2003,12 +2026,20 @@ static int xfs_init_fs_context( return 0; } +static void +xfs_kill_sb( + struct super_block *sb) +{ + kill_block_super(sb); + xfs_mount_free(XFS_M(sb)); +} + static struct file_system_type xfs_fs_type = { .owner = THIS_MODULE, .name = "xfs", .init_fs_context = xfs_init_fs_context, .parameters = xfs_fs_parameters, - .kill_sb = kill_block_super, + .kill_sb = xfs_kill_sb, .fs_flags = FS_REQUIRES_DEV | FS_ALLOW_IDMAP, }; MODULE_ALIAS_FS("xfs"); @@ -2270,49 +2301,6 @@ xfs_destroy_workqueues(void) destroy_workqueue(xfs_alloc_wq); } -#ifdef CONFIG_HOTPLUG_CPU -static int -xfs_cpu_dead( - unsigned int cpu) -{ - struct xfs_mount *mp, *n; - - spin_lock(&xfs_mount_list_lock); - list_for_each_entry_safe(mp, n, &xfs_mount_list, m_mount_list) { - spin_unlock(&xfs_mount_list_lock); - xfs_inodegc_cpu_dead(mp, cpu); - xlog_cil_pcp_dead(mp->m_log, cpu); - spin_lock(&xfs_mount_list_lock); - } - spin_unlock(&xfs_mount_list_lock); - return 0; -} - -static int __init -xfs_cpu_hotplug_init(void) -{ - int error; - - error = cpuhp_setup_state_nocalls(CPUHP_XFS_DEAD, "xfs:dead", NULL, - xfs_cpu_dead); - if (error < 0) - xfs_alert(NULL, -"Failed to initialise CPU hotplug, error %d. XFS is non-functional.", - error); - return error; -} - -static void -xfs_cpu_hotplug_destroy(void) -{ - cpuhp_remove_state_nocalls(CPUHP_XFS_DEAD); -} - -#else /* !CONFIG_HOTPLUG_CPU */ -static inline int xfs_cpu_hotplug_init(void) { return 0; } -static inline void xfs_cpu_hotplug_destroy(void) {} -#endif - STATIC int __init init_xfs_fs(void) { @@ -2329,13 +2317,9 @@ init_xfs_fs(void) xfs_dir_startup(); - error = xfs_cpu_hotplug_init(); - if (error) - goto out; - error = xfs_init_caches(); if (error) - goto out_destroy_hp; + goto out; error = xfs_init_workqueues(); if (error) @@ -2353,10 +2337,12 @@ init_xfs_fs(void) if (error) goto out_cleanup_procfs; + xfs_debugfs = xfs_debugfs_mkdir("xfs", NULL); + xfs_kset = kset_create_and_add("xfs", NULL, fs_kobj); if (!xfs_kset) { error = -ENOMEM; - goto out_sysctl_unregister; + goto out_debugfs_unregister; } xfsstats.xs_kobj.kobject.kset = xfs_kset; @@ -2372,11 +2358,15 @@ init_xfs_fs(void) if (error) goto out_free_stats; + error = xchk_global_stats_setup(xfs_debugfs); + if (error) + goto out_remove_stats_kobj; + #ifdef DEBUG xfs_dbg_kobj.kobject.kset = xfs_kset; error = xfs_sysfs_init(&xfs_dbg_kobj, &xfs_dbg_ktype, NULL, "debug"); if (error) - goto out_remove_stats_kobj; + goto out_remove_scrub_stats; #endif error = xfs_qm_init(); @@ -2393,14 +2383,17 @@ init_xfs_fs(void) out_remove_dbg_kobj: #ifdef DEBUG xfs_sysfs_del(&xfs_dbg_kobj); - out_remove_stats_kobj: + out_remove_scrub_stats: #endif + xchk_global_stats_teardown(); + out_remove_stats_kobj: xfs_sysfs_del(&xfsstats.xs_kobj); out_free_stats: free_percpu(xfsstats.xs_stats); out_kset_unregister: kset_unregister(xfs_kset); - out_sysctl_unregister: + out_debugfs_unregister: + debugfs_remove(xfs_debugfs); xfs_sysctl_unregister(); out_cleanup_procfs: xfs_cleanup_procfs(); @@ -2410,8 +2403,6 @@ init_xfs_fs(void) xfs_destroy_workqueues(); out_destroy_caches: xfs_destroy_caches(); - out_destroy_hp: - xfs_cpu_hotplug_destroy(); out: return error; } @@ -2424,16 +2415,17 @@ exit_xfs_fs(void) #ifdef DEBUG xfs_sysfs_del(&xfs_dbg_kobj); #endif + xchk_global_stats_teardown(); xfs_sysfs_del(&xfsstats.xs_kobj); free_percpu(xfsstats.xs_stats); kset_unregister(xfs_kset); + debugfs_remove(xfs_debugfs); xfs_sysctl_unregister(); xfs_cleanup_procfs(); xfs_mru_cache_uninit(); xfs_destroy_workqueues(); xfs_destroy_caches(); xfs_uuid_table_free(); - xfs_cpu_hotplug_destroy(); } module_init(init_xfs_fs); diff --git a/fs/xfs/xfs_super.h b/fs/xfs/xfs_super.h index 364e2c2648a8..302e6e5d6c7e 100644 --- a/fs/xfs/xfs_super.h +++ b/fs/xfs/xfs_super.h @@ -100,4 +100,6 @@ extern struct workqueue_struct *xfs_discard_wq; #define XFS_M(sb) ((struct xfs_mount *)((sb)->s_fs_info)) +struct dentry *xfs_debugfs_mkdir(const char *name, struct dentry *parent); + #endif /* __XFS_SUPER_H__ */ diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index f3cc204bb4bf..3926cf7f2a6e 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -22,6 +22,9 @@ * daddr: physical block number in 512b blocks * bbcount: number of blocks in a physical extent, in 512b blocks * + * rtx: physical rt extent number for extent mappings + * rtxcount: number of rt extents in an extent mapping + * * owner: reverse-mapping owner, usually inodes * * fileoff: file offset, in fs blocks @@ -802,36 +805,28 @@ DEFINE_INODE_EVENT(xfs_inode_inactivating); * ring buffer. Somehow this was only worth mentioning in the ftrace sample * code. */ -TRACE_DEFINE_ENUM(PE_SIZE_PTE); -TRACE_DEFINE_ENUM(PE_SIZE_PMD); -TRACE_DEFINE_ENUM(PE_SIZE_PUD); - TRACE_DEFINE_ENUM(XFS_REFC_DOMAIN_SHARED); TRACE_DEFINE_ENUM(XFS_REFC_DOMAIN_COW); TRACE_EVENT(xfs_filemap_fault, - TP_PROTO(struct xfs_inode *ip, enum page_entry_size pe_size, - bool write_fault), - TP_ARGS(ip, pe_size, write_fault), + TP_PROTO(struct xfs_inode *ip, unsigned int order, bool write_fault), + TP_ARGS(ip, order, write_fault), TP_STRUCT__entry( __field(dev_t, dev) __field(xfs_ino_t, ino) - __field(enum page_entry_size, pe_size) + __field(unsigned int, order) __field(bool, write_fault) ), TP_fast_assign( __entry->dev = VFS_I(ip)->i_sb->s_dev; __entry->ino = ip->i_ino; - __entry->pe_size = pe_size; + __entry->order = order; __entry->write_fault = write_fault; ), - TP_printk("dev %d:%d ino 0x%llx %s write_fault %d", + TP_printk("dev %d:%d ino 0x%llx order %u write_fault %d", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, - __print_symbolic(__entry->pe_size, - { PE_SIZE_PTE, "PTE" }, - { PE_SIZE_PMD, "PMD" }, - { PE_SIZE_PUD, "PUD" }), + __entry->order, __entry->write_fault) ) @@ -3829,6 +3824,51 @@ TRACE_EVENT(xfs_iunlink_update_dinode, __entry->new_ptr) ); +TRACE_EVENT(xfs_iunlink_reload_next, + TP_PROTO(struct xfs_inode *ip), + TP_ARGS(ip), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_agnumber_t, agno) + __field(xfs_agino_t, agino) + __field(xfs_agino_t, prev_agino) + __field(xfs_agino_t, next_agino) + ), + TP_fast_assign( + __entry->dev = ip->i_mount->m_super->s_dev; + __entry->agno = XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino); + __entry->agino = XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino); + __entry->prev_agino = ip->i_prev_unlinked; + __entry->next_agino = ip->i_next_unlinked; + ), + TP_printk("dev %d:%d agno 0x%x agino 0x%x prev_unlinked 0x%x next_unlinked 0x%x", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->agno, + __entry->agino, + __entry->prev_agino, + __entry->next_agino) +); + +TRACE_EVENT(xfs_inode_reload_unlinked_bucket, + TP_PROTO(struct xfs_inode *ip), + TP_ARGS(ip), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_agnumber_t, agno) + __field(xfs_agino_t, agino) + ), + TP_fast_assign( + __entry->dev = ip->i_mount->m_super->s_dev; + __entry->agno = XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino); + __entry->agino = XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino); + ), + TP_printk("dev %d:%d agno 0x%x agino 0x%x bucket %u", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->agno, + __entry->agino, + __entry->agino % XFS_AGI_UNLINKED_BUCKETS) +); + DECLARE_EVENT_CLASS(xfs_ag_inode_class, TP_PROTO(struct xfs_inode *ip), TP_ARGS(ip), diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c index 8c0bfc9a33b1..305c9d07bf1b 100644 --- a/fs/xfs/xfs_trans.c +++ b/fs/xfs/xfs_trans.c @@ -24,6 +24,7 @@ #include "xfs_dquot_item.h" #include "xfs_dquot.h" #include "xfs_icache.h" +#include "xfs_rtbitmap.h" struct kmem_cache *xfs_trans_cache; @@ -655,6 +656,10 @@ xfs_trans_unreserve_and_mod_sb( mp->m_sb.sb_agcount += tp->t_agcount_delta; mp->m_sb.sb_imax_pct += tp->t_imaxpct_delta; mp->m_sb.sb_rextsize += tp->t_rextsize_delta; + if (tp->t_rextsize_delta) { + mp->m_rtxblklog = log2_if_power2(mp->m_sb.sb_rextsize); + mp->m_rtxblkmask = mask64_if_power2(mp->m_sb.sb_rextsize); + } mp->m_sb.sb_rbmblocks += tp->t_rbmblocks_delta; mp->m_sb.sb_rblocks += tp->t_rblocks_delta; mp->m_sb.sb_rextents += tp->t_rextents_delta; @@ -1196,7 +1201,7 @@ xfs_trans_alloc_inode( retry: error = xfs_trans_alloc(mp, resv, dblocks, - rblocks / mp->m_sb.sb_rextsize, + xfs_extlen_to_rtxlen(mp, rblocks), force ? XFS_TRANS_RESERVE : 0, &tp); if (error) return error; diff --git a/fs/xfs/xfs_xattr.c b/fs/xfs/xfs_xattr.c index 43e5c219aaed..987843f84d03 100644 --- a/fs/xfs/xfs_xattr.c +++ b/fs/xfs/xfs_xattr.c @@ -46,6 +46,17 @@ xfs_attr_grab_log_assist( if (xfs_sb_version_haslogxattrs(&mp->m_sb)) return 0; + /* + * Check if the filesystem featureset is new enough to set this log + * incompat feature bit. Strictly speaking, the minimum requirement is + * a V5 filesystem for the superblock field, but we'll require rmap + * or reflink to avoid having to deal with really old kernels. + */ + if (!xfs_has_reflink(mp) && !xfs_has_rmapbt(mp)) { + error = -EOPNOTSUPP; + goto drop_incompat; + } + /* Enable log-assisted xattrs. */ error = xfs_add_incompat_log_feature(mp, XFS_SB_FEAT_INCOMPAT_LOG_XATTRS); @@ -175,7 +186,7 @@ static const struct xattr_handler xfs_xattr_security_handler = { .set = xfs_xattr_set, }; -const struct xattr_handler *xfs_xattr_handlers[] = { +const struct xattr_handler * const xfs_xattr_handlers[] = { &xfs_xattr_user_handler, &xfs_xattr_trusted_handler, &xfs_xattr_security_handler, diff --git a/fs/xfs/xfs_xattr.h b/fs/xfs/xfs_xattr.h index 2b09133b1b9b..cec766cad26c 100644 --- a/fs/xfs/xfs_xattr.h +++ b/fs/xfs/xfs_xattr.h @@ -8,6 +8,6 @@ int xfs_attr_change(struct xfs_da_args *args); -extern const struct xattr_handler *xfs_xattr_handlers[]; +extern const struct xattr_handler * const xfs_xattr_handlers[]; #endif /* __XFS_XATTR_H__ */ diff --git a/fs/zonefs/file.c b/fs/zonefs/file.c index 789cfb74c146..b2c9b35df8f7 100644 --- a/fs/zonefs/file.c +++ b/fs/zonefs/file.c @@ -175,7 +175,7 @@ const struct address_space_operations zonefs_file_aops = { .read_folio = zonefs_read_folio, .readahead = zonefs_readahead, .writepages = zonefs_writepages, - .dirty_folio = filemap_dirty_folio, + .dirty_folio = iomap_dirty_folio, .release_folio = iomap_release_folio, .invalidate_folio = iomap_invalidate_folio, .migrate_folio = filemap_migrate_folio, diff --git a/fs/zonefs/super.c b/fs/zonefs/super.c index 9350221abfc5..e6a75401677d 100644 --- a/fs/zonefs/super.c +++ b/fs/zonefs/super.c @@ -658,7 +658,8 @@ static struct inode *zonefs_get_file_inode(struct inode *dir, inode->i_ino = ino; inode->i_mode = z->z_mode; - inode->i_ctime = inode->i_mtime = inode->i_atime = dir->i_ctime; + inode_set_mtime_to_ts(inode, + inode_set_atime_to_ts(inode, inode_set_ctime_to_ts(inode, inode_get_ctime(dir)))); inode->i_uid = z->z_uid; inode->i_gid = z->z_gid; inode->i_size = z->z_wpoffset; @@ -694,7 +695,8 @@ static struct inode *zonefs_get_zgroup_inode(struct super_block *sb, inode->i_ino = ino; inode_init_owner(&nop_mnt_idmap, inode, root, S_IFDIR | 0555); inode->i_size = sbi->s_zgroup[ztype].g_nr_zones; - inode->i_ctime = inode->i_mtime = inode->i_atime = root->i_ctime; + inode_set_mtime_to_ts(inode, + inode_set_atime_to_ts(inode, inode_set_ctime_to_ts(inode, inode_get_ctime(root)))); inode->i_private = &sbi->s_zgroup[ztype]; set_nlink(inode, 2); @@ -1317,7 +1319,7 @@ static int zonefs_fill_super(struct super_block *sb, void *data, int silent) inode->i_ino = bdev_nr_zones(sb->s_bdev); inode->i_mode = S_IFDIR | 0555; - inode->i_ctime = inode->i_mtime = inode->i_atime = current_time(inode); + simple_inode_init_ts(inode); inode->i_op = &zonefs_dir_inode_operations; inode->i_fop = &zonefs_dir_operations; inode->i_size = 2; |